diff --git a/README.md b/README.md
index 3cdb6e478ddf4f18af7f81bb3e321510903beb9d..c66f7e3f3f49ed90e4e75475185585a932049f37 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@
 **TensorFlow** is an open source software library for numerical computation using
 data flow graphs.  The graph nodes represent mathematical operations, while
 the graph edges represent the multidimensional data arrays (tensors) that flow
-between them.  This flexible architecture lets you deploy computation to one
+between them.  This flexible architecture enables you to deploy computation to one
 or more CPUs or GPUs in a desktop, server, or mobile device without rewriting
 code.  TensorFlow also includes TensorBoard, a data visualization toolkit.
 
@@ -86,6 +86,7 @@ The TensorFlow project strives to abide by generally accepted best practices in
 
 * [TensorFlow Website](https://www.tensorflow.org)
 * [TensorFlow White Papers](https://www.tensorflow.org/about/bib)
+* [TensorFlow YouTube Channel](https://www.youtube.com/channel/UC0rqucBdTuFTjJiefW5t-IQ)
 * [TensorFlow Model Zoo](https://github.com/tensorflow/models)
 * [TensorFlow MOOC on Udacity](https://www.udacity.com/course/deep-learning--ud730)
 * [TensorFlow Course at Stanford](https://web.stanford.edu/class/cs20si)
diff --git a/RELEASE.md b/RELEASE.md
index 6f54dee58f75c29a16545ba25de12fe059baf1eb..e8459531748628fd822d876d79625fdd65798791 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,3 +1,74 @@
+# Release 1.7.0
+
+## Major Features And Improvements
+* Eager mode is moving out of contrib, try `tf.enable_eager_execution()`.
+* Graph rewrites emulating fixed-point quantization compatible with TensorFlow Lite, supported by new `tf.contrib.quantize` package.
+* Easily customize gradient computation with `tf.custom_gradient`.
+* [TensorBoard Debugger Plugin](https://github.com/tensorflow/tensorboard/blob/master/tensorboard/plugins/debugger/README.md), the graphical user interface (GUI) of TensorFlow Debugger (tfdbg), is now in alpha.
+* Experimental support for reading a sqlite database as a `Dataset` with new `tf.contrib.data.SqlDataset`.
+* Distributed Mutex / CriticalSection added to `tf.contrib.framework.CriticalSection`.
+* Better text processing with `tf.regex_replace`.
+* Easy, efficient sequence input with `tf.contrib.data.bucket_by_sequence_length`
+* Initial support for `tf.contrib.tensorrt` that enables native TensorRT in
+  TensorFlow.
+
+## Bug Fixes and Other Changes
+* Accelerated Linear Algebra (XLA):
+  * Add `MaxPoolGradGrad` support for XLA
+  * CSE pass from Tensorflow is now disabled in XLA.
+* `tf.data`:
+  * `tf.data.Dataset`
+    * Add support for building C++ Dataset op kernels as external libraries, using the `tf.load_op_library()` mechanism.
+    * `Dataset.list_files()` now shuffles its output by default.
+    * `Dataset.shuffle(..., seed=tf.constant(0, dtype=tf.int64))` now yields the same sequence of elements as `Dataset.shuffle(..., seed=0)`.
+  * Add `num_parallel_reads` argument to `tf.data.TFRecordDataset`.
+* `tf.contrib`:
+  * `tf.contrib.bayesflow.halton_sequence` now supports randomization.
+  * Add support for scalars in `tf.contrib.all_reduce`.
+  * Add `effective_sample_size` to `tf.contrib.bayesflow.mcmc_diagnostics`.
+  * Add `potential_scale_reduction` to `tf.contrib.bayesflow.mcmc_diagnostics`.
+  * Add `BatchNormalization`, `Kumaraswamy` bijectors.
+  * Deprecate `tf.contrib.learn`. Please check contrib/learn/README.md for instructions on how to convert existing code.
+  * `tf.contrib.data`
+    * Remove deprecated `tf.contrib.data.Dataset`, `tf.contrib.data.Iterator`, `tf.contrib.data.FixedLengthRecordDataset`, `tf.contrib.data.TextLineDataset`, and `tf.contrib.data.TFRecordDataset` classes.
+    * Added `bucket_by_sequence_length`, `sliding_window_batch`, and `make_batched_features_dataset`
+  * Remove unmaintained `tf.contrib.ndlstm`. You can find it externally at https://github.com/tmbarchive/tfndlstm.
+  * Moved most of `tf.contrib.bayesflow` to its own repo: `tfp`
+* Other:
+  * tf.py_func now reports the full stack trace if an exception occurs.
+  * Integrate `TPUClusterResolver` with GKE's integration for Cloud TPUs.
+  * Add a library for statistical testing of samplers.
+  * Add Helpers to stream data from the GCE VM to a Cloud TPU.
+  * Integrate ClusterResolvers with TPUEstimator.
+  * Unify metropolis_hastings interface with HMC kernel.
+  * Move LIBXSMM convolutions to a separate --define flag so that they are disabled by default.
+  * Fix `MomentumOptimizer` lambda.
+  * Reduce `tfp.layers` boilerplate via programmable docstrings.
+  * Add `auc_with_confidence_intervals`, a method for computing the AUC and confidence interval with linearithmic time complexity.
+  * `regression_head` now accepts customized link function, to satisfy the usage that user can define their own link function if the `array_ops.identity` does not meet the requirement.
+  * Fix `initialized_value` and `initial_value` behaviors for `ResourceVariables` created from `VariableDef` protos.
+  * Add TensorSpec to represent the specification of Tensors.
+  * Constant folding pass is now deterministic.
+  * Support `float16` `dtype` in `tf.linalg.*`.
+  * Add `tf.estimator.export.TensorServingInputReceiver` that allows `tf.estimator.Estimator.export_savedmodel` to pass raw tensors to model functions.
+
+## Deprecations
+
+* TensorFlow 1.7 may be the last time we support Cuda versions below 8.0.
+  Starting with TensorFlow 1.8 release, 8.0 will be the minimum supported
+  version.
+* TensorFlow 1.7 may be the last time we support cuDNN versions below 6.0.
+  Starting with TensorFlow 1.8 release, 6.0 will be the minimum supported
+  version.
+
+## Thanks to our Contributors
+
+This release contains contributions from many people at Google, as well as:
+
+4d55397500, Abe, Alistair Low, Andy Kernahan, Appledore, Ben, Ben Barsdell, Boris Pfahringer, Brad Wannow, Brett Koonce, Carl Thomé, cclauss, Chengzhi Chen, Chris Drake, Christopher Yeh, Clayne Robison, Codrut Grosu, Daniel Trebbien, Danny Goodman, David Goodwin, David Norman, Deron Eriksson, Donggeon Lim, Donny Viszneki, DosLin, DylanDmitri, Francisco Guerrero, Fred Reiss, gdh1995, Giuseppe, Glenn Weidner, gracehoney, Guozhong Zhuang, Haichen "Hc" Li, Harald Husum, harumitsu.nobuta, Henry Spivey, hsm207, Jekyll Song, Jerome, Jiongyan Zhang, jjsjann123, John Sungjin Park, Johnson145, JoshVarty, Julian Wolff, Jun Wang, June-One, Kamil Sindi, Kb Sriram, Kdavis-Mozilla, Kenji, lazypanda1, Liang-Chi Hsieh, Loo Rong Jie, Mahesh Bhosale, MandarJKulkarni, ManHyuk, Marcus Ong, Marshal Hayes, Martin Pool, matthieudelaro, mdfaijul, mholzel, Michael Zhou, Ming Li, Minmin Sun, Myungjoo Ham, MyungsungKwak, Naman Kamra, Peng Yu, Penghao Cen, Phil, Raghuraman-K, resec, Rohin Mohanadas, Sandeep N Gupta, Scott Tseng, seaotterman, Seo Sanghyeon, Sergei Lebedev, Ted Chang, terrytangyuan, Tim H, tkunic, Tod, vihanjain, Yan Facai (颜发才), Yin Li, Yong Tang, Yukun Chen, Yusuke Yamada
+
+
+
 # Release 1.6.0
 
 ## Breaking Changes
diff --git a/SECURITY.md b/SECURITY.md
index 378e77696725e338e8289cda84dbc543303ae053..a5ce3a62ee202f6e7d83f0fedc2777d9c88ba9b5 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -168,7 +168,18 @@ below).
 
 Please use a descriptive subject line for your report email. After the initial
 reply to your report, the security team will endeavor to keep you informed of
-the progress being made towards a fix and announcement.
+the progress being made towards a fix and announcement. 
+
+In addition, please include the following information along with your report:
+
+* Your name and affiliation (if any).
+* A description the technical details of the vulnerabilities. It is very
+  important to let us know how we can reproduce your findings.
+* An explanation who can exploit this vulnerability, and what they gain when
+  doing so -- write an attack scenario. This will help us evaluate your report
+  quickly, especially if the issue is complex.
+* Whether this vulnerability public or known to third parties. If it is, please
+  provide details.
 
 If you believe that an existing (public) issue is security-related, please send
 an email to `security@tensorflow.org`. The email should include the issue ID and
@@ -233,7 +244,7 @@ v//Fw6ZeY+HmRDFdirjD7wXtIuER4vqCryIqR6Xe9X8oJXz9L/Jhslc=
 
 ### Known vulnerabilities
 
-| Type              | Versions affected |        Reported by | Additional Information      |
-|-------------------|:-----------------:|--------------------|-----------------------------|
-| out of bounds read|             <=1.4 | TenCent Blade Team | [issue report](https://github.com/tensorflow/tensorflow/issues/14959) |
+| Type               | Versions affected | Reported by           | Additional Information      |
+|--------------------|:-----------------:|-----------------------|-----------------------------|
+| Out Of Bounds Read |             <=1.4 | Blade Team of Tencent | [issue report](https://github.com/tensorflow/tensorflow/issues/14959) |
 
diff --git a/WORKSPACE b/WORKSPACE
index 1e38a9a8cd754886fc5232531816b875de0879a3..11c5cdb2070e79b16540a39f13cab28608962340 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -14,6 +14,12 @@ load("@io_bazel_rules_closure//closure:defs.bzl", "closure_repositories")
 
 closure_repositories()
 
+# We must check the bazel version before trying to parse any other BUILD
+# files, in case the parsing of those build files depends on the bazel
+# version we require here.
+load("//tensorflow:version_check.bzl", "check_bazel_version_at_least")
+check_bazel_version_at_least("0.10.0")
+
 load("//tensorflow:workspace.bzl", "tf_workspace")
 
 # Uncomment and update the paths in these entries to build the Android demo.
diff --git a/configure.py b/configure.py
index d14edef1be9e31137c96bed7aebf7ba158b3274f..81d5ad77ee48b101c2f55baf5b3ee935dab756c8 100644
--- a/configure.py
+++ b/configure.py
@@ -35,12 +35,13 @@ except ImportError:
 
 _DEFAULT_CUDA_VERSION = '9.0'
 _DEFAULT_CUDNN_VERSION = '7'
+_DEFAULT_NCCL_VERSION = '1.3'
 _DEFAULT_CUDA_COMPUTE_CAPABILITIES = '3.5,5.2'
 _DEFAULT_CUDA_PATH = '/usr/local/cuda'
 _DEFAULT_CUDA_PATH_LINUX = '/opt/cuda'
 _DEFAULT_CUDA_PATH_WIN = ('C:/Program Files/NVIDIA GPU Computing '
                           'Toolkit/CUDA/v%s' % _DEFAULT_CUDA_VERSION)
-_DEFAULT_TENSORRT_PATH_LINUX = '/usr/lib/x86_64-linux-gnu'
+_DEFAULT_TENSORRT_PATH_LINUX = '/usr/lib/%s-linux-gnu' % platform.machine()
 _TF_OPENCL_VERSION = '1.2'
 _DEFAULT_COMPUTECPP_TOOLKIT_PATH = '/usr/local/computecpp'
 _DEFAULT_TRISYCL_INCLUDE_DIR = '/usr/local/triSYCL/include'
@@ -484,6 +485,8 @@ def set_cc_opt_flags(environ_cp):
   if is_ppc64le():
     # gcc on ppc64le does not support -march, use mcpu instead
     default_cc_opt_flags = '-mcpu=native'
+  elif is_windows():
+    default_cc_opt_flags = '/arch:AVX'
   else:
     default_cc_opt_flags = '-march=native'
   question = ('Please specify optimization flags to use during compilation when'
@@ -494,7 +497,7 @@ def set_cc_opt_flags(environ_cp):
   for opt in cc_opt_flags.split():
     write_to_bazelrc('build:opt --copt=%s' % opt)
   # It should be safe on the same build host.
-  if not is_ppc64le():
+  if not is_ppc64le() and not is_windows():
     write_to_bazelrc('build:opt --host_copt=-march=native')
   write_to_bazelrc('build:opt --define with_default_optimizations=true')
   # TODO(mikecase): Remove these default defines once we are able to get
@@ -502,7 +505,6 @@ def set_cc_opt_flags(environ_cp):
   write_to_bazelrc('build --copt=-DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK')
   write_to_bazelrc('build --host_copt=-DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK')
 
-
 def set_tf_cuda_clang(environ_cp):
   """set TF_CUDA_CLANG action_env.
 
@@ -524,7 +526,7 @@ def set_tf_cuda_clang(environ_cp):
 
 def set_tf_download_clang(environ_cp):
   """Set TF_DOWNLOAD_CLANG action_env."""
-  question = 'Do you want to download a fresh release of clang? (Experimental)'
+  question = 'Do you wish to download a fresh release of clang? (Experimental)'
   yes_reply = 'Clang will be downloaded and used to compile tensorflow.'
   no_reply = 'Clang will not be downloaded.'
   set_action_env_var(
@@ -1103,6 +1105,81 @@ def set_tf_tensorrt_install_path(environ_cp):
   write_action_env_to_bazelrc('TF_TENSORRT_VERSION', tf_tensorrt_version)
 
 
+def set_tf_nccl_install_path(environ_cp):
+  """Set NCCL_INSTALL_PATH and TF_NCCL_VERSION.
+
+  Args:
+    environ_cp: copy of the os.environ.
+
+  Raises:
+    ValueError: if this method was called under non-Linux platform.
+    UserInputError: if user has provided invalid input multiple times.
+  """
+  if not is_linux():
+    raise ValueError('Currently NCCL is only supported on Linux platforms.')
+
+  ask_nccl_version = (
+      'Please specify the NCCL version you want to use. '
+      '[Leave empty to default to NCCL %s]: ') % _DEFAULT_NCCL_VERSION
+
+  for _ in range(_DEFAULT_PROMPT_ASK_ATTEMPTS):
+    tf_nccl_version = get_from_env_or_user_or_default(
+        environ_cp, 'TF_NCCL_VERSION', ask_nccl_version, _DEFAULT_NCCL_VERSION)
+    tf_nccl_version = reformat_version_sequence(str(tf_nccl_version), 1)
+
+    if tf_nccl_version == '1':
+      break  # No need to get install path, NCCL 1 is a GitHub repo.
+
+    # TODO(csigg): Look with ldconfig first if we can find the library in paths
+    # like /usr/lib/x86_64-linux-gnu and the header file in the corresponding
+    # include directory. This is where the NCCL .deb packages install them.
+    # Then ask the user if we should use that. Instead of a single
+    # NCCL_INSTALL_PATH, pass separate NCCL_LIB_PATH and NCCL_HDR_PATH to
+    # nccl_configure.bzl
+    default_nccl_path = environ_cp.get('CUDA_TOOLKIT_PATH')
+    ask_nccl_path = (r'Please specify the location where NCCL %s library is '
+                     'installed. Refer to README.md for more details. [Default '
+                     'is %s]:') % (tf_nccl_version, default_nccl_path)
+    nccl_install_path = get_from_env_or_user_or_default(
+        environ_cp, 'NCCL_INSTALL_PATH', ask_nccl_path, default_nccl_path)
+
+    # Result returned from "read" will be used unexpanded. That make "~"
+    # unusable. Going through one more level of expansion to handle that.
+    nccl_install_path = os.path.realpath(os.path.expanduser(nccl_install_path))
+    if is_windows() or is_cygwin():
+      nccl_install_path = cygpath(nccl_install_path)
+
+    if is_windows():
+      nccl_lib_path = 'lib/x64/nccl.lib'
+    elif is_linux():
+      nccl_lib_path = 'lib/libnccl.so.%s' % tf_nccl_version
+    elif is_macos():
+      nccl_lib_path = 'lib/libnccl.%s.dylib' % tf_nccl_version
+
+    nccl_lib_path = os.path.join(nccl_install_path, nccl_lib_path)
+    nccl_hdr_path = os.path.join(nccl_install_path, 'include/nccl.h')
+    if os.path.exists(nccl_lib_path) and os.path.exists(nccl_hdr_path):
+      # Set NCCL_INSTALL_PATH
+      environ_cp['NCCL_INSTALL_PATH'] = nccl_install_path
+      write_action_env_to_bazelrc('NCCL_INSTALL_PATH', nccl_install_path)
+      break
+
+    # Reset and Retry
+    print('Invalid path to NCCL %s toolkit, %s or %s not found. Please use the '
+          'O/S agnostic package of NCCL 2' % (tf_nccl_version, nccl_lib_path,
+                                              nccl_hdr_path))
+
+    environ_cp['TF_NCCL_VERSION'] = ''
+  else:
+    raise UserInputError('Invalid TF_NCCL setting was provided %d '
+                         'times in a row. Assuming to be a scripting mistake.' %
+                         _DEFAULT_PROMPT_ASK_ATTEMPTS)
+
+  # Set TF_NCCL_VERSION
+  environ_cp['TF_NCCL_VERSION'] = tf_nccl_version
+  write_action_env_to_bazelrc('TF_NCCL_VERSION', tf_nccl_version)
+
+
 def get_native_cuda_compute_capabilities(environ_cp):
   """Get native cuda compute capabilities.
 
@@ -1397,6 +1474,9 @@ def main():
     environ_cp['TF_NEED_OPENCL'] = '0'
     environ_cp['TF_CUDA_CLANG'] = '0'
     environ_cp['TF_NEED_TENSORRT'] = '0'
+    # TODO(ibiryukov): Investigate using clang as a cpu or cuda compiler on
+    # Windows.
+    environ_cp['TF_DOWNLOAD_CLANG'] = '0'
 
   if is_macos():
     environ_cp['TF_NEED_JEMALLOC'] = '0'
@@ -1411,7 +1491,7 @@ def main():
   set_build_var(environ_cp, 'TF_NEED_S3', 'Amazon S3 File System',
                 'with_s3_support', True, 's3')
   set_build_var(environ_cp, 'TF_NEED_KAFKA', 'Apache Kafka Platform',
-                'with_kafka_support', False, 'kafka')
+                'with_kafka_support', True, 'kafka')
   set_build_var(environ_cp, 'TF_ENABLE_XLA', 'XLA JIT', 'with_xla_support',
                 False, 'xla')
   set_build_var(environ_cp, 'TF_NEED_GDR', 'GDR', 'with_gdr_support',
@@ -1436,6 +1516,7 @@ def main():
     set_tf_cudnn_version(environ_cp)
     if is_linux():
       set_tf_tensorrt_install_path(environ_cp)
+    set_tf_nccl_install_path(environ_cp)
     set_tf_cuda_compute_capabilities(environ_cp)
     if 'LD_LIBRARY_PATH' in environ_cp and environ_cp.get(
         'LD_LIBRARY_PATH') != '1':
@@ -1444,16 +1525,8 @@ def main():
 
     set_tf_cuda_clang(environ_cp)
     if environ_cp.get('TF_CUDA_CLANG') == '1':
-      if not is_windows():
-        # Ask if we want to download clang release while building.
-        set_tf_download_clang(environ_cp)
-      else:
-        # We use bazel's generated crosstool on Windows and there is no
-        # way to provide downloaded toolchain for that yet.
-        # TODO(ibiryukov): Investigate using clang as a cuda compiler on
-        # Windows.
-        environ_cp['TF_DOWNLOAD_CLANG'] = '0'
-
+      # Ask whether we should download the clang toolchain.
+      set_tf_download_clang(environ_cp)
       if environ_cp.get('TF_DOWNLOAD_CLANG') != '1':
         # Set up which clang we should use as the cuda / host compiler.
         set_clang_cuda_compiler_path(environ_cp)
@@ -1463,6 +1536,13 @@ def main():
       if not is_windows():
         set_gcc_host_compiler_path(environ_cp)
     set_other_cuda_vars(environ_cp)
+  else:
+    # CUDA not required. Ask whether we should download the clang toolchain and
+    # use it for the CPU build.
+    set_tf_download_clang(environ_cp)
+    if environ_cp.get('TF_DOWNLOAD_CLANG') == '1':
+      write_to_bazelrc('build --config=download_clang')
+      write_to_bazelrc('test --config=download_clang')
 
   set_build_var(environ_cp, 'TF_NEED_MPI', 'MPI', 'with_mpi_support', False)
   if environ_cp.get('TF_NEED_MPI') == '1':
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 9932e5607685b5b8f5900bdfa42363151e57d3f1..823393ebdf1f4b658361f31963a275a683e61002 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -240,6 +240,13 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
+config_setting(
+    name = "with_kafka_support_windows_override",
+    define_values = {"with_kafka_support": "true"},
+    values = {"cpu": "x64_windows"},
+    visibility = ["//visibility:public"],
+)
+
 config_setting(
     name = "with_gcp_support_android_override",
     define_values = {"with_gcp_support": "true"},
@@ -394,19 +401,6 @@ package_group(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "g3doc/sitemap.md",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 py_library(
     name = "tensorflow_py",
     srcs = ["__init__.py"],
@@ -426,289 +420,6 @@ py_library(
     ],
 )
 
-filegroup(
-    name = "all_opensource_files",
-    data = [
-        ":all_files",
-        "//tensorflow/c:all_files",
-        "//tensorflow/cc:all_files",
-        "//tensorflow/cc/saved_model:all_files",
-        "//tensorflow/cc/saved_model/python:all_files",
-        "//tensorflow/cc/tools:all_files",
-        "//tensorflow/compiler/aot:all_files",
-        "//tensorflow/compiler/aot/tests:all_files",
-        "//tensorflow/compiler/jit:all_files",
-        "//tensorflow/compiler/jit/graphcycles:all_files",
-        "//tensorflow/compiler/jit/kernels:all_files",
-        "//tensorflow/compiler/jit/legacy_flags:all_files",
-        "//tensorflow/compiler/jit/ops:all_files",
-        "//tensorflow/compiler/plugin:all_files",
-        "//tensorflow/compiler/tests:all_files",
-        "//tensorflow/compiler/tf2xla:all_files",
-        "//tensorflow/compiler/tf2xla/cc:all_files",
-        "//tensorflow/compiler/tf2xla/kernels:all_files",
-        "//tensorflow/compiler/tf2xla/lib:all_files",
-        "//tensorflow/compiler/tf2xla/ops:all_files",
-        "//tensorflow/compiler/xla:all_files",
-        "//tensorflow/compiler/xla/client:all_files",
-        "//tensorflow/compiler/xla/client/lib:all_files",
-        "//tensorflow/compiler/xla/client/xla_client:all_files",
-        "//tensorflow/compiler/xla/legacy_flags:all_files",
-        "//tensorflow/compiler/xla/python:all_files",
-        "//tensorflow/compiler/xla/service:all_files",
-        "//tensorflow/compiler/xla/service/cpu:all_files",
-        "//tensorflow/compiler/xla/service/gpu:all_files",
-        "//tensorflow/compiler/xla/service/gpu/llvm_gpu_backend:all_files",
-        "//tensorflow/compiler/xla/service/interpreter:all_files",
-        "//tensorflow/compiler/xla/service/llvm_ir:all_files",
-        "//tensorflow/compiler/xla/tests:all_files",
-        "//tensorflow/compiler/xla/tools:all_files",
-        "//tensorflow/compiler/xla/tools/parser:all_files",
-        "//tensorflow/contrib:all_files",
-        "//tensorflow/contrib/all_reduce:all_files",
-        "//tensorflow/contrib/android:all_files",
-        "//tensorflow/contrib/batching:all_files",
-        "//tensorflow/contrib/bayesflow:all_files",
-        "//tensorflow/contrib/boosted_trees:all_files",
-        "//tensorflow/contrib/boosted_trees/estimator_batch:all_files",
-        "//tensorflow/contrib/boosted_trees/lib:all_files",
-        "//tensorflow/contrib/boosted_trees/proto:all_files",
-        "//tensorflow/contrib/boosted_trees/resources:all_files",
-        "//tensorflow/contrib/cloud:all_files",
-        "//tensorflow/contrib/cloud/kernels:all_files",
-        "//tensorflow/contrib/cluster_resolver:all_files",
-        "//tensorflow/contrib/coder:all_files",
-        "//tensorflow/contrib/compiler:all_files",
-        "//tensorflow/contrib/copy_graph:all_files",
-        "//tensorflow/contrib/crf:all_files",
-        "//tensorflow/contrib/cudnn_rnn:all_files",
-        "//tensorflow/contrib/data:all_files",
-        "//tensorflow/contrib/data/kernels:all_files",
-        "//tensorflow/contrib/data/python/kernel_tests:all_files",
-        "//tensorflow/contrib/data/python/ops:all_files",
-        "//tensorflow/contrib/decision_trees/proto:all_files",
-        "//tensorflow/contrib/deprecated:all_files",
-        "//tensorflow/contrib/distributions:all_files",
-        "//tensorflow/contrib/eager/proto:all_files",
-        "//tensorflow/contrib/eager/python:all_files",
-        "//tensorflow/contrib/estimator:all_files",
-        "//tensorflow/contrib/factorization:all_files",
-        "//tensorflow/contrib/factorization/examples:all_files",
-        "//tensorflow/contrib/factorization/kernels:all_files",
-        "//tensorflow/contrib/feature_column:all_files",
-        "//tensorflow/contrib/ffmpeg:all_files",
-        "//tensorflow/contrib/ffmpeg/default:all_files",
-        "//tensorflow/contrib/framework:all_files",
-        "//tensorflow/contrib/fused_conv:all_files",
-        "//tensorflow/contrib/gan:all_files",
-        "//tensorflow/contrib/gdr:all_files",
-        "//tensorflow/contrib/graph_editor:all_files",
-        "//tensorflow/contrib/grid_rnn:all_files",
-        "//tensorflow/contrib/hooks:all_files",
-        "//tensorflow/contrib/hvx/clock_cycle_profiling:all_files",
-        "//tensorflow/contrib/hvx/hvx_ops_support_checker:all_files",
-        "//tensorflow/contrib/image:all_files",
-        "//tensorflow/contrib/input_pipeline:all_files",
-        "//tensorflow/contrib/input_pipeline/kernels:all_files",
-        "//tensorflow/contrib/integrate:all_files",
-        "//tensorflow/contrib/keras:all_files",
-        "//tensorflow/contrib/kernel_methods:all_files",
-        "//tensorflow/contrib/kfac:all_files",
-        "//tensorflow/contrib/kfac/examples:all_files",
-        "//tensorflow/contrib/kfac/examples/tests:all_files",
-        "//tensorflow/contrib/kfac/python/kernel_tests:all_files",
-        "//tensorflow/contrib/kfac/python/ops:all_files",
-        "//tensorflow/contrib/labeled_tensor:all_files",
-        "//tensorflow/contrib/layers:all_files",
-        "//tensorflow/contrib/layers/kernels:all_files",
-        "//tensorflow/contrib/learn:all_files",
-        "//tensorflow/contrib/learn/python/learn/datasets:all_files",
-        "//tensorflow/contrib/legacy_seq2seq:all_files",
-        "//tensorflow/contrib/libsvm:all_files",
-        "//tensorflow/contrib/linalg:all_files",
-        "//tensorflow/contrib/linear_optimizer:all_files",
-        "//tensorflow/contrib/lite:all_files",
-        "//tensorflow/contrib/lite/java:all_files",
-        "//tensorflow/contrib/lite/java/demo/app/src/main:all_files",
-        "//tensorflow/contrib/lite/java/demo/app/src/main/assets:all_files",
-        "//tensorflow/contrib/lite/java/src/main/native:all_files",
-        "//tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite:all_files",
-        "//tensorflow/contrib/lite/kernels:all_files",
-        "//tensorflow/contrib/lite/kernels/internal:all_files",
-        "//tensorflow/contrib/lite/models/smartreply:all_files",
-        "//tensorflow/contrib/lite/nnapi:all_files",
-        "//tensorflow/contrib/lite/python:all_files",
-        "//tensorflow/contrib/lite/schema:all_files",
-        "//tensorflow/contrib/lite/testing:all_files",
-        "//tensorflow/contrib/lite/toco:all_files",
-        "//tensorflow/contrib/lite/toco/graph_transformations/tests:all_files",
-        "//tensorflow/contrib/lite/toco/python:all_files",
-        "//tensorflow/contrib/lite/toco/tensorflow_graph_matching:all_files",
-        "//tensorflow/contrib/lite/toco/tflite:all_files",
-        "//tensorflow/contrib/lite/tools:all_files",
-        "//tensorflow/contrib/lookup:all_files",
-        "//tensorflow/contrib/losses:all_files",
-        "//tensorflow/contrib/makefile:all_files",
-        "//tensorflow/contrib/memory_stats:all_files",
-        "//tensorflow/contrib/meta_graph_transform:all_files",
-        "//tensorflow/contrib/metrics:all_files",
-        "//tensorflow/contrib/model_pruning:all_files",
-        "//tensorflow/contrib/model_pruning/examples/cifar10:all_files",
-        "//tensorflow/contrib/nccl:all_files",
-        "//tensorflow/contrib/nearest_neighbor:all_files",
-        "//tensorflow/contrib/nn:all_files",
-        "//tensorflow/contrib/opt:all_files",
-        "//tensorflow/contrib/periodic_resample:all_files",
-        "//tensorflow/contrib/predictor:all_files",
-        "//tensorflow/contrib/py2tf:all_files",
-        "//tensorflow/contrib/py2tf/converters:all_files",
-        "//tensorflow/contrib/py2tf/impl:all_files",
-        "//tensorflow/contrib/py2tf/pyct:all_files",
-        "//tensorflow/contrib/py2tf/pyct/static_analysis:all_files",
-        "//tensorflow/contrib/py2tf/utils:all_files",
-        "//tensorflow/contrib/quantize:all_files",
-        "//tensorflow/contrib/receptive_field:all_files",
-        "//tensorflow/contrib/reduce_slice_ops:all_files",
-        "//tensorflow/contrib/remote_fused_graph/pylib:all_files",
-        "//tensorflow/contrib/resampler:all_files",
-        "//tensorflow/contrib/rnn:all_files",
-        "//tensorflow/contrib/saved_model:all_files",
-        "//tensorflow/contrib/saved_model/cc/saved_model:all_files",
-        "//tensorflow/contrib/seq2seq:all_files",
-        "//tensorflow/contrib/session_bundle:all_files",
-        "//tensorflow/contrib/session_bundle/example:all_files",
-        "//tensorflow/contrib/signal:all_files",
-        "//tensorflow/contrib/slim:all_files",
-        "//tensorflow/contrib/slim/python/slim/data:all_files",
-        "//tensorflow/contrib/slim/python/slim/nets:all_files",
-        "//tensorflow/contrib/solvers:all_files",
-        "//tensorflow/contrib/sparsemax:all_files",
-        "//tensorflow/contrib/specs:all_files",
-        "//tensorflow/contrib/staging:all_files",
-        "//tensorflow/contrib/stat_summarizer:all_files",
-        "//tensorflow/contrib/stateless:all_files",
-        "//tensorflow/contrib/summary:all_files",
-        "//tensorflow/contrib/tensor_forest:all_files",
-        "//tensorflow/contrib/tensor_forest/hybrid:all_files",
-        "//tensorflow/contrib/tensor_forest/kernels/v4:all_files",
-        "//tensorflow/contrib/tensor_forest/proto:all_files",
-        "//tensorflow/contrib/tensorboard:all_files",
-        "//tensorflow/contrib/tensorboard/db:all_files",
-        "//tensorflow/contrib/tensorrt:all_files",
-        "//tensorflow/contrib/testing:all_files",
-        "//tensorflow/contrib/text:all_files",
-        "//tensorflow/contrib/tfprof:all_files",
-        "//tensorflow/contrib/timeseries:all_files",
-        "//tensorflow/contrib/timeseries/examples:all_files",
-        "//tensorflow/contrib/timeseries/python/timeseries:all_files",
-        "//tensorflow/contrib/timeseries/python/timeseries/state_space_models:all_files",
-        "//tensorflow/contrib/tpu:all_files",
-        "//tensorflow/contrib/tpu/profiler:all_files",
-        "//tensorflow/contrib/tpu/proto:all_files",
-        "//tensorflow/contrib/training:all_files",
-        "//tensorflow/contrib/util:all_files",
-        "//tensorflow/contrib/verbs:all_files",
-        "//tensorflow/core:all_files",
-        "//tensorflow/core/api_def:all_files",
-        "//tensorflow/core/debug:all_files",
-        "//tensorflow/core/distributed_runtime:all_files",
-        "//tensorflow/core/distributed_runtime/rpc:all_files",
-        "//tensorflow/core/grappler:all_files",
-        "//tensorflow/core/grappler/clusters:all_files",
-        "//tensorflow/core/grappler/costs:all_files",
-        "//tensorflow/core/grappler/inputs:all_files",
-        "//tensorflow/core/grappler/optimizers:all_files",
-        "//tensorflow/core/grappler/utils:all_files",
-        "//tensorflow/core/kernels:all_files",
-        "//tensorflow/core/kernels/batching_util:all_files",
-        "//tensorflow/core/kernels/data:all_files",
-        "//tensorflow/core/kernels/data/sql:all_files",
-        "//tensorflow/core/kernels/fuzzing:all_files",
-        "//tensorflow/core/kernels/hexagon:all_files",
-        "//tensorflow/core/kernels/neon:all_files",
-        "//tensorflow/core/lib/db:all_files",
-        "//tensorflow/core/ops/compat:all_files",
-        "//tensorflow/core/platform/cloud:all_files",
-        "//tensorflow/core/platform/default/build_config:all_files",
-        "//tensorflow/core/platform/hadoop:all_files",
-        "//tensorflow/core/platform/s3:all_files",
-        "//tensorflow/core/profiler:all_files",
-        "//tensorflow/core/profiler/internal:all_files",
-        "//tensorflow/core/profiler/internal/advisor:all_files",
-        "//tensorflow/core/util/ctc:all_files",
-        "//tensorflow/core/util/tensor_bundle:all_files",
-        "//tensorflow/examples/adding_an_op:all_files",
-        "//tensorflow/examples/android:all_files",
-        "//tensorflow/examples/benchmark:all_files",
-        "//tensorflow/examples/get_started/regression:all_files",
-        "//tensorflow/examples/how_tos/reading_data:all_files",
-        "//tensorflow/examples/image_retraining:all_files",
-        "//tensorflow/examples/label_image:all_files",
-        "//tensorflow/examples/learn:all_files",
-        "//tensorflow/examples/multibox_detector:all_files",
-        "//tensorflow/examples/saved_model:all_files",
-        "//tensorflow/examples/speech_commands:all_files",
-        "//tensorflow/examples/tutorials/estimators:all_files",
-        "//tensorflow/examples/tutorials/layers:all_files",
-        "//tensorflow/examples/tutorials/mnist:all_files",
-        "//tensorflow/examples/tutorials/monitors:all_files",
-        "//tensorflow/examples/tutorials/word2vec:all_files",
-        "//tensorflow/examples/wav_to_spectrogram:all_files",
-        "//tensorflow/go:all_files",
-        "//tensorflow/java:all_files",
-        "//tensorflow/java/src/main/java/org/tensorflow/examples:all_files",
-        "//tensorflow/java/src/main/native:all_files",
-        "//tensorflow/python:all_files",
-        "//tensorflow/python/data:all_files",
-        "//tensorflow/python/data/kernel_tests:all_files",
-        "//tensorflow/python/data/ops:all_files",
-        "//tensorflow/python/data/util:all_files",
-        "//tensorflow/python/debug:all_files",
-        "//tensorflow/python/eager:all_files",
-        "//tensorflow/python/estimator:all_files",
-        "//tensorflow/python/feature_column:all_files",
-        "//tensorflow/python/keras:all_files",
-        "//tensorflow/python/kernel_tests:all_files",
-        "//tensorflow/python/kernel_tests/distributions:all_files",
-        "//tensorflow/python/kernel_tests/linalg:all_files",
-        "//tensorflow/python/kernel_tests/random:all_files",
-        "//tensorflow/python/ops/distributions:all_files",
-        "//tensorflow/python/ops/linalg:all_files",
-        "//tensorflow/python/ops/losses:all_files",
-        "//tensorflow/python/profiler:all_files",
-        "//tensorflow/python/profiler/internal:all_files",
-        "//tensorflow/python/saved_model:all_files",
-        "//tensorflow/python/tools:all_files",
-        "//tensorflow/tools/api/generator:all_files",
-        "//tensorflow/tools/api/golden:all_files",
-        "//tensorflow/tools/api/lib:all_files",
-        "//tensorflow/tools/api/tests:all_files",
-        "//tensorflow/tools/benchmark:all_files",
-        "//tensorflow/tools/build_info:all_files",
-        "//tensorflow/tools/ci_build/gpu_build:all_files",
-        "//tensorflow/tools/common:all_files",
-        "//tensorflow/tools/compatibility:all_files",
-        "//tensorflow/tools/dist_test/server:all_files",
-        "//tensorflow/tools/docker:all_files",
-        "//tensorflow/tools/docker/notebooks:all_files",
-        "//tensorflow/tools/docs:all_files",
-        "//tensorflow/tools/git:all_files",
-        "//tensorflow/tools/graph_transforms:all_files",
-        "//tensorflow/tools/mlpbtxt:all_files",
-        "//tensorflow/tools/proto_text:all_files",
-        "//tensorflow/tools/quantization:all_files",
-        "//tensorflow/tools/test:all_files",
-        "//tensorflow/user_ops:all_files",
-        "//third_party/eigen3:all_files",
-        "//third_party/fft2d:all_files",
-        "//third_party/flatbuffers:all_files",
-        "//third_party/hadoop:all_files",
-        "//third_party/sycl:all_files",
-        "//third_party/sycl/sycl:all_files",
-    ],
-    visibility = ["//visibility:public"],
-)
-
 load(
     "//third_party/mkl:build_defs.bzl",
     "if_mkl",
@@ -785,7 +496,7 @@ tf_cc_shared_object(
     linkopts = select({
         "//tensorflow:darwin": [
             "-Wl,-exported_symbols_list",  # This line must be directly followed by the exported_symbols.lds file
-            "//tensorflow/c:exported_symbols.lds",
+            "$(location //tensorflow/c:exported_symbols.lds)",
             "-Wl,-install_name,@rpath/libtensorflow.so",
         ],
         "//tensorflow:windows": [],
@@ -794,7 +505,7 @@ tf_cc_shared_object(
             "-z defs",
             "-s",
             "-Wl,--version-script",  #  This line must be directly followed by the version_script.lds file
-            "//tensorflow/c:version_script.lds",
+            "$(location //tensorflow/c:version_script.lds)",
         ],
     }),
     deps = [
@@ -812,7 +523,7 @@ tf_cc_shared_object(
     linkopts = select({
         "//tensorflow:darwin": [
             "-Wl,-exported_symbols_list",  # This line must be directly followed by the exported_symbols.lds file
-            "//tensorflow:tf_exported_symbols.lds",
+            "$(location //tensorflow:tf_exported_symbols.lds)",
         ],
         "//tensorflow:windows": [],
         "//tensorflow:windows_msvc": [],
@@ -820,7 +531,7 @@ tf_cc_shared_object(
             "-z defs",
             "-s",
             "-Wl,--version-script",  #  This line must be directly followed by the version_script.lds file
-            "//tensorflow:tf_version_script.lds",
+            "$(location //tensorflow:tf_version_script.lds)",
         ],
     }),
     deps = [
diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD
index 29ed957c9aa8cbe515f5f43bdccbf8c94f47c459..2367014cd02c721ea96581919c3efc96e772d9a6 100644
--- a/tensorflow/c/BUILD
+++ b/tensorflow/c/BUILD
@@ -34,6 +34,8 @@ filegroup(
         exclude = [
             "c_api_experimental.cc",
             "c_api_experimental.h",
+            "python_api.cc",
+            "python_api.h",
             "*test*",
         ],
     ),
@@ -116,6 +118,10 @@ tf_cuda_library(
         ":c_api",
         ":c_api_internal",
         "//tensorflow/compiler/jit/legacy_flags:mark_for_compilation_pass_flags",
+        "//tensorflow/contrib/tpu:all_ops",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
     ],
 )
@@ -212,6 +218,27 @@ tf_cuda_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "c_api_experimental_test",
+    size = "small",
+    srcs = ["c_api_experimental_test.cc"],
+    data = ["testdata/tf_record"],
+    linkopts = select({
+        "//tensorflow:darwin": ["-headerpad_max_install_names"],
+        "//conditions:default": [],
+    }),
+    # We must ensure that the dependencies can be dynamically linked since
+    # the shared library must be able to use core:framework.
+    # linkstatic = tf_kernel_tests_linkstatic(),
+    deps = [
+        ":c_api_experimental",
+        ":c_test_util",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 tf_cc_test(
     name = "c_api_function_test",
     size = "small",
@@ -256,20 +283,7 @@ tf_cuda_library(
     deps = [
         ":c_api",
         ":c_api_internal",
+        # TODO(b/74620627): remove when _USE_C_SHAPES is removed
+        "//tensorflow/python:cpp_shape_inference_proto_cc",
     ],
 )
-
-# -----------------------------------------------------------------------------
-# Google-internal targets.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc
index 778cb667e2c0015c6a768ecf3b12b82601764117..18eeb2816807ec9986999cfc2c9a4c0f032683c0 100644
--- a/tensorflow/c/c_api.cc
+++ b/tensorflow/c/c_api.cc
@@ -647,11 +647,11 @@ void RecordMutation(TF_Graph* graph, const TF_Operation& op,
   for (auto it : graph->sessions) {
     mutex_lock session_lock(it.first->mu);
     if (it.first->last_num_graph_nodes > op.node.id()) {
-      it.second = FailedPrecondition(
+      it.second = strings::StrCat(
           "Operation '", op.node.DebugString(), "' was changed by ",
           mutation_type,
-          " after it was run by a session. Nodes can be mutated "
-          "only before they are executed by a session. Either don't modify "
+          " after it was run by a session. This mutation will have no effect, "
+          "and will trigger an error in the future. Either don't modify "
           "nodes after running them or create a new session.");
     }
   }
@@ -722,10 +722,11 @@ bool ExtendSessionGraphHelper(TF_Session* session, TF_Status* status) {
     mutex_lock session_lock(session->mu);
     const Graph& graph = session->graph->graph;
 
-    status->status = session->graph->sessions[session];
-    if (!status->status.ok()) {
-      session->graph->mu.unlock();
-      return false;
+    const string& mutation_warning = session->graph->sessions[session];
+    if (!mutation_warning.empty()) {
+      // TODO(b/74949947): turn this back into an error status
+      LOG(WARNING) << mutation_warning;
+      session->graph->sessions[session].clear();
     }
 
     const auto num_nodes = graph.num_node_ids();
@@ -2475,7 +2476,7 @@ TF_Session* TF_NewSession(TF_Graph* graph, const TF_SessionOptions* opt,
     TF_Session* new_session = new TF_Session(session, graph);
     if (graph != nullptr) {
       mutex_lock l(graph->mu);
-      graph->sessions[new_session] = Status::OK();
+      graph->sessions[new_session] = "";
     }
     return new_session;
   } else {
@@ -2541,7 +2542,7 @@ TF_Session* TF_LoadSessionFromSavedModel(
 
   TF_Session* session = new TF_Session(bundle.session.release(), graph);
 
-  graph->sessions[session] = Status::OK();
+  graph->sessions[session] = "";
   session->last_num_graph_nodes = graph->graph.num_node_ids();
   return session;
 #endif  // __ANDROID__
diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h
index b32f574628c4d1dc5c3bb3f1265a1b12adee28bc..fe85f8ee0ed2c58c3ba9201a9ca895c9ec48c022 100644
--- a/tensorflow/c/c_api.h
+++ b/tensorflow/c/c_api.h
@@ -1496,7 +1496,8 @@ TF_CAPI_EXPORT extern int TF_DeviceListCount(const TF_DeviceList* list);
 // If index is out of bounds, an error code will be set in the status object,
 // and a null pointer will be returned.
 TF_CAPI_EXPORT extern const char* TF_DeviceListName(const TF_DeviceList* list,
-                                                    int index, TF_Status*);
+                                                    int index,
+                                                    TF_Status* status);
 
 // Retrieves the type of the device at the given index.
 //
@@ -1506,14 +1507,15 @@ TF_CAPI_EXPORT extern const char* TF_DeviceListName(const TF_DeviceList* list,
 // If index is out of bounds, an error code will be set in the status object,
 // and a null pointer will be returned.
 TF_CAPI_EXPORT extern const char* TF_DeviceListType(const TF_DeviceList* list,
-                                                    int index, TF_Status*);
+                                                    int index,
+                                                    TF_Status* status);
 
 // Retrieve the amount of memory associated with a given device.
 //
 // If index is out of bounds, an error code will be set in the status object,
 // and -1 will be returned.
 TF_CAPI_EXPORT extern int64_t TF_DeviceListMemoryBytes(
-    const TF_DeviceList* list, int index, TF_Status*);
+    const TF_DeviceList* list, int index, TF_Status* status);
 
 // --------------------------------------------------------------------------
 // Load plugins containing custom ops and kernels
diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc
index be7f85a5bb06dce84579b109d506ded049042b50..bea93785717e2161fcec941485ac3c3f7f3e3ed5 100644
--- a/tensorflow/c/c_api_experimental.cc
+++ b/tensorflow/c/c_api_experimental.cc
@@ -17,8 +17,26 @@ limitations under the License.
 
 #include "tensorflow/c/c_api_internal.h"
 #include "tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/protobuf/config.pb.h"
 
+using tensorflow::FunctionDef;
+using tensorflow::Node;
+using tensorflow::NodeBuilder;
+using tensorflow::Status;
+
+namespace {
+typedef std::unique_ptr<TF_Function, decltype(&TF_DeleteFunction)>
+    UniqueFuncPtr;
+}
+
+// struct TF_Operation { tensorflow::Node node; };
+static TF_Operation* ToTF_Operation(Node* node) {
+  return static_cast<TF_Operation*>(static_cast<void*>(node));
+}
+
 void TF_EnableXLACompilation(TF_SessionOptions* options, unsigned char enable) {
   tensorflow::ConfigProto& config = options->options.config;
   auto* optimizer_options =
@@ -37,3 +55,8340 @@ void TF_EnableXLACompilation(TF_SessionOptions* options, unsigned char enable) {
     optimizer_options->set_global_jit_level(tensorflow::OptimizerOptions::OFF);
   }
 }
+
+void TF_InitializeTPU(TF_Session* session, TF_Status* status) {
+  VLOG(1) << "Initializing TPU";
+  TF_Operation* config_op =
+      TF_GraphOperationByName(session->graph, "ConfigureDistributedTPU");
+  if (config_op == nullptr) {
+    status->status = tensorflow::errors::Internal(
+        "Unable to find node ConfigureDistributedTPU in the TF graph.");
+    return;
+  }
+
+  TF_Output config_node{config_op, 0};
+
+  TF_Tensor* dummy_output;
+  TF_SessionRun(session, /*run_options*/ nullptr,
+                // input related parameters
+                /*inputs*/ nullptr, /*input_values*/ nullptr, /*ninputs*/ 0,
+                // output related parameters
+                /*outputs*/ &config_node, /*output_values*/ &dummy_output,
+                /*noutputs*/ 1,
+                /*targets*/ nullptr, /*ntargets*/ 0,
+                /*run_metadata*/ nullptr, status);
+  if (status->status.ok()) {
+    TF_DeleteTensor(dummy_output);
+  }
+}
+
+void TF_ShutdownTPU(TF_Session* session, TF_Status* status) {
+  {
+    tensorflow::mutex_lock c(session->graph->mu);
+    VLOG(1) << "Shutting down TPU, with input graph: "
+            << session->graph->graph.ToGraphDefDebug().DebugString();
+  }
+
+  TF_Operation* shutdown_op =
+      TF_GraphOperationByName(session->graph, "ShutdownDistributedTPU");
+  if (shutdown_op == nullptr) {
+    status->status = tensorflow::errors::Internal(
+        "Unable to find node ShutdownDistributedTPU in the TF graph.");
+    return;
+  }
+
+  TF_SessionRun(session, /*run_options*/ nullptr,
+                // input related parameters
+                /*inputs*/ nullptr, /*input_values*/ nullptr, /*ninputs*/ 0,
+                // output related parameters
+                /*outputs*/ nullptr, /*output_values*/ nullptr,
+                /*noutputs*/ 0,
+                /*targets*/ &shutdown_op, /*ntargets*/ 1,
+                /*run_metadata*/ nullptr, status);
+}
+
+const char* TF_GraphDebugString(TF_Graph* graph, size_t* len) {
+  tensorflow::mutex_lock c(graph->mu);
+  const auto& debug_str = graph->graph.ToGraphDefDebug().DebugString();
+  *len = debug_str.size();
+  char* ret = static_cast<char*>(malloc(*len + 1));
+  memcpy(ret, debug_str.c_str(), *len + 1);
+  return ret;
+}
+
+// On success, returns a set of TF_Function instances from `text_proto` of
+// GraphDef type. These functions must be deleted by calling TF_DeleteFunction.
+//
+// If `mutate_proto_func` is non-NULL, run it over each FunctionDef proto,
+// before creating a TF_Function out of the possibly mutated proto.
+static std::vector<UniqueFuncPtr> CreateFunctionsFromTextProto(
+    const char* text_proto,
+    std::function<void(FunctionDef*)>* mutate_proto_func, TF_Status* status) {
+  tensorflow::GraphDef gdef;
+  if (!tensorflow::protobuf::TextFormat::ParseFromString(text_proto, &gdef)) {
+    status->status = tensorflow::errors::Internal(
+        "Invalid text proto for GraphDef: ", text_proto);
+    return {};
+  }
+  const auto& fdef_lib = gdef.library();
+  if (fdef_lib.gradient_size() > 0) {
+    status->status = tensorflow::errors::Internal(
+        "GradientDef is not supported in reading Dataset related functions: ",
+        text_proto);
+    return {};
+  }
+  std::vector<UniqueFuncPtr> ret;
+  for (const FunctionDef& fdef : fdef_lib.function()) {
+    // Make a copy so that we can mutate it.
+    FunctionDef fdef_to_load = fdef;
+    if (mutate_proto_func) {
+      (*mutate_proto_func)(&fdef_to_load);
+    }
+    VLOG(1) << "Adding func to graph: " << fdef_to_load.DebugString();
+    std::vector<char> binary_proto_buf(fdef_to_load.ByteSizeLong());
+    fdef_to_load.SerializeToArray(binary_proto_buf.data(),
+                                  binary_proto_buf.size());
+    TF_Function* func = TF_FunctionImportFunctionDef(
+        binary_proto_buf.data(), binary_proto_buf.size(), status);
+    if (!status->status.ok()) return {};
+    ret.push_back(UniqueFuncPtr(func, TF_DeleteFunction));
+  }
+  return ret;
+}
+
+//  On success, returns a newly created TF_Function instance encoding a dataset
+//  node stack that returns a sequence of 3 floats, and sets `dataset_name` to
+//  the created dataset name. The returned function must be deleted by calling
+//  TF_DeleteFunction.
+static UniqueFuncPtr CreateFakeDatasetFunction(std::string* dataset_name,
+                                               TF_Status* status) {
+  const char* func_def = R"PREFIX(
+library {
+  function {
+    signature {
+      name: "_make_dataset_d8de2712"
+      output_arg {
+        name: "TensorSliceDataset"
+        type: DT_VARIANT
+      }
+      is_stateful: true
+    }
+    node_def {
+      name: "TensorSliceDataset/tensors/component_0"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_FLOAT
+            tensor_shape {
+              dim {
+                size: 3
+              }
+            }
+       tensor_content: "\000\000(B\000\000,B\000\0000B"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "TensorSliceDataset"
+      op: "TensorSliceDataset"
+      input: "TensorSliceDataset/tensors/component_0:output:0"
+      attr {
+        key: "Toutput_types"
+        value {
+          list {
+            type: DT_FLOAT
+          }
+        }
+      }
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+            }
+          }
+        }
+      }
+    }
+    ret {
+      key: "TensorSliceDataset"
+      value: "TensorSliceDataset:handle:0"
+    }
+  }
+}
+)PREFIX";
+
+  *dataset_name = "_make_dataset_d8de2712";
+  auto functions = CreateFunctionsFromTextProto(
+      func_def, /*mutate_proto_func*/ nullptr, status);
+  DCHECK_EQ(functions.size(), 1);
+  return std::move(functions[0]);
+}
+
+//  On success, returns a set of TF_Function instances encoding a dataset
+//  node stack that reads a Imagenet TFRecordFile dataset from `file_path`, and
+//  sets `dataset_name` to the created dataset name. The returned functions must
+//  be deleted by calling TF_DeleteFunction.
+static std::vector<UniqueFuncPtr> CreateImagenetDatasetFunctions(
+    const char* file_path, std::string* dataset_name, TF_Status* status) {
+  const char* func_def = R"PREFIX(
+library {
+  function {
+    signature {
+      name: "tf_map_func_91295dea"
+      input_arg {
+        name: "arg0"
+        type: DT_STRING
+      }
+      output_arg {
+        name: "FlatMapDataset"
+        type: DT_VARIANT
+      }
+      description: "A wrapper for Defun that facilitates shape inference."
+      is_stateful: true
+    }
+    node_def {
+      name: "flat_filenames/shape"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: -1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "flat_filenames"
+      op: "Reshape"
+      input: "arg0"
+      input: "flat_filenames/shape:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "Tshape"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "TensorSliceDataset"
+      op: "TensorSliceDataset"
+      input: "flat_filenames:output:0"
+      attr {
+        key: "Toutput_types"
+        value {
+          list {
+            type: DT_STRING
+          }
+        }
+      }
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+            }
+          }
+        }
+      }
+    }
+    node_def {
+      name: "FlatMapDataset"
+      op: "FlatMapDataset"
+      input: "TensorSliceDataset:handle:0"
+      attr {
+        key: "Targuments"
+        value {
+          list {
+          }
+        }
+      }
+      attr {
+        key: "f"
+        value {
+          func {
+            name: "tf_map_func_0cc8c35b"
+          }
+        }
+      }
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+            }
+          }
+        }
+      }
+      attr {
+        key: "output_types"
+        value {
+          list {
+            type: DT_STRING
+          }
+        }
+      }
+    }
+    ret {
+      key: "FlatMapDataset"
+      value: "FlatMapDataset:handle:0"
+    }
+  }
+  function {
+    signature {
+      name: "tf_map_func_0cc8c35b"
+      input_arg {
+        name: "arg0"
+        type: DT_STRING
+      }
+      output_arg {
+        name: "TFRecordDataset"
+        type: DT_VARIANT
+      }
+      description: "A wrapper for Defun that facilitates shape inference."
+      is_stateful: true
+    }
+    node_def {
+      name: "compression_type"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: ""
+          }
+        }
+      }
+    }
+    node_def {
+      name: "buffer_size"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 8388608
+          }
+        }
+      }
+    }
+    node_def {
+      name: "TFRecordDataset"
+      op: "TFRecordDataset"
+      input: "arg0"
+      input: "compression_type:output:0"
+      input: "buffer_size:output:0"
+    }
+    ret {
+      key: "TFRecordDataset"
+      value: "TFRecordDataset:handle:0"
+    }
+  }
+  function {
+    signature {
+      name: "tf_map_func_74b6b15c"
+      input_arg {
+        name: "arg0"
+        type: DT_STRING
+      }
+      output_arg {
+        name: "Reshape_1"
+        type: DT_FLOAT
+      }
+      output_arg {
+        name: "sub_1"
+        type: DT_INT32
+      }
+      description: "A wrapper for Defun that facilitates shape inference."
+      is_stateful: true
+    }
+    node_def {
+      name: "ParseSingleExample/key_image/class/label"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: -1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ParseSingleExample/Reshape/shape"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+              }
+            }
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ParseSingleExample/Reshape"
+      op: "Reshape"
+      input: "ParseSingleExample/key_image/class/label:output:0"
+      input: "ParseSingleExample/Reshape/shape:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "Tshape"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "ParseSingleExample/key_image/class/text"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: ""
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ParseSingleExample/Reshape_1/shape"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+              }
+            }
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ParseSingleExample/Reshape_1"
+      op: "Reshape"
+      input: "ParseSingleExample/key_image/class/text:output:0"
+      input: "ParseSingleExample/Reshape_1/shape:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "Tshape"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "ParseSingleExample/key_image/encoded"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: ""
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ParseSingleExample/Reshape_2/shape"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+              }
+            }
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ParseSingleExample/Reshape_2"
+      op: "Reshape"
+      input: "ParseSingleExample/key_image/encoded:output:0"
+      input: "ParseSingleExample/Reshape_2/shape:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "Tshape"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "ParseSingleExample/key_image/format"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: "jpeg"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ParseSingleExample/Reshape_3/shape"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+              }
+            }
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ParseSingleExample/Reshape_3"
+      op: "Reshape"
+      input: "ParseSingleExample/key_image/format:output:0"
+      input: "ParseSingleExample/Reshape_3/shape:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "Tshape"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "ParseSingleExample/ParseSingleExample"
+      op: "ParseSingleExample"
+      input: "arg0"
+      input: "ParseSingleExample/Reshape:output:0"
+      input: "ParseSingleExample/Reshape_1:output:0"
+      input: "ParseSingleExample/Reshape_2:output:0"
+      input: "ParseSingleExample/Reshape_3:output:0"
+      attr {
+        key: "Tdense"
+        value {
+          list {
+            type: DT_INT64
+            type: DT_STRING
+            type: DT_STRING
+            type: DT_STRING
+          }
+        }
+      }
+      attr {
+        key: "dense_keys"
+        value {
+          list {
+            s: "image/class/label"
+            s: "image/class/text"
+            s: "image/encoded"
+            s: "image/format"
+          }
+        }
+      }
+      attr {
+        key: "dense_shapes"
+        value {
+          list {
+            shape {
+            }
+            shape {
+            }
+            shape {
+            }
+            shape {
+            }
+          }
+        }
+      }
+      attr {
+        key: "num_sparse"
+        value {
+          i: 5
+        }
+      }
+      attr {
+        key: "sparse_keys"
+        value {
+          list {
+            s: "image/object/bbox/xmax"
+            s: "image/object/bbox/xmin"
+            s: "image/object/bbox/ymax"
+            s: "image/object/bbox/ymin"
+            s: "image/object/class/label"
+          }
+        }
+      }
+      attr {
+        key: "sparse_types"
+        value {
+          list {
+            type: DT_FLOAT
+            type: DT_FLOAT
+            type: DT_FLOAT
+            type: DT_FLOAT
+            type: DT_INT64
+          }
+        }
+      }
+    }
+    node_def {
+      name: "Reshape/shape"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+              }
+            }
+          }
+        }
+      }
+    }
+    node_def {
+      name: "Reshape"
+      op: "Reshape"
+      input: "ParseSingleExample/ParseSingleExample:dense_values:2"
+      input: "Reshape/shape:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "Tshape"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/Substr/pos"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/Substr/len"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 3
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/Substr"
+      op: "Substr"
+      input: "Reshape:output:0"
+      input: "decode_image/Substr/pos:output:0"
+      input: "decode_image/Substr/len:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/is_jpeg/Substr/pos"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/is_jpeg/Substr/len"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 3
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/is_jpeg/Substr"
+      op: "Substr"
+      input: "Reshape:output:0"
+      input: "decode_image/is_jpeg/Substr/pos:output:0"
+      input: "decode_image/is_jpeg/Substr/len:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/is_jpeg/Equal/y"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: "\377\330\377"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/is_jpeg/Equal"
+      op: "Equal"
+      input: "decode_image/is_jpeg/Substr:output:0"
+      input: "decode_image/is_jpeg/Equal/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_STRING
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/Switch"
+      op: "Switch"
+      input: "decode_image/is_jpeg/Equal:z:0"
+      input: "decode_image/is_jpeg/Equal:z:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/switch_t"
+      op: "Identity"
+      input: "decode_image/cond_jpeg/Switch:output_true:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/switch_f"
+      op: "Identity"
+      input: "decode_image/cond_jpeg/Switch:output_false:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/pred_id"
+      op: "Identity"
+      input: "decode_image/is_jpeg/Equal:z:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/check_jpeg_channels/x"
+      op: "Const"
+      input: "^decode_image/cond_jpeg/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 3
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/check_jpeg_channels/y"
+      op: "Const"
+      input: "^decode_image/cond_jpeg/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 4
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/check_jpeg_channels"
+      op: "NotEqual"
+      input: "decode_image/cond_jpeg/check_jpeg_channels/x:output:0"
+      input: "decode_image/cond_jpeg/check_jpeg_channels/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/Assert/Const"
+      op: "Const"
+      input: "^decode_image/cond_jpeg/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: "Channels must be in (None, 0, 1, 3) when decoding JPEG images"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/Assert/Assert/data_0"
+      op: "Const"
+      input: "^decode_image/cond_jpeg/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: "Channels must be in (None, 0, 1, 3) when decoding JPEG images"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/Assert/Assert"
+      op: "Assert"
+      input: "decode_image/cond_jpeg/check_jpeg_channels:z:0"
+      input: "decode_image/cond_jpeg/Assert/Assert/data_0:output:0"
+      attr {
+        key: "T"
+        value {
+          list {
+            type: DT_STRING
+          }
+        }
+      }
+      attr {
+        key: "summarize"
+        value {
+          i: 3
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/DecodeJpeg"
+      op: "DecodeJpeg"
+      input: "decode_image/cond_jpeg/DecodeJpeg/Switch:output_true:0"
+      input: "^decode_image/cond_jpeg/Assert/Assert"
+      attr {
+        key: "acceptable_fraction"
+        value {
+          f: 1.0
+        }
+      }
+      attr {
+        key: "channels"
+        value {
+          i: 3
+        }
+      }
+      attr {
+        key: "dct_method"
+        value {
+          s: ""
+        }
+      }
+      attr {
+        key: "fancy_upscaling"
+        value {
+          b: true
+        }
+      }
+      attr {
+        key: "ratio"
+        value {
+          i: 1
+        }
+      }
+      attr {
+        key: "try_recover_truncated"
+        value {
+          b: false
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/DecodeJpeg/Switch"
+      op: "Switch"
+      input: "Reshape:output:0"
+      input: "decode_image/cond_jpeg/pred_id:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "_class"
+        value {
+          list {
+            s: "loc:@Reshape"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/is_png/y"
+      op: "Const"
+      input: "^decode_image/cond_jpeg/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: "\211PN"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/is_png"
+      op: "Equal"
+      input: "decode_image/cond_jpeg/is_png/Switch:output_false:0"
+      input: "decode_image/cond_jpeg/is_png/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_STRING
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/is_png/Switch"
+      op: "Switch"
+      input: "decode_image/Substr:output:0"
+      input: "decode_image/cond_jpeg/pred_id:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "_class"
+        value {
+          list {
+            s: "loc:@decode_image/Substr"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/Switch"
+      op: "Switch"
+      input: "decode_image/cond_jpeg/is_png:z:0"
+      input: "decode_image/cond_jpeg/is_png:z:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/switch_t"
+      op: "Identity"
+      input: "decode_image/cond_jpeg/cond_png/Switch:output_true:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/switch_f"
+      op: "Identity"
+      input: "decode_image/cond_jpeg/cond_png/Switch:output_false:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/pred_id"
+      op: "Identity"
+      input: "decode_image/cond_jpeg/is_png:z:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/DecodePng"
+      op: "DecodePng"
+      input: "decode_image/cond_jpeg/cond_png/DecodePng/Switch_1:output_true:0"
+      attr {
+        key: "channels"
+        value {
+          i: 3
+        }
+      }
+      attr {
+        key: "dtype"
+        value {
+          type: DT_UINT8
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/DecodePng/Switch"
+      op: "Switch"
+      input: "Reshape:output:0"
+      input: "decode_image/cond_jpeg/pred_id:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "_class"
+        value {
+          list {
+            s: "loc:@Reshape"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/DecodePng/Switch_1"
+      op: "Switch"
+      input: "decode_image/cond_jpeg/cond_png/DecodePng/Switch:output_false:0"
+      input: "decode_image/cond_jpeg/cond_png/pred_id:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "_class"
+        value {
+          list {
+            s: "loc:@Reshape"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/is_gif/y"
+      op: "Const"
+      input: "^decode_image/cond_jpeg/cond_png/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: "GIF"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/is_gif"
+      op: "Equal"
+      input: "decode_image/cond_jpeg/cond_png/is_gif/Switch:output_false:0"
+      input: "decode_image/cond_jpeg/cond_png/is_gif/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_STRING
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/is_gif/Switch"
+      op: "Switch"
+      input: "decode_image/cond_jpeg/is_png/Switch:output_false:0"
+      input: "decode_image/cond_jpeg/cond_png/pred_id:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "_class"
+        value {
+          list {
+            s: "loc:@decode_image/Substr"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/Switch"
+      op: "Switch"
+      input: "decode_image/cond_jpeg/cond_png/is_gif:z:0"
+      input: "decode_image/cond_jpeg/cond_png/is_gif:z:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/switch_t"
+      op: "Identity"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/Switch:output_true:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/switch_f"
+      op: "Identity"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/Switch:output_false:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/pred_id"
+      op: "Identity"
+      input: "decode_image/cond_jpeg/cond_png/is_gif:z:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/check_gif_channels/x"
+      op: "Const"
+      input: "^decode_image/cond_jpeg/cond_png/cond_gif/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 3
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/check_gif_channels/y"
+      op: "Const"
+      input: "^decode_image/cond_jpeg/cond_png/cond_gif/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/check_gif_channels"
+      op: "NotEqual"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/check_gif_channels/x:output:0"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/check_gif_channels/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/check_gif_channels_1/x"
+      op: "Const"
+      input: "^decode_image/cond_jpeg/cond_png/cond_gif/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 3
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/check_gif_channels_1/y"
+      op: "Const"
+      input: "^decode_image/cond_jpeg/cond_png/cond_gif/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 4
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/check_gif_channels_1"
+      op: "NotEqual"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/check_gif_channels_1/x:output:0"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/check_gif_channels_1/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/LogicalAnd"
+      op: "LogicalAnd"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/check_gif_channels:z:0"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/check_gif_channels_1:z:0"
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/Assert/Const"
+      op: "Const"
+      input: "^decode_image/cond_jpeg/cond_png/cond_gif/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: "Channels must be in (None, 0, 3) when decoding GIF images"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/Assert/Assert/data_0"
+      op: "Const"
+      input: "^decode_image/cond_jpeg/cond_png/cond_gif/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: "Channels must be in (None, 0, 3) when decoding GIF images"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/Assert/Assert"
+      op: "Assert"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/LogicalAnd:z:0"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/Assert/Assert/data_0:output:0"
+      attr {
+        key: "T"
+        value {
+          list {
+            type: DT_STRING
+          }
+        }
+      }
+      attr {
+        key: "summarize"
+        value {
+          i: 3
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/DecodeGif"
+      op: "DecodeGif"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/DecodeGif/Switch_1:output_true:0"
+      input: "^decode_image/cond_jpeg/cond_png/cond_gif/Assert/Assert"
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/DecodeGif/Switch"
+      op: "Switch"
+      input: "decode_image/cond_jpeg/cond_png/DecodePng/Switch:output_false:0"
+      input: "decode_image/cond_jpeg/cond_png/pred_id:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "_class"
+        value {
+          list {
+            s: "loc:@Reshape"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/DecodeGif/Switch_1"
+      op: "Switch"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/DecodeGif/Switch:output_false:0"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/pred_id:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "_class"
+        value {
+          list {
+            s: "loc:@Reshape"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/Substr/pos"
+      op: "Const"
+      input: "^decode_image/cond_jpeg/cond_png/cond_gif/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/Substr/len"
+      op: "Const"
+      input: "^decode_image/cond_jpeg/cond_png/cond_gif/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 2
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/Substr"
+      op: "Substr"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/Substr/Switch:output_false:0"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/Substr/pos:output:0"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/Substr/len:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/Substr/Switch"
+      op: "Switch"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/DecodeGif/Switch:output_false:0"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/pred_id:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "_class"
+        value {
+          list {
+            s: "loc:@Reshape"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/is_bmp/y"
+      op: "Const"
+      input: "^decode_image/cond_jpeg/cond_png/cond_gif/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: "BM"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/is_bmp"
+      op: "Equal"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/Substr:output:0"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/is_bmp/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_STRING
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/Assert_1/Const"
+      op: "Const"
+      input: "^decode_image/cond_jpeg/cond_png/cond_gif/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: "Unable to decode bytes as JPEG, PNG, GIF, or BMP"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/Assert_1/Assert/data_0"
+      op: "Const"
+      input: "^decode_image/cond_jpeg/cond_png/cond_gif/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: "Unable to decode bytes as JPEG, PNG, GIF, or BMP"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/Assert_1/Assert"
+      op: "Assert"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/is_bmp:z:0"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/Assert_1/Assert/data_0:output:0"
+      attr {
+        key: "T"
+        value {
+          list {
+            type: DT_STRING
+          }
+        }
+      }
+      attr {
+        key: "summarize"
+        value {
+          i: 3
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/check_channels/x"
+      op: "Const"
+      input: "^decode_image/cond_jpeg/cond_png/cond_gif/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 3
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/check_channels/y"
+      op: "Const"
+      input: "^decode_image/cond_jpeg/cond_png/cond_gif/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/check_channels"
+      op: "NotEqual"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/check_channels/x:output:0"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/check_channels/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/Assert_2/Const"
+      op: "Const"
+      input: "^decode_image/cond_jpeg/cond_png/cond_gif/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: "Channels must be in (None, 0, 3) when decoding BMP images"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/Assert_2/Assert/data_0"
+      op: "Const"
+      input: "^decode_image/cond_jpeg/cond_png/cond_gif/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: "Channels must be in (None, 0, 3) when decoding BMP images"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/Assert_2/Assert"
+      op: "Assert"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/check_channels:z:0"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/Assert_2/Assert/data_0:output:0"
+      attr {
+        key: "T"
+        value {
+          list {
+            type: DT_STRING
+          }
+        }
+      }
+      attr {
+        key: "summarize"
+        value {
+          i: 3
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/DecodeBmp"
+      op: "DecodeBmp"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/Substr/Switch:output_false:0"
+      input: "^decode_image/cond_jpeg/cond_png/cond_gif/Assert_1/Assert"
+      input: "^decode_image/cond_jpeg/cond_png/cond_gif/Assert_2/Assert"
+      attr {
+        key: "channels"
+        value {
+          i: 0
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/cond_gif/Merge"
+      op: "Merge"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/DecodeBmp:image:0"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/DecodeGif:image:0"
+      attr {
+        key: "N"
+        value {
+          i: 2
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_UINT8
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/cond_png/Merge"
+      op: "Merge"
+      input: "decode_image/cond_jpeg/cond_png/cond_gif/Merge:output:0"
+      input: "decode_image/cond_jpeg/cond_png/DecodePng:image:0"
+      attr {
+        key: "N"
+        value {
+          i: 2
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_UINT8
+        }
+      }
+    }
+    node_def {
+      name: "decode_image/cond_jpeg/Merge"
+      op: "Merge"
+      input: "decode_image/cond_jpeg/cond_png/Merge:output:0"
+      input: "decode_image/cond_jpeg/DecodeJpeg:image:0"
+      attr {
+        key: "N"
+        value {
+          i: 2
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_UINT8
+        }
+      }
+    }
+    node_def {
+      name: "convert_image/Cast"
+      op: "Cast"
+      input: "decode_image/cond_jpeg/Merge:output:0"
+      attr {
+        key: "DstT"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "SrcT"
+        value {
+          type: DT_UINT8
+        }
+      }
+    }
+    node_def {
+      name: "convert_image/y"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_FLOAT
+            tensor_shape {
+            }
+            float_val: 0.00392156885937
+          }
+        }
+      }
+    }
+    node_def {
+      name: "convert_image"
+      op: "Mul"
+      input: "convert_image/Cast:y:0"
+      input: "convert_image/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+    }
+    node_def {
+      name: "Const"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_FLOAT
+            tensor_shape {
+              dim {
+                size: 1
+              }
+              dim {
+                size: 1
+              }
+              dim {
+                size: 4
+              }
+            }
+            tensor_content: "\000\000\000\000\000\000\000\000\000\000\200?\000\000\200?"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "distorted_bounding_box_crop/Shape"
+      op: "Shape"
+      input: "convert_image:z:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "out_type"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "distorted_bounding_box_crop/sample_distorted_bounding_box/SampleDistortedBoundingBoxV2/min_object_covered"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_FLOAT
+            tensor_shape {
+            }
+            float_val: 0.10000000149
+          }
+        }
+      }
+    }
+    node_def {
+      name: "distorted_bounding_box_crop/sample_distorted_bounding_box/SampleDistortedBoundingBoxV2"
+      op: "SampleDistortedBoundingBoxV2"
+      input: "distorted_bounding_box_crop/Shape:output:0"
+      input: "Const:output:0"
+      input: "distorted_bounding_box_crop/sample_distorted_bounding_box/SampleDistortedBoundingBoxV2/min_object_covered:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "area_range"
+        value {
+          list {
+            f: 0.0799999982119
+            f: 1.0
+          }
+        }
+      }
+      attr {
+        key: "aspect_ratio_range"
+        value {
+          list {
+            f: 0.75
+            f: 1.33333337307
+          }
+        }
+      }
+      attr {
+        key: "max_attempts"
+        value {
+          i: 1
+        }
+      }
+      attr {
+        key: "seed"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "seed2"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "use_image_if_no_bounding_boxes"
+        value {
+          b: true
+        }
+      }
+    }
+    node_def {
+      name: "distorted_bounding_box_crop/Slice"
+      op: "Slice"
+      input: "convert_image:z:0"
+      input: "distorted_bounding_box_crop/sample_distorted_bounding_box/SampleDistortedBoundingBoxV2:begin:0"
+      input: "distorted_bounding_box_crop/sample_distorted_bounding_box/SampleDistortedBoundingBoxV2:size:0"
+      attr {
+        key: "Index"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+    }
+    node_def {
+      name: "Shape"
+      op: "Shape"
+      input: "convert_image:z:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "out_type"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "Shape_1"
+      op: "Shape"
+      input: "distorted_bounding_box_crop/Slice:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "out_type"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "Equal"
+      op: "Equal"
+      input: "Shape:output:0"
+      input: "Shape_1:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "Cast"
+      op: "Cast"
+      input: "Equal:z:0"
+      attr {
+        key: "DstT"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "SrcT"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "Const_1"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "Sum"
+      op: "Sum"
+      input: "Cast:y:0"
+      input: "Const_1:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "Tidx"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "keep_dims"
+        value {
+          b: false
+        }
+      }
+    }
+    node_def {
+      name: "GreaterEqual/y"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 3
+          }
+        }
+      }
+    }
+    node_def {
+      name: "GreaterEqual"
+      op: "GreaterEqual"
+      input: "Sum:output:0"
+      input: "GreaterEqual/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "cond/Switch"
+      op: "Switch"
+      input: "GreaterEqual:z:0"
+      input: "GreaterEqual:z:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "cond/switch_t"
+      op: "Identity"
+      input: "cond/Switch:output_true:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "cond/switch_f"
+      op: "Identity"
+      input: "cond/Switch:output_false:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "cond/pred_id"
+      op: "Identity"
+      input: "GreaterEqual:z:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "cond/Shape"
+      op: "Shape"
+      input: "cond/Shape/Switch:output_true:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "out_type"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "cond/Shape/Switch"
+      op: "Switch"
+      input: "convert_image:z:0"
+      input: "cond/pred_id:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "_class"
+        value {
+          list {
+            s: "loc:@convert_image"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/Cast"
+      op: "Cast"
+      input: "cond/Shape:output:0"
+      attr {
+        key: "DstT"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "SrcT"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice/stack"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice/stack_1"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice/stack_2"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice"
+      op: "StridedSlice"
+      input: "cond/Cast:y:0"
+      input: "cond/strided_slice/stack:output:0"
+      input: "cond/strided_slice/stack_1:output:0"
+      input: "cond/strided_slice/stack_2:output:0"
+      attr {
+        key: "Index"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "begin_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "ellipsis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "end_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "new_axis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "shrink_axis_mask"
+        value {
+          i: 1
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_1/stack"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_1/stack_1"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 2
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_1/stack_2"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_1"
+      op: "StridedSlice"
+      input: "cond/Cast:y:0"
+      input: "cond/strided_slice_1/stack:output:0"
+      input: "cond/strided_slice_1/stack_1:output:0"
+      input: "cond/strided_slice_1/stack_2:output:0"
+      attr {
+        key: "Index"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "begin_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "ellipsis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "end_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "new_axis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "shrink_axis_mask"
+        value {
+          i: 1
+        }
+      }
+    }
+    node_def {
+      name: "cond/Greater"
+      op: "Greater"
+      input: "cond/strided_slice:output:0"
+      input: "cond/strided_slice_1:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/Switch"
+      op: "Switch"
+      input: "cond/Greater:z:0"
+      input: "cond/Greater:z:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/switch_t"
+      op: "Identity"
+      input: "cond/cond/Switch:output_true:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/switch_f"
+      op: "Identity"
+      input: "cond/cond/Switch:output_false:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/pred_id"
+      op: "Identity"
+      input: "cond/Greater:z:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/strided_slice/stack"
+      op: "Const"
+      input: "^cond/cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/strided_slice/stack_1"
+      op: "Const"
+      input: "^cond/cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/strided_slice/stack_2"
+      op: "Const"
+      input: "^cond/cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/strided_slice"
+      op: "StridedSlice"
+      input: "cond/cond/strided_slice/Switch:output_true:0"
+      input: "cond/cond/strided_slice/stack:output:0"
+      input: "cond/cond/strided_slice/stack_1:output:0"
+      input: "cond/cond/strided_slice/stack_2:output:0"
+      attr {
+        key: "Index"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "begin_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "ellipsis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "end_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "new_axis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "shrink_axis_mask"
+        value {
+          i: 1
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/strided_slice/Switch"
+      op: "Switch"
+      input: "cond/Cast:y:0"
+      input: "cond/cond/pred_id:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "_class"
+        value {
+          list {
+            s: "loc:@cond/Cast"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/strided_slice_1/stack"
+      op: "Const"
+      input: "^cond/cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/strided_slice_1/stack_1"
+      op: "Const"
+      input: "^cond/cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 2
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/strided_slice_1/stack_2"
+      op: "Const"
+      input: "^cond/cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/strided_slice_1"
+      op: "StridedSlice"
+      input: "cond/cond/strided_slice/Switch:output_true:0"
+      input: "cond/cond/strided_slice_1/stack:output:0"
+      input: "cond/cond/strided_slice_1/stack_1:output:0"
+      input: "cond/cond/strided_slice_1/stack_2:output:0"
+      attr {
+        key: "Index"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "begin_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "ellipsis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "end_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "new_axis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "shrink_axis_mask"
+        value {
+          i: 1
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/truediv"
+      op: "RealDiv"
+      input: "cond/cond/strided_slice:output:0"
+      input: "cond/cond/strided_slice_1:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/mul/y"
+      op: "Const"
+      input: "^cond/cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_FLOAT
+            tensor_shape {
+            }
+            float_val: 224.0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/mul"
+      op: "Mul"
+      input: "cond/cond/truediv:z:0"
+      input: "cond/cond/mul/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/Cast/x/1"
+      op: "Const"
+      input: "^cond/cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_FLOAT
+            tensor_shape {
+            }
+            float_val: 224.0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/Cast/x"
+      op: "Pack"
+      input: "cond/cond/mul:z:0"
+      input: "cond/cond/Cast/x/1:output:0"
+      attr {
+        key: "N"
+        value {
+          i: 2
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "axis"
+        value {
+          i: 0
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/Cast"
+      op: "Cast"
+      input: "cond/cond/Cast/x:output:0"
+      attr {
+        key: "DstT"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "SrcT"
+        value {
+          type: DT_FLOAT
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/strided_slice_2/stack"
+      op: "Const"
+      input: "^cond/cond/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/strided_slice_2/stack_1"
+      op: "Const"
+      input: "^cond/cond/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 2
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/strided_slice_2/stack_2"
+      op: "Const"
+      input: "^cond/cond/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/strided_slice_2"
+      op: "StridedSlice"
+      input: "cond/cond/strided_slice_2/Switch:output_false:0"
+      input: "cond/cond/strided_slice_2/stack:output:0"
+      input: "cond/cond/strided_slice_2/stack_1:output:0"
+      input: "cond/cond/strided_slice_2/stack_2:output:0"
+      attr {
+        key: "Index"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "begin_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "ellipsis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "end_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "new_axis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "shrink_axis_mask"
+        value {
+          i: 1
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/strided_slice_2/Switch"
+      op: "Switch"
+      input: "cond/Cast:y:0"
+      input: "cond/cond/pred_id:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "_class"
+        value {
+          list {
+            s: "loc:@cond/Cast"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/strided_slice_3/stack"
+      op: "Const"
+      input: "^cond/cond/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/strided_slice_3/stack_1"
+      op: "Const"
+      input: "^cond/cond/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/strided_slice_3/stack_2"
+      op: "Const"
+      input: "^cond/cond/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/strided_slice_3"
+      op: "StridedSlice"
+      input: "cond/cond/strided_slice_2/Switch:output_false:0"
+      input: "cond/cond/strided_slice_3/stack:output:0"
+      input: "cond/cond/strided_slice_3/stack_1:output:0"
+      input: "cond/cond/strided_slice_3/stack_2:output:0"
+      attr {
+        key: "Index"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "begin_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "ellipsis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "end_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "new_axis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "shrink_axis_mask"
+        value {
+          i: 1
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/truediv_1"
+      op: "RealDiv"
+      input: "cond/cond/strided_slice_2:output:0"
+      input: "cond/cond/strided_slice_3:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/mul_1/y"
+      op: "Const"
+      input: "^cond/cond/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_FLOAT
+            tensor_shape {
+            }
+            float_val: 224.0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/mul_1"
+      op: "Mul"
+      input: "cond/cond/truediv_1:z:0"
+      input: "cond/cond/mul_1/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/Cast_1/x/0"
+      op: "Const"
+      input: "^cond/cond/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_FLOAT
+            tensor_shape {
+            }
+            float_val: 224.0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/Cast_1/x"
+      op: "Pack"
+      input: "cond/cond/Cast_1/x/0:output:0"
+      input: "cond/cond/mul_1:z:0"
+      attr {
+        key: "N"
+        value {
+          i: 2
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "axis"
+        value {
+          i: 0
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/Cast_1"
+      op: "Cast"
+      input: "cond/cond/Cast_1/x:output:0"
+      attr {
+        key: "DstT"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "SrcT"
+        value {
+          type: DT_FLOAT
+        }
+      }
+    }
+    node_def {
+      name: "cond/cond/Merge"
+      op: "Merge"
+      input: "cond/cond/Cast_1:y:0"
+      input: "cond/cond/Cast:y:0"
+      attr {
+        key: "N"
+        value {
+          i: 2
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "cond/ResizeBicubic/images"
+      op: "Pack"
+      input: "cond/Shape/Switch:output_true:0"
+      attr {
+        key: "N"
+        value {
+          i: 1
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "axis"
+        value {
+          i: 0
+        }
+      }
+    }
+    node_def {
+      name: "cond/ResizeBicubic"
+      op: "ResizeBicubic"
+      input: "cond/ResizeBicubic/images:output:0"
+      input: "cond/cond/Merge:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "align_corners"
+        value {
+          b: false
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_2/stack"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_2/stack_1"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_2/stack_2"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_2"
+      op: "StridedSlice"
+      input: "cond/ResizeBicubic:resized_images:0"
+      input: "cond/strided_slice_2/stack:output:0"
+      input: "cond/strided_slice_2/stack_1:output:0"
+      input: "cond/strided_slice_2/stack_2:output:0"
+      attr {
+        key: "Index"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "begin_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "ellipsis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "end_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "new_axis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "shrink_axis_mask"
+        value {
+          i: 1
+        }
+      }
+    }
+    node_def {
+      name: "cond/Shape_1"
+      op: "Shape"
+      input: "cond/strided_slice_2:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "out_type"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_3/stack"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_3/stack_1"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_3/stack_2"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_3"
+      op: "StridedSlice"
+      input: "cond/Shape_1:output:0"
+      input: "cond/strided_slice_3/stack:output:0"
+      input: "cond/strided_slice_3/stack_1:output:0"
+      input: "cond/strided_slice_3/stack_2:output:0"
+      attr {
+        key: "Index"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "begin_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "ellipsis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "end_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "new_axis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "shrink_axis_mask"
+        value {
+          i: 1
+        }
+      }
+    }
+    node_def {
+      name: "cond/Shape_2"
+      op: "Shape"
+      input: "cond/strided_slice_2:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "out_type"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_4/stack"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_4/stack_1"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 2
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_4/stack_2"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_4"
+      op: "StridedSlice"
+      input: "cond/Shape_2:output:0"
+      input: "cond/strided_slice_4/stack:output:0"
+      input: "cond/strided_slice_4/stack_1:output:0"
+      input: "cond/strided_slice_4/stack_2:output:0"
+      attr {
+        key: "Index"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "begin_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "ellipsis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "end_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "new_axis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "shrink_axis_mask"
+        value {
+          i: 1
+        }
+      }
+    }
+    node_def {
+      name: "cond/sub/y"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 224
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/sub"
+      op: "Sub"
+      input: "cond/strided_slice_3:output:0"
+      input: "cond/sub/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "cond/add/y"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/add"
+      op: "Add"
+      input: "cond/sub:z:0"
+      input: "cond/add/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "cond/truediv/y"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 2
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/truediv/Cast"
+      op: "Cast"
+      input: "cond/add:z:0"
+      attr {
+        key: "DstT"
+        value {
+          type: DT_DOUBLE
+        }
+      }
+      attr {
+        key: "SrcT"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "cond/truediv/Cast_1"
+      op: "Cast"
+      input: "cond/truediv/y:output:0"
+      attr {
+        key: "DstT"
+        value {
+          type: DT_DOUBLE
+        }
+      }
+      attr {
+        key: "SrcT"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "cond/truediv"
+      op: "RealDiv"
+      input: "cond/truediv/Cast:y:0"
+      input: "cond/truediv/Cast_1:y:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_DOUBLE
+        }
+      }
+    }
+    node_def {
+      name: "cond/sub_1/y"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 224
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/sub_1"
+      op: "Sub"
+      input: "cond/strided_slice_4:output:0"
+      input: "cond/sub_1/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "cond/add_1/y"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/add_1"
+      op: "Add"
+      input: "cond/sub_1:z:0"
+      input: "cond/add_1/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "cond/truediv_1/y"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 2
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/truediv_1/Cast"
+      op: "Cast"
+      input: "cond/add_1:z:0"
+      attr {
+        key: "DstT"
+        value {
+          type: DT_DOUBLE
+        }
+      }
+      attr {
+        key: "SrcT"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "cond/truediv_1/Cast_1"
+      op: "Cast"
+      input: "cond/truediv_1/y:output:0"
+      attr {
+        key: "DstT"
+        value {
+          type: DT_DOUBLE
+        }
+      }
+      attr {
+        key: "SrcT"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "cond/truediv_1"
+      op: "RealDiv"
+      input: "cond/truediv_1/Cast:y:0"
+      input: "cond/truediv_1/Cast_1:y:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_DOUBLE
+        }
+      }
+    }
+    node_def {
+      name: "cond/Shape_3"
+      op: "Shape"
+      input: "cond/strided_slice_2:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "out_type"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "cond/Rank"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 3
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/Equal/y"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 3
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/Equal"
+      op: "Equal"
+      input: "cond/Rank:output:0"
+      input: "cond/Equal/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "cond/Assert/Const"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: "Rank of image must be equal to 3."
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/Assert/Assert/data_0"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: "Rank of image must be equal to 3."
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/Assert/Assert"
+      op: "Assert"
+      input: "cond/Equal:z:0"
+      input: "cond/Assert/Assert/data_0:output:0"
+      attr {
+        key: "T"
+        value {
+          list {
+            type: DT_STRING
+          }
+        }
+      }
+      attr {
+        key: "summarize"
+        value {
+          i: 3
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_5/stack"
+      op: "Const"
+      input: "^cond/Assert/Assert"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 2
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_5/stack_1"
+      op: "Const"
+      input: "^cond/Assert/Assert"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 3
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_5/stack_2"
+      op: "Const"
+      input: "^cond/Assert/Assert"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_5"
+      op: "StridedSlice"
+      input: "cond/Shape_3:output:0"
+      input: "cond/strided_slice_5/stack:output:0"
+      input: "cond/strided_slice_5/stack_1:output:0"
+      input: "cond/strided_slice_5/stack_2:output:0"
+      attr {
+        key: "Index"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "begin_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "ellipsis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "end_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "new_axis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "shrink_axis_mask"
+        value {
+          i: 1
+        }
+      }
+    }
+    node_def {
+      name: "cond/stack/0"
+      op: "Const"
+      input: "^cond/Assert/Assert"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 224
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/stack/1"
+      op: "Const"
+      input: "^cond/Assert/Assert"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 224
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/stack"
+      op: "Pack"
+      input: "cond/stack/0:output:0"
+      input: "cond/stack/1:output:0"
+      input: "cond/strided_slice_5:output:0"
+      attr {
+        key: "N"
+        value {
+          i: 3
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "axis"
+        value {
+          i: 0
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_6/stack"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_6/stack_1"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_6/stack_2"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_6"
+      op: "StridedSlice"
+      input: "cond/Shape_3:output:0"
+      input: "cond/strided_slice_6/stack:output:0"
+      input: "cond/strided_slice_6/stack_1:output:0"
+      input: "cond/strided_slice_6/stack_2:output:0"
+      attr {
+        key: "Index"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "begin_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "ellipsis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "end_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "new_axis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "shrink_axis_mask"
+        value {
+          i: 1
+        }
+      }
+    }
+    node_def {
+      name: "cond/GreaterEqual/y"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 224
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/GreaterEqual"
+      op: "GreaterEqual"
+      input: "cond/strided_slice_6:output:0"
+      input: "cond/GreaterEqual/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_7/stack"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_7/stack_1"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 2
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_7/stack_2"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_7"
+      op: "StridedSlice"
+      input: "cond/Shape_3:output:0"
+      input: "cond/strided_slice_7/stack:output:0"
+      input: "cond/strided_slice_7/stack_1:output:0"
+      input: "cond/strided_slice_7/stack_2:output:0"
+      attr {
+        key: "Index"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "begin_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "ellipsis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "end_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "new_axis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "shrink_axis_mask"
+        value {
+          i: 1
+        }
+      }
+    }
+    node_def {
+      name: "cond/GreaterEqual_1/y"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 224
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/GreaterEqual_1"
+      op: "GreaterEqual"
+      input: "cond/strided_slice_7:output:0"
+      input: "cond/GreaterEqual_1/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "cond/LogicalAnd"
+      op: "LogicalAnd"
+      input: "cond/GreaterEqual:z:0"
+      input: "cond/GreaterEqual_1:z:0"
+    }
+    node_def {
+      name: "cond/Assert_1/Const"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: "Crop size greater than the image size."
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/Assert_1/Assert/data_0"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: "Crop size greater than the image size."
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/Assert_1/Assert"
+      op: "Assert"
+      input: "cond/LogicalAnd:z:0"
+      input: "cond/Assert_1/Assert/data_0:output:0"
+      attr {
+        key: "T"
+        value {
+          list {
+            type: DT_STRING
+          }
+        }
+      }
+      attr {
+        key: "summarize"
+        value {
+          i: 3
+        }
+      }
+    }
+    node_def {
+      name: "cond/stack_1/2"
+      op: "Const"
+      input: "^cond/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_DOUBLE
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_DOUBLE
+            tensor_shape {
+            }
+            double_val: 0.0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/stack_1"
+      op: "Pack"
+      input: "cond/truediv:z:0"
+      input: "cond/truediv_1:z:0"
+      input: "cond/stack_1/2:output:0"
+      attr {
+        key: "N"
+        value {
+          i: 3
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_DOUBLE
+        }
+      }
+      attr {
+        key: "axis"
+        value {
+          i: 0
+        }
+      }
+    }
+    node_def {
+      name: "cond/ToInt32"
+      op: "Cast"
+      input: "cond/stack_1:output:0"
+      attr {
+        key: "DstT"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "SrcT"
+        value {
+          type: DT_DOUBLE
+        }
+      }
+    }
+    node_def {
+      name: "cond/Slice"
+      op: "Slice"
+      input: "cond/strided_slice_2:output:0"
+      input: "cond/ToInt32:y:0"
+      input: "cond/stack:output:0"
+      input: "^cond/Assert_1/Assert"
+      attr {
+        key: "Index"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+    }
+    node_def {
+      name: "cond/Reshape"
+      op: "Reshape"
+      input: "cond/Slice:output:0"
+      input: "cond/stack:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "Tshape"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "cond/ResizeBicubic_1/images"
+      op: "Pack"
+      input: "cond/ResizeBicubic_1/images/Switch:output_false:0"
+      attr {
+        key: "N"
+        value {
+          i: 1
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "axis"
+        value {
+          i: 0
+        }
+      }
+    }
+    node_def {
+      name: "cond/ResizeBicubic_1/images/Switch"
+      op: "Switch"
+      input: "distorted_bounding_box_crop/Slice:output:0"
+      input: "cond/pred_id:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "_class"
+        value {
+          list {
+            s: "loc:@distorted_bounding_box_crop/Slice"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/ResizeBicubic_1/size"
+      op: "Const"
+      input: "^cond/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 2
+              }
+            }
+            tensor_content: "\340\000\000\000\340\000\000\000"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/ResizeBicubic_1"
+      op: "ResizeBicubic"
+      input: "cond/ResizeBicubic_1/images:output:0"
+      input: "cond/ResizeBicubic_1/size:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "align_corners"
+        value {
+          b: false
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_8/stack"
+      op: "Const"
+      input: "^cond/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_8/stack_1"
+      op: "Const"
+      input: "^cond/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_8/stack_2"
+      op: "Const"
+      input: "^cond/switch_f"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "cond/strided_slice_8"
+      op: "StridedSlice"
+      input: "cond/ResizeBicubic_1:resized_images:0"
+      input: "cond/strided_slice_8/stack:output:0"
+      input: "cond/strided_slice_8/stack_1:output:0"
+      input: "cond/strided_slice_8/stack_2:output:0"
+      attr {
+        key: "Index"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "begin_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "ellipsis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "end_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "new_axis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "shrink_axis_mask"
+        value {
+          i: 1
+        }
+      }
+    }
+    node_def {
+      name: "cond/Merge"
+      op: "Merge"
+      input: "cond/strided_slice_8:output:0"
+      input: "cond/Reshape:output:0"
+      attr {
+        key: "N"
+        value {
+          i: 2
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+    }
+    node_def {
+      name: "Const_2"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_FLOAT
+            tensor_shape {
+              dim {
+                size: 1
+              }
+              dim {
+                size: 1
+              }
+              dim {
+                size: 3
+              }
+            }
+            tensor_content: "\354Q\370>\325x\351>;\337\317>"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "sub"
+      op: "Sub"
+      input: "cond/Merge:output:0"
+      input: "Const_2:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+    }
+    node_def {
+      name: "Const_3"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_FLOAT
+            tensor_shape {
+              dim {
+                size: 1
+              }
+              dim {
+                size: 1
+              }
+              dim {
+                size: 3
+              }
+            }
+            tensor_content: "\372~j>B`e>fff>"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "truediv"
+      op: "RealDiv"
+      input: "sub:z:0"
+      input: "Const_3:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+    }
+    node_def {
+      name: "random_flip_left_right/control_dependency"
+      op: "Identity"
+      input: "truediv:z:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "_class"
+        value {
+          list {
+            s: "loc:@truediv"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "random_flip_left_right/random_uniform/shape"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+              }
+            }
+          }
+        }
+      }
+    }
+    node_def {
+      name: "random_flip_left_right/random_uniform/min"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_FLOAT
+            tensor_shape {
+            }
+            float_val: 0.0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "random_flip_left_right/random_uniform/max"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_FLOAT
+            tensor_shape {
+            }
+            float_val: 1.0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "random_flip_left_right/random_uniform/RandomUniform"
+      op: "RandomUniform"
+      input: "random_flip_left_right/random_uniform/shape:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "dtype"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "seed"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "seed2"
+        value {
+          i: 0
+        }
+      }
+    }
+    node_def {
+      name: "random_flip_left_right/random_uniform/sub"
+      op: "Sub"
+      input: "random_flip_left_right/random_uniform/max:output:0"
+      input: "random_flip_left_right/random_uniform/min:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+    }
+    node_def {
+      name: "random_flip_left_right/random_uniform/mul"
+      op: "Mul"
+      input: "random_flip_left_right/random_uniform/RandomUniform:output:0"
+      input: "random_flip_left_right/random_uniform/sub:z:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+    }
+    node_def {
+      name: "random_flip_left_right/random_uniform"
+      op: "Add"
+      input: "random_flip_left_right/random_uniform/mul:z:0"
+      input: "random_flip_left_right/random_uniform/min:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+    }
+    node_def {
+      name: "random_flip_left_right/Less/y"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_FLOAT
+            tensor_shape {
+            }
+            float_val: 0.5
+          }
+        }
+      }
+    }
+    node_def {
+      name: "random_flip_left_right/Less"
+      op: "Less"
+      input: "random_flip_left_right/random_uniform:z:0"
+      input: "random_flip_left_right/Less/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+    }
+    node_def {
+      name: "random_flip_left_right/Switch"
+      op: "Switch"
+      input: "random_flip_left_right/Less:z:0"
+      input: "random_flip_left_right/Less:z:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "random_flip_left_right/switch_t"
+      op: "Identity"
+      input: "random_flip_left_right/Switch:output_true:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "random_flip_left_right/switch_f"
+      op: "Identity"
+      input: "random_flip_left_right/Switch:output_false:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "random_flip_left_right/pred_id"
+      op: "Identity"
+      input: "random_flip_left_right/Less:z:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_BOOL
+        }
+      }
+    }
+    node_def {
+      name: "random_flip_left_right/ReverseV2/axis"
+      op: "Const"
+      input: "^random_flip_left_right/switch_t"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "random_flip_left_right/ReverseV2"
+      op: "ReverseV2"
+      input: "random_flip_left_right/ReverseV2/Switch:output_true:0"
+      input: "random_flip_left_right/ReverseV2/axis:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "Tidx"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "random_flip_left_right/ReverseV2/Switch"
+      op: "Switch"
+      input: "random_flip_left_right/control_dependency:output:0"
+      input: "random_flip_left_right/pred_id:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "_class"
+        value {
+          list {
+            s: "loc:@truediv"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "random_flip_left_right/Switch_1"
+      op: "Switch"
+      input: "random_flip_left_right/control_dependency:output:0"
+      input: "random_flip_left_right/pred_id:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "_class"
+        value {
+          list {
+            s: "loc:@truediv"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "random_flip_left_right/Merge"
+      op: "Merge"
+      input: "random_flip_left_right/Switch_1:output_false:0"
+      input: "random_flip_left_right/ReverseV2:output:0"
+      attr {
+        key: "N"
+        value {
+          i: 2
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+    }
+    node_def {
+      name: "Reshape_1/shape"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 3
+              }
+            }
+            tensor_content: "\340\000\000\000\340\000\000\000\003\000\000\000"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "Reshape_1"
+      op: "Reshape"
+      input: "random_flip_left_right/Merge:output:0"
+      input: "Reshape_1/shape:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "Tshape"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "Reshape_2/shape"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+              }
+            }
+          }
+        }
+      }
+    }
+    node_def {
+      name: "Reshape_2"
+      op: "Reshape"
+      input: "ParseSingleExample/ParseSingleExample:dense_values:0"
+      input: "Reshape_2/shape:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "Tshape"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "Cast_1"
+      op: "Cast"
+      input: "Reshape_2:output:0"
+      attr {
+        key: "DstT"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "SrcT"
+        value {
+          type: DT_INT64
+        }
+      }
+    }
+    node_def {
+      name: "sub_1/y"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "sub_1"
+      op: "Sub"
+      input: "Cast_1:y:0"
+      input: "sub_1/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    ret {
+      key: "Reshape_1"
+      value: "Reshape_1:output:0"
+    }
+    ret {
+      key: "sub_1"
+      value: "sub_1:z:0"
+    }
+  }
+  function {
+    signature {
+      name: "tf_predicate_7089b845"
+      input_arg {
+        name: "arg0"
+        type: DT_FLOAT
+      }
+      input_arg {
+        name: "arg1"
+        type: DT_INT32
+      }
+      input_arg {
+        name: "Equal/Placeholder"
+        type: DT_INT64
+      }
+      output_arg {
+        name: "Equal"
+        type: DT_BOOL
+      }
+      description: "A wrapper for Defun that facilitates shape inference."
+    }
+    node_def {
+      name: "Shape"
+      op: "Shape"
+      input: "arg0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "out_type"
+        value {
+          type: DT_INT64
+        }
+      }
+    }
+    node_def {
+      name: "strided_slice/stack"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "strided_slice/stack_1"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "strided_slice/stack_2"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "strided_slice"
+      op: "StridedSlice"
+      input: "Shape:output:0"
+      input: "strided_slice/stack:output:0"
+      input: "strided_slice/stack_1:output:0"
+      input: "strided_slice/stack_2:output:0"
+      attr {
+        key: "Index"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "begin_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "ellipsis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "end_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "new_axis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "shrink_axis_mask"
+        value {
+          i: 1
+        }
+      }
+    }
+    node_def {
+      name: "Equal"
+      op: "Equal"
+      input: "strided_slice:output:0"
+      input: "Equal/Placeholder"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT64
+        }
+      }
+    }
+    ret {
+      key: "Equal"
+      value: "Equal:z:0"
+    }
+  }
+  function {
+    signature {
+      name: "_make_dataset_5fa5e1f4"
+      output_arg {
+        name: "PrefetchDataset_1"
+        type: DT_VARIANT
+      }
+      is_stateful: true
+    }
+    node_def {
+      name: "TensorSliceDataset/MatchingFiles/pattern"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: "$(DATA_DIR)"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "TensorSliceDataset/MatchingFiles"
+      op: "MatchingFiles"
+      input: "TensorSliceDataset/MatchingFiles/pattern:output:0"
+    }
+    node_def {
+      name: "TensorSliceDataset"
+      op: "TensorSliceDataset"
+      input: "TensorSliceDataset/MatchingFiles:filenames:0"
+      attr {
+        key: "Toutput_types"
+        value {
+          list {
+            type: DT_STRING
+          }
+        }
+      }
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+            }
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset/MatchingFiles/pattern"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: "$(DATA_DIR)"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset/MatchingFiles"
+      op: "MatchingFiles"
+      input: "ShuffleDataset/MatchingFiles/pattern:output:0"
+    }
+    node_def {
+      name: "ShuffleDataset/Shape"
+      op: "Shape"
+      input: "ShuffleDataset/MatchingFiles:filenames:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "out_type"
+        value {
+          type: DT_INT64
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset/strided_slice/stack"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset/strided_slice/stack_1"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset/strided_slice/stack_2"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset/strided_slice"
+      op: "StridedSlice"
+      input: "ShuffleDataset/Shape:output:0"
+      input: "ShuffleDataset/strided_slice/stack:output:0"
+      input: "ShuffleDataset/strided_slice/stack_1:output:0"
+      input: "ShuffleDataset/strided_slice/stack_2:output:0"
+      attr {
+        key: "Index"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "begin_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "ellipsis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "end_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "new_axis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "shrink_axis_mask"
+        value {
+          i: 1
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset/Maximum/y"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset/Maximum"
+      op: "Maximum"
+      input: "ShuffleDataset/strided_slice:output:0"
+      input: "ShuffleDataset/Maximum/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT64
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset/seed"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset/seed2"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset"
+      op: "ShuffleDataset"
+      input: "TensorSliceDataset:handle:0"
+      input: "ShuffleDataset/Maximum:z:0"
+      input: "ShuffleDataset/seed:output:0"
+      input: "ShuffleDataset/seed2:output:0"
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+            }
+          }
+        }
+      }
+      attr {
+        key: "output_types"
+        value {
+          list {
+            type: DT_STRING
+          }
+        }
+      }
+      attr {
+        key: "reshuffle_each_iteration"
+        value {
+          b: true
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset_1/buffer_size"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 1024
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset_1/seed_1"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset_1/seed2_1"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset_1"
+      op: "ShuffleDataset"
+      input: "ShuffleDataset:handle:0"
+      input: "ShuffleDataset_1/buffer_size:output:0"
+      input: "ShuffleDataset_1/seed_1:output:0"
+      input: "ShuffleDataset_1/seed2_1:output:0"
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+            }
+          }
+        }
+      }
+      attr {
+        key: "output_types"
+        value {
+          list {
+            type: DT_STRING
+          }
+        }
+      }
+      attr {
+        key: "reshuffle_each_iteration"
+        value {
+          b: true
+        }
+      }
+    }
+    node_def {
+      name: "RepeatDataset/count"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: -1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "RepeatDataset"
+      op: "RepeatDataset"
+      input: "ShuffleDataset_1:handle:0"
+      input: "RepeatDataset/count:output:0"
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+            }
+          }
+        }
+      }
+      attr {
+        key: "output_types"
+        value {
+          list {
+            type: DT_STRING
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ParallelInterleaveDataset/cycle_length"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 8
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ParallelInterleaveDataset/block_length"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ParallelInterleaveDataset/sloppy"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_BOOL
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_BOOL
+            tensor_shape {
+            }
+            bool_val: true
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ParallelInterleaveDataset/buffer_output_elements"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 2
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ParallelInterleaveDataset/prefetch_input_elements"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 16
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ParallelInterleaveDataset"
+      op: "ParallelInterleaveDataset"
+      input: "RepeatDataset:handle:0"
+      input: "ParallelInterleaveDataset/cycle_length:output:0"
+      input: "ParallelInterleaveDataset/block_length:output:0"
+      input: "ParallelInterleaveDataset/sloppy:output:0"
+      input: "ParallelInterleaveDataset/buffer_output_elements:output:0"
+      input: "ParallelInterleaveDataset/prefetch_input_elements:output:0"
+      attr {
+        key: "Targuments"
+        value {
+          list {
+          }
+        }
+      }
+      attr {
+        key: "f"
+        value {
+          func {
+            name: "tf_map_func_91295dea"
+          }
+        }
+      }
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+            }
+          }
+        }
+      }
+      attr {
+        key: "output_types"
+        value {
+          list {
+            type: DT_STRING
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset_2/buffer_size_1"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 1024
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset_2/seed_2"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset_2/seed2_2"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset_2"
+      op: "ShuffleDataset"
+      input: "ParallelInterleaveDataset:handle:0"
+      input: "ShuffleDataset_2/buffer_size_1:output:0"
+      input: "ShuffleDataset_2/seed_2:output:0"
+      input: "ShuffleDataset_2/seed2_2:output:0"
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+            }
+          }
+        }
+      }
+      attr {
+        key: "output_types"
+        value {
+          list {
+            type: DT_STRING
+          }
+        }
+      }
+      attr {
+        key: "reshuffle_each_iteration"
+        value {
+          b: true
+        }
+      }
+    }
+    node_def {
+      name: "ParallelMapDataset/num_parallel_calls"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 64
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ParallelMapDataset"
+      op: "ParallelMapDataset"
+      input: "ShuffleDataset_2:handle:0"
+      input: "ParallelMapDataset/num_parallel_calls:output:0"
+      attr {
+        key: "Targuments"
+        value {
+          list {
+          }
+        }
+      }
+      attr {
+        key: "f"
+        value {
+          func {
+            name: "tf_map_func_74b6b15c"
+          }
+        }
+      }
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+              dim {
+                size: 224
+              }
+              dim {
+                size: 224
+              }
+              dim {
+                size: 3
+              }
+            }
+            shape {
+            }
+          }
+        }
+      }
+      attr {
+        key: "output_types"
+        value {
+          list {
+            type: DT_FLOAT
+            type: DT_INT32
+          }
+        }
+      }
+    }
+    node_def {
+      name: "PrefetchDataset/buffer_size_2"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 64
+          }
+        }
+      }
+    }
+    node_def {
+      name: "PrefetchDataset"
+      op: "PrefetchDataset"
+      input: "ParallelMapDataset:handle:0"
+      input: "PrefetchDataset/buffer_size_2:output:0"
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+              dim {
+                size: 224
+              }
+              dim {
+                size: 224
+              }
+              dim {
+                size: 3
+              }
+            }
+            shape {
+            }
+          }
+        }
+      }
+      attr {
+        key: "output_types"
+        value {
+          list {
+            type: DT_FLOAT
+            type: DT_INT32
+          }
+        }
+      }
+    }
+    node_def {
+      name: "BatchDataset/batch_size"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 64
+          }
+        }
+      }
+    }
+    node_def {
+      name: "BatchDataset"
+      op: "BatchDataset"
+      input: "PrefetchDataset:handle:0"
+      input: "BatchDataset/batch_size:output:0"
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+              dim {
+                size: -1
+              }
+              dim {
+                size: 224
+              }
+              dim {
+                size: 224
+              }
+              dim {
+                size: 3
+              }
+            }
+            shape {
+              dim {
+                size: -1
+              }
+            }
+          }
+        }
+      }
+      attr {
+        key: "output_types"
+        value {
+          list {
+            type: DT_FLOAT
+            type: DT_INT32
+          }
+        }
+      }
+    }
+    node_def {
+      name: "FilterDataset/batch_size_1"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 64
+          }
+        }
+      }
+    }
+    node_def {
+      name: "FilterDataset"
+      op: "FilterDataset"
+      input: "BatchDataset:handle:0"
+      input: "FilterDataset/batch_size_1:output:0"
+      attr {
+        key: "Targuments"
+        value {
+          list {
+            type: DT_INT64
+          }
+        }
+      }
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+              dim {
+                size: -1
+              }
+              dim {
+                size: 224
+              }
+              dim {
+                size: 224
+              }
+              dim {
+                size: 3
+              }
+            }
+            shape {
+              dim {
+                size: -1
+              }
+            }
+          }
+        }
+      }
+      attr {
+        key: "output_types"
+        value {
+          list {
+            type: DT_FLOAT
+            type: DT_INT32
+          }
+        }
+      }
+      attr {
+        key: "predicate"
+        value {
+          func {
+            name: "tf_predicate_7089b845"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "PrefetchDataset_1/buffer_size_3"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 2
+          }
+        }
+      }
+    }
+    node_def {
+      name: "PrefetchDataset_1"
+      op: "PrefetchDataset"
+      input: "FilterDataset:handle:0"
+      input: "PrefetchDataset_1/buffer_size_3:output:0"
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+              dim {
+                size: 64
+              }
+              dim {
+                size: 224
+              }
+              dim {
+                size: 224
+              }
+              dim {
+                size: 3
+              }
+            }
+            shape {
+              dim {
+                size: 64
+              }
+            }
+          }
+        }
+      }
+      attr {
+        key: "output_types"
+        value {
+          list {
+            type: DT_FLOAT
+            type: DT_INT32
+          }
+        }
+      }
+    }
+    ret {
+      key: "PrefetchDataset_1"
+      value: "PrefetchDataset_1:handle:0"
+    }
+  }
+}
+)PREFIX";
+
+  *dataset_name = "_make_dataset_5fa5e1f4";
+  std::function<void(FunctionDef*)> mutate_proto_func =
+      [dataset_name, file_path](FunctionDef* fdef) {
+        VLOG(1) << "Processsing function " << fdef->DebugString();
+        if (std::string(fdef->signature().name()) != *dataset_name) return;
+        // Change the input file pattern to `file_path`.
+        bool found = false;
+        for (auto& node_def : *fdef->mutable_node_def()) {
+          if (node_def.name() != "TensorSliceDataset/MatchingFiles/pattern" &&
+              node_def.name() != "ShuffleDataset/MatchingFiles/pattern")
+            continue;
+          DCHECK_EQ(node_def.op(), "Const");
+          DCHECK_GT(node_def.attr().count("value"), 0);
+          found = true;
+          DCHECK_EQ(node_def.attr().at("value").tensor().string_val(0),
+                    "$(DATA_DIR)");
+          VLOG(1) << "Setting the value of node_def "
+                     "TensorSliceDataset/MatchingFiles/pattern to "
+                  << file_path;
+          auto* tensor = (*node_def.mutable_attr())["value"].mutable_tensor();
+          tensor->clear_string_val();
+          tensor->add_string_val(file_path);
+        }
+        VLOG(1) << "Rewrote function to " << fdef->DebugString();
+        DCHECK(found);
+      };
+  return CreateFunctionsFromTextProto(func_def, &mutate_proto_func, status);
+}
+
+//  On success, returns a set of TF_Function instances encoding a dataset
+//  node stack that reads an MNIST file dataset from `file_path`, and
+//  sets `dataset_name` to the created dataset name. The returned functions must
+//  be deleted by calling TF_DeleteFunction.
+static std::vector<UniqueFuncPtr> CreateMNISTDatasetFunctions(
+    const char* file_path, int batch_size, std::string* dataset_name,
+    TF_Status* status) {
+  const char* func_def = R"PREFIX(
+library {
+  function {
+    signature {
+      name: "tf_map_func_521bfd08"
+      input_arg {
+        name: "arg0"
+        type: DT_STRING
+      }
+      output_arg {
+        name: "truediv"
+        type: DT_FLOAT
+      }
+      description: "A wrapper for Defun that facilitates shape inference."
+    }
+    node_def {
+      name: "DecodeRaw"
+      op: "DecodeRaw"
+      input: "arg0"
+      attr {
+        key: "little_endian"
+        value {
+          b: true
+        }
+      }
+      attr {
+        key: "out_type"
+        value {
+          type: DT_UINT8
+        }
+      }
+    }
+    node_def {
+      name: "Cast"
+      op: "Cast"
+      input: "DecodeRaw:output:0"
+      attr {
+        key: "DstT"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "SrcT"
+        value {
+          type: DT_UINT8
+        }
+      }
+    }
+    node_def {
+      name: "Reshape/shape"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 784
+          }
+        }
+      }
+    }
+    node_def {
+      name: "Reshape"
+      op: "Reshape"
+      input: "Cast:y:0"
+      input: "Reshape/shape:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "Tshape"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "truediv/y"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_FLOAT
+            tensor_shape {
+            }
+            float_val: 255.0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "truediv"
+      op: "RealDiv"
+      input: "Reshape:output:0"
+      input: "truediv/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+    }
+    ret {
+      key: "truediv"
+      value: "truediv:z:0"
+    }
+  }
+  function {
+    signature {
+      name: "tf_map_func_9a08860d"
+      input_arg {
+        name: "arg0"
+        type: DT_STRING
+      }
+      output_arg {
+        name: "ToInt32"
+        type: DT_INT32
+      }
+      description: "A wrapper for Defun that facilitates shape inference."
+    }
+    node_def {
+      name: "DecodeRaw"
+      op: "DecodeRaw"
+      input: "arg0"
+      attr {
+        key: "little_endian"
+        value {
+          b: true
+        }
+      }
+      attr {
+        key: "out_type"
+        value {
+          type: DT_UINT8
+        }
+      }
+    }
+    node_def {
+      name: "Reshape/shape"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+              }
+            }
+          }
+        }
+      }
+    }
+    node_def {
+      name: "Reshape"
+      op: "Reshape"
+      input: "DecodeRaw:output:0"
+      input: "Reshape/shape:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_UINT8
+        }
+      }
+      attr {
+        key: "Tshape"
+        value {
+          type: DT_INT32
+        }
+      }
+    }
+    node_def {
+      name: "ToInt32"
+      op: "Cast"
+      input: "Reshape:output:0"
+      attr {
+        key: "DstT"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "SrcT"
+        value {
+          type: DT_UINT8
+        }
+      }
+    }
+    ret {
+      key: "ToInt32"
+      value: "ToInt32:y:0"
+    }
+  }
+  function {
+    signature {
+      name: "tf_predicate_7089b845"
+      input_arg {
+        name: "arg0"
+        type: DT_FLOAT
+      }
+      input_arg {
+        name: "arg1"
+        type: DT_INT32
+      }
+      input_arg {
+        name: "Equal/Placeholder"
+        type: DT_INT64
+      }
+      output_arg {
+        name: "Equal"
+        type: DT_BOOL
+      }
+      description: "A wrapper for Defun that facilitates shape inference."
+    }
+    node_def {
+      name: "Shape"
+      op: "Shape"
+      input: "arg0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "out_type"
+        value {
+          type: DT_INT64
+        }
+      }
+    }
+    node_def {
+      name: "strided_slice/stack"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "strided_slice/stack_1"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "strided_slice/stack_2"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+              dim {
+                size: 1
+              }
+            }
+            int_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "strided_slice"
+      op: "StridedSlice"
+      input: "Shape:output:0"
+      input: "strided_slice/stack:output:0"
+      input: "strided_slice/stack_1:output:0"
+      input: "strided_slice/stack_2:output:0"
+      attr {
+        key: "Index"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "T"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "begin_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "ellipsis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "end_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "new_axis_mask"
+        value {
+          i: 0
+        }
+      }
+      attr {
+        key: "shrink_axis_mask"
+        value {
+          i: 1
+        }
+      }
+    }
+    node_def {
+      name: "Equal"
+      op: "Equal"
+      input: "strided_slice:output:0"
+      input: "Equal/Placeholder"
+      attr {
+        key: "T"
+        value {
+          type: DT_INT64
+        }
+      }
+    }
+    ret {
+      key: "Equal"
+      value: "Equal:z:0"
+    }
+  }
+  function {
+    signature {
+      name: "_make_dataset_2451e43a"
+      output_arg {
+        name: "FilterDataset"
+        type: DT_VARIANT
+      }
+      is_stateful: true
+    }
+    node_def {
+      name: "FixedLengthRecordDataset/filenames"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: "$(DATA_DIR)/train-images-idx3-ubyte"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "FixedLengthRecordDataset/header_bytes"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 16
+          }
+        }
+      }
+    }
+    node_def {
+      name: "FixedLengthRecordDataset/record_bytes"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 784
+          }
+        }
+      }
+    }
+    node_def {
+      name: "FixedLengthRecordDataset/footer_bytes"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "FixedLengthRecordDataset/buffer_size"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 262144
+          }
+        }
+      }
+    }
+    node_def {
+      name: "FixedLengthRecordDataset"
+      op: "FixedLengthRecordDataset"
+      input: "FixedLengthRecordDataset/filenames:output:0"
+      input: "FixedLengthRecordDataset/header_bytes:output:0"
+      input: "FixedLengthRecordDataset/record_bytes:output:0"
+      input: "FixedLengthRecordDataset/footer_bytes:output:0"
+      input: "FixedLengthRecordDataset/buffer_size:output:0"
+    }
+    node_def {
+      name: "MapDataset"
+      op: "MapDataset"
+      input: "FixedLengthRecordDataset:handle:0"
+      attr {
+        key: "Targuments"
+        value {
+          list {
+          }
+        }
+      }
+      attr {
+        key: "f"
+        value {
+          func {
+            name: "tf_map_func_521bfd08"
+          }
+        }
+      }
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+              dim {
+                size: 784
+              }
+            }
+          }
+        }
+      }
+      attr {
+        key: "output_types"
+        value {
+          list {
+            type: DT_FLOAT
+          }
+        }
+      }
+    }
+    node_def {
+      name: "FixedLengthRecordDataset_1/filenames_1"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: "$(DATA_DIR)/train-labels-idx1-ubyte"
+          }
+        }
+      }
+    }
+    node_def {
+      name: "FixedLengthRecordDataset_1/header_bytes_1"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 8
+          }
+        }
+      }
+    }
+    node_def {
+      name: "FixedLengthRecordDataset_1/record_bytes_1"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "FixedLengthRecordDataset_1/footer_bytes_1"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "FixedLengthRecordDataset_1/buffer_size_1"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 262144
+          }
+        }
+      }
+    }
+    node_def {
+      name: "FixedLengthRecordDataset_1"
+      op: "FixedLengthRecordDataset"
+      input: "FixedLengthRecordDataset_1/filenames_1:output:0"
+      input: "FixedLengthRecordDataset_1/header_bytes_1:output:0"
+      input: "FixedLengthRecordDataset_1/record_bytes_1:output:0"
+      input: "FixedLengthRecordDataset_1/footer_bytes_1:output:0"
+      input: "FixedLengthRecordDataset_1/buffer_size_1:output:0"
+    }
+    node_def {
+      name: "MapDataset_1"
+      op: "MapDataset"
+      input: "FixedLengthRecordDataset_1:handle:0"
+      attr {
+        key: "Targuments"
+        value {
+          list {
+          }
+        }
+      }
+      attr {
+        key: "f"
+        value {
+          func {
+            name: "tf_map_func_9a08860d"
+          }
+        }
+      }
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+            }
+          }
+        }
+      }
+      attr {
+        key: "output_types"
+        value {
+          list {
+            type: DT_INT32
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ZipDataset"
+      op: "ZipDataset"
+      input: "MapDataset:handle:0"
+      input: "MapDataset_1:handle:0"
+      attr {
+        key: "N"
+        value {
+          i: 2
+        }
+      }
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+              dim {
+                size: 784
+              }
+            }
+            shape {
+            }
+          }
+        }
+      }
+      attr {
+        key: "output_types"
+        value {
+          list {
+            type: DT_FLOAT
+            type: DT_INT32
+          }
+        }
+      }
+    }
+    node_def {
+      name: "CacheDataset/filename"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_STRING
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_STRING
+            tensor_shape {
+            }
+            string_val: ""
+          }
+        }
+      }
+    }
+    node_def {
+      name: "CacheDataset"
+      op: "CacheDataset"
+      input: "ZipDataset:handle:0"
+      input: "CacheDataset/filename:output:0"
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+              dim {
+                size: 784
+              }
+            }
+            shape {
+            }
+          }
+        }
+      }
+      attr {
+        key: "output_types"
+        value {
+          list {
+            type: DT_FLOAT
+            type: DT_INT32
+          }
+        }
+      }
+    }
+    node_def {
+      name: "RepeatDataset/count"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: -1
+          }
+        }
+      }
+    }
+    node_def {
+      name: "RepeatDataset"
+      op: "RepeatDataset"
+      input: "CacheDataset:handle:0"
+      input: "RepeatDataset/count:output:0"
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+              dim {
+                size: 784
+              }
+            }
+            shape {
+            }
+          }
+        }
+      }
+      attr {
+        key: "output_types"
+        value {
+          list {
+            type: DT_FLOAT
+            type: DT_INT32
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset/buffer_size_2"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 50000
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset/seed"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset/seed2"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "ShuffleDataset"
+      op: "ShuffleDataset"
+      input: "RepeatDataset:handle:0"
+      input: "ShuffleDataset/buffer_size_2:output:0"
+      input: "ShuffleDataset/seed:output:0"
+      input: "ShuffleDataset/seed2:output:0"
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+              dim {
+                size: 784
+              }
+            }
+            shape {
+            }
+          }
+        }
+      }
+      attr {
+        key: "output_types"
+        value {
+          list {
+            type: DT_FLOAT
+            type: DT_INT32
+          }
+        }
+      }
+      attr {
+        key: "reshuffle_each_iteration"
+        value {
+          b: true
+        }
+      }
+    }
+    node_def {
+      name: "BatchDataset/batch_size"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: -123
+          }
+        }
+      }
+    }
+    node_def {
+      name: "BatchDataset"
+      op: "BatchDataset"
+      input: "ShuffleDataset:handle:0"
+      input: "BatchDataset/batch_size:output:0"
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+              dim {
+                size: -1
+              }
+              dim {
+                size: 784
+              }
+            }
+            shape {
+              dim {
+                size: -1
+              }
+            }
+          }
+        }
+      }
+      attr {
+        key: "output_types"
+        value {
+          list {
+            type: DT_FLOAT
+            type: DT_INT32
+          }
+        }
+      }
+    }
+    node_def {
+      name: "FilterDataset/batch_size_1"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: -123
+          }
+        }
+      }
+    }
+    node_def {
+      name: "FilterDataset"
+      op: "FilterDataset"
+      input: "BatchDataset:handle:0"
+      input: "FilterDataset/batch_size_1:output:0"
+      attr {
+        key: "Targuments"
+        value {
+          list {
+            type: DT_INT64
+          }
+        }
+      }
+      attr {
+        key: "output_shapes"
+        value {
+          list {
+            shape {
+              dim {
+                size: -1
+              }
+              dim {
+                size: 784
+              }
+            }
+            shape {
+              dim {
+                size: -1
+              }
+            }
+          }
+        }
+      }
+      attr {
+        key: "output_types"
+        value {
+          list {
+            type: DT_FLOAT
+            type: DT_INT32
+          }
+        }
+      }
+      attr {
+        key: "predicate"
+        value {
+          func {
+            name: "tf_predicate_7089b845"
+          }
+        }
+      }
+    }
+    ret {
+      key: "FilterDataset"
+      value: "FilterDataset:handle:0"
+    }
+  }
+}
+)PREFIX";
+
+  *dataset_name = "_make_dataset_2451e43a";
+  std::function<void(FunctionDef*)> mutate_proto_func =
+      [dataset_name, file_path, batch_size](FunctionDef* fdef) {
+        VLOG(1) << "Processsing function " << fdef->DebugString();
+        if (std::string(fdef->signature().name()) != *dataset_name) return;
+        // Change the input file pattern to `file_path`.
+        bool found_file_path = false, found_batch_size = false;
+        // `node_def` may be mutated.
+        for (auto& node_def : *fdef->mutable_node_def()) {
+          if (node_def.name() == "FixedLengthRecordDataset/filenames" ||
+              node_def.name() == "FixedLengthRecordDataset_1/filenames_1") {
+            DCHECK_EQ(node_def.op(), "Const");
+            DCHECK_GT(node_def.attr().count("value"), 0);
+            found_file_path = true;
+            // Replace $(DATA_DIR)/foo with <file_path>/foo
+            // TODO(hongm): Use StringPiece manipulation for better efficiency.
+            const std::string cur_value =
+                node_def.attr().at("value").tensor().string_val(0);
+            const std::string pattern = "$(DATA_DIR)";
+            DCHECK_EQ(cur_value.compare(0, pattern.length(), pattern), 0);
+            const std::string new_value =
+                file_path + cur_value.substr(pattern.length());
+            VLOG(1) << "Setting the value of node_def " << node_def.name()
+                    << " to " << new_value;
+            auto* tensor = (*node_def.mutable_attr())["value"].mutable_tensor();
+            tensor->clear_string_val();
+            tensor->add_string_val(new_value);
+          } else if (node_def.name() == "BatchDataset/batch_size" ||
+                     node_def.name() == "FilterDataset/batch_size_1") {
+            DCHECK_EQ(node_def.op(), "Const");
+            DCHECK_GT(node_def.attr().count("value"), 0);
+            found_batch_size = true;
+            // Replace $(BATCH_SIZE) with `batch_size`
+            DCHECK_EQ(node_def.attr().at("value").tensor().int64_val(0), -123);
+            VLOG(1) << "Setting the batch size attr value of node_def "
+                    << node_def.name() << " to " << batch_size;
+            auto* tensor = (*node_def.mutable_attr())["value"].mutable_tensor();
+            tensor->clear_int64_val();
+            tensor->add_int64_val(batch_size);
+          }
+        }
+        VLOG(1) << "Rewrote function to " << fdef->DebugString();
+        DCHECK(found_file_path);
+        DCHECK(found_batch_size);
+      };
+  return CreateFunctionsFromTextProto(func_def, &mutate_proto_func, status);
+}
+
+// Adds the input functions to `graph`.  On success, returns the created
+// IteratorGetNext node.
+static TF_Operation* AddDatasetFunctionAndIteratorNodesToGraph(
+    const std::vector<UniqueFuncPtr>& funcs, const std::string& dataset_name,
+    const std::vector<tensorflow::DataType>& output_types,
+    const std::vector<tensorflow::TensorShapeProto>& output_shapes,
+    TF_Graph* graph, TF_Status* status) {
+  DCHECK(!dataset_name.empty());
+  for (auto& func : funcs) {
+    TF_GraphCopyFunction(graph, func.get(), /*gradient*/ nullptr, status);
+    if (!status->status.ok()) {
+      return nullptr;
+    }
+  }
+
+  tensorflow::mutex_lock c(graph->mu);
+
+  tensorflow::NameAttrList func;
+  func.set_name(dataset_name);
+  // Run the iterator node on CPU.
+  Node* oneshot_iterator_node;
+  tensorflow::Status s = NodeBuilder("OneShotIterator", "OneShotIterator")
+                             .Device("/device:CPU:0")
+                             .Attr("container", "")
+                             .Attr("dataset_factory", func)
+                             .Attr("output_types", output_types)
+                             .Attr("output_shapes", output_shapes)
+                             .Attr("shared_name", "")
+                             .Finalize(&graph->graph, &oneshot_iterator_node);
+  if (!s.ok()) {
+    status->status = s;
+    return nullptr;
+  }
+  // Run shape inference function for each newly added node, so that more
+  // subsequent nodes can be added to the graph via C API (TF_NewOperation()).
+  s = graph->refiner.AddNode(oneshot_iterator_node);
+  if (!s.ok()) {
+    status->status = s;
+    return nullptr;
+  }
+
+  // Run the iterator node on CPU.
+  Node* getnext_node;
+  s = NodeBuilder("IteratorGetNext", "IteratorGetNext")
+          .Input(oneshot_iterator_node)
+          .Device("/device:CPU:0")
+          .Attr("output_types", output_types)
+          .Attr("output_shapes", output_shapes)
+          .Finalize(&graph->graph, &getnext_node);
+  if (!s.ok()) {
+    status->status = s;
+    return nullptr;
+  }
+  // Run shape inference function for each newly added node, so that more
+  // subsequent nodes can be added to the graph via C API (TF_NewOperation()).
+  s = graph->refiner.AddNode(getnext_node);
+  if (!s.ok()) {
+    status->status = s;
+    return nullptr;
+  }
+
+  VLOG(1) << "Output graph: " << graph->graph.ToGraphDefDebug().DebugString();
+  return ToTF_Operation(getnext_node);
+}
+
+TF_Operation* TF_MakeFakeIteratorGetNextWithDatasets(TF_Graph* graph,
+                                                     TF_Status* status) {
+  tensorflow::Status s;
+
+  std::string dataset_name;
+  UniqueFuncPtr result_func = CreateFakeDatasetFunction(&dataset_name, status);
+  if (!status->status.ok()) {
+    return nullptr;
+  }
+
+  std::vector<UniqueFuncPtr> funcs;
+  funcs.push_back(std::move(result_func));
+  std::vector<tensorflow::TensorShapeProto> output_shape_list;
+  output_shape_list.push_back(tensorflow::TensorShapeProto());
+  auto* getnext_node = AddDatasetFunctionAndIteratorNodesToGraph(
+      funcs, dataset_name, {tensorflow::DT_FLOAT}, output_shape_list, graph,
+      status);
+  if (!status->status.ok()) {
+    return nullptr;
+  }
+
+  return getnext_node;
+}
+
+TF_Operation* TF_MakeFileBasedIteratorGetNextWithDatasets(
+    TF_Graph* graph, const char* file_path, int batch_size,
+    unsigned char is_mnist, TF_Status* status) {
+  tensorflow::Status s;
+
+  std::string dataset_name;
+  const auto& funcs =
+      is_mnist
+          ? CreateMNISTDatasetFunctions(file_path, batch_size, &dataset_name,
+                                        status)
+          : CreateImagenetDatasetFunctions(file_path, &dataset_name, status);
+  if (!status->status.ok()) {
+    return nullptr;
+  }
+
+  std::vector<tensorflow::TensorShapeProto> output_shape_list;
+  // batch_size X 224 X 224 X 3
+  auto image_shape = tensorflow::TensorShapeProto();
+  image_shape.add_dim()->set_size(batch_size);
+  if (is_mnist) {
+    image_shape.add_dim()->set_size(784);
+  } else {
+    image_shape.add_dim()->set_size(224);
+    image_shape.add_dim()->set_size(224);
+    image_shape.add_dim()->set_size(3);
+  }
+  output_shape_list.push_back(image_shape);
+
+  // batch_size
+  auto label_shape = tensorflow::TensorShapeProto();
+  label_shape.add_dim()->set_size(batch_size);
+  output_shape_list.push_back(label_shape);
+  auto* getnext_node = AddDatasetFunctionAndIteratorNodesToGraph(
+      funcs, dataset_name, {tensorflow::DT_FLOAT, tensorflow::DT_INT32},
+      output_shape_list, graph, status);
+  if (!status->status.ok()) {
+    return nullptr;
+  }
+
+  tensorflow::mutex_lock c(graph->mu);
+  VLOG(1) << "The extended graph: "
+          << graph->graph.ToGraphDefDebug().DebugString();
+
+  return getnext_node;
+}
diff --git a/tensorflow/c/c_api_experimental.h b/tensorflow/c/c_api_experimental.h
index 5a7b007e40aa199889b2d00b2bde5976c19e2966..ebcec8176b63f9a91c847ebe96fba3ff023fc599 100644
--- a/tensorflow/c/c_api_experimental.h
+++ b/tensorflow/c/c_api_experimental.h
@@ -25,6 +25,7 @@ limitations under the License.
 // Experimental C API for TensorFlow.
 //
 // The API here is subject to changes in the future.
+// --------------------------------------------------------------------------
 
 // Macro to control visibility of exported symbols in the shared library (.so,
 // .dylib, .dll).
@@ -59,6 +60,53 @@ extern "C" {
 TF_CAPI_EXPORT extern void TF_EnableXLACompilation(TF_SessionOptions* options,
                                                    unsigned char enable);
 
+// Initializes TPU system. Must be called exactly once before TF_SessionRun() is
+// called on a TPU graph.
+//
+// The session graph must contain a node named ConfigureDistributedTPU.
+// TODO(b/74774824): Improve the API on initializing TPU system.
+TF_CAPI_EXPORT extern void TF_InitializeTPU(TF_Session* session,
+                                            TF_Status* status);
+
+// Shuts down TPU system. For any `session` where TF_InitializeTPU() has
+// been successfully called, this call must be made exactly once before the
+// session is closed.
+// The session graph must contain a node named ShutdownDistributedTPU.
+TF_CAPI_EXPORT extern void TF_ShutdownTPU(TF_Session* session,
+                                          TF_Status* status);
+
+// Returns the graph content in a human-readable format, with length set in
+// `len`. The format is subject to change in the future.
+// The returned string is heap-allocated, and caller should call free() on it.
+TF_CAPI_EXPORT extern const char* TF_GraphDebugString(TF_Graph* graph,
+                                                      size_t* len);
+
+// Returns the graph content in a human-readable format, with length set in
+// `len`. The format is subject to change in the future.
+// The returned string is heap-allocated, and caller should call free() on it.
+TF_CAPI_EXPORT extern const char* TF_GraphDebugString(TF_Graph* graph,
+                                                      size_t* len);
+
+// Creates a stack of data set + iterator nodes, currently hard-coded to return
+// a sequence of 3 float values <42.0, 43.0, 44.0> over 3 calls. On success,
+// returns the IteratorGetNext node, which caller can run or feed into an node.
+//
+// TODO(hongm): Extend the API to allow customization of the nodes created.
+TF_CAPI_EXPORT extern TF_Operation* TF_MakeFakeIteratorGetNextWithDatasets(
+    TF_Graph* graph, TF_Status* status);
+
+// Similar to the above API, except that the returned iterator reads the
+// file based dataset from `file_path`.
+// If `is_mnist` is 0, the dataset corresponds to ImageNet.
+// The iterators outputs 2 tensors:
+// - A float tensor of shape `batch_size` X 784 when `is_mnist` is non-zero, or
+// `batch_size` X 224 X 224 X 3 otherwise.
+// - An int32 tensor of shape `batch_size`
+// TODO(hongm): Extend the API to allow customization of the nodes created.
+TF_CAPI_EXPORT extern TF_Operation* TF_MakeFileBasedIteratorGetNextWithDatasets(
+    TF_Graph* graph, const char* file_path, int batch_size,
+    unsigned char is_mnist, TF_Status* status);
+
 #ifdef __cplusplus
 } /* end extern "C" */
 #endif
diff --git a/tensorflow/c/c_api_experimental_test.cc b/tensorflow/c/c_api_experimental_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..30fcfd401d9d634962d64aaa3bf348de91f2ecae
--- /dev/null
+++ b/tensorflow/c/c_api_experimental_test.cc
@@ -0,0 +1,120 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/c/c_api_experimental.h"
+#include "tensorflow/c/c_test_util.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+void TestFakeIteratorStack() {
+  TF_Status* s = TF_NewStatus();
+  TF_Graph* graph = TF_NewGraph();
+
+  TF_Operation* get_next = TF_MakeFakeIteratorGetNextWithDatasets(graph, s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+
+  CSession csession(graph, s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+
+  // Run the graph.
+  const float base_value = 42.0;
+  for (int i = 0; i < 3; ++i) {
+    csession.SetOutputs({get_next});
+    csession.Run(s);
+    ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+    TF_Tensor* out = csession.output_tensor(0);
+    ASSERT_TRUE(out != nullptr);
+    ASSERT_EQ(TF_FLOAT, TF_TensorType(out));
+    ASSERT_EQ(0, TF_NumDims(out));  // scalar
+    ASSERT_EQ(sizeof(float), TF_TensorByteSize(out));
+    float* output_contents = static_cast<float*>(TF_TensorData(out));
+    ASSERT_EQ(base_value + i, *output_contents);
+  }
+
+  // This should error out since we've exhausted the iterator.
+  csession.Run(s);
+  ASSERT_EQ(TF_OUT_OF_RANGE, TF_GetCode(s)) << TF_Message(s);
+
+  // Clean up
+  csession.CloseAndDelete(s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  TF_DeleteGraph(graph);
+  TF_DeleteStatus(s);
+}
+
+TEST(CAPI_EXPERIMENTAL, FakeIteratorGetNext) { TestFakeIteratorStack(); }
+
+TEST(CAPI_EXPERIMENTAL, ImagenetIteratorGetNext) {
+  TF_Status* s = TF_NewStatus();
+  TF_Graph* graph = TF_NewGraph();
+
+  const string file_path = tensorflow::io::JoinPath(
+      tensorflow::testing::TensorFlowSrcRoot(), "c/testdata/tf_record");
+  VLOG(1) << "data file path is " << file_path;
+  const int batch_size = 64;
+  TF_Operation* get_next = TF_MakeFileBasedIteratorGetNextWithDatasets(
+      graph, file_path.c_str(), batch_size, /*is_mnist*/ false, s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+
+  CSession csession(graph, s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+
+  // Run the graph.
+  // The two output tensors should look like:
+  // Tensor("IteratorGetNext:0", shape=(batch_size, 224, 224, 3), dtype=float32)
+  // Tensor("IteratorGetNext:1", shape=(batch_size, ), dtype=int32)
+  for (int i = 0; i < 3; ++i) {
+    LOG(INFO) << "Running iter " << i;
+    csession.SetOutputs({{get_next, 0}, {get_next, 1}});
+    csession.Run(s);
+    ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+
+    {
+      TF_Tensor* image = csession.output_tensor(0);
+      ASSERT_TRUE(image != nullptr);
+      ASSERT_EQ(TF_FLOAT, TF_TensorType(image));
+      // Confirm shape is 224 X 224 X 3
+      ASSERT_EQ(4, TF_NumDims(image));
+      ASSERT_EQ(batch_size, TF_Dim(image, 0));
+      ASSERT_EQ(224, TF_Dim(image, 1));
+      ASSERT_EQ(224, TF_Dim(image, 2));
+      ASSERT_EQ(3, TF_Dim(image, 3));
+      ASSERT_EQ(sizeof(float) * batch_size * 224 * 224 * 3,
+                TF_TensorByteSize(image));
+    }
+
+    {
+      TF_Tensor* label = csession.output_tensor(1);
+      ASSERT_TRUE(label != nullptr);
+      ASSERT_EQ(TF_INT32, TF_TensorType(label));
+      ASSERT_EQ(1, TF_NumDims(label));
+      ASSERT_EQ(batch_size, TF_Dim(label, 0));
+      ASSERT_EQ(sizeof(int32) * batch_size, TF_TensorByteSize(label));
+    }
+  }
+
+  // Clean up
+  csession.CloseAndDelete(s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  TF_DeleteGraph(graph);
+  TF_DeleteStatus(s);
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/c/c_api_internal.h b/tensorflow/c/c_api_internal.h
index e885a699274cfae04d5a17c736da1acfddcc7b3b..95652a11378d6276b5ba6540a07baa15aa77cc1c 100644
--- a/tensorflow/c/c_api_internal.h
+++ b/tensorflow/c/c_api_internal.h
@@ -84,19 +84,20 @@ struct TF_Graph {
   std::unordered_map<tensorflow::string, tensorflow::Node*> name_map
       GUARDED_BY(mu);
 
-  // The keys of this map are all the active sessions using this graph.
-  // Each value is the current "runnability" status of the corresponding
-  // session. Under normal conditions all statuses are Status::OK(), but
-  // if some operation is mutated after it was run by a session (this
-  // is detected in RecordMutation function), that session is no longer
-  // safe to run. Its status will contain the error that will be returned
-  // to the user, should she try running this session.
+  // The keys of this map are all the active sessions using this graph. Each
+  // value records whether the graph has been mutated since the corresponding
+  // session has been run (this is detected in RecordMutation function). If the
+  // string is empty, no mutation has occurred. Otherwise the string is a
+  // description of the mutation suitable for returning to the user.
   //
   // Sessions are added to this map in TF_NewSession, and removed in
   // TF_DeleteSession.
   // TF_Graph may only / must be deleted when
   //   sessions.size() == 0 && delete_requested == true
-  tensorflow::gtl::FlatMap<TF_Session*, tensorflow::Status> sessions
+  //
+  // TODO(b/74949947): mutations currently trigger a warning instead of a bad
+  // status, this should be reverted when possible.
+  tensorflow::gtl::FlatMap<TF_Session*, tensorflow::string> sessions
       GUARDED_BY(mu);
   bool delete_requested GUARDED_BY(mu);  // set true by TF_DeleteGraph
 
diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc
index 028f146be31790b211e546978302e81afe26b231..ca80db23ed3ccbbdc49c61db6cd03ff735470512 100644
--- a/tensorflow/c/c_api_test.cc
+++ b/tensorflow/c/c_api_test.cc
@@ -53,7 +53,7 @@ Status TF_TensorToTensor(const TF_Tensor* src, Tensor* dst);
 namespace {
 
 static void ExpectHasSubstr(StringPiece s, StringPiece expected) {
-  EXPECT_TRUE(StringPiece(s).contains(expected))
+  EXPECT_TRUE(str_util::StrContains(s, expected))
       << "'" << s << "' does not contain '" << expected << "'";
 }
 
diff --git a/tensorflow/c/c_test_util.cc b/tensorflow/c/c_test_util.cc
index 22f77e7b874a13b3b6e0fbe981b4188c634db439..f3b28c1708129d39e451d927a89c0d10e2193b63 100644
--- a/tensorflow/c/c_test_util.cc
+++ b/tensorflow/c/c_test_util.cc
@@ -94,18 +94,22 @@ TF_Tensor* FloatTensor(float v) {
 // one cannot call ASSERT_* methods in non-void-returning functions (when
 // exceptions are disabled during compilation)
 void PlaceholderHelper(TF_Graph* graph, TF_Status* s, const char* name,
-                       TF_DataType dtype, TF_Operation** op) {
+                       TF_DataType dtype, const std::vector<int64_t>& dims,
+                       TF_Operation** op) {
   TF_OperationDescription* desc = TF_NewOperation(graph, "Placeholder", name);
   TF_SetAttrType(desc, "dtype", dtype);
+  if (!dims.empty()) {
+    TF_SetAttrShape(desc, "shape", dims.data(), dims.size());
+  }
   *op = TF_FinishOperation(desc, s);
   ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
   ASSERT_NE(*op, nullptr);
 }
 
 TF_Operation* Placeholder(TF_Graph* graph, TF_Status* s, const char* name,
-                          TF_DataType dtype) {
+                          TF_DataType dtype, const std::vector<int64_t>& dims) {
   TF_Operation* op;
-  PlaceholderHelper(graph, s, name, dtype, &op);
+  PlaceholderHelper(graph, s, name, dtype, dims, &op);
   return op;
 }
 
diff --git a/tensorflow/c/c_test_util.h b/tensorflow/c/c_test_util.h
index d87c57fd5193129665ca65761872a38131ee532b..cd19cf8d624d9b914b61132f93d918b046cdbd30 100644
--- a/tensorflow/c/c_test_util.h
+++ b/tensorflow/c/c_test_util.h
@@ -48,7 +48,8 @@ TF_Tensor* FloatTensor(float v);
 
 TF_Operation* Placeholder(TF_Graph* graph, TF_Status* s,
                           const char* name = "feed",
-                          TF_DataType dtype = TF_INT32);
+                          TF_DataType dtype = TF_INT32,
+                          const std::vector<int64_t>& dims = {});
 
 TF_Operation* Const(TF_Tensor* t, TF_Graph* graph, TF_Status* s,
                     const char* name = "const");
diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD
index 3046d9064a6d4b39cd8a7209d7f20e1e779c2847..a2d96357ac8a55be7fe03bf58e33ff1733967dd1 100644
--- a/tensorflow/c/eager/BUILD
+++ b/tensorflow/c/eager/BUILD
@@ -27,6 +27,14 @@ tf_cuda_library(
             ":runtime",
             "//tensorflow/c:c_api",
             "//tensorflow/c:c_api_internal",
+            "//tensorflow/core:core_cpu",
+            "//tensorflow/core/common_runtime/eager:context",
+            "//tensorflow/core/common_runtime/eager:eager_executor",
+            "//tensorflow/core/common_runtime/eager:execute",
+            "//tensorflow/core/common_runtime/eager:execute_node",
+            "//tensorflow/core/common_runtime/eager:kernel_and_device",
+            "//tensorflow/core/common_runtime/eager:tensor_handle",
+            "//tensorflow/core/common_runtime/eager:copy_to_device_node",
             "//tensorflow/core:core_cpu_internal",
             "//tensorflow/core:framework",
             "//tensorflow/core:framework_internal",
@@ -54,12 +62,17 @@ tf_cuda_library(
         ":runtime",
         "//tensorflow/c:c_api",
         "//tensorflow/c:c_api_internal",
+        "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:framework_lite",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core/common_runtime/eager:context",
+        "//tensorflow/core/common_runtime/eager:eager_executor",
+        "//tensorflow/core/common_runtime/eager:kernel_and_device",
+        "//tensorflow/core/common_runtime/eager:tensor_handle",
     ],
 )
 
@@ -94,6 +107,7 @@ tf_cuda_library(
         "//conditions:default": [
             "//tensorflow/c:c_api",
             "//tensorflow/core:core_cpu",
+            "//tensorflow/core/common_runtime/eager:kernel_and_device",
             "//tensorflow/core:core_cpu_internal",
             "//tensorflow/core:framework",
             "//tensorflow/core:framework_internal",
diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index 0811bd363f2aecedc94488b6ee87fac3f4b2af14..c96a38dec3ed7bcbbd77415ec3b158390def797e 100644
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -32,6 +32,9 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/device_set.h"
+#include "tensorflow/core/common_runtime/eager/copy_to_device_node.h"
+#include "tensorflow/core/common_runtime/eager/execute.h"
+#include "tensorflow/core/common_runtime/eager/execute_node.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
 #include "tensorflow/core/framework/node_def_util.h"
@@ -71,18 +74,6 @@ std::atomic_int_fast64_t func_id_generator(0);
 
 }  // namespace
 
-TFE_ContextDevicePlacementPolicy PlacementPolicy(
-    bool soft_placement, TFE_ContextDevicePlacementPolicy original_policy) {
-  if (!soft_placement) {
-    return original_policy;
-  }
-  if (original_policy == TFE_DEVICE_PLACEMENT_EXPLICIT ||
-      original_policy == TFE_DEVICE_PLACEMENT_SILENT_FOR_INT32) {
-    return TFE_DEVICE_PLACEMENT_SILENT;
-  }
-  return original_policy;
-}
-
 extern "C" {
 
 TFE_ContextOptions* TFE_NewContextOptions() { return new TFE_ContextOptions; }
@@ -104,19 +95,7 @@ void TFE_ContextOptionsSetDevicePlacementPolicy(
 TF_CAPI_EXPORT extern void TFE_ContextSetAsyncForThread(TFE_Context* ctx,
                                                         unsigned char async,
                                                         TF_Status* status) {
-  {
-    tensorflow::mutex_lock l(ctx->async_map_mu);
-    ctx->thread_local_async[std::this_thread::get_id()] = async;
-  }
-  if (async) {
-    ctx->executor.EnableAsync();
-  } else {
-    // TODO(agarwal): Currently we add a wait here to handle cases where a sync
-    // op has a control dependency on an async op, and the latter has not
-    // executed yet. This wait can be removed by storing all the control inputs
-    // and waiting for them when executing ops.
-    status->status = ctx->executor.WaitForAllPendingNodes();
-  }
+  status->status = ctx->context.SetAsyncForThread(async);
 }
 
 void TFE_DeleteContextOptions(TFE_ContextOptions* options) { delete options; }
@@ -133,60 +112,47 @@ TFE_Context* TFE_NewContext(const TFE_ContextOptions* opts, TF_Status* status) {
       new tensorflow::DeviceMgr(devices));
   tensorflow::Rendezvous* r =
       new tensorflow::IntraProcessRendezvous(device_mgr.get());
-  return new TFE_Context(*opts, std::move(device_mgr), r);
+  return new TFE_Context(opts->session_options.options, opts->policy,
+                         opts->async, std::move(device_mgr), r);
 }
 
 void TFE_DeleteContext(TFE_Context* ctx, TF_Status* status) {
-  status->status = ctx->executor.WaitForAllPendingNodes();
-  {
-    tensorflow::mutex_lock ml(ctx->cache_mu);
-    tensorflow::gtl::STLDeleteValues(&ctx->kernel_cache);
-  }
-  ctx->rendezvous->Unref();
   delete ctx;
 }
 
 TF_DeviceList* TFE_ContextListDevices(TFE_Context* ctx, TF_Status* status) {
   TF_DeviceList* list = new TF_DeviceList;
-  ctx->device_manager->ListDeviceAttributes(&list->response);
+  ctx->context.device_mgr()->ListDeviceAttributes(&list->response);
   return list;
 }
 
-void TFE_ContextClearCaches(TFE_Context* ctx) {
-  tensorflow::mutex_lock ml(ctx->cache_mu);
-  tensorflow::gtl::STLDeleteValues(&ctx->kernel_cache);
-}
+void TFE_ContextClearCaches(TFE_Context* ctx) { ctx->context.ClearCaches(); }
 
 void TFE_ContextSetThreadLocalDevicePlacementPolicy(
     TFE_Context* ctx, TFE_ContextDevicePlacementPolicy policy) {
-  tensorflow::mutex_lock ml(ctx->policy_map_mu);
-  ctx->thread_local_policies[std::this_thread::get_id()] = policy;
+  ctx->context.SetThreadLocalDevicePlacementPolicy(
+      static_cast<tensorflow::ContextDevicePlacementPolicy>(policy));
 }
 
 // Note: this function looks up a thread local policy. So it should be called in
 // the appropriate client thread. In particular, in async mode, it may not be
-// safe to call this function from the async TFE_Executor threads.
+// safe to call this function from the async EagerExecutor threads.
 extern TFE_ContextDevicePlacementPolicy TFE_ContextGetDevicePlacementPolicy(
     TFE_Context* ctx) {
-  tensorflow::mutex_lock ml(ctx->policy_map_mu);
-  auto policy_map_it =
-      ctx->thread_local_policies.find(std::this_thread::get_id());
-  if (policy_map_it != ctx->thread_local_policies.end()) {
-    return policy_map_it->second;
-  }
-  return ctx->policy;
+  return static_cast<TFE_ContextDevicePlacementPolicy>(
+      ctx->context.GetDevicePlacementPolicy());
 }
 
 void TFE_ContextAsyncWait(TFE_Context* ctx, TF_Status* status) {
-  status->status = ctx->executor.WaitForAllPendingNodes();
+  status->status = ctx->context.AsyncWait();
 }
 
 void TFE_ContextGetStatus(TFE_Context* ctx, TF_Status* status) {
-  status->status = ctx->executor.status();
+  status->status = ctx->context.GetStatus();
 }
 
 void TFE_ContextAsyncClearError(TFE_Context* ctx) {
-  ctx->executor.ClearError();
+  ctx->context.ClearAsyncError();
 }
 
 TFE_TensorHandle* TFE_NewTensorHandle(TF_Tensor* t, TF_Status* status) {
@@ -198,29 +164,32 @@ TFE_TensorHandle* TFE_NewTensorHandle(TF_Tensor* t, TF_Status* status) {
 
 void TFE_DeleteTensorHandle(TFE_TensorHandle* h) {
   DCHECK(h);
-  h->Unref();
+  if (h->handle) {
+    h->handle->Unref();
+  }
+  delete h;
 }
 
 TF_DataType TFE_TensorHandleDataType(TFE_TensorHandle* h) {
-  return static_cast<TF_DataType>(h->dtype);
+  return static_cast<TF_DataType>(h->handle->dtype);
 }
 
 int TFE_TensorHandleNumDims(TFE_TensorHandle* h, TF_Status* status) {
   const tensorflow::Tensor* t = nullptr;
-  status->status = h->Tensor(&t);
+  status->status = h->handle->Tensor(&t);
   return t == nullptr ? 0 : t->dims();
 }
 
 int64_t TFE_TensorHandleDim(TFE_TensorHandle* h, int dim_index,
                             TF_Status* status) {
   const tensorflow::Tensor* t = nullptr;
-  status->status = h->Tensor(&t);
+  status->status = h->handle->Tensor(&t);
   return t == nullptr ? 0 : t->dim_size(dim_index);
 }
 
 const char* TFE_TensorHandleDeviceName(TFE_TensorHandle* h, TF_Status* status) {
   tensorflow::Device* d = nullptr;
-  status->status = h->OpDevice(&d);
+  status->status = h->handle->OpDevice(&d);
   return (d == nullptr) ? "/job:localhost/replica:0/task:0/device:CPU:0"
                         : d->name().c_str();
 }
@@ -230,98 +199,28 @@ TF_Tensor* TFE_TensorHandleResolve(TFE_TensorHandle* h, TF_Status* status) {
   tensorflow::Device* d = nullptr;
   tensorflow::Device* op_device = nullptr;
   const tensorflow::Tensor* t = nullptr;
-  status->status = h->TensorAndDevice(&t, &d, &op_device);
+  status->status = h->handle->TensorAndDevice(&t, &d, &op_device);
   if (!status->status.ok()) return nullptr;
+  tensorflow::TensorHandle* h_cpu = nullptr;
   if (!IsCPU(d)) {
-    TF_SetStatus(status, TF_UNIMPLEMENTED,
-                 tensorflow::strings::StrCat(
-                     "TFE_TensorHandle can be resolved iff it is on CPU (this "
-                     "handle is on ",
-                     d->name(),
-                     "). Consider using TFE_TensorHandleCopyToDevice to get a "
-                     "copy of the tensor on CPU")
-                     .c_str());
-    return nullptr;
-  }
-  return tensorflow::TF_TensorFromTensor(*t, status);
-}
-}  // extern "C"
-
-namespace {
-
-tensorflow::Status TensorHandleCopyToDevice(TFE_TensorHandle* h,
-                                            TFE_Context* ctx,
-                                            tensorflow::Device* dstd,
-                                            TFE_TensorHandle** output) {
-  const tensorflow::Tensor* src = nullptr;
-  tensorflow::Device* srcd = nullptr;
-  // TODO(agarwal): src_opd is unused. Perhaps allow TensorAndDevice to accept
-  // nullptr.
-  tensorflow::Device* src_opd = nullptr;
-  TF_RETURN_IF_ERROR(h->TensorAndDevice(&src, &srcd, &src_opd));
-  if (srcd == nullptr) srcd = ctx->devices[0];
-  bool is_same_device =
-      (srcd == dstd) || (DeviceName(srcd) == DeviceName(dstd));
-  const bool dst_cpu = IsCPU(dstd);
-  const bool src_cpu = IsCPU(srcd);
-  // both_on_cpu can be true and yet is_same_device is false, if one of src/dst
-  // has device type XLA_CPU, and the other CPU.
-  const bool both_on_cpu = src_cpu && dst_cpu;
-  if (is_same_device || both_on_cpu) {
-    dstd = dst_cpu ? nullptr : dstd;
-    *output = new TFE_TensorHandle(*src, dstd, dstd);
-    return tensorflow::Status::OK();
-  }
-  if (!dst_cpu && (src->dtype() != tensorflow::DT_VARIANT &&
-                   !tensorflow::DataTypeCanUseMemcpy(src->dtype()))) {
-    return tensorflow::errors::InvalidArgument(
-        "Can't copy Tensor with type ",
-        tensorflow::DataTypeString(src->dtype()), " to device ",
-        DeviceName(dstd), ".");
-  }
-  tensorflow::AllocatorAttributes attr;
-  if (src->dtype() == tensorflow::DT_VARIANT) {
-    attr.set_on_host(true);
-  }
-  tensorflow::Tensor dst(dstd->GetAllocator(attr), src->dtype(), src->shape());
-  if (src->shape().num_elements() == 0) {
-    dstd = dst_cpu ? nullptr : dstd;
-    *output = new TFE_TensorHandle(dst, dstd, dstd);
-    return tensorflow::Status::OK();
-  }
-  tensorflow::DeviceContext* src_device_context = nullptr;
-  if (!src_cpu) {
-    src_device_context = srcd->tensorflow_gpu_device_info()->default_context;
-  }
-  tensorflow::DeviceContext* dst_device_context = nullptr;
-  if (!dst_cpu) {
-    dst_device_context = dstd->tensorflow_gpu_device_info()->default_context;
+    status->status = h->handle->CopyToDevice(
+        h->handle->Context(), h->handle->Context()->HostCPU(), &h_cpu);
+    if (!status->status.ok()) {
+      return nullptr;
+    }
+    status->status = h_cpu->TensorAndDevice(&t, &d, &op_device);
+    if (!status->status.ok()) {
+      h_cpu->Unref();
+      return nullptr;
+    }
   }
-  // TODO(ashankar): The Sync() call below may be more aggressive than
-  // necessary. It is based on knowledge of implementation details - that
-  // GPU devices are implemented using 3 streams - one for host->device copies,
-  // one for device->host copies and one for sending operations to the GPU.
-  // With that setup, Sync()ing across all 3 streams should be sufficient
-  // but more than necessary (since it waits for operations that might have
-  // nothing to do with this tensor to complete).
-  TF_RETURN_IF_ERROR(srcd->Sync());
-  tensorflow::Notification n;
-  tensorflow::Status status;
-  tensorflow::CopyTensor::ViaDMA("copy", src_device_context, dst_device_context,
-                                 srcd, dstd, tensorflow::AllocatorAttributes(),
-                                 tensorflow::AllocatorAttributes(), src, &dst,
-                                 [&status, &n](const tensorflow::Status& s) {
-                                   status = s;
-                                   n.Notify();
-                                 });
-  n.WaitForNotification();
-  if (status.ok()) {
-    dstd = dst_cpu ? nullptr : dstd;
-    *output = new TFE_TensorHandle(dst, dstd, dstd);
+  TF_Tensor* retval = tensorflow::TF_TensorFromTensor(*t, status);
+  if (h_cpu != nullptr) {
+    h_cpu->Unref();
   }
-  return status;
+  return retval;
 }
-}  // namespace
+}  // extern "C"
 
 extern "C" {
 
@@ -332,8 +231,7 @@ TFE_Op* TFE_NewOp(TFE_Context* ctx, const char* op_or_function_name,
   status->status = tensorflow::AttrTypeMapForOp(name, &types);
   if (status->status.ok()) return new TFE_Op(ctx, name, types);
   if (TF_GetCode(status) == TF_NOT_FOUND) {
-    tensorflow::mutex_lock l(ctx->functions_mu);
-    if (ctx->func_lib_def.Find(name) != nullptr) {
+    if (ctx->context.FindFunctionByName(name)) {
       status->status = tensorflow::Status::OK();
       return new TFE_Op(ctx, name, nullptr);
     }
@@ -346,15 +244,14 @@ void TFE_DeleteOp(TFE_Op* op) { delete op; }
 void TFE_OpSetDevice(TFE_Op* op, const char* device_name, TF_Status* status) {
   tensorflow::Device* d = nullptr;
   if (device_name != nullptr && strlen(device_name) > 0) {
-    status->status = op->ctx->device_manager->LookupDevice(device_name, &d);
-    if (!status->status.ok()) return;
+    status->status = op->ctx->context.FindDeviceByName(device_name, &d);
   }
   op->device = d;
 }
 
 const char* TFE_OpGetDevice(TFE_Op* op, TF_Status* status) {
   tensorflow::Device* device =
-      (op->device == nullptr) ? op->ctx->devices[0] : op->device;
+      (op->device == nullptr) ? op->ctx->context.HostCPU() : op->device;
   return device->name().c_str();
 }
 
@@ -367,19 +264,8 @@ void TFE_OpSetXLACompilation(TFE_Op* op, unsigned char enable) {
 }
 
 void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status) {
-  if (op->device == nullptr) {
-    // Questionable heuristic ...
-    // - If a device was explicitly set on the op, always use that.
-    // - If not, place on the first non-host device seen.
-    tensorflow::Device* d = nullptr;
-    // TODO(agarwal): This call may block if h is not ready. Avoid this if
-    // possible.
-    status->status = h->Device(&d);
-    if (!status->status.ok()) return;
-    if (!IsCPU(d)) op->device = d;
-  }
-  h->Ref();
-  op->inputs.push_back(h);
+  h->handle->Ref();
+  op->inputs.push_back(h->handle);
   op->attrs.NumInputs(op->inputs.size());
 }
 
@@ -545,10 +431,39 @@ void TFE_OpSetAttrFunctionList(TFE_Op* op, const char* attr_name,
 
 namespace {
 
+// Initializes the step stats if needed.
+void MaybeInitializeStepStats(tensorflow::StepStats* step_stats,
+                              tensorflow::EagerContext* ctx) {
+  // Lazily initialize the RunMetadata with information about all devices if
+  // this is the first call.
+  while (step_stats->dev_stats_size() < ctx->devices()->size()) {
+    int device_idx = step_stats->dev_stats_size();
+    auto* dev_stats = step_stats->add_dev_stats();
+    dev_stats->set_device(ctx->devices()->at(device_idx)->name());
+  }
+}
+
+int StepStatsDeviceIndex(tensorflow::StepStats* step_stats,
+                         tensorflow::EagerContext* ctx,
+                         tensorflow::Device* device) {
+  // Find the current device's index.
+  if (device == nullptr) {
+    device = ctx->HostCPU();
+  }
+  for (int i = 0; i < ctx->devices()->size(); ++i) {
+    if (ctx->devices()->at(i) == device ||
+        ctx->devices()->at(i)->name() == device->name()) {
+      return i;
+    }
+  }
+  // TODO(apassos) do not fall back to host CPU if device is unknown.
+  return 0;
+}
+
 tensorflow::Status ValidateInputTypeAndPlacement(
-    TFE_Context* ctx, tensorflow::Device* host_device,
-    tensorflow::Device* op_device, TFE_Op* op,
-    const tensorflow::OpKernel* kernel) {
+    tensorflow::EagerContext* ctx, tensorflow::Device* op_device, TFE_Op* op,
+    const tensorflow::OpKernel* kernel, tensorflow::RunMetadata* run_metadata) {
+  tensorflow::Device* host_device = ctx->HostCPU();
   const tensorflow::MemoryTypeVector& memtypes = kernel->input_memory_types();
   if (memtypes.size() != op->inputs.size()) {
     return tensorflow::errors::InvalidArgument(
@@ -557,14 +472,14 @@ tensorflow::Status ValidateInputTypeAndPlacement(
   for (int i = 0; i < op->inputs.size(); ++i) {
     const tensorflow::Device* expected_device =
         memtypes[i] == tensorflow::HOST_MEMORY ? host_device : op_device;
-    TFE_TensorHandle* handle = op->inputs[i];
+    tensorflow::TensorHandle* handle = op->inputs[i];
     tensorflow::Device* handle_device = nullptr;
     TF_RETURN_IF_ERROR(handle->Device(&handle_device));
     const tensorflow::Device* actual_device =
         handle_device == nullptr ? host_device : handle_device;
     if (expected_device != actual_device) {
-      switch (TFE_ContextGetDevicePlacementPolicy(ctx)) {
-        case TFE_DEVICE_PLACEMENT_SILENT_FOR_INT32:
+      switch (ctx->GetDevicePlacementPolicy()) {
+        case tensorflow::DEVICE_PLACEMENT_SILENT_FOR_INT32:
           // TODO(xpan): See if we could bubble python related error up
           // to python level.
           if (handle->dtype == tensorflow::DT_INT32) {
@@ -573,7 +488,7 @@ tensorflow::Status ValidateInputTypeAndPlacement(
             break;
           }
           TF_FALLTHROUGH_INTENDED;
-        case TFE_DEVICE_PLACEMENT_EXPLICIT:
+        case tensorflow::DEVICE_PLACEMENT_EXPLICIT:
           return tensorflow::errors::InvalidArgument(
               "Tensors on conflicting devices:"
               " cannot compute ",
@@ -581,11 +496,13 @@ tensorflow::Status ValidateInputTypeAndPlacement(
               expected_device->name(), " but is actually on ",
               actual_device->name(), " (operation running on ",
               op_device->name(), ")",
-              " Tensors can be copied explicitly using .gpu() or .cpu(),"
-              " or transparently copied by using tfe.enable_eager_execution("
-              "tfe.DEVICE_PLACEMENT_SILENT). Copying tensors between devices"
+              " Tensors can be copied explicitly using .gpu() or .cpu() "
+              "methods,"
+              " or transparently copied by using tf.enable_eager_execution("
+              "device_policy=tfe.DEVICE_PLACEMENT_SILENT). Copying tensors "
+              "between devices"
               " may slow down your model");
-        case TFE_DEVICE_PLACEMENT_WARN:
+        case tensorflow::DEVICE_PLACEMENT_WARN:
           LOG(WARNING) << "before computing " << op->name << " input #" << i
                        << " was expected to be on " << expected_device->name()
                        << " but is actually on " << actual_device->name()
@@ -593,16 +510,27 @@ tensorflow::Status ValidateInputTypeAndPlacement(
                        << "). This triggers a copy which can be a performance "
                           "bottleneck.";
           break;
-        case TFE_DEVICE_PLACEMENT_SILENT:  // Do nothing.
+        case tensorflow::DEVICE_PLACEMENT_SILENT:  // Do nothing.
           break;
       }
       // We are only here if the policy is warn or silent copies, so we should
       // trigger a copy.
-      TF_Status* s = TF_NewStatus();
-      TFE_TensorHandle* copied_tensor = TFE_TensorHandleCopyToDevice(
-          handle, ctx, expected_device->name().c_str(), s);
-      tensorflow::Status status = s->status;
-      TF_DeleteStatus(s);
+      auto pre_time = tensorflow::Env::Default()->NowMicros();
+      tensorflow::TensorHandle* copied_tensor = nullptr;
+      tensorflow::Status status = tensorflow::EagerCopyToDevice(
+          handle, ctx, expected_device->name().c_str(), &copied_tensor);
+      if (run_metadata != nullptr) {
+        auto* step_stats = run_metadata->mutable_step_stats();
+        MaybeInitializeStepStats(step_stats, ctx);
+        // Record the sending on the source device for now.
+        int device_idx = StepStatsDeviceIndex(step_stats, ctx, handle_device);
+        auto* dev_stats = step_stats->mutable_dev_stats(device_idx);
+        auto* node_stats = dev_stats->add_node_stats();
+        node_stats->set_node_name("_Send");
+        node_stats->set_all_start_micros(pre_time);
+        node_stats->set_op_end_rel_micros(
+            tensorflow::Env::Default()->NowMicros() - pre_time);
+      }
       if (!status.ok()) {
         if (copied_tensor != nullptr) copied_tensor->Unref();
         return tensorflow::errors::Internal(
@@ -629,7 +557,7 @@ tensorflow::Status ValidateInputTypeAndPlacement(
 tensorflow::Device* SelectDevice(const tensorflow::NodeDef& ndef,
                                  TFE_Context* ctx, TF_Status* status) {
   tensorflow::DeviceSet ds;
-  for (tensorflow::Device* d : ctx->devices) {
+  for (tensorflow::Device* d : *ctx->context.devices()) {
     ds.AddDevice(d);
   }
   tensorflow::DeviceTypeVector final_devices;
@@ -643,7 +571,7 @@ tensorflow::Device* SelectDevice(const tensorflow::NodeDef& ndef,
         "Could not find valid device for node ", ndef.DebugString());
     return nullptr;
   }
-  for (tensorflow::Device* d : ctx->devices) {
+  for (tensorflow::Device* d : *ctx->context.devices()) {
     if (d->device_type() == final_devices[0].type_string()) {
       return d;
     }
@@ -653,186 +581,6 @@ tensorflow::Device* SelectDevice(const tensorflow::NodeDef& ndef,
   return nullptr;
 }
 
-tensorflow::Status Execute(
-    TFE_Context* ctx, tensorflow::Device* device,
-    const tensorflow::gtl::InlinedVector<TFE_TensorHandle*, 4>& op_inputs,
-    tensorflow::KernelAndDevice* kernel, tensorflow::NodeExecStats* maybe_stats,
-    TFE_TensorHandle** retvals, int num_retvals) {
-  if (!ctx->soft_placement && device == nullptr) {
-    // TODO(ashankar): ASSUMPTION: ctx->devices[0] is always CPU
-    device = ctx->devices[0];
-  }
-
-  if (device == nullptr) {
-    // TODO(apassos) debug how the assignment below might return a different
-    // device from the one requested above.
-    device = kernel->device();
-  }
-
-  std::vector<tensorflow::Tensor> outputs(1);
-  const tensorflow::MemoryTypeVector* output_memory_types = nullptr;
-  output_memory_types = &kernel->kernel()->output_memory_types();
-  std::vector<tensorflow::Tensor> inputs(op_inputs.size());
-  for (int i = 0; i < op_inputs.size(); ++i) {
-    const tensorflow::Tensor* input_tensor = nullptr;
-    TF_RETURN_IF_ERROR(op_inputs[i]->Tensor(&input_tensor));
-    inputs[i] = *input_tensor;
-  }
-  // WARNING: kernel->Run utilizes the FunctionLibraryRuntime
-  // (ctx->func_lib(device)), which in turn holds a pointer to func_lib_def,
-  // which is GUARDED_BY(ctx->functions_mu). But knowledge of the implementation
-  // of FunctionLibraryRuntime tells us that func_lib_def is not accessed by
-  // FunctionLibraryRuntime::Run(), so there is no thread-safety concern here.
-  // This is quite subtle. Re-work things to make this better?  (Would it make
-  // sense for FunctionLibraryRuntime to ensure thread-safe access to
-  // FunctionLibraryDefinition?).  TODO(apassos) figure out how to record stats
-  // for ops which are a part of functions.
-  // TODO(agarwal): change Run to take vector of handles ?
-  TF_RETURN_IF_ERROR(kernel->Run(&inputs, &outputs, maybe_stats));
-  if (maybe_stats != nullptr) {
-    maybe_stats->set_op_end_rel_micros(tensorflow::Env::Default()->NowMicros() -
-                                       maybe_stats->all_start_micros());
-    tensorflow::mutex_lock ml(ctx->metadata_mu);
-    if (ctx->should_store_metadata.load()) {
-      auto* step_stats = ctx->run_metadata.mutable_step_stats();
-      // Lazily initialize the RunMetadata with information about all devices if
-      // this is the first call.
-      while (step_stats->dev_stats_size() < ctx->devices.size()) {
-        step_stats->add_dev_stats();
-      }
-      // Find the current device's index.
-      int device_idx = 0;
-      for (int i = 0; i < ctx->devices.size(); ++i) {
-        if (ctx->devices[i] == device) {
-          device_idx = i;
-          break;
-        }
-      }
-      // Populate the device stats for this device.
-      auto* dev_stats = step_stats->mutable_dev_stats(device_idx);
-      dev_stats->set_device(device->name());
-      *dev_stats->add_node_stats() = *maybe_stats;
-    }
-  }
-  DCHECK_EQ(num_retvals, outputs.size());
-  tensorflow::Device* op_device = IsCPU(device) ? nullptr : device;
-  for (int i = 0; i < num_retvals; ++i) {
-    tensorflow::Device* d = op_device;
-    if (d != nullptr && output_memory_types != nullptr &&
-        (*output_memory_types)[i] == tensorflow::HOST_MEMORY) {
-      d = nullptr;
-    }
-    if (retvals[i] == nullptr) {
-      retvals[i] = new TFE_TensorHandle(outputs[i], d, op_device);
-    } else {
-      retvals[i]->SetTensorAndDevice(outputs[i], d, op_device);
-    }
-  }
-  return tensorflow::Status::OK();
-}
-
-// TODO(agarwal): move TFE_Executor and TFE_Node related code to a separate
-// file.
-class ExecuteNode : public TFE_Node {
- public:
-  ExecuteNode(TFE_Op* op, tensorflow::KernelAndDevice* kernel,
-              tensorflow::NodeExecStats* maybe_stats,
-              const tensorflow::DataTypeVector& output_dtypes,
-              TFE_TensorHandle** retvals, int num_retvals)
-      : TFE_Node(op->ctx->executor.NextId()),
-        ctx_(op->ctx),
-        op_device_(op->device),
-        inputs_(op->inputs),
-        kernel_(kernel),
-        maybe_stats_(maybe_stats),
-        retvals_(num_retvals) {
-    for (auto handle : inputs_) {
-      handle->Ref();
-    }
-    TFE_Context* ctx = op->ctx;
-    for (int i = 0; i < num_retvals; ++i) {
-      TFE_TensorHandle* h = new TFE_TensorHandle(id, output_dtypes[i], ctx);
-      h->Ref();
-      retvals[i] = h;
-      retvals_[i] = h;
-    }
-  }
-
-  ~ExecuteNode() override {
-    for (auto handle : inputs_) {
-      handle->Unref();
-    }
-    for (auto handle : retvals_) {
-      handle->Unref();
-    }
-  }
-
-  tensorflow::Status Run() override {
-    const tensorflow::Status status =
-        Execute(ctx_, op_device_, inputs_, kernel_, maybe_stats_.get(),
-                retvals_.begin(), retvals_.size());
-    if (status.ok()) {
-      return status;
-    } else {
-      return tensorflow::Status(
-          status.code(),
-          tensorflow::strings::StrCat("Got error, \"", status.error_message(),
-                                      "\" while executing kernel ",
-                                      kernel_->kernel()->def().DebugString()));
-    }
-  }
-
- private:
-  TFE_Context* ctx_;
-  tensorflow::Device* op_device_;
-  tensorflow::gtl::InlinedVector<TFE_TensorHandle*, 4> inputs_;
-  tensorflow::KernelAndDevice* kernel_;
-  std::unique_ptr<tensorflow::NodeExecStats> maybe_stats_;
-  tensorflow::gtl::InlinedVector<TFE_TensorHandle*, 2> retvals_;
-};
-
-class CopyToDeviceNode : public TFE_Node {
- public:
-  CopyToDeviceNode(TFE_TensorHandle* src, tensorflow::Device* dstd,
-                   TFE_Context* ctx)
-      : TFE_Node(ctx->executor.NextId()),
-        src_(src),
-        dstd_(dstd),
-        ctx_(ctx),
-        dst_(new TFE_TensorHandle(id, src_->dtype, ctx)) {
-    src_->Ref();
-    dst_->Ref();
-  }
-
-  ~CopyToDeviceNode() override {
-    src_->Unref();
-    dst_->Unref();
-  }
-
-  tensorflow::Status Run() override {
-    TFE_TensorHandle* temp = nullptr;
-    TF_RETURN_IF_ERROR(TensorHandleCopyToDevice(src_, ctx_, dstd_, &temp));
-    const tensorflow::Tensor* tensor = nullptr;
-    tensorflow::Device* device = nullptr;
-    tensorflow::Device* op_device = nullptr;
-    tensorflow::Status status =
-        temp->TensorAndDevice(&tensor, &device, &op_device);
-    // `temp` is a ready handle. So the following call should return OK.
-    TF_DCHECK_OK(status) << status.error_message();
-    DCHECK(tensor);
-    dst_->SetTensorAndDevice(*tensor, device, op_device);
-    temp->Unref();
-    return tensorflow::Status::OK();
-  }
-
-  TFE_TensorHandle* dst() { return dst_; }
-
- private:
-  TFE_TensorHandle* src_;
-  tensorflow::Device* dstd_;
-  TFE_Context* ctx_;
-  TFE_TensorHandle* dst_;
-};
 
 #ifdef TENSORFLOW_EAGER_USE_XLA
 // Synthesizes and returns a wrapper function over `op`, which must be a
@@ -861,8 +609,7 @@ const tensorflow::FunctionDef* OpToFunction(
   TFE_Context* ctx = op->ctx;
   const tensorflow::OpRegistrationData* op_data;
   {
-    tensorflow::tf_shared_lock l(ctx->functions_mu);
-    status->status = ctx->func_lib_def.LookUp(op->name, &op_data);
+    status->status = ctx->context.FindFunctionOpData(op->name, &op_data);
     if (!status->status.ok()) {
       return nullptr;
     }
@@ -958,10 +705,9 @@ const tensorflow::FunctionDef* OpToFunction(
   }
   VLOG(1) << "Fixed Output names and all types: " << fdef.DebugString();
 
-  tensorflow::mutex_lock l(ctx->functions_mu);
-  status->status = ctx->func_lib_def.AddFunctionDef(fdef);
+  status->status = ctx->context.AddFunctionDef(fdef);
   if (!status->status.ok()) return nullptr;
-  const auto ret = ctx->func_lib_def.Find(signature->name());
+  const auto ret = ctx->context.FindFunctionDef(signature->name());
   DCHECK(ret != nullptr);
   return ret;
 }
@@ -980,8 +726,7 @@ std::unique_ptr<TFE_Op> BuildXlaLaunch(TFE_Op* op, TF_Status* status) {
 
   const tensorflow::FunctionDef* fdef;
   {
-    tensorflow::tf_shared_lock l(op->ctx->functions_mu);
-    fdef = op->ctx->func_lib_def.Find(op->name);
+    fdef = op->ctx->context.FindFunctionDef(op->name);
   }
   std::vector<TF_DataType> const_input_types;
   std::vector<TF_DataType> arg_input_types;
@@ -1008,7 +753,7 @@ std::unique_ptr<TFE_Op> BuildXlaLaunch(TFE_Op* op, TF_Status* status) {
   // Since input param reordering may have occurred between `op` and `launch_op`
   // via `op_input_to_func_input`, adjust the actual inputs accordingly.
   launch_op->inputs = op->inputs;
-  for (TFE_TensorHandle* h : launch_op->inputs) {
+  for (tensorflow::TensorHandle* h : launch_op->inputs) {
     h->Ref();
   }
   if (!op_input_to_func_input.empty()) {
@@ -1058,7 +803,7 @@ extern "C" {
 void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
                  TF_Status* status) {
   TFE_Context* ctx = op->ctx;
-  status->status = ctx->executor.status();
+  status->status = ctx->context.GetStatus();
   if (!status->status.ok()) {
     return;
   }
@@ -1079,10 +824,14 @@ void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
     tensorflow::Device* input_op_device = nullptr;
     status->status = op->inputs[i]->OpDevice(&input_op_device);
     if (!status->status.ok()) return;
+    VLOG(2) << "for op " << op->name << " input " << i << " "
+            << tensorflow::DataTypeString(op->inputs[i]->dtype) << " "
+            << (input_op_device == nullptr ? "cpu" : input_op_device->name())
+            << " " << (op->device == nullptr ? "cpu" : op->device->name());
     if (op->inputs[i]->dtype == tensorflow::DT_RESOURCE &&
-        input_op_device != op->device) {
+        (input_op_device != op->device || input_op_device == nullptr)) {
       tensorflow::Device* d =
-          input_op_device == nullptr ? ctx->devices[0] : input_op_device;
+          input_op_device == nullptr ? ctx->context.HostCPU() : input_op_device;
       VLOG(1) << "Changing device of operation " << op->name << " to "
               << d->name() << " because input #" << i
               << " is a resource in this device.";
@@ -1090,40 +839,32 @@ void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
     }
   }
   tensorflow::Device* device = op->device;
-  if (!ctx->soft_placement && device == nullptr) {
-    // TODO(ashankar): ASSUMPTION: ctx->devices[0] is always CPU
-    device = ctx->devices[0];
-  }
 
   tensorflow::Fprint128 cache_key =
       op->attrs.CacheKey(device == nullptr ? "unspecified" : device->name());
-  tensorflow::KernelAndDevice* kernel;
-  {
-    tensorflow::tf_shared_lock l(ctx->cache_mu);
-    kernel = tensorflow::gtl::FindPtrOrNull(ctx->kernel_cache, cache_key);
-  }
+  tensorflow::KernelAndDevice* kernel = ctx->context.GetCachedKernel(cache_key);
   if (kernel == nullptr) {
     const tensorflow::NodeDef& ndef = op->attrs.BuildNodeDef();
-    if (ctx->soft_placement && device == nullptr) {
+    if (device == nullptr) {
       device = SelectDevice(ndef, ctx, status);
       if (!status->status.ok()) {
         return;
       }
     }
     CHECK(device != nullptr);
-    if (ctx->log_device_placement) {
+    if (ctx->context.LogDevicePlacement()) {
       LOG(INFO) << "Executing op " << ndef.op() << " in device "
                 << device->name();
     }
-    kernel = new tensorflow::KernelAndDevice(ctx->rendezvous);
+    kernel = new tensorflow::KernelAndDevice(ctx->context.GetRendezvous());
     // Knowledge of the implementation of Init (and in-turn
     // FunctionLibraryRuntime::CreateKernel) tells us that ctx->func_lib_def
     // will be accessed, so grab on to the lock.
     // See WARNING comment in Execute (before kernel->Run) - would be nice to
     // rework to avoid this subtlety.
-    tensorflow::tf_shared_lock l(ctx->functions_mu);
-    status->status =
-        tensorflow::KernelAndDevice::Init(ndef, ctx->func_lib(device), kernel);
+    tensorflow::tf_shared_lock l(*ctx->context.FunctionsMu());
+    status->status = tensorflow::KernelAndDevice::Init(
+        ndef, ctx->context.func_lib(device), kernel);
     if (!status->status.ok()) {
       delete kernel;
       return;
@@ -1131,7 +872,7 @@ void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
     // Update output_dtypes inside `kernel`.
     const tensorflow::OpDef* op_def = nullptr;
     const tensorflow::FunctionDef* function_def =
-        ctx->func_lib_def.Find(ndef.op());
+        ctx->context.FuncLibDef()->Find(ndef.op());
     if (function_def != nullptr) {
       op_def = &(function_def->signature());
     }
@@ -1147,8 +888,7 @@ void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
     if (!status->status.ok()) {
       return;
     }
-    tensorflow::mutex_lock ml(ctx->cache_mu);
-    tensorflow::gtl::InsertOrUpdate(&(ctx->kernel_cache), cache_key, kernel);
+    ctx->context.AddKernelToCache(cache_key, kernel);
   }
   const tensorflow::DataTypeVector& output_dtypes = kernel->output_dtypes();
   const int output_dtypes_size = output_dtypes.size();
@@ -1166,11 +906,13 @@ void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
     // device from the one requested above.
     device = kernel->device();
   }
-  status->status = ValidateInputTypeAndPlacement(ctx, ctx->devices[0], device,
-                                                 op, kernel->kernel());
+  status->status = ValidateInputTypeAndPlacement(
+      &ctx->context, device, op, kernel->kernel(),
+      ctx->context.ShouldStoreMetadata() ? ctx->context.RunMetadataProto()
+                                         : nullptr);
   if (!status->status.ok()) return;
   std::unique_ptr<tensorflow::NodeExecStats> maybe_stats;
-  if (ctx->should_store_metadata.load()) {
+  if (ctx->context.ShouldStoreMetadata()) {
     maybe_stats.reset(new tensorflow::NodeExecStats);
     maybe_stats->set_node_name(op->name);
     maybe_stats->set_all_start_micros(tensorflow::Env::Default()->NowMicros());
@@ -1178,21 +920,34 @@ void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
     maybe_stats->set_scheduled_micros(tensorflow::Env::Default()->NowMicros());
     // TODO(apassos) track referenced tensors
   }
-  if (ctx->Async()) {
+  if (ctx->context.Async()) {
     // Note that for async mode, execution order will make sure that all
     // input handles are ready before executing them.
     // TODO(agarwal): Consider executing "cheap" kernels inline for performance.
-    TFE_Node* node = new ExecuteNode(op, kernel, maybe_stats.release(),
-                                     output_dtypes, retvals, *num_retvals);
-    ctx->executor.Add(node);
+    tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2> handle_retvals(
+        *num_retvals);
+    tensorflow::uint64 id = op->ctx->context.NextId();
+    for (int i = 0; i < *num_retvals; ++i) {
+      tensorflow::TensorHandle* h =
+          new tensorflow::TensorHandle(id, output_dtypes[i], &op->ctx->context);
+      retvals[i] = new TFE_TensorHandle(h);
+      handle_retvals[i] = h;
+    }
+    tensorflow::EagerNode* node = new tensorflow::ExecuteNode(
+        id, &op->ctx->context, op->device, op->inputs, kernel,
+        maybe_stats.release(), output_dtypes, handle_retvals);
+    ctx->context.ExecutorAdd(node);
   } else {
     // Execute checks if retvals[i] is nullptr or not to figure if it needs to
     // allocate it.
+    std::vector<tensorflow::TensorHandle*> handle_retvals(*num_retvals,
+                                                          nullptr);
+    status->status = tensorflow::EagerExecute(
+        &op->ctx->context, op->device, op->inputs, kernel, maybe_stats.get(),
+        handle_retvals.data(), *num_retvals);
     for (int i = 0; i < *num_retvals; ++i) {
-      retvals[i] = nullptr;
+      retvals[i] = new TFE_TensorHandle(handle_retvals[i]);
     }
-    status->status = Execute(op->ctx, op->device, op->inputs, kernel,
-                             maybe_stats.get(), retvals, *num_retvals);
   }
 }
 
@@ -1200,26 +955,13 @@ TFE_TensorHandle* TFE_TensorHandleCopyToDevice(TFE_TensorHandle* h,
                                                TFE_Context* ctx,
                                                const char* device_name,
                                                TF_Status* status) {
-  status->status = ctx->executor.status();
-  if (!status->status.ok()) {
-    return nullptr;
-  }
-  tensorflow::Device* dstd = ctx->devices[0];
-  if (device_name != nullptr && strlen(device_name) > 0) {
-    status->status = ctx->device_manager->LookupDevice(device_name, &dstd);
-    if (!status->status.ok()) return nullptr;
-  }
-  if (ctx->Async()) {
-    // Note that `h` may not be currently ready. However execution order will
-    // make sure that `h` is ready before the copy is actually done.
-    CopyToDeviceNode* node = new CopyToDeviceNode(h, dstd, ctx);
-    ctx->executor.Add(node);
-    return node->dst();
-  } else {
-    TFE_TensorHandle* output = nullptr;
-    status->status = TensorHandleCopyToDevice(h, ctx, dstd, &output);
-    return output;
+  tensorflow::TensorHandle* handle;
+  status->status = tensorflow::EagerCopyToDevice(h->handle, &ctx->context,
+                                                 device_name, &handle);
+  if (status->status.ok()) {
+    return new TFE_TensorHandle(handle);
   }
+  return nullptr;
 }
 
 void TFE_ContextAddFunctionDef(TFE_Context* ctx,
@@ -1231,24 +973,20 @@ void TFE_ContextAddFunctionDef(TFE_Context* ctx,
         tensorflow::errors::InvalidArgument("Invalid FunctionDef proto");
     return;
   }
-  tensorflow::mutex_lock l(ctx->functions_mu);
-  status->status = ctx->func_lib_def.AddFunctionDef(function_def);
+  status->status = ctx->context.AddFunctionDef(function_def);
 }
 
 void TFE_ContextAddFunction(TFE_Context* ctx, TF_Function* function,
                             TF_Status* status) {
-  tensorflow::mutex_lock l(ctx->functions_mu);
-  status->status = ctx->func_lib_def.AddFunctionDef(function->fdef);
+  status->status = ctx->context.AddFunctionDef(function->fdef);
 }
 
 void TFE_ContextEnableRunMetadata(TFE_Context* ctx) {
-  ctx->should_store_metadata.store(true);
+  ctx->context.SetShouldStoreMetadata(true);
 }
 
 void TFE_ContextDisableRunMetadata(TFE_Context* ctx) {
-  tensorflow::mutex_lock ml(ctx->metadata_mu);
-  ctx->should_store_metadata.store(false);
-  ctx->run_metadata.Clear();
+  ctx->context.SetShouldStoreMetadata(false);
 }
 
 }  // extern "C"
@@ -1262,7 +1000,7 @@ const tensorflow::Tensor* TFE_TensorHandleUnderlyingTensorInHostMemory(
   tensorflow::Device* d = nullptr;
   tensorflow::Device* op_device = nullptr;
   const tensorflow::Tensor* t = nullptr;
-  status->status = h->TensorAndDevice(&t, &d, &op_device);
+  status->status = h->handle->TensorAndDevice(&t, &d, &op_device);
   if (!status->status.ok()) return nullptr;
   if (d != nullptr) {
     status->status = tensorflow::errors::FailedPrecondition(
@@ -1277,9 +1015,9 @@ void TFE_ContextExportRunMetadata(TFE_Context* ctx, TF_Buffer* buf,
                                   TF_Status* status) {
   TFE_ContextAsyncWait(ctx, status);
   if (!status->status.ok()) return;
-  tensorflow::mutex_lock ml(ctx->metadata_mu);
-  status->status = MessageToBuffer(ctx->run_metadata, buf);
-  ctx->run_metadata.Clear();
+  tensorflow::mutex_lock ml(*ctx->context.MetadataMu());
+  status->status = MessageToBuffer(*ctx->context.RunMetadataProto(), buf);
+  ctx->context.RunMetadataProto()->Clear();
 }
 
 namespace {
@@ -1353,207 +1091,9 @@ void SetOpAttrValueScalar(TFE_Context* ctx, TFE_Op* op,
 }
 }  // namespace tensorflow
 
-TFE_Node::TFE_Node(tensorflow::uint64 id) : id(id) {}
-
-TFE_Executor::~TFE_Executor() {
-  tensorflow::mutex_lock l(node_queue_mutex_);
-  thread_done_ = true;
-  nodes_pending_.notify_all();
-}
-
-tensorflow::uint64 TFE_Executor::NextId() {
-  tensorflow::mutex_lock l(next_id_mutex_);
-  return next_id_++;
-}
-
-void TFE_Executor::EnableAsync() {
-  tensorflow::mutex_lock l(node_queue_mutex_);
-  if (thread_ == nullptr) {
-    thread_.reset(tensorflow::Env::Default()->StartThread(
-        tensorflow::ThreadOptions(), "eager_async_executor",
-        std::bind(&TFE_Executor::Run, this)));
-  }
-}
-
-void TFE_Executor::Add(TFE_Node* node) {
-  tensorflow::mutex_lock l(node_queue_mutex_);
-  DCHECK(thread_) << "EnableAsync should have been called before Add";
-  if (!status_.ok()) {
-    delete node;
-    return;
-  }
-  int qlen = node_queue_.size();
-  if (qlen > 0) {
-    if (node_queue_.back()->id >= node->id) {
-      status_ = tensorflow::errors::InvalidArgument(
-          "Inserting TFE_Node with non-increasing ids:", node_queue_.back()->id,
-          " vs ", node->id);
-      delete node;
-      return;
-    }
-    node_queue_.push(node);
-  } else {
-    node_queue_.push(node);
-    nodes_pending_.notify_all();
-  }
-}
-
-tensorflow::Status TFE_Executor::WaitFor(tensorflow::uint64 node_id) {
-  return WaitImpl(false, node_id);
-}
-
-tensorflow::Status TFE_Executor::WaitForAllPendingNodes() {
-  return WaitImpl(true, 0);
-}
-
-tensorflow::Status TFE_Executor::WaitImpl(bool wait_all,
-                                          tensorflow::uint64 node_id) {
-  tensorflow::condition_variable cond;
-  tensorflow::mutex_lock l(node_queue_mutex_);
-  // Don't wait if an error is already set.
-  if (!status_.ok()) return status_;
-  if (node_queue_.empty()) return tensorflow::Status::OK();
-  if (wait_all) {
-    node_id = node_queue_.back()->id;
-  } else if (node_id < node_queue_.front()->id) {
-    // Note that we are relying on the ops being dispatched sequentially from
-    // the queue.
-    return tensorflow::Status::OK();
-  }
-  node_done_notifications_.insert(std::make_pair(node_id, &cond));
-  cond.wait(l);
-  // Note that we could be woken up if an error occurs, even though the node has
-  // not actually executed.
-  return status_;
-}
-
-void TFE_Executor::ClearError() {
-  tensorflow::mutex_lock l(node_queue_mutex_);
-  if (status_.ok()) return;
-  // If an error was set, node_done_notifications_ and node_queue_ should have
-  // been cleared, and no new entries should have been added since.
-  DCHECK(node_done_notifications_.empty());
-  DCHECK(node_queue_.empty());
-  status_ = tensorflow::Status::OK();
-  nodes_pending_.notify_all();
-}
-
-tensorflow::Status TFE_Executor::status() {
-  tensorflow::mutex_lock l(node_queue_mutex_);
-  return status_;
-}
-
-void TFE_Executor::Run() {
-  while (true) {
-    std::unique_ptr<TFE_Node> curr_node;
-    {
-      tensorflow::mutex_lock l(node_queue_mutex_);
-      while (node_queue_.empty() || !status_.ok()) {
-        if (thread_done_) return;
-        nodes_pending_.wait(l);
-      }
-      curr_node.reset(node_queue_.front());
-    }
-    tensorflow::Status status = curr_node->Run();
-    const bool ok = status.ok();
-    tensorflow::mutex_lock l(node_queue_mutex_);
-    node_queue_.pop();
-    if (!ok) {
-      status_ = status;
-      // TODO(agarwal): mark all affected handles as corrupted before clearing
-      // this queue.
-      // We remove any pending ops so that we don't try to execute them if
-      // ClearError is called.
-      for (int i = 0; i < node_queue_.size(); ++i) {
-        delete node_queue_.front();
-        node_queue_.pop();
-      }
-    }
-    if (!node_done_notifications_.empty()) {
-      tensorflow::uint64 node_id = curr_node->id;
-      // Note that we notify all waiting threads in case an error has occurred.
-      // These calling threads are responsible for checking status_ before
-      // proceeding.
-      const auto range = ok ? node_done_notifications_.equal_range(node_id)
-                            : make_pair(node_done_notifications_.begin(),
-                                        node_done_notifications_.end());
-      for (auto it = range.first; it != range.second; ++it) {
-        it->second->notify_all();
-      }
-      node_done_notifications_.erase(range.first, range.second);
-    }
-  }
-}
-
-bool TFE_Context::Async() const {
-  tensorflow::mutex_lock l(async_map_mu);
-  return tensorflow::gtl::FindWithDefault(
-      thread_local_async, std::this_thread::get_id(), async_default);
-}
-
-bool TFE_TensorHandle::IsReady() {
-  if (node_id == 0) return true;
-  tensorflow::mutex_lock l(ctx_mutex_);
-  return ctx_ == nullptr;
-}
-
-tensorflow::Status TFE_TensorHandle::WaitReady() {
-  if (node_id == 0) return tensorflow::Status::OK();
-  TFE_Executor* executor = nullptr;
-  {
-    tensorflow::mutex_lock l(ctx_mutex_);
-    if (ctx_ == nullptr) return tensorflow::Status::OK();
-    executor = &ctx_->executor;
-  }
-  return executor->WaitFor(node_id);
-}
-
-tensorflow::Status TFE_TensorHandle::Tensor(const tensorflow::Tensor** t) {
-  TF_RETURN_IF_ERROR(WaitReady());
-  DCHECK(IsReady());
-  *t = &tensor_;
-  return tensorflow::Status::OK();
-}
-
-tensorflow::Status TFE_TensorHandle::Device(tensorflow::Device** d) {
-  TF_RETURN_IF_ERROR(WaitReady());
-  DCHECK(IsReady());
-  *d = device_;
-  return tensorflow::Status::OK();
-}
-
-tensorflow::Status TFE_TensorHandle::OpDevice(tensorflow::Device** d) {
-  TF_RETURN_IF_ERROR(WaitReady());
-  DCHECK(IsReady());
-  *d = op_device_;
-  return tensorflow::Status::OK();
-}
-
-tensorflow::Status TFE_TensorHandle::TensorAndDevice(
-    const tensorflow::Tensor** tensor, tensorflow::Device** device,
-    tensorflow::Device** op_device) {
-  TF_RETURN_IF_ERROR(WaitReady());
-  DCHECK(IsReady());
-  *tensor = &tensor_;
-  *device = device_;
-  *op_device = op_device_;
-  return tensorflow::Status::OK();
-}
-
-void TFE_TensorHandle::SetTensorAndDevice(const tensorflow::Tensor& tensor,
-                                          tensorflow::Device* device,
-                                          tensorflow::Device* op_device) {
-  tensorflow::mutex_lock l(ctx_mutex_);
-  DCHECK(node_id > 0 && ctx_) << "SetTensorAndDevice should be only called  "
-                              << "on non-ready handles.";
-  ctx_ = nullptr;
-  tensor_ = tensor;
-  device_ = device;
-  op_device_ = op_device;
-}
 
 TFE_Op::~TFE_Op() {
-  for (TFE_TensorHandle* h : inputs) {
+  for (tensorflow::TensorHandle* h : inputs) {
     h->Unref();
   }
 }
diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h
index a5029bf2115c7dac54d03b8bc6397bc63349c068..3926c22ce1f9e194b1452c796c83944d10cfdc64 100644
--- a/tensorflow/c/eager/c_api.h
+++ b/tensorflow/c/eager/c_api.h
@@ -61,17 +61,15 @@ TF_CAPI_EXPORT extern void TFE_ContextOptionsSetConfig(
 // Controls how to act when we try to run an operation on a given device but
 // some input tensors are not on that device.
 typedef enum TFE_ContextDevicePlacementPolicy {
-  // Running operations with input tensors on the wrong device will fail. When
-  // soft placement is enabled acts like TFE_DEVICE_PLACEMENT_SILENT.
+  // Running operations with input tensors on the wrong device will fail.
   TFE_DEVICE_PLACEMENT_EXPLICIT = 0,
   // Copy the tensor to the right device but log a warning.
   TFE_DEVICE_PLACEMENT_WARN = 1,
-  // Silently copy the tensor, which has a performance cost since the
-  // operation will be blocked till the copy completes.
+  // Silently copy the tensor, which has a performance cost since the operation
+  // will be blocked till the copy completes. This is the default placement
+  // policy.
   TFE_DEVICE_PLACEMENT_SILENT = 2,
-  // Default placement policy which silently copies int32 tensors but not other
-  // dtypes.  When soft placement is enabled acts like
-  // TFE_DEVICE_PLACEMENT_SILENT.
+  // Placement policy which silently copies int32 tensors but not other dtypes.
   TFE_DEVICE_PLACEMENT_SILENT_FOR_INT32 = 3,
 } TFE_ContextDevicePlacementPolicy;
 
@@ -162,7 +160,11 @@ TF_CAPI_EXPORT extern int64_t TFE_TensorHandleDim(TFE_TensorHandle* h,
 TF_CAPI_EXPORT extern const char* TFE_TensorHandleDeviceName(
     TFE_TensorHandle* h, TF_Status* status);
 
-// This function will block till the operation that produces `h` has completed.
+// This function will block till the operation that produces `h` has
+// completed. The memory returned might alias the internal memory used by
+// TensorFlow. Hence, callers should not mutate this memory (for example by
+// modifying the memory region pointed to by TF_TensorData() on the returned
+// TF_Tensor).
 TF_CAPI_EXPORT extern TF_Tensor* TFE_TensorHandleResolve(TFE_TensorHandle* h,
                                                          TF_Status* status);
 
diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h
index 8dba12f47b580c33041cc134c6f07a1fafff7453..05dc64f521735f944559392f470a37590e93f17c 100644
--- a/tensorflow/c/eager/c_api_internal.h
+++ b/tensorflow/c/eager/c_api_internal.h
@@ -30,9 +30,14 @@ limitations under the License.
 #include "tensorflow/c/c_api_internal.h"
 #include "tensorflow/c/eager/runtime.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/common_runtime/eager/eager_executor.h"
+#include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
+#include "tensorflow/core/common_runtime/eager/tensor_handle.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
 #include "tensorflow/core/framework/rendezvous.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
@@ -40,261 +45,40 @@ limitations under the License.
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/public/version.h"
 
-// A unit of execution for the TFE_Executor class below. Example subclasses
-// encapsulate execution of a TFE_Op, or copying a TFE_TensorHandle from one
-// device to another.
-class TFE_Node {
- public:
-  explicit TFE_Node(tensorflow::uint64 id);
-
-  virtual ~TFE_Node() {}
-
-  // Runs the computation corresponding to this node and blocks till the
-  // execution is done.
-  virtual tensorflow::Status Run() = 0;
-
-  // An id unique to the TFE_Context under which this node is created. Allocated
-  // monotonically.
-  const tensorflow::uint64 id;
-};
-
-// A class for handling async execution (see TFE_ContextSetAsync).
-// Note that this class is thread-safe.
-// TODO(agarwal): TFE_OpAddInput may currently block if it tries to access the
-// device of the input handle. Fix that.
-// TODO(agarwal): On error, mark all affected handles as corrupted.
-// TODO(agarwal): Implement support for control dependencies.
-// TODO(agarwal): Support out-of-order execution and dispatching multiple
-// TFE_Node in parallel.
-// TODO(agarwal): Implement optimizations over TFE_Node traces.
-class TFE_Executor {
- public:
-  ~TFE_Executor();
-
-  // This is called whenever async mode is enabled. Note that it may be called
-  // multiple times as different calling threads may switch async mode on or off
-  // independently.
-  void EnableAsync();
-
-  // Helper function to create monotonically increasing ids unique to this
-  // object.
-  tensorflow::uint64 NextId();
-
-  // Schedules `node` for execution.
-  // Note that Add must be called in monotonically increasing order of node->id.
-  void Add(TFE_Node* node);
-
-  // Causes the caller to block till node with id `node_id` has finished
-  // execution.
-  tensorflow::Status WaitFor(tensorflow::uint64 node_id);
-
-  // Blocks till all currently pending ops are done.
-  tensorflow::Status WaitForAllPendingNodes();
-
-  // Clears all currently set errors which re-enables async execution.
-  void ClearError();
-
-  // Returns Status based on any errors that occurred during async execution.
-  tensorflow::Status status();
-
- private:
-  // Starts execution of pending TFE_Nodes. This function loops till
-  // thread_done_ is set to true. If any errors are encontered, these are set
-  // inside `status_`. The loop blocks anytime there are no pending nodes, or if
-  // `status_` is not ok.
-  void Run();
-
-  tensorflow::Status WaitImpl(bool wait_all, tensorflow::uint64 node_id);
-
-  tensorflow::mutex node_queue_mutex_;
-
-  // Used to signal that some TFE_Nodes are pending execution.
-  tensorflow::condition_variable nodes_pending_ GUARDED_BY(node_queue_mutex_);
-
-  // Queue of pending TFE_Nodes.
-  std::queue<TFE_Node*> node_queue_ GUARDED_BY(node_queue_mutex_);
-
-  // `status_` is set based on any errors raised during execution of a TFE_Node.
-  // It remains set until ClearError is called.
-  tensorflow::Status status_ GUARDED_BY(node_queue_mutex_);
-
-  // Map from id of a TFE_Node to condition_variables (not owned by the map).
-  // These condition_variables are notified and removed when that TFE_Node is
-  // done executing, or if an error is found in execution of any TFE_Node.
-  std::multimap<tensorflow::uint64, tensorflow::condition_variable*>
-      node_done_notifications_ GUARDED_BY(node_queue_mutex_);
-
-  // Thread object that calls the `Run` method. Currently we use only one thread
-  // for executing the TFE_Nodes one-by-one.
-  std::unique_ptr<tensorflow::Thread> thread_ GUARDED_BY(node_queue_mutex_);
-
-  // Indicates that `thread_` should stop as soon as it is done executing the
-  // current TFE_Node.
-  bool thread_done_ GUARDED_BY(node_queue_mutex_) = false;
-
-  tensorflow::mutex next_id_mutex_;
-  tensorflow::uint64 next_id_ GUARDED_BY(next_id_mutex_) = 1;
-};
 
 struct TFE_ContextOptions {
   TF_SessionOptions session_options;
   // true if async execution is enabled.
   bool async = false;
-  TFE_ContextDevicePlacementPolicy policy{
-      TFE_DEVICE_PLACEMENT_SILENT_FOR_INT32};
+  TFE_ContextDevicePlacementPolicy policy{TFE_DEVICE_PLACEMENT_SILENT};
 };
 
-TFE_ContextDevicePlacementPolicy PlacementPolicy(
-    bool soft_placement, TFE_ContextDevicePlacementPolicy original_policy);
-
 struct TFE_Context {
-  explicit TFE_Context(const TFE_ContextOptions& opts,
+  explicit TFE_Context(const tensorflow::SessionOptions& opts,
+                       TFE_ContextDevicePlacementPolicy default_policy,
+                       bool async,
                        std::unique_ptr<tensorflow::DeviceMgr> device_mgr,
                        tensorflow::Rendezvous* rendezvous)
-      : soft_placement(
-            opts.session_options.options.config.allow_soft_placement()),
-        policy(PlacementPolicy(soft_placement, opts.policy)),
-        device_manager(std::move(device_mgr)),
-        devices(device_manager->ListDevices()),
-        rendezvous(rendezvous),
-        pflr(new tensorflow::ProcessFunctionLibraryRuntime(
-            device_manager.get(), opts.session_options.options.env,
-            TF_GRAPH_DEF_VERSION, &func_lib_def, {})),
-        log_device_placement(
-            opts.session_options.options.config.log_device_placement()),
-        async_default(opts.async) {
-    if (async_default) executor.EnableAsync();
-  }
-
-  const bool soft_placement;
-  const TFE_ContextDevicePlacementPolicy policy;
-
-  // Note: we cannot use C++11 thread_local here as there is no concept of a
-  // thread-local-object-local variable in C++11.
-  tensorflow::mutex policy_map_mu;
-  std::unordered_map<std::thread::id, TFE_ContextDevicePlacementPolicy>
-      thread_local_policies GUARDED_BY(policy_map_mu);
-
-  std::unique_ptr<tensorflow::DeviceMgr> device_manager;
-  // Devices owned by device_manager
-  const std::vector<tensorflow::Device*> devices;
-  tensorflow::Rendezvous* const rendezvous;
-
-  tensorflow::mutex functions_mu;
-  tensorflow::FunctionLibraryDefinition func_lib_def GUARDED_BY(functions_mu){
-      tensorflow::OpRegistry::Global(), {}};
-
-  // One FunctionLibraryRuntime per device.
-  // func_libs[i] is the FunctionLibraryRuntime corresponding to
-  // session->devices[i].
-  const std::unique_ptr<tensorflow::ProcessFunctionLibraryRuntime> pflr;
+      : context(opts,
+                static_cast<tensorflow::ContextDevicePlacementPolicy>(
+                    default_policy),
+                async, std::move(device_mgr), rendezvous) {}
 
-  tensorflow::mutex cache_mu;
-  std::unordered_map<tensorflow::Fprint128, tensorflow::KernelAndDevice*,
-                     tensorflow::Fprint128Hasher>
-      kernel_cache GUARDED_BY(cache_mu);
-
-  tensorflow::FunctionLibraryRuntime* func_lib(tensorflow::Device* d) const {
-    return pflr->GetFLR(d->name());
-  }
-
-  // Whether we should compute RunMetadata.
-  std::atomic<bool> should_store_metadata{false};
-  tensorflow::mutex metadata_mu;
-  tensorflow::RunMetadata run_metadata GUARDED_BY(metadata_mu);
-  const bool log_device_placement;
-  // TFE_Executor for async execution.
-  TFE_Executor executor;
-
-  // True if running in asynchronous mode.
-  bool Async() const;
-
-  // True if the default value for execution mode is async. Note that this value
-  // can be overridden per thread based on `thread_local_async` overrides.
-  const bool async_default;
-  mutable tensorflow::mutex async_map_mu;
-  std::unordered_map<std::thread::id, bool> thread_local_async
-      GUARDED_BY(async_map_mu);
+  tensorflow::EagerContext context;
 };
 
-struct TFE_TensorHandle : public tensorflow::core::RefCounted {
- public:
+struct TFE_TensorHandle {
   TFE_TensorHandle(const tensorflow::Tensor& t, tensorflow::Device* d,
                    tensorflow::Device* op_device)
-      : dtype(t.dtype()),
-        node_id(0),
-        tensor_(t),
-        device_(d),
-        op_device_(op_device),
-        ctx_(nullptr) {}
+      : handle(new tensorflow::TensorHandle(t, d, op_device, nullptr)) {}
 
   TFE_TensorHandle(tensorflow::uint64 node_id, tensorflow::DataType dtype,
-                   TFE_Context* ctx)
-      : dtype(dtype),
-        node_id(node_id),
-        tensor_(dtype),
-        device_(nullptr),
-        op_device_(nullptr),
-        ctx_(ctx) {
-    DCHECK_GT(node_id, 0);
-  }
-
-  ~TFE_TensorHandle() override {}
-
-  tensorflow::Status Tensor(const tensorflow::Tensor** t);
-
-  tensorflow::Status Device(tensorflow::Device** d);
-
-  tensorflow::Status OpDevice(tensorflow::Device** d);
-
-  tensorflow::Status TensorAndDevice(const tensorflow::Tensor** tensor,
-                                     tensorflow::Device** device,
-                                     tensorflow::Device** op_device);
-
-  // Note that this can be called at most once, and only on non-ready handles,
-  // and makes them ready.
-  void SetTensorAndDevice(const tensorflow::Tensor& tensor,
-                          tensorflow::Device* device,
-                          tensorflow::Device* op_device);
-
-  // dtype for the handle. It must be the same as t.dtype() once the handle is
-  // ready.
-  const tensorflow::DataType dtype;
-
- private:
-  // If the contents of the Tensor pointed to by this handle is yet to be
-  // computed by a TFE_Node, this function will block till that compuatation is
-  // done and the handle is "ready".
-  tensorflow::Status WaitReady();
-
-  bool IsReady();
-
-  // Id for the TFE_Node that will compute the value pointed to by this handle.
-  // If the value is 0, the handle is already ready, but not vice-versa.
-  const tensorflow::uint64 node_id;
-
-  tensorflow::Tensor tensor_;
-
-  // TODO(ashankar): device_ == nullptr iff local CPU
-  // This was expedient, but perhaps worth revisiting ('device_' should always
-  // be a valid pointer?)
-  // This can be done if TFE_NewOp() and the TFE_TensorHandle constructors are
-  // provided with the appropriate TFE_Context.
-  //
-  // TODO(ashankar): Reference count TFE_Context to ensure that 'device_' of a
-  // TFE_TensorHandle does not outlive the TFE_Context from which it came?
-  tensorflow::Device* device_;
-
-  // Device in which the op producing this tensor was executed. Equals to
-  // device_ for constant tensors.
-  tensorflow::Device* op_device_;
+                   tensorflow::EagerContext* ctx)
+      : handle(new tensorflow::TensorHandle(node_id, dtype, ctx)) {}
 
-  tensorflow::mutex ctx_mutex_;
+  TFE_TensorHandle(tensorflow::TensorHandle* handle) : handle(handle) {}
 
-  // `ctx` is only guaranteed to be set if the handle is not "ready". This is
-  // typically true when the handle was produced during async execution.
-  // `ctx` object is not owned and should outlive this handle.
-  TFE_Context* ctx_ GUARDED_BY(ctx_mutex_);
+  tensorflow::TensorHandle* handle;
 };
 
 struct TFE_Op {
@@ -311,7 +95,7 @@ struct TFE_Op {
   const tensorflow::string name;
   tensorflow::AttrBuilder attrs;
   const tensorflow::AttrTypeMap* attr_types;
-  tensorflow::gtl::InlinedVector<TFE_TensorHandle*, 4> inputs;
+  tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 4> inputs;
   tensorflow::Device* device;
   bool use_xla = false;
 };
diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc
index 2268aba90d60b7b2f10e99f64fd7aa3ae719badb..701175e4943d1d23532fe595319f67711316ed4d 100644
--- a/tensorflow/c/eager/c_api_test.cc
+++ b/tensorflow/c/eager/c_api_test.cc
@@ -590,7 +590,13 @@ void Execute_MatMul_CPU_Runtime_Error(bool async) {
   TFE_TensorHandle* m1 = TestMatrixTensorHandle();
   TFE_TensorHandle* m2 = TestMatrixTensorHandle3X2();
   TFE_Op* matmul = MatMulOp(ctx, m1, m2);
+  TFE_OpSetDevice(matmul, "/job:localhost/replica:0/task:0/device:CPU:0",
+                  status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TFE_Op* matmul2 = MatMulOp(ctx, m1, m1);
+  TFE_OpSetDevice(matmul2, "/job:localhost/replica:0/task:0/device:CPU:0",
+                  status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TFE_TensorHandle* retvals[1] = {nullptr};
   int num_retvals = 1;
   TFE_Execute(matmul, &retvals[0], &num_retvals, status);
@@ -688,19 +694,19 @@ TEST(CAPI, Execute_Min_CPU) {
   TFE_DeleteOp(minOp);
   TFE_DeleteTensorHandle(input);
   TFE_DeleteTensorHandle(axis);
-  TFE_DeleteContext(ctx, status);
-  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   ASSERT_EQ(1, num_retvals);
 
   TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status);
-  TFE_DeleteTensorHandle(retvals[0]);
   ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteTensorHandle(retvals[0]);
   float output[2] = {0};
   EXPECT_EQ(sizeof(output), TF_TensorByteSize(t));
   memcpy(&output[0], TF_TensorData(t), TF_TensorByteSize(t));
   TF_DeleteTensor(t);
   EXPECT_EQ(1, output[0]);
   EXPECT_EQ(3, output[1]);
+  TFE_DeleteContext(ctx, status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TF_DeleteStatus(status);
 }
 
diff --git a/tensorflow/c/eager/runtime.cc b/tensorflow/c/eager/runtime.cc
index 9b46cf8245901934c9c4d41a2b7c10c1c5bf7cbd..abe2793ce894ad07c252575c5d55d98342916eac 100644
--- a/tensorflow/c/eager/runtime.cc
+++ b/tensorflow/c/eager/runtime.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/c/eager/runtime.h"
 
 #include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/node_def.pb.h"
@@ -95,22 +96,6 @@ Status AttrTypeMapForOp(const char* op_name, const AttrTypeMap** out) {
   return Status::OK();
 }
 
-Status AttrTypeByName(const AttrTypeMap& m, const string& attr_name,
-                      TF_AttrType* out, unsigned char* is_list) {
-  auto* t = gtl::FindOrNull(m, attr_name);
-  if (t == nullptr) {
-    return errors::InvalidArgument("Attribute '", attr_name,
-                                   "' does not exist for this operation");
-  }
-  *out = static_cast<TF_AttrType>(*t & ~kIsList);
-  if (*t & kIsList) {
-    *is_list = 1;
-  } else {
-    *is_list = 0;
-  }
-  return Status::OK();
-}
-
 #define DEFINE_SET_ATTR(value_type, value_field)                             \
   template <>                                                                \
   AttrBuilder& AttrBuilder::Set(StringPiece attr_name, value_type&& value) { \
@@ -168,6 +153,22 @@ const NodeDef& AttrBuilder::BuildNodeDef() {
   return *node_def_;
 }
 
+Status AttrTypeByName(const AttrTypeMap& m, const string& attr_name,
+                      TF_AttrType* out, unsigned char* is_list) {
+  auto* t = gtl::FindOrNull(m, attr_name);
+  if (t == nullptr) {
+    return errors::InvalidArgument("Attribute '", attr_name,
+                                   "' does not exist for this operation");
+  }
+  *out = static_cast<TF_AttrType>(*t & ~kIsList);
+  if (*t & kIsList) {
+    *is_list = 1;
+  } else {
+    *is_list = 0;
+  }
+  return Status::OK();
+}
+
 namespace {
 inline tensorflow::Fprint128 FingerprintCat128(const tensorflow::Fprint128& a,
                                                const tensorflow::Fprint128& b) {
@@ -245,104 +246,4 @@ void AttrBuilder::MayBeInitializeNodeDef() {
   }
 }
 
-// static
-Status KernelAndDevice::InitOp(Device* device, const NodeDef& ndef,
-                               KernelAndDevice* out) {
-  OpKernel* k = nullptr;
-  Status s = CreateOpKernel(device->device_type().c_str(), device,
-                            device->GetAllocator(AllocatorAttributes()),
-                            nullptr, ndef, TF_GRAPH_DEF_VERSION, &k);
-  out->device_ = device;
-  out->kernel_.reset(k);
-  out->flib_ = nullptr;
-  return s;
-}
-
-// static
-Status KernelAndDevice::Init(const NodeDef& ndef, FunctionLibraryRuntime* flib,
-                             KernelAndDevice* out) {
-  OpKernel* k = nullptr;
-  Status s = flib->CreateKernel(ndef, &k);
-  out->device_ = flib->device();
-  out->kernel_.reset(k);
-  out->flib_ = flib;
-  return s;
-}
-
-Status KernelAndDevice::Run(std::vector<Tensor>* input_tensors,
-                            std::vector<Tensor>* output_tensors,
-                            NodeExecStats* stats) {
-  gtl::InlinedVector<TensorValue, 4> inputs;
-  for (Tensor& t : *input_tensors) {
-    inputs.push_back(TensorValue(&t));
-  }
-
-  std::vector<AllocatorAttributes> out_attrs(kernel_->num_outputs());
-  for (size_t i = 0; i < out_attrs.size(); ++i) {
-    out_attrs[i].set_on_host(kernel_->output_memory_types()[i] ==
-                             tensorflow::HOST_MEMORY);
-  }
-
-  OpKernelContext::Params params;
-  params.device = device_;
-  params.frame_iter = FrameAndIter(0, 0);
-  params.inputs = &inputs;
-  params.op_kernel = kernel_.get();
-  params.resource_manager = device_->resource_manager();
-  params.output_attr_array = gtl::vector_as_array(&out_attrs);
-  params.function_library = flib_;
-  params.slice_reader_cache = &slice_reader_cache_;
-  params.rendezvous = rendez_;
-  if (stats != nullptr) {
-    params.track_allocations = true;
-  }
-  // TODO(apassos): use a thread pool.
-  std::function<void(std::function<void()>)> runner =
-      [](std::function<void()> f) { f(); };
-  params.runner = &runner;
-
-  OpKernelContext context(&params);
-
-  if (kernel_->def().op() == "_Recv") {
-    // TODO(apassos) do not special-case _Recv. Currently the GPU device fails
-    // if trying to run _Recv->Compute(), specifically checking for _Recv. To go
-    // around this we call _Recv->ComputeAsync, to mimic graph mode behavior.
-    AsyncOpKernel* async = kernel_->AsAsync();
-    Notification done;
-    device_->ComputeAsync(async, &context, [&done]() { done.Notify(); });
-    done.WaitForNotification();
-  } else {
-    device_->Compute(kernel_.get(), &context);
-  }
-  if (!context.status().ok()) return context.status();
-
-  output_tensors->clear();
-  for (int i = 0; i < context.num_outputs(); ++i) {
-    output_tensors->push_back(Tensor(*context.mutable_output(i)));
-  }
-  if (stats != nullptr) {
-    for (const auto& allocator_pair : context.wrapped_allocators()) {
-      AllocatorMemoryUsed* memory = stats->add_memory();
-      memory->set_allocator_name(allocator_pair.first->Name());
-      auto sizes = allocator_pair.second->GetSizes();
-      memory->set_total_bytes(std::get<0>(sizes));
-      memory->set_peak_bytes(std::get<1>(sizes));
-      memory->set_live_bytes(std::get<2>(sizes));
-
-      AllocatorStats allocator_stats;
-      allocator_pair.first->GetStats(&allocator_stats);
-      memory->set_allocator_bytes_in_use(allocator_stats.bytes_in_use);
-      allocator_pair.second->GetRecordsAndUnRef();
-    }
-    auto* ms = stats->mutable_memory_stats();
-    ms->set_temp_memory_size(context.temp_memory_allocated());
-    for (const auto& alloc_id : context.persistent_alloc_ids()) {
-      ms->mutable_persistent_tensor_alloc_ids()->Add(alloc_id);
-    }
-
-    ms->set_persistent_memory_size(context.persistent_memory_allocated());
-  }
-  return Status::OK();
-}
-
 }  // namespace tensorflow
diff --git a/tensorflow/c/eager/runtime.h b/tensorflow/c/eager/runtime.h
index ad16f65495f8a8193b685c2b13a099232d03a505..929b1b8296faf61c11c68af06ffc4ca3770ae929 100644
--- a/tensorflow/c/eager/runtime.h
+++ b/tensorflow/c/eager/runtime.h
@@ -23,6 +23,7 @@ limitations under the License.
 
 #include "tensorflow/c/c_api.h"
 #include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/types.h"
@@ -45,6 +46,10 @@ Status OpDefForOp(const char* op_name, const OpDef** op_def);
 // Returns the AttrTypeMap for the TensorFlow operation named op_name.
 Status AttrTypeMapForOp(const char* op_name, const AttrTypeMap** out);
 
+// Looks for 'attr_name' in 'm' and sets 'out' and 'is_list'.
+Status AttrTypeByName(const AttrTypeMap& m, const string& attr_name,
+                      TF_AttrType* out, unsigned char* is_list);
+
 // Looks for 'attr_name' in 'm' and sets 'out' and 'is_list'.
 Status AttrTypeByName(const AttrTypeMap& m, const string& attr_name,
                       TF_AttrType* out, unsigned char* is_list);
@@ -149,53 +154,6 @@ template <>
 AttrBuilder& AttrBuilder::Set(StringPiece attr_name,
                               tensorflow::DataType&& value);
 
-// KernelAndDevice encapsulates an instantiated kernel and the device it is on.
-//
-// Also see:
-// https://www.tensorflow.org/code/tensorflow/core/common_runtime/kernel_benchmark_testlib.h
-// and
-// https://www.tensorflow.org/code/tensorflow/core/kernels/ops_testutil.h
-class KernelAndDevice {
- public:
-  // Populates 'out' with a kernel appropriate for 'ndef'.
-  //
-  // The provided FunctionLibraryRuntime MUST outlive all calls to
-  // Run() on the returned KernelAndDevice.
-  //
-  // TODO(ashankar): Figure out thread-safety concerns around
-  // FunctionLibraryRuntime (in particular, how the underlying
-  // FunctionLibraryDefinition might be mutated by another thread as new
-  // functions are registered with it).  Conservatively, thread-safe usage of
-  // the FunctionLibraryRuntime is pushed on to the caller (see locking in
-  // c_api.cc).
-  static Status Init(const NodeDef& ndef, FunctionLibraryRuntime* flib,
-                     KernelAndDevice* out);
-  // TODO(ashankar): Remove this
-  static Status InitOp(Device* device, const NodeDef& ndef,
-                       KernelAndDevice* out);
-
-  KernelAndDevice(tensorflow::Rendezvous* rendez)
-      : device_(nullptr), flib_(nullptr), rendez_(rendez) {}
-
-  // TODO(ashankar): Handle list-valued inputs.
-  Status Run(std::vector<Tensor>* inputs, std::vector<Tensor>* outputs,
-             NodeExecStats* stats);
-
-  const OpKernel* kernel() const { return kernel_.get(); }
-
-  Device* device() const { return device_; }
-
-  DataTypeVector* mutable_output_dtypes() { return &output_dtypes_; }
-  const DataTypeVector& output_dtypes() { return output_dtypes_; }
-
- private:
-  std::unique_ptr<OpKernel> kernel_;
-  Device* device_;
-  FunctionLibraryRuntime* flib_;
-  checkpoint::TensorSliceReaderCacheWrapper slice_reader_cache_;
-  Rendezvous* rendez_;
-  DataTypeVector output_dtypes_;
-};
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/c/eager/runtime_test.cc b/tensorflow/c/eager/runtime_test.cc
index 4f75d278878d7c8ff6a5e48e5b4e633aa13aedc5..27ebeb0508844ee1ee89e0733b66f6ed129b7757 100644
--- a/tensorflow/c/eager/runtime_test.cc
+++ b/tensorflow/c/eager/runtime_test.cc
@@ -33,27 +33,6 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-class TestEnv {
- public:
-  TestEnv() : flib_def_(OpRegistry::Global(), {}) {
-    Device* device =
-        DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0");
-    device_mgr_.reset(new DeviceMgr({device}));
-    flib_runtime_ = NewFunctionLibraryRuntime(device_mgr_.get(), Env::Default(),
-                                              device, TF_GRAPH_DEF_VERSION,
-                                              &flib_def_, nullptr, {}, nullptr);
-  }
-
-  FunctionLibraryRuntime* function_library_runtime() const {
-    return flib_runtime_.get();
-  }
-
- private:
-  FunctionLibraryDefinition flib_def_;
-  std::unique_ptr<DeviceMgr> device_mgr_;
-  std::unique_ptr<FunctionLibraryRuntime> flib_runtime_;
-};
-
 TEST(AttrTypeMap, Lookup) {
   const AttrTypeMap* m = nullptr;
   Status s = AttrTypeMapForOp("ThisOpCannotPossiblyExist", &m);
@@ -79,113 +58,5 @@ TEST(AttrTypeMap, Lookup) {
   EXPECT_NE(is_list, 0);
 }
 
-TEST(KernelAndDevice, Run) {
-  Tensor t(Input({{1.0f, 2.0f}, {3.0f, 4.0f}}).tensor());
-  std::vector<Tensor> inputs;
-  inputs.push_back(t);
-  inputs.push_back(t);
-  NodeDef ndef(AttrBuilder("MatMul")
-                   .Set("T", DT_FLOAT)
-                   .Set("transpose_a", false)
-                   .Set("transpose_b", false)
-                   .NumInputs(inputs.size())
-                   .BuildNodeDef());
-  TestEnv env;
-  KernelAndDevice kernel(nullptr);
-  Status s =
-      KernelAndDevice::Init(ndef, env.function_library_runtime(), &kernel);
-  ASSERT_TRUE(s.ok()) << s;
-  std::vector<Tensor> outputs;
-  s = kernel.Run(&inputs, &outputs, nullptr);
-  ASSERT_TRUE(s.ok()) << s;
-  ASSERT_EQ(1, outputs.size());
-  const Tensor& out = outputs[0];
-  EXPECT_EQ(7, out.matrix<float>()(0, 0));
-  EXPECT_EQ(10, out.matrix<float>()(0, 1));
-  EXPECT_EQ(15, out.matrix<float>()(1, 0));
-  EXPECT_EQ(22, out.matrix<float>()(1, 1));
-}
-
-void BM_CreateGraph(int iters) {
-  for (int i = 0; i < iters; ++i) {
-    Scope root = Scope::NewRootScope();
-    auto C = ops::Const(root, {{1.0, 2.0}, {3.0, 4.0}});
-    auto M = ops::MatMul(root, C, C);
-    TF_CHECK_OK(root.status());
-  }
-}
-BENCHMARK(BM_CreateGraph);
-
-void BM_RunGraph(int iters) {
-  tensorflow::testing::StopTiming();
-  Scope root = Scope::NewRootScope();
-  auto C = ops::Const(root, {{1.0, 2.0}, {3.0, 4.0}});
-  auto M = ops::MatMul(root, C, C);
-  SessionOptions opts;
-  opts.config.set_inter_op_parallelism_threads(1);
-  opts.config.set_intra_op_parallelism_threads(1);
-  ClientSession sess(root, opts);
-  std::vector<Tensor> outputs;
-  tensorflow::testing::StartTiming();
-  for (int i = 0; i < iters; ++i) {
-    outputs.clear();
-    TF_CHECK_OK(sess.Run({M}, &outputs));
-  }
-}
-BENCHMARK(BM_RunGraph);
-
-void BM_CreateAndDestroySession(int iters) {
-  tensorflow::testing::StopTiming();
-  Scope root = Scope::NewRootScope();
-  auto C = ops::Const(root, {{1.0, 2.0}, {3.0, 4.0}});
-  auto M = ops::MatMul(root, C, C);
-  tensorflow::testing::StartTiming();
-  for (int i = 0; i < iters; ++i) {
-    ClientSession sess(root);
-  }
-}
-BENCHMARK(BM_CreateAndDestroySession);
-
-void BM_KernelAndDeviceInit(int iters) {
-  tensorflow::testing::StopTiming();
-  NodeDef ndef(AttrBuilder("MatMul")
-                   .Set("T", DT_FLOAT)
-                   .Set("transpose_a", false)
-                   .Set("transpose_b", false)
-                   .NumInputs(2)
-                   .BuildNodeDef());
-  TestEnv env;
-  KernelAndDevice k(nullptr);
-  tensorflow::testing::StartTiming();
-  for (int i = 0; i < iters; ++i) {
-    TF_CHECK_OK(
-        KernelAndDevice::Init(ndef, env.function_library_runtime(), &k));
-  }
-}
-BENCHMARK(BM_KernelAndDeviceInit);
-
-void BM_KernelAndDeviceRun(int iters) {
-  tensorflow::testing::StopTiming();
-  Tensor t(Input({{1.0f, 2.0f}, {3.0f, 4.0f}}).tensor());
-  std::vector<Tensor> inputs;
-  inputs.push_back(t);
-  inputs.push_back(t);
-  std::vector<Tensor> outputs;
-  NodeDef ndef(AttrBuilder("MatMul")
-                   .Set("T", DT_FLOAT)
-                   .Set("transpose_a", false)
-                   .Set("transpose_b", false)
-                   .NumInputs(inputs.size())
-                   .BuildNodeDef());
-  TestEnv env;
-  KernelAndDevice kernel(nullptr);
-  TF_CHECK_OK(
-      KernelAndDevice::Init(ndef, env.function_library_runtime(), &kernel));
-  tensorflow::testing::StartTiming();
-  for (int i = 0; i < iters; ++i) {
-    TF_CHECK_OK(kernel.Run(&inputs, &outputs, nullptr));
-  }
-}
-BENCHMARK(BM_KernelAndDeviceRun);
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/c/eager/tape.h b/tensorflow/c/eager/tape.h
index bdb0815d6b68444ec1c89b835d563db20ce4d8a1..97c323b87228039ba10f4ed5e434aa83621b1220 100644
--- a/tensorflow/c/eager/tape.h
+++ b/tensorflow/c/eager/tape.h
@@ -152,6 +152,8 @@ class GradientTape {
                          gtl::ArraySlice<Gradient*> output_gradients,
                          std::vector<Gradient*>* result);
 
+  bool IsPersistent() const { return persistent_; }
+
  private:
   TensorTape tensor_tape_;
   OpTape<BackwardFunction> op_tape_;
@@ -599,23 +601,28 @@ Status GradientTape<Gradient, BackwardFunction>::ComputeGradient(
   }
   CHECK(state.op_tape.empty());
   result->reserve(source_tensor_ids.size());
+  gtl::FlatSet<int64> used_gradient_ids(source_tensor_ids.size());
   for (auto is : source_tensor_ids) {
     auto grad_it = gradients.find(is);
     if (grad_it == gradients.end()) {
       result->push_back(nullptr);
     } else {
-      if (grad_it->second.size() == 1) {
-        result->push_back(grad_it->second[0]);
-      } else {
-        result->push_back(vspace.AggregateGradients(grad_it->second));
+      if (grad_it->second.size() > 1) {
+        Gradient* grad = vspace.AggregateGradients(grad_it->second);
+        grad_it->second.clear();
+        grad_it->second.push_back(grad);
       }
-      gradients.erase(grad_it);
+      result->push_back(grad_it->second[0]);
+      used_gradient_ids.insert(is);
     }
   }
-  VLOG(1) << "Final gradients size: " << gradients.size();
+  VLOG(1) << "Final gradients size: "
+          << gradients.size() - used_gradient_ids.size();
   for (auto grad_pair : gradients) {
-    for (const auto& g : grad_pair.second) {
-      vspace.DeleteGradient(g);
+    if (used_gradient_ids.find(grad_pair.first) == used_gradient_ids.end()) {
+      for (const auto& g : grad_pair.second) {
+        vspace.DeleteGradient(g);
+      }
     }
   }
   return Status::OK();
diff --git a/tensorflow/c/python_api.cc b/tensorflow/c/python_api.cc
index cd604538f1fa142c6fe6a76624c048baddaa52fb..93155998b86d59ec78c7ff25f146b8e3c8eac380 100644
--- a/tensorflow/c/python_api.cc
+++ b/tensorflow/c/python_api.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/c/python_api.h"
 
 #include "tensorflow/c/c_api_internal.h"
+#include "tensorflow/python/framework/cpp_shape_inference.pb.h"
 
 namespace tensorflow {
 
@@ -109,4 +110,29 @@ void ExtendSession(TF_Session* session, TF_Status* status) {
   session->extend_before_run = false;
 }
 
+std::string ResourceHandleShapeAndType(TF_Graph* graph, TF_Output output) {
+  Node* node = &output.oper->node;
+  CppShapeInferenceResult::HandleData handle_data;
+  handle_data.set_is_set(true);
+  {
+    mutex_lock l(graph->mu);
+    tensorflow::shape_inference::InferenceContext* ic =
+        graph->refiner.GetContext(node);
+    CHECK(ic != nullptr);
+    CHECK_LT(output.index, ic->num_outputs());
+    const auto* shapes_and_types =
+        ic->output_handle_shapes_and_types(output.index);
+    if (shapes_and_types == nullptr) return "";
+
+    for (const auto& p : *shapes_and_types) {
+      auto* out_shape_and_type = handle_data.add_shape_and_type();
+      ic->ShapeHandleToProto(p.shape, out_shape_and_type->mutable_shape());
+      out_shape_and_type->set_dtype(p.dtype);
+    }
+  }
+  string result;
+  handle_data.SerializeToString(&result);
+  return result;
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/c/python_api.h b/tensorflow/c/python_api.h
index 13b680b3a24afa2d285ea18207578aff4350f6d5..2d4c8cd9ed7bc926f448dab1f6b50ed74179ea14 100644
--- a/tensorflow/c/python_api.h
+++ b/tensorflow/c/python_api.h
@@ -16,6 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_C_PYTHON_API_H_
 #define TENSORFLOW_C_PYTHON_API_H_
 
+#include <string>
+
 #include "tensorflow/c/c_api.h"
 
 // These functions can be removed without notice. They exist to facilitate some
@@ -51,6 +53,11 @@ void SetRequireShapeInferenceFns(TF_Graph* graph, bool require);
 // the graph after the session has been made aware of them.
 void ExtendSession(TF_Session* session, TF_Status* status);
 
+// Returns the serialized CppShapeInferenceResult::HandleData proto for
+// `output` if its a resource tensor, or otherwise returns the empty string.
+// TODO(b/74620627): remove when _USE_C_SHAPES is removed
+std::string ResourceHandleShapeAndType(TF_Graph* graph, TF_Output output);
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_C_PYTHON_API_H_
diff --git a/tensorflow/c/testdata/tf_record b/tensorflow/c/testdata/tf_record
new file mode 100644
index 0000000000000000000000000000000000000000..6e16076bfb79ad8151952e96567565e8820b0f5b
Binary files /dev/null and b/tensorflow/c/testdata/tf_record differ
diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD
index 9060c19e9d2cf965c2b9be07be07c42017da45a8..079e063d3e3fbdaf833e9031f5f9438853c14099 100644
--- a/tensorflow/cc/BUILD
+++ b/tensorflow/cc/BUILD
@@ -620,18 +620,6 @@ tf_cc_binary(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 cc_library(
     name = "queue_runner",
     srcs = ["training/queue_runner.cc"],
diff --git a/tensorflow/cc/framework/cc_op_gen_test.cc b/tensorflow/cc/framework/cc_op_gen_test.cc
index 1e0f2d241bb350897a840dda90d6d0c009b1daad..5d9dfd95a5538ae0f3d2d111a1f989552c3363b8 100644
--- a/tensorflow/cc/framework/cc_op_gen_test.cc
+++ b/tensorflow/cc/framework/cc_op_gen_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_gen_lib.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -61,12 +62,12 @@ op {
 )";
 
 void ExpectHasSubstr(StringPiece s, StringPiece expected) {
-  EXPECT_TRUE(s.contains(expected))
+  EXPECT_TRUE(str_util::StrContains(s, expected))
       << "'" << s << "' does not contain '" << expected << "'";
 }
 
 void ExpectDoesNotHaveSubstr(StringPiece s, StringPiece expected) {
-  EXPECT_FALSE(s.contains(expected))
+  EXPECT_FALSE(str_util::StrContains(s, expected))
       << "'" << s << "' contains '" << expected << "'";
 }
 
diff --git a/tensorflow/cc/framework/scope.cc b/tensorflow/cc/framework/scope.cc
index 71642492627422e09c19b7bcb4dc522846cf08b1..c143b978338815ebc7134eb0a07867c5d8b13dca 100644
--- a/tensorflow/cc/framework/scope.cc
+++ b/tensorflow/cc/framework/scope.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 
 namespace tensorflow {
 
@@ -218,7 +219,7 @@ std::unordered_set<string> Scope::Impl::GetColocationConstraints(
   if (GetNodeAttr(attrs, kColocationAttrName, &node_constraints).ok()) {
     for (const string& entry : node_constraints) {
       StringPiece s(entry);
-      if (s.Consume(kColocationGroupPrefix)) {
+      if (str_util::ConsumePrefix(&s, kColocationGroupPrefix)) {
         current_constraints.insert(s.ToString());
       }
     }
diff --git a/tensorflow/cc/saved_model/BUILD b/tensorflow/cc/saved_model/BUILD
index d29ad3ebcbe29087d5572b51c7713e0c98d0d840..06a3be18e08f611d3ecf9804908d791d15fdab13 100644
--- a/tensorflow/cc/saved_model/BUILD
+++ b/tensorflow/cc/saved_model/BUILD
@@ -94,18 +94,3 @@ filegroup(
         "testdata/half_plus_two/**",
     ]),
 )
-
-# -----------------------------------------------------------------------------
-# Google-internal targets.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/cc/saved_model/loader_test.cc b/tensorflow/cc/saved_model/loader_test.cc
index 4c64d2cfe3c10e6c7ed82a2d72460a0b34283bb2..72b8bc18710b0ee77cb01ed3ad0c2abb5183efb2 100644
--- a/tensorflow/cc/saved_model/loader_test.cc
+++ b/tensorflow/cc/saved_model/loader_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -133,9 +134,9 @@ TEST_F(LoaderTest, NoTagMatch) {
   Status st = LoadSavedModel(session_options, run_options, export_dir,
                              {"missing-tag"}, &bundle);
   EXPECT_FALSE(st.ok());
-  EXPECT_TRUE(StringPiece(st.error_message())
-                  .contains("Could not find meta graph def matching supplied "
-                            "tags: { missing-tag }"))
+  EXPECT_TRUE(str_util::StrContains(
+      st.error_message(),
+      "Could not find meta graph def matching supplied tags: { missing-tag }"))
       << st.error_message();
 }
 
@@ -149,9 +150,9 @@ TEST_F(LoaderTest, NoTagMatchMultiple) {
   Status st = LoadSavedModel(session_options, run_options, export_dir,
                              {kSavedModelTagServe, "missing-tag"}, &bundle);
   EXPECT_FALSE(st.ok());
-  EXPECT_TRUE(
-      StringPiece(st.error_message())
-          .contains("Could not find meta graph def matching supplied tags: "))
+  EXPECT_TRUE(str_util::StrContains(
+      st.error_message(),
+      "Could not find meta graph def matching supplied tags: "))
       << st.error_message();
 }
 
@@ -169,7 +170,7 @@ TEST_F(LoaderTest, SessionCreationFailure) {
   Status st = LoadSavedModel(session_options, run_options, export_dir,
                              {kSavedModelTagServe}, &bundle);
   EXPECT_FALSE(st.ok());
-  EXPECT_TRUE(StringPiece(st.error_message()).contains(kInvalidTarget))
+  EXPECT_TRUE(str_util::StrContains(st.error_message(), kInvalidTarget))
       << st.error_message();
 }
 
diff --git a/tensorflow/cc/saved_model/python/BUILD b/tensorflow/cc/saved_model/python/BUILD
index f5fbc75edcba9d5ae9ef7432de224df766bcab9e..6f04ebdc55cda329527c95f62efc37c8dfbb4ae5 100644
--- a/tensorflow/cc/saved_model/python/BUILD
+++ b/tensorflow/cc/saved_model/python/BUILD
@@ -7,18 +7,6 @@ package(
     default_visibility = ["//visibility:public"],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_py_clif_cc")
 
 tf_py_clif_cc(
diff --git a/tensorflow/cc/tools/BUILD b/tensorflow/cc/tools/BUILD
index f413a5cc52e9eb4bc393b8186f5b591681fa2e5e..6f1c87354076565af22f7ba0610a5c6bb999d25c 100644
--- a/tensorflow/cc/tools/BUILD
+++ b/tensorflow/cc/tools/BUILD
@@ -41,18 +41,3 @@ tf_cc_test(
         "//tensorflow/core:testlib",
     ],
 )
-
-# -----------------------------------------------------------------------------
-# Google-internal targets.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/cc/tutorials/example_trainer.cc b/tensorflow/cc/tutorials/example_trainer.cc
index 3675d72ee354533a7d84b5e8783cde452d8d60c9..5dbc4f5f6aa389978e55ca2656c17ff97202203d 100644
--- a/tensorflow/cc/tutorials/example_trainer.cc
+++ b/tensorflow/cc/tutorials/example_trainer.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/graph/default_device.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/platform/logging.h"
@@ -166,7 +167,8 @@ namespace {
 
 bool ParseInt32Flag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
                     int32* dst) {
-  if (arg.Consume(flag) && arg.Consume("=")) {
+  if (tensorflow::str_util::ConsumePrefix(&arg, flag) &&
+      tensorflow::str_util::ConsumePrefix(&arg, "=")) {
     char extra;
     return (sscanf(arg.data(), "%d%c", dst, &extra) == 1);
   }
@@ -176,7 +178,7 @@ bool ParseInt32Flag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
 
 bool ParseBoolFlag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
                    bool* dst) {
-  if (arg.Consume(flag)) {
+  if (tensorflow::str_util::ConsumePrefix(&arg, flag)) {
     if (arg.empty()) {
       *dst = true;
       return true;
diff --git a/tensorflow/compiler/aot/BUILD b/tensorflow/compiler/aot/BUILD
index ffa2d088295375bbbcd2cdd9365982907f2bf480..fa03b1f3c2dfc334d4a3871e6a1bf5503fa8d5f8 100644
--- a/tensorflow/compiler/aot/BUILD
+++ b/tensorflow/compiler/aot/BUILD
@@ -250,17 +250,3 @@ exports_files([
     "benchmark_main.template",  # used by tf_library(...,gen_benchmark=True)
     "test.cc",  # used by tf_library(...,gen_test=True)
 ])
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/aot/codegen_test.cc b/tensorflow/compiler/aot/codegen_test.cc
index 972b7d51ecb3798e61757ac55e973075a23b433a..2642536c4f67eba8eedf315f24d800e7913d62a0 100644
--- a/tensorflow/compiler/aot/codegen_test.cc
+++ b/tensorflow/compiler/aot/codegen_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -33,7 +34,7 @@ namespace {
 
 void ExpectErrorContains(const Status& status, StringPiece str) {
   EXPECT_NE(Status::OK(), status);
-  EXPECT_TRUE(StringPiece(status.error_message()).contains(str))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(), str))
       << "expected error: " << status.error_message() << " to contain: " << str;
 }
 
diff --git a/tensorflow/compiler/aot/tests/BUILD b/tensorflow/compiler/aot/tests/BUILD
index 28aab6eb614ca7123d9e00f7f5cc3661b62e23f7..b053dad1b57c258b7cb0d6831923e6a0f30f5e7e 100644
--- a/tensorflow/compiler/aot/tests/BUILD
+++ b/tensorflow/compiler/aot/tests/BUILD
@@ -182,17 +182,3 @@ tf_cc_test(
         "//third_party/eigen3",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/aot/tfcompile.bzl b/tensorflow/compiler/aot/tfcompile.bzl
index 9dff1be09fede6f65f82c2f36d94be07e781949f..3a877c5337ff76193a7f27fb9681e5a9ca500961 100644
--- a/tensorflow/compiler/aot/tfcompile.bzl
+++ b/tensorflow/compiler/aot/tfcompile.bzl
@@ -132,7 +132,7 @@ def tf_library(name, graph, config,
   header_file = name + ".h"
   metadata_object_file = name + "_tfcompile_metadata.o"
   function_object_file = name + "_tfcompile_function.o"
-  ep = ("__" + PACKAGE_NAME + "__" + name).replace("/", "_")
+  ep = ("__" + native.package_name() + "__" + name).replace("/", "_")
   if type(tfcompile_flags) == type(""):
     flags = tfcompile_flags
   else:
diff --git a/tensorflow/compiler/aot/tfcompile_main.cc b/tensorflow/compiler/aot/tfcompile_main.cc
index e2f01179d4e2e4f6ef72b2761d06e130ffa3a94f..8ea014c2eede2cb7a9cede9dd4ade8b970bd519c 100644
--- a/tensorflow/compiler/aot/tfcompile_main.cc
+++ b/tensorflow/compiler/aot/tfcompile_main.cc
@@ -55,7 +55,7 @@ const char kUsageHeader[] =
     "\n";
 
 Status ReadProtoFile(const string& fname, protobuf::Message* proto) {
-  if (StringPiece(fname).ends_with(".pbtxt")) {
+  if (str_util::EndsWith(fname, ".pbtxt")) {
     return ReadTextProto(Env::Default(), fname, proto);
   } else {
     return ReadBinaryProto(Env::Default(), fname, proto);
diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index c4a2d4ab0321bbf9db91f5e4387084c27e576b87..24aa203c00b3a011ae11007e308f8bbb6998204e 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -76,6 +76,7 @@ cc_library(
         ":jit_compilation_passes",
         ":xla_device",
         "//tensorflow/compiler/jit/kernels:xla_launch_op",
+        "//tensorflow/compiler/jit/legacy_flags:xla_device_flags",
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/tf2xla/kernels:xla_ops",
         "//tensorflow/compiler/xla/service:cpu_plugin",  # buildcleaner: keep
@@ -118,14 +119,33 @@ cc_library(
     alwayslink = 1,
 )
 
+cc_library(
+    name = "xla_tensor",
+    srcs = ["xla_tensor.cc"],
+    hdrs = ["xla_tensor.h"],
+    deps = [
+        ":common",
+        "//tensorflow/compiler/tf2xla:common",
+        "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/service:shaped_buffer",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
 cc_library(
     name = "xla_device",
     srcs = [
+        "xla_compile_on_demand_op.cc",
         "xla_device.cc",
         "xla_device_context.cc",
         "xla_device_ops.cc",
     ],
     hdrs = [
+        "xla_compile_on_demand_op.h",
         "xla_device.h",
         "xla_device_context.h",
         "xla_device_ops.h",
@@ -136,6 +156,7 @@ cc_library(
         ":common",
         ":jit_compilation_passes",
         ":xla_launch_util",
+        ":xla_tensor",
         "//tensorflow/compiler/jit/ops:xla_ops",
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:dump_graph",
@@ -182,6 +203,7 @@ cc_library(
     deps = [
         ":common",
         ":xla_compilation_cache",
+        ":xla_tensor",
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
@@ -328,6 +350,7 @@ tf_cc_test(
     deps = [
         ":common",
         ":compilation_passes",
+        ":graph_to_functiondef",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:cc_ops_internal",
         "//tensorflow/cc:function_ops",
@@ -338,26 +361,13 @@ tf_cc_test(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
     ],
 )
 
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 # This target can be used by XLA device plugins to prevent circular dependencies, and provides access to all of the required headers for building a device library.
 cc_header_only_library(
     name = "xla_jit_headers_lib",
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
index 2d175c40f9dfaef4e5024b77a6ecb8d6022e7a56..b04b333141a616e7c4db2751c14ec6eb0b7725b5 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
@@ -53,6 +53,8 @@ namespace tensorflow {
 const char* const kXlaCompiledKernelAttr = "_XlaCompiledKernel";
 const char* const kXlaNumConstantArgsAttr = "_XlaNumConstantArgs";
 const char* const kXlaNumResourceArgsAttr = "_XlaNumResourceArgs";
+const char* const kXlaHostTransferSequencerAttr =
+    "_xla_host_transfer_sequencer";
 
 namespace {
 
@@ -143,7 +145,7 @@ struct NodeSlot {
 // everything to use it.
 static const char* const kArgOp = "_Arg";
 static const char* const kRetValOp = "_Retval";
-static const char* const kHostComputeOp = "_XlaHostCompute";
+static const char* const kHostComputeOp = "XlaHostCompute";
 static const char* const kSendFromHostOp = "_XlaSendFromHost";
 static const char* const kRecvAtHostOp = "_XlaRecvAtHost";
 
@@ -252,7 +254,8 @@ class Encapsulator {
 
     // Adds _RecvAtHost and _SendFromHost nodes, where needed, to graph_out.
     Status AddOutsideCompilationHostIONodes(
-        const string& subgraph_name,
+        const string& group_attribute, const string& subgraph_name,
+        const string& outside_compilation_attribute,
         const std::unordered_map<const Node*, Node*>& node_images,
         Graph* graph_out);
 
@@ -328,12 +331,14 @@ class Encapsulator {
     Status MakeSequencingNode(const string& subgraph_name, Graph* graph_out);
 
     // If there is a sequencer node, adds a control edge from the sequencer to
-    // all the downstream nodes of call_node_outputs.
-    void ConnectSequencerToOutputs(Graph* graph_out);
+    // the call node.
+    void ConnectSequencerToCallNode(Graph* graph_out);
 
     Status AddShapeInferenceInfo(
+        const string& subgraph_name,
         const string& outside_compilation_subgraph_name,
-        const std::vector<TensorShapeProto>& shapes, GraphDef* inference_graph);
+        const std::vector<TensorShapeProto>& shapes, Graph* inference_graph,
+        FunctionLibraryDefinition* library);
 
     Status ReplaceFunctionDef(FunctionLibraryDefinition* library);
 
@@ -401,7 +406,9 @@ class Encapsulator {
 
     // Builds a _RecvAtHost node producing all the inputs of an
     // outside_compilation subgraph and stores it in oc_subgraph.recv_at_host.
-    Status AddRecvAtHostNode(const string& subgraph_name,
+    Status AddRecvAtHostNode(const string& group_attribute,
+                             const string& subgraph_name,
+                             const string& outside_compilation_attribute,
                              const string& oc_subgraph_name,
                              OutsideCompilationSubgraph* oc_subgraph,
                              Graph* graph_out);
@@ -410,8 +417,10 @@ class Encapsulator {
     // outside_compilation subgraph and stores it in oc_subgraph.send_from_host.
     Status AddSendFromHostNode(
         const std::unordered_map<const Node*, Node*>& node_images,
-        const string& subgraph_name, const string& oc_subgraph_name,
-        OutsideCompilationSubgraph* oc_subgraph, Graph* graph_out);
+        const string& group_attribute, const string& subgraph_name,
+        const string& outside_compilation_attribute,
+        const string& oc_subgraph_name, OutsideCompilationSubgraph* oc_subgraph,
+        Graph* graph_out);
 
     // The subgraph extracted from the input graph, suitable for being turned
     // into a FunctionDef. Inputs are fed by _Arg nodes, and outputs are
@@ -425,6 +434,10 @@ class Encapsulator {
     // NodeDef for the function call node.
     NodeDef call_node_def_;
 
+    // Name that is used for the call node. This may not be
+    // call_node_def_.name() if the client supplies a rewrite lambda.
+    string function_def_name_;
+
     // Placeholder node simulating the host compute key in the output graph.
     // Not owned.
     Node* host_compute_key_placeholder_ = nullptr;
@@ -567,7 +580,7 @@ class Encapsulator {
       const std::unordered_set<string>& recv_at_host_nodes, Node* send_node,
       FunctionLibraryDefinition* library,
       std::vector<TensorShapeProto>* static_shape_out,
-      std::unique_ptr<GraphDef>* graphdef_out);
+      std::unique_ptr<Graph>* graph_out);
 
   // Makes a copy of graph containing only nodes that are ancestors of at least
   // one node in send_from_host_nodes and store it in pruned_graph. On exit
@@ -812,6 +825,7 @@ Status Encapsulator::Subgraph::AddHostComputes(
       builder.Attr("key",
                    strings::StrCat("host_compute_channel_", subgraph_name, "_",
                                    oc_subgraph_name));
+      builder.Attr("_outside_compilation_subgraph", oc_subgraph_name);
       Status s = builder.Finalize(&host_compute_def);
       if (!s.ok()) return s;
 
@@ -863,25 +877,21 @@ Status Encapsulator::Subgraph::MakeSequencingNode(const string& subgraph_name,
     NodeDef seq_def;
     NodeDefBuilder builder(strings::StrCat(subgraph_name, "_sequencer"),
                            "NoOp");
+    builder.Attr(kXlaHostTransferSequencerAttr, subgraph_name);
+    builder.Device(device_);
     Status s = builder.Finalize(&seq_def);
     if (!s.ok()) return s;
 
     sequencer_ = graph_out->AddNode(seq_def, &s);
     if (!s.ok()) return s;
-    sequencer_->set_assigned_device_name(device_);
   }
   return Status::OK();
 }
 
-void Encapsulator::Subgraph::ConnectSequencerToOutputs(Graph* graph_out) {
+void Encapsulator::Subgraph::ConnectSequencerToCallNode(Graph* graph_out) {
   if (sequencer_ != nullptr) {
-    std::unordered_set<Node*> output_dependencies;
-    for (Node* node : call_node_outputs_->out_nodes()) {
-      output_dependencies.insert(node);
-    }
-    for (Node* node : output_dependencies) {
-      graph_out->AddControlEdge(sequencer_, node);
-    }
+    VLOG(2) << "ConnectSequencerToCallNode";
+    graph_out->AddControlEdge(sequencer_, call_node_inputs_);
   }
 }
 
@@ -927,6 +937,8 @@ Status Encapsulator::Subgraph::BuildFunctionDef(
     name = call_node_def_.op();
   }
 
+  function_def_name_ = name;
+
   FunctionDef fdef;
   TF_RETURN_IF_ERROR(GraphToFunctionDef(*graph_, name, &fdef));
 
@@ -945,8 +957,10 @@ Status Encapsulator::Subgraph::BuildFunctionDef(
 }
 
 Status Encapsulator::Subgraph::AddShapeInferenceInfo(
+    const string& subgraph_name,
     const string& outside_compilation_subgraph_name,
-    const std::vector<TensorShapeProto>& shapes, GraphDef* inference_graph) {
+    const std::vector<TensorShapeProto>& shapes, Graph* inference_graph,
+    FunctionLibraryDefinition* library) {
   OutsideCompilationSubgraph& oc_subgraph =
       outside_compilation_subgraphs_.at(outside_compilation_subgraph_name);
 
@@ -968,21 +982,22 @@ Status Encapsulator::Subgraph::AddShapeInferenceInfo(
     host_compute->AddAttr("shape_inference_graph", "");
     host_compute->AddAttr("shapes", shapes);
   } else {
-    string serialized_graph;
-    if (!inference_graph->SerializeToString(&serialized_graph)) {
-      return errors::Internal(
-          "Failed to serialize graph for outside compilation subgraph ",
-          oc_subgraph.host_compute_name);
-    }
-    host_compute->AddAttr("shape_inference_graph", serialized_graph);
+    string inference_graph_name =
+        strings::StrCat("_outside_compilation_shape_inference_", subgraph_name,
+                        "_", outside_compilation_subgraph_name);
+    FunctionDef fdef;
+    TF_RETURN_IF_ERROR(
+        GraphToFunctionDef(*inference_graph, inference_graph_name, &fdef));
+    host_compute->AddAttr("shape_inference_graph", inference_graph_name);
     host_compute->AddAttr("shapes", std::vector<TensorShapeProto>());
+    TF_RETURN_IF_ERROR(library->AddFunctionDef(fdef));
   }
   return Status::OK();
 }
 
 Status Encapsulator::Subgraph::ReplaceFunctionDef(
     FunctionLibraryDefinition* library) {
-  const string& name = call_node_def_.name();
+  const string& name = function_def_name_;
 
   FunctionDef fdef;
   TF_RETURN_IF_ERROR(GraphToFunctionDef(*graph_, name, &fdef));
@@ -1105,7 +1120,8 @@ Status Encapsulator::Subgraph::AddHostComputeKeyPlaceholder(
 }
 
 Status Encapsulator::Subgraph::AddRecvAtHostNode(
-    const string& subgraph_name, const string& oc_subgraph_name,
+    const string& group_attribute, const string& subgraph_name,
+    const string& outside_compilation_attribute, const string& oc_subgraph_name,
     OutsideCompilationSubgraph* oc_subgraph, Graph* graph_out) {
   if (host_compute_key_placeholder_ == nullptr) {
     TF_RETURN_IF_ERROR(AddHostComputeKeyPlaceholder(oc_subgraph, graph_out));
@@ -1128,17 +1144,19 @@ Status Encapsulator::Subgraph::AddRecvAtHostNode(
                          kRecvAtHostOp);
   builder.Device(device_);
   builder.Attr("Toutputs", dtypes);
-  // TODO(misard) For now we only support TPU device 0.
+  // The correct device_ordinal will be inserted during replication in a
+  // subsequent rewrite.
   builder.Attr("device_ordinal", 0);
   builder.Attr("key", strings::StrCat("host_compute_channel_", subgraph_name,
                                       "_", oc_subgraph_name));
+  builder.Attr(group_attribute, subgraph_name);
+  builder.Attr(outside_compilation_attribute, oc_subgraph_name);
   builder.Input(host_compute_key_placeholder_->name(), 0, DT_STRING);
   Status s = builder.Finalize(&recv_def);
   if (!s.ok()) return s;
 
   oc_subgraph->recv_at_host = graph_out->AddNode(recv_def, &s);
   if (!s.ok()) return s;
-  oc_subgraph->recv_at_host->set_assigned_device_name(device_);
   graph_out->AddEdge(host_compute_key_placeholder_, 0,
                      oc_subgraph->recv_at_host, 0);
 
@@ -1153,7 +1171,8 @@ Status Encapsulator::Subgraph::AddRecvAtHostNode(
 
 Status Encapsulator::Subgraph::AddSendFromHostNode(
     const std::unordered_map<const Node*, Node*>& node_images,
-    const string& subgraph_name, const string& oc_subgraph_name,
+    const string& group_attribute, const string& subgraph_name,
+    const string& outside_compilation_attribute, const string& oc_subgraph_name,
     OutsideCompilationSubgraph* oc_subgraph, Graph* graph_out) {
   if (host_compute_key_placeholder_ == nullptr) {
     TF_RETURN_IF_ERROR(AddHostComputeKeyPlaceholder(oc_subgraph, graph_out));
@@ -1182,8 +1201,11 @@ Status Encapsulator::Subgraph::AddSendFromHostNode(
   builder.Attr("Tinputs", dtypes);
   builder.Attr("key", strings::StrCat("host_compute_channel_", subgraph_name,
                                       "_", oc_subgraph_name));
-  // TODO(misard) For now we only support TPU device 0.
+  // The correct device_ordinal will be inserted during replication in a
+  // subsequent rewrite.
   builder.Attr("device_ordinal", 0);
+  builder.Attr(group_attribute, subgraph_name);
+  builder.Attr(outside_compilation_attribute, oc_subgraph_name);
   builder.Input(inputs);
   builder.Input(host_compute_key_placeholder_->name(), 0, DT_STRING);
   Status s = builder.Finalize(&send_def);
@@ -1191,7 +1213,6 @@ Status Encapsulator::Subgraph::AddSendFromHostNode(
 
   oc_subgraph->send_from_host = graph_out->AddNode(send_def, &s);
   if (!s.ok()) return s;
-  oc_subgraph->send_from_host->set_assigned_device_name(device_);
   graph_out->AddEdge(host_compute_key_placeholder_, 0,
                      oc_subgraph->send_from_host, inputs.size());
 
@@ -1205,7 +1226,8 @@ Status Encapsulator::Subgraph::AddSendFromHostNode(
 }
 
 Status Encapsulator::Subgraph::AddOutsideCompilationHostIONodes(
-    const string& subgraph_name,
+    const string& group_attribute, const string& subgraph_name,
+    const string& outside_compilation_attribute,
     const std::unordered_map<const Node*, Node*>& node_images,
     Graph* graph_out) {
   for (auto& outside_compilation_subgraph_entry :
@@ -1215,14 +1237,16 @@ Status Encapsulator::Subgraph::AddOutsideCompilationHostIONodes(
         outside_compilation_subgraph_entry.second;
 
     if (!oc_subgraph.inputs.empty() || !oc_subgraph.control_inputs.empty()) {
-      TF_RETURN_IF_ERROR(
-          AddRecvAtHostNode(subgraph_name, oc_name, &oc_subgraph, graph_out));
+      TF_RETURN_IF_ERROR(AddRecvAtHostNode(group_attribute, subgraph_name,
+                                           outside_compilation_attribute,
+                                           oc_name, &oc_subgraph, graph_out));
     }
 
     if (!oc_subgraph.outputs_by_src.empty() ||
         !oc_subgraph.control_outputs.empty()) {
-      TF_RETURN_IF_ERROR(AddSendFromHostNode(node_images, subgraph_name,
-                                             oc_name, &oc_subgraph, graph_out));
+      TF_RETURN_IF_ERROR(AddSendFromHostNode(
+          node_images, group_attribute, subgraph_name,
+          outside_compilation_attribute, oc_name, &oc_subgraph, graph_out));
     }
   }
   return Status::OK();
@@ -1439,8 +1463,6 @@ Status Encapsulator::CopyNodesToOutputGraph(
             "Parallel checking is not supported when outside_compilation "
             "clusters are present.");
       }
-      image->ClearAttr(group_attribute_);
-      image->ClearAttr(outside_compilation_attribute_);
     }
     (*node_images)[node] = image;
   }
@@ -1466,7 +1488,8 @@ Status Encapsulator::AddOutsideCompilationHostIONodes(
     const string& subgraph_name = subgraph_entry.first;
     Subgraph& subgraph = subgraph_entry.second;
     TF_RETURN_IF_ERROR(subgraph.AddOutsideCompilationHostIONodes(
-        subgraph_name, node_images, graph_out));
+        group_attribute_, subgraph_name, outside_compilation_attribute_,
+        node_images, graph_out));
   }
   return Status::OK();
 }
@@ -1675,7 +1698,7 @@ Status Encapsulator::AddEdgesToOutputGraph(
 
   for (auto& subgraph_entry : subgraphs_) {
     Subgraph& subgraph = subgraph_entry.second;
-    subgraph.ConnectSequencerToOutputs(graph_out);
+    subgraph.ConnectSequencerToCallNode(graph_out);
   }
 
   return Status::OK();
@@ -1754,7 +1777,7 @@ Status Encapsulator::DoStaticShapeInferenceForOutsideCompilationSend(
     const std::unordered_set<string>& recv_at_host_nodes, Node* send_node,
     FunctionLibraryDefinition* library,
     std::vector<TensorShapeProto>* static_shape_out,
-    std::unique_ptr<GraphDef>* graphdef_out) {
+    std::unique_ptr<Graph>* graph_out) {
   // Maps from nodes in graph_in to nodes in graph_out.
   //
   // When an edge has fully defined shape the source node in graph_in is
@@ -1771,8 +1794,8 @@ Status Encapsulator::DoStaticShapeInferenceForOutsideCompilationSend(
   std::unordered_map<Node*, Node*> dummy_node_images;
   std::unordered_map<Node*, Node*> copied_node_images;
 
-  std::unique_ptr<Graph> graph_out(new Graph(graph_in.op_registry()));
-  graph_out->set_versions(graph_in.versions());
+  graph_out->reset(new Graph(graph_in.op_registry()));
+  (*graph_out)->set_versions(graph_in.versions());
   // The final input to the send node is the dynamic key, which we don't include
   // in the static shapes.
   static_shape_out->resize(send_node->num_inputs() - 1);
@@ -1794,7 +1817,7 @@ Status Encapsulator::DoStaticShapeInferenceForOutsideCompilationSend(
     if (w.leave) {
       TF_RETURN_IF_ERROR(CopyShapeInferenceNodeToGraph(
           n, send_node, dummy_node_images, library, &copied_node_images,
-          graph_out.get()));
+          graph_out->get()));
     } else {
       if (visited[n->id()]) continue;
       visited[n->id()] = true;
@@ -1818,7 +1841,7 @@ Status Encapsulator::DoStaticShapeInferenceForOutsideCompilationSend(
             context->ShapeHandleToProto(shape, &proto);
             if (dummy_node_images.find(src_node) == dummy_node_images.end()) {
               dummy_node_images[src_node] = AddDummyShapedNode(
-                  src_node->output_type(src_port), proto, graph_out.get());
+                  src_node->output_type(src_port), proto, graph_out->get());
             }
             // The final input to the send node is the dynamic key, which we
             // don't include in the static shapes.
@@ -1827,8 +1850,12 @@ Status Encapsulator::DoStaticShapeInferenceForOutsideCompilationSend(
               (*static_shape_out)[in_edge->dst_input()] = proto;
             }
           } else {
+            has_parent_with_unknown_shape = true;
             if (!visited[src_node->id()]) {
-              has_parent_with_unknown_shape = true;
+              if (VLOG_IS_ON(2)) {
+                TensorShapeProto proto;
+                context->ShapeHandleToProto(shape, &proto);
+              }
               stack.push_back({src_node, false});
             }
           }
@@ -1839,7 +1866,7 @@ Status Encapsulator::DoStaticShapeInferenceForOutsideCompilationSend(
           // The shapes of all the inputs to send_node are statically known. We
           // won't have to do any inference at compile time so return now: the
           // shapes were stored in static_shape_out above.
-          graphdef_out->reset();
+          graph_out->reset();
           return Status::OK();
         } else {
           // Any shape that is being processed is either the original send node
@@ -1862,9 +1889,6 @@ Status Encapsulator::DoStaticShapeInferenceForOutsideCompilationSend(
     }
   }
 
-  graphdef_out->reset(new GraphDef());
-  graph_out->ToGraphDef(graphdef_out->get());
-
   return Status::OK();
 }
 
@@ -1981,14 +2005,20 @@ Status Encapsulator::GetShapeInfoForOutsideCompilationSends(
   TF_RETURN_IF_ERROR(MakeGraphForOutsideCompilationSends(
       *graph_out, &pruned_graph, &shape_refiner, &node_images, library));
 
+  if (VLOG_IS_ON(1)) {
+    dump_graph::DumpGraphToFile("pruned_graph_for_shape_inference",
+                                *pruned_graph, library);
+  }
+
   for (auto& subgraph_entry : subgraphs_) {
+    const string& subgraph_name = subgraph_entry.first;
     Subgraph& subgraph = subgraph_entry.second;
     // Find all the recv_at_host nodes in this subgraph.
     std::vector<string> outside_compilation_names;
     subgraph.GetOutsideCompilationSubgraphNames(&outside_compilation_names);
     std::unordered_set<string> recv_at_host_names;
-    for (const auto& name : outside_compilation_names) {
-      Node* recv_node = subgraph.GetRecvAtHostNode(name);
+    for (const auto& oc_name : outside_compilation_names) {
+      Node* recv_node = subgraph.GetRecvAtHostNode(oc_name);
       if (recv_node != nullptr) {
         recv_at_host_names.insert(recv_node->name());
       }
@@ -1997,26 +2027,30 @@ Status Encapsulator::GetShapeInfoForOutsideCompilationSends(
     // without knowing the shape of the recv_at_host nodes, and store the
     // result, along with enough information to complete the job at compile time
     // once the recv_at_host shapes are known.
-    for (const auto& name : outside_compilation_names) {
-      Node* send_node = subgraph.GetSendFromHostNode(name);
+    for (const auto& oc_name : outside_compilation_names) {
+      Node* send_node = subgraph.GetSendFromHostNode(oc_name);
       std::vector<TensorShapeProto> static_shape;
-      std::unique_ptr<GraphDef> graphdef;
+      std::unique_ptr<Graph> graph;
       if (send_node != nullptr) {
         TF_RETURN_IF_ERROR(DoStaticShapeInferenceForOutsideCompilationSend(
             *pruned_graph, shape_refiner, recv_at_host_names,
-            node_images[send_node], library, &static_shape, &graphdef));
-        if (graphdef == nullptr) {
+            node_images[send_node], library, &static_shape, &graph));
+        if (graph == nullptr) {
           VLOG(2) << "Send node  " << send_node->name() << " shapes";
           for (int i = 0; i < static_shape.size(); ++i) {
             VLOG(2) << static_shape[i].DebugString();
           }
         } else {
-          VLOG(2) << "Send node " << send_node->name() << " graph\n"
-                  << graphdef->DebugString();
+          if (VLOG_IS_ON(2)) {
+            GraphDef graphdef;
+            graph->ToGraphDef(&graphdef);
+            VLOG(2) << "Send node " << send_node->name() << " graph\n"
+                    << graphdef.DebugString();
+          }
         }
       }
-      TF_RETURN_IF_ERROR(
-          subgraph.AddShapeInferenceInfo(name, static_shape, graphdef.get()));
+      TF_RETURN_IF_ERROR(subgraph.AddShapeInferenceInfo(
+          subgraph_name, oc_name, static_shape, graph.get(), library));
     }
     if (!outside_compilation_names.empty()) {
       TF_RETURN_IF_ERROR(subgraph.ReplaceFunctionDef(library));
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
index d7bea56a7244665c571c23d49c6769a163b86e9e..8599a7038af9663e5af6f3231429cb7f6ea5f69b 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
@@ -13,22 +13,46 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <memory>
 #include <utility>
 
 #include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h"
 
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/compiler/jit/graph_to_functiondef.h"
 #include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/util/equal_graph_def.h"
 
 namespace tensorflow {
 namespace {
 
+const char* const kXlaHostTransferSequencerAttr =
+    "_xla_host_transfer_sequencer";
+
+Status AddGraphDefToFunctionLibrary(const GraphDefBuilder& graphdef_builder,
+                                    const string& name_suffix,
+                                    FunctionDefLibrary* library) {
+  GraphDef graphdef;
+  TF_RETURN_IF_ERROR(graphdef_builder.ToGraphDef(&graphdef));
+  std::unique_ptr<Graph> graph =
+      std::unique_ptr<Graph>(new Graph(OpRegistry::Global()));
+  GraphConstructorOptions opts;
+  opts.allow_internal_ops = true;
+  TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(opts, graphdef, graph.get()));
+  FunctionDef* fdef = library->add_function();
+  TF_RETURN_IF_ERROR(GraphToFunctionDef(
+      *graph,
+      strings::StrCat("_outside_compilation_shape_inference_", name_suffix),
+      fdef));
+  return Status::OK();
+}
+
 template <class Tkey, class Tvalue>
 bool EqualProtoMap(const ::tensorflow::protobuf::Map<Tkey, Tvalue>& a,
                    const ::tensorflow::protobuf::Map<Tkey, Tvalue>& b,
@@ -112,23 +136,7 @@ bool EqualFunctionNodeDef(const NodeDef& a, const NodeDef& b,
       a.attr(), b.attr(), [](const string& s) { return s; },
       [](const AttrValue& v) { return v.DebugString(); },
       [](const string& key, const AttrValue& av, const AttrValue& bv) {
-        if (key == "shape_inference_graph") {
-          // Default serialization of GraphDef is unstable because maps don't
-          // serialize deterministically. Rather than go through the hoops to
-          // turn on deterministic serialization of this attr just for this
-          // test, add logic here to compare determinstically.
-          GraphDef ga;
-          if (!ga.ParseFromString(av.s())) {
-            return false;
-          }
-          GraphDef gb;
-          if (!gb.ParseFromString(bv.s())) {
-            return false;
-          }
-          return EqualGraphDef(ga, gb, nullptr);
-        } else {
-          return av.DebugString() == bv.DebugString();
-        }
+        return av.DebugString() == bv.DebugString();
       },
       strings::StrCat(diff_preamble, " attr mismatch for node ", a.name()),
       diff);
@@ -248,7 +256,7 @@ bool EqualFunctionDefLibrary(const FunctionDefLibrary& expected,
 
 // These dummy Op registrations are here because the real Op registrations live
 // in contrib and there can't be a dependence from this test to contrib.
-REGISTER_OP("_XlaHostCompute")
+REGISTER_OP("XlaHostCompute")
     .Input("inputs: Tinputs")
     .Output("outputs: Toutputs")
     .Attr("Tinputs: list(type) >= 0")
@@ -321,8 +329,13 @@ REGISTER_OP("AddNLikeTest")
     .SetIsCommutative()
     .SetIsAggregate();
 
-Node* NoOp(const GraphDefBuilder::Options& opts) {
-  return ops::SourceOp("NoOp", opts);
+Node* Sequencer(const GraphDefBuilder::Options& opts,
+                const string& call_node_name) {
+  if (opts.HaveError()) return nullptr;
+  NodeBuilder node_builder(opts.GetNameForOp("NoOp"), "NoOp",
+                           opts.op_registry());
+  return opts.WithAttr(kXlaHostTransferSequencerAttr, call_node_name)
+      .FinalizeBuilder(&node_builder);
 }
 
 Node* Input(const GraphDefBuilder::Options& opts) {
@@ -370,24 +383,36 @@ Node* KeyPlaceholder(const string& call_node,
       .FinalizeBuilder(&node_builder);
 }
 
-Node* RecvAtHost(ops::NodeOut key_input, const string& key,
+Node* RecvAtHost(ops::NodeOut key_input, const string& cluster,
+                 const string& oc_cluster,
                  const gtl::ArraySlice<DataType>& dtypes,
                  const GraphDefBuilder::Options& opts) {
   if (opts.HaveError()) return nullptr;
-  NodeBuilder node_builder(opts.GetNameForOp("_XlaRecvAtHost"),
+  string key =
+      strings::StrCat("host_compute_channel_", cluster, "_", oc_cluster);
+  string name = strings::StrCat("outside_compilation_", cluster, "_",
+                                oc_cluster, "_recv");
+  NodeBuilder node_builder(opts.WithName(name).GetNameForOp("_XlaRecvAtHost"),
                            "_XlaRecvAtHost", opts.op_registry());
   node_builder.Input(std::move(key_input));
   return opts.WithAttr("Toutputs", dtypes)
       .WithAttr("key", key)
       .WithAttr("device_ordinal", 0)
+      .WithAttr("_encapsulate", cluster)
+      .WithAttr("_outside", oc_cluster)
       .FinalizeBuilder(&node_builder);
 }
 
-Node* SendFromHost(ops::NodeOut key_input, const string& key,
+Node* SendFromHost(ops::NodeOut key_input, const string& cluster,
+                   const string& oc_cluster,
                    const std::vector<ops::NodeOut>& inputs,
                    const GraphDefBuilder::Options& opts) {
   if (opts.HaveError()) return nullptr;
-  NodeBuilder node_builder(opts.GetNameForOp("_XlaSendFromHost"),
+  string key =
+      strings::StrCat("host_compute_channel_", cluster, "_", oc_cluster);
+  string name = strings::StrCat("outside_compilation_", cluster, "_",
+                                oc_cluster, "_send");
+  NodeBuilder node_builder(opts.WithName(name).GetNameForOp("_XlaSendFromHost"),
                            "_XlaSendFromHost", opts.op_registry());
   node_builder.Input(inputs);
   node_builder.Input(std::move(key_input));
@@ -398,6 +423,8 @@ Node* SendFromHost(ops::NodeOut key_input, const string& key,
   return opts.WithAttr("Tinputs", dtypes)
       .WithAttr("key", key)
       .WithAttr("device_ordinal", 0)
+      .WithAttr("_encapsulate", cluster)
+      .WithAttr("_outside", oc_cluster)
       .FinalizeBuilder(&node_builder);
 }
 
@@ -745,7 +772,7 @@ TEST(EncapsulateSubgraphsWithGuaranteeConstOpTest, Simple) {
         Graph* graph = graph_ptr->get();
         for (const Node* n : graph->nodes()) {
           if (n->type_string() == "_Arg" &&
-              StringPiece(n->name()).starts_with("const")) {
+              str_util::StartsWith(n->name(), "const")) {
             ++guaranteed_consts;
             EXPECT_TRUE(HasGuaranteeConstAttr(*n));
           } else {
@@ -790,7 +817,7 @@ TEST(EncapsulateSubgraphsWithGuaranteeConstOpTest, Add) {
         Graph* graph = graph_ptr->get();
         for (const Node* n : graph->nodes()) {
           if (n->type_string() == "_Arg" &&
-              StringPiece(n->name()).starts_with("const")) {
+              str_util::StartsWith(n->name(), "const")) {
             ++guaranteed_consts;
             EXPECT_TRUE(HasGuaranteeConstAttr(*n));
           } else {
@@ -840,22 +867,20 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) {
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
 
-  string shape_string_expected;
   {
     GraphDefBuilder shape(GraphDefBuilder::kFailImmediately);
     Node* key_constant =
         KeyPlaceholderShape(shape.opts().WithName("KnownShape/_0"));
-    Node* recv =
-        RecvAtHost(ops::NodeOut(key_constant, 0), "host_compute_channel_F1_O1",
-                   {DT_FLOAT, DT_FLOAT},
-                   shape.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                            {DT_FLOAT, DT_FLOAT}, shape.opts());
     Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1),
-                     shape.opts().WithName("E"));
-    SendFromHost(ops::NodeOut(key_constant, 0), "host_compute_channel_F1_O1",
-                 {e}, shape.opts().WithName("outside_compilation_F1_O1_send"));
-    GraphDef shape_graph;
-    TF_EXPECT_OK(shape.ToGraphDef(&shape_graph));
-    EXPECT_TRUE(shape_graph.SerializeToString(&shape_string_expected));
+                     shape.opts()
+                         .WithName("E")
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O1"));
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, shape.opts());
+    TF_EXPECT_OK(
+        AddGraphDefToFunctionLibrary(shape, "F1_O1", &library_expected));
   }
 
   *library_expected.add_function() = test::function::XTimesTwo();
@@ -870,13 +895,15 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) {
            {},
            {"outside_compilation_O1_host_compute"}},
           {{"outside_compilation_O1_host_compute"},
-           "_XlaHostCompute",
+           "XlaHostCompute",
            {"C:o:0", "c:o:0"},
            {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})},
             {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
             {"key", "host_compute_channel_F1_O1"},
-            {"shape_inference_graph", shape_string_expected},
-            {"shapes", gtl::ArraySlice<DataType>({})}},
+            {"shape_inference_graph",
+             "_outside_compilation_shape_inference_F1_O1"},
+            {"shapes", gtl::ArraySlice<DataType>({})},
+            {"_outside_compilation_subgraph", "O1"}},
            {"c"}},
       },
       {{"f_0_retval", "F:o:0"}});
@@ -888,28 +915,29 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) {
     Node* a = Input(b2.opts().WithName("A"));
     Node* b = Input(b2.opts().WithName("B"));
 
-    NodeBuilder node_builder("F1", "F1", lib_def.get());
-    node_builder.Input(a).Input(b);
-    Node* call = b2.opts().FinalizeBuilder(&node_builder);
-
     Node* key_constant =
         KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
-    Node* recv =
-        RecvAtHost(ops::NodeOut(key_constant, 0), "host_compute_channel_F1_O1",
-                   {DT_FLOAT, DT_FLOAT},
-                   b2.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                            {DT_FLOAT, DT_FLOAT}, b2.opts());
     Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1),
-                     b2.opts().WithName("E").WithControlInputs({recv, b}));
-    Node* send = SendFromHost(ops::NodeOut(key_constant, 0),
-                              "host_compute_channel_F1_O1", {e},
-                              b2.opts()
-                                  .WithName("outside_compilation_F1_O1_send")
-                                  .WithControlInput(e));
+                     b2.opts()
+                         .WithName("E")
+                         .WithControlInputs({recv, b})
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O1"));
+    Node* send = SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
+                              b2.opts().WithControlInput(e));
+
+    Node* s = Sequencer(
+        b2.opts().WithName("F1_sequencer").WithControlInputs({recv, send}),
+        "F1");
 
-    Node* s = NoOp(
-        b2.opts().WithName("F1_sequencer").WithControlInputs({recv, send}));
+    NodeBuilder node_builder("F1", "F1", lib_def.get());
+    node_builder.Input(a).Input(b);
+    Node* call =
+        b2.opts().WithControlInputs({s}).FinalizeBuilder(&node_builder);
 
-    Binary(a, call, b2.opts().WithName("G").WithControlInputs({s, e}));
+    Binary(a, call, b2.opts().WithName("G").WithControlInputs({e}));
     TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
   }
 
@@ -959,45 +987,43 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
 
-  string shape_string_expected_1;
   {
     GraphDefBuilder shape1(GraphDefBuilder::kFailImmediately);
     Node* key_constant =
         KeyPlaceholderShape(shape1.opts().WithName("KnownShape/_0"));
-    Node* recv =
-        RecvAtHost(ops::NodeOut(key_constant, 0), "host_compute_channel_F1_O1",
-                   {DT_FLOAT, DT_FLOAT},
-                   shape1.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                            {DT_FLOAT, DT_FLOAT}, shape1.opts());
     Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1),
-                     shape1.opts().WithName("E"));
-    SendFromHost(ops::NodeOut(key_constant, 0), "host_compute_channel_F1_O1",
-                 {e}, shape1.opts().WithName("outside_compilation_F1_O1_send"));
-    GraphDef shape1_graph;
-    TF_EXPECT_OK(shape1.ToGraphDef(&shape1_graph));
-    EXPECT_TRUE(shape1_graph.SerializeToString(&shape_string_expected_1));
+                     shape1.opts()
+                         .WithName("E")
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O1"));
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, shape1.opts());
+    TF_EXPECT_OK(
+        AddGraphDefToFunctionLibrary(shape1, "F1_O1", &library_expected));
   }
 
-  string shape_string_expected_2;
   {
     GraphDefBuilder shape2(GraphDefBuilder::kFailImmediately);
     Node* key_constant =
         KeyPlaceholderShape(shape2.opts().WithName("KnownShape/_0"));
-    Node* recv1 =
-        RecvAtHost(ops::NodeOut(key_constant, 0), "host_compute_channel_F1_O1",
-                   {DT_FLOAT, DT_FLOAT},
-                   shape2.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* recv1 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                             {DT_FLOAT, DT_FLOAT}, shape2.opts());
     Node* e = Binary(ops::NodeOut(recv1, 0), ops::NodeOut(recv1, 1),
-                     shape2.opts().WithName("E"));
-    Node* recv2 =
-        RecvAtHost(ops::NodeOut(key_constant, 0), "host_compute_channel_F1_O2",
-                   {DT_FLOAT, DT_FLOAT},
-                   shape2.opts().WithName("outside_compilation_F1_O2_recv"));
-    Node* h = Binary(ops::NodeOut(recv2, 0), e, shape2.opts().WithName("H"));
-    SendFromHost(ops::NodeOut(key_constant, 0), "host_compute_channel_F1_O2",
-                 {h}, shape2.opts().WithName("outside_compilation_F1_O2_send"));
-    GraphDef shape2_graph;
-    TF_EXPECT_OK(shape2.ToGraphDef(&shape2_graph));
-    EXPECT_TRUE(shape2_graph.SerializeToString(&shape_string_expected_2));
+                     shape2.opts()
+                         .WithName("E")
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O1"));
+    Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2",
+                             {DT_FLOAT, DT_FLOAT}, shape2.opts());
+    Node* h = Binary(ops::NodeOut(recv2, 0), e,
+                     shape2.opts()
+                         .WithName("H")
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O2"));
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O2", {h}, shape2.opts());
+    TF_EXPECT_OK(
+        AddGraphDefToFunctionLibrary(shape2, "F1_O2", &library_expected));
   }
 
   *library_expected.add_function() = FunctionDefHelper::Create(
@@ -1014,22 +1040,26 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
            {},
            {"outside_compilation_O1_host_compute"}},
           {{"outside_compilation_O2_host_compute"},
-           "_XlaHostCompute",
+           "XlaHostCompute",
            {"D:o:0", "F:o:0"},
            {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})},
             {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
             {"key", "host_compute_channel_F1_O2"},
-            {"shape_inference_graph", shape_string_expected_2},
-            {"shapes", gtl::ArraySlice<DataType>({})}},
+            {"shape_inference_graph",
+             "_outside_compilation_shape_inference_F1_O2"},
+            {"shapes", gtl::ArraySlice<DataType>({})},
+            {"_outside_compilation_subgraph", "O2"}},
            {"F"}},
           {{"outside_compilation_O1_host_compute"},
-           "_XlaHostCompute",
+           "XlaHostCompute",
            {"C:o:0", "D:o:0"},
            {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})},
             {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
             {"key", "host_compute_channel_F1_O1"},
-            {"shape_inference_graph", shape_string_expected_1},
-            {"shapes", gtl::ArraySlice<DataType>({})}},
+            {"shape_inference_graph",
+             "_outside_compilation_shape_inference_F1_O1"},
+            {"shapes", gtl::ArraySlice<DataType>({})},
+            {"_outside_compilation_subgraph", "O1"}},
            {"D"}},
       },
       {{"i_0_retval", "I:o:0"}});
@@ -1041,40 +1071,45 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
     Node* a = Input(b2.opts().WithName("A"));
     Node* b = Input(b2.opts().WithName("B"));
 
-    NodeBuilder node_builder("F1", "F1", lib_def.get());
-    node_builder.Input(a).Input(b);
-    Node* call = b2.opts().FinalizeBuilder(&node_builder);
-
     Node* key_constant =
         KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
-    Node* recv1 =
-        RecvAtHost(ops::NodeOut(key_constant, 0), "host_compute_channel_F1_O1",
-                   {DT_FLOAT, DT_FLOAT},
-                   b2.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* recv1 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                             {DT_FLOAT, DT_FLOAT}, b2.opts());
     Node* e = Binary(ops::NodeOut(recv1, 0), ops::NodeOut(recv1, 1),
-                     b2.opts().WithName("E").WithControlInputs({recv1, b}));
-    Node* send1 = SendFromHost(ops::NodeOut(key_constant, 0),
-                               "host_compute_channel_F1_O1", {e},
-                               b2.opts()
-                                   .WithName("outside_compilation_F1_O1_send")
-                                   .WithControlInput(e));
-
-    Node* recv2 =
-        RecvAtHost(ops::NodeOut(key_constant, 0), "host_compute_channel_F1_O2",
-                   {DT_FLOAT, DT_FLOAT},
-                   b2.opts().WithName("outside_compilation_F1_O2_recv"));
+                     b2.opts()
+                         .WithName("E")
+                         .WithControlInputs({recv1, b})
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O1"));
+    Node* send1 = SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
+                               b2.opts().WithControlInput(e));
+
+    Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2",
+                             {DT_FLOAT, DT_FLOAT}, b2.opts());
     Node* g = Binary(e, ops::NodeOut(recv2, 1),
-                     b2.opts().WithName("G").WithControlInputs({recv2, e}));
-    Node* h = Binary(ops::NodeOut(recv2, 0), e, b2.opts().WithName("H"));
-    Node* send2 = SendFromHost(
-        ops::NodeOut(key_constant, 0), "host_compute_channel_F1_O2", {h},
-        b2.opts().WithName("outside_compilation_F1_O2_send"));
+                     b2.opts()
+                         .WithName("G")
+                         .WithControlInputs({recv2, e})
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O2"));
+    Node* h = Binary(ops::NodeOut(recv2, 0), e,
+                     b2.opts()
+                         .WithName("H")
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O2"));
+    Node* send2 =
+        SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O2", {h}, b2.opts());
 
-    Node* s = NoOp(b2.opts()
-                       .WithName("F1_sequencer")
-                       .WithControlInputs({recv1, send1, recv2, send2}));
+    Node* s = Sequencer(b2.opts()
+                            .WithName("F1_sequencer")
+                            .WithControlInputs({recv1, send1, recv2, send2}),
+                        "F1");
+
+    NodeBuilder node_builder("F1", "F1", lib_def.get());
+    node_builder.Input(a).Input(b);
+    Node* call = b2.opts().WithControlInput(s).FinalizeBuilder(&node_builder);
 
-    Binary(g, call, b2.opts().WithName("J").WithControlInput(s));
+    Binary(g, call, b2.opts().WithName("J"));
     TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
   }
 
@@ -1123,22 +1158,20 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
 
-  string shape_string_expected;
   {
     GraphDefBuilder shape(GraphDefBuilder::kFailImmediately);
     Node* key_constant =
         KeyPlaceholderShape(shape.opts().WithName("KnownShape/_0"));
-    Node* recv =
-        RecvAtHost(ops::NodeOut(key_constant, 0), "host_compute_channel_F1_O1",
-                   {DT_FLOAT, DT_FLOAT},
-                   shape.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                            {DT_FLOAT, DT_FLOAT}, shape.opts());
     Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1),
-                     shape.opts().WithName("E"));
-    SendFromHost(ops::NodeOut(key_constant, 0), "host_compute_channel_F1_O1",
-                 {e}, shape.opts().WithName("outside_compilation_F1_O1_send"));
-    GraphDef shape_graph;
-    TF_EXPECT_OK(shape.ToGraphDef(&shape_graph));
-    EXPECT_TRUE(shape_graph.SerializeToString(&shape_string_expected));
+                     shape.opts()
+                         .WithName("E")
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O1"));
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, shape.opts());
+    TF_EXPECT_OK(
+        AddGraphDefToFunctionLibrary(shape, "F1_O1", &library_expected));
   }
 
   TensorShapeProto shape_proto_expected;
@@ -1156,13 +1189,15 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
            {},
            {"outside_compilation_O1_host_compute"}},
           {{"outside_compilation_O1_host_compute"},
-           "_XlaHostCompute",
+           "XlaHostCompute",
            {"C:o:0", "D:o:0"},
            {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})},
             {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
             {"key", "host_compute_channel_F1_O1"},
-            {"shape_inference_graph", shape_string_expected},
-            {"shapes", gtl::ArraySlice<DataType>({})}},
+            {"shape_inference_graph",
+             "_outside_compilation_shape_inference_F1_O1"},
+            {"shapes", gtl::ArraySlice<DataType>({})},
+            {"_outside_compilation_subgraph", "O1"}},
            {"D"}},
       },
       {{"d_0_retval", "D:o:0"}, {"f_0_retval", "F:o:0"}});
@@ -1176,14 +1211,15 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
            "BinaryTest",
            {"f_0_arg", "outside_compilation_O1_host_compute:outputs:0"}},
           {{"outside_compilation_O1_host_compute"},
-           "_XlaHostCompute",
+           "XlaHostCompute",
            {"G:o:0"},
            {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
             {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
             {"key", "host_compute_channel_F2_O1"},
             {"shape_inference_graph", ""},
             {"shapes",
-             gtl::ArraySlice<TensorShapeProto>({shape_proto_expected})}}},
+             gtl::ArraySlice<TensorShapeProto>({shape_proto_expected})},
+            {"_outside_compilation_subgraph", "O1"}}},
       },
       {{"g_0_retval", "G:o:0"}, {"i_0_retval", "I:o:0"}});
 
@@ -1196,43 +1232,46 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
 
     Node* key_constant1 =
         KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
-    Node* recv1 =
-        RecvAtHost(ops::NodeOut(key_constant1, 0), "host_compute_channel_F1_O1",
-                   {DT_FLOAT, DT_FLOAT},
-                   b2.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* recv1 = RecvAtHost(ops::NodeOut(key_constant1, 0), "F1", "O1",
+                             {DT_FLOAT, DT_FLOAT}, b2.opts());
     Node* e = Binary(ops::NodeOut(recv1, 0), ops::NodeOut(recv1, 1),
-                     b2.opts().WithName("E").WithControlInputs({recv1, b}));
-    Node* send1 = SendFromHost(ops::NodeOut(key_constant1, 0),
-                               "host_compute_channel_F1_O1", {e},
-                               b2.opts()
-                                   .WithName("outside_compilation_F1_O1_send")
-                                   .WithControlInput(e));
+                     b2.opts()
+                         .WithName("E")
+                         .WithControlInputs({recv1, b})
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O1"));
+    Node* send1 = SendFromHost(ops::NodeOut(key_constant1, 0), "F1", "O1", {e},
+                               b2.opts().WithControlInput(e));
+    Node* s1 = Sequencer(
+        b2.opts().WithName("F1_sequencer").WithControlInputs({recv1, send1}),
+        "F1");
+
     NodeBuilder node_builder1("F1", "F1", lib_def.get());
     node_builder1.Input(a).Input(b);
-    Node* call1 = b2.opts().FinalizeBuilder(&node_builder1);
-    Node* s1 = NoOp(
-        b2.opts().WithName("F1_sequencer").WithControlInputs({recv1, send1}));
+    Node* call1 =
+        b2.opts().WithControlInput(s1).FinalizeBuilder(&node_builder1);
 
     Node* key_constant2 =
         KeyPlaceholder("F2", b2.opts().WithName("F2_key_placeholder"));
-    Node* recv2 = RecvAtHost(
-        ops::NodeOut(key_constant2, 0), "host_compute_channel_F2_O1",
-        {DT_FLOAT}, b2.opts().WithName("outside_compilation_F2_O1_recv"));
+    Node* recv2 = RecvAtHost(ops::NodeOut(key_constant2, 0), "F2", "O1",
+                             {DT_FLOAT}, b2.opts());
     Node* h = Binary(ops::NodeOut(call1, 1), recv2,
-                     b2.opts().WithName("H").WithControlInput(s1));
-    Node* send2 = SendFromHost(
-        ops::NodeOut(key_constant2, 0), "host_compute_channel_F2_O1", {h},
-        b2.opts().WithName("outside_compilation_F2_O1_send"));
+                     b2.opts()
+                         .WithName("H")
+                         .WithAttr("_encapsulate", "F2")
+                         .WithAttr("_outside", "O1"));
+    Node* send2 = SendFromHost(ops::NodeOut(key_constant2, 0), "F2", "O1", {h},
+                               b2.opts());
 
+    Node* s2 = Sequencer(
+        b2.opts().WithName("F2_sequencer").WithControlInputs({recv2, send2}),
+        "F2");
     NodeBuilder node_builder2("F2", "F2", lib_def.get());
     node_builder2.Input(e).Input(call1);
     Node* call2 = b2.opts()
-                      .WithControlInputs({s1, e, call1})
+                      .WithControlInputs({s2, e, call1})
                       .FinalizeBuilder(&node_builder2);
-    Node* s2 = NoOp(
-        b2.opts().WithName("F2_sequencer").WithControlInputs({recv2, send2}));
-    Binary(call2, ops::NodeOut(call2, 1),
-           b2.opts().WithName("J").WithControlInput(s2));
+    Binary(call2, ops::NodeOut(call2, 1), b2.opts().WithName("J"));
     TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
   }
 
@@ -1280,14 +1319,15 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) {
            "BinaryTest",
            {"D:o:0", "outside_compilation_O1_host_compute:outputs:0"}},
           {{"outside_compilation_O1_host_compute"},
-           "_XlaHostCompute",
+           "XlaHostCompute",
            {},
            {{"Tinputs", gtl::ArraySlice<DataType>({})},
             {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph", ""},
             {"shapes",
-             gtl::ArraySlice<TensorShapeProto>({shape_proto_expected})}}},
+             gtl::ArraySlice<TensorShapeProto>({shape_proto_expected})},
+            {"_outside_compilation_subgraph", "O1"}}},
       },
       {{"f_0_retval", "F:o:0"}});
 
@@ -1298,18 +1338,22 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) {
     Node* a = InputShaped(b2.opts().WithName("A"));
     Node* b = Input(b2.opts().WithName("B"));
 
-    Node* e = Unary(a, b2.opts().WithName("E"));
+    Node* e = Unary(a, b2.opts()
+                           .WithName("E")
+                           .WithAttr("_encapsulate", "F1")
+                           .WithAttr("_outside", "O1"));
     Node* key_constant =
         KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
-    Node* send1 = SendFromHost(
-        ops::NodeOut(key_constant, 0), "host_compute_channel_F1_O1", {e},
-        b2.opts().WithName("outside_compilation_F1_O1_send"));
+    Node* send1 =
+        SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, b2.opts());
+    Node* s1 = Sequencer(
+        b2.opts().WithName("F1_sequencer").WithControlInput(send1), "F1");
     NodeBuilder node_builder1("F1", "F1", lib_def.get());
     node_builder1.Input(a).Input(b);
-    Node* call1 = b2.opts().FinalizeBuilder(&node_builder1);
-    Node* s1 = NoOp(b2.opts().WithName("F1_sequencer").WithControlInput(send1));
+    Node* call1 =
+        b2.opts().WithControlInput(s1).FinalizeBuilder(&node_builder1);
 
-    Unary(call1, b2.opts().WithName("G").WithControlInput(s1));
+    Unary(call1, b2.opts().WithName("G"));
     TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
   }
 
@@ -1358,14 +1402,15 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) {
            "BinaryTest",
            {"D:o:0", "outside_compilation_O1_host_compute:outputs:0"}},
           {{"outside_compilation_O1_host_compute"},
-           "_XlaHostCompute",
+           "XlaHostCompute",
            {},
            {{"Tinputs", gtl::ArraySlice<DataType>({})},
             {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph", ""},
             {"shapes",
-             gtl::ArraySlice<TensorShapeProto>({shape_proto_expected})}},
+             gtl::ArraySlice<TensorShapeProto>({shape_proto_expected})},
+            {"_outside_compilation_subgraph", "O1"}},
            {"D"}},
       },
       {{"f_0_retval", "F:o:0"}});
@@ -1380,19 +1425,23 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) {
     Node* key_constant =
         KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
     Node* recv1 =
-        RecvAtHost(ops::NodeOut(key_constant, 0), "host_compute_channel_F1_O1",
-                   {}, b2.opts().WithName("outside_compilation_F1_O1_recv"));
-    Node* e = Unary(a, b2.opts().WithName("E").WithControlInput(recv1));
-    Node* send1 = SendFromHost(
-        ops::NodeOut(key_constant, 0), "host_compute_channel_F1_O1", {e},
-        b2.opts().WithName("outside_compilation_F1_O1_send"));
+        RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1", {}, b2.opts());
+    Node* e = Unary(a, b2.opts()
+                           .WithName("E")
+                           .WithControlInput(recv1)
+                           .WithAttr("_encapsulate", "F1")
+                           .WithAttr("_outside", "O1"));
+    Node* send1 =
+        SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, b2.opts());
+    Node* s1 = Sequencer(
+        b2.opts().WithName("F1_sequencer").WithControlInputs({recv1, send1}),
+        "F1");
     NodeBuilder node_builder1("F1", "F1", lib_def.get());
     node_builder1.Input(a).Input(b);
-    Node* call1 = b2.opts().FinalizeBuilder(&node_builder1);
-    Node* s1 = NoOp(
-        b2.opts().WithName("F1_sequencer").WithControlInputs({recv1, send1}));
+    Node* call1 =
+        b2.opts().WithControlInput(s1).FinalizeBuilder(&node_builder1);
 
-    Unary(call1, b2.opts().WithName("G").WithControlInput(s1));
+    Unary(call1, b2.opts().WithName("G"));
     TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
   }
 
@@ -1434,13 +1483,14 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoOutputs) {
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
           {{"F"}, "UnaryTest", {"D:o:0"}},
           {{"outside_compilation_O1_host_compute"},
-           "_XlaHostCompute",
+           "XlaHostCompute",
            {"D:o:0"},
            {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
             {"Toutputs", gtl::ArraySlice<DataType>({})},
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph", ""},
-            {"shapes", gtl::ArraySlice<TensorShapeProto>({})}}},
+            {"shapes", gtl::ArraySlice<TensorShapeProto>({})},
+            {"_outside_compilation_subgraph", "O1"}}},
       },
       {{"f_0_retval", "F:o:0"}});
 
@@ -1453,16 +1503,20 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoOutputs) {
 
     Node* key_constant =
         KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
-    Node* recv1 = RecvAtHost(
-        ops::NodeOut(key_constant, 0), "host_compute_channel_F1_O1", {DT_FLOAT},
-        b2.opts().WithName("outside_compilation_F1_O1_recv"));
-    Node* e = Unary(recv1, b2.opts().WithName("E"));
+    Node* recv1 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                             {DT_FLOAT}, b2.opts());
+    Node* e = Unary(recv1, b2.opts()
+                               .WithName("E")
+                               .WithAttr("_encapsulate", "F1")
+                               .WithAttr("_outside", "O1"));
+    Node* s1 = Sequencer(
+        b2.opts().WithName("F1_sequencer").WithControlInput(recv1), "F1");
     NodeBuilder node_builder1("F1", "F1", lib_def.get());
     node_builder1.Input(a).Input(b);
-    Node* call1 = b2.opts().FinalizeBuilder(&node_builder1);
-    Node* s1 = NoOp(b2.opts().WithName("F1_sequencer").WithControlInput(recv1));
+    Node* call1 =
+        b2.opts().WithControlInput(s1).FinalizeBuilder(&node_builder1);
 
-    Binary(e, call1, b2.opts().WithName("G").WithControlInput(s1));
+    Binary(e, call1, b2.opts().WithName("G"));
     TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
   }
 
@@ -1509,13 +1563,14 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) {
            {},
            {"outside_compilation_O1_host_compute"}},
           {{"outside_compilation_O1_host_compute"},
-           "_XlaHostCompute",
+           "XlaHostCompute",
            {"D:o:0"},
            {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
             {"Toutputs", gtl::ArraySlice<DataType>({})},
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph", ""},
-            {"shapes", gtl::ArraySlice<TensorShapeProto>({})}}},
+            {"shapes", gtl::ArraySlice<TensorShapeProto>({})},
+            {"_outside_compilation_subgraph", "O1"}}},
       },
       {{"f_0_retval", "F:o:0"}});
 
@@ -1528,22 +1583,23 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) {
 
     Node* key_constant =
         KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
-    Node* recv1 = RecvAtHost(
-        ops::NodeOut(key_constant, 0), "host_compute_channel_F1_O1", {DT_FLOAT},
-        b2.opts().WithName("outside_compilation_F1_O1_recv"));
-    Node* e = Unary(recv1, b2.opts().WithName("E"));
-    Node* send1 = SendFromHost(ops::NodeOut(key_constant, 0),
-                               "host_compute_channel_F1_O1", {},
-                               b2.opts()
-                                   .WithName("outside_compilation_F1_O1_send")
-                                   .WithControlInput(e));
+    Node* recv1 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                             {DT_FLOAT}, b2.opts());
+    Node* e = Unary(recv1, b2.opts()
+                               .WithName("E")
+                               .WithAttr("_encapsulate", "F1")
+                               .WithAttr("_outside", "O1"));
+    Node* send1 = SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {},
+                               b2.opts().WithControlInput(e));
+    Node* s1 = Sequencer(
+        b2.opts().WithName("F1_sequencer").WithControlInputs({recv1, send1}),
+        "F1");
     NodeBuilder node_builder1("F1", "F1", lib_def.get());
     node_builder1.Input(a).Input(b);
-    Node* call1 = b2.opts().FinalizeBuilder(&node_builder1);
-    Node* s1 = NoOp(
-        b2.opts().WithName("F1_sequencer").WithControlInputs({recv1, send1}));
+    Node* call1 =
+        b2.opts().WithControlInput(s1).FinalizeBuilder(&node_builder1);
 
-    Binary(e, call1, b2.opts().WithName("G").WithControlInput(s1));
+    Binary(e, call1, b2.opts().WithName("G"));
     TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
   }
 
@@ -1594,7 +1650,10 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputsOrOutputs) {
     Node* a = Input(b2.opts().WithName("A"));
     Node* b = Input(b2.opts().WithName("B"));
 
-    Node* e = Unary(a, b2.opts().WithName("E"));
+    Node* e = Unary(a, b2.opts()
+                           .WithName("E")
+                           .WithAttr("_encapsulate", "F1")
+                           .WithAttr("_outside", "O1"));
     NodeBuilder node_builder1("F1", "F1", lib_def.get());
     node_builder1.Input(a).Input(b);
     Node* call1 = b2.opts().FinalizeBuilder(&node_builder1);
@@ -1640,21 +1699,21 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationShapeInference) {
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
 
-  string shape_string_expected;
   {
     GraphDefBuilder shape(GraphDefBuilder::kFailImmediately);
     Node* key_constant =
         KeyPlaceholderShape(shape.opts().WithName("KnownShape/_0"));
     Node* known = KnownShape({2}, shape.opts().WithName("KnownShape/_1"));
-    Node* recv = RecvAtHost(
-        ops::NodeOut(key_constant, 0), "host_compute_channel_F1_O1", {DT_FLOAT},
-        shape.opts().WithName("outside_compilation_F1_O1_recv"));
-    Node* e = BinaryUnknownShape(known, recv, shape.opts().WithName("E"));
-    SendFromHost(ops::NodeOut(key_constant, 0), "host_compute_channel_F1_O1",
-                 {e}, shape.opts().WithName("outside_compilation_F1_O1_send"));
-    GraphDef shape_graph;
-    TF_EXPECT_OK(shape.ToGraphDef(&shape_graph));
-    EXPECT_TRUE(shape_graph.SerializeToString(&shape_string_expected));
+    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                            {DT_FLOAT}, shape.opts());
+    Node* e = BinaryUnknownShape(known, recv,
+                                 shape.opts()
+                                     .WithName("E")
+                                     .WithAttr("_encapsulate", "F1")
+                                     .WithAttr("_outside", "O1"));
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, shape.opts());
+    TF_EXPECT_OK(
+        AddGraphDefToFunctionLibrary(shape, "F1_O1", &library_expected));
   }
 
   *library_expected.add_function() = test::function::XTimesTwo();
@@ -1668,13 +1727,15 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationShapeInference) {
            {},
            {"outside_compilation_O1_host_compute"}},
           {{"outside_compilation_O1_host_compute"},
-           "_XlaHostCompute",
+           "XlaHostCompute",
            {"c:o:0"},
            {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
             {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
             {"key", "host_compute_channel_F1_O1"},
-            {"shape_inference_graph", shape_string_expected},
-            {"shapes", gtl::ArraySlice<DataType>({})}},
+            {"shape_inference_graph",
+             "_outside_compilation_shape_inference_F1_O1"},
+            {"shapes", gtl::ArraySlice<DataType>({})},
+            {"_outside_compilation_subgraph", "O1"}},
            {"c"}},
       },
       {{"f_0_retval", "F:o:0"}});
@@ -1687,29 +1748,29 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationShapeInference) {
     Node* b = Input(b2.opts().WithName("B"));
     Node* c = Unary(a, b2.opts().WithName("C"));
 
+    Node* key_constant =
+        KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
+    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                            {DT_FLOAT}, b2.opts());
+    Node* e = BinaryUnknownShape(c, ops::NodeOut(recv, 0),
+                                 b2.opts()
+                                     .WithName("E")
+                                     .WithControlInputs({recv, b})
+                                     .WithAttr("_encapsulate", "F1")
+                                     .WithAttr("_outside", "O1"));
+    Node* send = SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
+                              b2.opts().WithControlInput(e));
+
+    Node* s = Sequencer(
+        b2.opts().WithName("F1_sequencer").WithControlInputs({recv, send}),
+        "F1");
+
     NodeBuilder node_builder("F1", "F1", lib_def.get());
     node_builder.Input(b).Input(c);
     Node* call =
-        b2.opts().WithControlInputs({c}).FinalizeBuilder(&node_builder);
+        b2.opts().WithControlInputs({s, c}).FinalizeBuilder(&node_builder);
 
-    Node* key_constant =
-        KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
-    Node* recv = RecvAtHost(
-        ops::NodeOut(key_constant, 0), "host_compute_channel_F1_O1", {DT_FLOAT},
-        b2.opts().WithName("outside_compilation_F1_O1_recv"));
-    Node* e = BinaryUnknownShape(
-        c, ops::NodeOut(recv, 0),
-        b2.opts().WithName("E").WithControlInputs({recv, b}));
-    Node* send = SendFromHost(ops::NodeOut(key_constant, 0),
-                              "host_compute_channel_F1_O1", {e},
-                              b2.opts()
-                                  .WithName("outside_compilation_F1_O1_send")
-                                  .WithControlInput(e));
-
-    Node* s = NoOp(
-        b2.opts().WithName("F1_sequencer").WithControlInputs({recv, send}));
-
-    Binary(a, call, b2.opts().WithName("G").WithControlInputs({s, e}));
+    Binary(a, call, b2.opts().WithName("G").WithControlInputs({e}));
     TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
   }
 
diff --git a/tensorflow/compiler/jit/graph_to_functiondef.cc b/tensorflow/compiler/jit/graph_to_functiondef.cc
index 6fa21fa6204dcc9446081d07e2a59ccace216713..8f5e11dfa47956f1fdaa4d1ff115affa375c5c73 100644
--- a/tensorflow/compiler/jit/graph_to_functiondef.cc
+++ b/tensorflow/compiler/jit/graph_to_functiondef.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 
 namespace tensorflow {
@@ -229,7 +230,7 @@ Status GraphToFunctionDef(const Graph& graph, const string& name,
   for (int n_index = 0; n_index < fdef->node_def_size(); ++n_index) {
     NodeDef* node_def = fdef->mutable_node_def(n_index);
     for (int i = 0; i < node_def->input_size(); ++i) {
-      if (StringPiece(node_def->input(i)).starts_with("^")) {
+      if (str_util::StartsWith(node_def->input(i), "^")) {
         // Control input
         const string normalized =
             node_names.Renormalize(node_def->input(i).substr(1));
diff --git a/tensorflow/compiler/jit/graphcycles/BUILD b/tensorflow/compiler/jit/graphcycles/BUILD
index 15507b3851751c681044a744c07c247410fb3e2d..676f71a75aede2a7720ae0c8a579d64cc184509a 100644
--- a/tensorflow/compiler/jit/graphcycles/BUILD
+++ b/tensorflow/compiler/jit/graphcycles/BUILD
@@ -27,17 +27,3 @@ tf_cc_test(
         "//tensorflow/core:test_main",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/jit/kernels/BUILD b/tensorflow/compiler/jit/kernels/BUILD
index 616a7f8f1541d3debff97a90bd390c76c665d196..00a6f4075f9a18efc3895b033eb6d08e36088a53 100644
--- a/tensorflow/compiler/jit/kernels/BUILD
+++ b/tensorflow/compiler/jit/kernels/BUILD
@@ -41,17 +41,3 @@ cc_library(
     ],
     alwayslink = 1,
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/jit/kernels/xla_launch_op.cc b/tensorflow/compiler/jit/kernels/xla_launch_op.cc
index cd7f8dd779120637c96d6af041b0afcc734e5eff..2d6511a45b9b37df8405d34dd2aec5ba31254c16 100644
--- a/tensorflow/compiler/jit/kernels/xla_launch_op.cc
+++ b/tensorflow/compiler/jit/kernels/xla_launch_op.cc
@@ -114,10 +114,12 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
   // this is more obviously correct.)
   core::ScopedUnref cache_ref(cache);
 
+  const XlaDevice::Metadata* metadata;
+  Status s = XlaDevice::GetMetadata(ctx, &metadata);
+  bool allocate_xla_tensors = s.ok();
+
   // Get the platform_id_ for XLA_* devices.
   if (platform_id_ == nullptr) {
-    const XlaDevice::Metadata* metadata;
-    Status s = XlaDevice::GetMetadata(ctx, &metadata);
     if (s.ok()) {
       platform_id_ = metadata->platform()->id();
     }
@@ -128,8 +130,23 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
 
   xla::LocalClient* client = static_cast<xla::LocalClient*>(cache->client());
 
-  // Builds an XLA allocator for the device.
-  XlaAllocator xla_allocator(client->platform(), ctx);
+  XlaAllocator local_xla_allocator(client->backend().platform(),
+                                   ctx->device()->GetAllocator({}));
+  xla::DeviceMemoryAllocator* xla_allocator;
+  // If we are on an XlaDevice, use the underlying XLA platform's allocator
+  // directly. We could use the StreamExecutor's allocator which may
+  // theoretically be more correct, but XLA returns a nice OOM message in a
+  // Status and StreamExecutor does not.
+  //
+  // Importantly we can't use ctx->device()->GetAllocator() as the allocator
+  // (which local_xla_allocator above uses) as on an XlaDevice, this is a
+  // dummy allocator that returns XlaTensor objects. The XlaCompiler needs a
+  // real allocator to allocate real buffers.
+  if (allocate_xla_tensors) {
+    xla_allocator = client->backend().memory_allocator();
+  } else {
+    xla_allocator = &local_xla_allocator;
+  }
 
   XlaCompiler::Options options;
   options.client = client;
@@ -137,26 +154,30 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
   options.flib_def = ctx->function_library()->GetFunctionLibraryDefinition();
   options.graph_def_version = ctx->function_library()->graph_def_version();
   options.allow_cpu_custom_calls = (platform_id_ == gpu::host::kHostPlatformId);
-  options.device_allocator = &xla_allocator;
+  options.device_allocator = xla_allocator;
 
   const XlaCompiler::CompilationResult* kernel;
   xla::LocalExecutable* executable;
 
-  OP_REQUIRES_OK(ctx, cache->Compile(options, function_, num_constant_args_,
+  std::map<int, Tensor> constant_args;
+  for (int i = 0; i < num_constant_args_; ++i) {
+    constant_args.insert({i, ctx->input(i)});
+  }
+  OP_REQUIRES_OK(ctx, cache->Compile(options, function_, constant_args,
                                      variables, ctx, &kernel, &executable,
                                      /*compile_options=*/nullptr));
 
   VLOG(1) << "Executing XLA Computation...";
 
-  XlaComputationLaunchContext launch_context(num_resource_args_, client,
-                                             &xla_allocator);
+  XlaComputationLaunchContext launch_context(
+      num_resource_args_, client, xla_allocator, allocate_xla_tensors);
   launch_context.PopulateInputs(ctx, kernel, variables);
 
   // Execute the computation.
   VLOG(2) << "Executing computation.";
   xla::ExecutableRunOptions run_options;
   run_options.set_stream(stream);
-  run_options.set_allocator(&xla_allocator);
+  run_options.set_allocator(xla_allocator);
   run_options.set_intra_op_thread_pool(&ctx->eigen_cpu_device());
   Env* env = Env::Default();
   auto start_time = env->NowMicros();
@@ -166,8 +187,7 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
   auto elapsed = env->NowMicros() - start_time;
   VLOG(2) << "Elapsed time: " << elapsed << "us";
 
-  launch_context.PopulateOutputs(ctx, kernel,
-                                 run_result.ConsumeValueOrDie()->release());
+  launch_context.PopulateOutputs(ctx, kernel, run_result.ConsumeValueOrDie());
   VLOG(1) << "Done";
 }
 
diff --git a/tensorflow/compiler/jit/legacy_flags/BUILD b/tensorflow/compiler/jit/legacy_flags/BUILD
index 4491dd6ac8f2b84f341162eb469cc8194f817c9a..5d211f4d733d8d807426e62dd116092799184f35 100644
--- a/tensorflow/compiler/jit/legacy_flags/BUILD
+++ b/tensorflow/compiler/jit/legacy_flags/BUILD
@@ -52,16 +52,14 @@ cc_library(
         ],
 )
 
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
+cc_library(
+    name = "xla_device_flags",
+    srcs = ["xla_device_flags.cc"],
+    hdrs = ["xla_device_flags.h"],
+    deps =
+        [
+            "//tensorflow/compiler/xla/legacy_flags:parse_flags_from_env",
+            "//tensorflow/core:framework_internal",
+            "//tensorflow/core:lib",
         ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
 )
diff --git a/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.cc b/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.cc
index 51384ac2fe6fa70c8a723097093a0a29e7ad2c6b..7277a1d1f8ad5fa045645ead839ab9efa01e89c7 100644
--- a/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.cc
+++ b/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.cc
@@ -41,6 +41,7 @@ static void AllocateFlags() {
   flags->tf_xla_clustering_debug = false;
   flags->tf_xla_cpu_global_jit = false;
   flags->tf_xla_clustering_fuel = std::numeric_limits<int64>::max();
+  flags->tf_xla_fusion_only = false;
   flag_list = new std::vector<Flag>(
       {Flag("tf_xla_auto_jit", &flags->tf_xla_auto_jit,
             "Control compilation of operators into XLA computations on CPU and "
@@ -59,7 +60,10 @@ static void AllocateFlags() {
             "Enables global JIT compilation for CPU via SessionOptions."),
        Flag("tf_xla_clustering_fuel", &flags->tf_xla_clustering_fuel,
             "Places an artificial limit on the number of ops marked as "
-            "eligible for clustering.")});
+            "eligible for clustering."),
+       Flag("tf_xla_fusion_only", &flags->tf_xla_fusion_only,
+            "enable fusion of element-wise operations only using XLA when "
+            "global_jit_level is ON*.")});
   xla::legacy_flags::ParseFlagsFromEnv(*flag_list);
 }
 
diff --git a/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h b/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h
index 170b89c987f30f985f981d7835b4af455922594e..2affda6ab4e0fbad32a246744fa5b38aeb629c1b 100644
--- a/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h
+++ b/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h
@@ -51,6 +51,10 @@ typedef struct {
   int64 tf_xla_clustering_fuel;   // "Compiler fuel" for clustering.  Only this
                                   // many ops will be marked as eligible for
                                   // clustering.
+  bool tf_xla_fusion_only;  // This flag is effective only when global_jit_level
+                            // is set to ON* and overrides its behavior. If
+                            // true, enable fusion of element-wise operations
+                            // only using XLA.
 } MarkForCompilationPassFlags;
 
 // Return a pointer to the MarkForCompilationPassFlags struct;
diff --git a/tensorflow/compiler/jit/legacy_flags/xla_device_flags.cc b/tensorflow/compiler/jit/legacy_flags/xla_device_flags.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1bb2fce2dbad5bffce2e33b665b7222090d0855a
--- /dev/null
+++ b/tensorflow/compiler/jit/legacy_flags/xla_device_flags.cc
@@ -0,0 +1,56 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Legacy flags for the XLA bridge's xla_device module.
+
+#include <mutex>
+#include <vector>
+
+#include "tensorflow/compiler/jit/legacy_flags/xla_device_flags.h"
+#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+namespace tensorflow {
+namespace legacy_flags {
+
+// Pointers to the parsed value of the flags and flag descriptors, initialized
+// via flags_init.
+static XlaDeviceFlags* flags;
+static std::vector<Flag>* flag_list;
+static std::once_flag flags_init;
+
+// Allocate *flags.  Called via call_once(&flags_init,...).
+static void AllocateFlags() {
+  flags = new XlaDeviceFlags;
+  flags->tf_xla_compile_on_demand = false;
+  flag_list = new std::vector<Flag>({
+      Flag("tf_xla_compile_on_demand", &flags->tf_xla_compile_on_demand,
+           "Switch a device into 'on-demand' mode, where instead of "
+           "autoclustering ops are compiled one by one just-in-time."),
+  });
+  xla::legacy_flags::ParseFlagsFromEnv(*flag_list);
+}
+
+// Return a pointer to the XlaDeviceFlags struct;
+// repeated calls return the same pointer.
+// This should be called only after Flags::Parse() has returned.
+XlaDeviceFlags* GetXlaDeviceFlags() {
+  std::call_once(flags_init, &AllocateFlags);
+  return flags;
+}
+
+}  // namespace legacy_flags
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/legacy_flags/xla_device_flags.h b/tensorflow/compiler/jit/legacy_flags/xla_device_flags.h
new file mode 100644
index 0000000000000000000000000000000000000000..27b22121ac1e089bd5d5a494e1e3fb60b05bc76d
--- /dev/null
+++ b/tensorflow/compiler/jit/legacy_flags/xla_device_flags.h
@@ -0,0 +1,47 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_XLA_DEVICE_FLAGS_H_
+#define TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_XLA_DEVICE_FLAGS_H_
+
+// Legacy flags for the XLA bridge's xla_device module.
+
+#include <vector>
+
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+namespace tensorflow {
+namespace legacy_flags {
+
+// The values of flags associated with the XLA bridge's
+// xla_device module.
+typedef struct {
+  // Switch the CPU device into "on-demand" mode, where instead of
+  // autoclustering ops are compiled one by one just-in-time.
+  // Enabling this mode by a legacy flag is a temporary mechanism. When this
+  // feature is battle-tested, we will switch this to be a session option.
+  bool tf_xla_compile_on_demand;
+} XlaDeviceFlags;
+
+// Return a pointer to the XlaDeviceFlags struct;
+// repeated calls return the same pointer.
+// This should be called only after Flags::Parse() has returned.
+XlaDeviceFlags* GetXlaDeviceFlags();
+
+}  // namespace legacy_flags
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_XLA_DEVICE_FLAGS_H_
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index 57fb8d242208318a608b1f356bef7a8d39dbdc83..f651768a67278628e40445291d7fb271bb1ae611 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -180,6 +180,158 @@ struct NodeCompare {
 };
 using OrderedNodeSet = std::set<Node*, NodeCompare>;
 
+// Returns true if the op can be decomposed into XLA ops for which
+// there are fusable elemental implementations.
+//
+// TODO(hpucha): Consider a black list instead of a white list as
+// implemented below.
+bool IsXlaFusable(const NodeDef& node) {
+  static const std::unordered_set<std::string>* elementwise_ops =
+      new std::unordered_set<std::string>(
+          {// tf2xla/kernels/aggregate_ops.cc
+           "AddN",
+           // tf2xla/kernels/batchtospace_op.cc
+           "BatchToSpace", "BatchToSpaceND",
+           // tf2xla/kernels/bcast_ops.cc
+           "BroadcastArgs", "BroadcastGradientArgs",
+           // tf2xla/kernels/bias_ops.cc
+           "BiasAdd", "BiasAddV1", "BiasAddGrad" /*(Reduce)*/,
+           // tf2xla/kernels/binary_ops.cc
+           "Add", "Sub", "Mul", "Div", "Atan2", "Complex", "FloorDiv",
+           "FloorMod", "BitwiseAnd", "BitwiseOr", "LeftShift", "RightShift",
+           "LogicalAnd", "LogicalOr", "Mod", "Maximum", "Minimum", "RealDiv",
+           "ReciprocalGrad", "RsqrtGrad", "SqrtGrad", "SquaredDifference",
+           "TruncateDiv", "TruncateMod", "Equal", "NotEqual", "Greater",
+           "GreaterEqual", "Less", "LessEqual", "SigmoidGrad", "SoftplusGrad",
+           "SoftsignGrad", "TanhGrad", "Pow", "ApproximateEqual",
+           // tf2xla/kernels/cast_op.cc
+           "Cast",
+           // tf2xla/kernels/categorical_op.cc
+           "Multinomial" /* (Rng ops are disabled on GPU backend currently)*/,
+           // tf2xla/kernels/concat_op.cc
+           "Concat", "ConcatV2", "ConcatOffset",
+           // tf2xla/kernels/const_op.cc
+           "Const",
+           // tf2xla/kernels/cross_op.cc
+           "Cross",
+           // tf2xla/kernels/depthtospace_op.cc
+           "DepthToSpace",
+           // tf2xla/kernels/diag_op.cc
+           "Diag", "DiagPart", "MatrixDiag", "MatrixDiagPart",
+           // tf2xla/kernels/dynamic_stitch_op.cc
+           "DynamicStitch", "ParallelDynamicStitch",
+           // tf2xla/kernels/elu_op.cc
+           "Elu", "EluGrad", "Selu", "SeluGrad",
+           // tf2xla/kernels/fake_quantize_ops.cc
+           "FakeQuantWithMinMaxArgs", "FakeQuantWithMinMaxArgsGradient",
+           "FakeQuantWithMinMaxVars",
+           "FakeQuantWithMinMaxVarsGradient" /*(Reduce)*/,
+           // tf2xla/kernels/fill_op.cc
+           "Fill",
+           // tf2xla/kernels/gather_op.cc
+           "Gather", "GatherV2", "GatherNd",
+           // tf2xla/kernels/identity_op.cc
+           "Identity", "IdentityN", "PreventGradient", "StopGradient",
+           "Snapshot",
+           // tf2xla/kernels/image_ops.cc
+           "RGBToHSV", "HSVToRGB", "AdjustContrastv2" /*(Reduce)*/,
+           "AdjustSaturation", "AdjustHue",
+           // tf2xla/kernels/index_ops.cc
+           "ArgMax", "ArgMin",
+           // tf2xla/kernels/l2loss_op.cc
+           "L2Loss" /*(Reduce)*/,
+           // tf2xla/kernels/lrn_ops.cc (ReduceWindow)
+           "LRN", "LRNGrad",
+           // tf2xla/kernels/matrix_band_part_op.cc
+           "MatrixBandPart",
+           // tf2xla/kernels/matrix_set_diag_op.cc
+           "MatrixSetDiag",
+           // tf2xla/kernels/mirror_pad_op.cc
+           "MirrorPad",
+           // tf2xla/kernels/no_op.cc
+           "NoOp", "ControlTrigger",
+           // tf2xla/kernels/one_hot_op.cc
+           "OneHot",
+           // tf2xla/kernels/pack_op.cc
+           "Pack",
+           // tf2xla/kernels/pad_op.cc
+           "Pad", "PadV2",
+           // tf2xla/kernels/pooling_ops.cc
+           "MaxPool", "MaxPoolV2", "MaxPool3D", "AvgPool",
+           "AvgPool3D", /*(all the pooling ops use ReduceWindow)*/
+           "MaxPoolGrad", "MaxPoolGradV2", "MaxPool3DGrad", "AvgPoolGrad",
+           "AvgPool3DGrad",
+           // tf2xla/kernels/quantize_and_dequantize_op.cc (Reduce)
+           "QuantizeAndDequantizeV2",
+           // tf2xla/kernels/random_ops.cc (Rng ops are disabled on GPU backend
+           // currently)
+           "RandomUniform", "RandomUniformInt", "RandomStandardNormal",
+           "TruncatedNormal",
+           // tf2xla/kernels/reduction_ops.cc (Reduce)
+           "Sum", "Prod", "Min", "Max", "Mean", "All", "Any",
+           // tf2xla/kernels/relu_op.cc
+           "Relu", "Relu6", "ReluGrad", "Relu6Grad",
+           // tf2xla/kernels/reshape_op.cc
+           "Reshape",
+           // tf2xla/kernels/reverse_op.cc
+           "Reverse", "ReverseV2",
+           // tf2xla/kernels/reverse_sequence_op.cc
+           "ReverseSequence",
+           // tf2xla/kernels/scan_ops.cc (ReduceWindow)
+           "Cumsum", "Cumprod",
+           // tf2xla/kernels/scatter_nd_op.cc (Reduce)
+           "ScatterNd",
+           // tf2xla/kernels/segment_reduction_ops.cc (Reduce)
+           "UnsortedSegmentSum",
+           // tf2xla/kernels/select_op.cc
+           "Select",
+           // tf2xla/kernels/sequence_ops.cc
+           "Range", "LinSpace",
+           // tf2xla/kernels/shape_op.cc
+           "Shape", "ShapeN", "Rank", "Size", "ExpandDims", "Squeeze",
+           "ZerosLike", "OnesLike",
+           // tf2xla/kernels/slice_op.cc
+           "Slice",
+           // tf2xla/kernels/softmax_op.cc (Reduce)
+           "Softmax", "LogSoftmax", "SoftmaxCrossEntropyWithLogits",
+           "SparseSoftmaxCrossEntropyWithLogits",
+           // tf2xla/kernels/spacetobatch_op.cc
+           "SpaceToBatchND", "SpaceToBatch",
+           // tf2xla/kernels/spacetodepth_op.cc
+           "SpaceToDepth",
+           // tf2xla/kernels/split_op.cc
+           "Split", "SplitV",
+           // tf2xla/kernels/stack_ops.cc
+           "StackV2", "StackPushV2", "StackPopV2", "StackCloseV2",
+           // tf2xla/kernels/stateless_random_ops.cc (Rng ops are disabled on
+           // GPU
+           // backend currently)
+           "StatelessRandomUniform",
+           "StatelessRandomNormal"
+           // tf2xla/kernels/strided_slice_op.cc
+           "StridedSlice",
+           "StridedSliceGrad", "ResourceStridedSliceAssign",
+           // tf2xla/kernels/tile_ops.cc
+           "Tile",
+           // tf2xla/kernels/training_ops.cc
+           "ResourceApplyGradientDescent", "ResourceApplyMomentum",
+           "ResourceApplyAdagrad", "ResourceApplyAdam", "ResourceApplyRMSProp",
+           "ResourceApplyFtrl", "ResourceApplyFtrlV2",
+           // tf2xla/kernels/transpose_op.cc
+           "Transpose", "InvertPermutation",
+           // tf2xla/kernels/unary_ops.cc
+           "ComplexAbs", "Angle", "Conj", "Abs", "Acos", "Acosh", "Asin",
+           "Asinh", "Atan", "Atanh", "Ceil", "Cos", "Cosh", "Sin", "Exp",
+           "Expm1", "Floor", "IsFinite", "IsInf", "IsNan", "Inv", "Reciprocal",
+           "Log", "Log1p", "Invert", "LogicalNot", "Neg", "Rint", "Round",
+           "Rsqrt", "Sigmoid", "Sign", "Sinh", "Softplus", "Softsign", "Sqrt",
+           "Square", "Tan", "Tanh", "Real", "Imag",
+           // tf2xla/kernels/unpack_op.cc
+           "Unpack"});
+
+  return elementwise_ops->count(node.op()) > 0;
+}
+
 Status FindCompilationCandidates(
     const Graph& graph, FunctionLibraryDefinition* flib_def, Env* env,
     const std::function<bool(const Node*, const DeviceType&)>& is_compilable_fn,
@@ -338,10 +490,13 @@ Status MarkForCompilationPass::Run(
         static_cast<OptimizerOptions::GlobalJitLevel>(flags->tf_xla_auto_jit);
   }
   bool cpu_global_jit = flags->tf_xla_cpu_global_jit;
+  bool fusion_only = flags->tf_xla_fusion_only;
+
   VLOG(1) << "flags->tf_xla_cpu_global_jit = " << flags->tf_xla_cpu_global_jit;
+  VLOG(1) << "flags->tf_xla_fusion_only = " << flags->tf_xla_fusion_only;
   const FunctionLibraryDefinition* fld = options.flib_def;
 
-  auto is_compilable = [global_jit_level, cpu_global_jit, fld](
+  auto is_compilable = [global_jit_level, cpu_global_jit, fusion_only, fld](
                            const Node* node, const DeviceType& device_type) {
     const XlaOpRegistry::DeviceRegistration* registration;
     if (!XlaOpRegistry::GetCompilationDevice(device_type.type(),
@@ -364,6 +519,11 @@ Status MarkForCompilationPass::Run(
     status = fld->GetAttr(*node, kXlaCompileAttr, &compile);
     if (status.ok()) return compile;
 
+    // Check for fusable ops only if requested.
+    if (global_jit_level > 0 && fusion_only && !IsXlaFusable(node->def())) {
+      return false;
+    }
+
     // Otherwise use the value of global_jit_level.
     // Ignore enable_jit_by_default if global jit compilation for CPU
     // is explicitly requested via tf_xla_cpu_global_jit flag
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
index 1a8858cccef623185709ab5dc2187a313dd130f7..2e362e0a63f16e4837e63f194920c3f585dd8a46 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/graph/graph_def_builder_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -137,7 +138,7 @@ TEST(XlaCompilationTest, CompilableCycles) {
   EXPECT_EQ(clusters["A"], clusters["C"]);
 }
 
-TEST(XlaCompilationTest, UnsupportedTypes) {
+TEST(XlaCompilationTest, Complex128Unsupported) {
   std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
   GraphDef graphdef;
   {
@@ -157,6 +158,27 @@ TEST(XlaCompilationTest, UnsupportedTypes) {
   EXPECT_TRUE(clusters.empty());
 }
 
+TEST(XlaCompilationTest, HalfSupported) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  GraphDef graphdef;
+  {
+    GraphDefBuilder builder(GraphDefBuilder::kFailImmediately);
+    Tensor t(DT_HALF, TensorShape());
+    t.scalar<Eigen::half>()() = static_cast<Eigen::half>(0.0f);
+    Node* a = ops::SourceOp("Const", builder.opts()
+                                         .WithName("A")
+                                         .WithAttr("dtype", DT_HALF)
+                                         .WithAttr("value", t));
+    Node* b = ops::UnaryOp("Neg", a, builder.opts().WithName("B"));
+    ops::BinaryOp("MatMul", a, b, builder.opts().WithName("C"));
+    TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
+  }
+
+  TF_ASSERT_OK(MarkForCompilation(&graph));
+  auto clusters = GetClusters(*graph);
+  EXPECT_FALSE(clusters.empty());
+}
+
 TEST(XlaCompilationTest, ConcatWithConstArg) {
   std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
   GraphDef graphdef;
@@ -519,11 +541,11 @@ TEST(XlaCompilationTest, IllegalCycle_UsefulErrorMessage) {
 
   Status status = MarkForCompilation(&graph);
   EXPECT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.ToString())
-                  .contains("Edge from c to a would create a cycle.\n"
-                            "+-> a\n"
-                            "|   b\n"
-                            "+-- c\n"));
+  EXPECT_TRUE(str_util::StrContains(status.ToString(),
+                                    "Edge from c to a would create a cycle.\n"
+                                    "+-> a\n"
+                                    "|   b\n"
+                                    "+-- c\n"));
 }
 
 TEST(XlaCompilationTest, Retval) {
diff --git a/tensorflow/compiler/jit/ops/BUILD b/tensorflow/compiler/jit/ops/BUILD
index e5787ca4c8cff436e4404b8488970248b24a5eda..c9e46bc1475aed0e35a48765ad70eef4362e8281 100644
--- a/tensorflow/compiler/jit/ops/BUILD
+++ b/tensorflow/compiler/jit/ops/BUILD
@@ -17,17 +17,3 @@ cc_library(
     deps = ["//tensorflow/core:framework"],
     alwayslink = 1,
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc
index 8cc79a9bd0b7aa2098ce177a9d7749f4e6c6ac27..6430975335f5eef5b53c80213e6090ffd6166a91 100644
--- a/tensorflow/compiler/jit/xla_compilation_cache.cc
+++ b/tensorflow/compiler/jit/xla_compilation_cache.cc
@@ -92,39 +92,30 @@ uint64 XlaCompilationCache::Signature::Hash::operator()(
 }
 
 Status XlaCompilationCache::BuildSignature(
-    const NameAttrList& function, int num_constant_args,
+    const NameAttrList& function, const std::map<int, Tensor>& constant_args,
     const std::map<int, OptionalTensor>& variable_args, OpKernelContext* ctx,
     Signature* signature) {
   signature->name = Canonicalize(function.name(), AttrSlice(&function.attr()));
-  signature->arg_values.resize(num_constant_args);
-
-  signature->arg_types.reserve(ctx->num_inputs() - num_constant_args);
-
-  // Inputs are in the order: constants, non-constants, resource variables.
-  int input_num = 0;
-  // Use the values of compile time constants in the signature->
-  while (input_num < num_constant_args) {
-    signature->arg_values[input_num] = ctx->input(input_num);
-    ++input_num;
-  }
-  // Add the types and shapes of the remaining arguments.
-  while (input_num < ctx->num_inputs() - variable_args.size()) {
-    signature->arg_types.emplace_back(ctx->input_dtype(input_num),
-                                      ctx->input(input_num).shape());
-    ++input_num;
-  }
-  // For variable signatures, use the type and shape of the variable's
-  // current value.
-  for (auto& iterator : variable_args) {
-    const OptionalTensor& variable = iterator.second;
-    TF_RET_CHECK(input_num < ctx->num_inputs());
-    if (variable.present) {
-      signature->arg_types.emplace_back(variable.value.dtype(),
-                                        variable.value.shape());
+  signature->arg_values.reserve(constant_args.size());
+
+  signature->arg_types.reserve(ctx->num_inputs() - constant_args.size());
+
+  for (int i = 0; i < ctx->num_inputs(); ++i) {
+    if (constant_args.count(i) > 0) {
+      // Use the values of compile time constants in the signature.
+      signature->arg_values.push_back(constant_args.at(i));
+    } else if (variable_args.count(i) > 0) {
+      const OptionalTensor& variable = variable_args.at(i);
+      if (variable.present) {
+        signature->arg_types.emplace_back(variable.value.dtype(),
+                                          variable.value.shape());
+      } else {
+        signature->arg_types.emplace_back(DT_INVALID, TensorShape());
+      }
     } else {
-      signature->arg_types.emplace_back(DT_INVALID, TensorShape());
+      signature->arg_types.emplace_back(ctx->input_dtype(i),
+                                        ctx->input(i).shape());
     }
-    ++input_num;
   }
   return Status::OK();
 }
@@ -132,74 +123,58 @@ Status XlaCompilationCache::BuildSignature(
 namespace {
 
 // Builds a XlaCompiler::Argument vector from the arguments to the _XlaLaunch
-// op. The first `num_constant_args` arguments must be host-memory Tensors.
-Status BuildArguments(int num_constant_args,
+// op.
+Status BuildArguments(const std::map<int, Tensor>& constant_args,
                       const std::map<int, OptionalTensor>& variable_args,
                       OpKernelContext* ctx,
                       std::vector<XlaCompiler::Argument>* args) {
   args->resize(ctx->num_inputs());
 
-  int input_num = 0;
-
-  // Handles compile-time constants.
-  TF_RET_CHECK(num_constant_args <= ctx->num_inputs());
-  while (input_num < num_constant_args) {
-    const Tensor& input = ctx->input(input_num);
-    TF_RET_CHECK(input.dtype() != DT_RESOURCE);
-    XlaCompiler::Argument& arg = (*args)[input_num];
-    arg.kind = XlaCompiler::Argument::kConstant;
-    arg.type = input.dtype();
-    arg.shape = input.shape();
-    arg.constant_value = input;
-    ++input_num;
-  }
-
-  // Handles the non-constant arguments.
-  int num_variable_args = variable_args.size();
-  int num_nonconst_args =
-      ctx->num_inputs() - num_variable_args - num_constant_args;
-  TF_RET_CHECK(num_nonconst_args >= 0);
-  while (input_num < num_constant_args + num_nonconst_args) {
-    const Tensor& input = ctx->input(input_num);
-    TF_RET_CHECK(input.dtype() != DT_RESOURCE);
+  for (int64 input_num = 0; input_num < ctx->num_inputs(); ++input_num) {
     XlaCompiler::Argument& arg = (*args)[input_num];
-    if (input.NumElements() > 0) {
-      arg.kind = XlaCompiler::Argument::kParameter;
-    } else {
+    if (constant_args.count(input_num) > 0) {
+      // Handles compile-time constants.
+      const Tensor& input = constant_args.at(input_num);
+      TF_RET_CHECK(input.dtype() != DT_RESOURCE);
       arg.kind = XlaCompiler::Argument::kConstant;
+      arg.type = input.dtype();
+      arg.shape = input.shape();
       arg.constant_value = input;
-    }
-    arg.type = input.dtype();
-    arg.shape = input.shape();
-    ++input_num;
-  }
-
-  // Handles resource variables.
-  TF_RET_CHECK(input_num + num_variable_args == ctx->num_inputs());
-  for (auto& iterator : variable_args) {
-    const Tensor& input = ctx->input(input_num);
-    TF_RET_CHECK(input.dtype() == DT_RESOURCE);
-
-    XlaCompiler::Argument& arg = (*args)[input_num];
-
-    arg.name = iterator.second.name;
-    arg.kind = XlaCompiler::Argument::kResource;
-    arg.resource_kind = XlaResource::kVariable;
-    if (iterator.second.present) {
-      const Tensor& value = iterator.second.value;
-      arg.type = value.dtype();
-      arg.shape = value.shape();
-      arg.initialized = true;
+    } else if (variable_args.count(input_num) == 0) {
+      // Handles the non-constant arguments.
+      const Tensor& input = ctx->input(input_num);
+      TF_RET_CHECK(input.dtype() != DT_RESOURCE);
+      if (input.NumElements() > 0) {
+        arg.kind = XlaCompiler::Argument::kParameter;
+      } else {
+        arg.kind = XlaCompiler::Argument::kConstant;
+        arg.constant_value = input;
+      }
+      arg.type = input.dtype();
+      arg.shape = input.shape();
     } else {
-      // The values of uninitialized variables are not passed as inputs, since
-      // they are meaningless. However, it is legal to assign to a resource
-      // variable for the first time inside the XLA computation, so we do permit
-      // uninitialized variables.
-      arg.initialized = false;
-      arg.type = DT_INVALID;
-      arg.shape = TensorShape();
+      // Handles resource variables.
+      const Tensor& input = ctx->input(input_num);
+      TF_RET_CHECK(input.dtype() == DT_RESOURCE);
+      const OptionalTensor& variable = variable_args.at(input_num);
+      arg.name = variable.name;
+      arg.kind = XlaCompiler::Argument::kResource;
+      arg.resource_kind = XlaResource::kVariable;
+      if (variable.present) {
+        const Tensor& value = variable.value;
+        arg.type = value.dtype();
+        arg.shape = value.shape();
+        arg.initialized = true;
+      } else {
+        // The values of uninitialized variables are not passed as inputs, since
+        // they are meaningless. However, it is legal to assign to a resource
+        // variable for the first time inside the XLA computation, so we do
+        // permit uninitialized variables.
+        arg.initialized = false;
+        arg.type = DT_INVALID;
+        arg.shape = TensorShape();
+      }
     }
-    ++input_num;
   }
 
   return Status::OK();
@@ -234,16 +209,43 @@ Status XlaCompilationCache::BuildExecutable(
 
 Status XlaCompilationCache::Compile(
     const XlaCompiler::Options& options, const NameAttrList& function,
-    int num_constant_args, const std::map<int, OptionalTensor>& variable_args,
-    OpKernelContext* ctx,
+    const std::map<int, Tensor>& constant_args,
+    const std::map<int, OptionalTensor>& variable_args, OpKernelContext* ctx,
     const XlaCompiler::CompilationResult** compilation_result,
     xla::LocalExecutable** executable,
     const XlaCompiler::CompileOptions* compile_options) {
+  return CompileImpl(options, function, constant_args, variable_args, ctx,
+                     compilation_result, executable, compile_options, false);
+}
+
+Status XlaCompilationCache::CompileSingleOp(
+    const XlaCompiler::Options& options,
+    const std::map<int, Tensor>& constant_args,
+    const std::map<int, OptionalTensor>& variable_args, OpKernelContext* ctx,
+    const XlaCompiler::CompilationResult** compilation_result,
+    xla::LocalExecutable** executable,
+    const XlaCompiler::CompileOptions* compile_options) {
+  const NodeDef& def = ctx->op_kernel().def();
+  NameAttrList name;
+  name.set_name(def.op());
+  *name.mutable_attr() = def.attr();
+  return CompileImpl(options, name, constant_args, variable_args, ctx,
+                     compilation_result, executable, compile_options, true);
+}
+
+Status XlaCompilationCache::CompileImpl(
+    const XlaCompiler::Options& options, const NameAttrList& function,
+    const std::map<int, Tensor>& constant_args,
+    const std::map<int, OptionalTensor>& variable_args, OpKernelContext* ctx,
+    const XlaCompiler::CompilationResult** compilation_result,
+    xla::LocalExecutable** executable,
+    const XlaCompiler::CompileOptions* compile_options,
+    bool compile_single_op) {
   VLOG(1) << "XlaCompilationCache::Compile " << DebugString();
 
   if (VLOG_IS_ON(2)) {
     VLOG(2) << "num_inputs=" << ctx->num_inputs()
-            << " num_constant_args=" << num_constant_args
+            << " num_constant_args=" << constant_args.size()
             << " num_variable_args=" << variable_args.size();
     for (int i = 0; i < ctx->num_inputs(); i++) {
       TensorShape shape = ctx->input(i).shape();
@@ -264,11 +266,12 @@ Status XlaCompilationCache::Compile(
     }
   }
 
-  TF_RET_CHECK(num_constant_args + variable_args.size() <= ctx->num_inputs());
+  TF_RET_CHECK(constant_args.size() + variable_args.size() <=
+               ctx->num_inputs());
 
   Signature signature;
-  TF_RETURN_IF_ERROR(BuildSignature(function, num_constant_args, variable_args,
-                                    ctx, &signature));
+  TF_RETURN_IF_ERROR(
+      BuildSignature(function, constant_args, variable_args, ctx, &signature));
 
   VLOG(2) << "Signature: " << SignatureDebugString(signature);
   // The outer lock protects the existence of the cache entry. It does not
@@ -295,13 +298,20 @@ Status XlaCompilationCache::Compile(
     // a long time.)
     std::vector<XlaCompiler::Argument> args;
     TF_RETURN_IF_ERROR(
-        BuildArguments(num_constant_args, variable_args, ctx, &args));
+        BuildArguments(constant_args, variable_args, ctx, &args));
 
     XlaCompiler compiler(options);
     entry->compiled = true;
-    entry->compilation_status = compiler.CompileFunction(
-        compile_options ? *compile_options : XlaCompiler::CompileOptions(),
-        function, args, &entry->compilation_result);
+
+    if (compile_single_op) {
+      entry->compilation_status = compiler.CompileSingleOp(
+          compile_options ? *compile_options : XlaCompiler::CompileOptions(),
+          signature.name, ctx, args, &entry->compilation_result);
+    } else {
+      entry->compilation_status = compiler.CompileFunction(
+          compile_options ? *compile_options : XlaCompiler::CompileOptions(),
+          function, args, &entry->compilation_result);
+    }
   }
   *compilation_result = &entry->compilation_result;
   if (entry->compilation_status.ok() && executable) {
diff --git a/tensorflow/compiler/jit/xla_compilation_cache.h b/tensorflow/compiler/jit/xla_compilation_cache.h
index d5063783140205db54e673a7c7fd8f94b8aa2c65..5c0c79b880c474969464f23b4485734c404cef07 100644
--- a/tensorflow/compiler/jit/xla_compilation_cache.h
+++ b/tensorflow/compiler/jit/xla_compilation_cache.h
@@ -52,8 +52,8 @@ class XlaCompilationCache : public ResourceBase {
   // Compiles a function into a XlaCompiler::CompilationResult that can be used
   // to execute an XLA Computation. Compilation results are cached.
   // `function` is the name of a Tensorflow function to compile.
-  // `num_constant_args` is the number of compile-time constant arguments to
-  // `function`. `variable_args` is a snapshot of the current values of the
+  // `constant_args` is a maps of tensorflow argument number to constant value.
+  // `variable_args` is a snapshot of the current values of the
   // resource variable arguments to `function`; uninitialized variables are
   // represented by an absent OptionalTensor.
   // The result of compilation is written to `*compilation_result`, which must
@@ -62,19 +62,40 @@ class XlaCompilationCache : public ResourceBase {
   // executable pointer may be null if the computation has no non-constant
   // outputs.
   Status Compile(const XlaCompiler::Options& options,
-                 const NameAttrList& function, int num_constant_args,
+                 const NameAttrList& function,
+                 const std::map<int, Tensor>& constant_args,
                  const std::map<int, OptionalTensor>& variable_args,
                  OpKernelContext* ctx,
                  const XlaCompiler::CompilationResult** compilation_result,
                  xla::LocalExecutable** executable,
                  const XlaCompiler::CompileOptions* compile_options);
 
+  // As above, but calls XlaCompiler::CompileSingleOp instead of
+  // XlaCompiler::CompileFunction.
+  Status CompileSingleOp(
+      const XlaCompiler::Options& options,
+      const std::map<int, Tensor>& constant_args,
+      const std::map<int, OptionalTensor>& variable_args, OpKernelContext* ctx,
+      const XlaCompiler::CompilationResult** compilation_result,
+      xla::LocalExecutable** executable,
+      const XlaCompiler::CompileOptions* compile_options);
+
   xla::LocalClient* client() const { return client_; }
   const DeviceType& device_type() const { return device_type_; }
 
   string DebugString() override;
 
  private:
+  // Common implementation of Compile and CompileSingleOp.
+  Status CompileImpl(const XlaCompiler::Options& options,
+                     const NameAttrList& function,
+                     const std::map<int, Tensor>& constant_args,
+                     const std::map<int, OptionalTensor>& variable_args,
+                     OpKernelContext* ctx,
+                     const XlaCompiler::CompilationResult** compilation_result,
+                     xla::LocalExecutable** executable,
+                     const XlaCompiler::CompileOptions* compile_options,
+                     bool compile_single_op);
   // Takes `result` which has been compiled from a Tensorflow subgraph to a
   // XLA computation already, and generates an XLA LocalExecutable `executable`.
   Status BuildExecutable(const XlaCompiler::Options& options,
@@ -104,7 +125,8 @@ class XlaCompilationCache : public ResourceBase {
   static string SignatureDebugString(const Signature& sig);
 
   // Builds the signature for a compilation.
-  Status BuildSignature(const NameAttrList& function, int num_constant_args,
+  Status BuildSignature(const NameAttrList& function,
+                        const std::map<int, Tensor>& constant_args,
                         const std::map<int, OptionalTensor>& variable_args,
                         OpKernelContext* ctx, Signature* signature);
 
diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..682d6ea8ccc4a54912ccad4666cf0a7a03a7a698
--- /dev/null
+++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
@@ -0,0 +1,175 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Defines the XlaCompileOnDemandOp.
+
+#include "tensorflow/compiler/jit/xla_compile_on_demand_op.h"
+#include "tensorflow/compiler/jit/xla_device.h"
+#include "tensorflow/compiler/jit/xla_launch_util.h"
+#include "tensorflow/compiler/tf2xla/xla_compiler.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+
+namespace tensorflow {
+
+namespace {
+std::map<int, OptionalTensor> GetVariables(OpKernelContext* ctx) {
+  std::map<int, OptionalTensor> variables;
+  for (int64 i = 0; i < ctx->num_inputs(); ++i) {
+    if (ctx->input(i).dtype() == DT_RESOURCE) {
+      Var* variable = nullptr;
+      ResourceHandle handle = HandleFromInput(ctx, i);
+      OptionalTensor& optional = variables[i];
+      optional.name = handle.name();
+      if (LookupResource(ctx, handle, &variable).ok()) {
+        tf_shared_lock lock(*variable->mu());
+        optional.present = true;
+        optional.value = *variable->tensor();
+      }
+    }
+  }
+  return variables;
+}
+}  // namespace
+
+Status XlaCompileOnDemandOp::Run(OpKernelContext* ctx,
+                                 const XlaDevice::Metadata& metadata,
+                                 const XlaCompiler::CompilationResult* result,
+                                 xla::LocalExecutable* executable) {
+  std::map<int, OptionalTensor> variables = GetVariables(ctx);
+  int64 num_resource_args = variables.size();
+
+  xla::LocalClient* client = metadata.client();
+
+  // Builds an XLA allocator for the device.
+  XlaComputationLaunchContext launch_context(
+      num_resource_args, client, client->backend().memory_allocator(), true);
+
+  launch_context.PopulateInputs(ctx, result, variables);
+
+  perftools::gputools::Stream* stream =
+      ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr;
+  TF_RET_CHECK(stream);
+
+  VLOG(2) << "Executing computation.";
+  xla::ExecutableRunOptions run_options;
+  run_options.set_stream(stream);
+  run_options.set_allocator(client->backend().memory_allocator());
+  run_options.set_intra_op_thread_pool(&ctx->eigen_cpu_device());
+
+  auto run_result = executable->Run(launch_context.arguments(), run_options);
+  TF_RETURN_IF_ERROR(run_result.status());
+
+  launch_context.PopulateOutputs(ctx, result, run_result.ConsumeValueOrDie());
+  return Status::OK();
+}
+
+bool XlaCompileOnDemandOp::MustArgumentBeConstant(const OpKernel* op_kernel,
+                                                  int64 argument_idx) {
+  // TODO(jmolloy): This could be expensive, so memoize.
+  auto* constant_inputs = tensorflow::XlaOpRegistry::CompileTimeConstantInputs(
+      op_kernel->def().op());
+  CHECK(constant_inputs);
+  std::set<int64> constant_input_indices;
+  for (const auto& name : *constant_inputs) {
+    int start, stop;
+    TF_CHECK_OK(op_kernel->InputRange(name, &start, &stop));
+    for (int i = start; i < stop; ++i) {
+      constant_input_indices.insert(i);
+    }
+  }
+  return constant_input_indices.count(argument_idx) > 0;
+}
+
+bool XlaCompileOnDemandOp::ShouldArgumentBeConstant(const OpKernel* op_kernel,
+                                                    int64 argument_idx) {
+  // Right now we only create kConstant arguments when absolutely required, but
+  // there may be benefit in eagerly constant-folding a larger subset of
+  // arguments in the future.
+  return MustArgumentBeConstant(op_kernel, argument_idx);
+}
+
+Status XlaCompileOnDemandOp::Compile(
+    OpKernelContext* ctx, const XlaDevice::Metadata& metadata,
+    const XlaCompiler::CompilationResult** result,
+    xla::LocalExecutable** executable) {
+  std::map<int, Tensor> constant_arguments;
+  for (int64 i = 0; i < ctx->num_inputs(); ++i) {
+    const Tensor& device_tensor = ctx->input(i);
+    if (const XlaTensor* xla_tensor = XlaTensor::FromTensor(&device_tensor)) {
+      if (xla_tensor->has_host_tensor() &&
+          ShouldArgumentBeConstant(&ctx->op_kernel(), i)) {
+        constant_arguments[i] = xla_tensor->host_tensor();
+      }
+    }
+    if (constant_arguments.count(i) == 0 &&
+        MustArgumentBeConstant(&ctx->op_kernel(), i)) {
+      // Slow path; the argument is not available as a host constant so we must
+      // fetch it synchronously.
+      Tensor host_tensor;
+      AllocatorAttributes attrs;
+      attrs.set_on_host(true);
+      TF_RETURN_IF_ERROR(ctx->allocate_temp(
+          device_tensor.dtype(), device_tensor.shape(), &host_tensor, attrs));
+      Notification n;
+      ctx->op_device_context()->CopyDeviceTensorToCPU(
+          &device_tensor, "ConstantArgument",
+          reinterpret_cast<Device*>(ctx->device()), &host_tensor,
+          [&](Status status) { n.Notify(); });
+      n.WaitForNotification();
+      constant_arguments[i] = host_tensor;
+    }
+  }
+
+  // We store information about the JIT-compiled XLA computation
+  // in the ResourceMgr.
+  ResourceMgr* rm = ctx->resource_manager();
+  CHECK(rm);
+
+  XlaCompilationCache* cache;
+  TF_RETURN_IF_ERROR(rm->LookupOrCreate<XlaCompilationCache>(
+      rm->default_container(), "xla_cache", &cache,
+      [&](XlaCompilationCache** cache) {
+        *cache = new XlaCompilationCache(metadata.client(),
+                                         metadata.jit_device_type());
+        return Status::OK();
+      }));
+  // Hold the reference to the JIT during evaluation. (We could probably
+  // free it sooner because the ResourceMgr will retain a reference, but
+  // this is more obviously correct.)
+  core::ScopedUnref cache_ref(cache);
+
+  XlaCompiler::Options options;
+  DeviceType device_type = metadata.jit_device_type();
+  options.device_type = &device_type;
+  options.client = metadata.client();
+  options.flib_def =
+      new FunctionLibraryDefinition(OpRegistry::Global(), FunctionDefLibrary{});
+
+  std::map<int, OptionalTensor> variable_args = GetVariables(ctx);
+  return cache->CompileSingleOp(options, constant_arguments, variable_args, ctx,
+                                result, executable,
+                                /*compile_options=*/nullptr);
+}
+
+void XlaCompileOnDemandOp::Compute(OpKernelContext* ctx) {
+  const XlaCompiler::CompilationResult* result;
+  xla::LocalExecutable* executable;
+  const XlaDevice::Metadata* metadata;
+  OP_REQUIRES_OK(ctx, XlaDevice::GetMetadata(ctx, &metadata));
+  OP_REQUIRES_OK(ctx, Compile(ctx, *metadata, &result, &executable));
+  OP_REQUIRES_OK(ctx, Run(ctx, *metadata, result, executable));
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.h b/tensorflow/compiler/jit/xla_compile_on_demand_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..23c6f3903f841a6c39104983c6f7f409757a7319
--- /dev/null
+++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.h
@@ -0,0 +1,56 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// The XlaCompileOnDemandOp is an OpKernel that, when its Compute method is
+// called, will generate an xla::Computation and run it asynchronously.
+
+#ifndef TENSORFLOW_COMPILER_JIT_XLA_COMPILE_ON_DEMAND_OP_H_
+#define TENSORFLOW_COMPILER_JIT_XLA_COMPILE_ON_DEMAND_OP_H_
+
+#include "tensorflow/compiler/jit/xla_device.h"
+#include "tensorflow/compiler/tf2xla/xla_compiler.h"
+#include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+// An OpKernel that compiles an op to an XLA computation and runs it. Unlike
+// _XlaLaunch this doesn't rely on any rewrites of the graphdef - it will run a
+// vanilla TensorFlow op as long as the bridge supports it.
+//
+// Importantly _XlaLaunch assumes all input and output tensors are on the host,
+// whereas XlacompileOnDemandOp works with tensors in device memory.
+class XlaCompileOnDemandOp : public OpKernel {
+ public:
+  explicit XlaCompileOnDemandOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+  void Compute(OpKernelContext* ctx) override;
+
+ private:
+  XlaCompiler::Argument CreateCompilerArgument(OpKernelContext* ctx, int64 i);
+  bool ShouldArgumentBeConstant(const OpKernel* op_kernel, int64 argument_idx);
+  bool MustArgumentBeConstant(const OpKernel* op_kernel, int64 argument_idx);
+  Status Compile(OpKernelContext* ctx, const XlaDevice::Metadata& metadata,
+                 const XlaCompiler::CompilationResult** result,
+                 xla::LocalExecutable** executable);
+  Status Run(OpKernelContext* ctx, const XlaDevice::Metadata& metadata,
+             const XlaCompiler::CompilationResult* result,
+             xla::LocalExecutable* executable);
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_JIT_XLA_COMPILE_ON_DEMAND_OP_H_
diff --git a/tensorflow/compiler/jit/xla_cpu_device.cc b/tensorflow/compiler/jit/xla_cpu_device.cc
index e238252751e677eb947f6df03e3b2f2e948ffe19..bc07dbd7bdf005fde781f7a1e6775080e363abfb 100644
--- a/tensorflow/compiler/jit/xla_cpu_device.cc
+++ b/tensorflow/compiler/jit/xla_cpu_device.cc
@@ -17,6 +17,8 @@ limitations under the License.
 // operators using XLA via the XLA "Host" (CPU) backend.
 
 #include "tensorflow/compiler/jit/kernels/xla_launch_op.h"
+#include "tensorflow/compiler/jit/legacy_flags/xla_device_flags.h"
+#include "tensorflow/compiler/jit/xla_compile_on_demand_op.h"
 #include "tensorflow/compiler/jit/xla_device.h"
 #include "tensorflow/compiler/jit/xla_device_ops.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
@@ -34,14 +36,24 @@ class XlaCpuDeviceFactory : public DeviceFactory {
 Status XlaCpuDeviceFactory::CreateDevices(const SessionOptions& options,
                                           const string& name_prefix,
                                           std::vector<Device*>* devices) {
+  legacy_flags::XlaDeviceFlags* flags = legacy_flags::GetXlaDeviceFlags();
+  bool compile_on_demand = flags->tf_xla_compile_on_demand;
+
+  XlaOpRegistry::DeviceRegistration registration;
+  registration.compilation_device_name = DEVICE_CPU_XLA_JIT;
+  registration.requires_compilation = !compile_on_demand;
+  registration.enable_jit_by_default = false;
+  registration.compile_resource_ops = true;
+
   static XlaDeviceOpRegistrations* registrations =
       RegisterXlaDeviceKernels(DEVICE_XLA_CPU, DEVICE_CPU_XLA_JIT);
   (void)registrations;
 
   std::unique_ptr<XlaDevice> device;
-  TF_RETURN_IF_ERROR(XlaDevice::Create(
-      "Host", DEVICE_XLA_CPU, 0, DEVICE_CPU_XLA_JIT, options, name_prefix,
-      /*register_device_for_compilation=*/true, &device));
+  TF_RETURN_IF_ERROR(XlaDevice::Create("Host", DEVICE_XLA_CPU, 0,
+                                       DEVICE_CPU_XLA_JIT, options, name_prefix,
+                                       registration,
+                                       /*transfer_as_literal=*/false, &device));
   devices->push_back(device.release());
   return Status::OK();
 }
@@ -50,8 +62,8 @@ REGISTER_LOCAL_DEVICE_FACTORY(DEVICE_XLA_CPU, XlaCpuDeviceFactory);
 
 // Kernel registrations
 
-constexpr std::array<DataType, 6> kAllXlaCpuTypes = {
-    {DT_INT32, DT_INT64, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_BOOL}};
+constexpr std::array<DataType, 7> kAllXlaCpuTypes = {
+    {DT_INT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_BOOL}};
 
 REGISTER_XLA_LAUNCH_KERNEL(DEVICE_XLA_CPU, XlaLocalLaunchOp, kAllXlaCpuTypes);
 REGISTER_XLA_DEVICE_KERNELS(DEVICE_XLA_CPU, kAllXlaCpuTypes);
diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc
index d4d8fe1c1d575b4e35d624621cc709e3a16569d5..12f471735f68394a3079541e9ac8532e329bd694 100644
--- a/tensorflow/compiler/jit/xla_device.cc
+++ b/tensorflow/compiler/jit/xla_device.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <unordered_set>
 
 #include "tensorflow/compiler/jit/defs.h"
+#include "tensorflow/compiler/jit/xla_compile_on_demand_op.h"
 #include "tensorflow/compiler/jit/xla_device_context.h"
 #include "tensorflow/compiler/jit/xla_device_ops.h"
 #include "tensorflow/compiler/tf2xla/dump_graph.h"
@@ -99,7 +100,7 @@ XlaDeviceAllocator* XlaDeviceAllocatorState::GetOrCreateXlaDeviceAllocator(
   }
 
   std::unique_ptr<XlaDeviceAllocator> alloc =
-      xla::MakeUnique<XlaDeviceAllocator>(backend, device_ordinal);
+      xla::MakeUnique<XlaDeviceAllocator>();
   XlaDeviceAllocator* alloc_ptr = alloc.get();
   state.allocators_[{backend, device_ordinal}] = std::move(alloc);
   return alloc_ptr;
@@ -108,21 +109,15 @@ XlaDeviceAllocator* XlaDeviceAllocatorState::GetOrCreateXlaDeviceAllocator(
 /* static */ Status XlaDevice::Create(
     const string& platform_name, const string& device_name, int device_ordinal,
     const string& jit_device_name, const SessionOptions& options,
-    const string& name_prefix, bool register_device_for_compilation,
-    std::unique_ptr<XlaDevice>* device) {
+    const string& name_prefix,
+    const XlaOpRegistry::DeviceRegistration& registration,
+    bool transfer_as_literal, std::unique_ptr<XlaDevice>* device) {
   VLOG(1) << "XlaDevice::Create " << platform_name << " " << device_name << ":"
           << device_ordinal;
 
-  if (register_device_for_compilation) {
-    // These are no-ops if they have already been done previously for
-    // this device_name/compilation_device_name pair.
-    XlaOpRegistry::DeviceRegistration registration;
-    registration.compilation_device_name = jit_device_name;
-    registration.requires_compilation = true;
-    registration.enable_jit_by_default = false;
-    registration.compile_resource_ops = true;
-    XlaOpRegistry::RegisterCompilationDevice(device_name, registration);
-  }
+  // These are no-ops if they have already been done previously for
+  // this device_name/compilation_device_name pair.
+  XlaOpRegistry::RegisterCompilationDevice(device_name, registration);
 
   auto platform = se::MultiPlatformManager::PlatformWithName(platform_name);
   if (!platform.ok()) {
@@ -137,7 +132,7 @@ XlaDeviceAllocator* XlaDeviceAllocatorState::GetOrCreateXlaDeviceAllocator(
 
   device->reset(new XlaDevice(options, attrs, device_ordinal,
                               DeviceType(jit_device_name),
-                              platform.ValueOrDie()));
+                              platform.ValueOrDie(), transfer_as_literal));
   return Status::OK();
 }
 
@@ -162,6 +157,7 @@ const DeviceType& XlaDevice::Metadata::jit_device_type() const {
 
 /* static */ Status XlaDevice::GetMetadata(OpKernelContext* ctx,
                                            const Metadata** metadata) {
+  *metadata = nullptr;
   XlaDevice* xla_device =
       dynamic_cast<XlaDevice*>(ctx->device()->UnderlyingDevice());
   if (xla_device == nullptr) {
@@ -177,13 +173,15 @@ const DeviceType& XlaDevice::Metadata::jit_device_type() const {
 
 XlaDevice::XlaDevice(const SessionOptions& options,
                      const DeviceAttributes& attrs, int device_ordinal,
-                     const DeviceType& jit_device_name, se::Platform* platform)
+                     const DeviceType& jit_device_name, se::Platform* platform,
+                     bool transfer_as_literal)
     : LocalDevice(options, attrs),
       xla_metadata_(device_ordinal, platform, jit_device_name),
       device_ordinal_(device_ordinal),
       jit_device_name_(jit_device_name),
       xla_allocator_(nullptr),
-      platform_(platform) {}
+      platform_(platform),
+      transfer_as_literal_(transfer_as_literal) {}
 
 XlaDevice::~XlaDevice() {}
 
@@ -225,7 +223,10 @@ Status XlaDevice::FillContextMap(const Graph* graph,
   VLOG(1) << "XlaDevice::FillContextMap";
   device_context_map->resize(graph->num_node_ids());
   TF_ASSIGN_OR_RETURN(se::Stream * stream, GetStream());
-  auto ctx = new XlaDeviceContext(stream);
+  // Call GetAllocator for the side-effect of ensuring the allocator and
+  // XlaTensorInfoManager is created.
+  (void)GetAllocator({});
+  auto ctx = new XlaDeviceContext(stream, client(), transfer_as_literal_);
   for (Node* n : graph->nodes()) {
     VLOG(2) << n->id() << " : " << n->type_string() << " : " << n->name();
     ctx->Ref();
@@ -273,7 +274,7 @@ Status XlaDevice::MakeTensorFromProto(const TensorProto& tensor_proto,
     Tensor copy(GetAllocator(alloc_attrs), parsed.dtype(), parsed.shape());
     Notification n;
     TF_ASSIGN_OR_RETURN(se::Stream * stream, GetStream());
-    XlaTransferManager manager(stream);
+    XlaTransferManager manager(stream, client(), transfer_as_literal_);
     manager.CopyCPUTensorToDevice(&parsed, this, &copy,
                                   [&n, &status](const Status& s) {
                                     status = s;
@@ -288,19 +289,23 @@ Status XlaDevice::MakeTensorFromProto(const TensorProto& tensor_proto,
 
 XlaDeviceOpRegistrations* RegisterXlaDeviceKernels(const char* device,
                                                    const char* jit_device) {
+  // Any op assigned to the device that isn't rewritten by the graph rewriter
+  // gets executed by a n XlaCompileOnDemandOp, which compiles it and executes
+  // it just-in-time.
+  kernel_factory::OpKernelRegistrar::Factory factory =
+      [](OpKernelConstruction* context) -> OpKernel* {
+    return new XlaCompileOnDemandOp(context);
+  };
   XlaOpRegistry::RegisterCompilationKernels();
   XlaDeviceOpRegistrations* registrations = new XlaDeviceOpRegistrations;
-  auto dummy_factory = [](OpKernelConstruction* context) -> OpKernel* {
-    return new XlaDeviceDummyOp(context);
-  };
   for (const KernelDef* jit_def : XlaOpRegistry::DeviceKernels(
            jit_device,
            /*include_compilation_only_kernels=*/false)) {
     KernelDef* def = new KernelDef(*jit_def);
     def->set_device_type(device);
     registrations->op_kernel_registrars.emplace_back(
-        new kernel_factory::OpKernelRegistrar(def, "XlaDeviceDummyOp",
-                                              dummy_factory));
+        new kernel_factory::OpKernelRegistrar(def, "XlaCompileOnDemandOp",
+                                              factory));
   }
   return registrations;
 }
diff --git a/tensorflow/compiler/jit/xla_device.h b/tensorflow/compiler/jit/xla_device.h
index d2ec38293c429f04f088bf3726ba97eb4e4b0dba..4fe7dd8c9fa9eb954804555e9615160dc4bc3e8a 100644
--- a/tensorflow/compiler/jit/xla_device.h
+++ b/tensorflow/compiler/jit/xla_device.h
@@ -26,6 +26,8 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_JIT_XLA_DEVICE_H_
 #define TENSORFLOW_COMPILER_JIT_XLA_DEVICE_H_
 
+#include "tensorflow/compiler/jit/xla_tensor.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/local_device.h"
@@ -71,15 +73,20 @@ class XlaDevice : public LocalDevice {
   // Factory function. 'platform_name' is the name of the XLA platform.
   // 'device_name' is the name of the Tensorflow device to create.
   // 'jit_device_name' is the name of the corresponding JIT device.
+  // 'transfer_as_literal' is true if device<->host transfers must be done using
+  // XLA's TransferLiteral{To,From}Device interface. If false, we can use
+  // ThenMemcpy instead.
   static Status Create(const string& platform_name, const string& device_name,
                        int device_ordinal, const string& jit_device_name,
                        const SessionOptions& options, const string& name_prefix,
-                       bool register_device_for_compilation,
+                       const XlaOpRegistry::DeviceRegistration& registration,
+                       bool transfer_as_literal,
                        std::unique_ptr<XlaDevice>* device);
 
   XlaDevice(const SessionOptions& options, const DeviceAttributes& attrs,
             int device_ordinal, const DeviceType& jit_device_name,
-            ::perftools::gputools::Platform* platform);
+            ::perftools::gputools::Platform* platform,
+            bool transfer_as_literal);
   ~XlaDevice() override;
 
   Allocator* GetAllocator(AllocatorAttributes attr) override;
@@ -104,7 +111,7 @@ class XlaDevice : public LocalDevice {
   // Which hardware device in the client's platform this XlaDevice controls.
   const int device_ordinal_;
   // The name of the device that is used to compile Ops for this XlaDevice.
-  const DeviceType& jit_device_name_;
+  DeviceType jit_device_name_;
   // Memory allocator associated with this device.
   Allocator* xla_allocator_;                   // Not owned.
   ::perftools::gputools::Platform* platform_;  // Not owned.
@@ -113,9 +120,12 @@ class XlaDevice : public LocalDevice {
   // copying back and forth between CPU and the device, and
   // computations enqueued by XLA.
   xla::Backend::StreamPtr stream_;
+  // Must we use XLA's transfer manager for correct host<->device transfers? if
+  // false, we can use ThenMemcpy() instead.
+  bool transfer_as_literal_;
 };
 
-// Builds dummy OpKernel registrations on 'device' for the JIT operators
+// Builds OpKernel registrations on 'device' for the JIT operators
 // registered on 'jit_device'. Returns ownership of a XlaDeviceOpRegistrations
 // object that encapsulates the kernel registrations.
 struct XlaDeviceOpRegistrations {
diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc
index c936222f32056e92efced82d5adb3a96c8041a17..6a57831cde1212671c253ef944e3379770db4a8d 100644
--- a/tensorflow/compiler/jit/xla_device_context.cc
+++ b/tensorflow/compiler/jit/xla_device_context.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/jit/xla_device_context.h"
 
+#include "tensorflow/compiler/jit/xla_launch_util.h"
 #include "tensorflow/compiler/tf2xla/literal_util.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/xla/util.h"
@@ -26,33 +27,33 @@ namespace se = ::perftools::gputools;
 namespace tensorflow {
 
 // The allocator used for Tensors assigned to the XLA device.
-XlaDeviceAllocator::XlaDeviceAllocator(const xla::Backend* backend,
-                                       int device_ordinal)
-    : backend_(backend), device_ordinal_(device_ordinal) {}
-
+XlaDeviceAllocator::XlaDeviceAllocator() {}
 XlaDeviceAllocator::~XlaDeviceAllocator() = default;
 
 string XlaDeviceAllocator::Name() { return "xla"; }
 
 void* XlaDeviceAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
-  se::DeviceMemoryBase dmem =
-      backend_->memory_allocator()
-          ->Allocate(device_ordinal_, num_bytes, /*retry_on_failure=*/false)
-          .ValueOrDie();
-  VLOG(2) << "Allocated XLA device tensor " << dmem.opaque() << "(" << num_bytes
-          << ")";
-  return dmem.opaque();
+  // We always return an empty XlaTensor object, encoded as an opaque tagged
+  // pointer. We can return an empty object and ignore num_bytes here because we
+  // have control over all of the uses of this device tensor, and can lazily
+  // allocate memory when used. This allows us to also know the shape of the
+  // allocated Tensor, which is useful if the device's tensor representation
+  // differs from the host.
+  return XlaTensor::ToOpaquePointer(new XlaTensor());
 }
 
 void XlaDeviceAllocator::DeallocateRaw(void* ptr) {
-  se::DeviceMemoryBase dmem(ptr);
-  TF_CHECK_OK(backend_->memory_allocator()->Deallocate(device_ordinal_, &dmem));
-  VLOG(2) << "Deallocated XLA device tensor " << ptr;
+  delete XlaTensor::FromOpaquePointer(ptr);
 }
 
 void XlaDeviceAllocator::GetStats(AllocatorStats* stats) { stats->Clear(); }
 
-XlaTransferManager::XlaTransferManager(se::Stream* stream) : stream_(stream) {}
+XlaTransferManager::XlaTransferManager(se::Stream* stream,
+                                       xla::LocalClient* client,
+                                       bool transfer_as_literal)
+    : stream_(stream),
+      client_(client),
+      transfer_as_literal_(transfer_as_literal) {}
 
 void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
                                                Device* device,
@@ -68,18 +69,37 @@ void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
 
     void* src_ptr = const_cast<void*>(DMAHelper::base(cpu_tensor));
     const int64 total_bytes = cpu_tensor->TotalBytes();
-    void* dst_ptr = DMAHelper::base(device_tensor);
-    se::DeviceMemoryBase dev_dst_ptr(dst_ptr, total_bytes);
 
+    XlaTensor* xla_tensor = XlaTensor::FromTensor(device_tensor);
+    CHECK(xla_tensor);
+    if (!xla_tensor->has_shaped_buffer()) {
+      Status s = xla_tensor->AllocateShapedBuffer(
+          device_tensor->dtype(), device_tensor->shape(), client_,
+          stream_->parent()->device_ordinal());
+      if (!s.ok()) {
+        done(s);
+        return;
+      }
+    }
+
+    se::DeviceMemoryBase dev_dst_ptr =
+        XlaTensor::DeviceMemoryFromTensor(*device_tensor);
     Status status;
-    stream_->ThenMemcpy(&dev_dst_ptr, src_ptr, total_bytes);
-    // TODO(hpucha): Make this asynchronous.
-    Status block_status = stream_->BlockHostUntilDone();
-    if (!block_status.ok()) {
-      status = xla::InternalError(
-          "Failed to complete data transfer on stream %p: %s", stream_,
-          block_status.error_message().c_str());
+    if (transfer_as_literal_) {
+      status = xla::Unimplemented(
+          "XlaTransferManager::CopyCPUTensorToDevice not implemented for "
+          "literals");
+    } else {
+      stream_->ThenMemcpy(&dev_dst_ptr, src_ptr, total_bytes);
+      // TODO(hpucha): Make this asynchronous.
+      Status block_status = stream_->BlockHostUntilDone();
+      if (!block_status.ok()) {
+        status = xla::InternalError(
+            "Failed to complete data transfer on stream %p: %s", stream_,
+            block_status.error_message().c_str());
+      }
     }
+    xla_tensor->set_host_tensor(*cpu_tensor);
 
     done(status);
     return;
@@ -103,18 +123,24 @@ void XlaTransferManager::CopyDeviceTensorToCPU(const Tensor* device_tensor,
             << device_tensor->NumElements();
 
     const int64 total_bytes = cpu_tensor->TotalBytes();
-    void* src_ptr = const_cast<void*>(DMAHelper::base(device_tensor));
-    se::DeviceMemoryBase dev_src_ptr(src_ptr, total_bytes);
+    se::DeviceMemoryBase dev_src_ptr =
+        XlaTensor::DeviceMemoryFromTensor(*device_tensor);
     void* dst_ptr = DMAHelper::base(cpu_tensor);
 
     Status status;
-    stream_->ThenMemcpy(dst_ptr, dev_src_ptr, total_bytes);
-    // TODO(hpucha): Make this asynchronous.
-    Status block_status = stream_->BlockHostUntilDone();
-    if (!block_status.ok()) {
-      status = xla::InternalError(
-          "Failed to complete data transfer on stream %p: %s", stream_,
-          block_status.error_message().c_str());
+    if (transfer_as_literal_) {
+      status = xla::Unimplemented(
+          "XlaTransferManager::CopyDeviceTensorToCPU not implemented for "
+          "literals");
+    } else {
+      stream_->ThenMemcpy(dst_ptr, dev_src_ptr, total_bytes);
+      // TODO(hpucha): Make this asynchronous.
+      Status block_status = stream_->BlockHostUntilDone();
+      if (!block_status.ok()) {
+        status = xla::InternalError(
+            "Failed to complete data transfer on stream %p: %s", stream_,
+            block_status.error_message().c_str());
+      }
     }
 
     done(status);
@@ -125,7 +151,9 @@ void XlaTransferManager::CopyDeviceTensorToCPU(const Tensor* device_tensor,
   done(Status::OK());
 }
 
-XlaDeviceContext::XlaDeviceContext(se::Stream* stream) : manager_(stream) {}
+XlaDeviceContext::XlaDeviceContext(se::Stream* stream, xla::LocalClient* client,
+                                   bool transfer_as_literal)
+    : manager_(stream, client, transfer_as_literal) {}
 
 void XlaDeviceContext::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
                                              Device* device,
diff --git a/tensorflow/compiler/jit/xla_device_context.h b/tensorflow/compiler/jit/xla_device_context.h
index c4edcd474e48f791af9340c3cd6e4d031407bb68..a8ad511fbd2d7f06601608101b8346ff30f8fc20 100644
--- a/tensorflow/compiler/jit/xla_device_context.h
+++ b/tensorflow/compiler/jit/xla_device_context.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <memory>
 
+#include "tensorflow/compiler/jit/xla_tensor.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/core/framework/allocator.h"
@@ -26,11 +27,12 @@ limitations under the License.
 
 namespace tensorflow {
 
-// The allocator used for Tensors assigned to the XLA device. It uses
-// XLA backend's allocator.
+// The allocator used for Tensors assigned to the XLA device. The allocator
+// ignores the alignment and size of the request and always returns a new,
+// empty, XlaTensor.
 class XlaDeviceAllocator : public Allocator {
  public:
-  XlaDeviceAllocator(const xla::Backend* backend, int device_ordinal);
+  XlaDeviceAllocator();
   ~XlaDeviceAllocator() override;
 
   string Name() override;
@@ -38,18 +40,14 @@ class XlaDeviceAllocator : public Allocator {
   void* AllocateRaw(size_t alignment, size_t num_bytes) override;
   void DeallocateRaw(void* ptr) override;
   void GetStats(AllocatorStats* stats) override;
-
- private:
-  // Which backend in the client this allocator belongs to.
-  const xla::Backend* backend_;
-  // Which hardware device in the client's backend this allocator belongs to.
-  const int device_ordinal_;
 };
 
 // Helper class for managing data transfers between host and XLA devices.
 class XlaTransferManager {
  public:
-  explicit XlaTransferManager(perftools::gputools::Stream* stream);
+  explicit XlaTransferManager(perftools::gputools::Stream* stream,
+                              xla::LocalClient* client,
+                              bool transfer_as_literal);
 
   void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device,
                              Tensor* device_tensor, StatusCallback done) const;
@@ -62,6 +60,10 @@ class XlaTransferManager {
   // Stream obtained from a Device, used to transfer tensors between
   // CPU and device.
   perftools::gputools::Stream* stream_;
+  // For the underlying memory allocator and XLA's TransferManager.
+  xla::LocalClient* client_;
+  // True if we must use XLA's TransferManager for correct device transfers.
+  bool transfer_as_literal_;
 };
 
 // DeviceContext for operators assigned to XlaDevice devices. The
@@ -69,7 +71,8 @@ class XlaTransferManager {
 // wraps the methods in XlaTransferManager.
 class XlaDeviceContext : public DeviceContext {
  public:
-  explicit XlaDeviceContext(perftools::gputools::Stream* stream);
+  explicit XlaDeviceContext(perftools::gputools::Stream* stream,
+                            xla::LocalClient* client, bool transfer_as_literal);
 
   void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device,
                              Tensor* device_tensor,
diff --git a/tensorflow/compiler/jit/xla_gpu_device.cc b/tensorflow/compiler/jit/xla_gpu_device.cc
index 2326070358d67c0cf30ef17fab5c93862cd8932c..ac60423d959ca44e7d92e2d965cf731287b1f83f 100644
--- a/tensorflow/compiler/jit/xla_gpu_device.cc
+++ b/tensorflow/compiler/jit/xla_gpu_device.cc
@@ -34,14 +34,21 @@ class XlaGpuDeviceFactory : public DeviceFactory {
 Status XlaGpuDeviceFactory::CreateDevices(const SessionOptions& options,
                                           const string& name_prefix,
                                           std::vector<Device*>* devices) {
+  XlaOpRegistry::DeviceRegistration registration;
+  registration.compilation_device_name = DEVICE_GPU_XLA_JIT;
+  registration.requires_compilation = true;
+  registration.enable_jit_by_default = false;
+  registration.compile_resource_ops = true;
+
   static XlaDeviceOpRegistrations* registrations =
       RegisterXlaDeviceKernels(DEVICE_XLA_GPU, DEVICE_GPU_XLA_JIT);
   (void)registrations;
 
   std::unique_ptr<XlaDevice> device;
-  Status status = XlaDevice::Create(
-      "CUDA", DEVICE_XLA_GPU, 0, DEVICE_GPU_XLA_JIT, options, name_prefix,
-      /*register_device_for_compilation=*/true, &device);
+  Status status =
+      XlaDevice::Create("CUDA", DEVICE_XLA_GPU, 0, DEVICE_GPU_XLA_JIT, options,
+                        name_prefix, registration,
+                        /*transfer_as_literal=*/false, &device);
   if (!status.ok()) {
     // Treat failures as non-fatal; there might not be a GPU in the machine.
     VLOG(1) << "Failed to create XLA_GPU device: " << status;
@@ -55,8 +62,9 @@ REGISTER_LOCAL_DEVICE_FACTORY(DEVICE_XLA_GPU, XlaGpuDeviceFactory);
 
 // Kernel registrations
 
-constexpr std::array<DataType, 6> kAllXlaGpuTypes = {
-    {DT_INT32, DT_INT64, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_BOOL}};
+constexpr std::array<DataType, 8> kAllXlaGpuTypes = {
+    {DT_INT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_BOOL,
+     DT_BFLOAT16}};
 
 REGISTER_XLA_LAUNCH_KERNEL(DEVICE_XLA_GPU, XlaLocalLaunchOp, kAllXlaGpuTypes);
 REGISTER_XLA_DEVICE_KERNELS(DEVICE_XLA_GPU, kAllXlaGpuTypes);
diff --git a/tensorflow/compiler/jit/xla_interpreter_device.cc b/tensorflow/compiler/jit/xla_interpreter_device.cc
index a329451b14a785b17913e3838a6571b62b422804..9e098c46f422b436c722bb909dc58930ab7c0ef6 100644
--- a/tensorflow/compiler/jit/xla_interpreter_device.cc
+++ b/tensorflow/compiler/jit/xla_interpreter_device.cc
@@ -41,10 +41,17 @@ Status XlaInterpreterDeviceFactory::CreateDevices(
       DEVICE_XLA_INTERPRETER, DEVICE_INTERPRETER_XLA_JIT);
   (void)registrations;
 
+  XlaOpRegistry::DeviceRegistration registration;
+  registration.compilation_device_name = DEVICE_INTERPRETER_XLA_JIT;
+  registration.requires_compilation = true;
+  registration.enable_jit_by_default = false;
+  registration.compile_resource_ops = true;
+
   std::unique_ptr<XlaDevice> device;
-  TF_RETURN_IF_ERROR(XlaDevice::Create(
-      "Interpreter", DEVICE_XLA_INTERPRETER, 0, DEVICE_INTERPRETER_XLA_JIT,
-      options, name_prefix, /*register_device_for_compilation=*/true, &device));
+  TF_RETURN_IF_ERROR(XlaDevice::Create("Interpreter", DEVICE_XLA_INTERPRETER, 0,
+                                       DEVICE_INTERPRETER_XLA_JIT, options,
+                                       name_prefix, registration,
+                                       /*transfer_as_literal=*/false, &device));
   devices->push_back(device.release());
   return Status::OK();
 }
diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc
index 8322dd2e829a850413f8eee843b78052f6aad549..354be1e1b54b2f2e808b2216cfc1fe110dbb3857 100644
--- a/tensorflow/compiler/jit/xla_launch_util.cc
+++ b/tensorflow/compiler/jit/xla_launch_util.cc
@@ -52,78 +52,66 @@ std::map<int, OptionalTensor> SnapshotResourceVariables(OpKernelContext* ctx,
   return snapshot;
 }
 
-XlaAllocator::XlaAllocator(const gpu::Platform* platform,
-                           OpKernelContext* op_context)
-    : xla::DeviceMemoryAllocator(platform), op_context_(op_context) {}
+XlaAllocator::XlaAllocator(const gpu::Platform* platform, Allocator* wrapped)
+    : xla::DeviceMemoryAllocator(platform), wrapped_(wrapped) {}
 
-XlaAllocator::~XlaAllocator() = default;
+XlaAllocator::~XlaAllocator() {}
 
 xla::StatusOr<gpu::DeviceMemoryBase> XlaAllocator::Allocate(
     int device_ordinal, uint64 size, bool retry_on_failure) {
-  AllocatorAttributes allocator_attrs;
-  allocator_attrs.set_on_host(false);
-
-  AllocationAttributes allocation_attrs;
-  allocation_attrs.no_retry_on_failure = !retry_on_failure;
-
-  Tensor t;
-  Status status = op_context_->allocate_temp(
-      DT_UINT8, TensorShape({static_cast<int64>(size)}), &t, allocator_attrs,
-      allocation_attrs);
-  if (!status.ok()) {
-    VLOG(2) << "Allocation failed " << size;
-    return status;
+  void* data = wrapped_->AllocateRaw(Allocator::kAllocatorAlignment, size);
+  if (data == nullptr) {
+    return errors::ResourceExhausted("Out of memory while trying to allocate ",
+                                     size, " bytes.");
+  } else {
+    return gpu::DeviceMemoryBase(data, size);
   }
-  void* data =
-      reinterpret_cast<void*>(const_cast<char*>(t.tensor_data().data()));
-  tensors_[data] = t;
-  return gpu::DeviceMemoryBase(data, size);
-}
-
-Status XlaAllocator::RegisterArgument(const Tensor* t) {
-  void* data =
-      reinterpret_cast<void*>(const_cast<char*>(t->tensor_data().data()));
-  tensors_[data] = *t;
-  return Status::OK();
 }
 
 Status XlaAllocator::Deallocate(int device_ordinal,
                                 gpu::DeviceMemoryBase* mem) {
-  if (mem->opaque() != nullptr) {
-    if (tensors_.erase(mem->opaque()) == 0) {
-      return tensorflow::errors::InvalidArgument("Unknown tensor address");
-    }
-  }
+  wrapped_->DeallocateRaw(mem->opaque());
   return Status::OK();
 }
 
-Status XlaAllocator::MakeTensorFromBuffer(gpu::DeviceMemoryBase buffer,
-                                          DataType dtype,
-                                          const TensorShape& shape,
-                                          Tensor* out_tensor) const {
-  void* ptr = const_cast<void*>(buffer.opaque());
-  auto it = tensors_.find(ptr);
-  if (it == tensors_.end()) {
-    return errors::InvalidArgument("Unknown tensor address");
-  }
-  const Tensor& tensor = it->second;
-
-  int64 output_size = DataTypeSize(dtype) * shape.num_elements();
-  if (tensor.TotalBytes() == output_size) {
-    out_tensor->UnsafeCopyFromInternal(tensor, dtype, shape);
-  } else {
-    Tensor slice = tensor.Slice(0, output_size);
-    out_tensor->UnsafeCopyFromInternal(slice, dtype, shape);
+namespace {
+// Return the 'index''th subtree of the given ShapedBuffer as a
+// ScopedShapedBuffer. The returned ScopedShapedBuffer takes ownership of the
+// subtree, and sets the input's buffer pointers to nullptr for the subtree.
+std::unique_ptr<xla::ScopedShapedBuffer> ExtractSubShapedBuffer(
+    xla::ShapedBuffer* shaped_buffer, int index,
+    xla::DeviceMemoryAllocator* allocator) {
+  xla::Shape on_host_shape = xla::ShapeUtil::GetTupleElementShape(
+      shaped_buffer->on_host_shape(), index);
+  xla::Shape on_device_shape = xla::ShapeUtil::GetTupleElementShape(
+      shaped_buffer->on_device_shape(), index);
+
+  xla::ShapedBuffer sub_shaped_buffer(on_host_shape, on_device_shape,
+                                      shaped_buffer->platform(),
+                                      shaped_buffer->device_ordinal());
+
+  auto& shape_tree = shaped_buffer->buffers();
+  auto& sub_shape_tree = sub_shaped_buffer.buffers();
+  sub_shape_tree.CopySubtreeFrom(shape_tree,
+                                 /*source_base_index=*/{index},
+                                 /*target_base_index=*/{});
+  for (auto& index_to_buffer : shape_tree) {
+    if (!index_to_buffer.first.empty() && index_to_buffer.first[0] == index) {
+      index_to_buffer.second = gpu::DeviceMemoryBase(nullptr, 0);
+    }
   }
-  return Status::OK();
+  return xla::ScopedShapedBuffer::MakeScoped(&sub_shaped_buffer, allocator)
+      .ValueOrDie();
 }
+}  // namespace
 
 XlaComputationLaunchContext::XlaComputationLaunchContext(
     int64 num_resource_args, xla::LocalClient* client,
-    XlaAllocator* xla_allocator)
+    xla::DeviceMemoryAllocator* xla_allocator, bool allocate_xla_tensors)
     : num_resource_args_(num_resource_args),
       client_(client),
-      xla_allocator_(xla_allocator) {}
+      xla_allocator_(xla_allocator),
+      allocate_xla_tensors_(allocate_xla_tensors) {}
 
 void XlaComputationLaunchContext::PopulateInputs(
     OpKernelContext* ctx, const XlaCompiler::CompilationResult* kernel,
@@ -145,29 +133,32 @@ void XlaComputationLaunchContext::PopulateInputs(
       t = &(ctx->input(arg_num));
     }
 
-    gpu::DeviceMemoryBase dmem = gpu::DeviceMemoryBase(
-        const_cast<char*>(t->tensor_data().data()), t->tensor_data().size());
-
     const xla::Shape on_device_shape =
         client_->backend().transfer_manager()->HostShapeToDeviceShape(shape);
-    CHECK(xla::ShapeUtil::Equal(shape, on_device_shape))
-        << "On-device shape "
-        << xla::ShapeUtil::HumanStringWithLayout(on_device_shape)
-        << " not the same as on-host shape "
-        << xla::ShapeUtil::HumanStringWithLayout(shape);
-    arg_buffers_[i] = xla::MakeUnique<xla::ShapedBuffer>(
-        /*on_host_shape=*/shape, /*on_device_shape=*/shape, client_->platform(),
-        client_->default_device_ordinal());
-    arg_buffers_[i]->set_buffer(dmem, /*index=*/{});
-    arg_ptrs_[i] = arg_buffers_[i].get();
-
-    OP_REQUIRES_OK(ctx, xla_allocator_->RegisterArgument(t));
+    if (xla::ShapeUtil::IsTuple(on_device_shape)) {
+      const XlaTensor* xla_tensor = XlaTensor::FromTensor(t);
+      CHECK(xla_tensor && xla_tensor->has_shaped_buffer());
+      arg_ptrs_[i] =
+          const_cast<xla::ShapedBuffer*>(&xla_tensor->shaped_buffer());
+    } else {
+      CHECK(xla::ShapeUtil::Equal(shape, on_device_shape))
+          << "On-device shape "
+          << xla::ShapeUtil::HumanStringWithLayout(on_device_shape)
+          << " not the same as on-host shape "
+          << xla::ShapeUtil::HumanStringWithLayout(shape);
+      gpu::DeviceMemoryBase dmem = XlaTensor::DeviceMemoryFromTensor(*t);
+      arg_buffers_[i] = xla::MakeUnique<xla::ShapedBuffer>(
+          /*on_host_shape=*/shape, /*on_device_shape=*/shape,
+          client_->platform(), client_->default_device_ordinal());
+      arg_buffers_[i]->set_buffer(dmem, /*index=*/{});
+      arg_ptrs_[i] = arg_buffers_[i].get();
+    }
   }
 }
 
 void XlaComputationLaunchContext::PopulateOutputs(
     OpKernelContext* ctx, const XlaCompiler::CompilationResult* kernel,
-    std::unique_ptr<xla::ShapedBuffer> output) {
+    std::unique_ptr<xla::ScopedShapedBuffer> output) {
   gpu::Stream* stream =
       ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr;
 
@@ -180,36 +171,59 @@ void XlaComputationLaunchContext::PopulateOutputs(
   // Copy XLA results to the OpOutputList.
   int output_num = 0;
   for (int i = 0; i < ctx->num_outputs(); ++i) {
+    Allocator* allocator = ctx->device()->GetAllocator({});
     if (kernel->outputs[i].is_constant) {
       // Output is a constant.
       const Tensor& const_tensor = kernel->outputs[i].constant_value;
+      Tensor* output_tensor;
       const size_t total_bytes = const_tensor.TotalBytes();
       if (stream && total_bytes > 0) {
         // Copy host -> device. (Empty tensors don't have backing buffers.)
         VLOG(1) << "Constant output tensor on device";
-        Tensor* output_tensor;
-        TF_CHECK_OK(
-            ctx->allocate_output(i, const_tensor.shape(), &output_tensor));
+
+        OP_REQUIRES_OK(
+            ctx, ctx->allocate_output(i, const_tensor.shape(), &output_tensor));
+        if (XlaTensor* xla_tensor = XlaTensor::FromTensor(output_tensor)) {
+          OP_REQUIRES_OK(ctx, xla_tensor->AllocateShapedBuffer(
+                                  const_tensor.dtype(), const_tensor.shape(),
+                                  client_, stream->parent()->device_ordinal()));
+        }
 
         const void* src_ptr = DMAHelper::base(&const_tensor);
-        void* dst_ptr = DMAHelper::base(output_tensor);
-        gpu::DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes);
-        stream->ThenMemcpy(&gpu_dst_ptr, src_ptr, total_bytes);
+        gpu::DeviceMemoryBase dst_ptr =
+            XlaTensor::DeviceMemoryFromTensor(*output_tensor);
+        // Memcpying asynchronously is safe for the GPU, but the CPU uses a
+        // shared allocator so hold a reference to the copied-to buffer until
+        // complete.
+        TensorReference ref(*output_tensor);
+        stream->ThenMemcpy(&dst_ptr, src_ptr, total_bytes);
+        stream->ThenDoHostCallback([ref] { ref.Unref(); });
       } else {
         // No copy required.
         ctx->set_output(i, const_tensor);
+        output_tensor = ctx->mutable_output(i);
+      }
+      if (XlaTensor* xla_tensor = XlaTensor::FromTensor(output_tensor)) {
+        xla_tensor->set_host_tensor(const_tensor);
       }
     } else {
       const TensorShape& shape = kernel->outputs[i].shape;
       VLOG(2) << "Retval " << i << " shape " << shape.DebugString();
 
       gpu::DeviceMemoryBase buffer = output->buffer({output_num});
-      Tensor output_tensor;
-      // Looks up the owning Tensor by buffer address.
-      OP_REQUIRES_OK(ctx, xla_allocator_->MakeTensorFromBuffer(
-                              buffer, ctx->expected_output_dtype(i), shape,
-                              &output_tensor));
-      ctx->set_output(i, output_tensor);
+      if (allocate_xla_tensors_) {
+        Tensor* output_tensor;
+        OP_REQUIRES_OK(ctx, ctx->allocate_output(i, shape, &output_tensor));
+        XlaTensor* xla_tensor = XlaTensor::FromTensor(output_tensor);
+        CHECK(xla_tensor);
+        xla_tensor->set_shaped_buffer(
+            ExtractSubShapedBuffer(output.get(), output_num, xla_allocator_));
+      } else {
+        Tensor output_tensor = XlaTensorBuffer::MakeTensor(
+            ctx->expected_output_dtype(i), shape, buffer, allocator);
+        output->set_buffer(gpu::DeviceMemoryBase(nullptr, 0), {output_num});
+        ctx->set_output(i, output_tensor);
+      }
       ++output_num;
     }
 
@@ -221,6 +235,7 @@ void XlaComputationLaunchContext::PopulateOutputs(
   // Apply variable updates, if any.
   VLOG(2) << "Applying variable updates";
   for (int i = 0; i < kernel->resource_updates.size(); ++i) {
+    Allocator* allocator = ctx->device()->GetAllocator({});
     const XlaCompiler::ResourceUpdate& write = kernel->resource_updates[i];
     OP_REQUIRES(ctx,
                 write.input_index >= 0 && write.input_index < ctx->num_inputs(),
@@ -244,10 +259,21 @@ void XlaComputationLaunchContext::PopulateOutputs(
     OP_REQUIRES(ctx, variable->tensor()->dtype() == write.type,
                 errors::Internal("Mismatched type in variable write"));
 
-    // Looks up the owning Tensor by buffer address.
-    OP_REQUIRES_OK(ctx,
-                   xla_allocator_->MakeTensorFromBuffer(
-                       buffer, write.type, write.shape, variable->tensor()));
+    if (allocate_xla_tensors_) {
+      Tensor output_tensor;
+      OP_REQUIRES_OK(
+          ctx, ctx->allocate_temp(write.type, write.shape, &output_tensor));
+      XlaTensor* xla_tensor = XlaTensor::FromTensor(&output_tensor);
+      CHECK(xla_tensor);
+      xla_tensor->set_shaped_buffer(
+          ExtractSubShapedBuffer(output.get(), output_num, xla_allocator_));
+      *variable->tensor() = output_tensor;
+    } else {
+      Tensor output_tensor = XlaTensorBuffer::MakeTensor(
+          write.type, write.shape, buffer, allocator);
+      output->set_buffer(gpu::DeviceMemoryBase(nullptr, 0), {output_num});
+      *variable->tensor() = output_tensor;
+    }
     ++output_num;
   }
 }
diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h
index 9fd356fce5896c317196cb31fd5248b6bc3427a8..14f70fe35891040ff3460567adb223be0f1c910f 100644
--- a/tensorflow/compiler/jit/xla_launch_util.h
+++ b/tensorflow/compiler/jit/xla_launch_util.h
@@ -19,8 +19,10 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_JIT_XLA_LAUNCH_UTIL_H_
 
 #include "tensorflow/compiler/jit/xla_compilation_cache.h"
+#include "tensorflow/compiler/jit/xla_tensor.h"
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/core/framework/allocation_description.pb.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/kernels/variable_ops.h"
@@ -45,24 +47,13 @@ std::map<int, OptionalTensor> SnapshotResourceVariables(OpKernelContext* ctx,
 class XlaAllocator : public xla::DeviceMemoryAllocator {
  public:
   XlaAllocator(const perftools::gputools::Platform* platform,
-               OpKernelContext* op_context);
+               Allocator* wrapped);
   ~XlaAllocator() override;
   xla::StatusOr<perftools::gputools::DeviceMemoryBase> Allocate(
       int device_ordinal, uint64 size, bool retry_on_failure) override;
   Status Deallocate(int device_ordinal,
                     perftools::gputools::DeviceMemoryBase* mem) override;
 
-  // Register an Tensor (input or resource variable) with the allocator. If
-  // the operation returns an alias to one of its inputs, then the allocator
-  // needs to be able to handle it.
-  Status RegisterArgument(const Tensor* t);
-
-  // Makes 'tensor' a wrapper around the data buffer at 'ptr'. The buffer is
-  // interpreted as having data type 'dtype' and shape 'shape'.
-  Status MakeTensorFromBuffer(perftools::gputools::DeviceMemoryBase buffer,
-                              DataType dtype, const TensorShape& shape,
-                              Tensor* out_tensor) const;
-
   // The Tensorflow BFC allocator used on GPU allows host-side deallocation
   // before GPU execution takes place. Tensorflow uses the ordering of the main
   // compute stream to enforce a happens-before relationship between a memory
@@ -73,20 +64,19 @@ class XlaAllocator : public xla::DeviceMemoryAllocator {
   bool AllowsAsynchronousDeallocation() const override { return true; }
 
  private:
-  OpKernelContext* const op_context_;
-
-  // Map from pointer address to the owning Tensor; used by
-  // MakeTensorFromBuffer. Also used to automatically release Tensors when the
-  // allocator is freed.
-  std::unordered_map<void*, Tensor> tensors_;
+  Allocator* wrapped_;
 };
 
 // Helper class to perform the marshalling of TensorFlow inputs and outputs to
 // ShapedBuffers suitable for passing to an XLA computation.
 class XlaComputationLaunchContext {
  public:
+  // Create a new launch context. 'allocate_xla_tensors' is true if allocated
+  // output tensors and variables are always XlaTensors. If false they are
+  // assumed to be "normal" device pointers.
   XlaComputationLaunchContext(int64 num_resource_args, xla::LocalClient* client,
-                              XlaAllocator* xla_allocator);
+                              xla::DeviceMemoryAllocator* xla_allocator,
+                              bool allocate_xla_tensors);
 
   // Add all inputs within `ctx` as XLA arguments (returned by arguments()).
   // `variables` is a map from TensorFlow argument number to resource variable.
@@ -97,7 +87,7 @@ class XlaComputationLaunchContext {
   // Given the XLA output in `output`, populate all outputs of `ctx`.
   void PopulateOutputs(OpKernelContext* ctx,
                        const XlaCompiler::CompilationResult* kernel,
-                       std::unique_ptr<xla::ShapedBuffer> output);
+                       std::unique_ptr<xla::ScopedShapedBuffer> output);
 
   // Return the argument list. Only valid after PopulateInputs() has been
   // called.
@@ -106,11 +96,53 @@ class XlaComputationLaunchContext {
  private:
   int64 num_resource_args_;
   xla::LocalClient* client_;
-  XlaAllocator* xla_allocator_;
+  xla::DeviceMemoryAllocator* xla_allocator_;
+  bool allocate_xla_tensors_;
   std::vector<std::unique_ptr<xla::ShapedBuffer>> arg_buffers_;
   std::vector<xla::ShapedBuffer*> arg_ptrs_;
 };
 
+// A simple TensorBuffer implementation that allows us to create Tensors that
+// take ownership of pre-allocated memory.
+class XlaTensorBuffer : public TensorBuffer {
+ public:
+  XlaTensorBuffer(const void* ptr, size_t expected_size, size_t actual_size,
+                  Allocator* allocator)
+      : expected_size_(expected_size),
+        actual_size_(actual_size),
+        allocator_(allocator) {
+    data_ = const_cast<void*>(ptr);
+  }
+
+  ~XlaTensorBuffer() override { allocator_->DeallocateRaw(data_); }
+
+  void* data() const override { return data_; }
+  size_t size() const override { return expected_size_; }
+
+  TensorBuffer* root_buffer() override { return this; }
+
+  void FillAllocationDescription(AllocationDescription* proto) const override {
+    proto->set_allocated_bytes(actual_size_);
+  }
+
+  static Tensor MakeTensor(DataType dtype, const TensorShape& shape,
+                           perftools::gputools::DeviceMemoryBase buffer,
+                           Allocator* allocator) {
+    size_t expected_size = shape.num_elements() * DataTypeSize(dtype);
+    auto* tensor_buffer = new XlaTensorBuffer(buffer.opaque(), expected_size,
+                                              buffer.size(), allocator);
+    Tensor t(dtype, shape, tensor_buffer);
+    tensor_buffer->Unref();
+    return t;
+  }
+
+ private:
+  void* data_;
+  size_t expected_size_;
+  size_t actual_size_;
+  Allocator* allocator_;
+};
+
 }  // namespace tensorflow
 
 #endif
diff --git a/tensorflow/compiler/jit/xla_tensor.cc b/tensorflow/compiler/jit/xla_tensor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..956328e6757f4c903e3995a54635682d19052794
--- /dev/null
+++ b/tensorflow/compiler/jit/xla_tensor.cc
@@ -0,0 +1,98 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/jit/xla_tensor.h"
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+
+namespace tensorflow {
+
+/*static*/ XlaTensor* XlaTensor::FromTensor(Tensor* tensor) {
+  if (tensor->NumElements() == 0) {
+    return nullptr;
+  }
+  XlaTensor* xla_tensor =
+      FromOpaquePointer(const_cast<char*>(tensor->tensor_data().data()));
+  return xla_tensor;
+}
+
+/*static*/ const XlaTensor* XlaTensor::FromTensor(const Tensor* tensor) {
+  return FromTensor(const_cast<Tensor*>(tensor));
+}
+
+/*static*/ perftools::gputools::DeviceMemoryBase
+XlaTensor::DeviceMemoryFromTensor(const Tensor& tensor) {
+  const XlaTensor* xla_tensor = FromTensor(&tensor);
+  if (xla_tensor) {
+    CHECK(xla_tensor->has_shaped_buffer());
+    return xla_tensor->shaped_buffer().root_buffer();
+  } else {
+    return perftools::gputools::DeviceMemoryBase(
+        const_cast<char*>(tensor.tensor_data().data()),
+        tensor.tensor_data().size());
+  }
+}
+
+Status XlaTensor::AllocateShapedBuffer(DataType dtype, const TensorShape& shape,
+                                       xla::LocalClient* client,
+                                       int device_ordinal) {
+  xla::Shape on_host_shape;
+  TF_RETURN_IF_ERROR(TensorShapeToXLAShape(dtype, shape, &on_host_shape));
+  xla::Shape on_device_shape =
+      client->backend().transfer_manager()->HostShapeToDeviceShape(
+          on_host_shape);
+
+  xla::ShapedBuffer buffer(on_host_shape, on_device_shape, client->platform(),
+                           device_ordinal);
+  for (auto& index_to_buffer : buffer.buffers()) {
+    xla::Shape subshape =
+        xla::ShapeUtil::GetSubshape(on_device_shape, index_to_buffer.first);
+    uint64 size =
+        client->backend().transfer_manager()->GetByteSizeRequirement(subshape);
+    TF_ASSIGN_OR_RETURN(index_to_buffer.second,
+                        client->backend().memory_allocator()->Allocate(
+                            device_ordinal, size, /*retry_on_failure=*/false));
+  }
+
+  TF_ASSIGN_OR_RETURN(auto scoped_buffer,
+                      xla::ScopedShapedBuffer::MakeScoped(
+                          &buffer, client->backend().memory_allocator()));
+  set_shaped_buffer(std::move(scoped_buffer));
+  return Status::OK();
+}
+
+// The pointer tag, OR-ed into the XlaTensor's address to distinguish it from
+// device-side tensors, which are either CPU or GPU memory pointers. This works
+// because we're guaranteed that CPU and GPU pointers are aligned to > 1 bits.
+namespace {
+constexpr uintptr_t kTag = 0x1ULL;
+}
+
+/*static*/ XlaTensor* XlaTensor::FromOpaquePointer(void* ptr) {
+  uintptr_t value = reinterpret_cast<uintptr_t>(ptr);
+  if (value & kTag) {
+    return reinterpret_cast<XlaTensor*>(value & ~kTag);
+  } else {
+    return nullptr;
+  }
+}
+
+/*static*/ void* XlaTensor::ToOpaquePointer(XlaTensor* tensor) {
+  uintptr_t value = reinterpret_cast<uintptr_t>(tensor);
+  CHECK_EQ(value & kTag, 0);
+  value |= kTag;
+  return reinterpret_cast<XlaTensor*>(value);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/xla_tensor.h b/tensorflow/compiler/jit/xla_tensor.h
new file mode 100644
index 0000000000000000000000000000000000000000..5ff2fb08f03548260215c6aeded2c124f8d28f43
--- /dev/null
+++ b/tensorflow/compiler/jit/xla_tensor.h
@@ -0,0 +1,100 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_JIT_XLA_TENSOR_H_
+#define TENSORFLOW_COMPILER_JIT_XLA_TENSOR_H_
+
+#include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/service/shaped_buffer.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+
+// The implementation of a Tensor for an XlaDevice. All device tensors are
+// actually one of these.
+//
+// To distinguish between "normal" device tensors and XlaTensors, the raw
+// pointer data stored in the TensorBuffer is a tagged pointer.
+class XlaTensor {
+ public:
+  // Downcast from a Tensor to an XlaTensor. Return nullptr if the downcast
+  // fails.
+  static XlaTensor* FromTensor(Tensor* tensor);
+  // Downcast from a Tensor to an XlaTensor. Return nullptr if the downcast
+  // fails.
+  static const XlaTensor* FromTensor(const Tensor* tensor);
+
+  // Create a DeviceMemoryBase from a Tensor. The Tensor can be an XlaTensor, in
+  // which case the returned value is shaped_buffer()->root_buffer(), or a
+  // normal Tensor in which case the returned value is
+  // {tensor.tensor_data().data(), tensor.tensor_data().size}.
+  static perftools::gputools::DeviceMemoryBase DeviceMemoryFromTensor(
+      const Tensor& tensor);
+
+  // Assign the internal ShapedBuffer to new memory for the given dtype and
+  // shape. If a ShapedBuffer exists already (has_shaped_buffer() == true), it
+  // is replaced and the managed memory deallocated.
+  Status AllocateShapedBuffer(DataType dtype, const TensorShape& shape,
+                              xla::LocalClient* client, int device_ordinal);
+
+  // Some Tensors can have complex on-device shapes, including tuple shapes. To
+  // manage the memory for these tensors a ShapedBuffer may be required.
+
+  // Return true if this TensorInfo contains a ShapedBuffer.
+  bool has_shaped_buffer() const { return shaped_buffer_ != nullptr; }
+  // Return the contained ShapedBuffer.
+  // REQUIRES: has_shaped_buffer()
+  const xla::ShapedBuffer& shaped_buffer() const {
+    CHECK(has_shaped_buffer());
+    return *shaped_buffer_;
+  }
+  // Mutates the TensorInfo to set the ShapedBuffer.
+  void set_shaped_buffer(
+      std::unique_ptr<xla::ScopedShapedBuffer> shaped_buffer) {
+    shaped_buffer_ = std::move(shaped_buffer);
+  }
+
+  // Some tensors on the device may have known values on the host. We use these
+  // in on-demand mode to avoid re-copying values from the device if we know the
+  // host value already.
+
+  // Return true if this TensorInfo contains a host tensor.
+  bool has_host_tensor() const { return host_tensor_ != nullptr; }
+  // Return the contained host tensor.
+  // REQUIRES: has_host_tensor()
+  const Tensor& host_tensor() const { return *host_tensor_; }
+  // Sets the contained host tensor.
+  void set_host_tensor(const Tensor& tensor) {
+    host_tensor_.reset(new Tensor(tensor));
+  }
+
+  // Convert from a raw pointer to an XlaTensor, removing the pointer tag.
+  static XlaTensor* FromOpaquePointer(void* ptr);
+  // Convert to a raw pointer from an XlaTensor, adding the pointer tag.
+  static void* ToOpaquePointer(XlaTensor* tensor);
+
+ private:
+  // The optional contained ShapedBuffer.
+  std::unique_ptr<xla::ScopedShapedBuffer> shaped_buffer_;
+  // An optional host tensor value.
+  std::unique_ptr<Tensor> host_tensor_;
+};
+
+}  // namespace tensorflow
+
+#endif
diff --git a/tensorflow/compiler/plugin/BUILD b/tensorflow/compiler/plugin/BUILD
index da4bc44c7a75c9f8faf16c537a17a1f2d16d5d61..238fd15166c0b08ee109d6a3888e16c39f87a603 100644
--- a/tensorflow/compiler/plugin/BUILD
+++ b/tensorflow/compiler/plugin/BUILD
@@ -49,17 +49,3 @@ cc_library(
         "//tensorflow/compiler/jit:xla_device",
     ],
 )
-
-#-----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index 85a2adab283c273af607b6b80fb4fd76f8dac2b2..edabdc218a3d8782d524aee01833db3179cafbc9 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -86,7 +86,10 @@ tf_xla_py_test(
     # ArgMax needs CustomCall on CPU, which is not available in normal
     # (not precompiled) TensorFlow. The flag below excludes the CPU
     # backend.
-    disabled_backends = "cpu",
+    disabled_backends = [
+        "cpu",
+        "cpu_ondemand",
+    ],
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
@@ -315,6 +318,8 @@ tf_xla_py_test(
     name = "function_test",
     size = "small",
     srcs = ["function_test.py"],
+    # Functions are not implemented in the on-demand compilation model yet.
+    disabled_backends = "cpu_ondemand",
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
@@ -537,7 +542,6 @@ tf_xla_py_test(
     size = "medium",
     srcs = ["spacetobatch_op_test.py"],
     shard_count = 3,
-    tags = ["notsan"],
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
@@ -551,6 +555,8 @@ tf_xla_py_test(
     name = "stack_ops_test",
     size = "small",
     srcs = ["stack_ops_test.py"],
+    # Stack ops are not implemented in the on-demand compilation model yet.
+    disabled_backends = "cpu_ondemand",
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
@@ -577,6 +583,8 @@ tf_xla_py_test(
     name = "tensor_array_ops_test",
     size = "small",
     srcs = ["tensor_array_ops_test.py"],
+    # TensorArray ops are not implemented in the on-demand compilation model yet.
+    disabled_backends = "cpu_ondemand",
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
@@ -827,17 +835,3 @@ tf_xla_py_test(
         "//tensorflow/python:platform_test",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py
index ba7b9bacd2b794c74409d517a9c05bfbb14a845f..d1d7379c0a32eff4ff96e791dacbe800bbd70b7d 100644
--- a/tensorflow/compiler/tests/binary_ops_test.py
+++ b/tensorflow/compiler/tests/binary_ops_test.py
@@ -190,19 +190,24 @@ class BinaryOpsTest(XLATestCase):
           ],
           equality_test=self.ListsAreClose)
 
-      self._testBinary(
-          gen_nn_ops.sparse_softmax_cross_entropy_with_logits,
-          np.array([[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8],
-                    [0.9, 1.0, 1.1, 1.2]], dtype=dtype),
-          np.array([2, 1, 7], dtype=np.int32),
-          expected=[
-              np.array([1.342536, 1.442536, np.nan], dtype=dtype),
-              np.array([[0.213838, 0.236328, -0.738817, 0.288651],
-                        [0.213838, -0.763672, 0.261183, 0.288651],
-                        [np.nan, np.nan, np.nan, np.nan]],
-                       dtype=dtype),
-          ],
-          equality_test=self.ListsAreClose)
+      # TODO(b/68813416): Fails with bfloat16.
+      if dtype != dtypes.bfloat16.as_numpy_dtype:
+        self._testBinary(
+            gen_nn_ops.sparse_softmax_cross_entropy_with_logits,
+            np.array(
+                [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8],
+                 [0.9, 1.0, 1.1, 1.2]],
+                dtype=dtype),
+            np.array([2, 1, 7], dtype=np.int32),
+            expected=[
+                np.array([1.342536, 1.442536, np.nan], dtype=dtype),
+                np.array(
+                    [[0.213838, 0.236328, -0.738817, 0.288651], [
+                        0.213838, -0.763672, 0.261183, 0.288651
+                    ], [np.nan, np.nan, np.nan, np.nan]],
+                    dtype=dtype),
+            ],
+            equality_test=self.ListsAreClose)
 
   def testIntOps(self):
     for dtype in self.int_types:
@@ -260,12 +265,6 @@ class BinaryOpsTest(XLATestCase):
           np.array([[1], [2]], dtype=dtype),
           dtype(7),
           expected=np.array([[8], [9]], dtype=dtype))
-      self._testBinary(
-          math_ops.add,
-          np.array([0xffffffff, 0xfffffffff, 1, 1], dtype=np.int64),
-          np.array([1, 1, 0xffffffff, 0xfffffffff], dtype=np.int64),
-          expected=np.array(
-              [1 << 32, 1 << 36, 1 << 32, 1 << 36], dtype=np.int64))
 
       self._testBinary(
           math_ops.subtract,
@@ -361,6 +360,12 @@ class BinaryOpsTest(XLATestCase):
           np.array([2, -1], dtype=dtype),
           expected=np.array([[[[3, 1], [5, 3]]]], dtype=dtype))
 
+    self._testBinary(
+        math_ops.add,
+        np.array([0xffffffff, 0xfffffffff, 1, 1], dtype=np.int64),
+        np.array([1, 1, 0xffffffff, 0xfffffffff], dtype=np.int64),
+        expected=np.array([1 << 32, 1 << 36, 1 << 32, 1 << 36], dtype=np.int64))
+
   def testComplexOps(self):
     for dtype in self.complex_types:
       ctypes = {np.complex64: np.float32}
diff --git a/tensorflow/compiler/tests/build_defs.bzl b/tensorflow/compiler/tests/build_defs.bzl
index 0528a5415d579a844e68403ace1bb8982a10a841..a9db1c173d33b0bc44248a4b55c678f7083b5527 100644
--- a/tensorflow/compiler/tests/build_defs.bzl
+++ b/tensorflow/compiler/tests/build_defs.bzl
@@ -56,7 +56,7 @@ def tf_xla_py_test(name, srcs=[], deps=[], tags=[], data=[], main=None,
     elif backend == "gpu":
       backend_args += [
           "--test_device=XLA_GPU",
-          "--types=DT_FLOAT,DT_DOUBLE,DT_INT32,DT_INT64,DT_BOOL,DT_COMPLEX64"
+          "--types=DT_FLOAT,DT_DOUBLE,DT_INT32,DT_INT64,DT_BOOL,DT_COMPLEX64,DT_BFLOAT16"
       ]
       backend_tags += ["requires-gpu-sm35"]
     elif backend in plugins:
@@ -89,4 +89,3 @@ def generate_backend_suites(backends=[]):
     backends = all_backends()
   for backend in backends:
     native.test_suite(name="%s_tests" % backend, tags=["tf_xla_%s" % backend])
-
diff --git a/tensorflow/compiler/tests/cholesky_op_test.py b/tensorflow/compiler/tests/cholesky_op_test.py
index 5010fe5e21d0782e68d4e6d5bf6b4df1b44793a3..1a8989d7c2f617525c301f30fd899a01362310bf 100644
--- a/tensorflow/compiler/tests/cholesky_op_test.py
+++ b/tensorflow/compiler/tests/cholesky_op_test.py
@@ -34,6 +34,13 @@ from tensorflow.python.platform import test
 
 class CholeskyOpTest(XLATestCase):
 
+  # Cholesky defined for float64, float32, complex64, complex128
+  # (https://www.tensorflow.org/api_docs/python/tf/cholesky)
+  @property
+  def float_types(self):
+    return set(super(CholeskyOpTest, self).float_types).intersection(
+        (np.float64, np.float32, np.complex64, np.complex128))
+
   def _verifyCholeskyBase(self, sess, placeholder, x, chol, verification, atol):
     chol_np, verification_np = sess.run([chol, verification], {placeholder: x})
     self.assertAllClose(x, verification_np, atol=atol)
diff --git a/tensorflow/compiler/tests/jit_test.py b/tensorflow/compiler/tests/jit_test.py
index 2d8236e2cbdfafb35626cd582ee39b1f917aec7f..f9d87c2d1cfe5c1a7487e124c971a54ffcfede15 100644
--- a/tensorflow/compiler/tests/jit_test.py
+++ b/tensorflow/compiler/tests/jit_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
 import numpy as np
 
 from tensorflow.contrib.compiler import jit
@@ -436,5 +437,55 @@ class XlaCompilationTest(test.TestCase):
     self.assertTrue(InLabels(labels, "_XlaLaunch"))
 
 
+class ElementWiseFusionTest(test.TestCase):
+
+  # Runs a simple test with the input jit_level and fusion_only flag.
+  def simpleTest(self, arg0, arg1, global_jit_level):
+    config = config_pb2.ConfigProto()
+    config.graph_options.optimizer_options.global_jit_level = global_jit_level
+
+    with session_lib.Session(config=config) as sess:
+      a1 = array_ops.placeholder(dtypes.float32, [2, 2], name="a1")
+      a2 = array_ops.placeholder(dtypes.float32, [2, 2], name="a2")
+      # Two element-wise ops. We need at least two ops since single
+      # element clusters are not passed to XLA in fusion_only mode.
+      a3 = a1 * a2
+      a4 = a3 + a1
+      # A matmul to break XLA clustering.
+      a5 = math_ops.matmul(a4, a1)
+      # Two more element-wise ops.
+      a6 = a5 - a4
+      a7 = a6 + a2
+
+      run_metadata = config_pb2.RunMetadata()
+      output = sess.run(
+          a7, {
+              a1: arg0,
+              a2: arg1
+          },
+          run_metadata=run_metadata,
+          options=config_pb2.RunOptions(
+              trace_level=config_pb2.RunOptions.FULL_TRACE))
+
+      labels = RunMetadataLabels(run_metadata)
+      count = sum("_XlaLaunch(" in x for x in labels)
+
+      return output, count
+
+  def testElementWiseClustering(self):
+    arg0 = np.random.rand(2, 2).astype(np.float32)
+    arg1 = np.random.rand(2, 2).astype(np.float32)
+    os.environ["TF_XLA_FLAGS"] = "--tf_xla_fusion_only=true"
+    tf_op, tf_count = self.simpleTest(arg0, arg1,
+                                      config_pb2.OptimizerOptions.OFF)
+    self.assertEqual(0, tf_count)
+
+    tfef_op, tfef_count = self.simpleTest(arg0, arg1,
+                                          config_pb2.OptimizerOptions.ON_1)
+    self.assertEqual(2, tfef_count)
+
+    self.assertAllClose(tf_op, tfef_op, rtol=1e-1)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py b/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py
index cccb7f5789dce39ef8c3d4b3a7573aaa983b3fbd..5819b2bf2b55b9213a039c0ba82dd0bf1c738b00 100644
--- a/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py
+++ b/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py
@@ -37,6 +37,14 @@ def MakePlaceholder(x):
 
 class MatrixTriangularSolveOpTest(XLATestCase):
 
+  #  MatrixTriangularSolve defined for float64, float32, complex64, complex128
+  # (https://www.tensorflow.org/api_docs/python/tf/matrix_triangular_solve)
+  @property
+  def float_types(self):
+    return set(super(MatrixTriangularSolveOpTest,
+                     self).float_types).intersection(
+                         (np.float64, np.float32, np.complex64, np.complex128))
+
   def _VerifyTriangularSolveBase(self, sess, placeholder_a, placeholder_ca,
                                  placeholder_b, a, clean_a, b, verification,
                                  atol):
diff --git a/tensorflow/compiler/tests/randomized_tests.cc b/tensorflow/compiler/tests/randomized_tests.cc
index e72dd4eea9f127e1df96ab166103c4c16372adb6..e53efc3091d8935e745122af29abd7b8063b1d01 100644
--- a/tensorflow/compiler/tests/randomized_tests.cc
+++ b/tensorflow/compiler/tests/randomized_tests.cc
@@ -83,8 +83,8 @@ string LocalDeviceToFullDeviceName(const string& device) {
   return strings::StrCat("/job:localhost/replica:0/task:0/device:", device);
 }
 
-constexpr std::array<DataType, 4> kAllXlaTypes = {
-    {DT_INT32, DT_FLOAT, DT_BOOL, DT_COMPLEX64}};
+constexpr std::array<DataType, 5> kAllXlaTypes = {
+    {DT_INT32, DT_FLOAT, DT_BOOL, DT_COMPLEX64, DT_INT64}};
 
 // An OpTestBuilder is a graph builder class that takes as input an operator to
 // test, its inputs and attributes, and builds a graph that executes the
diff --git a/tensorflow/compiler/tests/spacetobatch_op_test.py b/tensorflow/compiler/tests/spacetobatch_op_test.py
index 92518aadc4bf5c601cfb4192c093799784b6aa72..60839814931eaeb0b78a20fd1e4f387d241cd56f 100644
--- a/tensorflow/compiler/tests/spacetobatch_op_test.py
+++ b/tensorflow/compiler/tests/spacetobatch_op_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.platform import test
@@ -156,6 +157,12 @@ class SpaceToBatchNDTest(XLATestCase):
     paddings = np.array(paddings).reshape((len(block_shape), 2))
     with self.test_session() as sess, self.test_scope():
       for dtype in self.float_types:
+        # TODO(b/68813416): Skip bfloat16's as the input type for direct is
+        # float32 and results in a mismatch, while making testDirect provide the
+        # correctly typed input results in 'no fill-function for data-type'
+        # error.
+        if dtype == dtypes.bfloat16.as_numpy_dtype:
+          continue
         placeholder = array_ops.placeholder(dtype)
         # outputs = space_to_batch(inputs)
         x_tf = array_ops.space_to_batch_nd(placeholder, block_shape, paddings)
diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py
index 3d3e112f4821ea8e57ea9589a5b4433647ad294b..17149aa1c8edddadc504e916915a70f78abf8002 100644
--- a/tensorflow/compiler/tests/unary_ops_test.py
+++ b/tensorflow/compiler/tests/unary_ops_test.py
@@ -600,6 +600,20 @@ class UnaryOpsTest(XLATestCase):
               src,
               expected=dst)
 
+  def testBitcast(self):
+    self._assertOpOutputMatchesExpected(
+        lambda x: array_ops.bitcast(x, dtypes.int32),
+        np.array([1, 0x3f800000], np.int32),
+        expected=np.array([1, 0x3f800000], np.int32))
+    self._assertOpOutputMatchesExpected(
+        lambda x: array_ops.bitcast(x, dtypes.float32),
+        np.array([1, 0x3f800000], np.int32),
+        expected=np.array([1e-45, 1.0], np.float32))
+    self._assertOpOutputMatchesExpected(
+        lambda x: array_ops.bitcast(x, dtypes.int32),
+        np.array([1e-45, 1.0], np.float32),
+        expected=np.array([1, 0x3f800000], np.int32))
+
   def testInvertPermutation(self):
     self._assertOpOutputMatchesExpected(
         array_ops.invert_permutation,
@@ -779,7 +793,10 @@ class UnaryOpsTest(XLATestCase):
       self._assertSoftplusMatchesExpected([[-2, 0, 8]], dtype)
       self._assertSoftplusMatchesExpected(
           [[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]], dtype)
-      log_eps = np.log(np.finfo(dtype).eps)
+      if dtype == dtypes.bfloat16.as_numpy_dtype:
+        log_eps = np.log(np.finfo(np.float32).eps)
+      else:
+        log_eps = np.log(np.finfo(dtype).eps)
       one = dtype(1)
       ten = dtype(10)
       self._assertSoftplusMatchesExpected([
diff --git a/tensorflow/compiler/tests/variable_ops_test.py b/tensorflow/compiler/tests/variable_ops_test.py
index b08d6ab21e0746558cb3d4818d4c822c45d2e9ee..8ecad00f6e23b3a7746bbb473102ac847bf4cbfd 100644
--- a/tensorflow/compiler/tests/variable_ops_test.py
+++ b/tensorflow/compiler/tests/variable_ops_test.py
@@ -230,7 +230,10 @@ class SliceAssignTest(XLATestCase):
       # shrink shape changes
       checker[1:2, 1] = [66]
       checker[1, 1:2] = [66]
-      checker[1, 1] = 66
+      if dtype != dtypes.bfloat16.as_numpy_dtype:
+        # TODO(b/68813416): valnp call above results in an ndarray and not a
+        # number for bfloat16s.
+        checker[1, 1] = 66
       # newaxis shape changes
       checker[:, None, :] = [[[10, 20, 30]], [[40, 50, 50]]]
       # shrink and newaxis
@@ -243,8 +246,11 @@ class SliceAssignTest(XLATestCase):
 
       # Assign vector to scalar (rank-0) using newaxis
       checker2 = StridedSliceAssignChecker(self, 222, dtype=dtype)
-      checker2[()] = 6  # no indices
-      checker2[...] = 6  # ellipsis
+      if dtype != dtypes.bfloat16.as_numpy_dtype:
+        # TODO(b/68813416): valnp call above results in an ndarray and not a
+        # number for bfloat16s.
+        checker2[()] = 6  # no indices
+        checker2[...] = 6  # ellipsis
       checker2[None] = [6]  # new axis
 
   def testUninitialized(self):
diff --git a/tensorflow/compiler/tests/xla_test.py b/tensorflow/compiler/tests/xla_test.py
index cc778f1c3c0098da5ab933f9b4674890a724d160..e924fe1e61454aefda622a5a46a0e483d26db5c1 100644
--- a/tensorflow/compiler/tests/xla_test.py
+++ b/tensorflow/compiler/tests/xla_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import contextlib
+import os
 import random
 import re
 
@@ -44,6 +45,8 @@ flags.DEFINE_string('test_device', None,
 flags.DEFINE_string('types', None, 'Types to test. Comma-separated list.')
 flags.DEFINE_string('disabled_manifest', None,
                     'Path to a file with a list of tests that should not run.')
+flags.DEFINE_string('tf_xla_flags', None,
+                    'Value to set the TF_XLA_FLAGS environment variable to')
 
 
 class XLATestCase(test.TestCase):
@@ -97,6 +100,8 @@ class XLATestCase(test.TestCase):
       disabled_tests = []
       disabled_method_types = []
       for l in manifest_file.read().splitlines():
+        if not l:
+          continue
         entry = comments_re.sub('', l).strip().split(' ')
         if len(entry) == 1:
           disabled_tests.append(entry[0])
@@ -113,6 +118,9 @@ class XLATestCase(test.TestCase):
             for name in types])
       manifest_file.close()
 
+    if FLAGS.tf_xla_flags is not None:
+      os.environ['TF_XLA_FLAGS'] = FLAGS.tf_xla_flags
+
   @property
   def all_tf_types(self):
     name = '{}.{}'.format(type(self).__name__, self._testMethodName)
diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD
index eb20ca501c80b01c76198e1ad54173f1c601714d..e7daf4e01c45c3705216fce7dd3db5baa0c261fc 100644
--- a/tensorflow/compiler/tf2xla/BUILD
+++ b/tensorflow/compiler/tf2xla/BUILD
@@ -332,6 +332,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
         "//tensorflow/core:tensor_testutil",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
@@ -462,17 +463,3 @@ cc_library(
         "//tensorflow/core:protos_all_cc",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/tf2xla/cc/BUILD b/tensorflow/compiler/tf2xla/cc/BUILD
index 311dddca94c458a60fd00afe5532840e0dbf0437..c30bb9cacd48fb93ac359a6a25699ba6a74183c5 100644
--- a/tensorflow/compiler/tf2xla/cc/BUILD
+++ b/tensorflow/compiler/tf2xla/cc/BUILD
@@ -51,17 +51,3 @@ cc_library(
         "//tensorflow/core:protos_all_cc",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/tf2xla/const_analysis.cc b/tensorflow/compiler/tf2xla/const_analysis.cc
index 6f46532419d3389bafe8c3bf41fa41e8a3e173b7..de1008803d69fefa415c7bdbe6c27a62e625b417 100644
--- a/tensorflow/compiler/tf2xla/const_analysis.cc
+++ b/tensorflow/compiler/tf2xla/const_analysis.cc
@@ -55,8 +55,10 @@ Status BackwardsConstAnalysis(const Graph& g,
         compile_time_const_args->at(index) = true;
         return;
       }
-      for (const Node* pred : node->in_nodes()) {
-        must_be_const.insert(pred);
+      for (const Edge* pred : node->in_edges()) {
+        if (!pred->IsControlEdge()) {
+          must_be_const.insert(pred->src());
+        }
       }
       return;
     }
diff --git a/tensorflow/compiler/tf2xla/const_analysis_test.cc b/tensorflow/compiler/tf2xla/const_analysis_test.cc
index 9d125f8d499863cfaa0e26b5b633ca02914d1b7d..992b12c06db5efc0ae54284d0ea77017c1c79aca 100644
--- a/tensorflow/compiler/tf2xla/const_analysis_test.cc
+++ b/tensorflow/compiler/tf2xla/const_analysis_test.cc
@@ -79,5 +79,24 @@ TEST(ConstAnalysisTest, TopologicalOrder) {
   }
 }
 
+TEST(ConstAnalysisTest, DontFollowControlDependencies) {
+  Scope root = Scope::NewRootScope();
+
+  Output arg0 = ops::_Arg(root.WithOpName("Arg0"), DT_INT32, 0);
+  Output arg1 = ops::_Arg(root.WithOpName("Arg1"), DT_INT32, 1);
+  Output c1 =
+      ops::Const(root.WithOpName("c1").WithControlDependencies(arg0), 1, {1});
+  Output add = ops::Add(root, arg1, c1);
+  Output reshape = ops::Reshape(root, arg1, add);
+
+  Graph graph(OpRegistry::Global());
+  TF_ASSERT_OK(root.ToGraph(&graph));
+
+  std::vector<bool> const_args(2, false);
+  TF_ASSERT_OK(BackwardsConstAnalysis(graph, &const_args));
+
+  EXPECT_EQ(const_args, std::vector<bool>({false, true}));
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index d2fa933cf9c085f92b2f442827a94d72938e4bb2..f1bc7d6af49a09f84ef251eaa1c3d684792d0c1e 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -93,6 +93,7 @@ tf_kernel_library(
         "shape_util.h",
     ],
     deps = [
+        ":if_op",
         ":while_op",
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:xla_compiler",
@@ -154,6 +155,22 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "if_op",
+    srcs = ["if_op.cc"],
+    hdrs = ["if_op.h"],
+    deps = [
+        "//tensorflow/compiler/tf2xla:common",
+        "//tensorflow/compiler/tf2xla:xla_compiler",
+        "//tensorflow/compiler/tf2xla/ops:functional_ops",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
 # Kernels that only work on CPU, because they use XLA custom calls.
 # Only link this when using the CPU backend for XLA.
 tf_kernel_library(
@@ -200,17 +217,3 @@ cc_library(
     ],
     alwayslink = 1,
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc b/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
index a249b1869f547f8e5aa725f9f5cf391b10429928..931175be1111ed5f70afbdf351ee53c59c1367de 100644
--- a/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
@@ -118,30 +118,24 @@ class FusedBatchNormGradOp : public XlaOpKernel {
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
-    xla::ComputationBuilder* b = ctx->builder();
-
-    auto grad_backprop = ctx->Input(0);
-    auto activations = ctx->Input(1);
-    auto scale = ctx->Input(2);
-    auto mean = ctx->Input(3);
-    auto var = ctx->Input(4);
-
-    TensorShape input_shape = ctx->InputShape(0);
-    int feature_index =
-        GetTensorFeatureDimIndex(input_shape.dims(), data_format_);
-
+    xla::ComputationBuilder* const b = ctx->builder();
     DataType input_dtype = ctx->input_type(0);
     DataType scale_dtype = ctx->input_type(2);
-    xla::PrimitiveType input_type;
-    OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(input_dtype, &input_type));
-    xla::PrimitiveType scale_type;
-    OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(scale_dtype, &scale_type));
 
     // TODO(b/69928690): support mixed precision in the XLA batch normalization
     // operators. For now, cast everything to the statistics type (which
     // may be more precise than the input type).
-    grad_backprop = b->ConvertElementType(grad_backprop, scale_type);
-    activations = b->ConvertElementType(activations, scale_type);
+    auto grad_backprop =
+        XlaHelpers::ConvertElementType(b, ctx->Input(0), scale_dtype);
+    auto activations =
+        XlaHelpers::ConvertElementType(b, ctx->Input(1), scale_dtype);
+    auto scale = ctx->Input(2);
+    auto mean = ctx->Input(3);
+    auto var = ctx->Input(4);
+
+    const int input_dims = ctx->InputShape(0).dims();
+    const int feature_index =
+        GetTensorFeatureDimIndex(input_dims, data_format_);
 
     xla::ComputationDataHandle x_backprop;
     xla::ComputationDataHandle scale_backprop;
@@ -156,7 +150,7 @@ class FusedBatchNormGradOp : public XlaOpKernel {
       offset_backprop = b->GetTupleElement(output, 2);
     } else {
       // Reduce over all dimensions except the feature dim.
-      std::vector<int64> reduction_dims(input_shape.dims() - 1);
+      std::vector<int64> reduction_dims(input_dims - 1);
       std::iota(reduction_dims.begin(), reduction_dims.begin() + feature_index,
                 0);
       std::iota(reduction_dims.begin() + feature_index, reduction_dims.end(),
@@ -165,9 +159,14 @@ class FusedBatchNormGradOp : public XlaOpKernel {
       // scale_backprop = y_backprop * ((x - pop_mean) * rsqrt(pop_var +
       // epsilon))
       // x_backprop = y_backprop * (scale * rsqrt(pop_var + epsilon))
-      offset_backprop =
-          b->Reduce(grad_backprop, XlaHelpers::Zero(b, scale_dtype),
-                    *ctx->GetOrCreateAdd(scale_dtype), reduction_dims);
+      const DataType accumulation_type =
+          XlaHelpers::SumAccumulationType(scale_dtype);
+      auto converted =
+          XlaHelpers::ConvertElementType(b, grad_backprop, accumulation_type);
+      auto reduce =
+          b->Reduce(converted, XlaHelpers::Zero(b, accumulation_type),
+                    *ctx->GetOrCreateAdd(accumulation_type), reduction_dims);
+      offset_backprop = XlaHelpers::ConvertElementType(b, reduce, scale_dtype);
 
       // scratch1 = rsqrt(pop_var + epsilon)
       auto neg_half = XlaHelpers::FloatLiteral(b, scale_dtype, -0.5);
@@ -175,17 +174,21 @@ class FusedBatchNormGradOp : public XlaOpKernel {
           b->Pow(b->Add(var, b->ConstantR0<float>(epsilon_)), neg_half);
 
       // scratch2 = sum(y_backprop * (x - mean))
-      auto scratch2 = b->Reduce(
-          b->Mul(grad_backprop, b->Sub(activations, mean, {feature_index})),
-          XlaHelpers::Zero(b, scale_dtype), *ctx->GetOrCreateAdd(scale_dtype),
-          reduction_dims);
+      auto mul =
+          b->Mul(grad_backprop, b->Sub(activations, mean, {feature_index}));
+      converted = XlaHelpers::ConvertElementType(b, mul, accumulation_type);
+      reduce =
+          b->Reduce(converted, XlaHelpers::Zero(b, accumulation_type),
+                    *ctx->GetOrCreateAdd(accumulation_type), reduction_dims);
+      auto scratch2 = XlaHelpers::ConvertElementType(b, reduce, scale_dtype);
 
       x_backprop =
           b->Mul(grad_backprop, b->Mul(scratch1, scale), {feature_index});
       scale_backprop = b->Mul(scratch1, scratch2);
     }
 
-    ctx->SetOutput(0, b->ConvertElementType(x_backprop, input_type));
+    ctx->SetOutput(0,
+                   XlaHelpers::ConvertElementType(b, x_backprop, input_dtype));
     ctx->SetOutput(1, scale_backprop);
     ctx->SetOutput(2, offset_backprop);
     ctx->SetConstantOutput(3, Tensor(scale_dtype, {}));
diff --git a/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc b/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc
index cbade79e85eed10ecb5ead7151ee778c86a0de37..569950c2dfaeb61028049a263a962dfa54a62e09 100644
--- a/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc
@@ -184,9 +184,7 @@ class BatchToSpaceOp : public XlaOpKernel {
  private:
   int block_size_;
 };
-REGISTER_XLA_OP(Name("BatchToSpace")
-                    .CompileTimeConstInput("crops")
-                    .CompileTimeConstInput("block_shape"),
+REGISTER_XLA_OP(Name("BatchToSpace").CompileTimeConstInput("crops"),
                 BatchToSpaceOp);
 
 }  // namespace
diff --git a/tensorflow/compiler/tf2xla/kernels/bias_ops.cc b/tensorflow/compiler/tf2xla/kernels/bias_ops.cc
index c667b4e3e326b776faba49387760abbd582fcc68..ed33b8ed2e823f313a9a7fe220390bc617288405 100644
--- a/tensorflow/compiler/tf2xla/kernels/bias_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bias_ops.cc
@@ -103,10 +103,15 @@ class BiasAddGradOp : public XlaOpKernel {
     std::iota(reduce_dims.begin(), reduce_dims.begin() + feature_dim, 0);
     std::iota(reduce_dims.begin() + feature_dim, reduce_dims.end(),
               feature_dim + 1);
-    xla::ComputationDataHandle result = ctx->builder()->Reduce(
-        ctx->Input(0), XlaHelpers::Zero(ctx->builder(), input_type(0)),
-        *ctx->GetOrCreateAdd(input_type(0)), reduce_dims);
-    ctx->SetOutput(0, result);
+    xla::ComputationBuilder* const b = ctx->builder();
+    const DataType accumulation_type =
+        XlaHelpers::SumAccumulationType(input_type(0));
+    auto converted =
+        XlaHelpers::ConvertElementType(b, ctx->Input(0), accumulation_type);
+    auto reduce =
+        b->Reduce(converted, XlaHelpers::Zero(b, accumulation_type),
+                  *ctx->GetOrCreateAdd(accumulation_type), reduce_dims);
+    ctx->SetOutput(0, XlaHelpers::ConvertElementType(b, reduce, input_type(0)));
   }
 
  private:
diff --git a/tensorflow/compiler/tf2xla/kernels/cast_op.cc b/tensorflow/compiler/tf2xla/kernels/cast_op.cc
index 43a6a747c6bcc441f33f276fde4a66f367d99731..c52b2dcb7e9ef81fd52565dfbda05e33a52ed43a 100644
--- a/tensorflow/compiler/tf2xla/kernels/cast_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/cast_op.cc
@@ -62,5 +62,50 @@ class CastOp : public XlaOpKernel {
 
 REGISTER_XLA_OP(Name("Cast"), CastOp);
 
+class BitcastOp : public XlaOpKernel {
+ public:
+  explicit BitcastOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &src_dtype_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("type", &dst_dtype_));
+    OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(src_dtype_, &src_type_));
+    OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(dst_dtype_, &dst_type_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::ComputationBuilder* builder = ctx->builder();
+    xla::ComputationDataHandle input = ctx->Input(0);
+    xla::ComputationDataHandle output;
+
+    if (src_dtype_ == dst_dtype_) {
+      output = input;
+    } else {
+      // The only complex type in XLA is C64, so error out if the bitcast has a
+      // complex source or destination type and the bitcast is not trivial.
+      OP_REQUIRES(ctx,
+                  !xla::primitive_util::IsComplexType(src_type_) &&
+                      !xla::primitive_util::IsComplexType(dst_type_),
+                  errors::Unimplemented("Complex types not supported."));
+      // XLA bitcast requires that the bit-width of the source and destination
+      // matches, and currently only the simple lowering is performed.
+      OP_REQUIRES(ctx,
+                  xla::primitive_util::BitWidth(src_type_) ==
+                      xla::primitive_util::BitWidth(dst_type_),
+                  errors::Unimplemented(
+                      "Only bitcasts between equally sized types supported."));
+      output = builder->BitcastConvertType(input, dst_type_);
+    }
+
+    ctx->SetOutput(0, output);
+  }
+
+ protected:
+  DataType src_dtype_, dst_dtype_;
+  xla::PrimitiveType src_type_, dst_type_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(BitcastOp);
+};
+
+REGISTER_XLA_OP(Name("Bitcast"), BitcastOp);
+
 }  // anonymous namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
index 81cea6d376d02c956a5257c5475fe5c10b83deb9..c0ee0c9c2ea849a692bee70bba36d32335eed9b5 100644
--- a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
@@ -58,7 +58,7 @@ xla::ComputationDataHandle CreateExpandedZero(
 
 // Create a mask for depthwise convolution that will make a normal convolution
 // produce the same results as a depthwise convolution. For a [2, 2, 3, 2]
-// depthwise filter this returns a [2, 2, 3, 6] tesnsor
+// depthwise filter this returns a [2, 2, 3, 6] tensor
 //   1 1 0 0 0 0   1 1 0 0 0 0
 //   0 0 1 1 0 0   0 0 1 1 0 0
 //   0 0 0 0 1 1   0 0 0 0 1 1
@@ -166,6 +166,10 @@ xla::ComputationDataHandle ContractFilterForDepthwiseBackprop(
       CreateExpandedFilterMask(filter_shape, builder), filter_backprop,
       CreateExpandedZero(filter_shape, dtype, builder));
   return builder->Reshape(
+      // This reduce does not need inputs to be converted with
+      // XlaHelpers::SumAccumulationType() since the ExpandedFilterMask with
+      // ExpandedZero guarantees that only one element is non zero, so there
+      // cannot be accumulated precision error.
       builder->Reduce(masked_expanded_filter, XlaHelpers::Zero(builder, dtype),
                       *ctx->GetOrCreateAdd(dtype),
                       {expanded_filter_shape.dims() - 2}),
diff --git a/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc b/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc
index 453a32c494b42e9922bc35fc526f3306530054fd..99470d70e709ddb5593c5eaae061bb897befc168 100644
--- a/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc
@@ -247,6 +247,8 @@ class FakeQuantWithMinMaxVarsGradOp : public XlaOpKernel {
     const TensorShape gradient_shape = ctx->InputShape(0);
     xla::ComputationDataHandle input = ctx->Input(1);
     const DataType data_type = ctx->input_type(1);
+    const DataType accumulation_type =
+        XlaHelpers::SumAccumulationType(data_type);
     xla::ComputationDataHandle input_min = ctx->Input(2);
     xla::ComputationDataHandle input_max = ctx->Input(3);
 
@@ -265,15 +267,23 @@ class FakeQuantWithMinMaxVarsGradOp : public XlaOpKernel {
     ctx->SetOutput(0, output0);
 
     xla::ComputationDataHandle below_min = b->Lt(input, nudged_input_min);
+    xla::ComputationDataHandle select1 = b->Select(below_min, gradient, zeroes);
+    xla::ComputationDataHandle reduce1 = b->ReduceAll(
+        XlaHelpers::ConvertElementType(b, select1, accumulation_type),
+        XlaHelpers::Zero(b, accumulation_type),
+        *ctx->GetOrCreateAdd(accumulation_type));
     xla::ComputationDataHandle output1 =
-        b->ReduceAll(b->Select(below_min, gradient, zeroes), zero,
-                     *ctx->GetOrCreateAdd(data_type));
+        XlaHelpers::ConvertElementType(b, reduce1, data_type);
     ctx->SetOutput(1, output1);
 
     xla::ComputationDataHandle above_max = b->Gt(input, nudged_input_max);
+    xla::ComputationDataHandle select2 = b->Select(above_max, gradient, zeroes);
+    xla::ComputationDataHandle reduce2 = b->ReduceAll(
+        XlaHelpers::ConvertElementType(b, select2, accumulation_type),
+        XlaHelpers::Zero(b, accumulation_type),
+        *ctx->GetOrCreateAdd(accumulation_type));
     xla::ComputationDataHandle output2 =
-        b->ReduceAll(b->Select(above_max, gradient, zeroes), zero,
-                     *ctx->GetOrCreateAdd(data_type));
+        XlaHelpers::ConvertElementType(b, reduce2, data_type);
     ctx->SetOutput(2, output2);
   }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/if_op.cc b/tensorflow/compiler/tf2xla/kernels/if_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..eefbe55c815d80a608bdf62d454a69d722adb158
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/if_op.cc
@@ -0,0 +1,226 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/kernels/if_op.h"
+
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/xla_context.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+
+namespace tensorflow {
+
+XlaIfOp::XlaIfOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+  const NameAttrList* name_attr;
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("then_branch", &name_attr));
+  then_branch_ = *name_attr;
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("else_branch", &name_attr));
+  else_branch_ = *name_attr;
+
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("Tcond", &cond_type_));
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("Tin", &input_types_));
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("Tout", &output_types_));
+}
+
+// TODO(b/35949885): There is duplication here with the handling of the
+// while_op. Refactor the common code out/rework.
+void XlaIfOp::Compile(XlaOpKernelContext* ctx) {
+  xla::ComputationBuilder* b = ctx->builder();
+
+  OP_REQUIRES(ctx, cond_type_ == DT_BOOL,
+              errors::InvalidArgument(
+                  "Condition argument must be a boolean for XLA compilation"));
+  OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(ctx->InputShape(0)),
+              errors::InvalidArgument(
+                  "Condition argument must be a scalar for XLA compilation"));
+
+  VLOG(1) << "Building If: " << input_types_.size() << " inputs";
+
+  std::vector<xla::ComputationDataHandle> inputs(input_types_.size());
+  std::vector<XlaCompiler::Argument> arguments(input_types_.size());
+  for (int i = 0; i < input_types_.size(); ++i) {
+    XlaCompiler::Argument& arg = arguments[i];
+    DataType type = ctx->input_type(i + 1);
+    if (type == DT_RESOURCE) {
+      XlaResource* resource;
+      OP_REQUIRES_OK(ctx, ctx->GetResourceInput(i + 1, &resource));
+
+      arg.initialized = resource->initialized();
+      arg.kind = XlaCompiler::Argument::kResource;
+      arg.resource_kind = resource->kind();
+      OP_REQUIRES_OK(ctx, resource->Pack(&inputs[i], b));
+
+      arg.type = resource->type();
+      arg.shape = resource->shape();
+      OP_REQUIRES(ctx, arg.initialized,
+                  errors::Unimplemented("Uninitialized arguments: ", arg.name));
+      arg.tensor_array_size = resource->tensor_array_size();
+      for (const auto& gradient : resource->tensor_array_gradients()) {
+        arg.tensor_array_gradients.insert(gradient.first);
+      }
+      arg.name = resource->name();
+      VLOG(2) << "Resource " << resource->name()
+              << " type: " << DataTypeString(arg.type)
+              << " shape: " << arg.shape.DebugString()
+              << " initialized: " << arg.initialized;
+    } else {
+      arg.kind = XlaCompiler::Argument::kParameter;
+      arg.type = input_types_[i];
+      arg.shape = ctx->InputShape(i + 1);
+      inputs[i] = ctx->Input(i + 1);
+      VLOG(2) << "Arg type: " << DataTypeString(arg.type)
+              << " shape: " << arg.shape.DebugString();
+    }
+  }
+
+  // Compile both branches of the conditional.
+  XlaCompiler::CompileOptions options;
+  options.use_tuple_arg = true;
+  options.resolve_compile_time_constants = false;
+  options.return_updated_values_for_all_resources = true;
+  options.is_entry_computation = false;
+  XlaCompiler* compiler = ctx->compiler();
+
+  XlaCompiler::CompilationResult then_result;
+  OP_REQUIRES_OK(ctx, compiler->CompileFunction(options, then_branch_,
+                                                arguments, &then_result));
+  XlaCompiler::CompilationResult else_result;
+  OP_REQUIRES_OK(ctx, compiler->CompileFunction(options, else_branch_,
+                                                arguments, &else_result));
+
+  for (XlaCompiler::CompilationResult* result : {&then_result, &else_result}) {
+    for (const XlaCompiler::ResourceUpdate& update : result->resource_updates) {
+      XlaResource* resource;
+      OP_REQUIRES_OK(ctx,
+                     ctx->GetResourceInput(update.input_index + 1, &resource));
+      XlaCompiler::Argument& arg = arguments[update.input_index];
+
+      // Add any TensorArray gradients touched by the then/else computation to
+      // the enclosing graph.
+      for (const string& grad_source : update.tensor_array_gradients_accessed) {
+        VLOG(5) << "TensorArray " << resource->name() << " accessed gradient "
+                << grad_source;
+        XlaResource* gradient;
+        OP_REQUIRES_OK(ctx, resource->GetOrCreateTensorArrayGradient(
+                                grad_source, b, &gradient));
+      }
+      // Add all of the TensorArray gradients to the argument. For simplicity,
+      // we always pass all known gradients.
+      for (const auto& gradient : resource->tensor_array_gradients()) {
+        arg.tensor_array_gradients.insert(gradient.first);
+      }
+    }
+  }
+
+  // Check that both branches have identical input shapes.
+  OP_REQUIRES(ctx, then_result.xla_input_shapes.size() == 1,
+              errors::FailedPrecondition("Expected one input shape"));
+  xla::Shape then_input_shape = then_result.xla_input_shapes[0];
+  OP_REQUIRES(ctx, xla::ShapeUtil::IsTuple(then_input_shape),
+              errors::FailedPrecondition("Expected tuple shape"));
+  OP_REQUIRES(ctx, else_result.xla_input_shapes.size() == 1,
+              errors::FailedPrecondition("Expected one input shape"));
+  xla::Shape else_input_shape = else_result.xla_input_shapes[0];
+  OP_REQUIRES(ctx, xla::ShapeUtil::IsTuple(else_input_shape),
+              errors::FailedPrecondition("Expected tuple shape"));
+  OP_REQUIRES(ctx,
+              xla::ShapeUtil::Compatible(then_input_shape, else_input_shape),
+              errors::InvalidArgument(
+                  "Input shapes of then and else branches do not match: ",
+                  xla::ShapeUtil::HumanString(then_input_shape), " vs. ",
+                  xla::ShapeUtil::HumanString(else_input_shape)));
+
+  // Check that both branches have identical output shapes.
+  OP_REQUIRES(
+      ctx,
+      xla::ShapeUtil::Compatible(then_result.xla_output_shape,
+                                 else_result.xla_output_shape),
+      errors::InvalidArgument(
+          "Output shapes of then and else branches do not match: ",
+          xla::ShapeUtil::HumanString(then_result.xla_output_shape), " vs. ",
+          xla::ShapeUtil::HumanString(else_result.xla_output_shape)));
+
+  VLOG(2) << "Input shape: " << xla::ShapeUtil::HumanString(then_input_shape);
+  VLOG(2) << "Output shape: "
+          << xla::ShapeUtil::HumanString(then_result.xla_output_shape);
+
+  // We set return_updated_values_for_all_resources=true and we pass the same
+  // arguments to both computations, so the resource update count must match.
+  OP_REQUIRES(ctx,
+              then_result.resource_updates.size() ==
+                  else_result.resource_updates.size(),
+              errors::FailedPrecondition(
+                  "Different number of resources in then and else branch"));
+  for (int i = 0; i < then_result.resource_updates.size(); ++i) {
+    const auto& lhs = then_result.resource_updates[i];
+    const auto& rhs = else_result.resource_updates[i];
+    bool equal = lhs.input_index == rhs.input_index && lhs.shape == rhs.shape &&
+                 lhs.tensor_array_gradients_accessed ==
+                     rhs.tensor_array_gradients_accessed;
+    OP_REQUIRES(
+        ctx, equal,
+        errors::FailedPrecondition(
+            "Mismatch in resource of then and else branch for resource ", i));
+  }
+
+  xla::ComputationDataHandle outputs =
+      b->Conditional(ctx->Input(0), b->Tuple(inputs), *then_result.computation,
+                     b->Tuple(inputs), *else_result.computation);
+  // Sets non-variable outputs.
+  for (int i = 0; i < output_types_.size(); ++i) {
+    if (ctx->input_type(i) != DT_RESOURCE) {
+      xla::ComputationDataHandle output_handle = b->GetTupleElement(outputs, i);
+      if (VLOG_IS_ON(2)) {
+        LOG(INFO) << "Setting output " << i;
+        auto shape_or = b->GetShape(output_handle);
+        if (shape_or.ok()) {
+          LOG(INFO) << "Shape for output " << i << ": "
+                    << xla::ShapeUtil::HumanString(*shape_or.ValueOrDie());
+        } else {
+          LOG(INFO) << "Shape unknown for output " << i;
+        }
+      }
+      ctx->SetOutput(i, output_handle);
+    }
+  }
+
+  // Updates the values of any resource variables modified by the conditional
+  // bodies.
+  for (XlaCompiler::CompilationResult* result : {&then_result, &else_result}) {
+    for (int i = 0; i < result->resource_updates.size(); ++i) {
+      const XlaCompiler::ResourceUpdate& update = result->resource_updates[i];
+      XlaResource* resource;
+      OP_REQUIRES_OK(ctx,
+                     ctx->GetResourceInput(update.input_index + 1, &resource));
+      if (update.modified) {
+        int pos = result->outputs.size() + i;
+        OP_REQUIRES_OK(ctx,
+                       resource->SetFromPack(
+                           arguments[update.input_index].tensor_array_gradients,
+                           b->GetTupleElement(outputs, pos), b));
+      }
+      VLOG(2) << "If variable: pos: " << update.input_index
+              << " name: " << resource->name()
+              << " modified: " << update.modified
+              << " type: " << DataTypeString(update.type)
+              << " shape: " << update.shape.DebugString();
+    }
+  }
+  VLOG(1) << "Done building If";
+}
+
+REGISTER_XLA_OP(Name("XlaIf").AllowResourceTypes(), XlaIfOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/if_op.h b/tensorflow/compiler/tf2xla/kernels/if_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..f9bc98a198a72dcc0594e61971713bf890ce30b6
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/if_op.h
@@ -0,0 +1,59 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2XLA_KERNELS_IF_OP_H_
+#define TENSORFLOW_COMPILER_TF2XLA_KERNELS_IF_OP_H_
+
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
+
+namespace tensorflow {
+
+// This TensorFlow op provides a functional conditional primitive.
+//
+// The outputs of the then/else branches must agree on the number, types, and
+// shapes of the Tensors carried around the two bodies.
+//
+// Computations in then/else bodies may read from and write to resource
+// variables.
+// Resource variables may be passed as arguments to the then/else function's
+// bodies. The XlaCompiler converts resource variable arguments
+// into parameters to the XLA computation and moves them to the end of the
+// parameter list, and by using the `return_updated_values_for_all_variables`
+// we ensure that all variables that appear in the input also appear at the
+// end of the then/else bodies output. This ensures the then/else bodies output
+// signatures match.
+//
+// It is the user's responsibility to ensure that each non-variable _Arg matches
+// the corresponding _Retval.
+class XlaIfOp : public XlaOpKernel {
+ public:
+  explicit XlaIfOp(OpKernelConstruction* ctx);
+
+  void Compile(XlaOpKernelContext* ctx) override;
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(XlaIfOp);
+
+  NameAttrList then_branch_;
+  NameAttrList else_branch_;
+  DataType cond_type_;
+  DataTypeVector input_types_;
+  DataTypeVector output_types_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_TF2XLA_KERNELS_IF_OP_H_
diff --git a/tensorflow/compiler/tf2xla/kernels/image_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_ops.cc
index f22f384256a8ddd8c05de4a1322aba741dc4d7fd..5eeda79a935e8194a596d322b52add27846d378c 100644
--- a/tensorflow/compiler/tf2xla/kernels/image_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/image_ops.cc
@@ -180,9 +180,13 @@ class AdjustContrastOpV2 : public XlaOpKernel {
 
     DataType type = context->input_type(0);
 
-    auto output = b->Reduce(input, /*init_value=*/XlaHelpers::Zero(b, type),
-                            /*computation=*/*context->GetOrCreateAdd(type),
+    const DataType accumulation_type = XlaHelpers::SumAccumulationType(type);
+    auto converted =
+        XlaHelpers::ConvertElementType(b, input, accumulation_type);
+    auto reduce = b->Reduce(converted, XlaHelpers::Zero(b, accumulation_type),
+                            *context->GetOrCreateAdd(accumulation_type),
                             {height_dim, width_dim});
+    auto output = XlaHelpers::ConvertElementType(b, reduce, type);
     output = b->Div(output, XlaHelpers::FloatLiteral(b, type, height * width));
 
     std::vector<int64> broadcast_dims(input_shape.dims() - 2);
diff --git a/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc b/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc
index d096415087e47a73503a06526ab133ac34803c5d..c177f08d9c4687bb13b98a4328bb3960519799c4 100644
--- a/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc
@@ -29,21 +29,22 @@ class L2LossOp : public XlaOpKernel {
   explicit L2LossOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
 
   void Compile(XlaOpKernelContext* ctx) override {
-    const TensorShape input_shape = ctx->InputShape(0);
+    std::vector<int64> dims(ctx->InputShape(0).dims());
+    std::iota(dims.begin(), dims.end(), 0);
 
     DataType dtype = ctx->input_type(0);
-    xla::ComputationBuilder* b = ctx->builder();
-
-    auto zero = XlaHelpers::Zero(b, dtype);
-    auto two = XlaHelpers::IntegerLiteral(b, dtype, 2);
-    const xla::Computation& add = *ctx->GetOrCreateAdd(dtype);
-
-    std::vector<int64> dims(input_shape.dims());
-    std::iota(dims.begin(), dims.end(), 0);
+    xla::ComputationBuilder* const b = ctx->builder();
 
     //  output = sum(t ** 2) / 2
-    auto x = ctx->Input(0);
-    ctx->SetOutput(0, b->Div(b->Reduce(b->Mul(x, x), zero, add, dims), two));
+    const DataType accumulation_type = XlaHelpers::SumAccumulationType(dtype);
+    auto t =
+        XlaHelpers::ConvertElementType(b, ctx->Input(0), accumulation_type);
+    auto square = b->Mul(t, t);
+    auto reduce = b->Reduce(square, XlaHelpers::Zero(b, accumulation_type),
+                            *ctx->GetOrCreateAdd(accumulation_type), dims);
+    auto deconverted = XlaHelpers::ConvertElementType(b, reduce, dtype);
+    auto two = XlaHelpers::IntegerLiteral(b, dtype, 2);
+    ctx->SetOutput(0, b->Div(deconverted, two));
   }
 };
 
diff --git a/tensorflow/compiler/tf2xla/kernels/lrn_ops.cc b/tensorflow/compiler/tf2xla/kernels/lrn_ops.cc
index 759d1a1a2d996d4f5deb1774be7014bb6de30f40..1cfee3070f384af0a7441a9c860c530dd1b42187 100644
--- a/tensorflow/compiler/tf2xla/kernels/lrn_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/lrn_ops.cc
@@ -47,12 +47,17 @@ class LRNOp : public XlaOpKernel {
 
     // We use a window of depth_radius_ * 2 + 1, to account for the current
     // element and a depth_radius_ on either side.
-    auto squared = builder->Mul(input, input);
-    auto sqr_sum = builder->ReduceWindow(
-        squared, XlaHelpers::Zero(builder, input_type(0)),
-        *ctx->GetOrCreateAdd(input_type(0)),
+    auto accumulation_type = XlaHelpers::SumAccumulationType(input_type(0));
+    auto converted =
+        XlaHelpers::ConvertElementType(builder, input, accumulation_type);
+    auto squared = builder->Mul(converted, converted);
+    auto reduce = builder->ReduceWindow(
+        squared, XlaHelpers::Zero(builder, accumulation_type),
+        *ctx->GetOrCreateAdd(accumulation_type),
         /* window_dimensions = */ {1, 1, 1, depth_radius_ * 2 + 1},
         /* window_strides = */ {1, 1, 1, 1}, xla::Padding::kSame);
+    auto sqr_sum =
+        XlaHelpers::ConvertElementType(builder, reduce, input_type(0));
 
     auto scale = builder->Pow(
         builder->Add(builder->ConstantR0<float>(bias_),
@@ -130,12 +135,17 @@ class LRNGradOp : public XlaOpKernel {
     //     dyi *= out_grads[j]
     //     grads[k] += dyi
 
-    auto squared = builder->Mul(in_image, in_image);
-    auto sqr_sum = builder->ReduceWindow(
-        squared, XlaHelpers::Zero(builder, input_type(0)),
-        *ctx->GetOrCreateAdd(input_type(0)),
+    auto accumulation_type = XlaHelpers::SumAccumulationType(input_type(0));
+    auto converted =
+        XlaHelpers::ConvertElementType(builder, in_image, accumulation_type);
+    auto squared = builder->Mul(converted, converted);
+    auto reduce = builder->ReduceWindow(
+        squared, XlaHelpers::Zero(builder, accumulation_type),
+        *ctx->GetOrCreateAdd(accumulation_type),
         /* window_dimensions = */ {1, 1, 1, depth_radius_ * 2 + 1},
         /* window_strides = */ {1, 1, 1, 1}, xla::Padding::kSame);
+    auto sqr_sum =
+        XlaHelpers::ConvertElementType(builder, reduce, input_type(0));
 
     auto norm =
         builder->Add(builder->ConstantR0<float>(bias_),
@@ -146,11 +156,15 @@ class LRNGradOp : public XlaOpKernel {
                      builder->Div(out_image, norm)),
         in_grads);
 
-    auto dy_reduced = builder->ReduceWindow(
-        dy, XlaHelpers::Zero(builder, input_type(0)),
-        *ctx->GetOrCreateAdd(input_type(0)),
+    auto converted_dy =
+        XlaHelpers::ConvertElementType(builder, dy, accumulation_type);
+    auto dy_reduce = builder->ReduceWindow(
+        converted_dy, XlaHelpers::Zero(builder, accumulation_type),
+        *ctx->GetOrCreateAdd(accumulation_type),
         /* window_dimensions = */ {1, 1, 1, depth_radius_ * 2 + 1},
         /* window_strides = */ {1, 1, 1, 1}, xla::Padding::kSame);
+    auto dy_reduced =
+        XlaHelpers::ConvertElementType(builder, dy_reduce, input_type(0));
 
     xla::ComputationDataHandle gradients = builder->Add(
         builder->Mul(in_image, dy_reduced),
diff --git a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
index 086a9491aa93ebfae99f296dd355ae2e322084ec..5f635dd1bc6122cfcac8163baafd95b13f157715 100644
--- a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
@@ -35,8 +35,11 @@ namespace {
 // Superclass of pooling ops.
 class PoolingOp : public XlaOpKernel {
  public:
-  PoolingOp(OpKernelConstruction* ctx, int num_spatial_dims)
-      : XlaOpKernel(ctx), num_spatial_dims_(num_spatial_dims) {
+  PoolingOp(OpKernelConstruction* ctx, int num_spatial_dims,
+            const DataType reduction_type)
+      : XlaOpKernel(ctx),
+        num_spatial_dims_(num_spatial_dims),
+        reduction_type_(reduction_type) {
     if (ctx->num_inputs() == 1) {
       std::vector<int32> ksize_int;
       std::vector<int32> stride_int;
@@ -63,12 +66,10 @@ class PoolingOp : public XlaOpKernel {
   int num_dims() const { return num_spatial_dims_ + 2; }
 
   // Method that builds an initial value to use in reductions.
-  virtual xla::ComputationDataHandle InitValue(xla::ComputationBuilder* b,
-                                               DataType data_type) = 0;
+  virtual xla::ComputationDataHandle InitValue(xla::ComputationBuilder* b) = 0;
 
   // The reduction operation to apply to each window.
-  virtual const xla::Computation* Reduction(XlaOpKernelContext* ctx,
-                                            DataType dtype) = 0;
+  virtual const xla::Computation* Reduction(XlaOpKernelContext* ctx) = 0;
 
   // A post-processing operation to apply on the outputs of the ReduceWindow.
   virtual xla::ComputationDataHandle PostProcessOutput(
@@ -76,9 +77,6 @@ class PoolingOp : public XlaOpKernel {
       DataType dtype, const TensorShape& input_shape) = 0;
 
   void Compile(XlaOpKernelContext* ctx) override {
-    xla::ComputationDataHandle input = ctx->Input(0);
-    const TensorShape input_shape = ctx->InputShape(0);
-
     std::vector<int64> ksize = ksize_;
     std::vector<int64> stride = stride_;
     if (ctx->num_inputs() != 1) {
@@ -106,16 +104,20 @@ class PoolingOp : public XlaOpKernel {
       stride.clear();
       OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(2, &stride));
     }
+    const TensorShape input_shape = ctx->InputShape(0);
     OP_REQUIRES(ctx, input_shape.dims() == num_dims(),
                 errors::InvalidArgument("Input to ", type_string(),
                                         " operator must have ", num_dims(),
                                         " dimensions"));
 
-    const DataType type = input_type(0);
-    xla::ComputationDataHandle pooled = ctx->builder()->ReduceWindow(
-        input, InitValue(ctx->builder(), type), *Reduction(ctx, type), ksize,
-        stride, padding_);
-    ctx->SetOutput(0, PostProcessOutput(ctx, pooled, type, input_shape));
+    xla::ComputationBuilder* const b = ctx->builder();
+    auto input =
+        XlaHelpers::ConvertElementType(b, ctx->Input(0), reduction_type_);
+    auto reduce = ctx->builder()->ReduceWindow(
+        input, InitValue(b), *Reduction(ctx), ksize, stride, padding_);
+    auto pooled = XlaHelpers::ConvertElementType(b, reduce, input_type(0));
+    ctx->SetOutput(0,
+                   PostProcessOutput(ctx, pooled, input_type(0), input_shape));
   }
 
  protected:
@@ -124,21 +126,21 @@ class PoolingOp : public XlaOpKernel {
   std::vector<int64> stride_;
   xla::Padding padding_;
   TensorFormat data_format_ = FORMAT_NHWC;
+  DataType reduction_type_;
 };
 
 class MaxPoolOp : public PoolingOp {
  public:
   MaxPoolOp(OpKernelConstruction* ctx, int num_spatial_dims)
-      : PoolingOp(ctx, /*num_spatial_dims=*/num_spatial_dims) {}
+      : PoolingOp(ctx, /*num_spatial_dims=*/num_spatial_dims,
+                  /*reduction_type=*/ctx->input_type(0)) {}
 
-  xla::ComputationDataHandle InitValue(xla::ComputationBuilder* b,
-                                       DataType data_type) override {
-    return XlaHelpers::MinValue(b, data_type);
+  xla::ComputationDataHandle InitValue(xla::ComputationBuilder* b) override {
+    return XlaHelpers::MinValue(b, reduction_type_);
   }
 
-  const xla::Computation* Reduction(XlaOpKernelContext* ctx,
-                                    DataType dtype) override {
-    return ctx->GetOrCreateMax(dtype);
+  const xla::Computation* Reduction(XlaOpKernelContext* ctx) override {
+    return ctx->GetOrCreateMax(reduction_type_);
   }
 
   xla::ComputationDataHandle PostProcessOutput(
@@ -209,15 +211,17 @@ static xla::ComputationDataHandle AvgPoolDivideByCount(
     }
 
     // Build a matrix of all 1s, with the same width/height as the input.
+    const DataType accumulation_type = XlaHelpers::SumAccumulationType(dtype);
     auto ones = ctx->builder()->Broadcast(
-        XlaHelpers::One(ctx->builder(), dtype), input_dim_sizes);
+        XlaHelpers::One(ctx->builder(), accumulation_type), input_dim_sizes);
 
     // Perform a ReduceWindow with the same window size, strides, and padding
     // to count the number of contributions to each result element.
-    auto counts = ctx->builder()->ReduceWindow(
-        ones, XlaHelpers::Zero(ctx->builder(), dtype),
-        *ctx->GetOrCreateAdd(dtype), window_ksize, window_stride,
+    auto reduce = ctx->builder()->ReduceWindow(
+        ones, XlaHelpers::Zero(ctx->builder(), accumulation_type),
+        *ctx->GetOrCreateAdd(accumulation_type), window_ksize, window_stride,
         xla::Padding::kSame);
+    auto counts = XlaHelpers::ConvertElementType(ctx->builder(), reduce, dtype);
 
     return ctx->builder()->Div(output, counts, window_dims);
   }
@@ -226,16 +230,16 @@ static xla::ComputationDataHandle AvgPoolDivideByCount(
 class AvgPoolOp : public PoolingOp {
  public:
   AvgPoolOp(OpKernelConstruction* ctx, int num_spatial_dims)
-      : PoolingOp(ctx, num_spatial_dims) {}
+      : PoolingOp(ctx, /*num_spatial_dims=*/num_spatial_dims,
+                  /*reduction_type=*/
+                  XlaHelpers::SumAccumulationType(ctx->input_type(0))) {}
 
-  xla::ComputationDataHandle InitValue(xla::ComputationBuilder* b,
-                                       DataType data_type) override {
-    return XlaHelpers::Zero(b, data_type);
+  xla::ComputationDataHandle InitValue(xla::ComputationBuilder* b) override {
+    return XlaHelpers::Zero(b, reduction_type_);
   }
 
-  const xla::Computation* Reduction(XlaOpKernelContext* ctx,
-                                    DataType dtype) override {
-    return ctx->GetOrCreateAdd(dtype);
+  const xla::Computation* Reduction(XlaOpKernelContext* ctx) override {
+    return ctx->GetOrCreateAdd(reduction_type_);
   }
 
   xla::ComputationDataHandle PostProcessOutput(
@@ -455,14 +459,12 @@ class AvgPoolGradOp : public XlaOpKernel {
                  gradients_shape, filter_shape, out_backprop_shape, stride_,
                  padding_, data_format_, &dims));
 
+    // The input gradients are computed by a convolution of the output gradients
+    // and the filter, with some appropriate padding. See the comment at the top
+    // of conv_grad_ops.h for details.
+    xla::ComputationBuilder* const b = ctx->builder();
     auto out_backprop = ctx->Input(1);
-
-    // The input gradients are computed by a convolution of the output
-    // gradients
-    // and the filter, with some appropriate padding. See the comment at
-    // the top of conv_grad_ops.h for details.
-    DataType dtype = input_type(1);
-
+    auto dtype = input_type(1);
     xla::Padding xla_padding =
         (padding_ == VALID) ? xla::Padding::kValid : xla::Padding::kSame;
 
@@ -483,17 +485,18 @@ class AvgPoolGradOp : public XlaOpKernel {
       padding->set_interior_padding(dims.spatial_dims[i].stride - 1);
     }
 
-    auto zero = XlaHelpers::Zero(ctx->builder(), dtype);
-    auto padded_gradients =
-        ctx->builder()->Pad(out_backprop_div, zero, padding_config);
+    auto zero = XlaHelpers::Zero(b, dtype);
+    auto padded_gradients = b->Pad(out_backprop_div, zero, padding_config);
 
     // in_backprop = padded_gradients <conv> ones
     std::vector<int64> ones(num_dims(), 1LL);
-    xla::ComputationDataHandle in_backprop = ctx->builder()->ReduceWindow(
-        padded_gradients, zero, *ctx->GetOrCreateAdd(dtype), ksize_,
+    auto accumulation_type = XlaHelpers::SumAccumulationType(dtype);
+    auto in_backprop = b->ReduceWindow(
+        XlaHelpers::ConvertElementType(b, padded_gradients, accumulation_type),
+        XlaHelpers::Zero(b, accumulation_type),
+        *ctx->GetOrCreateAdd(accumulation_type), ksize_,
         /* window_strides=*/ones, xla::Padding::kValid);
-
-    ctx->SetOutput(0, in_backprop);
+    ctx->SetOutput(0, XlaHelpers::ConvertElementType(b, in_backprop, dtype));
   }
 
  protected:
diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc b/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc
index 03b13b2924f4b81c1017804c91d5ffb81c44ea0b..812d258cd1677e18ef49952044126c76a2f55b19 100644
--- a/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc
@@ -27,7 +27,13 @@ namespace {
 
 class SumOp : public XlaReductionOp {
  public:
-  explicit SumOp(OpKernelConstruction* ctx) : XlaReductionOp(ctx) {}
+  explicit SumOp(OpKernelConstruction* ctx)
+      : XlaReductionOp(ctx,
+                       XlaHelpers::SumAccumulationType(ctx->input_type(0))) {}
+  xla::ComputationDataHandle InitialValue(
+      xla::ComputationBuilder* builder) override {
+    return XlaHelpers::Zero(builder, reduction_type_);
+  }
   void BuildReducer(xla::ComputationBuilder* builder,
                     const xla::ComputationDataHandle& scalar_lhs,
                     const xla::ComputationDataHandle& scalar_rhs) override {
@@ -39,11 +45,13 @@ REGISTER_XLA_OP(Name("Sum").CompileTimeConstInput("reduction_indices"), SumOp);
 
 class ProdOp : public XlaReductionOp {
  public:
-  explicit ProdOp(OpKernelConstruction* ctx) : XlaReductionOp(ctx) {}
+  explicit ProdOp(OpKernelConstruction* ctx)
+      : XlaReductionOp(ctx,
+                       XlaHelpers::SumAccumulationType(ctx->input_type(0))) {}
 
   xla::ComputationDataHandle InitialValue(
       xla::ComputationBuilder* builder) override {
-    return XlaHelpers::One(builder, input_type(0));
+    return XlaHelpers::One(builder, reduction_type_);
   }
 
   void BuildReducer(xla::ComputationBuilder* builder,
@@ -58,13 +66,12 @@ REGISTER_XLA_OP(Name("Prod").CompileTimeConstInput("reduction_indices"),
 
 class MinOp : public XlaReductionOp {
  public:
-  explicit MinOp(OpKernelConstruction* ctx) : XlaReductionOp(ctx) {}
+  explicit MinOp(OpKernelConstruction* ctx)
+      : XlaReductionOp(ctx, ctx->input_type(0)) {}
 
   xla::ComputationDataHandle InitialValue(
       xla::ComputationBuilder* builder) override {
-    xla::PrimitiveType type;
-    TF_CHECK_OK(DataTypeToPrimitiveType(input_type(0), &type));
-    return builder->ConstantLiteral(xla::Literal::MaxValue(type));
+    return XlaHelpers::MaxValue(builder, reduction_type_);
   }
 
   void BuildReducer(xla::ComputationBuilder* builder,
@@ -78,13 +85,12 @@ REGISTER_XLA_OP(Name("Min").CompileTimeConstInput("reduction_indices"), MinOp);
 
 class MaxOp : public XlaReductionOp {
  public:
-  explicit MaxOp(OpKernelConstruction* ctx) : XlaReductionOp(ctx) {}
+  explicit MaxOp(OpKernelConstruction* ctx)
+      : XlaReductionOp(ctx, ctx->input_type(0)) {}
 
   xla::ComputationDataHandle InitialValue(
       xla::ComputationBuilder* builder) override {
-    xla::PrimitiveType type;
-    TF_CHECK_OK(DataTypeToPrimitiveType(input_type(0), &type));
-    return builder->ConstantLiteral(xla::Literal::MinValue(type));
+    return XlaHelpers::MinValue(builder, reduction_type_);
   }
 
   void BuildReducer(xla::ComputationBuilder* builder,
@@ -98,8 +104,14 @@ REGISTER_XLA_OP(Name("Max").CompileTimeConstInput("reduction_indices"), MaxOp);
 
 class MeanOp : public XlaReductionOp {
  public:
-  explicit MeanOp(OpKernelConstruction* ctx) : XlaReductionOp(ctx) {}
+  explicit MeanOp(OpKernelConstruction* ctx)
+      : XlaReductionOp(ctx,
+                       XlaHelpers::SumAccumulationType(ctx->input_type(0))) {}
 
+  xla::ComputationDataHandle InitialValue(
+      xla::ComputationBuilder* builder) override {
+    return XlaHelpers::Zero(builder, reduction_type_);
+  }
   void BuildReducer(xla::ComputationBuilder* builder,
                     const xla::ComputationDataHandle& scalar_lhs,
                     const xla::ComputationDataHandle& scalar_rhs) override {
@@ -121,7 +133,8 @@ REGISTER_XLA_OP(Name("Mean").CompileTimeConstInput("reduction_indices"),
 
 class AllOp : public XlaReductionOp {
  public:
-  explicit AllOp(OpKernelConstruction* ctx) : XlaReductionOp(ctx) {}
+  explicit AllOp(OpKernelConstruction* ctx)
+      : XlaReductionOp(ctx, ctx->input_type(0)) {}
 
   xla::ComputationDataHandle InitialValue(
       xla::ComputationBuilder* builder) override {
@@ -139,7 +152,8 @@ REGISTER_XLA_OP(Name("All").CompileTimeConstInput("reduction_indices"), AllOp);
 
 class AnyOp : public XlaReductionOp {
  public:
-  explicit AnyOp(OpKernelConstruction* ctx) : XlaReductionOp(ctx) {}
+  explicit AnyOp(OpKernelConstruction* ctx)
+      : XlaReductionOp(ctx, ctx->input_type(0)) {}
 
   xla::ComputationDataHandle InitialValue(
       xla::ComputationBuilder* builder) override {
diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops.h b/tensorflow/compiler/tf2xla/kernels/reduction_ops.h
index 9aca6d8fedf92f176b3b7b40c5961d4a2e557a8a..f3181f0dadc2d3f45abb145e009e2663c10490f0 100644
--- a/tensorflow/compiler/tf2xla/kernels/reduction_ops.h
+++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops.h
@@ -33,12 +33,12 @@ namespace tensorflow {
 // xla::ComputationBuilder.
 class XlaReductionOp : public XlaOpKernel {
  public:
-  explicit XlaReductionOp(OpKernelConstruction* ctx);
+  XlaReductionOp(OpKernelConstruction* ctx, DataType reduction_type);
   ~XlaReductionOp() override {}
 
-  // Return the base case for the reduction. Defaults to zero.
+  // Return the base case for the reduction.
   virtual xla::ComputationDataHandle InitialValue(
-      xla::ComputationBuilder* builder);
+      xla::ComputationBuilder* builder) = 0;
 
   // Implement the (scalar,scalar)->scalar lambda that should be
   // applied to each pair of elements to be reduced. The desired
@@ -63,6 +63,9 @@ class XlaReductionOp : public XlaOpKernel {
  private:
   // True if the number of dimensions should be maintained.
   bool keep_dims_;
+
+ protected:
+  DataType reduction_type_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc b/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc
index 4b5d09eb9fd4110cdc4221099ff55767e9132540..64fe765ae9a945c58ea60bc157b1520c83b0d8e7 100644
--- a/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc
@@ -24,19 +24,15 @@ limitations under the License.
 
 namespace tensorflow {
 
-XlaReductionOp::XlaReductionOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+XlaReductionOp::XlaReductionOp(OpKernelConstruction* ctx,
+                               DataType reduction_type)
+    : XlaOpKernel(ctx), reduction_type_(reduction_type) {
   const DataType dt = BaseType(input_type(0));
   OP_REQUIRES_OK(ctx, ctx->MatchSignature({dt, DT_INT32}, {dt}));
 
   OP_REQUIRES_OK(ctx, ctx->GetAttr("keep_dims", &keep_dims_));
 }
 
-// Return the base case for the reduction. Defaults to zero.
-xla::ComputationDataHandle XlaReductionOp::InitialValue(
-    xla::ComputationBuilder* builder) {
-  return XlaHelpers::Zero(builder, input_type(0));
-}
-
 // Unless BuildFinalizer is overridden the reduction has no
 // finalizer.
 xla::ComputationDataHandle XlaReductionOp::BuildFinalizer(
@@ -100,36 +96,26 @@ void XlaReductionOp::Compile(XlaOpKernelContext* ctx) {
 
   string desc = ctx->op_kernel().name();
 
-  // Call virtual method to get the initial value.
-  const xla::ComputationDataHandle initial = InitialValue(ctx->builder());
+  xla::ComputationBuilder* const b = ctx->builder();
   // Construct the builder for the reduction lambda.
-  xla::ComputationBuilder r(ctx->builder()->client(),
-                            strings::StrCat(desc, "-reduction"));
+  xla::ComputationBuilder r(b->client(), strings::StrCat(desc, "-reduction"));
   xla::PrimitiveType type;
-  TF_CHECK_OK(DataTypeToPrimitiveType(input_type(0), &type));
-  // Make two scalar parameters of the desired type for the lambda.
-  xla::ComputationDataHandle rx =
-      r.Parameter(0, xla::ShapeUtil::MakeShape(type, {}), "x");
-  xla::ComputationDataHandle ry =
-      r.Parameter(1, xla::ShapeUtil::MakeShape(type, {}), "y");
-
-  auto data = ctx->Input(0);
+  TF_CHECK_OK(DataTypeToPrimitiveType(reduction_type_, &type));
 
+  auto data = b->ConvertElementType(ctx->Input(0), type);
+  // Call virtual method to get the initial value.
+  auto initial = b->ConvertElementType(InitialValue(b), type);
+  // Make two scalar parameters of the desired type for the lambda.
+  auto rx = r.Parameter(0, xla::ShapeUtil::MakeShape(type, {}), "x");
+  auto ry = r.Parameter(1, xla::ShapeUtil::MakeShape(type, {}), "y");
   // Call virtual method to build the reduction lambda.
   BuildReducer(&r, rx, ry);
   xla::Computation reduction_computation = r.Build().ConsumeValueOrDie();
-  xla::ComputationDataHandle reduce =
-      ctx->builder()->Reduce(data, initial, reduction_computation, xla_axes);
 
-  xla::ComputationDataHandle finalized =
-      BuildFinalizer(ctx->builder(), reduce, num_elements_reduced);
-
-  xla::ComputationDataHandle result;
-  if (keep_dims_) {
-    result = ctx->builder()->Reshape(finalized, final_shape);
-  } else {
-    result = finalized;
-  }
+  auto reduce = b->Reduce(data, initial, reduction_computation, xla_axes);
+  auto deconverted = XlaHelpers::ConvertElementType(b, reduce, input_type(0));
+  auto finalized = BuildFinalizer(b, deconverted, num_elements_reduced);
+  auto result = keep_dims_ ? b->Reshape(finalized, final_shape) : finalized;
   ctx->SetOutput(0, result);
 }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/scan_ops.cc b/tensorflow/compiler/tf2xla/kernels/scan_ops.cc
index ee4a94164c4a43828eb4feedbfa9d1a9e231ef8f..4cfa28a0ce3d7d1f24196ef6ef2775f840b2bcf1 100644
--- a/tensorflow/compiler/tf2xla/kernels/scan_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/scan_ops.cc
@@ -66,7 +66,7 @@ class ScanOp : public XlaOpKernel {
                                 -input_shape.dims(), ", ", input_shape.dims(),
                                 "), but got ", axis));
 
-    DataType dtype = ctx->input_type(0);
+    DataType dtype = XlaHelpers::SumAccumulationType(ctx->input_type(0));
 
     if (input_shape.num_elements() == 0) {
       // Exit early if there is nothing to compute.
@@ -91,7 +91,6 @@ class ScanOp : public XlaOpKernel {
       std::swap(padding[axis].first, padding[axis].second);
     }
 
-    xla::ComputationDataHandle input = ctx->Input(0);
     xla::ComputationDataHandle init;
     const xla::Computation* reducer;
     if (sum_) {
@@ -102,7 +101,10 @@ class ScanOp : public XlaOpKernel {
       reducer = ctx->GetOrCreateMul(dtype);
     }
     auto output = builder->ReduceWindowWithGeneralPadding(
-        ctx->Input(0), init, *reducer, window_dims, window_strides, padding);
+        XlaHelpers::ConvertElementType(builder, ctx->Input(0), dtype), init,
+        *reducer, window_dims, window_strides, padding);
+    output =
+        XlaHelpers::ConvertElementType(builder, output, ctx->input_type(0));
 
     // In exclusive mode, we have computed an extra element containing the sum
     // of all the input elements. Slice off this extra "last" element.
diff --git a/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc b/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc
index 80d6df6c48b0141734dcee1c2a3c413926931feb..498342a98881df0c6ff50007eacc1d5ef6196b57 100644
--- a/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc
@@ -83,7 +83,9 @@ class UnsortedSegmentSum : public XlaOpKernel {
   DataType dtype_;
 };
 
-REGISTER_XLA_OP(Name("UnsortedSegmentSum"), UnsortedSegmentSum);
+REGISTER_XLA_OP(
+    Name("UnsortedSegmentSum").CompileTimeConstInput("num_segments"),
+    UnsortedSegmentSum);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/softmax_op.cc b/tensorflow/compiler/tf2xla/kernels/softmax_op.cc
index 750a4c2dec8154f97f307978b3d8884271292279..463788b8b461c370a8e7ab4d79a94fc0143b8b45 100644
--- a/tensorflow/compiler/tf2xla/kernels/softmax_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/softmax_op.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 
 namespace tensorflow {
 namespace {
@@ -28,7 +29,7 @@ namespace {
 class SoftmaxOp : public XlaOpKernel {
  public:
   explicit SoftmaxOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
-    log_ = StringPiece(type_string()).starts_with("Log");
+    log_ = str_util::StartsWith(type_string(), "Log");
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
@@ -42,9 +43,8 @@ class SoftmaxOp : public XlaOpKernel {
     const DataType type = input_type(0);
     auto logits = ctx->Input(0);
 
-    xla::ComputationBuilder* b = ctx->builder();
+    xla::ComputationBuilder* const b = ctx->builder();
     const xla::Computation& max_func = *ctx->GetOrCreateMax(type);
-    const xla::Computation& add_func = *ctx->GetOrCreateAdd(type);
 
     // Find the max in each batch, resulting in a tensor of shape [batch]
     auto logits_max =
@@ -52,21 +52,20 @@ class SoftmaxOp : public XlaOpKernel {
     // Subtract the max in batch b from every element in batch b. Broadcasts
     // along the batch dimension.
     auto shifted_logits = b->Sub(logits, logits_max, {kBatchDim});
-    xla::ComputationDataHandle softmax;
-    if (log_) {
-      // softmax = shifted_logits - log(sum(exp(shifted_logits)))
-      auto log_sum_exp =
-          b->Log(b->Reduce(b->Exp(shifted_logits), XlaHelpers::Zero(b, type),
-                           add_func, {kClassDim}));
-      softmax = b->Sub(shifted_logits, log_sum_exp, {kBatchDim});
-    } else {
-      // softmax = exp(shifted_logits) / sum(exp(shifted_logits))
-      auto exp_shifted = b->Exp(shifted_logits);
-      auto sum_exp = b->Reduce(exp_shifted, XlaHelpers::Zero(b, type), add_func,
-                               {kClassDim});
-      softmax = b->Div(exp_shifted, sum_exp, {kBatchDim});
-    }
-
+    auto exp_shifted = b->Exp(shifted_logits);
+    const DataType accumulation_type = XlaHelpers::SumAccumulationType(type);
+    auto converted =
+        XlaHelpers::ConvertElementType(b, exp_shifted, accumulation_type);
+    auto reduce =
+        b->Reduce(converted, XlaHelpers::Zero(b, accumulation_type),
+                  *ctx->GetOrCreateAdd(accumulation_type), {kClassDim});
+    auto sum = XlaHelpers::ConvertElementType(b, reduce, type);
+    auto softmax =
+        log_
+            // softmax = shifted_logits - log(sum(exp(shifted_logits)))
+            ? b->Sub(shifted_logits, b->Log(sum), {kBatchDim})
+            // softmax = exp(shifted_logits) / sum(exp(shifted_logits))
+            : b->Div(exp_shifted, sum, {kBatchDim});
     ctx->SetOutput(0, softmax);
   }
 
@@ -82,7 +81,6 @@ CrossEntropyWithLogits(XlaOpKernelContext* ctx, DataType type,
                        const xla::ComputationDataHandle& logits,
                        const xla::ComputationDataHandle& labels) {
   const xla::Computation& max_func = *ctx->GetOrCreateMax(type);
-  const xla::Computation& add_func = *ctx->GetOrCreateAdd(type);
 
   const int kBatchDim = 0;
   const int kClassDim = 1;
@@ -100,8 +98,12 @@ CrossEntropyWithLogits(XlaOpKernelContext* ctx, DataType type,
   auto exp_shifted_logits = b->Exp(shifted_logits);
 
   // sum_{class} (exp(logits - max_logits))
-  auto sum_exp = b->Reduce(exp_shifted_logits, XlaHelpers::Zero(b, type),
-                           add_func, {kClassDim});
+  const DataType accumulation_type = XlaHelpers::SumAccumulationType(type);
+  auto converted =
+      XlaHelpers::ConvertElementType(b, exp_shifted_logits, accumulation_type);
+  auto reduce = b->Reduce(converted, XlaHelpers::Zero(b, accumulation_type),
+                          *ctx->GetOrCreateAdd(accumulation_type), {kClassDim});
+  auto sum_exp = XlaHelpers::ConvertElementType(b, reduce, type);
 
   // log(sum(exp(logits - max_logits)))
   auto log_sum_exp = b->Log(sum_exp);
@@ -110,9 +112,13 @@ CrossEntropyWithLogits(XlaOpKernelContext* ctx, DataType type,
   //    ((logits - max_logits) - log(sum(exp(logits - max_logits)))))
   // along classes
   // (The subtraction broadcasts along the batch dimension.)
-  xla::ComputationDataHandle loss = b->Reduce(
-      b->Mul(b->Neg(labels), b->Sub(shifted_logits, log_sum_exp, {kBatchDim})),
-      XlaHelpers::Zero(b, type), add_func, {kClassDim});
+  auto sub = b->Sub(shifted_logits, log_sum_exp, {kBatchDim});
+  auto mul = b->Mul(b->Neg(labels), sub);
+  auto sum =
+      b->Reduce(XlaHelpers::ConvertElementType(b, mul, accumulation_type),
+                XlaHelpers::Zero(b, accumulation_type),
+                *ctx->GetOrCreateAdd(accumulation_type), {kClassDim});
+  auto loss = XlaHelpers::ConvertElementType(b, sum, type);
 
   // backprop: prob - labels, where
   //   prob = exp(logits - max_logits) / sum(exp(logits - max_logits))
diff --git a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
index b10880de77e6b9811008076cd4a959c284e558d1..5bb773d97fc5ce90dabceeefd5c29d916597f5ff 100644
--- a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
@@ -239,6 +239,7 @@ class StatelessRandomUniformOp : public XlaOpKernel {
 
 // TODO(phawkins): generalize to non-float, non-int32 seed types.
 REGISTER_XLA_OP(Name("StatelessRandomUniform")
+                    .CompileTimeConstInput("shape")
                     .TypeConstraint("dtype", DT_FLOAT)
                     .TypeConstraint("Tseed", DT_INT32),
                 StatelessRandomUniformOp);
@@ -272,6 +273,7 @@ class StatelessRandomNormalOp : public XlaOpKernel {
 
 // TODO(phawkins): generalize to non-float, non-int32 seed types.
 REGISTER_XLA_OP(Name("StatelessRandomNormal")
+                    .CompileTimeConstInput("shape")
                     .TypeConstraint("dtype", DT_FLOAT)
                     .TypeConstraint("Tseed", DT_INT32),
                 StatelessRandomNormalOp);
diff --git a/tensorflow/compiler/tf2xla/lib/BUILD b/tensorflow/compiler/tf2xla/lib/BUILD
index 488fda74bf7b5c1d66f8d706a1be3cc1fc29a492..344773c8c5f8e1a552d585d0317c62c56d9f9d46 100644
--- a/tensorflow/compiler/tf2xla/lib/BUILD
+++ b/tensorflow/compiler/tf2xla/lib/BUILD
@@ -140,17 +140,3 @@ cc_library(
         "//tensorflow/core:lib",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/tf2xla/ops/BUILD b/tensorflow/compiler/tf2xla/ops/BUILD
index 98f72b3792eb147f5a1847c5e1ecef18bccbca5f..aeb743a6634673f2e8c4dee9ae1e5017944aae2c 100644
--- a/tensorflow/compiler/tf2xla/ops/BUILD
+++ b/tensorflow/compiler/tf2xla/ops/BUILD
@@ -39,17 +39,3 @@ tf_gen_op_wrapper_py(
         ":sendrecv_ops",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/tf2xla/sharding_util.cc b/tensorflow/compiler/tf2xla/sharding_util.cc
index 1a0e09758f7cc6714793300c6ece14093a8ad246..5759c72af301785f3ca1110b58eeb2fe7dead713 100644
--- a/tensorflow/compiler/tf2xla/sharding_util.cc
+++ b/tensorflow/compiler/tf2xla/sharding_util.cc
@@ -16,6 +16,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/util/device_name_utils.h"
 
@@ -65,8 +66,8 @@ ParseShardingFromDevice(
   if (explicit_sharding.has_value()) {
     return explicit_sharding;
   } else if (!parsed_device.has_type || !parsed_device.has_id ||
-             !StringPiece(parsed_device.type)
-                  .contains(kDeviceSuffixReplicatedCore)) {
+             !str_util::StrContains(parsed_device.type,
+                                    kDeviceSuffixReplicatedCore)) {
     return tensorflow::gtl::optional<xla::OpSharding>();
   } else {
     const int core = parsed_device.id;
diff --git a/tensorflow/compiler/tf2xla/tf2xla_util_test.cc b/tensorflow/compiler/tf2xla/tf2xla_util_test.cc
index ed10d80609641b090cf78bf2e17364fe2fa89c31..ae51446204baf14dc03fc6305641048dbf3872b0 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_util_test.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_util_test.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -33,7 +34,7 @@ namespace {
 
 void ExpectErrorContains(const Status& status, StringPiece str) {
   EXPECT_NE(Status::OK(), status);
-  EXPECT_TRUE(StringPiece(status.error_message()).contains(str))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(), str))
       << "expected error: " << status.error_message() << " to contain: " << str;
 }
 
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc
index 0dc5118c9c659cc1529515f34c9eb43fd07a69e8..86263d847ae02d50e70dafb0129b2664c522f2a3 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler.cc
@@ -600,6 +600,48 @@ Status XlaCompiler::BuildArguments(
   return Status::OK();
 }
 
+Status XlaCompiler::CompileSingleOp(
+    const XlaCompiler::CompileOptions& options, string const& name,
+    OpKernelContext* ctx, const std::vector<XlaCompiler::Argument>& args,
+    CompilationResult* result) {
+  // TODO(b/74182462): We implement this by creating a new dummy Graph including
+  // _Arg nodes, and let CompileGraph walk it. This could be optimized.
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+
+  Status status;
+  // First create the actual node we care about computing.
+  Node* main_node = graph->AddNode(ctx->op_kernel().def(), &status);
+  TF_RETURN_IF_ERROR(status);
+
+  // Create dummy _Arg nodes. Link these to `node` and also via a control
+  // dependency edge to the _SOURCE node.
+  for (int64 i = 0; i < ctx->num_inputs(); ++i) {
+    Node* node;
+    string name = strings::StrCat(ctx->op_kernel().name(), "_", i, "_arg");
+    Status status = NodeBuilder(name, "_Arg")
+                        .ControlInput(graph->source_node())
+                        .Attr("T", ctx->input_dtype(i))
+                        .Attr("index", i)
+                        .Finalize(graph.get(), &node);
+    TF_RETURN_IF_ERROR(status);
+    graph->AddEdge(node, 0, main_node, i);
+  }
+
+  // Similarly with return values, create dummy _Retval nodes fed by `node`.
+  for (int64 i = 0; i < ctx->num_outputs(); ++i) {
+    Node* node;
+    string name = strings::StrCat(ctx->op_kernel().name(), "_", i, "_retval");
+    Status status = NodeBuilder(name, "_Retval")
+                        .Input(main_node, i)
+                        .Attr("T", ctx->expected_output_dtype(i))
+                        .Attr("index", i)
+                        .Finalize(graph.get(), &node);
+    TF_RETURN_IF_ERROR(status);
+  }
+
+  return CompileGraph(options, name, std::move(graph), args, result);
+}
+
 Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
                                  string const& name,
                                  std::unique_ptr<Graph> graph,
@@ -718,8 +760,8 @@ Status XlaCompiler::GetChannelHandle(const string& key,
 
 namespace {
 
-void SetTransfer(const string& key, const std::vector<DataType>& types,
-                 const std::vector<TensorShape>& shapes,
+void SetTransfer(const string& key, gtl::ArraySlice<DataType> types,
+                 gtl::ArraySlice<TensorShape> shapes,
                  tf2xla::HostTransferMetadata* transfer) {
   transfer->set_key(key);
   CHECK(types.size() == shapes.size());
@@ -733,8 +775,8 @@ void SetTransfer(const string& key, const std::vector<DataType>& types,
 }  // namespace
 
 Status XlaCompiler::SetDeviceToHostMetadata(
-    const string& key, const std::vector<DataType>& types,
-    const std::vector<TensorShape>& shapes) {
+    const string& key, gtl::ArraySlice<DataType> types,
+    gtl::ArraySlice<TensorShape> shapes) {
   if (host_compute_sends_.find(key) != host_compute_sends_.end()) {
     return errors::InvalidArgument(
         "Duplicate calls to SetDeviceToHostMetadata with key ", key);
@@ -760,8 +802,8 @@ Status XlaCompiler::GetDeviceToHostShapes(
 }
 
 Status XlaCompiler::SetHostToDeviceMetadata(
-    const string& key, const std::vector<DataType>& types,
-    const std::vector<TensorShape>& shapes) {
+    const string& key, gtl::ArraySlice<DataType> types,
+    gtl::ArraySlice<TensorShape> shapes) {
   if (host_compute_recvs_.find(key) != host_compute_sends_.end()) {
     return errors::InvalidArgument(
         "Duplicate calls to SetHostToDeviceMetadata with key ", key);
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h
index a70d2637e0b578ddb57dc990cd9550798e675e1d..a6747bbe72e161b2ece55697825cce0e71145a5c 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.h
+++ b/tensorflow/compiler/tf2xla/xla_compiler.h
@@ -289,6 +289,14 @@ class XlaCompiler {
                       const std::vector<Argument>& args,
                       CompilationResult* result);
 
+  // Compiles a single Op, given by an OpKernelContext, into an
+  // xla::Computation. Similar to CompileFunction but takes a single Op as
+  // input.
+  Status CompileSingleOp(const CompileOptions& options, string const& name,
+                         OpKernelContext* ctx,
+                         const std::vector<Argument>& args,
+                         CompilationResult* result);
+
   // Returns the shape of the XLA parameter for an argument 'arg'.
   // See the class comment for more details about the argument passing
   // convention.
@@ -304,8 +312,8 @@ class XlaCompiler {
   // Sets the shapes and types for the device to host transfer associated with
   // 'key'.
   Status SetDeviceToHostMetadata(const string& key,
-                                 const std::vector<DataType>& types,
-                                 const std::vector<TensorShape>& shapes);
+                                 gtl::ArraySlice<DataType> types,
+                                 gtl::ArraySlice<TensorShape> shapes);
 
   // Gets the shapes the device to host transfer associated with 'key'.
   Status GetDeviceToHostShapes(const string& key,
@@ -314,8 +322,8 @@ class XlaCompiler {
   // Sets the shapes and types for the host to device transfer associated with
   // 'key'.
   Status SetHostToDeviceMetadata(const string& key,
-                                 const std::vector<DataType>& types,
-                                 const std::vector<TensorShape>& shapes);
+                                 gtl::ArraySlice<DataType> types,
+                                 gtl::ArraySlice<TensorShape> shapes);
 
   const Options& options() const { return options_; }
   xla::Client* client() const { return options_.client; }
diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
index a18eeacd41808884fac9ec5d617cb0d274ea27d8..096dc7160bfc0a3a751f33e7d646471ebea56070 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
@@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/version.h"
 
@@ -257,10 +258,10 @@ TEST_F(XlaCompilerTest, HasSaneErrorOnNonCompileTimeConstantInputToReshape) {
                             std::move(graph), args, &result);
   EXPECT_FALSE(status.ok());
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("depends on a parameter"))
+      str_util::StrContains(status.error_message(), "depends on a parameter"))
       << status.error_message();
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("[[Node: C = Reshape"))
+      str_util::StrContains(status.error_message(), "[[Node: C = Reshape"))
       << status.error_message();
 }
 
@@ -597,7 +598,8 @@ TEST_F(XlaCompilerTest, UndefinedFunctionFails) {
       compiler.CompileFunction(XlaCompiler::CompileOptions(), name_attr,
                                /*args=*/{}, &result);
   EXPECT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message()).contains("is not defined."))
+  EXPECT_TRUE(str_util::StrContains(StringPiece(status.error_message()),
+                                    "is not defined."))
       << status.error_message();
 }
 
@@ -676,11 +678,12 @@ TEST_F(XlaCompilerTest, LocalFunctionWithWrongArgumentsFail) {
 
   ASSERT_FALSE(status.ok());
   // Flib lookup failure.
-  EXPECT_TRUE(StringPiece(status.error_message()).contains("is not defined."))
+  EXPECT_TRUE(str_util::StrContains(StringPiece(status.error_message()),
+                                    "is not defined."))
       << status.error_message();
   // Local flib lookup failure.
-  EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Attr T is not found"))
+  EXPECT_TRUE(str_util::StrContains(StringPiece(status.error_message()),
+                                    "Attr T is not found"))
       << status.error_message();
 }
 
diff --git a/tensorflow/compiler/tf2xla/xla_helpers.cc b/tensorflow/compiler/tf2xla/xla_helpers.cc
index f048662953e20b2a612271e2daeef6e370c4822a..3b0b2f06ebae4af918cbe6fb8a384004c1858998 100644
--- a/tensorflow/compiler/tf2xla/xla_helpers.cc
+++ b/tensorflow/compiler/tf2xla/xla_helpers.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 
 namespace tensorflow {
@@ -273,4 +274,20 @@ Status XlaHelpers::OneHot(xla::ComputationBuilder* builder, int64 depth,
   return Status::OK();
 }
 
+DataType XlaHelpers::SumAccumulationType(const DataType& dtype) {
+  if (dtype == DT_BFLOAT16) {
+    return DT_FLOAT;
+  }
+  return dtype;
+}
+
+xla::ComputationDataHandle XlaHelpers::ConvertElementType(
+    xla::ComputationBuilder* const builder,
+    const xla::ComputationDataHandle& operand,
+    const DataType new_element_type) {
+  xla::PrimitiveType convert_to;
+  TF_CHECK_OK(DataTypeToPrimitiveType(new_element_type, &convert_to));
+  return builder->ConvertElementType(operand, convert_to);
+}
+
 }  // end namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/xla_helpers.h b/tensorflow/compiler/tf2xla/xla_helpers.h
index 2a027db4c839c917f3a7acd27184792d157356bf..68ab93b64a5fa87ad99e0f44d84f6473fc8bbebd 100644
--- a/tensorflow/compiler/tf2xla/xla_helpers.h
+++ b/tensorflow/compiler/tf2xla/xla_helpers.h
@@ -107,6 +107,18 @@ class XlaHelpers {
                        const xla::ComputationDataHandle& on_value,
                        const xla::ComputationDataHandle& off_value,
                        xla::ComputationDataHandle* one_hot);
+
+  // Certain DataTypes should use increased precision DataTypes when performing
+  // reductions.  This function remaps a given DataType to a higher precision
+  // DataType if needed.
+  static DataType SumAccumulationType(const DataType& dtype);
+
+  // A helper for creating a ConvertElementType xla op given a DataType rather
+  // than the xla::PrimitiveType.
+  static xla::ComputationDataHandle ConvertElementType(
+      xla::ComputationBuilder* const builder,
+      const xla::ComputationDataHandle& operand,
+      const DataType new_element_type);
 };
 
 }  // end namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.h b/tensorflow/compiler/tf2xla/xla_op_registry.h
index ff7453194af3a85bded86a5ce298f8779422dccb..e255b01dd7fdcb095c7992d4352d2d9bb7d36ac3 100644
--- a/tensorflow/compiler/tf2xla/xla_op_registry.h
+++ b/tensorflow/compiler/tf2xla/xla_op_registry.h
@@ -51,13 +51,13 @@ constexpr std::array<DataType, 9> kNumericTypes = {
     {DT_UINT32, DT_UINT64, DT_INT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE,
      DT_COMPLEX64, DT_BFLOAT16}};
 
-constexpr std::array<DataType, 8> kCpuAllTypes = {
-    {DT_UINT32, DT_UINT64, DT_INT32, DT_INT64, DT_FLOAT, DT_DOUBLE,
+constexpr std::array<DataType, 9> kCpuAllTypes = {
+    {DT_UINT32, DT_UINT64, DT_INT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE,
      DT_COMPLEX64, DT_BOOL}};
 
-constexpr std::array<DataType, 8> kGpuAllTypes = {
-    {DT_UINT32, DT_UINT64, DT_INT32, DT_INT64, DT_FLOAT, DT_DOUBLE,
-     DT_COMPLEX64, DT_BOOL}};
+constexpr std::array<DataType, 10> kGpuAllTypes = {
+    {DT_UINT32, DT_UINT64, DT_INT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE,
+     DT_COMPLEX64, DT_BOOL, DT_BFLOAT16}};
 
 // Class that manages registrations of operators and devices for the XLA JIT.
 // Not thread-safe.
diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD
index cd13db4d300bb5bba21a734173b6afb9223539d8..751777222fcc7ec073958349aa2677d5b4e6757d 100644
--- a/tensorflow/compiler/xla/BUILD
+++ b/tensorflow/compiler/xla/BUILD
@@ -654,18 +654,6 @@ tf_cc_test(
 
 # -----------------------------------------------------------------------------
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 # This is a headers target that extra XLA devices can use to prevent circular dependencies.  Devices that are compiled as separate shared objects can also use it to prevent linking of library code.
 cc_header_only_library(
     name = "xla_headers_lib",
diff --git a/tensorflow/compiler/xla/array.h b/tensorflow/compiler/xla/array.h
index 24b58bec11bd8d8b5c79ac84c5f43c509644b51d..ea75ad32d5df7bbadd37e89de6144b264ab6d5d1 100644
--- a/tensorflow/compiler/xla/array.h
+++ b/tensorflow/compiler/xla/array.h
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/status.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/core/bits.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD
index 02356699a25e47be50eb15872df4c9c302fc289b..a299c2afd45aa6b785964b8a8e1400ddf54083a4 100644
--- a/tensorflow/compiler/xla/client/BUILD
+++ b/tensorflow/compiler/xla/client/BUILD
@@ -74,6 +74,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla:xla_proto",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/service:session_proto",
         "//tensorflow/core:lib",
@@ -213,17 +214,3 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/xla/client/client.cc b/tensorflow/compiler/xla/client/client.cc
index d15ccb0c28522c647617153aaa8e738d029dfaba..3f45167fcb77cd3085c9645fba0b2901329c4bb2 100644
--- a/tensorflow/compiler/xla/client/client.cc
+++ b/tensorflow/compiler/xla/client/client.cc
@@ -177,6 +177,22 @@ StatusOr<std::unique_ptr<Literal>> Client::ExecuteAndTransfer(
   return Transfer(*data, shape_with_output_layout);
 }
 
+StatusOr<std::unique_ptr<Literal>> Client::ExecuteAndTransfer(
+    const XlaComputation& computation,
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments,
+    const ExecutionOptions* execution_options,
+    ExecutionProfile* execution_profile) {
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<GlobalData> data,
+      Execute(computation, arguments, execution_options, execution_profile));
+
+  const Shape* shape_with_output_layout = nullptr;
+  if (execution_options && execution_options->has_shape_with_output_layout()) {
+    shape_with_output_layout = &execution_options->shape_with_output_layout();
+  }
+  return Transfer(*data, shape_with_output_layout);
+}
+
 StatusOr<Computation> Client::LoadSnapshot(const SessionModule& module) {
   LoadComputationSnapshotRequest request;
   *request.mutable_module() = module;
@@ -231,6 +247,46 @@ StatusOr<std::unique_ptr<GlobalData>> Client::Execute(
   return MakeUnique<GlobalData>(stub_, response.output());
 }
 
+StatusOr<std::unique_ptr<GlobalData>> Client::Execute(
+    const XlaComputation& computation,
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments,
+    const ExecutionOptions* execution_options,
+    ExecutionProfile* execution_profile) {
+  ExecuteGraphRequest request;
+  *request.mutable_computation() = computation.proto();
+
+  if (execution_options == nullptr) {
+    *request.mutable_execution_options() = CreateDefaultExecutionOptions();
+  } else {
+    *request.mutable_execution_options() = *execution_options;
+  }
+  for (GlobalData* argument : arguments) {
+    CHECK(argument != nullptr) << "Argument pointers must not be null.";
+    *request.add_arguments() = argument->handle();
+  }
+
+  ExecuteResponse response;
+  VLOG(1) << "making execute request: " << request.ShortDebugString();
+  Status s = stub_->ExecuteGraph(&request, &response);
+  VLOG(1) << "done with request";
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (execution_profile != nullptr) {
+    *execution_profile = response.profile();
+    if (VLOG_IS_ON(1)) {
+      TF_ASSIGN_OR_RETURN(
+          auto execution_stats,
+          ExecutionStatsAsString(computation, response.profile()));
+      VLOG(1) << execution_stats;
+    }
+  }
+
+  return MakeUnique<GlobalData>(stub_, response.output());
+}
+
 StatusOr<std::vector<std::unique_ptr<GlobalData>>> Client::ExecuteParallel(
     tensorflow::gtl::ArraySlice<ComputationInstance> computations) {
   ExecuteParallelRequest request;
@@ -266,6 +322,42 @@ StatusOr<std::vector<std::unique_ptr<GlobalData>>> Client::ExecuteParallel(
   return std::move(outputs);
 }
 
+StatusOr<std::vector<std::unique_ptr<GlobalData>>> Client::ExecuteParallel(
+    tensorflow::gtl::ArraySlice<XlaComputationInstance> computations) {
+  ExecuteGraphParallelRequest request;
+
+  for (const XlaComputationInstance& computation : computations) {
+    ExecuteGraphRequest single_request;
+    *single_request.mutable_computation() = computation.computation.proto();
+    for (GlobalData* argument : computation.arguments) {
+      *single_request.add_arguments() = argument->handle();
+    }
+    *single_request.mutable_execution_options() = computation.execution_options;
+    *request.add_requests() = single_request;
+  }
+
+  ExecuteParallelResponse response;
+  VLOG(1) << "making execute-graph-parallel request: "
+          << request.ShortDebugString();
+  tensorflow::Status s = stub_->ExecuteGraphParallel(&request, &response);
+  VLOG(1) << "done with request";
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  std::vector<std::unique_ptr<GlobalData>> outputs;
+  for (size_t i = 0; i < computations.size(); ++i) {
+    outputs.push_back(
+        MakeUnique<GlobalData>(stub_, response.responses(i).output()));
+    if (computations[i].execution_profile != nullptr) {
+      *computations[i].execution_profile = response.responses(i).profile();
+    }
+  }
+
+  return std::move(outputs);
+}
+
 StatusOr<std::vector<DeviceHandle>> Client::GetDeviceHandles(
     int64 device_count) {
   if (device_count < 1) {
@@ -342,6 +434,27 @@ StatusOr<ComputationStats> Client::GetComputationStats(
   return response.stats();
 }
 
+StatusOr<ComputationStats> Client::GetComputationStats(
+    const XlaComputation& computation,
+    const DebugOptions& debug_options) const {
+  ComputationGraphStatsRequest request;
+
+  // TODO(b/74197823): Find a way to avoid the copy of the hlo proto.
+  *request.mutable_computation() = computation.proto();
+  *request.mutable_debug_options() = debug_options;
+  ComputationStatsResponse response;
+
+  VLOG(1) << "making computation graph stats request";
+  Status s = stub_->GetComputationGraphStats(&request, &response);
+  VLOG(1) << "done with request";
+
+  if (!s.ok()) {
+    return s;
+  }
+  CHECK(response.has_stats());
+  return response.stats();
+}
+
 StatusOr<std::unique_ptr<ProgramShape>> Client::GetComputationShape(
     const Computation& computation) {
   GetComputationShapeRequest request;
@@ -359,6 +472,12 @@ StatusOr<std::unique_ptr<ProgramShape>> Client::GetComputationShape(
   return WrapUnique(response.release_program_shape());
 }
 
+StatusOr<std::unique_ptr<ProgramShape>> Client::GetComputationShape(
+    const XlaComputation& computation) {
+  TF_ASSIGN_OR_RETURN(const auto& result, computation.GetProgramShape());
+  return MakeUnique<ProgramShape>(result);
+}
+
 StatusOr<Shape> Client::GetShape(const GlobalData& data) {
   GetShapeRequest request;
   *request.mutable_data() = data.handle();
@@ -397,6 +516,28 @@ StatusOr<string> Client::ExecutionStatsAsString(
   return string("[Execution Statistics] not available.");
 }
 
+StatusOr<string> Client::ExecutionStatsAsString(
+    const XlaComputation& computation, const ExecutionProfile& profile) {
+  TF_ASSIGN_OR_RETURN(
+      auto computation_stats,
+      GetComputationStats(computation,
+                          legacy_flags::GetDebugOptionsFromFlags()));
+  int64 total_flops =
+      computation_stats.flop_count() + computation_stats.transcendental_count();
+  if (profile.compute_time_ns() > 0) {
+    int64 nanoseconds = profile.compute_time_ns();
+    int64 cycle_count = profile.compute_cycle_count();
+    double gflops = total_flops / nanoseconds;
+    return tensorflow::strings::StrCat(
+        "[Execution Statistics] flop count: ", computation_stats.flop_count(),
+        ", transcendental count: ", computation_stats.transcendental_count(),
+        ", compute execution time: ", nanoseconds, " nsec",
+        ", compute cycles: ", cycle_count, ", performance: ", gflops,
+        "gflop/s");
+  }
+  return string("[Execution Statistics] not available.");
+}
+
 StatusOr<ChannelHandle> Client::CreateChannelHandle() {
   CreateChannelHandleRequest request;
   CreateChannelHandleResponse response;
diff --git a/tensorflow/compiler/xla/client/client.h b/tensorflow/compiler/xla/client/client.h
index c28380b689c7a0e16bf0bcbf15003f4aa15e42a7..05d707dab1533f44ce827157e888720e218d4c9c 100644
--- a/tensorflow/compiler/xla/client/client.h
+++ b/tensorflow/compiler/xla/client/client.h
@@ -21,6 +21,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/session.pb.h"
 #include "tensorflow/compiler/xla/service_interface.h"
@@ -57,6 +58,21 @@ class Client {
       const ExecutionOptions* execution_options = nullptr,
       ExecutionProfile* execution_profile = nullptr);
 
+  // Executes the computation with the given arguments and returns the global
+  // data that was produced from the execution.
+  // * If execution_options is not nullptr, these options are passed to the
+  //   service to affect how it compiles our computation.  (The pointer does not
+  //   need to live beyond this call.)
+  // * If execution_profile is not nullptr then the pointed-to ExecutionProfile
+  //   will be filled with profile data from the execution.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  StatusOr<std::unique_ptr<GlobalData>> Execute(
+      const XlaComputation& computation,
+      tensorflow::gtl::ArraySlice<GlobalData*> arguments,
+      const ExecutionOptions* execution_options = nullptr,
+      ExecutionProfile* execution_profile = nullptr);
+
   // A struct to represent a computation instance to be executed.
   // * If execution_options.device_handles is not empty, the computation is
   //   executed on the devices associated with the handles by partitioning the
@@ -83,6 +99,36 @@ class Client {
   StatusOr<std::vector<std::unique_ptr<GlobalData>>> ExecuteParallel(
       tensorflow::gtl::ArraySlice<ComputationInstance> computations);
 
+  // A struct to represent a computation instance to be executed.
+  // * If execution_options.device_handles is not empty, the computation is
+  //   executed on the devices associated with the handles by partitioning the
+  //   computation based on the attached sharding attributes. Otherwise, a
+  //   device is chosen by the service.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  struct XlaComputationInstance {
+    const XlaComputation& computation;
+    std::vector<GlobalData*> arguments;
+    ExecutionOptions execution_options;
+    ExecutionProfile* execution_profile;
+
+    XlaComputationInstance(const XlaComputation& computation,
+                           std::vector<GlobalData*> arguments,
+                           ExecutionOptions execution_options,
+                           ExecutionProfile* execution_profile)
+        : computation(computation),
+          arguments(std::move(arguments)),
+          execution_options(execution_options),
+          execution_profile(execution_profile) {}
+  };
+
+  // Executes a list XlaComputationInstances and returns global data produced
+  // from each computation.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  StatusOr<std::vector<std::unique_ptr<GlobalData>>> ExecuteParallel(
+      tensorflow::gtl::ArraySlice<XlaComputationInstance> computations);
+
   // Requests device_count device handles available on the target. The returned
   // device handles are used to specify the devices to execute the computations
   // (see ExecuteParallel) or to transfer data (see TransferToServer or
@@ -137,6 +183,17 @@ class Client {
       const ExecutionOptions* execution_options = nullptr,
       ExecutionProfile* execution_profile = nullptr);
 
+  // Executes the computation with the given arguments and transfers the result
+  // to the client as a literal. Parameters are defined the same as for
+  // Execute() and Transfer().
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  StatusOr<std::unique_ptr<Literal>> ExecuteAndTransfer(
+      const XlaComputation& computation,
+      tensorflow::gtl::ArraySlice<GlobalData*> arguments,
+      const ExecutionOptions* execution_options = nullptr,
+      ExecutionProfile* execution_profile = nullptr);
+
   // Unregister the memory for the given GlobalData on the device.
   Status Unregister(const GlobalData& data);
 
@@ -148,6 +205,13 @@ class Client {
   StatusOr<ComputationStats> GetComputationStats(
       const Computation& computation, const DebugOptions& debug_options) const;
 
+  // Retrieves the statistics of the given computation.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  StatusOr<ComputationStats> GetComputationStats(
+      const XlaComputation& computation,
+      const DebugOptions& debug_options) const;
+
   // Returns the Shape of the given array specified by 'data'. The shape
   // includes the Layout of the array as it is stored on the service.
   StatusOr<Shape> GetShape(const GlobalData& data);
@@ -157,6 +221,13 @@ class Client {
   StatusOr<std::unique_ptr<ProgramShape>> GetComputationShape(
       const Computation& computation);
 
+  // As above, but returns the shape of the provided computation (parameter
+  // types/names and return type).
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  StatusOr<std::unique_ptr<ProgramShape>> GetComputationShape(
+      const XlaComputation& computation);
+
   // Creates a channel handle that can be used to transfer data between
   // two computations via a pair of Send and Recv instructions.
   StatusOr<ChannelHandle> CreateChannelHandle();
@@ -170,6 +241,8 @@ class Client {
   // ExecutionProfile returned from an execution of the computation.
   StatusOr<string> ExecutionStatsAsString(const Computation& computation,
                                           const ExecutionProfile& profile);
+  StatusOr<string> ExecutionStatsAsString(const XlaComputation& computation,
+                                          const ExecutionProfile& profile);
 
   ServiceInterface* stub_;  // Stub that this client is connected on.
 
diff --git a/tensorflow/compiler/xla/client/computation_builder.cc b/tensorflow/compiler/xla/client/computation_builder.cc
index 39d02f0863f78d4094f2cc4805f534713fb7e929..4d3b0ee0d6e9ba82cfa09af0fbff0ae1efa0ac64 100644
--- a/tensorflow/compiler/xla/client/computation_builder.cc
+++ b/tensorflow/compiler/xla/client/computation_builder.cc
@@ -253,26 +253,6 @@ StatusOr<ProgramShape> ComputationBuilder::GetProgramShape() {
   return std::move(*response.mutable_program_shape());
 }
 
-ComputationDataHandle ComputationBuilder::CheckShape(
-    const ComputationDataHandle& operand, const Shape& expected_shape) {
-  std::unique_ptr<Shape> actual_shape = GetShape(operand).ConsumeValueOrDie();
-  CHECK(ShapeUtil::Equal(expected_shape, *actual_shape))
-      << "want " << ShapeUtil::HumanString(expected_shape) << " got "
-      << ShapeUtil::HumanString(*actual_shape);
-  return operand;
-}
-
-void ComputationBuilder::CheckSameShape(const ComputationDataHandle& lhs,
-                                        const ComputationDataHandle& rhs) {
-  std::unique_ptr<Shape> lhs_shape = GetShape(lhs).ConsumeValueOrDie();
-  std::unique_ptr<Shape> rhs_shape = GetShape(rhs).ConsumeValueOrDie();
-  VLOG(2) << "checking " << ShapeUtil::HumanString(*lhs_shape) << " equals "
-          << ShapeUtil::HumanString(*rhs_shape);
-  CHECK(ShapeUtil::Equal(*lhs_shape, *rhs_shape))
-      << "lhs " << ShapeUtil::HumanString(*lhs_shape) << " rhs "
-      << ShapeUtil::HumanString(*rhs_shape);
-}
-
 ComputationDataHandle ComputationBuilder::Slice(
     const ComputationDataHandle& operand,
     tensorflow::gtl::ArraySlice<int64> start_indices,
diff --git a/tensorflow/compiler/xla/client/computation_builder.h b/tensorflow/compiler/xla/client/computation_builder.h
index 2141ebc2065a1a80d2fe820a7b6fe15434c89e28..019c6f3afb5d57bfe453988ded19120a4483cf36 100644
--- a/tensorflow/compiler/xla/client/computation_builder.h
+++ b/tensorflow/compiler/xla/client/computation_builder.h
@@ -104,15 +104,6 @@ class ComputationBuilder {
   // Retrieves the (inferred) result for the current computation's shape.
   StatusOr<ProgramShape> GetProgramShape();
 
-  // Checks that the operand has the given expected shape. Returns the operand
-  // if yes, fails with a CHECK error if no.
-  ComputationDataHandle CheckShape(const ComputationDataHandle& operand,
-                                   const Shape& expected_shape);
-
-  // Checks that the lhs and rhs results have the same shape.
-  void CheckSameShape(const ComputationDataHandle& lhs,
-                      const ComputationDataHandle& rhs);
-
   // Enqueues a constant with the value of the given literal onto the
   // computation.
   ComputationDataHandle ConstantLiteral(const Literal& literal);
diff --git a/tensorflow/compiler/xla/client/executable_build_options.cc b/tensorflow/compiler/xla/client/executable_build_options.cc
index 804e34f5e75ce2d153ac7627b94a543fda88e810..6e3c5cb484b8f1ef053fa287a4d462aeb886e530 100644
--- a/tensorflow/compiler/xla/client/executable_build_options.cc
+++ b/tensorflow/compiler/xla/client/executable_build_options.cc
@@ -76,4 +76,35 @@ ExecutableBuildOptions::generate_hlo_graph() const {
   return generate_hlo_graph_;
 }
 
+ExecutableBuildOptions& ExecutableBuildOptions::set_dump_optimized_hlo_proto_to(
+    tensorflow::StringPiece dirpath) {
+  dump_optimized_hlo_proto_to_ = dirpath.ToString();
+  return *this;
+}
+
+const tensorflow::gtl::optional<string>&
+ExecutableBuildOptions::dump_optimized_hlo_proto_to() const {
+  return dump_optimized_hlo_proto_to_;
+}
+
+ExecutableBuildOptions& ExecutableBuildOptions::set_dump_per_pass_hlo_proto_to(
+    tensorflow::StringPiece dirpath) {
+  dump_per_pass_hlo_proto_to_ = dirpath.ToString();
+  return *this;
+}
+
+const tensorflow::gtl::optional<string>&
+ExecutableBuildOptions::dump_per_pass_hlo_proto_to() const {
+  return dump_per_pass_hlo_proto_to_;
+}
+
+ExecutableBuildOptions& ExecutableBuildOptions::set_hlo_profile(bool enabled) {
+  hlo_profile_ = enabled;
+  return *this;
+}
+
+tensorflow::gtl::optional<bool> ExecutableBuildOptions::hlo_profile() const {
+  return hlo_profile_;
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/executable_build_options.h b/tensorflow/compiler/xla/client/executable_build_options.h
index 3a52dbac9adb155ad9a7d91a8102707f70fe2fbf..11f10983606fe02b1edb11a260edde8e5f9a726f 100644
--- a/tensorflow/compiler/xla/client/executable_build_options.h
+++ b/tensorflow/compiler/xla/client/executable_build_options.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/optional.h"
 
 namespace xla {
@@ -57,15 +58,36 @@ class ExecutableBuildOptions {
   ExecutableBuildOptions& set_generate_hlo_graph(string regex);
   const tensorflow::gtl::optional<string>& generate_hlo_graph() const;
 
+  // If set, specifies a dirpath to dump the end-of-optimization-pipeline HLO
+  // protobuf to (as in DebugOptions).
+  ExecutableBuildOptions& set_dump_optimized_hlo_proto_to(
+      tensorflow::StringPiece dirpath);
+  const tensorflow::gtl::optional<string>& dump_optimized_hlo_proto_to() const;
+
+  // If set, specifies a dirpath to dump the per-pass-in-pipeline HLO protobufs
+  // to (as in DebugOptions).
+  ExecutableBuildOptions& set_dump_per_pass_hlo_proto_to(
+      tensorflow::StringPiece dirpath);
+  const tensorflow::gtl::optional<string>& dump_per_pass_hlo_proto_to() const;
+
+  // If true, specifies that we should record an HLO profile during execution
+  // and log it after execution (as in DebugOptions). If nullopt the default is
+  // used.
+  ExecutableBuildOptions& set_hlo_profile(bool enabled);
+  tensorflow::gtl::optional<bool> hlo_profile() const;
+
   // Returns a string representation of the build options, suitable for
   // debugging.
   string ToString() const;
 
  private:
+  tensorflow::gtl::optional<bool> hlo_profile_;
   int device_ordinal_ = -1;
   Shape result_layout_;
   bool result_layout_set_ = false;
   tensorflow::gtl::optional<string> generate_hlo_graph_;
+  tensorflow::gtl::optional<string> dump_optimized_hlo_proto_to_;
+  tensorflow::gtl::optional<string> dump_per_pass_hlo_proto_to_;
   DeviceMemoryAllocator* device_allocator_ = nullptr;
 };
 
diff --git a/tensorflow/compiler/xla/client/lib/BUILD b/tensorflow/compiler/xla/client/lib/BUILD
index fca2bf2688cd21b44f099da3bae3b890cbb069ab..f4673a8204f27e93441c73f6dcc9130d96cfcebc 100644
--- a/tensorflow/compiler/xla/client/lib/BUILD
+++ b/tensorflow/compiler/xla/client/lib/BUILD
@@ -24,6 +24,8 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/core:lib",
     ],
 )
@@ -48,17 +50,3 @@ cc_library(
         "//tensorflow/core:lib",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/xla/client/lib/arithmetic.cc b/tensorflow/compiler/xla/client/lib/arithmetic.cc
index 24048a1e5a782661ba577ba50e3b5b2914f17c0a..63df449e0b3bdd642d548319dd7d621ca2f59b1d 100644
--- a/tensorflow/compiler/xla/client/lib/arithmetic.cc
+++ b/tensorflow/compiler/xla/client/lib/arithmetic.cc
@@ -26,6 +26,7 @@ limitations under the License.
 
 namespace xla {
 namespace {
+
 using InstructionGenerator =
     ComputationDataHandle (*)(ComputationBuilder*, const ComputationDataHandle&,
                               const ComputationDataHandle&);
@@ -47,6 +48,27 @@ Computation CreateScalarComputation(const string& name, PrimitiveType type,
   generator(b.get(), lhs, rhs);
   return b->BuildAndNoteError();
 }
+
+using XlaOpGenerator = XlaOp (*)(XlaBuilder*, const XlaOp&, const XlaOp&);
+
+XlaComputation CreateScalarComputation(const string& name, PrimitiveType type,
+                                       XlaBuilder* builder,
+                                       XlaOpGenerator generator) {
+  std::unique_ptr<XlaBuilder> b;
+  if (type == PRED) {
+    b = builder->CreateSubBuilder(name);
+  } else {
+    b = builder->CreateSubBuilder(
+        tensorflow::strings::StrCat(name, "_", PrimitiveType_Name(type)));
+  }
+
+  const Shape scalar = ShapeUtil::MakeShape(type, {});
+  auto lhs = b->Parameter(0, scalar, "lhs");
+  auto rhs = b->Parameter(1, scalar, "rhs");
+  generator(b.get(), lhs, rhs);
+  return b->BuildAndNoteError();
+}
+
 }  // namespace
 
 Computation CreateScalarAddComputation(PrimitiveType type,
@@ -60,7 +82,7 @@ Computation CreateScalarAddComputation(PrimitiveType type,
 Computation CreateScalarMultiplyComputation(PrimitiveType type,
                                             ComputationBuilder* builder) {
   return CreateScalarComputation(
-      "add", type, builder,
+      "mul", type, builder,
       [](ComputationBuilder* b, const ComputationDataHandle& lhs,
          const ComputationDataHandle& rhs) { return b->Mul(lhs, rhs); });
 }
@@ -114,4 +136,75 @@ StatusOr<ComputationDataHandle> Any(const ComputationDataHandle& predicates,
   return builder->Reduce(predicates, f, logical_or, all_dimensions);
 }
 
+XlaComputation CreateScalarAddComputation(PrimitiveType type,
+                                          XlaBuilder* builder) {
+  return CreateScalarComputation(
+      "add", type, builder,
+      [](XlaBuilder* b, const XlaOp& lhs, const XlaOp& rhs) {
+        return b->Add(lhs, rhs);
+      });
+}
+
+XlaComputation CreateScalarMultiplyComputation(PrimitiveType type,
+                                               XlaBuilder* builder) {
+  return CreateScalarComputation(
+      "mul", type, builder,
+      [](XlaBuilder* b, const XlaOp& lhs, const XlaOp& rhs) {
+        return b->Mul(lhs, rhs);
+      });
+}
+
+XlaComputation CreateScalarGeComputation(PrimitiveType type,
+                                         XlaBuilder* builder) {
+  return CreateScalarComputation(
+      "ge", type, builder,
+      [](XlaBuilder* b, const XlaOp& lhs, const XlaOp& rhs) {
+        return b->Ge(lhs, rhs);
+      });
+}
+
+XlaComputation CreateScalarMaxComputation(PrimitiveType type,
+                                          XlaBuilder* builder) {
+  return CreateScalarComputation(
+      "max", type, builder,
+      [](XlaBuilder* b, const XlaOp& lhs, const XlaOp& rhs) {
+        return b->Max(lhs, rhs);
+      });
+}
+
+XlaComputation CreateScalarMinComputation(PrimitiveType type,
+                                          XlaBuilder* builder) {
+  return CreateScalarComputation(
+      "min", type, builder,
+      [](XlaBuilder* b, const XlaOp& lhs, const XlaOp& rhs) {
+        return b->Min(lhs, rhs);
+      });
+}
+
+XlaComputation CreateScalarAndComputation(XlaBuilder* builder) {
+  return CreateScalarComputation(
+      "and", PRED, builder,
+      [](XlaBuilder* b, const XlaOp& lhs, const XlaOp& rhs) {
+        return b->And(lhs, rhs);
+      });
+}
+
+XlaComputation CreateScalarOrComputation(XlaBuilder* builder) {
+  return CreateScalarComputation(
+      "or", PRED, builder,
+      [](XlaBuilder* b, const XlaOp& lhs, const XlaOp& rhs) {
+        return b->Or(lhs, rhs);
+      });
+}
+
+StatusOr<XlaOp> Any(const XlaOp& predicates, XlaBuilder* builder) {
+  auto f = builder->ConstantR0<bool>(false);
+  XlaComputation logical_or = CreateScalarOrComputation(builder);
+  TF_ASSIGN_OR_RETURN(const Shape& predicates_shape,
+                      builder->GetShape(predicates));
+  std::vector<int64> all_dimensions(ShapeUtil::Rank(predicates_shape));
+  std::iota(all_dimensions.begin(), all_dimensions.end(), 0);
+  return builder->Reduce(predicates, f, logical_or, all_dimensions);
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/arithmetic.h b/tensorflow/compiler/xla/client/lib/arithmetic.h
index ae89784bc227d837cf15f0a89687dd00dccc2745..f4d3fc801590fedbb84ed3d6283e62f47c56d5c7 100644
--- a/tensorflow/compiler/xla/client/lib/arithmetic.h
+++ b/tensorflow/compiler/xla/client/lib/arithmetic.h
@@ -20,6 +20,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace xla {
@@ -56,6 +58,48 @@ Computation CreateScalarOrComputation(ComputationBuilder* builder);
 StatusOr<ComputationDataHandle> Any(const ComputationDataHandle& predicates,
                                     ComputationBuilder* builder);
 
+// TODO(b/74197823): This is a part of a NOT YET ready refactor.
+//
+// Creates a scalar add computation and returns it.
+XlaComputation CreateScalarAddComputation(PrimitiveType type,
+                                          XlaBuilder* builder);
+// TODO(b/74197823): This is a part of a NOT YET ready refactor.
+//
+// Creates a scalar multiply computation and returns it.
+XlaComputation CreateScalarMultiplyComputation(PrimitiveType type,
+                                               XlaBuilder* builder);
+// TODO(b/74197823): This is a part of a NOT YET ready refactor.
+//
+// Creates a scalar ge computation and returns it.
+XlaComputation CreateScalarGeComputation(PrimitiveType type,
+                                         XlaBuilder* builder);
+// TODO(b/74197823): This is a part of a NOT YET ready refactor.
+//
+// Creates a scalar max computation and returns it.
+XlaComputation CreateScalarMaxComputation(PrimitiveType type,
+                                          XlaBuilder* builder);
+// TODO(b/74197823): This is a part of a NOT YET ready refactor.
+//
+// Creates a scalar min computation and returns it.
+XlaComputation CreateScalarMinComputation(PrimitiveType type,
+                                          XlaBuilder* builder);
+// TODO(b/74197823): This is a part of a NOT YET ready refactor.
+//
+// Creates a scalar logical AND computation and returns it.
+XlaComputation CreateScalarAndComputation(XlaBuilder* builder);
+
+// TODO(b/74197823): This is a part of a NOT YET ready refactor.
+//
+// Creates a scalar logical OR computation and returns it.
+XlaComputation CreateScalarOrComputation(XlaBuilder* builder);
+
+// TODO(b/74197823): This is a part of a NOT YET ready refactor.
+//
+// Returns whether any predicate in "predicates" is set.
+//
+// Note: if predicates is zero-sized, Any() vacuously returns false.
+StatusOr<XlaOp> Any(const XlaOp& predicates, XlaBuilder* builder);
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_ARITHMETIC_H_
diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc
index 91396f055fe4a3ecbd436139be9470e2a35e1c63..30594243dcf51d2b5312b9dcb2bea7d0cd78524d 100644
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@@ -265,6 +265,24 @@ StatusOr<std::unique_ptr<LocalExecutable>> LocalClient::Compile(
                                         updated_options));
 }
 
+StatusOr<std::unique_ptr<LocalExecutable>> LocalClient::Compile(
+    const XlaComputation& computation,
+    const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
+    const ExecutableBuildOptions& options) {
+  ExecutableBuildOptions updated_options = options;
+  if (options.device_ordinal() == -1) {
+    updated_options.set_device_ordinal(default_device_ordinal());
+    VLOG(3) << "Set device ordinal to default value of: "
+            << updated_options.device_ordinal();
+  }
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> executable,
+                      local_service_->CompileExecutable(
+                          computation, argument_layouts, updated_options));
+  return WrapUnique(new LocalExecutable(std::move(executable),
+                                        local_service_->mutable_backend(),
+                                        updated_options));
+}
+
 StatusOr<std::unique_ptr<ScopedShapedBuffer>>
 LocalClient::LiteralToShapedBuffer(const Literal& literal, int device_ordinal,
                                    DeviceMemoryAllocator* allocator) {
diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h
index de0ed13c43f87966c272102b2e9af9ff3be63aea..98ee7c62c94be7c618cedd3dc12ecbfc812ee180 100644
--- a/tensorflow/compiler/xla/client/local_client.h
+++ b/tensorflow/compiler/xla/client/local_client.h
@@ -123,6 +123,15 @@ class LocalClient : public Client {
       const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
       const ExecutableBuildOptions& options);
 
+  // Build and return a LocalExecutable object. The executable is compiled using
+  // the given XlaComputation, argument layouts and options.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  StatusOr<std::unique_ptr<LocalExecutable>> Compile(
+      const XlaComputation& computation,
+      const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
+      const ExecutableBuildOptions& options);
+
   // Copy the literal data to the device with the given ordinal and return as a
   // ScopedShapedBuffer. If non-null the given memory allocator is used for
   // device memory allocation. If null, the default memory allocator for the
diff --git a/tensorflow/compiler/xla/client/xla_client/BUILD b/tensorflow/compiler/xla/client/xla_client/BUILD
index b912889e2627aa01e5a7441e71e6bf002916ba5e..b1dba168565cca86cba0403604736fecd00d6f29 100644
--- a/tensorflow/compiler/xla/client/xla_client/BUILD
+++ b/tensorflow/compiler/xla/client/xla_client/BUILD
@@ -25,12 +25,25 @@ filegroup(
 
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 
+cc_library(
+    name = "xla_computation",
+    srcs = ["xla_computation.cc"],
+    hdrs = ["xla_computation.h"],
+    deps = [
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:hlo_proto",
+        "//tensorflow/core:lib",
+    ],
+)
+
 # TODO(b/74197823): Replace computation_builder with xla_builder.
 cc_library(
     name = "xla_builder",
     srcs = ["xla_builder.cc"],
     hdrs = ["xla_builder.h"],
     deps = [
+        ":xla_computation",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -38,6 +51,7 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:padding",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_proto",
         "//tensorflow/compiler/xla/service:shape_inference",
@@ -56,22 +70,9 @@ tf_cc_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_matchers",
         "//tensorflow/core:test",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
index 6328a4f350fc70efaa96102f8202fb00b88b51f2..2d587cc3b9c51d5bd81652d17b23d4ad05c84dd3 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 
+#include <functional>
+#include <numeric>
 #include <string>
 #include <utility>
 
@@ -43,6 +45,7 @@ int64 GetUniqueId() {
 bool CanBeRoot(HloOpcode opcode) {
   switch (opcode) {
     case HloOpcode::kSend:
+    case HloOpcode::kSendDone:
     case HloOpcode::kOutfeed:
     case HloOpcode::kTrace:
       return false;
@@ -51,25 +54,35 @@ bool CanBeRoot(HloOpcode opcode) {
   }
 }
 
-void SetOpcode(HloInstructionProto* instr, HloOpcode opcode) {
-  instr->set_opcode(HloOpcodeString(opcode));
+StatusOr<std::vector<Shape>> GetOperandShapes(
+    tensorflow::gtl::ArraySlice<XlaOp> operands) {
+  std::vector<Shape> operand_shapes;
+  for (const XlaOp& operand : operands) {
+    TF_ASSIGN_OR_RETURN(const Shape& shape, operand.GetShape());
+    operand_shapes.push_back(shape);
+  }
+  return operand_shapes;
 }
 
 }  // namespace
 
-StatusOr<std::unique_ptr<Shape>> XlaBuilder::GetShape(const XlaOp& op) const {
+StatusOr<Shape> XlaBuilder::GetShape(const XlaOp& op) const {
+  TF_RETURN_IF_ERROR(first_error_);
+
   TF_ASSIGN_OR_RETURN(auto instr, LookUpInstruction(op));
-  return MakeUnique<Shape>(instr->shape());
+  return instr->shape();
 }
 
 StatusOr<Shape> XlaOp::GetShape() const {
-  TF_RET_CHECK(builder_ != nullptr);
-  TF_ASSIGN_OR_RETURN(auto shape, builder_->GetShape(*this));
-  return *shape;
+  if (builder_ == nullptr) {
+    return InvalidArgument(
+        "cannot GetShape for an invalid XlaOp with handle %lld", handle());
+  }
+  return builder_->GetShape(*this);
 }
 
 XlaBuilder::XlaBuilder(const string& computation_name)
-    : name_(computation_name) {}
+    : name_(computation_name), unique_id_(GetUniqueId()) {}
 
 XlaBuilder::~XlaBuilder() {}
 
@@ -85,39 +98,47 @@ void XlaBuilder::NoteError(const Status& error) {
   }
 }
 
-StatusOr<XlaComputation> XlaBuilder::Build() {
+XlaOp XlaBuilder::NoteErrorOrReturn(
+    const std::function<StatusOr<XlaOp>()>& op_creator) {
   if (!first_error_.ok()) {
-    string backtrace;
-    first_error_backtrace_.Dump(tensorflow::DebugWriteToString, &backtrace);
-    return AppendStatus(first_error_, backtrace);
+    return {};
+  }
+  auto op = op_creator();
+  if (!op.ok()) {
+    NoteError(op.status());
+    return {};
   }
+  return op.ConsumeValueOrDie();
+}
 
-  HloComputationProto entry;
-  ProgramShape* program_shape = entry.mutable_program_shape();
+StatusOr<ProgramShape> XlaBuilder::GetProgramShape(int64* root_id) {
+  TF_RETURN_IF_ERROR(first_error_);
 
-  entry.set_name(name_);
+  TF_RET_CHECK(root_id != nullptr);
+  ProgramShape program_shape;
 
   // Not all instructions can be roots. Walk backwards from the last added
   // instruction until a valid root is found.
-  for (int64 i = instructions_.size() - 1; i >= 0; i--) {
+  int64 index = instructions_.size() - 1;
+  for (; index >= 0; index--) {
     TF_ASSIGN_OR_RETURN(HloOpcode opcode,
-                        StringToHloOpcode(instructions_[i].opcode()));
+                        StringToHloOpcode(instructions_[index].opcode()));
     if (CanBeRoot(opcode)) {
-      entry.set_root_name(instructions_[i].name());
-      *program_shape->mutable_result() = instructions_[i].shape();
       break;
     }
   }
-  if (entry.root_name().empty()) {
+  if (index < 0) {
     return FailedPrecondition("no root instruction was found");
   }
+  *root_id = instructions_[index].id();
+  *program_shape.mutable_result() = instructions_[index].shape();
 
   // Check that the parameter numbers are continuous from 0, and add parameter
   // shapes and names to the program shape.
   const int64 param_count = parameter_numbers_.size();
   for (int64 i = 0; i < param_count; i++) {
-    program_shape->add_parameters();
-    program_shape->add_parameter_names();
+    program_shape.add_parameters();
+    program_shape.add_parameter_names();
   }
   for (const HloInstructionProto& instr : instructions_) {
     // Parameter number uniqueness is guaranteed in XlaBuilder::Parameter(). So
@@ -127,93 +148,275 @@ StatusOr<XlaComputation> XlaBuilder::Build() {
       const int64 index = instr.parameter_number();
       TF_RET_CHECK(index >= 0 && index < param_count)
           << "invalid parameter number: " << index;
-      *program_shape->mutable_parameters(index) = instr.shape();
-      *program_shape->mutable_parameter_names(index) = instr.name();
+      *program_shape.mutable_parameters(index) = instr.shape();
+      *program_shape.mutable_parameter_names(index) = instr.name();
     }
   }
+  return program_shape;
+}
+
+StatusOr<ProgramShape> XlaBuilder::GetProgramShape() {
+  int64 root_id;
+  return GetProgramShape(&root_id);
+}
+
+XlaComputation XlaBuilder::BuildAndNoteError() {
+  DCHECK(parent_builder_ != nullptr);
+  auto build_status = Build();
+  if (!build_status.ok()) {
+    parent_builder_->NoteError(
+        AddStatus(build_status.status(),
+                  tensorflow::strings::StrCat("error from: ", name_)));
+    return {};
+  }
+  return build_status.ConsumeValueOrDie();
+}
+
+StatusOr<XlaComputation> XlaBuilder::Build() {
+  if (!first_error_.ok()) {
+    string backtrace;
+    first_error_backtrace_.Dump(tensorflow::DebugWriteToString, &backtrace);
+    return AppendStatus(first_error_, backtrace);
+  }
+
+  HloComputationProto entry;
+
+  {
+    int64 root_id;
+    ProgramShape program_shape;
+    TF_ASSIGN_OR_RETURN(program_shape, GetProgramShape(&root_id));
+    entry.mutable_program_shape()->Swap(&program_shape);
+    entry.set_root_id(root_id);
+  }
 
   for (auto& instruction : instructions_) {
     entry.add_instructions()->Swap(&instruction);
   }
 
-  const int64 id = GetUniqueId();
-  entry.set_id(id);
-  XlaComputation computation(id);
+  entry.set_id(unique_id_);
+  entry.set_name(StrCat(name_, entry.id()));  // Ensure that the name is unique.
+  XlaComputation computation(entry.id());
   HloModuleProto* module = computation.mutable_proto();
   module->set_name(entry.name());
+  module->set_id(entry.id());
   module->set_entry_computation_name(entry.name());
+  module->set_entry_computation_id(entry.id());
   *module->mutable_program_shape() = entry.program_shape();
   for (auto& e : embedded_) {
     module->add_computations()->Swap(&e.second);
   }
   module->add_computations()->Swap(&entry);
 
+  // Clear data held by this builder.
+  this->instructions_.clear();
+  this->embedded_.clear();
+  this->parameter_numbers_.clear();
+
   return std::move(computation);
 }
 
-XlaOp XlaBuilder::Add(const XlaOp& lhs, const XlaOp& rhs,
-                      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
-  auto op = [&]() -> StatusOr<XlaOp> {
+StatusOr<XlaOp> XlaBuilder::InDimBroadcast(
+    const Shape& shape, const XlaOp& operand,
+    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  TF_RETURN_IF_ERROR(first_error_);
+
+  HloInstructionProto instr;
+  *instr.mutable_shape() = shape;
+  for (int64 dim : broadcast_dimensions) {
+    instr.add_dimensions(dim);
+  }
+  return AddInstruction(std::move(instr), HloOpcode::kBroadcast, {operand});
+}
+
+StatusOr<XlaOp> XlaBuilder::AddBroadcastSequence(const Shape& output_shape,
+                                                 const XlaOp& operand) {
+  TF_RETURN_IF_ERROR(first_error_);
+
+  TF_ASSIGN_OR_RETURN(const Shape& operand_shape, operand.GetShape());
+
+  CHECK(ShapeUtil::IsScalar(operand_shape) ||
+        ShapeUtil::Rank(operand_shape) == ShapeUtil::Rank(output_shape));
+  Shape broadcast_shape =
+      ShapeUtil::ChangeElementType(output_shape, operand_shape.element_type());
+
+  // Do explicit broadcast for scalar.
+  if (ShapeUtil::IsScalar(operand_shape)) {
+    return InDimBroadcast(broadcast_shape, operand, {});
+  }
+
+  // Do explicit broadcast for degenerate broadcast.
+  std::vector<int64> broadcast_dimensions;
+  std::vector<int64> reshaped_dimensions;
+  for (int i = 0; i < ShapeUtil::Rank(operand_shape); i++) {
+    if (operand_shape.dimensions(i) == output_shape.dimensions(i)) {
+      broadcast_dimensions.push_back(i);
+      reshaped_dimensions.push_back(operand_shape.dimensions(i));
+    } else {
+      TF_RET_CHECK(operand_shape.dimensions(i) == 1)
+          << "An explicit broadcast sequence requires the broadcasted "
+             "dimensions to be trivial; operand shape: "
+          << operand_shape << "; output_shape: " << output_shape;
+    }
+  }
+  // Eliminate the size one dimensions.
+  TF_ASSIGN_OR_RETURN(XlaOp reshaped_operand,
+                      Reshape(ShapeUtil::MakeShape(operand_shape.element_type(),
+                                                   reshaped_dimensions),
+                              operand));
+  // Broadcast 'reshape' up to the larger size.
+  return InDimBroadcast(broadcast_shape, reshaped_operand,
+                        broadcast_dimensions);
+}
+
+XlaOp XlaBuilder::UnaryOp(HloOpcode unop, const XlaOp& operand) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, operand.GetShape());
+    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+                        ShapeInference::InferUnaryOpShape(unop, operand_shape));
+    return AddInstruction(std::move(instr), unop, {operand});
+  });
+}
+
+XlaOp XlaBuilder::BinaryOp(
+    HloOpcode binop, const XlaOp& lhs, const XlaOp& rhs,
+    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
-    SetOpcode(&instr, HloOpcode::kAdd);
-    TF_ASSIGN_OR_RETURN(const auto* lhs_instr, LookUpInstruction(lhs));
-    TF_ASSIGN_OR_RETURN(const auto* rhs_instr, LookUpInstruction(rhs));
+    TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, lhs.GetShape());
+    TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, rhs.GetShape());
     TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
                         ShapeInference::InferBinaryOpShape(
-                            HloOpcode::kAdd, lhs_instr->shape(),
-                            rhs_instr->shape(), broadcast_dimensions));
-    instr.add_operand_names(lhs_instr->name());
-    instr.add_operand_names(rhs_instr->name());
-    return AddInstruction(std::move(instr));
-  };
-  return NoteErrorOrReturn(op());
+                            binop, lhs_shape, rhs_shape, broadcast_dimensions));
+
+    const int64 lhs_rank = ShapeUtil::Rank(lhs_shape);
+    const int64 rhs_rank = ShapeUtil::Rank(rhs_shape);
+
+    XlaOp updated_lhs = lhs;
+    XlaOp updated_rhs = rhs;
+
+    if (!broadcast_dimensions.empty() && lhs_rank != rhs_rank) {
+      const bool should_broadcast_lhs = lhs_rank < rhs_rank;
+      XlaOp from = should_broadcast_lhs ? lhs : rhs;
+      const Shape& from_shape = should_broadcast_lhs ? lhs_shape : rhs_shape;
+
+      std::vector<int64> to_size;
+      for (int64 size : instr.shape().dimensions()) {
+        to_size.push_back(size);
+      }
+      for (int64 from_dim = 0; from_dim < ShapeUtil::Rank(from_shape);
+           from_dim++) {
+        int64 to_dim = broadcast_dimensions[from_dim];
+        to_size[to_dim] = from_shape.dimensions(from_dim);
+      }
+
+      const Shape& broadcasted_shape =
+          ShapeUtil::MakeShape(from_shape.element_type(), to_size);
+      TF_ASSIGN_OR_RETURN(
+          XlaOp broadcasted_operand,
+          InDimBroadcast(broadcasted_shape, from, broadcast_dimensions));
+
+      updated_lhs = should_broadcast_lhs ? broadcasted_operand : lhs;
+      updated_rhs = !should_broadcast_lhs ? broadcasted_operand : rhs;
+    }
+
+    TF_ASSIGN_OR_RETURN(Shape updated_lhs_shape, updated_lhs.GetShape());
+    if (!ShapeUtil::SameDimensions(instr.shape(), updated_lhs_shape)) {
+      TF_ASSIGN_OR_RETURN(updated_lhs,
+                          AddBroadcastSequence(instr.shape(), updated_lhs));
+    }
+    TF_ASSIGN_OR_RETURN(Shape updated_rhs_shape, updated_rhs.GetShape());
+    if (!ShapeUtil::SameDimensions(instr.shape(), updated_rhs_shape)) {
+      TF_ASSIGN_OR_RETURN(updated_rhs,
+                          AddBroadcastSequence(instr.shape(), updated_rhs));
+    }
+
+    return AddInstruction(std::move(instr), binop, {updated_lhs, updated_rhs});
+  });
+}
+
+XlaOp XlaBuilder::TernaryOp(HloOpcode triop, const XlaOp& lhs, const XlaOp& rhs,
+                            const XlaOp& ehs) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, lhs.GetShape());
+    TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, rhs.GetShape());
+    TF_ASSIGN_OR_RETURN(const Shape& ehs_shape, ehs.GetShape());
+    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+                        ShapeInference::InferTernaryOpShape(
+                            triop, lhs_shape, rhs_shape, ehs_shape));
+    XlaOp updated_lhs = lhs;
+    XlaOp updated_rhs = rhs;
+    XlaOp updated_ehs = ehs;
+    if (!ShapeUtil::IsTuple(instr.shape())) {
+      if (!ShapeUtil::IsTuple(lhs_shape) &&
+          !ShapeUtil::SameDimensions(instr.shape(), lhs_shape)) {
+        // lhs is being implicitly broadcasted. Change to explicit.
+        TF_ASSIGN_OR_RETURN(updated_lhs,
+                            AddBroadcastSequence(instr.shape(), lhs));
+      }
+      if (!ShapeUtil::IsTuple(rhs_shape) &&
+          !ShapeUtil::SameDimensions(instr.shape(), rhs_shape)) {
+        // rhs is being implicitly broadcasted. Change to explicit.
+        TF_ASSIGN_OR_RETURN(updated_rhs,
+                            AddBroadcastSequence(instr.shape(), rhs));
+      }
+      if (!ShapeUtil::IsTuple(ehs_shape) &&
+          !ShapeUtil::SameDimensions(instr.shape(), ehs_shape)) {
+        // ehs is being implicitly broadcasted. Change to explicit.
+        TF_ASSIGN_OR_RETURN(updated_ehs,
+                            AddBroadcastSequence(instr.shape(), ehs));
+      }
+    }
+    return AddInstruction(std::move(instr), triop,
+                          {updated_lhs, updated_rhs, updated_ehs});
+  });
+}
+
+XlaOp XlaBuilder::Add(const XlaOp& lhs, const XlaOp& rhs,
+                      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(HloOpcode::kAdd, lhs, rhs, broadcast_dimensions);
+}
+
+XlaOp XlaBuilder::Mul(const XlaOp& lhs, const XlaOp& rhs,
+                      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(HloOpcode::kMultiply, lhs, rhs, broadcast_dimensions);
 }
 
 XlaOp XlaBuilder::ConstantLiteral(const Literal& literal) {
-  HloInstructionProto instr;
-  SetOpcode(&instr, HloOpcode::kConstant);
-  *instr.mutable_shape() = literal.shape();
-  *instr.mutable_literal() = literal.ToProto();
-  return AddInstruction(std::move(instr));
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    *instr.mutable_shape() = literal.shape();
+    *instr.mutable_literal() = literal.ToProto();
+    return AddInstruction(std::move(instr), HloOpcode::kConstant);
+  });
 }
 
 XlaOp XlaBuilder::Call(const XlaComputation& computation,
                        tensorflow::gtl::ArraySlice<XlaOp> operands) {
-  auto op = [&]() -> StatusOr<XlaOp> {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
-    SetOpcode(&instr, HloOpcode::kCall);
-    std::vector<const Shape*> operand_shapes;
-    for (const auto& operand : operands) {
-      TF_ASSIGN_OR_RETURN(const auto* input, LookUpInstruction(operand));
-      operand_shapes.push_back(&input->shape());
-    }
-    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
-                        ShapeInference::InferCallShape(
-                            operand_shapes,
-                            /*to_apply=*/computation.GetProgramShape()));
+    std::vector<const Shape*> operand_shape_ptrs;
+    TF_ASSIGN_OR_RETURN(const auto& operand_shapes, GetOperandShapes(operands));
+    c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs),
+                [](const Shape& shape) { return &shape; });
+    TF_ASSIGN_OR_RETURN(const ProgramShape& called_program_shape,
+                        computation.GetProgramShape());
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferCallShape(operand_shape_ptrs,
+                                       /*to_apply=*/called_program_shape));
 
-    // Add input operands.
-    for (const auto& operand : operands) {
-      TF_ASSIGN_OR_RETURN(auto operand_instr, LookUpInstruction(operand));
-      instr.add_operand_names(operand_instr->name());
-    }
+    AddCalledComputation(computation, &instr);
 
-    // Add called computation.
-    *instr.add_called_computation_names() = computation.proto().name();
-    for (const HloComputationProto& e : computation.proto().computations()) {
-      embedded_.insert({e.id(), e});
-    }
-
-    return AddInstruction(std::move(instr));
-  };
-  return NoteErrorOrReturn(op());
+    return AddInstruction(std::move(instr), HloOpcode::kCall, operands);
+  });
 }
 
 XlaOp XlaBuilder::Parameter(int64 parameter_number, const Shape& shape,
                             const string& name) {
-  auto op = [&]() -> StatusOr<XlaOp> {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
-    SetOpcode(&instr, HloOpcode::kParameter);
     if (parameter_numbers_.find(parameter_number) != parameter_numbers_.end()) {
       return InvalidArgument("parameter %lld already registered",
                              parameter_number);
@@ -222,27 +425,895 @@ XlaOp XlaBuilder::Parameter(int64 parameter_number, const Shape& shape,
     instr.set_parameter_number(parameter_number);
     instr.set_name(name);
     *instr.mutable_shape() = shape;
-    return AddInstruction(std::move(instr));
-  };
-  return NoteErrorOrReturn(op());
+    return AddInstruction(std::move(instr), HloOpcode::kParameter);
+  });
+}
+
+XlaOp XlaBuilder::Broadcast(
+    const XlaOp& operand, tensorflow::gtl::ArraySlice<int64> broadcast_sizes) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, operand.GetShape());
+    TF_ASSIGN_OR_RETURN(
+        const Shape& shape,
+        ShapeInference::InferBroadcastShape(operand_shape, broadcast_sizes));
+
+    // The client-level broadcast op just appends dimensions on the left (adds
+    // lowest numbered dimensions). The HLO broadcast instruction is more
+    // flexible and can add new dimensions anywhere. The instruction's
+    // dimensions field maps operand dimensions to dimensions in the broadcast
+    // output, so to append dimensions on the left the instruction's dimensions
+    // should just be the n highest dimension numbers of the output shape where
+    // n is the number of input dimensions.
+    const int64 operand_rank = ShapeUtil::Rank(operand_shape);
+    std::vector<int64> dimensions(operand_rank);
+    for (int i = 0; i < operand_rank; ++i) {
+      dimensions[i] = i + ShapeUtil::Rank(shape) - operand_rank;
+    }
+    return InDimBroadcast(shape, operand, dimensions);
+  });
+}
+
+StatusOr<XlaOp> XlaBuilder::Reshape(const Shape& shape, const XlaOp& operand) {
+  TF_RETURN_IF_ERROR(first_error_);
+
+  HloInstructionProto instr;
+  *instr.mutable_shape() = shape;
+  return AddInstruction(std::move(instr), HloOpcode::kReshape, {operand});
+}
+
+XlaOp XlaBuilder::Slice(const XlaOp& operand,
+                        tensorflow::gtl::ArraySlice<int64> start_indices,
+                        tensorflow::gtl::ArraySlice<int64> limit_indices,
+                        tensorflow::gtl::ArraySlice<int64> strides) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferSliceShape(operand_shape, start_indices,
+                                        limit_indices, strides));
+    for (int i = 0; i < start_indices.size(); i++) {
+      auto* slice_config = instr.add_slice_dimensions();
+      slice_config->set_start(start_indices[i]);
+      slice_config->set_limit(limit_indices[i]);
+      slice_config->set_stride(strides[i]);
+    }
+
+    return AddInstruction(std::move(instr), HloOpcode::kSlice, {operand});
+  });
+}
+
+XlaOp XlaBuilder::SliceInDim(const XlaOp& operand, int64 start_index,
+                             int64 limit_index, int64 stride, int64 dimno) {
+  return UnimplementedOp();
+}
+
+XlaOp XlaBuilder::DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
+                               tensorflow::gtl::ArraySlice<int64> slice_sizes) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(const Shape& start_indices_shape,
+                        GetShape(start_indices));
+    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+                        ShapeInference::InferDynamicSliceShape(
+                            operand_shape, start_indices_shape, slice_sizes));
+
+    for (int64 size : slice_sizes) {
+      instr.add_dynamic_slice_sizes(size);
+    }
+
+    return AddInstruction(std::move(instr), HloOpcode::kDynamicSlice,
+                          {operand, start_indices});
+  });
+}
+
+XlaOp XlaBuilder::DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
+                                     const XlaOp& start_indices) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(const Shape& update_shape, GetShape(update));
+    TF_ASSIGN_OR_RETURN(const Shape& start_indices_shape,
+                        GetShape(start_indices));
+    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+                        ShapeInference::InferDynamicUpdateSliceShape(
+                            operand_shape, update_shape, start_indices_shape));
+
+    return AddInstruction(std::move(instr), HloOpcode::kDynamicUpdateSlice,
+                          {operand, update, start_indices});
+  });
+}
+
+XlaOp XlaBuilder::ConcatInDim(tensorflow::gtl::ArraySlice<XlaOp> operands,
+                              int64 dimension) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    std::vector<const Shape*> operand_shape_ptrs;
+    TF_ASSIGN_OR_RETURN(const auto& operand_shapes, GetOperandShapes(operands));
+    c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs),
+                [](const Shape& shape) { return &shape; });
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferConcatOpShape(operand_shape_ptrs, dimension));
+
+    instr.add_dimensions(dimension);
+
+    return AddInstruction(std::move(instr), HloOpcode::kConcatenate, operands);
+  });
+}
+
+XlaOp XlaBuilder::Pad(const XlaOp& operand, const XlaOp& padding_value,
+                      const PaddingConfig& padding_config) {
+  return UnimplementedOp();
+}
+
+XlaOp XlaBuilder::Reshape(const XlaOp& operand,
+                          tensorflow::gtl::ArraySlice<int64> dimensions,
+                          tensorflow::gtl::ArraySlice<int64> new_sizes) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, operand.GetShape());
+    TF_ASSIGN_OR_RETURN(const Shape& shape,
+                        ShapeInference::InferReshapeShape(
+                            operand_shape, dimensions, new_sizes));
+    XlaOp transposed = IsIdentityPermutation(dimensions)
+                           ? operand
+                           : Transpose(operand, dimensions);
+    return Reshape(shape, transposed);
+  });
+}
+
+XlaOp XlaBuilder::Reshape(const XlaOp& operand,
+                          tensorflow::gtl::ArraySlice<int64> new_sizes) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(auto shape, operand.GetShape());
+    std::vector<int64> dimensions(shape.dimensions_size());
+    std::iota(dimensions.begin(), dimensions.end(), 0);
+    return Reshape(operand, dimensions, new_sizes);
+  });
+}
+
+XlaOp XlaBuilder::Collapse(const XlaOp& operand,
+                           tensorflow::gtl::ArraySlice<int64> dimensions) {
+  return UnimplementedOp();
+}
+
+void XlaBuilder::Trace(const string& tag, const XlaOp& operand) {
+  NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    *instr.mutable_shape() = ShapeUtil::MakeNil();
+    *instr.mutable_literal() = Literal::CreateR1U8(tag)->ToProto();
+    return AddInstruction(std::move(instr), HloOpcode::kTrace, {operand});
+  });
+}
+
+XlaOp XlaBuilder::Select(const XlaOp& pred, const XlaOp& on_true,
+                         const XlaOp& on_false) {
+  return TernaryOp(HloOpcode::kSelect, pred, on_true, on_false);
+}
+
+XlaOp XlaBuilder::Tuple(tensorflow::gtl::ArraySlice<XlaOp> elements) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    std::vector<const Shape*> operand_shape_ptrs;
+    TF_ASSIGN_OR_RETURN(const auto& operand_shapes, GetOperandShapes(elements));
+    c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs),
+                [](const Shape& shape) { return &shape; });
+    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+                        ShapeInference::InferVariadicOpShape(
+                            HloOpcode::kTuple, operand_shape_ptrs));
+    return AddInstruction(std::move(instr), HloOpcode::kTuple, elements);
+  });
+}
+
+XlaOp XlaBuilder::GetTupleElement(const XlaOp& tuple_data, int64 index) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& tuple_shape, GetShape(tuple_data));
+    if (!ShapeUtil::IsTuple(tuple_shape)) {
+      return InvalidArgument(
+          "Operand to GetTupleElement() is not a tuple; got %s",
+          ShapeUtil::HumanString(tuple_shape).c_str());
+    }
+    *instr.mutable_shape() =
+        ShapeUtil::GetTupleElementShape(tuple_shape, index);
+
+    instr.set_tuple_index(index);
+
+    return AddInstruction(std::move(instr), HloOpcode::kGetTupleElement,
+                          {tuple_data});
+  });
+}
+
+XlaOp XlaBuilder::Eq(const XlaOp& lhs, const XlaOp& rhs,
+                     tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(HloOpcode::kEq, lhs, rhs, broadcast_dimensions);
+}
+
+XlaOp XlaBuilder::Ne(const XlaOp& lhs, const XlaOp& rhs,
+                     tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(HloOpcode::kNe, lhs, rhs, broadcast_dimensions);
+}
+
+XlaOp XlaBuilder::Ge(const XlaOp& lhs, const XlaOp& rhs,
+                     tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(HloOpcode::kGe, lhs, rhs, broadcast_dimensions);
+}
+
+XlaOp XlaBuilder::Gt(const XlaOp& lhs, const XlaOp& rhs,
+                     tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(HloOpcode::kGt, lhs, rhs, broadcast_dimensions);
+}
+
+XlaOp XlaBuilder::Le(const XlaOp& lhs, const XlaOp& rhs,
+                     tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(HloOpcode::kLe, lhs, rhs, broadcast_dimensions);
+}
+
+XlaOp XlaBuilder::Lt(const XlaOp& lhs, const XlaOp& rhs,
+                     tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(HloOpcode::kLt, lhs, rhs, broadcast_dimensions);
+}
+
+XlaOp XlaBuilder::Dot(const XlaOp& lhs, const XlaOp& rhs) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs));
+
+    DotDimensionNumbers dimension_numbers;
+    dimension_numbers.add_lhs_contracting_dimensions(
+        lhs_shape.dimensions_size() == 1 ? 0 : 1);
+    dimension_numbers.add_rhs_contracting_dimensions(0);
+    return DotGeneral(lhs, rhs, dimension_numbers);
+  });
+}
+
+XlaOp XlaBuilder::DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
+                             const DotDimensionNumbers& dimension_numbers) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs));
+    TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs));
+    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+                        ShapeInference::InferDotOpShape(lhs_shape, rhs_shape,
+                                                        dimension_numbers));
+    *instr.mutable_dot_dimension_numbers() = dimension_numbers;
+    return AddInstruction(std::move(instr), HloOpcode::kDot, {lhs, rhs});
+  });
+}
+
+XlaOp XlaBuilder::Conv(const XlaOp& lhs, const XlaOp& rhs,
+                       tensorflow::gtl::ArraySlice<int64> window_strides,
+                       Padding padding) {
+  return UnimplementedOp();
+}
+
+XlaOp XlaBuilder::ConvWithGeneralPadding(
+    const XlaOp& lhs, const XlaOp& rhs,
+    tensorflow::gtl::ArraySlice<int64> window_strides,
+    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding) {
+  return UnimplementedOp();
+}
+
+XlaOp XlaBuilder::ConvWithGeneralDimensions(
+    const XlaOp& lhs, const XlaOp& rhs,
+    tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
+    const ConvolutionDimensionNumbers& dimension_numbers) {
+  return UnimplementedOp();
+}
+
+XlaOp XlaBuilder::ConvGeneral(
+    const XlaOp& lhs, const XlaOp& rhs,
+    tensorflow::gtl::ArraySlice<int64> window_strides,
+    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+    const ConvolutionDimensionNumbers& dimension_numbers) {
+  return UnimplementedOp();
+}
+
+XlaOp XlaBuilder::ConvGeneralDilated(
+    const XlaOp& lhs, const XlaOp& rhs,
+    tensorflow::gtl::ArraySlice<int64> window_strides,
+    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+    tensorflow::gtl::ArraySlice<int64> lhs_dilation,
+    tensorflow::gtl::ArraySlice<int64> rhs_dilation,
+    const ConvolutionDimensionNumbers& dimension_numbers) {
+  return UnimplementedOp();
+}
+
+XlaOp XlaBuilder::Fft(const XlaOp& operand, const FftType fft_type,
+                      const tensorflow::gtl::ArraySlice<int64> fft_length) {
+  return UnimplementedOp();
+}
+
+XlaOp XlaBuilder::Infeed(const Shape& shape, const string& config) {
+  return UnimplementedOp();
+}
+
+void XlaBuilder::Outfeed(const XlaOp& operand, const Shape& shape_with_layout,
+                         const string& outfeed_config) {
+  UnimplementedOp();
+}
+
+XlaOp XlaBuilder::CustomCall(const string& call_target_name,
+                             tensorflow::gtl::ArraySlice<XlaOp> operands,
+                             const Shape& shape) {
+  return UnimplementedOp();
+}
+
+XlaOp XlaBuilder::HostCompute(tensorflow::gtl::ArraySlice<XlaOp> operands,
+                              const string& channel_name,
+                              int64 cost_estimate_ns, const Shape& shape) {
+  return UnimplementedOp();
+}
+
+XlaOp XlaBuilder::Complex(
+    const XlaOp& real, const XlaOp& imag,
+    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(HloOpcode::kComplex, real, imag, broadcast_dimensions);
+}
+
+XlaOp XlaBuilder::Conj(const XlaOp& operand) { return UnimplementedOp(); }
+
+XlaOp XlaBuilder::Sub(const XlaOp& lhs, const XlaOp& rhs,
+                      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(HloOpcode::kSubtract, lhs, rhs, broadcast_dimensions);
+}
+
+XlaOp XlaBuilder::Div(const XlaOp& lhs, const XlaOp& rhs,
+                      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(HloOpcode::kDivide, lhs, rhs, broadcast_dimensions);
+}
+
+XlaOp XlaBuilder::Rem(const XlaOp& lhs, const XlaOp& rhs,
+                      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(HloOpcode::kRemainder, lhs, rhs, broadcast_dimensions);
+}
+
+XlaOp XlaBuilder::Max(const XlaOp& lhs, const XlaOp& rhs,
+                      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(HloOpcode::kMaximum, lhs, rhs, broadcast_dimensions);
+}
+
+XlaOp XlaBuilder::Min(const XlaOp& lhs, const XlaOp& rhs,
+                      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(HloOpcode::kMinimum, lhs, rhs, broadcast_dimensions);
+}
+
+XlaOp XlaBuilder::And(const XlaOp& lhs, const XlaOp& rhs,
+                      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(HloOpcode::kAnd, lhs, rhs, broadcast_dimensions);
+}
+
+XlaOp XlaBuilder::Or(const XlaOp& lhs, const XlaOp& rhs,
+                     tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(HloOpcode::kOr, lhs, rhs, broadcast_dimensions);
+}
+
+// TODO(b/65209188): Create a dedicated lowering for Xor.
+XlaOp XlaBuilder::Xor(const XlaOp& lhs, const XlaOp& rhs,
+                      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return Or(And(Not(lhs), rhs, broadcast_dimensions),
+            And(lhs, Not(rhs), broadcast_dimensions));
+}
+
+XlaOp XlaBuilder::Not(const XlaOp& operand) {
+  return UnaryOp(HloOpcode::kNot, operand);
+}
+
+XlaOp XlaBuilder::ShiftLeft(
+    const XlaOp& lhs, const XlaOp& rhs,
+    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(HloOpcode::kShiftLeft, lhs, rhs, broadcast_dimensions);
+}
+
+XlaOp XlaBuilder::ShiftRightArithmetic(
+    const XlaOp& lhs, const XlaOp& rhs,
+    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(HloOpcode::kShiftRightArithmetic, lhs, rhs,
+                  broadcast_dimensions);
+}
+
+XlaOp XlaBuilder::ShiftRightLogical(
+    const XlaOp& lhs, const XlaOp& rhs,
+    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(HloOpcode::kShiftRightLogical, lhs, rhs,
+                  broadcast_dimensions);
+}
+
+XlaOp XlaBuilder::Abs(const XlaOp& operand) {
+  return UnaryOp(HloOpcode::kAbs, operand);
+}
+
+XlaOp XlaBuilder::Atan2(
+    const XlaOp& y, const XlaOp& x,
+    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(HloOpcode::kAtan2, y, x, broadcast_dimensions);
+}
+
+XlaOp XlaBuilder::Exp(const XlaOp& operand) {
+  return UnaryOp(HloOpcode::kExp, operand);
+}
+
+XlaOp XlaBuilder::Floor(const XlaOp& operand) {
+  return UnaryOp(HloOpcode::kFloor, operand);
+}
+
+XlaOp XlaBuilder::Ceil(const XlaOp& operand) {
+  return UnaryOp(HloOpcode::kCeil, operand);
+}
+
+XlaOp XlaBuilder::Round(const XlaOp& operand) {
+  return UnaryOp(HloOpcode::kRoundNearestAfz, operand);
+}
+
+XlaOp XlaBuilder::Log(const XlaOp& operand) {
+  return UnaryOp(HloOpcode::kLog, operand);
+}
+
+XlaOp XlaBuilder::Sign(const XlaOp& operand) {
+  return UnaryOp(HloOpcode::kSign, operand);
+}
+
+XlaOp XlaBuilder::Cos(const XlaOp& operand) {
+  return UnaryOp(HloOpcode::kCos, operand);
+}
+
+XlaOp XlaBuilder::Sin(const XlaOp& operand) {
+  return UnaryOp(HloOpcode::kSin, operand);
+}
+
+XlaOp XlaBuilder::Tanh(const XlaOp& operand) {
+  return UnaryOp(HloOpcode::kTanh, operand);
+}
+
+XlaOp XlaBuilder::Real(const XlaOp& operand) {
+  return UnaryOp(HloOpcode::kReal, operand);
+}
+
+XlaOp XlaBuilder::Imag(const XlaOp& operand) {
+  return UnaryOp(HloOpcode::kImag, operand);
+}
+
+XlaOp XlaBuilder::IsFinite(const XlaOp& operand) {
+  return UnaryOp(HloOpcode::kIsFinite, operand);
+}
+
+XlaOp XlaBuilder::Transpose(const XlaOp& operand,
+                            tensorflow::gtl::ArraySlice<int64> permutation) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, operand.GetShape());
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferTransposeShape(operand_shape, permutation));
+    for (int64 dim : permutation) {
+      instr.add_dimensions(dim);
+    }
+    return AddInstruction(std::move(instr), HloOpcode::kTranspose, {operand});
+  });
+}
+
+XlaOp XlaBuilder::Rev(const XlaOp& operand,
+                      tensorflow::gtl::ArraySlice<int64> dimensions) {
+  return UnimplementedOp();
+}
+
+XlaOp XlaBuilder::Sort(const XlaOp& operand) {
+  return UnaryOp(HloOpcode::kSort, operand);
+}
+
+XlaOp XlaBuilder::SqrtF32(const XlaOp& operand) {
+  return BinaryOp(HloOpcode::kPower, operand, ConstantR0<float>(0.5),
+                  /*broadcast_dimensions=*/{});
+}
+
+XlaOp XlaBuilder::Pow(const XlaOp& lhs, const XlaOp& rhs,
+                      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  return BinaryOp(HloOpcode::kPower, lhs, rhs, broadcast_dimensions);
+}
+
+XlaOp XlaBuilder::ConvertElementType(const XlaOp& operand,
+                                     PrimitiveType new_element_type) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferConvertShape(operand_shape, new_element_type));
+    return AddInstruction(std::move(instr), HloOpcode::kConvert, {operand});
+  });
+}
+
+XlaOp XlaBuilder::BitcastConvertType(const XlaOp& operand,
+                                     PrimitiveType new_element_type) {
+  return UnimplementedOp();
+}
+
+XlaOp XlaBuilder::SquareF32(const XlaOp& operand) {
+  return BinaryOp(HloOpcode::kPower, operand, ConstantR0<float>(2.0),
+                  /*broadcast_dimensions=*/{});
+}
+
+XlaOp XlaBuilder::ReciprocalF32(const XlaOp& operand) {
+  return BinaryOp(HloOpcode::kPower, operand, ConstantR0<float>(-1.0),
+                  /*broadcast_dimensions=*/{});
+}
+
+XlaOp XlaBuilder::Neg(const XlaOp& operand) {
+  return UnaryOp(HloOpcode::kNegate, operand);
+}
+
+XlaOp XlaBuilder::Clamp(const XlaOp& min, const XlaOp& operand,
+                        const XlaOp& max) {
+  return TernaryOp(HloOpcode::kClamp, min, operand, max);
+}
+
+XlaOp XlaBuilder::Map(tensorflow::gtl::ArraySlice<XlaOp> operands,
+                      const XlaComputation& computation,
+                      tensorflow::gtl::ArraySlice<int64> dimensions,
+                      tensorflow::gtl::ArraySlice<XlaOp> static_operands) {
+  return UnimplementedOp();
+}
+
+XlaOp XlaBuilder::RngOp(RandomDistribution distribution,
+                        tensorflow::gtl::ArraySlice<XlaOp> parameters,
+                        const Shape& shape) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    // Check the number of parameters per RNG distribution.
+    switch (distribution) {
+      case RandomDistribution::RNG_NORMAL:
+      case RandomDistribution::RNG_UNIFORM:
+        if (parameters.size() != 2) {
+          return InvalidArgument(
+              "RNG distribution (%s) expects 2 parameters, but got %ld",
+              RandomDistribution_Name(distribution).c_str(), parameters.size());
+        }
+        break;
+      default:
+        LOG(FATAL) << "unhandled distribution " << distribution;
+    }
+
+    TF_RETURN_IF_ERROR(ShapeUtil::ValidateShapeWithOptionalLayout(shape));
+    *instr.mutable_shape() = shape;
+
+    instr.set_distribution(distribution);
+
+    return AddInstruction(std::move(instr), HloOpcode::kRng, parameters);
+  });
+}
+
+XlaOp XlaBuilder::RngNormal(const XlaOp& mu, const XlaOp& sigma,
+                            const Shape& shape) {
+  return RngOp(RandomDistribution::RNG_NORMAL, {mu, sigma}, shape);
+}
+
+XlaOp XlaBuilder::RngUniform(const XlaOp& a, const XlaOp& b,
+                             const Shape& shape) {
+  return RngOp(RandomDistribution::RNG_UNIFORM, {a, b}, shape);
+}
+
+XlaOp XlaBuilder::While(const XlaComputation& condition,
+                        const XlaComputation& body, const XlaOp& init) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    // Infer shape.
+    TF_ASSIGN_OR_RETURN(const auto& body_program_shape, body.GetProgramShape());
+    TF_ASSIGN_OR_RETURN(const auto& condition_program_shape,
+                        condition.GetProgramShape());
+    TF_ASSIGN_OR_RETURN(const Shape& init_shape, GetShape(init));
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferWhileShape(condition_program_shape,
+                                        body_program_shape, init_shape));
+    // Body comes before condition computation in the vector.
+    AddCalledComputation(body, &instr);
+    AddCalledComputation(condition, &instr);
+    return AddInstruction(std::move(instr), HloOpcode::kWhile, {init});
+  });
+}
+
+XlaOp XlaBuilder::Gather(const XlaOp& input, const XlaOp& gather_indices,
+                         const GatherDimensionNumbers& dimension_numbers,
+                         tensorflow::gtl::ArraySlice<int64> window_bounds) {
+  return UnimplementedOp();
+}
+
+XlaOp XlaBuilder::Conditional(const XlaOp& predicate, const XlaOp& true_operand,
+                              const XlaComputation& true_computation,
+                              const XlaOp& false_operand,
+                              const XlaComputation& false_computation) {
+  return UnimplementedOp();
+}
+
+XlaOp XlaBuilder::Reduce(
+    const XlaOp& operand, const XlaOp& init_value,
+    const XlaComputation& computation,
+    tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(const Shape& init_shape, GetShape(init_value));
+    TF_ASSIGN_OR_RETURN(const ProgramShape& called_program_shape,
+                        computation.GetProgramShape());
+    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+                        ShapeInference::InferReduceShape(
+                            operand_shape, init_shape, dimensions_to_reduce,
+                            called_program_shape));
+
+    for (int64 dim : dimensions_to_reduce) {
+      instr.add_dimensions(dim);
+    }
+
+    AddCalledComputation(computation, &instr);
+
+    return AddInstruction(std::move(instr), HloOpcode::kReduce,
+                          {operand, init_value});
+  });
+}
+
+XlaOp XlaBuilder::ReduceAll(const XlaOp& operand, const XlaOp& init_value,
+                            const XlaComputation& computation) {
+  return UnimplementedOp();
+}
+
+XlaOp XlaBuilder::ReduceWindow(
+    const XlaOp& operand, const XlaOp& init_value,
+    const XlaComputation& computation,
+    tensorflow::gtl::ArraySlice<int64> window_dimensions,
+    tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding) {
+  return UnimplementedOp();
+}
+
+XlaOp XlaBuilder::ReduceWindowWithGeneralPadding(
+    const XlaOp& operand, const XlaOp& init_value,
+    const XlaComputation& computation,
+    tensorflow::gtl::ArraySlice<int64> window_dimensions,
+    tensorflow::gtl::ArraySlice<int64> window_strides,
+    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding) {
+  return UnimplementedOp();
+}
+
+XlaOp XlaBuilder::BatchNormTraining(const XlaOp& operand, const XlaOp& scale,
+                                    const XlaOp& offset, float epsilon,
+                                    int64 feature_index) {
+  return UnimplementedOp();
+}
+
+XlaOp XlaBuilder::BatchNormInference(const XlaOp& operand, const XlaOp& scale,
+                                     const XlaOp& offset, const XlaOp& mean,
+                                     const XlaOp& variance, float epsilon,
+                                     int64 feature_index) {
+  return UnimplementedOp();
+}
+
+XlaOp XlaBuilder::BatchNormGrad(const XlaOp& operand, const XlaOp& scale,
+                                const XlaOp& batch_mean, const XlaOp& batch_var,
+                                const XlaOp& grad_output, float epsilon,
+                                int64 feature_index) {
+  return UnimplementedOp();
+}
+
+XlaOp XlaBuilder::CrossReplicaSum(const XlaOp& operand) {
+  return UnimplementedOp();
+}
+
+XlaOp XlaBuilder::SelectAndScatter(
+    const XlaOp& operand, const XlaComputation& select,
+    tensorflow::gtl::ArraySlice<int64> window_dimensions,
+    tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
+    const XlaOp& source, const XlaOp& init_value,
+    const XlaComputation& scatter) {
+  return UnimplementedOp();
+}
+
+XlaOp XlaBuilder::SelectAndScatterWithGeneralPadding(
+    const XlaOp& operand, const XlaComputation& select,
+    tensorflow::gtl::ArraySlice<int64> window_dimensions,
+    tensorflow::gtl::ArraySlice<int64> window_strides,
+    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+    const XlaOp& source, const XlaOp& init_value,
+    const XlaComputation& scatter) {
+  return UnimplementedOp();
+}
+
+XlaOp XlaBuilder::ReducePrecision(const XlaOp& operand, const int exponent_bits,
+                                  const int mantissa_bits) {
+  return UnimplementedOp();
 }
 
-XlaOp XlaBuilder::AddInstruction(HloInstructionProto&& instr) {
+void XlaBuilder::Send(const XlaOp& operand, const ChannelHandle& handle) {
+  NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    // Send instruction produces a tuple of {aliased operand, U32 context}.
+    TF_ASSIGN_OR_RETURN(const Shape& shape, GetShape(operand));
+    *instr.mutable_shape() =
+        ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeShape(U32, {})});
+    instr.set_channel_id(handle.handle());
+    TF_ASSIGN_OR_RETURN(
+        XlaOp send,
+        AddInstruction(std::move(instr), HloOpcode::kSend, {operand}));
+
+    HloInstructionProto send_done_instr;
+    *send_done_instr.mutable_shape() = ShapeUtil::MakeNil();
+    send_done_instr.set_channel_id(handle.handle());
+    return AddInstruction(std::move(send_done_instr), HloOpcode::kSendDone,
+                          {send});
+  });
+}
+
+XlaOp XlaBuilder::Recv(const Shape& shape, const ChannelHandle& handle) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    // Recv instruction produces a tuple of {receive buffer, U32 context}.
+    *instr.mutable_shape() =
+        ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeShape(U32, {})});
+    instr.set_channel_id(handle.handle());
+    TF_ASSIGN_OR_RETURN(XlaOp recv,
+                        AddInstruction(std::move(instr), HloOpcode::kRecv, {}));
+
+    HloInstructionProto recv_done_instr;
+    *recv_done_instr.mutable_shape() = shape;
+    recv_done_instr.set_channel_id(handle.handle());
+    return AddInstruction(std::move(recv_done_instr), HloOpcode::kRecvDone,
+                          {recv});
+  });
+}
+
+StatusOr<bool> XlaBuilder::IsConstant(const XlaOp& operand,
+                                      int64 num_parameters) {
+  return Unimplemented("IsConstant is not implemented.");
+}
+
+StatusOr<std::unique_ptr<Literal>> XlaBuilder::ComputeConstant(
+    const XlaOp& operand, const Layout* output_layout,
+    tensorflow::gtl::ArraySlice<Literal> parameters) {
+  return Unimplemented("ComputeConstant is not implemented");
+}
+
+std::unique_ptr<XlaBuilder> XlaBuilder::CreateSubBuilder(
+    const string& computation_name) {
+  auto sub_builder = MakeUnique<XlaBuilder>(computation_name);
+  sub_builder->parent_builder_ = this;
+  sub_builder->die_immediately_on_error_ = this->die_immediately_on_error_;
+  return sub_builder;
+}
+
+Status XlaBuilder::SetReturnValue(const XlaOp& operand) {
+  return Unimplemented("SetReturnValue is not implemented.");
+}
+
+/* static */ ConvolutionDimensionNumbers
+XlaBuilder::CreateDefaultConvDimensionNumbers(int num_spatial_dims) {
+  ConvolutionDimensionNumbers dimension_numbers;
+  dimension_numbers.set_input_batch_dimension(kConvBatchDimension);
+  dimension_numbers.set_input_feature_dimension(kConvFeatureDimension);
+  dimension_numbers.set_output_batch_dimension(kConvBatchDimension);
+  dimension_numbers.set_output_feature_dimension(kConvFeatureDimension);
+  dimension_numbers.set_kernel_output_feature_dimension(
+      kConvKernelOutputDimension);
+  dimension_numbers.set_kernel_input_feature_dimension(
+      kConvKernelInputDimension);
+  for (int i = 0; i < num_spatial_dims; ++i) {
+    dimension_numbers.add_input_spatial_dimensions(i + 2);
+    dimension_numbers.add_kernel_spatial_dimensions(i + 2);
+    dimension_numbers.add_output_spatial_dimensions(i + 2);
+  }
+  return dimension_numbers;
+}
+
+/* static */ Status XlaBuilder::Validate(
+    const ConvolutionDimensionNumbers& dnum) {
+  if (dnum.input_spatial_dimensions_size() < 2) {
+    return FailedPrecondition("input spacial dimension < 2: %d",
+                              dnum.input_spatial_dimensions_size());
+  }
+  if (dnum.kernel_spatial_dimensions_size() < 2) {
+    return FailedPrecondition("kernel spacial dimension < 2: %d",
+                              dnum.kernel_spatial_dimensions_size());
+  }
+  if (dnum.output_spatial_dimensions_size() < 2) {
+    return FailedPrecondition("output spacial dimension < 2: %d",
+                              dnum.output_spatial_dimensions_size());
+  }
+
+  if (std::set<int64>(
+          {dnum.input_batch_dimension(), dnum.input_feature_dimension(),
+           dnum.input_spatial_dimensions(0), dnum.input_spatial_dimensions(1)})
+          .size() != 4) {
+    return FailedPrecondition(
+        "dimension numbers for the input are not unique: (%lld, %lld, %lld, "
+        "%lld)",
+        dnum.input_batch_dimension(), dnum.input_feature_dimension(),
+        dnum.input_spatial_dimensions(0), dnum.input_spatial_dimensions(1));
+  }
+  if (std::set<int64>({dnum.kernel_output_feature_dimension(),
+                       dnum.kernel_input_feature_dimension(),
+                       dnum.kernel_spatial_dimensions(0),
+                       dnum.kernel_spatial_dimensions(1)})
+          .size() != 4) {
+    return FailedPrecondition(
+        "dimension numbers for the weight are not unique: (%lld, %lld, %lld, "
+        "%lld)",
+        dnum.kernel_output_feature_dimension(),
+        dnum.kernel_input_feature_dimension(),
+        dnum.kernel_spatial_dimensions(0), dnum.kernel_spatial_dimensions(1));
+  }
+  if (std::set<int64>({dnum.output_batch_dimension(),
+                       dnum.output_feature_dimension(),
+                       dnum.output_spatial_dimensions(0),
+                       dnum.output_spatial_dimensions(1)})
+          .size() != 4) {
+    return FailedPrecondition(
+        "dimension numbers for the output are not unique: (%lld, %lld, %lld, "
+        "%lld)",
+        dnum.output_batch_dimension(), dnum.output_feature_dimension(),
+        dnum.output_spatial_dimensions(0), dnum.output_spatial_dimensions(1));
+  }
+  return Status::OK();
+}
+
+StatusOr<XlaOp> XlaBuilder::AddInstruction(
+    HloInstructionProto&& instr, HloOpcode opcode,
+    tensorflow::gtl::ArraySlice<XlaOp> operands) {
+  TF_RETURN_IF_ERROR(first_error_);
+
   const int64 handle = instructions_.size();
+  instr.set_id(handle);
+  instr.set_opcode(HloOpcodeString(opcode));
   if (instr.name().empty()) {
-    instr.set_name(StrCat(instr.opcode(), ".", handle));
+    instr.set_name(StrCat(instr.opcode(), ".", unique_id_, ".", handle));
   } else {
     // Append the handle to make sure the name is unique.
-    instr.set_name(StrCat(instr.name(), ".", handle));
+    instr.set_name(StrCat(instr.name(), ".", unique_id_, ".", handle));
+  }
+  for (const auto& operand : operands) {
+    if (operand.builder_ == nullptr) {
+      return InvalidArgument("invalid XlaOp with handle %lld",
+                             operand.handle());
+    }
+    if (operand.builder_ != this) {
+      return InvalidArgument("Do not add XlaOp from builder %s to builder %s",
+                             operand.builder_->name().c_str(),
+                             this->name().c_str());
+    }
+    instr.add_operand_ids(operand.handle());
   }
+
+  *instr.mutable_metadata() = metadata_;
+  if (sharding_) {
+    *instr.mutable_sharding() = *sharding_;
+  }
+
   instructions_.push_back(instr);
 
   XlaOp op(handle, this);
   return op;
 }
 
+void XlaBuilder::AddCalledComputation(const XlaComputation& computation,
+                                      HloInstructionProto* instr) {
+  instr->add_called_computation_ids(computation.proto().entry_computation_id());
+  for (const HloComputationProto& e : computation.proto().computations()) {
+    embedded_.insert({e.id(), e});
+  }
+}
+
 StatusOr<const HloInstructionProto*> XlaBuilder::LookUpInstruction(
     const XlaOp& op) const {
+  TF_RETURN_IF_ERROR(first_error_);
+
+  if (op.builder_ != this) {
+    return InvalidArgument("invalid XlaOp with handle %lld", op.handle());
+  }
+
   TF_RET_CHECK(op.builder_ == this);
   if (op.handle() >= instructions_.size() || op.handle() < 0) {
     return InvalidArgument("no XlaOp value %lld", op.handle());
@@ -250,4 +1321,9 @@ StatusOr<const HloInstructionProto*> XlaBuilder::LookUpInstruction(
   return &instructions_[op.handle()];
 }
 
+XlaOp XlaBuilder::UnimplementedOp() {
+  NoteError(Unimplemented("Op not implemented"));
+  return {};
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.h b/tensorflow/compiler/xla/client/xla_client/xla_builder.h
index 7632bd289d792ef487fb667de3cea335e06778bf..0673b86646eeecae45b1076baf0002ed94242846 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.h
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.h
@@ -24,8 +24,11 @@ limitations under the License.
 #include <string>
 #include <utility>
 
+#include "tensorflow/compiler/xla/client/padding.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -49,10 +52,11 @@ class XlaBuilder;
 // TODO(b/74197823): Replace xla::ComputationDataHandle with this one.
 class XlaOp {
  public:
+  XlaOp() : handle_(0), builder_(nullptr) {}
+
   StatusOr<Shape> GetShape() const;
 
  private:
-  XlaOp() : handle_(0), builder_(nullptr) {}
   XlaOp(int64 handle, XlaBuilder* builder)
       : handle_(handle), builder_(builder) {}
 
@@ -63,38 +67,6 @@ class XlaOp {
   XlaBuilder* builder_;  // Not owned.
 };
 
-// The computation graph that the user builds up with the XlaBuilder.
-//
-// TODO(b/74197823): Replace xla::Computation with this one.
-class XlaComputation {
- public:
-  XlaComputation(const XlaComputation&) = delete;
-  XlaComputation& operator=(const XlaComputation&) = delete;
-
-  XlaComputation(XlaComputation&& from) { *this = std::move(from); }
-
-  XlaComputation& operator=(XlaComputation&& from) {
-    proto_ = std::move(from.proto());
-    unique_id_ = from.unique_id_;
-    return *this;
-  }
-
-  // Returns the "program shape" (parameter and return shapes) for this
-  // computation.
-  const ProgramShape& GetProgramShape() const { return proto_.program_shape(); }
-
-  const HloModuleProto& proto() const { return proto_; }
-
- private:
-  // Creates a null Computation.
-  XlaComputation(const int64 unique_id) : unique_id_(unique_id) {}
-  HloModuleProto* mutable_proto() { return &proto_; }
-  friend class XlaBuilder;
-
-  int64 unique_id_;
-  HloModuleProto proto_;
-};
-
 // A convenient interface for building up computations.
 //
 // Thread-compatible.
@@ -113,6 +85,29 @@ class XlaBuilder {
   // Returns the computation name.
   const string& name() const { return name_; }
 
+  // Sets OpMetadata that will be added to all instructions until cleared.
+  //
+  // OpMetadata is often applied to a series of XLA HLO instructions. As a
+  // result, OpMetadata is set on the Computation Builder. All subsequent
+  // instructions generated via this Computation Builder will have the same
+  // OpMetadata attached until a call to ClearOpMetadata.
+  void SetOpMetadata(const OpMetadata& metadata) { metadata_ = metadata; }
+
+  // Clears the HloMetadata state.
+  void ClearOpMetadata() { metadata_.Clear(); }
+
+  // Sets an OpSharding that will be attached to all instructions until cleared.
+  void SetSharding(const OpSharding& sharding) { sharding_ = sharding; }
+
+  // Clears the sharding. Ops will be sharded according to the default placement
+  // policy.
+  void ClearSharding() { sharding_ = tensorflow::gtl::nullopt; }
+
+  // Returns the OpSharding that will be attached to all instructions.
+  const tensorflow::gtl::optional<OpSharding>& sharding() const {
+    return sharding_;
+  }
+
   // Sets the builder to a mode where it will die immediately when an error is
   // encountered, rather than producing it in a deferred fashion when Build() is
   // called (which is the default).
@@ -120,14 +115,6 @@ class XlaBuilder {
     die_immediately_on_error_ = enabled;
   }
 
-  // Enqueues an add instruction onto the computation.
-  XlaOp Add(const XlaOp& lhs, const XlaOp& rhs,
-            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-  // Enqueues a call instruction onto the computation.
-  XlaOp Call(const XlaComputation& computation,
-             tensorflow::gtl::ArraySlice<XlaOp> operands);
-
   // Enqueues a "retrieve parameter value" instruction for a parameter that was
   // passed to the computation.
   XlaOp Parameter(int64 parameter_number, const Shape& shape,
@@ -155,16 +142,669 @@ class XlaBuilder {
   // corresponding native type yet.
   template <typename NativeT>
   XlaOp ConstantR0(NativeT value);
+  template <typename NativeT>
+  XlaOp ConstantR1(tensorflow::gtl::ArraySlice<NativeT> values);
+  XlaOp ConstantR1(const tensorflow::core::Bitmap& values);
+  template <typename NativeT>
+  XlaOp ConstantR2(
+      std::initializer_list<std::initializer_list<NativeT>> values);
+  template <typename NativeT>
+  XlaOp ConstantFromArrayWithLayout(const Array<NativeT>& values,
+                                    const Layout& layout);
+  template <typename NativeT>
+  XlaOp ConstantFromArray(const Array<NativeT>& values);
+  template <typename NativeT>
+  XlaOp ConstantR2FromArray2DWithLayout(const Array2D<NativeT>& values,
+                                        const Layout& layout);
+  template <typename NativeT>
+  XlaOp ConstantR2FromArray2D(const Array2D<NativeT>& values);
+  template <typename NativeT>
+  XlaOp ConstantR3FromArray3DWithLayout(const Array3D<NativeT>& values,
+                                        const Layout& layout);
+  template <typename NativeT>
+  XlaOp ConstantR3FromArray3D(const Array3D<NativeT>& values);
+  template <typename NativeT>
+  XlaOp ConstantR4FromArray4DWithLayout(const Array4D<NativeT>& values,
+                                        const Layout& layout);
+  template <typename NativeT>
+  XlaOp ConstantR4FromArray4D(const Array4D<NativeT>& values);
 
-  // Returns the shape of the given op.
-  StatusOr<std::unique_ptr<Shape>> GetShape(const XlaOp& op) const;
+  // Enqueues a rank one constant (vector) onto the computation. The vector has
+  // size 'length' and every element has the value 'value'.
+  template <typename NativeT>
+  XlaOp ConstantR1(int64 length, NativeT value);
+
+  // Adds dimensions to an array by duplicating the data in the array.
+  //
+  // The new dimensions are inserted on the left, i.e. if
+  // broadcast_sizes has values {a0, ..., aN} and the operand shape
+  // has dimensions {b0, ..., bM} then the shape of the output has
+  // dimensions {a0, ..., aN, b0, ..., bM}.
+  //
+  // The new dimensions index into copies of the operand, i.e.
+  //
+  //   output[i0, ..., iN, j0, ..., jM] = operand[j0, ..., jM]
+  XlaOp Broadcast(const XlaOp& operand,
+                  tensorflow::gtl::ArraySlice<int64> broadcast_sizes);
+
+  // Enqueues a pad operation onto the computation that pads the given value on
+  // the edges as well as between the elements of the input. padding_config
+  // specifies the padding amount for each dimension.
+  XlaOp Pad(const XlaOp& operand, const XlaOp& padding_value,
+            const PaddingConfig& padding_config);
+
+  // Enqueues an operation onto the computation that flattens the operand based
+  // on the dimension order (major/slowest-varying to minor/fastest-varying)
+  // given, followed by reshaping it into the shape with the given dimension
+  // sizes (also major to minor). Conceptually, this is a limited form of
+  // "shape casting".
+  XlaOp Reshape(const XlaOp& operand,
+                tensorflow::gtl::ArraySlice<int64> dimensions,
+                tensorflow::gtl::ArraySlice<int64> new_sizes);
+
+  // Enqueues an operation onto the computation that collapses the operand, from
+  // first to last dimension (C order), then reshapes it to the given dimension
+  // sizes. Conceptually, this is a limited form of "shape casting".
+  XlaOp Reshape(const XlaOp& operand,
+                tensorflow::gtl::ArraySlice<int64> new_sizes);
+
+  // Wrapper for Reshape.
+  // Enqueues an operation to collapse the provided dimensions; e.g. an
+  // operand with dimensions {x=256, y=2, z=2, p=32} can be collapsed to
+  // {x=1024, y=32} by collapsing dims {0, 1, 2}. Collapsing dimensions must
+  // be a consecutive, in-order subsequence of the operand dimensions.
+  //
+  // Note that collapsing a single dimension does nothing:
+  //
+  //    {256} collapsing {0} => {256}
+  //    {1} collapsing {0} => {1}
+  //
+  // Collapsing multiple dimensions produces a single result dimension:
+  //
+  //    {256, 2} collapsing {0,1} => {512}
+  //    {256, 2, 3} collapsing {0,1} => {512, 3}
+  //
+  // This could potentially cause data to be moved -- it provides a more
+  // structured form of reshaping than an arbitrary Reshape operation.
+  XlaOp Collapse(const XlaOp& operand,
+                 tensorflow::gtl::ArraySlice<int64> dimensions);
+
+  // Enqueues a slice operation onto the computation that slices the operand
+  // from the start indices to the limit indices; e.g.
+  //
+  //        x
+  //   [ 0 1 2 3 ]
+  // y [ 4 5 6 7 ] => slice(start={1, 1}, limit={2, 3}) => [ 5 6 ]
+  //   [ 8 9 a b ]
+  //
+  // Note that "limit" means up-to-but-not-including; i.e. [start, limit) in 1D
+  // range notation.
+  // The strides parameter determines the stride over the slice
+  XlaOp Slice(const XlaOp& operand,
+              tensorflow::gtl::ArraySlice<int64> start_indices,
+              tensorflow::gtl::ArraySlice<int64> limit_indices,
+              tensorflow::gtl::ArraySlice<int64> strides);
+
+  // Enqueues a slice operation in a given dimension, taking all other
+  // dimensions as they are; e.g. if dimno is 1 from start_index 2 to
+  // limit_index 4 by 1, and the shape is f32[7,8,9], this call is short-hand
+  // for:
+  //
+  //  array[:, 2:4:1, :]
+  XlaOp SliceInDim(const XlaOp& operand, int64 start_index, int64 limit_index,
+                   int64 stride, int64 dimno);
+
+  // Enqueues a slice operation onto the computation that slices the 'operand'
+  // from dynamic start indices which are passed in 'start_indices'.
+  // The size of the slice in each dimension is passed in 'slice_sizes',
+  // which specify the end point of exclusive slice intervals in each
+  // dimension [start, start + size).
+  // The shape of 'start_indices' must be rank == 1, with dimension size
+  // equal to the rank of the 'operand'.
+  // Slice index calculations are computed modulo input dimension sizes to
+  // prevent dynamic start indices from generating out-of-bound array accesses.
+  XlaOp DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
+                     tensorflow::gtl::ArraySlice<int64> slice_sizes);
+
+  // Enqueues a dynamic update slice operation onto the computation, which
+  // updates a slice of 'operand' with 'update' at dynamic 'start_indices'.
+  // The shape of 'update' determines the shape of the slice of 'operand'
+  // which is updated.
+  // The indices specified in 'start_indices' specify the offset of the slice
+  // of 'operand' which is updated.
+  //
+  //               update = {10, 11} // calculated at runtime.
+  //   [1 2 3]     start  = {1, 1}   // calculated at runtime.  [1 2  3 ]
+  //   [4 5 6]  => DynamicUpdateslice(data, update, start)   => [4 10 11]
+  //   [7 8 9]                                                  [7 8  9 ]
+  //
+  // The shape of 'start_indices' must be rank == 1, with dimension size
+  // equal to the rank of the 'operand'.
+  // Slice index calculations are computed modulo update dimension sizes to
+  // prevent dynamic start indices from generating out-of-bound array accesses.
+  XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
+                           const XlaOp& start_indices);
+
+  // Enqueues a concatenate instruction onto the computation. 'operands' must
+  // have >= 1 entry.
+  XlaOp ConcatInDim(tensorflow::gtl::ArraySlice<XlaOp> operands,
+                    int64 dimension);
+
+  // Enqueue a tracing operation onto the computation; the computation will emit
+  // a logging message with the operand.
+  void Trace(const string& tag, const XlaOp& operand);
+
+  // Enqueues a conditional-move-like select operation onto the computation;
+  // predicated on pred, selects between on_true and on_false.
+  XlaOp Select(const XlaOp& pred, const XlaOp& on_true, const XlaOp& on_false);
+
+  // Enqueues a tuple-creation instruction onto the computation.
+  XlaOp Tuple(tensorflow::gtl::ArraySlice<XlaOp> elements);
+
+  // Enqueues a tuple-element-get instruction onto the computation.
+  XlaOp GetTupleElement(const XlaOp& tuple_data, int64 index);
+
+  // Enqueues an equal-to comparison instruction onto the computation.
+  XlaOp Eq(const XlaOp& lhs, const XlaOp& rhs,
+           tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues a not-equal comparison instruction onto the computation.
+  XlaOp Ne(const XlaOp& lhs, const XlaOp& rhs,
+           tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues a greater-or-equal comparison instruction onto the computation.
+  XlaOp Ge(const XlaOp& lhs, const XlaOp& rhs,
+           tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues a greater-than comparison instruction onto the computation.
+  XlaOp Gt(const XlaOp& lhs, const XlaOp& rhs,
+           tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues a less-than comparison instruction onto the computation.
+  XlaOp Lt(const XlaOp& lhs, const XlaOp& rhs,
+           tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues a less-or-equal comparison instruction onto the computation.
+  XlaOp Le(const XlaOp& lhs, const XlaOp& rhs,
+           tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues a dot instruction onto the computation.
+  XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs);
+
+  // Enqueues a general dot instruction onto the computation.
+  XlaOp DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
+                   const DotDimensionNumbers& dimension_numbers);
+
+  // Default dimension numbers used for a 2D convolution.
+  static constexpr int64 kConvBatchDimension = 0;
+  static constexpr int64 kConvFeatureDimension = 1;
+  static constexpr int64 kConvFirstSpatialDimension = 2;
+  static constexpr int64 kConvSecondSpatialDimension = 3;
+  static constexpr int64 kConvKernelOutputDimension = 0;
+  static constexpr int64 kConvKernelInputDimension = 1;
+  static constexpr int64 kConvKernelFirstSpatialDimension = 2;
+  static constexpr int64 kConvKernelSecondSpatialDimension = 3;
+
+  // Creates a default ConvolutionDimensionNumbers. For a 2D convolution, for
+  // the input operand {batch, feature, height, width} = {0, 1, 2, 3} and for
+  // the kernel operand
+  // {output_feature, input_feature, height, width} = {0, 1, 2, 3}.
+  static ConvolutionDimensionNumbers CreateDefaultConvDimensionNumbers(
+      int num_spatial_dims = 2);
+
+  // Returns an error if the convolution dimension numbers have conflicts.
+  static Status Validate(const ConvolutionDimensionNumbers& dnum);
+
+  // Enqueues a convolution instruction onto the computation, which uses the
+  // default convolution dimension numbers.
+  XlaOp Conv(const XlaOp& lhs, const XlaOp& rhs,
+             tensorflow::gtl::ArraySlice<int64> window_strides,
+             Padding padding);
+
+  // Enqueues a convolution instruction onto the computation, with the caller
+  // provided padding configuration in the format returned by MakePadding().
+  XlaOp ConvWithGeneralPadding(
+      const XlaOp& lhs, const XlaOp& rhs,
+      tensorflow::gtl::ArraySlice<int64> window_strides,
+      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding);
+
+  // Enqueues a convolution instruction onto the computation, with the caller
+  // provided dimension numbers configuration.
+  XlaOp ConvWithGeneralDimensions(
+      const XlaOp& lhs, const XlaOp& rhs,
+      tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
+      const ConvolutionDimensionNumbers& dimension_numbers);
+
+  // Enqueues a convolution instruction onto the computation, with the caller
+  // provided padding configuration as well as the dimension numbers.
+  XlaOp ConvGeneral(
+      const XlaOp& lhs, const XlaOp& rhs,
+      tensorflow::gtl::ArraySlice<int64> window_strides,
+      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+      const ConvolutionDimensionNumbers& dimension_numbers);
+
+  // Enqueues a convolution instruction onto the computation, with the caller
+  // provided padding configuration, dilation factors and dimension numbers.
+  XlaOp ConvGeneralDilated(
+      const XlaOp& lhs, const XlaOp& rhs,
+      tensorflow::gtl::ArraySlice<int64> window_strides,
+      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+      tensorflow::gtl::ArraySlice<int64> lhs_dilation,
+      tensorflow::gtl::ArraySlice<int64> rhs_dilation,
+      const ConvolutionDimensionNumbers& dimension_numbers);
+
+  // Enqueues an FFT instruction onto the computation, of the given type and
+  // with the given FFT length.
+  XlaOp Fft(const XlaOp& operand, FftType fft_type,
+            tensorflow::gtl::ArraySlice<int64> fft_length);
+
+  // Enqueues an infeed instruction onto the computation, which writes data of
+  // the given shape to the infeed buffer of the device.
+  XlaOp Infeed(const Shape& shape, const string& config = "");
+
+  // Enqueues an outfeed instruction onto the computation. This instruction
+  // generates outgoing data transfers for the given data.
+  //
+  // shape_with_layout communicates the laid out shape that we want to outfeed
+  // -- if !ShapeUtil::Compatible(GetShape(operand), shape_with_layout) an error
+  // will occur.
+  void Outfeed(const XlaOp& operand, const Shape& shape_with_layout,
+               const string& outfeed_config);
+
+  // Enqueues a call instruction onto the computation.
+  XlaOp Call(const XlaComputation& computation,
+             tensorflow::gtl::ArraySlice<XlaOp> operands);
+
+  // Enqueues a custom call instruction onto the computation.
+  // During code generation, a call instruction is emitted which targets a
+  // symbol with the name |call_target_name|.  The |operands| are passed to the
+  // call instruction.  |shape| is the resultant shape.
+  XlaOp CustomCall(const string& call_target_name,
+                   tensorflow::gtl::ArraySlice<XlaOp> operands,
+                   const Shape& shape);
+
+  // Enqueues a pseudo-op to represent host-side computation data-dependencies.
+  // During code generation, host send and receive operations will be generated
+  // to transfer |operands| to the host and a single result of |shape| back to
+  // the device.  Host send/recv operations are emitted using |channel_name|.
+  // Dataflow dependencies and the |cost_estimate_ns| field may be used in HLO
+  // instruction scheduling.
+  XlaOp HostCompute(tensorflow::gtl::ArraySlice<XlaOp> operands,
+                    const string& channel_name, int64 cost_estimate_ns,
+                    const Shape& shape);
+
+  // The following methods enqueue element-wise binary arithmetic operations
+  // onto the computation. The shapes of the operands have to match unless one
+  // of the operands is a scalar, or an explicit broadcast dimension is given
+  // (see g3doc for more details).
+
+  // Enqueues a complex compose instruction onto the computation.
+  XlaOp Complex(const XlaOp& real, const XlaOp& imag,
+                tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues a complex conjugate instruction onto the computation.
+  XlaOp Conj(const XlaOp& operand);
+
+  // Enqueues an add instruction onto the computation.
+  XlaOp Add(const XlaOp& lhs, const XlaOp& rhs,
+            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues a subtract instruction onto the computation.
+  XlaOp Sub(const XlaOp& lhs, const XlaOp& rhs,
+            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues a multiply instruction onto the computation.
+  XlaOp Mul(const XlaOp& lhs, const XlaOp& rhs,
+            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues a divide instruction onto the computation.
+  XlaOp Div(const XlaOp& lhs, const XlaOp& rhs,
+            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues a remainder instruction onto the computation.
+  XlaOp Rem(const XlaOp& lhs, const XlaOp& rhs,
+            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues a max instruction onto the computation.
+  XlaOp Max(const XlaOp& lhs, const XlaOp& rhs,
+            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues a min instruction onto the computation.
+  XlaOp Min(const XlaOp& lhs, const XlaOp& rhs,
+            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Element-wise logical operators
+  XlaOp And(const XlaOp& lhs, const XlaOp& rhs,
+            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  XlaOp Or(const XlaOp& lhs, const XlaOp& rhs,
+           tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  XlaOp Xor(const XlaOp& lhs, const XlaOp& rhs,
+            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  XlaOp Not(const XlaOp& operand);
+
+  XlaOp ShiftLeft(const XlaOp& lhs, const XlaOp& rhs,
+                  tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+  XlaOp ShiftRightArithmetic(
+      const XlaOp& lhs, const XlaOp& rhs,
+      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+  XlaOp ShiftRightLogical(
+      const XlaOp& lhs, const XlaOp& rhs,
+      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Reduces an array among the provided dimensions, given "computation" as a
+  // reduction operator.
+  XlaOp Reduce(const XlaOp& operand, const XlaOp& init_value,
+               const XlaComputation& computation,
+               tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce);
+
+  // Convenience wrapper around the above that reduces all the dimensions in the
+  // operand shape.
+  XlaOp ReduceAll(const XlaOp& operand, const XlaOp& init_value,
+                  const XlaComputation& computation);
+
+  // Enqueues a windowed reduce instruction onto the computation.
+  XlaOp ReduceWindow(const XlaOp& operand, const XlaOp& init_value,
+                     const XlaComputation& computation,
+                     tensorflow::gtl::ArraySlice<int64> window_dimensions,
+                     tensorflow::gtl::ArraySlice<int64> window_strides,
+                     Padding padding);
+
+  // As ReduceWindow(), but the padding is given in the format
+  // returned by MakePadding().
+  XlaOp ReduceWindowWithGeneralPadding(
+      const XlaOp& operand, const XlaOp& init_value,
+      const XlaComputation& computation,
+      tensorflow::gtl::ArraySlice<int64> window_dimensions,
+      tensorflow::gtl::ArraySlice<int64> window_strides,
+      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding);
+
+  // Returns the sum of the operand value across all replicas. All replicas
+  // supply one input to the sum and all replicas receive the resulting sum.
+  XlaOp CrossReplicaSum(const XlaOp& operand);
+
+  // Enqueues an operation that scatters the `source` array to the selected
+  // indices of each window.
+  XlaOp SelectAndScatter(const XlaOp& operand, const XlaComputation& select,
+                         tensorflow::gtl::ArraySlice<int64> window_dimensions,
+                         tensorflow::gtl::ArraySlice<int64> window_strides,
+                         Padding padding, const XlaOp& source,
+                         const XlaOp& init_value,
+                         const XlaComputation& scatter);
+
+  // As SelectAndScatter(), but the padding is given in the format
+  // returned by MakePadding().
+  XlaOp SelectAndScatterWithGeneralPadding(
+      const XlaOp& operand, const XlaComputation& select,
+      tensorflow::gtl::ArraySlice<int64> window_dimensions,
+      tensorflow::gtl::ArraySlice<int64> window_strides,
+      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+      const XlaOp& source, const XlaOp& init_value,
+      const XlaComputation& scatter);
+
+  // Enqueues an abs instruction onto the computation.
+  XlaOp Abs(const XlaOp& operand);
+
+  // Enqueues a atan2 instruction onto the computation.
+  XlaOp Atan2(const XlaOp& y, const XlaOp& x,
+              tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues an exp instruction onto the computation.
+  XlaOp Exp(const XlaOp& operand);
+
+  // Enqueues a floor instruction onto the computation.
+  XlaOp Floor(const XlaOp& operand);
+
+  // Enqueues a ceil instruction onto the computation.
+  XlaOp Ceil(const XlaOp& operand);
+
+  // Enqueues a round instruction onto the computation, rounding to nearest even
+  // with half-way cases rounding away from zero.
+  XlaOp Round(const XlaOp& operand);
+
+  // Enqueues an log instruction (natural logarithm) onto the computation.
+  XlaOp Log(const XlaOp& operand);
+
+  // Enqueues a sign instruction onto the computation.
+  XlaOp Sign(const XlaOp& operand);
+
+  // Enqueues a cosine instruction onto the computation.
+  XlaOp Cos(const XlaOp& operand);
+
+  // Enqueues a sine instruction onto the computation.
+  XlaOp Sin(const XlaOp& operand);
+
+  // Enqueues a tanh instruction onto the computation.
+  XlaOp Tanh(const XlaOp& operand);
+
+  // Enqueues a real-part instruction onto the computation.
+  XlaOp Real(const XlaOp& operand);
+
+  // Enqueues an imaginary-part instruction onto the computation.
+  XlaOp Imag(const XlaOp& operand);
+
+  // Enqueues a float32 sqrt instruction onto the computation.
+  // (float32 is specified as there is an implicit float32 0.5f constant
+  // exponent).
+  XlaOp SqrtF32(const XlaOp& operand);
+
+  // Enqueues a float32 square instruction onto the computation.
+  // (float32 is specified as there is an implicit float32 2.0f constant
+  // exponent).
+  XlaOp SquareF32(const XlaOp& operand);
+
+  // Enqueues a lhs^rhs computation onto the computation.
+  XlaOp Pow(const XlaOp& lhs, const XlaOp& rhs,
+            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues an operator that tests if the operand's values are finite, i.e.,
+  // not Inf or NaN. Defined only for floating-point types. Returns an array of
+  // booleans with the same shape where entries are true iff the corresponding
+  // entry was NaN.
+  XlaOp IsFinite(const XlaOp& operand);
+
+  // Enqueues a convert instruction onto the computation that changes the
+  // element type of the operand array to primitive_type.
+  XlaOp ConvertElementType(const XlaOp& operand,
+                           PrimitiveType new_element_type);
+
+  // Enqueues a no-op instruction onto the computation that changes
+  // the element type of the operand array to primitive_type. The
+  // bit-widths of the source and destination element types must be
+  // identical.
+  XlaOp BitcastConvertType(const XlaOp& operand,
+                           PrimitiveType new_element_type);
+
+  // Enqueues a float32 reciprocal instruction onto the computation.
+  // (float32 is specified as there is an implicit float32 -1.0f constant
+  // exponent).
+  //
+  // TODO(b/34468990) axe F32 suffix, can be determined by reflecting on the
+  // shape of the operand.
+  XlaOp ReciprocalF32(const XlaOp& operand);
+
+  // Enqueues a negate instruction onto the computation.
+  XlaOp Neg(const XlaOp& operand);
+
+  // Enqueues a transpose instruction onto the computation.
+  XlaOp Transpose(const XlaOp& operand,
+                  tensorflow::gtl::ArraySlice<int64> permutation);
+
+  // Enqueues a reverse instruction onto the computation. The order of the
+  // elements in the given dimensions is reversed (i.e., the element at index i
+  // is moved to index dimension_size - 1 - i).
+  XlaOp Rev(const XlaOp& operand,
+            tensorflow::gtl::ArraySlice<int64> dimensions);
+
+  // Enqueues a sort (as increasing order) instruction onto the computation.
+  XlaOp Sort(const XlaOp& operand);
+
+  // Enqueues a clamp instruction onto the computation.
+  XlaOp Clamp(const XlaOp& min, const XlaOp& operand, const XlaOp& max);
+
+  // Enqueues a map instruction onto the computation.
+  XlaOp Map(tensorflow::gtl::ArraySlice<XlaOp> operands,
+            const XlaComputation& computation,
+            tensorflow::gtl::ArraySlice<int64> dimensions,
+            tensorflow::gtl::ArraySlice<XlaOp> static_operands = {});
+
+  // Enqueues a N(mu, sigma) random number generation instruction onto the
+  // computation.
+  XlaOp RngNormal(const XlaOp& mu, const XlaOp& sigma, const Shape& shape);
+
+  // Enqueues a U(a, b) random number generation instruction onto the
+  // computation. Returns values in the semi-open interval [a, b).
+  XlaOp RngUniform(const XlaOp& a, const XlaOp& b, const Shape& shape);
+
+  // Enqueues a while node onto the computation.
+  XlaOp While(const XlaComputation& condition, const XlaComputation& body,
+              const XlaOp& init);
+
+  // Enqueues a conditional node onto the computation.
+  XlaOp Conditional(const XlaOp& predicate, const XlaOp& true_operand,
+                    const XlaComputation& true_computation,
+                    const XlaOp& false_operand,
+                    const XlaComputation& false_computation);
+
+  // Enqueues a ReducePrecision node onto the computation.
+  XlaOp ReducePrecision(const XlaOp& operand, const int exponent_bits,
+                        const int mantissa_bits);
+
+  // Enqueues a Gather node onto the computation.
+  XlaOp Gather(const XlaOp& input, const XlaOp& gather_indices,
+               const GatherDimensionNumbers& dimension_numbers,
+               tensorflow::gtl::ArraySlice<int64> window_bounds);
+
+  // Enqueues a Send node onto the computation, to send the given operand to
+  // a Recv instruction that shares the same channel handle.
+  void Send(const XlaOp& operand, const ChannelHandle& handle);
+
+  // Enqueues a Recv node onto the computation. The data comes from a Send
+  // instruction that shares the same channel handle and its shape must
+  // be the same as the given shape.
+  XlaOp Recv(const Shape& shape, const ChannelHandle& handle);
+
+  // Returns true if 'operand' is a compile-time constant. A compile-time
+  // constant does not depend on parameters with index greater than or equal to
+  // `num_parameters`, or on stateful operators such as `RngNormal` or `Infeed`.
+  // Unlike `ComputeConstant`, `IsConstant` tests whether a computation is a
+  // compile-time constant without evaluating the computation.
+  StatusOr<bool> IsConstant(const XlaOp& operand, int64 num_parameters = 0);
+
+  // Normalizes operand across spatial and batch dimensions for each feature.
+  //
+  // Returns a tuple (normalized, batch_mean, batch_var) where `normalized`
+  // is the normalized result and batch_mean and batch_var are the mean and
+  // variance, respectively, across batch for the operand.
+  XlaOp BatchNormTraining(const XlaOp& operand, const XlaOp& scale,
+                          const XlaOp& offset, float epsilon,
+                          int64 feature_index);
+
+  // Normalizes operand across spatial and batch dimensions for each feature.
+  //
+  // `BatchNormInference` is equivalent to calling `BatchNormTraining` without
+  // computing `mean` and `variance` for each batch inside the operation. It
+  // uses the input `mean` and `variance` instead as estimated values. The
+  // purpose of this op is to reduce latency in inference, hence the name
+  // `BatchNormInference`.
+  //
+  // The output has the same shape as `operand`, and contains the normalized
+  // values for each batch.
+  XlaOp BatchNormInference(const XlaOp& operand, const XlaOp& scale,
+                           const XlaOp& offset, const XlaOp& mean,
+                           const XlaOp& variance, float epsilon,
+                           int64 feature_index);
+
+  // Calculates the gradients of a batch norm op.
+  //
+  // The inputs `batch_mean` and `batch_var` represent the mean and variance
+  // across the batch.
+  //
+  // Returns a tuple of three elements:
+  //   - grad_operand: Gradient with respect to input `operand`
+  //   - grad_offset: Gradient with respect to input `offset`
+  //   - grad_scale: Gradient with respect to input `scale`
+  XlaOp BatchNormGrad(const XlaOp& operand, const XlaOp& scale,
+                      const XlaOp& batch_mean, const XlaOp& batch_var,
+                      const XlaOp& grad_output, float epsilon,
+                      int64 feature_index);
+
+  // Computes the value of a constant indicated by a XlaOp using a non-optimized
+  // interpreter on the host.
+  //
+  // The operand must represent a constant value, which in this case
+  // means that it must not statically depend on any parameter of the
+  // computation that is being built other then the ones specified on the
+  // parameter list. The parameters in the list will be indexed by their
+  // parameter id property so the number of parameters specified should be at
+  // least as many as the largest used parameter index.
+  //
+  // `IsConstant` can be used to test whether a computation is a compile-time
+  // constant without evaluation it. `ComputeConstant` only succeeds for
+  // computations where `IsConstant` returns true.
+  //
+  // This functionality can be useful when translating a computation
+  // into XLA where something that looked dynamic is required by
+  // XLA to be specified as a constant. E.g. the source
+  // computation (outside of XLA) may include a dynamic
+  // computation of the shape of something and ComputeConstant lets
+  // you determine what the value of that computation is in the case
+  // where the value can be determined at compile time.
+  //
+  // If output_layout is non-null, then the output of the computation
+  // will be stored using that layout.
+  StatusOr<std::unique_ptr<Literal>> ComputeConstant(
+      const XlaOp& operand, const Layout* output_layout = nullptr,
+      tensorflow::gtl::ArraySlice<Literal> parameters = {});
+
+  // Returns a new XlaBuilder whose resultant Computation is used only by this
+  // XlaBuilder. The sub-XlaBuilder has the same die_immediately_on_error
+  // behavior as the parent.
+  std::unique_ptr<XlaBuilder> CreateSubBuilder(const string& computation_name);
+
+  // Modifies the computation being built so that executions of it will return
+  // the value associated with operand, rather than the last expression enqueued
+  // on the XlaBuilder. Any subsequent operations added to the XlaBuilder will
+  // not have any effect unless SetReturnValue is called again.
+  Status SetReturnValue(const XlaOp& operand);
 
   // Builds the computation with the requested operations, or returns a non-ok
   // status.
   StatusOr<XlaComputation> Build();
 
+  // Builds the computation with the requested operations, or notes an error in
+  // the parent XlaBuilder and returns an empty computation if building failed.
+  // This function is intended to be used where the returned XlaComputation is
+  // only used by the parent XlaBuilder and hence further operation on the
+  // returned XlaComputation will simply be error'ed out if an error occurred
+  // while building this computation. If the built computation is to be used by
+  // a XlaBuilder other than the parent XlaBuilder then Build() should be used
+  // instead.
+  XlaComputation BuildAndNoteError();
+
+  // Returns the first error that was encountered while building the
+  // computation. When an error is encountered, by default we return a vacuous
+  // XlaOp and inform the user of the error that occurred while
+  // building the computation when they make a final call to Build().
+  //
+  // See also set_die_immediately_on_error().
+  Status first_error() const { return first_error_; }
+
+  // Returns the shape of the given op.
+  StatusOr<Shape> GetShape(const XlaOp& op) const;
+
+  // Returns the (inferred) result for the current computation's shape.
+  StatusOr<ProgramShape> GetProgramShape();
+
  private:
-  XlaOp AddInstruction(HloInstructionProto&& instr);
+  StatusOr<XlaOp> AddInstruction(
+      HloInstructionProto&& instr, HloOpcode opcode,
+      tensorflow::gtl::ArraySlice<XlaOp> operands = {});
+
+  void AddCalledComputation(const XlaComputation& computation,
+                            HloInstructionProto* instr);
 
   // Notes that the error occurred by:
   // * storing it internally and capturing a backtrace if it's the first error
@@ -172,17 +812,49 @@ class XlaBuilder {
   // * dying if die_immediately_on_error_ is true
   void NoteError(const Status& error);
 
-  XlaOp NoteErrorOrReturn(StatusOr<XlaOp>&& op) {
-    if (!op.ok()) {
-      NoteError(op.status());
-      return XlaOp();
-    }
-    return op.ConsumeValueOrDie();
-  }
+  XlaOp NoteErrorOrReturn(const std::function<StatusOr<XlaOp>()>& op_creator);
+
+  // Helper method that creates an empty op and notes error.
+  XlaOp UnimplementedOp();
 
   StatusOr<const HloInstructionProto*> LookUpInstruction(const XlaOp& op) const;
 
-  string name_;  // Name to use for the built computation.
+  // Internal helper method that does the building for an arbitrary unary op.
+  XlaOp UnaryOp(HloOpcode unop, const XlaOp& operand);
+
+  // Internal helper method that does the building for an arbitrary binary op.
+  // broadcast_dimensions specifies which dimensions to use for broadcasting
+  // when the operation is between tensors of different ranks.
+  XlaOp BinaryOp(HloOpcode binop, const XlaOp& lhs, const XlaOp& rhs,
+                 tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+
+  // Internal helper method that does the building for an arbitrary ternary op.
+  XlaOp TernaryOp(HloOpcode triop, const XlaOp& lhs, const XlaOp& rhs,
+                  const XlaOp& ehs);
+
+  XlaOp RngOp(RandomDistribution distribution,
+              tensorflow::gtl::ArraySlice<XlaOp> parameters,
+              const Shape& shape);
+
+  StatusOr<XlaOp> InDimBroadcast(
+      const Shape& shape, const XlaOp& operand,
+      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+
+  // Internal helper method that creates a sequence of instructions that
+  // performs an explicit broadcast of the operand to the target shape.
+  StatusOr<XlaOp> AddBroadcastSequence(const Shape& output_shape,
+                                       const XlaOp& operand);
+
+  // Internal helper method for creating a Reshape op with the already inferred
+  // shape.
+  StatusOr<XlaOp> Reshape(const Shape& shape, const XlaOp& operand);
+
+  // Returns the (inferred) result for the program shape for the current
+  // computation and fills the root_id in the pointer.
+  StatusOr<ProgramShape> GetProgramShape(int64* root_id);
+
+  string name_;      // Name to use for the built computation.
+  int64 unique_id_;  // The unique id for the built computation.
 
   // The first error encountered while building the computation.
   // This is OK until the first error is encountered.
@@ -202,8 +874,19 @@ class XlaBuilder {
   // The unique parameter numbers.
   tensorflow::gtl::FlatSet<int64> parameter_numbers_;
 
+  // The metadata to attach to each op. This is structured as a "modal"-like
+  // operation, in order to simplify client code (and not sprinkle this metadata
+  // throughout the TensorFlow op kernel implementations).
+  OpMetadata metadata_;
+
+  // Sharding for this operator. This is structured as a "model"-like operation,
+  // in order to simplify client code, similar to metadata_.
+  tensorflow::gtl::optional<OpSharding> sharding_;
+
   // Mode bit that indicates whether to die when a first error is encountered.
   bool die_immediately_on_error_ = false;
+
+  XlaBuilder* parent_builder_{nullptr};
 };
 
 template <typename NativeT>
@@ -211,6 +894,76 @@ XlaOp XlaBuilder::ConstantR0(NativeT value) {
   return ConstantLiteral(*Literal::CreateR0<NativeT>(value));
 }
 
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantR1(tensorflow::gtl::ArraySlice<NativeT> values) {
+  return ConstantLiteral(*Literal::CreateR1<NativeT>(values));
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantR1(int64 length, NativeT value) {
+  Literal literal(ShapeUtil::MakeShape(
+      primitive_util::NativeToPrimitiveType<NativeT>(), {length}));
+  literal.PopulateWithValue(value);
+  return ConstantLiteral(literal);
+}
+
+inline XlaOp XlaBuilder::ConstantR1(const tensorflow::core::Bitmap& values) {
+  return ConstantLiteral(*Literal::CreateR1(values));
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantR2(
+    std::initializer_list<std::initializer_list<NativeT>> values) {
+  return ConstantLiteral(*Literal::CreateR2<NativeT>(values));
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantFromArrayWithLayout(const Array<NativeT>& values,
+                                              const Layout& layout) {
+  return ConstantLiteral(
+      *Literal::CreateFromArrayWithLayout<NativeT>(values, layout));
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantFromArray(const Array<NativeT>& values) {
+  return ConstantLiteral(*Literal::CreateFromArray<NativeT>(values));
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantR2FromArray2DWithLayout(
+    const Array2D<NativeT>& values, const Layout& layout) {
+  return ConstantLiteral(
+      *Literal::CreateFromArrayWithLayout<NativeT>(values, layout));
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantR2FromArray2D(const Array2D<NativeT>& values) {
+  return ConstantLiteral(*Literal::CreateR2FromArray2D<NativeT>(values));
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantR3FromArray3DWithLayout(
+    const Array3D<NativeT>& values, const Layout& layout) {
+  return ConstantLiteral(
+      *Literal::CreateR3FromArray3DWithLayout<NativeT>(values, layout));
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantR3FromArray3D(const Array3D<NativeT>& values) {
+  return ConstantFromArray(values);
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantR4FromArray4DWithLayout(
+    const Array4D<NativeT>& values, const Layout& layout) {
+  return ConstantFromArrayWithLayout(values, layout);
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantR4FromArray4D(const Array4D<NativeT>& values) {
+  return ConstantFromArray(values);
+}
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_BUILDER_H_
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder_test.cc b/tensorflow/compiler/xla/client/xla_client/xla_builder_test.cc
index a400e4e78b044ae633a0135b0011d5267eacc115..ce984564d016ce65fa6c932f3cda290cc0d75a4a 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder_test.cc
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <string>
 
+#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -39,7 +40,8 @@ class XlaBuilderTest : public ::testing::Test {
     TF_ASSIGN_OR_RETURN(XlaComputation computation, b->Build());
     const HloModuleProto& proto = computation.proto();
     TF_ASSIGN_OR_RETURN(const auto& config,
-                        HloModule::CreateModuleConfigFromProto(proto));
+                        HloModule::CreateModuleConfigFromProto(
+                            proto, legacy_flags::GetDebugOptionsFromFlags()));
     return HloModule::CreateFromProto(proto, config);
   }
 
@@ -57,16 +59,16 @@ TEST_F(XlaBuilderTest, OnePlusTwo) {
   EXPECT_THAT(root, op::Add(op::Constant(), op::Constant()));
 }
 
-TEST_F(XlaBuilderTest, ParamPlusConstant) {
+TEST_F(XlaBuilderTest, ParamPlusConstantHasScalarBroadcast) {
   XlaBuilder b(TestName());
   auto x = b.Parameter(0, ShapeUtil::MakeShape(F32, {3, 5}), "x");
   b.Add(x, b.ConstantR0<float>(1.0));
   TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
   auto root = module->entry_computation()->root_instruction();
-  EXPECT_THAT(root, op::Add(op::Parameter(), op::Constant()));
+  EXPECT_THAT(root, op::Add(op::Parameter(), op::Broadcast(op::Constant())));
 }
 
-TEST_F(XlaBuilderTest, ParamPlusParam) {
+TEST_F(XlaBuilderTest, ParamPlusParamHasBroadcast) {
   XlaBuilder b(TestName());
   const auto& x_shape = ShapeUtil::MakeShape(S32, {2, 4, 6});
   const auto& y_shape = ShapeUtil::MakeShape(S32, {2, 4});
@@ -79,7 +81,7 @@ TEST_F(XlaBuilderTest, ParamPlusParam) {
 
   TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
   auto root = module->entry_computation()->root_instruction();
-  EXPECT_THAT(root, op::Add(op::Parameter(0), op::Parameter(1)));
+  EXPECT_THAT(root, op::Add(op::Parameter(0), op::Broadcast(op::Parameter(1))));
 }
 
 TEST_F(XlaBuilderTest, XPlusX) {
@@ -133,5 +135,103 @@ TEST_F(XlaBuilderTest, Call) {
                             op::Call(op::Constant(), op::Constant())));
 }
 
+TEST_F(XlaBuilderTest, BinopHasDegenerateBroadcast) {
+  XlaBuilder b(TestName());
+  auto x = b.Parameter(0, ShapeUtil::MakeShape(F32, {1, 2, 3}), "x");
+  auto y = b.Parameter(1, ShapeUtil::MakeShape(F32, {1, 2, 1}), "y");
+  b.Add(x, y);
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+
+  // Expected:
+  //
+  //  x: f32[1,2,3]  y: f32[1,2,1]
+  //      |               |
+  //      |          reshape: f32[1,2]
+  //      |               |
+  //      |          broadcast: f32[1,2,3]
+  //       \             /
+  //            add
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::Add(op::Parameter(0),
+                            op::Broadcast(op::Reshape(op::Parameter(1)))));
+}
+
+TEST_F(XlaBuilderTest, BinopHasInDimAndDegenerateBroadcast) {
+  XlaBuilder b(TestName());
+  auto x = b.Parameter(0, ShapeUtil::MakeShape(F32, {2, 3}), "x");
+  auto y = b.Parameter(1, ShapeUtil::MakeShape(F32, {2, 1, 4}), "y");
+  b.Add(x, y, /*broadcast_dimensions=*/{0, 1});
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+
+  // The binary operation has in-dim broadcast and degenerate broadcast, should
+  // first do the in-dim broadcast then convert the degnerate broadcast into a
+  // reshape and a broadcast.
+  //
+  // Expected:
+  //
+  //  x: f32[2,3]            y: f32[2,1,4]
+  //      |                        |
+  //  broadcast: f32[2,3,4]  reshape: f32[2,4]
+  //      |                        |
+  //      |                  broadcast: f32[2,3,4]
+  //       \                      /
+  //                 add
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::Add(op::Broadcast(op::Parameter(0)),
+                            op::Broadcast(op::Reshape(op::Parameter(1)))));
+}
+
+TEST_F(XlaBuilderTest, OperandFromWrongBuilder) {
+  XlaBuilder b1("b1");
+  auto p0 = b1.Parameter(0, ShapeUtil::MakeShape(F32, {}), "p0");
+  XlaBuilder builder("main");
+  builder.Add(p0, p0);
+  auto statusor = builder.Build();
+  ASSERT_FALSE(statusor.ok());
+  EXPECT_THAT(statusor.status().error_message(),
+              HasSubstr("Do not add XlaOp from builder b1 to builder main"));
+}
+
+TEST_F(XlaBuilderTest, ReshapeDefaultOrder) {
+  XlaBuilder b(TestName());
+  auto x = b.Parameter(0, ShapeUtil::MakeShape(F32, {2, 3, 5, 7}), "x");
+  b.Reshape(x, /*new_sizes=*/{6, 35});
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::Reshape(op::Parameter()));
+}
+
+TEST_F(XlaBuilderTest, ReshapeHasTranspose) {
+  XlaBuilder b(TestName());
+  auto x = b.Parameter(0, ShapeUtil::MakeShape(F32, {2, 3, 5, 7}), "x");
+  b.Reshape(x, /*dimensions=*/{3, 2, 1, 0}, /*new_sizes=*/{6, 35});
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::Reshape(op::Transpose(op::Parameter())));
+}
+
+TEST_F(XlaBuilderTest, Transpose) {
+  XlaBuilder b(TestName());
+  auto x = b.Parameter(0, ShapeUtil::MakeShape(F32, {5, 7}), "x");
+  b.Transpose(x, /*permutation=*/{1, 0});
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::Transpose(op::Parameter()));
+}
+
+// TODO(b/65209188): Create a dedicated lowering for Xor.
+TEST_F(XlaBuilderTest, Xor) {
+  XlaBuilder b(TestName());
+  auto x = b.Parameter(0, ShapeUtil::MakeShape(PRED, {}), "x");
+  auto y = b.Parameter(1, ShapeUtil::MakeShape(PRED, {}), "y");
+  b.Xor(x, y);
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  auto root = module->entry_computation()->root_instruction();
+  LOG(ERROR) << module->ToString();
+  EXPECT_THAT(root,
+              op::Or(op::And(op::Not(op::Parameter(0)), op::Parameter(1)),
+                     op::And(op::Parameter(0), op::Not(op::Parameter(1)))));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_computation.cc b/tensorflow/compiler/xla/client/xla_client/xla_computation.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a6752c601026518825c7994f6b6fa20d20f34f24
--- /dev/null
+++ b/tensorflow/compiler/xla/client/xla_client/xla_computation.cc
@@ -0,0 +1,29 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+
+#include <utility>
+
+#include "tensorflow/compiler/xla/status_macros.h"
+
+namespace xla {
+
+StatusOr<ProgramShape> XlaComputation::GetProgramShape() const {
+  TF_RET_CHECK(proto_.has_program_shape());
+  return proto_.program_shape();
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_computation.h b/tensorflow/compiler/xla/client/xla_client/xla_computation.h
new file mode 100644
index 0000000000000000000000000000000000000000..2a3c6952667a434b68ca0c5e4e9874397da173d3
--- /dev/null
+++ b/tensorflow/compiler/xla/client/xla_client/xla_computation.h
@@ -0,0 +1,58 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_COMPUTATION_H_
+#define TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_COMPUTATION_H_
+
+#include <utility>
+
+#include "tensorflow/compiler/xla/service/hlo.pb.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+
+namespace xla {
+
+// The computation graph that the user builds up with the XlaBuilder.
+//
+// TODO(b/74197823): Replace xla::Computation with this one.
+class XlaComputation {
+ public:
+  XlaComputation() : unique_id_(-1) {}
+
+  XlaComputation(const XlaComputation&) = delete;
+  XlaComputation& operator=(const XlaComputation&) = delete;
+
+  XlaComputation(XlaComputation&& from) = default;
+
+  XlaComputation& operator=(XlaComputation&& from) = default;
+
+  // Returns the "program shape" (parameter and return shapes) for this
+  // computation.
+  StatusOr<ProgramShape> GetProgramShape() const;
+
+  const HloModuleProto& proto() const { return proto_; }
+
+ private:
+  XlaComputation(const int64 unique_id) : unique_id_(unique_id) {}
+  HloModuleProto* mutable_proto() { return &proto_; }
+  friend class XlaBuilder;
+
+  int64 unique_id_;
+  HloModuleProto proto_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_COMPUTATION_H_
diff --git a/tensorflow/compiler/xla/legacy_flags/BUILD b/tensorflow/compiler/xla/legacy_flags/BUILD
index 0a9725db0a4fcf963cadcacf2cbc1d95d2c7239d..89353448e29ec3d97275dac288e23aa8e96e31b2 100644
--- a/tensorflow/compiler/xla/legacy_flags/BUILD
+++ b/tensorflow/compiler/xla/legacy_flags/BUILD
@@ -75,17 +75,3 @@ tf_cc_test(
             "//tensorflow/core:test",
         ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/xla/legacy_flags/parse_flags_from_env_test.cc b/tensorflow/compiler/xla/legacy_flags/parse_flags_from_env_test.cc
index a3b4286f4c12bf39a44c63dd6e7d303a46a418c3..7b6ae311c1099dccb8dceb2f49743c1b185cd5ab 100644
--- a/tensorflow/compiler/xla/legacy_flags/parse_flags_from_env_test.cc
+++ b/tensorflow/compiler/xla/legacy_flags/parse_flags_from_env_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/subprocess.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/command_line_flags.h"
diff --git a/tensorflow/compiler/xla/literal_util.cc b/tensorflow/compiler/xla/literal_util.cc
index 0a24db046a390eb447bc3518476c3dd9897d973c..13675b7d0074592043b7e12de0aad948a3e9848f 100644
--- a/tensorflow/compiler/xla/literal_util.cc
+++ b/tensorflow/compiler/xla/literal_util.cc
@@ -929,7 +929,7 @@ string Literal::GetAsString(tensorflow::gtl::ArraySlice<int64> multi_index,
     case U64:
       return StrCat(Get<uint64>(multi_index, shape_index));
     case F16:
-      return StrCat(Get<half>(multi_index, shape_index));
+      return StrCat(static_cast<float>(Get<half>(multi_index, shape_index)));
     case F32:
       return StrCat(Get<float>(multi_index, shape_index));
     case BF16:
@@ -979,7 +979,8 @@ string Literal::GetSparseElementAsString(int64 sparse_element_number,
       return StrCat(
           GetSparseElement<uint64>(sparse_element_number, shape_index));
     case F16:
-      return StrCat(GetSparseElement<half>(sparse_element_number, shape_index));
+      return StrCat(static_cast<float>(
+          GetSparseElement<half>(sparse_element_number, shape_index)));
     case F32:
       return StrCat(
           GetSparseElement<float>(sparse_element_number, shape_index));
@@ -1384,8 +1385,9 @@ void Literal::EachCellAsString(
 }
 
 namespace {
-template <typename NativeSrcT, typename NativeDestT>
-std::unique_ptr<Literal> ConvertBetweenNativeTypes(const Literal& src_literal) {
+template <typename NativeSrcT, typename NativeDestT, typename ConverterType>
+std::unique_ptr<Literal> ConvertBetweenNativeTypesWithConverter(
+    const Literal& src_literal, const ConverterType& converter) {
   CHECK(ShapeUtil::IsArray(src_literal.shape()));
   auto result_literal = MakeUnique<Literal>(ShapeUtil::ChangeElementType(
       src_literal.shape(),
@@ -1395,11 +1397,18 @@ std::unique_ptr<Literal> ConvertBetweenNativeTypes(const Literal& src_literal) {
   int64 num_elements = src_literal.element_count();
 
   for (int64 i = 0; i < num_elements; ++i) {
-    dest_data[i] = static_cast<NativeDestT>(src_data[i]);
+    dest_data[i] = converter(src_data[i]);
   }
   return result_literal;
 }
 
+template <typename NativeSrcT, typename NativeDestT>
+std::unique_ptr<Literal> ConvertBetweenNativeTypes(const Literal& src_literal) {
+  auto converter = [](NativeSrcT src) { return static_cast<NativeDestT>(src); };
+  return ConvertBetweenNativeTypesWithConverter<NativeSrcT, NativeDestT>(
+      src_literal, converter);
+}
+
 template <PrimitiveType primitive_src_type>
 std::unique_ptr<Literal> ConvertToC64(const Literal& src_literal) {
   CHECK(ShapeUtil::IsArray(src_literal.shape()));
@@ -1462,6 +1471,9 @@ StatusOr<std::unique_ptr<Literal>> ConvertIfDestTypeMatches(
 StatusOr<std::unique_ptr<Literal>> Literal::Convert(
     PrimitiveType primitive_dest_type) const {
   TF_RET_CHECK(ShapeUtil::IsArray(shape()));
+  if (shape().element_type() == primitive_dest_type) {
+    return CloneToUnique();
+  }
   switch (shape().element_type()) {
 #define CONVERT_IF_DEST_TYPE_MATCHES(type) \
   case (type):                             \
@@ -1488,8 +1500,16 @@ StatusOr<std::unique_ptr<Literal>> Literal::Convert(
 }
 
 StatusOr<std::unique_ptr<Literal>> Literal::ConvertToShape(
-    const Shape& dest_shape) const {
+    const Shape& dest_shape, bool round_f32_to_bf16) const {
   if (!ShapeUtil::IsTuple(dest_shape)) {
+    if (round_f32_to_bf16 && shape().element_type() == F32 &&
+        dest_shape.element_type() == BF16) {
+      auto converter = [](float src) {
+        return tensorflow::bfloat16::round_to_bfloat16(src);
+      };
+      return ConvertBetweenNativeTypesWithConverter<float, bfloat16>(*this,
+                                                                     converter);
+    }
     return Convert(dest_shape.element_type());
   }
   std::vector<Literal> elements;
diff --git a/tensorflow/compiler/xla/literal_util.h b/tensorflow/compiler/xla/literal_util.h
index e24f5285d9a14cf26216e4a16c6d1e516afc413f..a96a76fbb4e1a46e225d33b715f073c05fe6275a 100644
--- a/tensorflow/compiler/xla/literal_util.h
+++ b/tensorflow/compiler/xla/literal_util.h
@@ -340,8 +340,14 @@ class Literal {
 
   // Converts this literal to the given shape. Returns an error is the
   // conversion is not possible.
+  //
+  // round_f32_to_bf16: if true, converting F32 elements to BF16 uses rounding
+  // instead of truncation; otherwise, truncation is used.
+  //
+  // TODO(b/69266521): remove the round_to_bfloat16 flag when rounding becomes
+  // the default behavior.
   StatusOr<std::unique_ptr<Literal>> ConvertToShape(
-      const Shape& dest_shape) const;
+      const Shape& dest_shape, bool round_f32_to_bf16 = false) const;
 
   // Creates a scalar literal value zero of the given primitive type.
   static Literal Zero(PrimitiveType primitive_type);
diff --git a/tensorflow/compiler/xla/literal_util_test.cc b/tensorflow/compiler/xla/literal_util_test.cc
index 04e45f00491b0bef94f3c0af1c875b2d007194fd..7627762074b6132655c58690a7fffbaf2717e279 100644
--- a/tensorflow/compiler/xla/literal_util_test.cc
+++ b/tensorflow/compiler/xla/literal_util_test.cc
@@ -1702,7 +1702,7 @@ TEST_F(LiteralUtilTest, GetSparseElementAsString) {
   ASSERT_EQ(Literal::CreateSparse<half>(dimensions, indices,
                                         {half{1.0}, half{2.0}, half{3.0}})
                 ->GetSparseElementAsString(1),
-            tensorflow::strings::StrCat(half{2.0}));
+            tensorflow::strings::StrCat(static_cast<float>(half{2.0})));
   ASSERT_EQ(
       Literal::CreateSparse<complex64>(
           dimensions, indices,
diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD
index e2972f06016ab3555c4fc0cc4616993fe6764b1e..0517a5502e686def4ffea59f929aef225186a8aa 100644
--- a/tensorflow/compiler/xla/python/BUILD
+++ b/tensorflow/compiler/xla/python/BUILD
@@ -72,15 +72,3 @@ tf_py_wrap_cc(
         "//tensorflow/compiler/xla/service:cpu_plugin",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc
index b21ab3044fae7136071f50bdba6e74b799a309d5..2bacc6a9142971f6d14b3929fb1a69e2a40052e2 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.cc
+++ b/tensorflow/compiler/xla/python/local_computation_builder.cc
@@ -521,6 +521,17 @@ ComputationDataHandle LocalComputationBuilder::Conditional(
                               false_computation.computation());
 }
 
+StatusOr<bool> LocalComputationBuilder::IsConstant(
+    const ComputationDataHandle& operand, int64 num_parameters) {
+  return builder_.IsConstant(operand, num_parameters);
+}
+
+StatusOr<std::unique_ptr<Literal>> LocalComputationBuilder::ComputeConstant(
+    const ComputationDataHandle& operand, const Layout* output_layout,
+    tensorflow::gtl::ArraySlice<Literal> parameters) {
+  return builder_.ComputeConstant(operand, output_layout, parameters);
+}
+
 #define _FORWARD(method_name, return_sig, args_sig, args)    \
   return_sig LocalComputationBuilder::method_name args_sig { \
     return builder_.method_name args;                        \
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.h b/tensorflow/compiler/xla/python/local_computation_builder.h
index a7375c8965e9041226ffee08dab6ffafa25312af..31046e60f11af9cc89ddec4c5fd16babfc8eb231 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.h
+++ b/tensorflow/compiler/xla/python/local_computation_builder.h
@@ -268,6 +268,13 @@ class LocalComputationBuilder {
                                     const ComputationDataHandle& false_operand,
                                     const LocalComputation& false_computation);
 
+  StatusOr<bool> IsConstant(const ComputationDataHandle& operand,
+                            int64 num_parameters);
+
+  StatusOr<std::unique_ptr<Literal> > ComputeConstant(
+      const ComputationDataHandle& operand, const Layout* output_layout,
+      tensorflow::gtl::ArraySlice<Literal> parameters);
+
 #define _FORWARD(method_name, return_sig, args_sig) \
   return_sig method_name args_sig;
 
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.i b/tensorflow/compiler/xla/python/local_computation_builder.i
index b5354131c94930b75ea66036ddb61ecd3993414f..ac792e8189bda9eda472e7d282db86ac988c57b9 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.i
+++ b/tensorflow/compiler/xla/python/local_computation_builder.i
@@ -141,6 +141,33 @@ bool GetIntAttr(PyObject* o, const char* field, int64* result) {
   return true;
 }
 
+// Returns "ok"; true if there is no error, false if there was an error.
+bool HandleStringAttribute(PyObject* o,
+                           const char* attr_name,
+                           std::function<void(string s)> f) {
+  if (!PyObject_HasAttrString(o, attr_name)) {
+    return true;  // It's ok for the object to not have the attribute.
+  }
+  PyObject* attr = PyObject_GetAttrString(o, attr_name);
+  if (attr == nullptr) {
+    return false;  // An error occurred getting the attribute.
+  }
+  if (attr == Py_None) {
+    Py_DECREF(attr);
+    return true;  // The attribute is None, which we consider ok.
+  }
+  if (!PyString_Check(attr)) {
+    string message = tensorflow::strings::Printf("%s must be a string or none; got %s",
+        attr_name, numpy::PyObjectCppRepr(attr).c_str());
+    PyErr_SetString(PyExc_TypeError, message.c_str());
+    Py_DECREF(attr);
+    return false;  // Type error, not ok.
+  }
+  f(PyString_AsString(attr));
+  Py_DECREF(attr);
+  return true;  // Handled string attribute, ok!
+}
+
 }
 }
 %}
@@ -155,7 +182,7 @@ tensorflow::ImportNumpy();
 %typemap(in) const ComputationDataHandle& (ComputationDataHandle temp) {
   const int64 handle = numpy::PyIntOrPyLongToLong($input);
   if (handle == -1 && PyErr_Occurred()) {
-    return NULL;
+    SWIG_fail;
   }
   temp.set_handle(handle);
   $1 = &temp;
@@ -174,7 +201,7 @@ tensorflow::ImportNumpy();
     }
   } else {
     PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
-    return NULL;
+    SWIG_fail;
   }
 }
 
@@ -184,7 +211,7 @@ tensorflow::ImportNumpy();
     $result = numpy::PyObjectFromXlaLiteral(*value);
   } else {
     PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
-    return NULL;
+    SWIG_fail;
   }
 }
 
@@ -197,7 +224,7 @@ tensorflow::ImportNumpy();
     }
   } else {
     PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
-    return NULL;
+    SWIG_fail;
   }
 }
 
@@ -206,7 +233,16 @@ tensorflow::ImportNumpy();
     $result = numpy::PyShapeInfoFromXlaShape($1.ConsumeValueOrDie());
   } else {
     PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
-    return NULL;
+    SWIG_fail;
+  }
+}
+
+%typemap(out) StatusOr<bool> {
+  if ($1.ok()) {
+    $result = PyBool_FromLong($1.ConsumeValueOrDie());
+  } else {
+    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
+    SWIG_fail;
   }
 }
 
@@ -214,8 +250,9 @@ tensorflow::ImportNumpy();
   if (!$1.ok()) {
     PyErr_SetString(
         PyExc_RuntimeError, $1.ToString().c_str());
-    return NULL;
+    SWIG_fail;
   }
+  Py_INCREF(Py_None);
   $result = Py_None;
 }
 
@@ -225,7 +262,7 @@ tensorflow::ImportNumpy();
     (std::vector<int64> temps) {
   if (!PySequence_Check($input)) {
     PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    return NULL;
+    SWIG_fail;
   }
   const int size = PySequence_Size($input);
   temps.resize(size);
@@ -237,13 +274,13 @@ tensorflow::ImportNumpy();
           PyExc_TypeError,
           "Argument sequence element cannot be converted to int");
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     temps[i] = numpy::PyIntOrPyLongToLong(py_int);
     if (temps[i] == -1 && PyErr_Occurred()) {
       Py_DECREF(py_int);
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     Py_DECREF(py_int);
     Py_DECREF(o);
@@ -257,7 +294,7 @@ tensorflow::ImportNumpy();
     (std::vector<ComputationDataHandle> temps) {
   if (!PySequence_Check($input)) {
     PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    return NULL;
+    SWIG_fail;
   }
   const int size = PySequence_Size($input);
   temps.resize(size);
@@ -268,13 +305,13 @@ tensorflow::ImportNumpy();
       PyErr_SetString(
           PyExc_TypeError,
           "Argument sequence element cannot be converted to int");
-      return NULL;
+      SWIG_fail;
     }
     const int64 handle = numpy::PyIntOrPyLongToLong(py_int);
     if (handle == -1 && PyErr_Occurred()) {
       Py_DECREF(py_int);
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     temps[i].set_handle(handle);
     Py_DECREF(py_int);
@@ -289,7 +326,7 @@ tensorflow::ImportNumpy();
     (std::vector<LocalShapedBuffer*> temps) {
   if (!PySequence_Check($input)) {
     PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    return NULL;
+    SWIG_fail;
   }
   const int size = PySequence_Size($input);
   temps.reserve(size);
@@ -298,7 +335,7 @@ tensorflow::ImportNumpy();
     LocalShapedBuffer* lsbp;
     if ((SWIG_ConvertPtr(o, (void**) &lsbp, $descriptor(xla::swig::LocalShapedBuffer*),
                          SWIG_POINTER_EXCEPTION)) == -1) {
-      return NULL;
+      SWIG_fail;
     }
     temps.push_back(lsbp);
     Py_DECREF(o);
@@ -312,7 +349,7 @@ tensorflow::ImportNumpy();
   literal_status = numpy::XlaLiteralFromPyObject($input);
   if (!literal_status.ok()) {
     PyErr_SetString(PyExc_RuntimeError, literal_status.status().ToString().c_str());
-    return NULL;
+    SWIG_fail;
   }
   $1 = literal_status.ValueOrDie().get();
 }
@@ -324,7 +361,7 @@ tensorflow::ImportNumpy();
 %typemap(out) StatusOr< std::unique_ptr<Literal> > {
   if (!$1.ok()) {
     PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
-    return NULL;
+    SWIG_fail;
   }
   $result = numpy::PyObjectFromXlaLiteral(*$1.ValueOrDie());
 }
@@ -332,7 +369,7 @@ tensorflow::ImportNumpy();
 %typemap(in) const std::vector<Literal>& (std::vector<Literal> temps) {
   if (!PySequence_Check($input)) {
     PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    return NULL;
+    SWIG_fail;
   }
   const int size = PySequence_Size($input);
   for (int i = 0; i < size; ++i) {
@@ -341,7 +378,7 @@ tensorflow::ImportNumpy();
     if (!literal_status.ok()) {
       PyErr_SetString(PyExc_RuntimeError, literal_status.status().ToString().c_str());
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     temps.push_back(std::move(*literal_status.ConsumeValueOrDie()));
     Py_DECREF(o);
@@ -355,7 +392,7 @@ tensorflow::ImportNumpy();
   StatusOr<OpMetadata> statusor = numpy::OpMetadataFromPyObject($input);
   if (!statusor.ok()) {
     PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
-    return NULL;
+    SWIG_fail;
   }
   temp = std::move(statusor).ValueOrDie();
   $1 = &temp;
@@ -367,7 +404,7 @@ tensorflow::ImportNumpy();
   StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape($input);
   if (!statusor.ok()) {
     PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
-    return NULL;
+    SWIG_fail;
   }
   temp = std::move(statusor).ValueOrDie();
   $1 = &temp;
@@ -382,7 +419,7 @@ tensorflow::ImportNumpy();
     StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape($input);
     if (!statusor.ok()) {
       PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
-      return NULL;
+      SWIG_fail;
     }
     temp = std::move(statusor).ValueOrDie();
     $1 = &temp;
@@ -396,7 +433,7 @@ tensorflow::ImportNumpy();
 %typemap(in) const std::vector<Shape>& (std::vector<Shape> temps) {
   if (!PySequence_Check($input)) {
     PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    return NULL;
+    SWIG_fail;
   }
   const int size = PySequence_Size($input);
   for (int i = 0; i < size; ++i) {
@@ -405,7 +442,7 @@ tensorflow::ImportNumpy();
     Py_DECREF(o);
     if (!statusor.ok()) {
       PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
-      return NULL;
+      SWIG_fail;
     }
     temps.push_back(statusor.ConsumeValueOrDie());
   }
@@ -416,7 +453,7 @@ tensorflow::ImportNumpy();
     std::vector<tensorflow::gtl::optional<Shape> > temps) {
   if (!PySequence_Check($input)) {
     PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    return NULL;
+    SWIG_fail;
   }
   const int size = PySequence_Size($input);
   for (int i = 0; i < size; ++i) {
@@ -428,7 +465,7 @@ tensorflow::ImportNumpy();
       Py_DECREF(o);
       if (!statusor.ok()) {
         PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
-        return NULL;
+        SWIG_fail;
       }
       temps.push_back(statusor.ConsumeValueOrDie());
     }
@@ -442,18 +479,18 @@ tensorflow::ImportNumpy();
   PyObject* py_int = numpy::PyNumberToPyInt($input);
   if (!py_int) {
     PyErr_SetString(PyExc_TypeError, "Argument cannot be converted to int");
-    return NULL;
+    SWIG_fail;
   }
   const long value = numpy::PyIntOrPyLongToLong(py_int);
   if (value == -1 && PyErr_Occurred()) {
     Py_DECREF(py_int);
-    return NULL;
+    SWIG_fail;
   }
   if (!PrimitiveType_IsValid(value)) {
     PyErr_SetString(
         PyExc_TypeError, "Argument not valid for PrimitiveType enum");
     Py_DECREF(py_int);
-    return NULL;
+    SWIG_fail;
   }
   $1 = static_cast<PrimitiveType>(value);
 }
@@ -464,19 +501,19 @@ tensorflow::ImportNumpy();
     (std::vector<std::pair<int64, int64> > temps) {
   if (!PySequence_Check($input)) {
     PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    return NULL;
+    SWIG_fail;
   }
   const int size = PySequence_Size($input);
   temps.reserve(size);
   for (int i = 0; i < size; ++i) {
     PyObject* o = PySequence_GetItem($input, i);
     if (!o) {
-      return NULL;
+      SWIG_fail;
     }
     PyObject* first = PyTuple_GetItem(o, 0);
     if (!first) {
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     PyObject* first_pyint = numpy::PyNumberToPyInt(first);
     if (!first_pyint) {
@@ -484,13 +521,13 @@ tensorflow::ImportNumpy();
           PyExc_TypeError,
           "First pair item cannot be converted to int");
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     PyObject* second = PyTuple_GetItem(o, 1);
     if (!second) {
       Py_DECREF(o);
       Py_DECREF(first_pyint);
-      return NULL;
+      SWIG_fail;
     }
     PyObject* second_pyint = numpy::PyNumberToPyInt(second);
     if (!second_pyint) {
@@ -499,21 +536,21 @@ tensorflow::ImportNumpy();
           "Second pair item cannot be converted to int");
       Py_DECREF(o);
       Py_DECREF(first_pyint);
-      return NULL;
+      SWIG_fail;
     }
     const int64 first_value = numpy::PyIntOrPyLongToLong(first_pyint);
     if (first_value == -1 && PyErr_Occurred()) {
       Py_DECREF(o);
       Py_DECREF(first_pyint);
       Py_DECREF(second_pyint);
-      return NULL;
+      SWIG_fail;
     }
     const int64 second_value = numpy::PyIntOrPyLongToLong(second_pyint);
     if (second_value == -1 && PyErr_Occurred()) {
       Py_DECREF(o);
       Py_DECREF(first_pyint);
       Py_DECREF(second_pyint);
-      return NULL;
+      SWIG_fail;
     }
     temps.push_back(std::make_pair(first_value, second_value));
     Py_DECREF(o);
@@ -531,26 +568,26 @@ tensorflow::ImportNumpy();
   PyObject* lhs_contracting_dimensions = PyObject_GetAttrString(
       $input, "lhs_contracting_dimensions");
   if (!lhs_contracting_dimensions) {
-    return NULL;
+    SWIG_fail;
   }
 
   length = PySequence_Size(lhs_contracting_dimensions);
   if (length == -1) {
     Py_DECREF(lhs_contracting_dimensions);
-    return NULL;
+    SWIG_fail;
   }
 
   for (int i = 0; i < length; ++i) {
     PyObject* item = PySequence_GetItem(lhs_contracting_dimensions, i);
     if (!item) {
       Py_DECREF(lhs_contracting_dimensions);
-      return NULL;
+      SWIG_fail;
     }
     const int64 dimension = numpy::PyIntOrPyLongToLong(item);
     if (dimension == -1 && PyErr_Occurred()) {
       Py_DECREF(item);
       Py_DECREF(lhs_contracting_dimensions);
-      return NULL;
+      SWIG_fail;
     }
     dimension_numbers.add_lhs_contracting_dimensions(dimension);
     Py_DECREF(item);
@@ -561,26 +598,26 @@ tensorflow::ImportNumpy();
   PyObject* rhs_contracting_dimensions = PyObject_GetAttrString(
       $input, "rhs_contracting_dimensions");
   if (!lhs_contracting_dimensions) {
-    return NULL;
+    SWIG_fail;
   }
 
   length = PySequence_Size(rhs_contracting_dimensions);
   if (length == -1) {
     Py_DECREF(rhs_contracting_dimensions);
-    return NULL;
+    SWIG_fail;
   }
 
   for (int i = 0; i < length; ++i) {
     PyObject* item = PySequence_GetItem(rhs_contracting_dimensions, i);
     if (!item) {
       Py_DECREF(rhs_contracting_dimensions);
-      return NULL;
+      SWIG_fail;
     }
     const int64 dimension = numpy::PyIntOrPyLongToLong(item);
     if (dimension == -1 && PyErr_Occurred()) {
       Py_DECREF(item);
       Py_DECREF(rhs_contracting_dimensions);
-      return NULL;
+      SWIG_fail;
     }
     dimension_numbers.add_rhs_contracting_dimensions(dimension);
     Py_DECREF(item);
@@ -591,26 +628,26 @@ tensorflow::ImportNumpy();
   PyObject* lhs_batch_dimensions = PyObject_GetAttrString(
       $input, "lhs_batch_dimensions");
   if (!lhs_batch_dimensions) {
-    return NULL;
+    SWIG_fail;
   }
 
   length = PySequence_Size(lhs_batch_dimensions);
   if (length == -1) {
     Py_DECREF(lhs_batch_dimensions);
-    return NULL;
+    SWIG_fail;
   }
 
   for (int i = 0; i < length; ++i) {
     PyObject* item = PySequence_GetItem(lhs_batch_dimensions, i);
     if (!item) {
       Py_DECREF(lhs_batch_dimensions);
-      return NULL;
+      SWIG_fail;
     }
     const int64 dimension = numpy::PyIntOrPyLongToLong(item);
     if (dimension == -1 && PyErr_Occurred()) {
       Py_DECREF(item);
       Py_DECREF(lhs_batch_dimensions);
-      return NULL;
+      SWIG_fail;
     }
     dimension_numbers.add_lhs_batch_dimensions(dimension);
     Py_DECREF(item);
@@ -621,26 +658,26 @@ tensorflow::ImportNumpy();
   PyObject* rhs_batch_dimensions = PyObject_GetAttrString(
       $input, "rhs_batch_dimensions");
   if (!rhs_batch_dimensions) {
-    return NULL;
+    SWIG_fail;
   }
 
   length = PySequence_Size(rhs_batch_dimensions);
   if (length == -1) {
     Py_DECREF(rhs_batch_dimensions);
-    return NULL;
+    SWIG_fail;
   }
 
   for (int i = 0; i < length; ++i) {
     PyObject* item = PySequence_GetItem(rhs_batch_dimensions, i);
     if (!item) {
       Py_DECREF(rhs_batch_dimensions);
-      return NULL;
+      SWIG_fail;
     }
     const int64 dimension = numpy::PyIntOrPyLongToLong(item);
     if (dimension == -1 && PyErr_Occurred()) {
       Py_DECREF(item);
       Py_DECREF(rhs_batch_dimensions);
-      return NULL;
+      SWIG_fail;
     }
     dimension_numbers.add_rhs_batch_dimensions(dimension);
     Py_DECREF(item);
@@ -656,20 +693,20 @@ tensorflow::ImportNumpy();
     (PaddingConfig padding_config) {
   PyObject* dimensions = PyObject_GetAttrString($input, "dimensions");
   if (!dimensions) {
-    return NULL;
+    SWIG_fail;
   }
 
   int length = PySequence_Size(dimensions);
   if (length == -1) {
     Py_DECREF(dimensions);
-    return NULL;
+    SWIG_fail;
   }
 
   for (int i = 0; i < length; ++i) {
     PyObject* item = PySequence_GetItem(dimensions, i);
     if (!item) {
       Py_DECREF(dimensions);
-      return NULL;
+      SWIG_fail;
     }
     int64 edge_padding_low, edge_padding_high, interior_padding;
     if (!GetIntAttr(item, "edge_padding_low", &edge_padding_low)
@@ -677,7 +714,7 @@ tensorflow::ImportNumpy();
         || !GetIntAttr(item, "interior_padding", &interior_padding)) {
       Py_DECREF(item);
       Py_DECREF(dimensions);
-      return NULL;
+      SWIG_fail;
     }
     Py_DECREF(item);
 
@@ -699,32 +736,32 @@ tensorflow::ImportNumpy();
   int64 value;
 
   if (!GetIntAttr($input, "input_batch_dimension", &value)) {
-    return NULL;
+    SWIG_fail;
   }
   dimension_numbers.set_input_batch_dimension(value);
 
   if (!GetIntAttr($input, "input_feature_dimension", &value)) {
-    return NULL;
+    SWIG_fail;
   }
   dimension_numbers.set_input_feature_dimension(value);
 
   if (!GetIntAttr($input, "output_batch_dimension", &value)) {
-    return NULL;
+    SWIG_fail;
   }
   dimension_numbers.set_output_batch_dimension(value);
 
   if (!GetIntAttr($input, "output_feature_dimension", &value)) {
-    return NULL;
+    SWIG_fail;
   }
   dimension_numbers.set_output_feature_dimension(value);
 
   if (!GetIntAttr($input, "kernel_output_feature_dimension", &value)) {
-    return NULL;
+    SWIG_fail;
   }
   dimension_numbers.set_kernel_output_feature_dimension(value);
 
   if (!GetIntAttr($input, "kernel_input_feature_dimension", &value)) {
-    return NULL;
+    SWIG_fail;
   }
   dimension_numbers.set_kernel_input_feature_dimension(value);
 
@@ -733,24 +770,24 @@ tensorflow::ImportNumpy();
 
   o = PyObject_GetAttrString($input, "input_spatial_dimensions");
   if (!o) {
-    return NULL;
+    SWIG_fail;
   }
   length = PySequence_Size(o);
   if (length == -1) {
     Py_DECREF(o);
-    return NULL;
+    SWIG_fail;
   }
   for (int i = 0; i < length; ++i) {
     PyObject* item = PySequence_GetItem(o, i);
     if (!item) {
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     const int64 dimension = numpy::PyIntOrPyLongToLong(item);
     if (dimension == -1 && PyErr_Occurred()) {
       Py_DECREF(item);
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     dimension_numbers.add_input_spatial_dimensions(dimension);
     Py_DECREF(item);
@@ -759,24 +796,24 @@ tensorflow::ImportNumpy();
 
   o = PyObject_GetAttrString($input, "kernel_spatial_dimensions");
   if (!o) {
-    return NULL;
+    SWIG_fail;
   }
   length = PySequence_Size(o);
   if (length == -1) {
     Py_DECREF(o);
-    return NULL;
+    SWIG_fail;
   }
   for (int i = 0; i < length; ++i) {
     PyObject* item = PySequence_GetItem(o, i);
     if (!item) {
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     const int64 dimension = numpy::PyIntOrPyLongToLong(item);
     if (dimension == -1 && PyErr_Occurred()) {
       Py_DECREF(item);
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     dimension_numbers.add_kernel_spatial_dimensions(dimension);
     Py_DECREF(item);
@@ -785,24 +822,24 @@ tensorflow::ImportNumpy();
 
   o = PyObject_GetAttrString($input, "output_spatial_dimensions");
   if (!o) {
-    return NULL;
+    SWIG_fail;
   }
   length = PySequence_Size(o);
   if (length == -1) {
     Py_DECREF(o);
-    return NULL;
+    SWIG_fail;
   }
   for (int i = 0; i < length; ++i) {
     PyObject* item = PySequence_GetItem(o, i);
     if (!item) {
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     const int64 dimension = numpy::PyIntOrPyLongToLong(item);
     if (dimension == -1 && PyErr_Occurred()) {
       Py_DECREF(item);
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     dimension_numbers.add_output_spatial_dimensions(dimension);
     Py_DECREF(item);
@@ -819,16 +856,32 @@ tensorflow::ImportNumpy();
   if ($input == Py_None) {
     $1 = NULL;
   } else {
-    PyObject* o = PyObject_GetAttrString($input, "generate_hlo_graph");
-    if (!o) {
-      return NULL;
+    if (!HandleStringAttribute($input, "generate_hlo_graph", [&](string s) {
+      build_options.set_generate_hlo_graph(std::move(s));
+    })) {
+      return nullptr;
+    }
+    if (!HandleStringAttribute($input, "dump_optimized_hlo_proto_to", [&](string s) {
+      build_options.set_dump_optimized_hlo_proto_to(std::move(s));
+    })) {
+      return nullptr;
+    }
+    if (!HandleStringAttribute($input, "dump_per_pass_hlo_proto_to", [&](string s) {
+      build_options.set_dump_per_pass_hlo_proto_to(std::move(s));
+    })) {
+      return nullptr;
+    }
+
+    PyObject* o = PyObject_GetAttrString($input, "hlo_profile");
+    if (o == NULL) {
+      SWIG_fail;
     }
     if (o != Py_None) {
-      if (!PyString_Check(o)) {
-        PyErr_SetString(PyExc_TypeError, "ExecutableBuildOptions.generate_hlo_graph must be a string or None.");
-        return NULL;
+      if (!PyBool_Check(o)) {
+        PyErr_SetString(PyExc_TypeError, "ExecutableBuildOptions.hlo_profile must be a bool or None.");
+        SWIG_fail;
       }
-      build_options.set_generate_hlo_graph(PyString_AsString(o));
+      build_options.set_hlo_profile(o == Py_True);
     }
     Py_DECREF(o);
 
@@ -841,7 +894,7 @@ tensorflow::ImportNumpy();
       if (!statusor.ok()) {
         PyErr_SetString(PyExc_TypeError, tensorflow::strings::StrCat("ExecutableBuildOptions.result_shape could not be created from Python shape value: ", statusor.status().ToString()).c_str());
         Py_DECREF(o);
-        return NULL;
+        SWIG_fail;
       }
       build_options.set_result_layout(statusor.ValueOrDie());
     }
@@ -907,6 +960,7 @@ tensorflow::ImportNumpy();
 %unignore xla::swig::LocalComputationBuilder::RngBernoulli;
 %unignore xla::swig::LocalComputationBuilder::While;
 %unignore xla::swig::LocalComputationBuilder::Conditional;
+%unignore xla::swig::LocalComputationBuilder::IsConstant;
 %unignore xla::swig::LocalComputationBuilder::Eq;
 %unignore xla::swig::LocalComputationBuilder::Ne;
 %unignore xla::swig::LocalComputationBuilder::Ge;
diff --git a/tensorflow/compiler/xla/python/numpy_bridge.cc b/tensorflow/compiler/xla/python/numpy_bridge.cc
index 3d87480728aab1d4ebbc71c6c7504d37cae5edaf..eec48479c929ab0823fef342fc284bfdc4b1f339 100644
--- a/tensorflow/compiler/xla/python/numpy_bridge.cc
+++ b/tensorflow/compiler/xla/python/numpy_bridge.cc
@@ -170,8 +170,7 @@ static string PyObjectCppStr(PyObject* o) {
   return ExtractStringAndDecref(s);
 }
 
-// Safely returns a repr of the given Python object o as a C++ string.
-static string PyObjectCppRepr(PyObject* o) {
+string PyObjectCppRepr(PyObject* o) {
   PyObject* r = PyObject_Repr(o);
   return ExtractStringAndDecref(r);
 }
diff --git a/tensorflow/compiler/xla/python/numpy_bridge.h b/tensorflow/compiler/xla/python/numpy_bridge.h
index adfcc3b8588dce01718bb19dea936bace483be4d..9656cb1c31c39dbe54293700c2765d0723255657 100644
--- a/tensorflow/compiler/xla/python/numpy_bridge.h
+++ b/tensorflow/compiler/xla/python/numpy_bridge.h
@@ -107,6 +107,9 @@ void CopyLiteralToNumpyArray(const Literal& literal, PyArrayObject* py_array) {
   std::copy(source.begin(), source.end(), dest);
 }
 
+// Safely returns a repr of the given Python object o as a C++ string.
+string PyObjectCppRepr(PyObject* o);
+
 // Workarounds for Python 2 and 3 interop
 
 PyObject* LongToPyIntOrPyLong(long x);  // NOLINT
diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py
index 90cda42f3227c80826ffbf4e5473647c2795544d..9c81f6439d0d9f0a0f0d1d3402e9c1ada46e8691 100644
--- a/tensorflow/compiler/xla/python/xla_client.py
+++ b/tensorflow/compiler/xla/python/xla_client.py
@@ -320,6 +320,9 @@ class CompileOptions(object):
 
   def __init__(self):
     self.generate_hlo_graph = None
+    self.dump_optimized_hlo_proto_to = None
+    self.dump_per_pass_hlo_proto_to = None
+    self.hlo_profile = False
 
 
 def transfer_to_infeed(value, replica_number=None):
@@ -1025,6 +1028,20 @@ class ComputationBuilder(object):
             _unwrap_data_handle(false_operand),
             false_computation.c_local_computation))
 
+  def IsConstant(self, operand, num_parameters=0):
+    """Enqueues an IsConstant operation onto the computation.
+
+    Args:
+      operand: a ComputationDataHandle to test.
+      num_parameters: optional int, number of computation parameters to treat as
+        constant (default 0).
+
+    Returns: bool indicating whether `operand` is a compile-time constant,
+      meaning its value does not depend on parameters with index greater than or
+      equal to `num_parameters`.
+    """
+    return self._client.IsConstant(_unwrap_data_handle(operand), num_parameters)
+
   def Dot(self, lhs, rhs):
     """Enqueues a dot operation onto the computation.
 
diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py
index 4c16c1f8b07a28d8098e92e27f81a126ed9bdf0c..d97264ea640787ab865f3cd64867addedd73cc1d 100644
--- a/tensorflow/compiler/xla/python/xla_client_test.py
+++ b/tensorflow/compiler/xla/python/xla_client_test.py
@@ -855,6 +855,17 @@ class SingleOpTest(LocalComputationTest):
     self.assertTrue(np.all(lo <= result))
     self.assertTrue(np.all(result < hi))
 
+  def testIsConstant(self):
+    c = self._NewComputation()
+    a = c.ConstantS32Scalar(3)
+    b = c.ConstantS32Scalar(1)
+    x = c.ParameterFromNumpy(NumpyArrayS32(0))
+    const_expr = c.Sub(b, a)
+    non_const_expr = c.Mul(const_expr, x)
+    self.assertTrue(c.IsConstant(const_expr))
+    self.assertFalse(c.IsConstant(non_const_expr))
+    # self.assertTrue(c.IsConstant(c.Sub(c.Add(x, a), x)))  # TODO(b/77245564)
+
 
 class EmbeddedComputationsTest(LocalComputationTest):
   """Tests for XLA graphs with embedded computations (such as maps)."""
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index fba20c94cafea587bffcd766d1122d6327f32182..3a99d84bea63636870609a01c10f2bb3e0e5e8d7 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -285,6 +285,23 @@ cc_library(
     ],
 )
 
+tf_cc_test(
+    name = "dfs_hlo_visitor_with_default_test",
+    srcs = ["dfs_hlo_visitor_with_default_test.cc"],
+    deps = [
+        ":hlo",
+        ":hlo_runner",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
+    ],
+)
+
 cc_library(
     name = "hlo_reachability",
     srcs = ["hlo_reachability.cc"],
@@ -623,6 +640,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:executable_build_options",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
     ],
@@ -712,7 +730,6 @@ cc_library(
         ":computation_layout",
         ":device_memory_allocator",
         ":hlo",
-        ":hlo_cost_analysis",
         ":hlo_execution_profile",
         ":hlo_graph_dumper",
         ":pool",
@@ -1129,6 +1146,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
     ],
 )
 
@@ -1275,6 +1293,18 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "gather_expander_test",
+    srcs = ["gather_expander_test.cc"],
+    deps = [
+        ":gather_expander",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/tests:test_macros_header",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
+        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
+    ],
+)
+
 cc_library(
     name = "conditional_simplifier",
     srcs = ["conditional_simplifier.cc"],
@@ -1566,6 +1596,7 @@ cc_library(
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:window_util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
@@ -2619,17 +2650,3 @@ cc_library(
         "//tensorflow/core:lib",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
index be7aa307d2c9f70ba8d334b842a4ff29a49687f9..0e4624fd69e623efca780937c5347dbf6bb9afe1 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
@@ -302,7 +302,7 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
   // Disable dot strength reduction on platforms where it causes a slowdown.
   bool enable_dot_strength_reduction_;
 
-  // Disable convolution simplication on platforms where it causes a slowdown.
+  // Disable convolution simplification on platforms where it causes a slowdown.
   bool enable_conv_simplification_;
 };
 
@@ -385,7 +385,7 @@ Status AlgebraicSimplifierVisitor::HandleAdd(HloInstruction* add) {
     auto* c2 = rhs;
 
     TF_ASSIGN_OR_RETURN(auto* sum_of_constants,
-                        CreateBinaryHlo(HloOpcode::kAdd, c1, c2));
+                        MakeBinaryHlo(HloOpcode::kAdd, c1, c2));
     return ReplaceWithNewInstruction(
         add, HloInstruction::CreateBinary(add->shape(), HloOpcode::kAdd,
                                           lhs->mutable_operand(0),
@@ -636,16 +636,14 @@ Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide) {
   // (A / B) / (C / D)  =>  (A / B)*(D / C) => (A * D) / (B * C)
   if (lhs->opcode() == HloOpcode::kDivide &&
       rhs->opcode() == HloOpcode::kDivide) {
-    TF_ASSIGN_OR_RETURN(
-        auto a_times_d,
-        CreateBinaryHlo(HloOpcode::kMultiply, lhs->mutable_operand(0),
-                        rhs->mutable_operand(1)));
-    TF_ASSIGN_OR_RETURN(
-        auto b_times_c,
-        CreateBinaryHlo(HloOpcode::kMultiply, lhs->mutable_operand(1),
-                        rhs->mutable_operand(0)));
-    TF_ASSIGN_OR_RETURN(auto new_divide, CreateBinaryHlo(HloOpcode::kDivide,
-                                                         a_times_d, b_times_c));
+    TF_ASSIGN_OR_RETURN(auto a_times_d, MakeBinaryHlo(HloOpcode::kMultiply,
+                                                      lhs->mutable_operand(0),
+                                                      rhs->mutable_operand(1)));
+    TF_ASSIGN_OR_RETURN(auto b_times_c, MakeBinaryHlo(HloOpcode::kMultiply,
+                                                      lhs->mutable_operand(1),
+                                                      rhs->mutable_operand(0)));
+    TF_ASSIGN_OR_RETURN(auto new_divide, MakeBinaryHlo(HloOpcode::kDivide,
+                                                       a_times_d, b_times_c));
 
     return ReplaceInstruction(divide, new_divide);
   }
@@ -654,7 +652,7 @@ Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide) {
   if (lhs->opcode() == HloOpcode::kDivide) {
     TF_ASSIGN_OR_RETURN(
         auto b_times_c,
-        CreateBinaryHlo(HloOpcode::kMultiply, lhs->mutable_operand(1), rhs));
+        MakeBinaryHlo(HloOpcode::kMultiply, lhs->mutable_operand(1), rhs));
     return ReplaceWithNewInstruction(
         divide,
         HloInstruction::CreateBinary(divide->shape(), HloOpcode::kDivide,
@@ -663,9 +661,8 @@ Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide) {
 
   // A / (B / C) => (A*C) / B
   if (rhs->opcode() == HloOpcode::kDivide) {
-    TF_ASSIGN_OR_RETURN(
-        auto a_times_c,
-        CreateBinaryHlo(HloOpcode::kMultiply, lhs, rhs->mutable_operand(1)));
+    TF_ASSIGN_OR_RETURN(auto a_times_c, MakeBinaryHlo(HloOpcode::kMultiply, lhs,
+                                                      rhs->mutable_operand(1)));
     return ReplaceWithNewInstruction(
         divide,
         HloInstruction::CreateBinary(divide->shape(), HloOpcode::kDivide,
@@ -1124,10 +1121,10 @@ bool OutputIsSubsetOfOperandElements(HloInstruction* instruction,
 
 Status AlgebraicSimplifierVisitor::HandleBroadcast(HloInstruction* broadcast) {
   auto operand = broadcast->mutable_operand(0);
+  auto dims = broadcast->dimensions();
   // A degenerate broadcast of a reshape that does not change the number of
   // elements can be replaced by a reshape.
-  if (std::is_sorted(broadcast->dimensions().begin(),
-                     broadcast->dimensions().end()) &&
+  if (std::is_sorted(dims.begin(), dims.end()) &&
       ShapeUtil::ElementsIn(broadcast->shape()) ==
           ShapeUtil::ElementsIn(operand->shape())) {
     VLOG(10) << "transform broadcast(X) -> reshape(X) where "
@@ -1145,8 +1142,8 @@ Status AlgebraicSimplifierVisitor::HandleBroadcast(HloInstruction* broadcast) {
     VLOG(10) << "transform broadcast(X) -> transpose(X) where "
                 "n(broadcast(X)) == n(X)";
     return ReplaceWithNewInstruction(
-        broadcast, HloInstruction::CreateTranspose(broadcast->shape(), operand,
-                                                   broadcast->dimensions()));
+        broadcast,
+        HloInstruction::CreateTranspose(broadcast->shape(), operand, dims));
   }
 
   // A broadcast of a reshape which merely inserts 1-sized dimensions can
@@ -1160,7 +1157,6 @@ Status AlgebraicSimplifierVisitor::HandleBroadcast(HloInstruction* broadcast) {
     if (merely_inserts_or_deletes_1_sized_dimensions &&
         deleted_indices.empty()) {
       std::reverse(inserted_indices.begin(), inserted_indices.end());
-      auto dims = broadcast->dimensions();
       for (auto inserted_index : inserted_indices) {
         dims.erase(dims.begin() + inserted_index);
       }
@@ -1204,6 +1200,19 @@ Status AlgebraicSimplifierVisitor::HandleBroadcast(HloInstruction* broadcast) {
         return user->ReplaceAllUsesWith(new_broadcast);
       }
     }
+    return Status::OK();
+  }
+
+  // Merge two consecutive broadcasts into a single one.
+  if (operand->opcode() == HloOpcode::kBroadcast) {
+    std::vector<int64> new_dimensions;
+    for (auto dim : operand->dimensions()) {
+      new_dimensions.push_back(dims[dim]);
+    }
+    return ReplaceWithNewInstruction(
+        broadcast,
+        HloInstruction::CreateBroadcast(
+            broadcast->shape(), operand->mutable_operand(0), new_dimensions));
   }
   return Status::OK();
 }
@@ -1300,8 +1309,8 @@ Status AlgebraicSimplifierVisitor::HandlePad(HloInstruction* pad) {
     }
 
     TF_ASSIGN_OR_RETURN(HloInstruction * nonzero_pad,
-                        CreatePadHlo(pad->mutable_operand(0),
-                                     pad->mutable_operand(1), nonzero_padding));
+                        MakePadHlo(pad->mutable_operand(0),
+                                   pad->mutable_operand(1), nonzero_padding));
     // Copy the layout from the original pad instructions. The new pad and the
     // slice instruction should all have the same layout.
     TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes(
@@ -1329,7 +1338,7 @@ Status AlgebraicSimplifierVisitor::HandlePad(HloInstruction* pad) {
 
     TF_ASSIGN_OR_RETURN(
         HloInstruction * slice,
-        CreateSliceHlo(nonzero_pad, start_indices, end_indices, strides));
+        MakeSliceHlo(nonzero_pad, start_indices, end_indices, strides));
 
     // Verify that the slice shape matches the pad shape.
     TF_RET_CHECK(ShapeUtil::Compatible(slice->shape(), pad->shape()));
@@ -1722,18 +1731,29 @@ Status AlgebraicSimplifierVisitor::HandleReduceWindow(
                                   function));
   }
 
-  VLOG(10) << "Considering folding Pad: " << operand->ToString()
-           << "\ninto reduce-window: " << reduce_window->ToString();
-
   // This optimization folds a pad op into reduce_window.
-  if (operand->opcode() != HloOpcode::kPad) {
+  HloInstruction* pad;
+  const HloInstruction* convert = nullptr;
+  if (operand->opcode() == HloOpcode::kPad) {
+    pad = operand;
+  } else if (operand->opcode() == HloOpcode::kConvert &&
+             operand->operand(0)->opcode() == HloOpcode::kPad) {
+    convert = operand;
+    pad = operand->mutable_operand(0);
+  } else {
     VLOG(10) << "Not folding pad into reduce-window as there is no pad.";
     return Status::OK();
   }
 
+  VLOG(10) << "Considering folding Pad: " << pad->ToString()
+           << "\ninto reduce-window: " << reduce_window->ToString()
+           << (convert != nullptr ? tensorflow::strings::StrCat(
+                                        "\nvia convert: ", convert->ToString())
+                                  : "");
+
   // Do not fold interior padding into ReduceWindow since the backends do not
   // support it.
-  const PaddingConfig& pad_config = operand->padding_config();
+  const PaddingConfig& pad_config = pad->padding_config();
   if (HasInteriorPadding(pad_config)) {
     VLOG(10) << "Not folding pad into reduce-window due to interior padding.";
     return Status::OK();
@@ -1741,14 +1761,27 @@ Status AlgebraicSimplifierVisitor::HandleReduceWindow(
 
   // If reduce_window already has padding, the pad value of the pad op and the
   // init value of reduce_window must match to allow folding the pad.
-  const HloInstruction* pad_value = operand->operand(1);
+  const HloInstruction* pad_value = pad->operand(1);
   const HloInstruction* reduce_init_value = reduce_window->operand(1);
   if (pad_value != reduce_init_value) {
+    auto literals_are_equivalent = [&] {
+      auto& pad_literal = pad_value->literal();
+      auto& reduce_init_literal = reduce_init_value->literal();
+      if (pad_literal == reduce_init_literal) {
+        return true;
+      }
+      auto converted_pad_literal = pad_literal.ConvertToShape(
+          reduce_init_value->shape(), /*round_f32_to_bf16=*/true);
+      if (!converted_pad_literal.ok()) {
+        return false;
+      }
+      return *converted_pad_literal.ValueOrDie() == reduce_init_literal;
+    };
     // The pad value is usually a constant, so we handle that case and do not
     // try to get more fancy about proving equivalence in cases beyond that.
     if (pad_value->opcode() != HloOpcode::kConstant ||
         reduce_init_value->opcode() != HloOpcode::kConstant ||
-        pad_value->literal() != reduce_init_value->literal()) {
+        !literals_are_equivalent()) {
       VLOG(10) << "Not folding pad into reduce-window due to different pad "
                   "values.";
       return Status::OK();
@@ -1757,7 +1790,7 @@ Status AlgebraicSimplifierVisitor::HandleReduceWindow(
 
   // If the pad puts a single non-identity value in each window that we're
   // reducing, then this is a broadcast.
-  HloInstruction* pad_operand = operand->mutable_operand(0);
+  HloInstruction* pad_operand = pad->mutable_operand(0);
   auto is_effective_broadcast = [&] {
     if (window_util::HasStride(window)) {
       VLOG(10) << "Window has stride.";
@@ -1801,6 +1834,18 @@ Status AlgebraicSimplifierVisitor::HandleReduceWindow(
     VLOG(10) << "Found window covers a single unpadded element.";
     return true;
   };
+
+  HloInstruction* new_reduce_window_operand;
+  if (convert != nullptr) {
+    new_reduce_window_operand =
+        computation_->AddInstruction(HloInstruction::CreateConvert(
+            ShapeUtil::ChangeElementType(pad_operand->shape(),
+                                         convert->shape().element_type()),
+            pad_operand));
+  } else {
+    new_reduce_window_operand = pad_operand;
+  }
+
   if (is_effective_broadcast()) {
     VLOG(10) << "Replacing pad/reduce-window with (implicit) broadcast.";
     auto fadd = [this](std::unique_ptr<HloInstruction> x) {
@@ -1809,7 +1854,7 @@ Status AlgebraicSimplifierVisitor::HandleReduceWindow(
     return ReplaceWithNewInstruction(
         reduce_window, HloInstruction::CreateBroadcastSequence(
                            /*output_shape=*/reduce_window->shape(),
-                           /*operand=*/pad_operand, fadd));
+                           /*operand=*/new_reduce_window_operand, fadd));
   }
 
   // Carry out the folding of the pad into reduce_window.
@@ -1826,10 +1871,11 @@ Status AlgebraicSimplifierVisitor::HandleReduceWindow(
     window_dim.set_padding_high(window_dim.padding_high() +
                                 pad_dim.edge_padding_high());
   }
+
   return ReplaceWithNewInstruction(
       reduce_window, HloInstruction::CreateReduceWindow(
                          /*shape=*/reduce_window->shape(),
-                         /*operand=*/pad_operand,
+                         /*operand=*/new_reduce_window_operand,
                          /*init_value=*/reduce_window->mutable_operand(1),
                          /*window=*/new_window,
                          /*reduce_computation=*/function));
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.h b/tensorflow/compiler/xla/service/algebraic_simplifier.h
index 43315f5cdc7afbe79039420320f4a0d0535e11f1..c48196e861a559a5abfa360841ec70b39356fa2b 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.h
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.h
@@ -23,7 +23,7 @@ limitations under the License.
 
 namespace xla {
 
-// A pass which performs AlgebraicSimplications.
+// A pass which performs algebraic simplifications.
 class AlgebraicSimplifier : public HloPassInterface {
  public:
   // Given shapes 'from_shape' and 'to_shape', determines if it is valid to
@@ -57,10 +57,10 @@ class AlgebraicSimplifier : public HloPassInterface {
   bool is_layout_sensitive_;
   ValidBitcastCallback valid_bitcast_callback_;
 
-  // Enable dot simplication on platforms where it is profitable.
+  // Enable dot simplification on platforms where it is profitable.
   bool enable_dot_strength_reduction_;
 
-  // Enable convolution simplication on platforms where it is profitable.
+  // Enable convolution simplification on platforms where it is profitable.
   bool enable_conv_simplification_;
 };
 
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
index 451294ef5d8367686d7fc22b7f5ebfde89d14d42..20c549562d5153c802c1e675a8ff1c92426b8832 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
@@ -35,6 +35,8 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 
+using ::testing::ElementsAre;
+
 namespace xla {
 namespace {
 
@@ -2336,6 +2338,91 @@ TEST_F(AlgebraicSimplifierTest, FoldPadIntoReduceWindow) {
   EXPECT_EQ(root->window().dimensions(3).padding_high(), 102);
 }
 
+// Test that ReduceWindow(Convert(Pad(op, x)), y) can simplify to
+// ReduceWindow(Convert(op), x).
+TEST_F(AlgebraicSimplifierTest, FoldConvertedPadIntoReduceWindow) {
+  HloModule module(TestName());
+  HloComputation::Builder builder(TestName());
+
+  // Create operand to the pad.
+  HloInstruction* parameter =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          0, ShapeUtil::MakeShape(BF16, {1, 2, 3, 4}), "p0"));
+
+  // Create the pad.
+  PaddingConfig padding = MakeNoPaddingConfig(4);
+  padding.mutable_dimensions(1)->set_edge_padding_low(1);
+  padding.mutable_dimensions(3)->set_edge_padding_high(2);
+
+  HloInstruction* pad_value = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(5.0f)));
+  HloInstruction* pad = builder.AddInstruction(HloInstruction::CreatePad(
+      ShapeUtil::MakeShape(BF16, {1, 3, 3, 5}), parameter, pad_value, padding));
+
+  HloInstruction* convert =
+      builder.AddInstruction(HloInstruction::CreateConvert(
+          ShapeUtil::ChangeElementType(pad->shape(), F32), pad));
+
+  // Create add computation.
+  HloComputation* add_computation = nullptr;
+  {
+    HloComputation::Builder builder(TestName() + ".add");
+    const Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
+    HloInstruction* p0 = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, scalar_shape, "p0"));
+    HloInstruction* p1 = builder.AddInstruction(
+        HloInstruction::CreateParameter(1, scalar_shape, "p1"));
+    builder.AddInstruction(
+        HloInstruction::CreateBinary(scalar_shape, HloOpcode::kAdd, p0, p1));
+    add_computation = module.AddEmbeddedComputation(builder.Build());
+  }
+
+  // Create the reduce-window.
+  Window window;
+  for (int64 i = 0; i < ShapeUtil::Rank(pad->shape()); ++i) {
+    auto* dim = window.add_dimensions();
+    dim->set_size(1);
+    dim->set_padding_low(10);
+    dim->set_padding_high(100);
+    dim->set_window_dilation(1);
+    dim->set_base_dilation(1);
+  }
+  const Shape reduce_window_shape =
+      ShapeUtil::MakeShape(F32, {111, 113, 113, 115});
+  HloInstruction* reduce_init_value = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(5.0f)));
+  HloInstruction* reduce_window =
+      builder.AddInstruction(HloInstruction::CreateReduceWindow(
+          reduce_window_shape, convert, reduce_init_value, window,
+          add_computation));
+
+  // Build the computation and run the simplifier.
+  auto computation = module.AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root, reduce_window);
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(&module).ValueOrDie());
+
+  // Running simplification again should not result in any further changes.
+  ASSERT_FALSE(simplifier.Run(&module).ValueOrDie());
+
+  // Verify the result
+  root = computation->root_instruction();
+  EXPECT_THAT(root, op::ReduceWindow(op::Convert(parameter), op::Constant()));
+  EXPECT_TRUE(ShapeUtil::Equal(root->shape(), reduce_window_shape))
+      << ShapeUtil::HumanString(root->shape()) << " vs "
+      << ShapeUtil::HumanString(reduce_window_shape);
+  EXPECT_EQ(root->window().dimensions(0).padding_low(), 10);
+  EXPECT_EQ(root->window().dimensions(1).padding_low(), 11);
+  EXPECT_EQ(root->window().dimensions(2).padding_low(), 10);
+  EXPECT_EQ(root->window().dimensions(3).padding_low(), 10);
+  EXPECT_EQ(root->window().dimensions(0).padding_high(), 100);
+  EXPECT_EQ(root->window().dimensions(1).padding_high(), 100);
+  EXPECT_EQ(root->window().dimensions(2).padding_high(), 100);
+  EXPECT_EQ(root->window().dimensions(3).padding_high(), 102);
+}
+
 TEST_F(AlgebraicSimplifierTest, ReversalOfTrivialDimensionsToBitcast) {
   HloComputation::Builder builder(TestName());
   const Shape shape = ShapeUtil::MakeShape(F32, {448, 2048, 1, 1});
@@ -2462,6 +2549,55 @@ TEST_F(AlgebraicSimplifierTest, TrivialDynamicUpdateSlice) {
               op::DynamicSlice(op::Parameter(), op::Parameter()));
 }
 
+// Test that two consecutive broadcasts can be merged to one.
+TEST_F(AlgebraicSimplifierTest, MergeBroadcasts) {
+  HloComputation::Builder builder(TestName());
+  Shape r2f32 = ShapeUtil::MakeShape(F32, {2, 2});
+  HloInstruction* input_array = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR1<float>({3, 4})));
+  HloInstruction* inner_bcast = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(r2f32, input_array, {1}));
+  Shape r3f32 = ShapeUtil::MakeShape(F32, {2, 2, 2});
+  builder.AddInstruction(
+      HloInstruction::CreateBroadcast(r3f32, inner_bcast, {0, 2}));
+
+  auto computation = module().AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kBroadcast);
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  root = computation->root_instruction();
+  EXPECT_THAT(root, op::Broadcast(op::Constant()));
+  EXPECT_THAT(root->dimensions(), ElementsAre(2));
+}
+
+// Test that two consecutive broadcasts can be merged to one.
+TEST_F(AlgebraicSimplifierTest, MergeBroadcasts2) {
+  HloComputation::Builder builder(TestName());
+  Shape r2f32 = ShapeUtil::MakeShape(F32, {2, 3});
+  Shape r3f32 = ShapeUtil::MakeShape(F32, {2, 5, 3});
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r2f32, "param0"));
+  // The initial dimensions go to places 0 and 2 in the 3-dim array,
+  // and to places 1 and 3 in the 4-dim array,
+  HloInstruction* inner_bcast = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(r3f32, param0, {0, 2}));
+  Shape r4f32 = ShapeUtil::MakeShape(F32, {4, 2, 5, 3});
+  builder.AddInstruction(
+      HloInstruction::CreateBroadcast(r4f32, inner_bcast, {1, 2, 3}));
+
+  auto computation = module().AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kBroadcast);
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  root = computation->root_instruction();
+  EXPECT_THAT(root, op::Broadcast(op::Parameter(0)));
+  EXPECT_THAT(root->dimensions(), ElementsAre(1, 3));
+}
+
 struct PadReduceWindowEffectiveBroadcastCase {
   std::vector<int64> input_spatials;
   std::vector<int64> symmetric_pad_spatials;
diff --git a/tensorflow/compiler/xla/service/bfloat16_conversion_folding.cc b/tensorflow/compiler/xla/service/bfloat16_conversion_folding.cc
index 432448e9bbc7db30ed67a0130d52b060032362d5..08d0152e3cfcfcb7ae1e85f72c2f7dc856f5e8b3 100644
--- a/tensorflow/compiler/xla/service/bfloat16_conversion_folding.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_conversion_folding.cc
@@ -34,6 +34,9 @@ class BFloat16ConversionFoldingVisitor : public DfsHloVisitorWithDefault {
 
   Status DefaultAction(HloInstruction* hlo) override;
 
+  // Special handling for cross-replica-sum which can have a tuple output.
+  Status HandleCrossReplicaSum(HloInstruction* crs) override;
+
   static bool Run(HloComputation* computation,
                   const BFloat16Support* bfloat16_support) {
     BFloat16ConversionFoldingVisitor visitor(computation, bfloat16_support);
@@ -84,6 +87,25 @@ Status BFloat16ConversionFoldingVisitor::FoldOperandConversion(
   return Status::OK();
 }
 
+namespace {
+
+// Returns whether hlo has users and all users are conversions from F32 to BF16.
+bool AllUsersAreF32ToBF16Converts(const HloInstruction* hlo) {
+  if (hlo->user_count() == 0 || hlo->shape().element_type() != F32) {
+    return false;
+  }
+  for (const auto user : hlo->users()) {
+    if (user->opcode() == HloOpcode::kConvert &&
+        user->shape().element_type() == BF16) {
+      continue;
+    }
+    return false;
+  }
+  return true;
+}
+
+}  // namespace
+
 Status BFloat16ConversionFoldingVisitor::TryFoldBF16Conversions(
     HloInstruction* hlo) {
   std::vector<int64> bf16_to_f32_operands;
@@ -104,22 +126,9 @@ Status BFloat16ConversionFoldingVisitor::TryFoldBF16Conversions(
     }
   }
 
-  bool fold_output_conversion = hlo->user_count() > 0 &&
-                                hlo->shape().element_type() == F32 &&
-                                bfloat16_support_->SupportsBF16Output(*hlo) &&
-                                hlo != computation_->root_instruction();
-  if (fold_output_conversion) {
-    for (auto user : hlo->users()) {
-      if (user->opcode() == HloOpcode::kConvert &&
-          user->shape().element_type() == BF16) {
-        continue;
-      }
-      // We should not change the output type if any user is not a conversion
-      // from F32 to BF16.
-      fold_output_conversion = false;
-      break;
-    }
-  }
+  const bool fold_output_conversion =
+      AllUsersAreF32ToBF16Converts(hlo) &&
+      bfloat16_support_->SupportsBF16Output(*hlo);
 
   if (!bfloat16_support_->SupportsMixedPrecisions(*hlo)) {
     if (has_other_f32_operands ||
@@ -171,6 +180,52 @@ Status BFloat16ConversionFoldingVisitor::DefaultAction(HloInstruction* hlo) {
   return TryFoldBF16Conversions(hlo);
 }
 
+Status BFloat16ConversionFoldingVisitor::HandleCrossReplicaSum(
+    HloInstruction* crs) {
+  if (!ShapeUtil::IsTuple(crs->shape()) ||
+      !bfloat16_support_->SupportsMixedPrecisions(*crs)) {
+    return DefaultAction(crs);
+  }
+
+  // First use DefaultAction() to handle the operands. It can't handle
+  // tuple-shaped output.
+  TF_RETURN_IF_ERROR(DefaultAction(crs));
+
+  // Then do per-tuple-element handling on the output.
+  std::vector<std::vector<HloInstruction*>> per_tuple_element_gtes(
+      crs->operand_count());
+  for (auto user : crs->users()) {
+    if (user->opcode() != HloOpcode::kGetTupleElement) {
+      return Status::OK();
+    }
+    per_tuple_element_gtes[user->tuple_index()].push_back(user);
+  }
+
+  for (int64 i = 0; i < crs->operand_count(); ++i) {
+    // Fold conversions only when all the get-tuple-elements' users are
+    // conversions from F32 to BF16.
+    auto all_gte_users_are_bf16_convert = [&per_tuple_element_gtes, i]() {
+      for (auto gte : per_tuple_element_gtes[i]) {
+        if (!AllUsersAreF32ToBF16Converts(gte)) {
+          return false;
+        }
+      }
+      return true;
+    };
+    if (!all_gte_users_are_bf16_convert()) {
+      continue;
+    }
+
+    ShapeUtil::GetMutableSubshape(crs->mutable_shape(), {i})
+        ->set_element_type(BF16);
+    for (auto gte : per_tuple_element_gtes[i]) {
+      TF_RETURN_IF_ERROR(FoldOutputConversions(gte));
+    }
+  }
+
+  return Status::OK();
+}
+
 StatusOr<bool> BFloat16ConversionFolding::Run(HloModule* module) {
   XLA_VLOG_LINES(
       2, "BFloat16ConversionFolding::Run(), before:\n" + module->ToString());
diff --git a/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc b/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc
index cb37759439debf41a305ec7dccaa548e1bf234cd..28e71c2054f59ba4d5d096bf7d898161877bb42f 100644
--- a/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc
@@ -37,7 +37,8 @@ class TestBFloat16Support : public BFloat16Support {
     if (hlo.opcode() == HloOpcode::kAdd ||
         hlo.opcode() == HloOpcode::kSubtract ||
         hlo.opcode() == HloOpcode::kTuple ||
-        hlo.opcode() == HloOpcode::kGetTupleElement) {
+        hlo.opcode() == HloOpcode::kGetTupleElement ||
+        hlo.opcode() == HloOpcode::kCrossReplicaSum) {
       return true;
     }
     return false;
@@ -47,7 +48,8 @@ class TestBFloat16Support : public BFloat16Support {
     if (hlo.opcode() == HloOpcode::kAdd ||
         hlo.opcode() == HloOpcode::kSubtract ||
         hlo.opcode() == HloOpcode::kTuple ||
-        hlo.opcode() == HloOpcode::kGetTupleElement) {
+        hlo.opcode() == HloOpcode::kGetTupleElement ||
+        hlo.opcode() == HloOpcode::kCrossReplicaSum) {
       return true;
     }
     return false;
@@ -55,7 +57,8 @@ class TestBFloat16Support : public BFloat16Support {
 
   bool SupportsMixedPrecisions(const HloInstruction& hlo) const override {
     if (hlo.opcode() == HloOpcode::kAdd || hlo.opcode() == HloOpcode::kTuple ||
-        hlo.opcode() == HloOpcode::kGetTupleElement) {
+        hlo.opcode() == HloOpcode::kGetTupleElement ||
+        hlo.opcode() == HloOpcode::kCrossReplicaSum) {
       return true;
     }
     return false;
@@ -206,4 +209,46 @@ TEST_F(BFloat16ConversionFoldingTest, DoNotFoldTuple) {
   EXPECT_EQ(tuple->operand(1), convert0);
 }
 
+TEST_F(BFloat16ConversionFoldingTest, FoldCrossReplicaSumTupleOutput) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape f32_shape = ShapeUtil::MakeShape(F32, {2, 4});
+  Shape bf16_shape = ShapeUtil::MakeShape(BF16, {2, 4});
+
+  HloInstruction* a = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, bf16_shape, "a"));
+  HloInstruction* convert_a =
+      builder.AddInstruction(HloInstruction::CreateConvert(f32_shape, a));
+  HloInstruction* b = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, f32_shape, "b"));
+
+  HloInstruction* crs =
+      builder.AddInstruction(HloInstruction::CreateCrossReplicaSum(
+          ShapeUtil::MakeTupleShape({f32_shape, f32_shape}), {convert_a, b}));
+  HloInstruction* gte_a = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(f32_shape, crs, 0));
+  HloInstruction* gte_b = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(f32_shape, crs, 1));
+  HloInstruction* convert_gte_b =
+      builder.AddInstruction(HloInstruction::CreateConvert(bf16_shape, gte_b));
+  HloInstruction* tuple = builder.AddInstruction(
+      HloInstruction::CreateTuple({gte_a, convert_gte_b}));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_TRUE(FoldConversions(module.get()));
+
+  EXPECT_EQ(computation->root_instruction(), tuple);
+  EXPECT_EQ(tuple->operand(0), gte_a);
+  EXPECT_EQ(tuple->operand(1), gte_b);
+  EXPECT_EQ(gte_a->shape().element_type(), F32);
+  EXPECT_EQ(gte_b->shape().element_type(), BF16);
+  EXPECT_EQ(crs->operand(0), a);
+  EXPECT_EQ(crs->operand(1), b);
+  EXPECT_EQ(a->shape().element_type(), BF16);
+  EXPECT_EQ(b->shape().element_type(), F32);
+  EXPECT_EQ(ShapeUtil::GetSubshape(crs->shape(), {0}).element_type(), F32);
+  EXPECT_EQ(ShapeUtil::GetSubshape(crs->shape(), {1}).element_type(), BF16);
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation.cc b/tensorflow/compiler/xla/service/bfloat16_propagation.cc
index 531f36e8c5473ef684e654ed6b89c4d5ef04b401..c26d2feef584faeff013a602409cdd58c2d44a5a 100644
--- a/tensorflow/compiler/xla/service/bfloat16_propagation.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_propagation.cc
@@ -606,8 +606,10 @@ Status BFloat16Propagation::ResolveInconsistencyOfAliasingBuffers(
         continue;
       }
       if (!ShapeUtil::Equal(hlo->literal().shape(), hlo->shape())) {
-        TF_ASSIGN_OR_RETURN(auto converted_literal,
-                            hlo->literal().ConvertToShape(hlo->shape()));
+        TF_ASSIGN_OR_RETURN(
+            auto converted_literal,
+            hlo->literal().ConvertToShape(hlo->shape(),
+                                          /*round_f32_to_bf16=*/true));
         auto new_constant = computation->AddInstruction(
             HloInstruction::CreateConstant(std::move(converted_literal)));
         TF_RETURN_IF_ERROR(hlo->ReplaceAllUsesWith(new_constant));
@@ -627,6 +629,27 @@ Status BFloat16Propagation::ResolveInconsistencyOfAliasingBuffers(
   return Status::OK();
 }
 
+Status BFloat16Propagation::RemoveNoopConversions(HloModule* module) {
+  for (auto computation : module->computations()) {
+    for (auto hlo : computation->MakeInstructionPostOrder()) {
+      if (hlo->opcode() != HloOpcode::kConvert) {
+        continue;
+      }
+      auto source = hlo->mutable_operand(0);
+      if (!ShapeUtil::Equal(source->shape(), hlo->shape())) {
+        continue;
+      }
+      const bool is_root = hlo == computation->root_instruction();
+      TF_RETURN_IF_ERROR(hlo->ReplaceAllUsesWith(source));
+      if (is_root) {
+        computation->set_root_instruction(source);
+      }
+      TF_RETURN_IF_ERROR(computation->RemoveInstructionAndUnusedOperands(hlo));
+    }
+  }
+  return Status::OK();
+}
+
 // The algorithm first does a forward pass (parameters to root) to determine a
 // set of instructions to consider using bfloat16, then does a backward pass to
 // determine the precisions of those instructions according to the need of
@@ -677,6 +700,10 @@ StatusOr<bool> BFloat16Propagation::Run(HloModule* module) {
   // defining instruction's shape has changed. So we need to adjust the output
   // shapes of instructions according to the HLO values they refer to.
   TF_RETURN_IF_ERROR(ResolveInconsistencyOfAliasingBuffers(module));
+
+  // This pass could have turned an F32 -> BF16 conversion to a no-op (BF16 ->
+  // BF16), so we remove them now.
+  TF_RETURN_IF_ERROR(RemoveNoopConversions(module));
   return true;
 }
 
diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation.h b/tensorflow/compiler/xla/service/bfloat16_propagation.h
index 89a5ac5db1549877a135182ae8df57fa6bf9d579..1744e9db90aeff269daa91eb68a1d61bb0fc3035 100644
--- a/tensorflow/compiler/xla/service/bfloat16_propagation.h
+++ b/tensorflow/compiler/xla/service/bfloat16_propagation.h
@@ -133,6 +133,11 @@ class BFloat16Propagation : public HloPassInterface {
   // by the given HLO.
   void AdjustCalledComputationRoot(HloInstruction* hlo);
 
+  // ***************************
+  // Removes no-op conversions (same source and target shapes) that can be
+  // produced this pass.
+  Status RemoveNoopConversions(HloModule* module);
+
   // ***************************
   // Functions called and state used by two or more passes.
 
diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc
index 5950b004b3da439c442eec6e5e09ea2307fcb018..88f83014164ff726a11e45e762b9c082cf12720d 100644
--- a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc
@@ -617,4 +617,44 @@ TEST_F(BFloat16PropagationTest, DoNotPropagateWhilesCallingSameComputation) {
   EXPECT_EQ(computation->root_instruction(), dot);
 }
 
+// Tests that if this pass turns an F32 -> BF16 conversion into a no-op (BF16 ->
+// BF16 conversion), then it will remove that conversion.
+TEST_F(BFloat16PropagationTest, NoopConversionRemoved) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape f32_shape = ShapeUtil::MakeShape(F32, {4, 4});
+  Shape bf16_shape = ShapeUtil::MakeShape(BF16, {4, 4});
+
+  HloInstruction* param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, f32_shape, "param"));
+  HloInstruction* add0 = builder.AddInstruction(
+      HloInstruction::CreateBinary(f32_shape, HloOpcode::kAdd, param, param));
+  HloInstruction* add1 = builder.AddInstruction(
+      HloInstruction::CreateBinary(f32_shape, HloOpcode::kAdd, param, param));
+  HloInstruction* tuple =
+      builder.AddInstruction(HloInstruction::CreateTuple({add0, add1}));
+  HloInstruction* gte0 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(f32_shape, tuple, 0));
+  HloInstruction* gte1 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(f32_shape, tuple, 1));
+  HloInstruction* convert0 =
+      builder.AddInstruction(HloInstruction::CreateConvert(bf16_shape, gte0));
+  HloInstruction* convert1 =
+      builder.AddInstruction(HloInstruction::CreateConvert(bf16_shape, gte1));
+  HloInstruction* add2 = builder.AddInstruction(HloInstruction::CreateBinary(
+      bf16_shape, HloOpcode::kAdd, convert0, convert1));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_TRUE(PropagatePrecision(module.get()));
+
+  EXPECT_EQ(computation->root_instruction(), add2);
+  EXPECT_EQ(add2->operand(0), gte0);
+  EXPECT_EQ(add2->operand(1), gte1);
+  EXPECT_EQ(gte0->shape().element_type(), BF16);
+  EXPECT_EQ(gte1->shape().element_type(), BF16);
+  EXPECT_EQ(add0->shape().element_type(), BF16);
+  EXPECT_EQ(add1->shape().element_type(), BF16);
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/compile_only_service.cc b/tensorflow/compiler/xla/service/compile_only_service.cc
index 6664496ab6c603c35c7dce923fcf94c54d1ce714..c83da9eddc8f8b156dd9acfc99b393bf844575da 100644
--- a/tensorflow/compiler/xla/service/compile_only_service.cc
+++ b/tensorflow/compiler/xla/service/compile_only_service.cc
@@ -100,7 +100,7 @@ CompileOnlyService::CompileAheadOfTime(
     TF_ASSIGN_OR_RETURN(
         std::unique_ptr<HloModuleConfig> module_config,
         CreateModuleConfig(*program_shape, instance.argument_layouts,
-                           &execution_options, *user_computation));
+                           &execution_options, user_computation));
 
     TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> hlo_module,
                         computation_tracker_.BuildHloModule(
diff --git a/tensorflow/compiler/xla/service/compiler.h b/tensorflow/compiler/xla/service/compiler.h
index 33e19efc72c6d30ccd7e0b3a13f664a4f42208bf..b4b53ae2ed425a48de5bcb6ba5c37b5d37e1f371 100644
--- a/tensorflow/compiler/xla/service/compiler.h
+++ b/tensorflow/compiler/xla/service/compiler.h
@@ -127,7 +127,7 @@ class Compiler {
   // Compiles the HLO module for execution on a device given by the executor,
   // and returns an executable object or an error status. No HLO passes are
   // applied to module. Generally a module should be passed through RunHloPasses
-  // prior to calling this method because the some HLO passes are required for
+  // prior to calling this method because some HLO passes are required for
   // correctness. Takes ownership of the HLO module and is free to transform it.
   //
   // The compiler may optionally specialize to the individual device
diff --git a/tensorflow/compiler/xla/service/copy_insertion.cc b/tensorflow/compiler/xla/service/copy_insertion.cc
index e9c974a0461da4b79b4d4cf7a15f407ead5eb4bb..40519ecc799c8f0343294ad88009820dbd8535e9 100644
--- a/tensorflow/compiler/xla/service/copy_insertion.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion.cc
@@ -78,8 +78,9 @@ SpecialCaseCopyPolicy GetSpecialCaseCopyPolicy(const CallGraphNode& node,
     policy.copy_root_replicated_buffers = true;
   }
   for (const CallSite& site : node.caller_callsites()) {
-    // The kWhile instruction does not have an handling here, as the
-    // AddCopiesForWhile() API takes care of adding its own copies.
+    // The AddCopiesForConditional() already adds copies, but the copy remover
+    // removes them, so we re-add them by returning the policy here. But really
+    // the copy remover should not be removing them.
     if (site.instruction()->opcode() == HloOpcode::kConditional) {
       policy.copy_parameters_and_constants = true;
       policy.copy_root_replicated_buffers = true;
@@ -321,6 +322,29 @@ Status AddCopiesForWhile(const HloAliasAnalysis& alias_analysis,
   return Status::OK();
 }
 
+// We add copies for all the indices of the true and false computaiton roots,
+// in order to resolve interference. We later rely on the CopyRemover to drop
+// the unnecessary ones.
+Status AddCopiesForConditional(const HloAliasAnalysis& alias_analysis,
+                               HloInstruction* conditional) {
+  VLOG(2) << "Adding copies for kConditional instruction "
+          << conditional->name();
+  TF_RET_CHECK(conditional->opcode() == HloOpcode::kConditional);
+
+  for (HloComputation* computation :
+       {conditional->true_computation(), conditional->false_computation()}) {
+    HloInstruction* root = computation->root_instruction();
+    std::vector<HloInstruction*> users = root->users();
+    TF_ASSIGN_OR_RETURN(HloInstruction * deep_copy,
+                        computation->DeepCopyInstruction(root));
+    for (HloInstruction* user : users) {
+      TF_RETURN_IF_ERROR(root->ReplaceUseWith(user, deep_copy));
+    }
+    computation->set_root_instruction(deep_copy);
+  }
+  return Status::OK();
+}
+
 // Removes any control dependencies to or from the given instruction.
 Status StripControlDependenciesFrom(HloInstruction* instruction) {
   while (!instruction->control_successors().empty()) {
@@ -348,6 +372,9 @@ Status AddCopiesToResolveInterference(HloModule* module) {
     for (HloInstruction* instruction : computation->instructions()) {
       if (instruction->opcode() == HloOpcode::kWhile) {
         TF_RETURN_IF_ERROR(AddCopiesForWhile(*alias_analysis, instruction));
+      } else if (instruction->opcode() == HloOpcode::kConditional) {
+        TF_RETURN_IF_ERROR(
+            AddCopiesForConditional(*alias_analysis, instruction));
       }
     }
   }
@@ -596,6 +623,7 @@ class CopyRemover {
 
       auto is_live_range_before = [this](const ValueNode& a,
                                          const ValueNode& b) {
+        VLOG(3) << "Checking live range of " << *a.value << " WRT " << *b.value;
         if (LiveRangeBefore(a, b)) {
           VLOG(2) << "  Live range of " << a.value->ToShortString()
                   << " is before " << b.value->ToShortString();
@@ -610,7 +638,7 @@ class CopyRemover {
       VLOG(3) << copy->name() << " copies value "
               << src->value->ToShortString();
       VLOG(3) << "Source buffer values: " << ValueListToString(src);
-      VLOG(3) << "Dest buffer values: " << ValueListToString(src);
+      VLOG(3) << "Dest buffer values: " << ValueListToString(dest);
 
       // A kCopy instruction copies an HLO value from a source buffer and
       // defines an HLO value in a destination buffer. Most generally, the
@@ -786,16 +814,16 @@ class CopyRemover {
     // updated as copies are removed.
     bool LiveRangeBefore(const ValueNode& a, const ValueNode& b) {
       if (a.uses.empty()) {
-        VLOG(2) << "Empty uses";
+        VLOG(2) << "Empty uses for " << *a.value;
         return ordering_.IsDefinedBefore(*a.value, *b.value);
       }
       for (const HloUse* use : a.uses) {
-        VLOG(2) << "use: " << *use;
-        VLOG(2) << "is before:" << *b.value;
+        VLOG(2) << "Checking use " << *use << " against " << *b.value;
         if (!ordering_.UseIsBeforeValueDefinition(*use, *b.value, dataflow_)) {
-          VLOG(2) << "Not before";
+          VLOG(2) << "Use " << *use << " is NOT before " << *b.value;
           return false;
         }
+        VLOG(2) << "Use " << *use << " is before " << *b.value;
       }
       return true;
     }
@@ -931,7 +959,6 @@ Status RemoveUnnecessaryCopies(
   CopyRemover copy_remover(*alias_analysis, ordering, module);
   XLA_VLOG_LINES(3, copy_remover.ToString());
 
-  tensorflow::gtl::FlatSet<int> existing_copies;
   for (HloComputation* computation : module->computations()) {
     for (HloInstruction* instruction : computation->instructions()) {
       if (instruction->opcode() == HloOpcode::kCopy &&
@@ -940,7 +967,6 @@ Status RemoveUnnecessaryCopies(
       }
     }
   }
-
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index 91ae66ece11e70459db9a62782d3c24a303829c2..966e2d0fc5b5e21180795a07119cb028913dd176 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -670,6 +670,22 @@ cc_library(
     ],
 )
 
+tf_cc_test(
+    name = "ir_emission_utils_test",
+    srcs = ["ir_emission_utils_test.cc"],
+    deps = [
+        ":ir_emission_utils",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_matchers",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
+    ],
+)
+
 cc_library(
     name = "cpu_layout_assignment",
     srcs = ["cpu_layout_assignment.cc"],
@@ -772,6 +788,31 @@ cc_library(
     ],
 )
 
+tf_cc_test(
+    name = "parallel_task_assignment_test",
+    srcs = ["parallel_task_assignment_test.cc"],
+    deps = [
+        ":cpu_executable",
+        ":parallel_task_assignment",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_layout",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:algebraic_simplifier",
+        "//tensorflow/compiler/xla/service:computation_layout",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_matchers",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
+        "//tensorflow/compiler/xla/tests:test_utils",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+    ],
+)
+
 cc_library(
     name = "cpu_options",
     srcs = ["cpu_options.cc"],
@@ -875,17 +916,3 @@ tf_cc_test(
         "//tensorflow/core:test",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index 0a966fd5a7c1ce2c4e367b26701c9186ab2ebf74..e43777c5e5e8afcf08e1e334c8847f6b94d0d047 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -318,7 +318,7 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile) {
     // Note this is not run for AOT because it would bring in thread pool
     // and thread synchronization dependencies which would likely increase
     // binary size (and most AOT applications are single-threaded).
-    // TODO(29630486) Support multi-threaded AOT.
+    // TODO(b/29630486) Support multi-threaded AOT.
     pipeline.AddPass<ParallelTaskAssigner>(max_parallelism,
                                            ShapeSizeBytesFunction());
   }
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
index 267b89a10b3c038dc2048f0ad5b5b343c88ef0f9..d3502b3a03e27c8f90ed74c4d826dfab1c4e8b75 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
@@ -71,11 +71,6 @@ class CpuExecutable : public Executable {
     ir_module_string_ = ir_module_string;
   }
 
-  const Status EqualOrFail(const Executable& executable) {
-    // TODO(b/62952745) Implement equality test on CPU executable.
-    return Unimplemented("Equality test on CPU executable is not implemented.");
-  }
-
   static int64 ShapeSizeBytes(const Shape& shape);
 
   // Type of the computation function we expect in the JIT.
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
index 6f06256e08e8e3342e77c7c79a2a47465b89eca3..8b1e20d79e90fcc32e985ffb855a1a10cdd2f2b9 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
@@ -715,6 +715,11 @@ tensorflow::Status DotOpEmitter::Emit() {
   // which performs the sum-of-products (the reduction loop) before storing
   // the result in the output buffer.
 
+  // This routine assumes that the dot operation is not in a parallelized
+  // enclosing computation.
+  CHECK(
+      dot_.parent()->root_instruction()->outer_dimension_partitions().empty());
+
   const Shape& lhs_shape = lhs_array_.GetShape();
   const Shape& rhs_shape = rhs_array_.GetShape();
 
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc
index 788217aab6172b4e548452b3f6ffd4197c163ce4..f209a69e3cd0f8d336d61bafd1e22be8bc88ca3f 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc
@@ -34,14 +34,16 @@ bool PotentiallyImplementedAsEigenConvolution(
   //
   // To be sufficient, certain layout constraints need to be satisfied as well.
   const Shape& input_shape = convolution.operand(0)->shape();
-  const Shape& kernel_shape = convolution.operand(0)->shape();
+  const Shape& kernel_shape = convolution.operand(1)->shape();
   if (ShapeUtil::HasZeroElements(input_shape) ||
       ShapeUtil::HasZeroElements(kernel_shape)) {
     return false;
   }
+  // Make sure input and kernel has the same data type.
+  CHECK(
+      ShapeUtil::SameElementTypeIgnoringFpPrecision(input_shape, kernel_shape));
   // TODO(b/65408531): Explore using Eigen dot for complex64 type.
-  if (ShapeUtil::ElementIsComplex(input_shape) ||
-      ShapeUtil::ElementIsComplex(kernel_shape)) {
+  if (ShapeUtil::ElementIsComplex(input_shape)) {
     return false;
   }
   if (window_util::HasWindowReversal(convolution.window())) {
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emission_utils_test.cc b/tensorflow/compiler/xla/service/cpu/ir_emission_utils_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..215f48c4cc1a1a6b13d98dff76e0d1f0f773f5c1
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/ir_emission_utils_test.cc
@@ -0,0 +1,46 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h"
+
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
+
+namespace xla {
+namespace {
+
+TEST(IrEmitterTest, ConvWithZeroSizedKernelNotImplementedAsEigen) {
+  const char* const hlo_string = R"(
+HloModule ModuleWithConv
+
+ENTRY Conv {
+  input = f32[32,50,28,28]{3,2,1,0} parameter(0)
+  kernel = f32[0,32,5,5]{3,2,1,0} parameter(1)
+  ROOT convolution = f32[64,50,24,24]{3,2,1,0} convolution(input, kernel),
+    window={size=5x5},
+    dim_labels=b01f_01io->b01f
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          tools::Parse(hlo_string));
+
+  HloComputation* entry_computation = module->entry_computation();
+
+  HloInstruction* conv_instr = entry_computation->root_instruction();
+  EXPECT_FALSE(cpu::PotentiallyImplementedAsEigenConvolution(*conv_instr));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index 3b8056d50500cac381a1c5ad6b05028476504a47..3405277d449f2d9e558f2d3f83277163655af592 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -438,12 +438,14 @@ Status IrEmitter::EmitXfeedTransfer(XfeedKind kind, const Shape& shape,
 
   if (kind == XfeedKind::kInfeed) {
     // Copy to the program buffer address from the acquired buffer.
-    ir_builder_.CreateMemCpy(program_buffer_address, acquired_pointer,
-                             length_32, 1);
+    ir_builder_.CreateMemCpy(program_buffer_address, /*DstAlign=*/1,
+                             acquired_pointer,
+                             /*SrcAlign=*/1, length_32);
   } else {
     // Outfeed -- copy from the in-program address to the acquired buffer.
-    ir_builder_.CreateMemCpy(acquired_pointer, program_buffer_address,
-                             length_32, 1);
+    ir_builder_.CreateMemCpy(acquired_pointer, /*DstAlign=*/1,
+                             program_buffer_address,
+                             /*SrcAlign=*/1, length_32);
   }
 
   ir_builder_.CreateCall(release_func,
@@ -2441,7 +2443,8 @@ void IrEmitter::EmitTransferElements(llvm::Value* target, llvm::Value* source,
     target_array.AnnotateLoadStoreInstructionWithMetadata(store_instruction);
   } else {
     auto* memcpy_instruction = ir_builder_.CreateMemCpy(
-        target, source, element_count * primitive_type_size, element_alignment);
+        target, /*DstAlign=*/element_alignment, source,
+        /*SrcAlign=*/element_alignment, element_count * primitive_type_size);
 
     // The memcpy does the load and the store internally.  The aliasing related
     // metadata has to reflect that.
@@ -2905,7 +2908,8 @@ Status IrEmitter::EmitMemcpy(const HloInstruction& source,
   llvm::Value* destination_value = GetEmittedValueFor(&destination);
   int64 source_size = ByteSizeOf(source.shape());
   // TODO(b/63762267): Be more aggressive about specifying alignment.
-  ir_builder_.CreateMemCpy(destination_value, source_value, source_size, 1);
+  ir_builder_.CreateMemCpy(destination_value, /*DstAlign=*/1, source_value,
+                           /*SrcAlign=*/1, source_size);
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
index c393e9b8ea39bfb4c605ebba8e2cd29726bc4af9..87c0a3df458eb4b3f217192597e0de1576304367 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
@@ -83,12 +83,6 @@ class ParallelCpuExecutable : public Executable {
     return ShapeUtil::ByteSizeOf(shape, sizeof(void*));
   }
 
-  const Status EqualOrFail(const Executable& executable) {
-    // TODO(b/62952745) Implement equality test on CPU parallel executable.
-    return Unimplemented(
-        "Equality test on CPU parallel executable is not implemented.");
-  }
-
  private:
   // Allocate buffers required for execution and assign them to the elements of
   // "buffers". "buffers" should be sized to the number of buffers in buffer
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc
index deb21bf4ef5895cfdbec5c2449b6ce7b306a7008..fb28280fade307ac1f193e7dca481bd2afa855fc 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc
@@ -71,7 +71,7 @@ class DefaultCostModel : public ParallelCostModel {
     if (flops_to_bytes_ratio <= 1.0) {
       // Limit max parallelism for I/O bound instructions by assuming a
       // sub-linear scaling function (fit based on empirical benchmark results).
-      // TODO(29630486) Develop system bandwidth model.
+      // TODO(b/29630486) Develop system bandwidth model.
       max_parallelism =
           std::ceil(std::sqrt(tensorflow::port::NumSchedulableCPUs()));
       // Use shape size instruction cost and L2 cache size min per-thread cost.
@@ -81,7 +81,7 @@ class DefaultCostModel : public ParallelCostModel {
       // Use max parallelism for compute bound instructions.
       max_parallelism = max_parallelism_;
       // Calculate the instruction cost in cycles.
-      // TODO(29630486) Improve on this linear cost model.
+      // TODO(b/29630486) Improve on this linear cost model.
       // Consider making 'min_cost_per_thread' be a function of the target
       // bandwidth limit for instructions with low arithmetic complexity.
       instruction_cost =
@@ -128,24 +128,25 @@ int64 ParallelTaskAssignment::GetTargetParallelTaskCount(
   // one of the following properties:
   // *) Internal threading (library calls to kConv, kDot, kFft, kCustomCall).
   // *) Emit custom loops (kSelectAndScatter, FusionKind::kTransposeDot).
+  // *) Operations that are not thread safe (like infeed and rng).
   // *) Tuple-shaped.
   // TODO(b/27458679) Parallelize instructions which are skipped here.
-  if (instruction->opcode() == HloOpcode::kParameter ||
-      instruction->opcode() == HloOpcode::kConstant ||
-      instruction->opcode() == HloOpcode::kCall ||
-      instruction->opcode() == HloOpcode::kCustomCall ||
-      instruction->opcode() == HloOpcode::kSelectAndScatter ||
-      instruction->opcode() == HloOpcode::kGetTupleElement ||
-      instruction->opcode() == HloOpcode::kBitcast ||
-      instruction->opcode() == HloOpcode::kFft ||
-      (instruction->opcode() == HloOpcode::kConvolution &&
+  auto opcode = instruction->opcode();
+  if (opcode == HloOpcode::kParameter || opcode == HloOpcode::kConstant ||
+      opcode == HloOpcode::kCall || opcode == HloOpcode::kCustomCall ||
+      opcode == HloOpcode::kDot || opcode == HloOpcode::kSelectAndScatter ||
+      opcode == HloOpcode::kGetTupleElement || opcode == HloOpcode::kBitcast ||
+      opcode == HloOpcode::kFft || opcode == HloOpcode::kInfeed ||
+      opcode == HloOpcode::kOutfeed || opcode == HloOpcode::kRng ||
+      (opcode == HloOpcode::kConvolution &&
        PotentiallyImplementedAsEigenConvolution(*instruction)) ||
       PotentiallyImplementedAsEigenDot(*instruction) ||
-      (instruction->opcode() == HloOpcode::kFusion &&
+      (opcode == HloOpcode::kFusion &&
        instruction->fusion_kind() != HloInstruction::FusionKind::kLoop) ||
       ShapeUtil::IsTuple(instruction->shape())) {
     return 1;
   }
+
   // Consult 'cost_model_' to compute target parallel task count.
   return cost_model_->GetParallelTaskCount(instruction);
 }
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..13eb75a57213b1a68a5732a4f6061efdf97fa4f4
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc
@@ -0,0 +1,118 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h"
+#include "tensorflow/compiler/xla/service/cpu/cpu_executable.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+
+namespace xla {
+namespace {
+
+class ParallelTaskAssignmentTest : public HloVerifiedTestBase {
+ protected:
+  const HloCostAnalysis::ShapeSizeFunction shape_size_func_ =
+      cpu::CpuExecutable::ShapeSizeBytes;
+
+  // Use any value larger than 2 since we only test whether a module is
+  // parallelized or not
+  const int max_parallelism_ = 10;
+};
+
+TEST_F(ParallelTaskAssignmentTest, DotOperationNotParallelized) {
+  const string hlo_string = R"(
+    HloModule TestTaskParallel_Dot
+    ENTRY Dot {
+      dot_lhs = f32[196614,2]{1,0} parameter(0)
+      dot_rhs = f32[2,1]{1,0} parameter(1)
+      ROOT dot = f32[196614,1]{1,0} dot(dot_lhs, dot_rhs),
+        lhs_contracting_dims={1}, rhs_contracting_dims={0}
+    }
+  )";
+
+  ParseAndVerifyModule(hlo_string);
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, cpu::ParallelTaskAssigner(
+                                            max_parallelism_, shape_size_func_)
+                                            .Run(&module()));
+  EXPECT_FALSE(changed);
+}
+
+TEST_F(ParallelTaskAssignmentTest,
+       FusedComputationWithDotOperationNotParallelized) {
+  const string hlo_string = R"(
+    HloModule TestTaskParallel_DotNestedInFusedComp
+    fused_computation.0 {
+      parameter.0 = f32[196614,2]{1,0} parameter(0)
+      parameter.0.1 = f32[2,1]{1,0} parameter(1)
+      parameter.0.2 = f32[196614,1]{1,0} parameter(2)
+      dot.0 = f32[196614,1]{1,0} dot(parameter.0, parameter.0.1),
+        lhs_contracting_dims={1}, rhs_contracting_dims={0}
+      ROOT add.0 = f32[196614,1]{1,0} add(dot.0, parameter.0.2)
+
+    }
+    ENTRY DotNestedInFusedComp {
+      parameter = f32[196614,2]{1,0} parameter(0)
+      parameter.1 = f32[2,1]{1,0} parameter(1)
+      parameter.2 = f32[196614,1]{1,0} parameter(2)
+      ROOT fusion = f32[196614,1]{1,0} fusion(parameter, parameter.1,
+        parameter.2), kind=kOutput, calls=fused_computation.0
+    }
+  )";
+
+  ParseAndVerifyModule(hlo_string);
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, cpu::ParallelTaskAssigner(
+                                            max_parallelism_, shape_size_func_)
+                                            .Run(&module()));
+  EXPECT_FALSE(changed);
+}
+
+TEST_F(ParallelTaskAssignmentTest, RngOperationNotParallelized) {
+  const string hlo_string = R"(
+    HloModule TestTaskParallel_rng
+    ENTRY Rng {
+      src0 = f32[] parameter(0)
+      src1 = f32[] parameter(1)
+      ROOT rng0 = f32[1234567,2]{1,0} rng(f32[] src0, f32[] src1),
+      distribution=rng_uniform
+    }
+  )";
+
+  ParseAndVerifyModule(hlo_string);
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, cpu::ParallelTaskAssigner(
+                                            max_parallelism_, shape_size_func_)
+                                            .Run(&module()));
+  EXPECT_FALSE(changed);
+}
+
+TEST_F(ParallelTaskAssignmentTest, InfeedOutfeedOperationNotParallelized) {
+  const string hlo_string = R"(
+    HloModule TestTaskParallel_infeed_outfeed
+    ENTRY InfeedOutfeed {
+      infeed0 = u32[12345678,2]{1,0} infeed()
+      ROOT outfeed0 = u32[12345678,2]{1,0} outfeed(infeed0)
+    }
+  )";
+
+  ParseAndVerifyModule(hlo_string);
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, cpu::ParallelTaskAssigner(
+                                            max_parallelism_, shape_size_func_)
+                                            .Run(&module()));
+  EXPECT_FALSE(changed);
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/shape_partition.cc b/tensorflow/compiler/xla/service/cpu/shape_partition.cc
index 61b408b8c24dded134218110d4e219c31f1685a8..42fe955f1917e0268dc739e44fbd0a7afb39185c 100644
--- a/tensorflow/compiler/xla/service/cpu/shape_partition.cc
+++ b/tensorflow/compiler/xla/service/cpu/shape_partition.cc
@@ -20,12 +20,13 @@ namespace cpu {
 
 std::vector<int64> ShapePartitionAssigner::Run(int64 target_partition_count) {
   // Gather outer-most dims where dim_size >= 'target_partition_count'.
-  // Note: always leave inner-dim static for vectorization/optimizations.
+  // This may include the inner-dim as LLVM can vectorize loops with dynamic
+  // bounds.
   std::vector<int64> outer_dims;
   int64 outer_dim_size = 1;
   // TODO(b/27458679) Consider reserving enough minor dimensions (based on
   // target vector register width) to enable vector instructions.
-  for (int i = shape_.layout().minor_to_major_size() - 1; i >= 1; --i) {
+  for (int i = shape_.layout().minor_to_major_size() - 1; i >= 0; --i) {
     const int64 dimension = shape_.layout().minor_to_major(i);
     outer_dims.push_back(dimension);
     outer_dim_size *= shape_.dimensions(dimension);
diff --git a/tensorflow/compiler/xla/service/cpu/shape_partition_test.cc b/tensorflow/compiler/xla/service/cpu/shape_partition_test.cc
index ee0c53fa6d7c41481a53350e57e5844dea2644c1..ae80a6f4977f85cfd9f872734fd0a69432a1f382 100644
--- a/tensorflow/compiler/xla/service/cpu/shape_partition_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/shape_partition_test.cc
@@ -30,105 +30,65 @@ class ShapePartitionAssignerTest : public HloTestBase {
  protected:
   typedef std::vector<int64> Vec;
 
-  void RunR2Test(const Shape& shape, const int64 expected_max_partition_count) {
+  void RunR2Test(const Shape& shape, int64 max_target_partition_count,
+                 const std::vector<int64>* expected_partitions) {
     ShapePartitionAssigner assigner(shape);
-    // Check all partitions of outer dimension.
-    for (int64 i = 1; i <= expected_max_partition_count; ++i) {
-      EXPECT_TRUE(ContainersEqual(Vec({i}),
-                                  assigner.Run(/*target_partition_count=*/i)));
+    // Iterate through 1..max_target_partition_count.
+    for (int64 i = 1; i <= max_target_partition_count; ++i) {
+      std::vector<int64> actual_partitions =
+          assigner.Run(/*target_partition_count=*/i);
+      EXPECT_THAT(actual_partitions, expected_partitions[i - 1]);
     }
-    // Check target_partition_count > outer dimension size.
-    EXPECT_TRUE(ContainersEqual(
-        Vec({expected_max_partition_count}),
-        assigner.Run(
-            /*target_partition_count=*/expected_max_partition_count + 1)));
   }
 };
 
 TEST_F(ShapePartitionAssignerTest, Shape13WithLayout10) {
-  RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {1, 3}, {1, 0}), 1);
+  std::vector<int64> expected_partitions[] = {{1} /* 1 */, {1, 2} /* 2 */};
+  RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {1, 3}, {1, 0}), 2,
+            expected_partitions);
 }
 
 TEST_F(ShapePartitionAssignerTest, Shape31WithLayout01) {
-  RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {3, 1}, {0, 1}), 1);
+  std::vector<int64> expected_partitions[] = {
+      {1} /* 1 */, {1, 2} /* 2 */
+  };
+  RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {3, 1}, {0, 1}), 2,
+            expected_partitions);
 }
 
 TEST_F(ShapePartitionAssignerTest, Shape53WithLayout10) {
-  RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {5, 3}, {1, 0}), 5);
+  std::vector<int64> expected_partitions[] = {{1} /* 1 */, {2} /* 2 */,
+                                              {3} /* 3 */, {4} /* 4 */,
+                                              {5} /* 5 */, {3, 2} /* 6 */};
+  RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {5, 3}, {1, 0}), 6,
+            expected_partitions);
 }
 
 TEST_F(ShapePartitionAssignerTest, Shape53WithLayout01) {
-  RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {5, 3}, {0, 1}), 3);
+  std::vector<int64> expected_partitions[] = {
+      {1} /* 1 */, {2} /* 2 */, {3} /* 3 */, {2, 2} /* 4 */};
+  RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {5, 3}, {0, 1}), 4,
+            expected_partitions);
 }
 
 TEST_F(ShapePartitionAssignerTest, Shape532WithLayout210) {
-  Shape shape = ShapeUtil::MakeShapeWithLayout(F32, {5, 3, 2}, {2, 1, 0});
-  ShapePartitionAssigner assigner(shape);
-
-  for (int64 i = 1; i <= 5; ++i) {
-    EXPECT_TRUE(ContainersEqual(Vec({i}), assigner.Run(
-                                              /*target_partition_count=*/i)));
-  }
-
-  EXPECT_TRUE(
-      ContainersEqual(Vec({3, 2}), assigner.Run(/*target_partition_count=*/6)));
-  EXPECT_TRUE(
-      ContainersEqual(Vec({3, 2}), assigner.Run(/*target_partition_count=*/7)));
-  EXPECT_TRUE(
-      ContainersEqual(Vec({4, 2}), assigner.Run(/*target_partition_count=*/8)));
-  EXPECT_TRUE(
-      ContainersEqual(Vec({3, 3}), assigner.Run(/*target_partition_count=*/9)));
-  EXPECT_TRUE(ContainersEqual(Vec({3, 3}),
-                              assigner.Run(/*target_partition_count=*/10)));
-  EXPECT_TRUE(ContainersEqual(Vec({3, 3}),
-                              assigner.Run(/*target_partition_count=*/11)));
-  EXPECT_TRUE(ContainersEqual(Vec({4, 3}),
-                              assigner.Run(/*target_partition_count=*/12)));
-  EXPECT_TRUE(ContainersEqual(Vec({4, 3}),
-                              assigner.Run(/*target_partition_count=*/13)));
-  EXPECT_TRUE(ContainersEqual(Vec({4, 3}),
-                              assigner.Run(/*target_partition_count=*/14)));
-  EXPECT_TRUE(ContainersEqual(Vec({5, 3}),
-                              assigner.Run(/*target_partition_count=*/15)));
-  EXPECT_TRUE(ContainersEqual(Vec({5, 3}),
-                              assigner.Run(/*target_partition_count=*/16)));
+  std::vector<int64> expected_partitions[] = {
+      {1} /* 1 */,     {2} /* 2 */,     {3} /* 3 */,     {4} /* 4 */,
+      {5} /* 5 */,     {3, 2} /* 6 */,  {3, 2} /* 7 */,  {4, 2} /* 8 */,
+      {3, 3} /* 9 */,  {3, 3} /* 10 */, {3, 3} /* 11 */, {4, 3} /* 12 */,
+      {4, 3} /* 13 */, {4, 3} /* 14 */, {5, 3} /* 15 */, {4, 2, 2} /* 16 */};
+  RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {5, 3, 2}, {2, 1, 0}), 16,
+            expected_partitions);
 }
 
 TEST_F(ShapePartitionAssignerTest, Shape532WithLayout201) {
-  Shape shape = ShapeUtil::MakeShapeWithLayout(F32, {5, 3, 2}, {2, 0, 1});
-  ShapePartitionAssigner assigner(shape);
-
-  for (int64 i = 1; i <= 3; ++i) {
-    EXPECT_TRUE(ContainersEqual(Vec({i}), assigner.Run(
-                                              /*target_partition_count=*/i)));
-  }
-
-  EXPECT_TRUE(
-      ContainersEqual(Vec({2, 2}), assigner.Run(/*target_partition_count=*/4)));
-  EXPECT_TRUE(
-      ContainersEqual(Vec({2, 2}), assigner.Run(/*target_partition_count=*/5)));
-  EXPECT_TRUE(
-      ContainersEqual(Vec({3, 2}), assigner.Run(/*target_partition_count=*/6)));
-  EXPECT_TRUE(
-      ContainersEqual(Vec({3, 2}), assigner.Run(/*target_partition_count=*/7)));
-  EXPECT_TRUE(
-      ContainersEqual(Vec({3, 2}), assigner.Run(/*target_partition_count=*/8)));
-  EXPECT_TRUE(
-      ContainersEqual(Vec({3, 3}), assigner.Run(/*target_partition_count=*/9)));
-  EXPECT_TRUE(ContainersEqual(Vec({3, 3}),
-                              assigner.Run(/*target_partition_count=*/10)));
-  EXPECT_TRUE(ContainersEqual(Vec({3, 3}),
-                              assigner.Run(/*target_partition_count=*/11)));
-  EXPECT_TRUE(ContainersEqual(Vec({3, 4}),
-                              assigner.Run(/*target_partition_count=*/12)));
-  EXPECT_TRUE(ContainersEqual(Vec({3, 4}),
-                              assigner.Run(/*target_partition_count=*/13)));
-  EXPECT_TRUE(ContainersEqual(Vec({3, 4}),
-                              assigner.Run(/*target_partition_count=*/14)));
-  EXPECT_TRUE(ContainersEqual(Vec({3, 5}),
-                              assigner.Run(/*target_partition_count=*/15)));
-  EXPECT_TRUE(ContainersEqual(Vec({3, 5}),
-                              assigner.Run(/*target_partition_count=*/16)));
+  std::vector<int64> expected_partitions[] = {
+      {1} /* 1 */,     {2} /* 2 */,     {3} /* 3 */,     {2, 2} /* 4 */,
+      {2, 2} /* 5 */,  {3, 2} /* 6 */,  {3, 2} /* 7 */,  {3, 2} /* 8 */,
+      {3, 3} /* 9 */,  {3, 3} /* 10 */, {3, 3} /* 11 */, {3, 4} /* 12 */,
+      {3, 4} /* 13 */, {3, 4} /* 14 */, {3, 5} /* 15 */, {3, 2, 2} /* 16 */};
+  RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {5, 3, 2}, {2, 0, 1}), 16,
+            expected_partitions);
 }
 
 class ShapePartitionIteratorTest : public HloTestBase {
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
index 80c24eaccfc2a83f8f3f311d60860715668d0c08..4198260a222d89c60b58dc2a11bf955715365952 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
@@ -87,7 +87,6 @@ SimpleOrcJIT::SimpleOrcJIT(const llvm::TargetOptions& target_options,
                                 /*MAttrs=*/DetectMachineAttributes()))),
       disassembler_(*target_machine_),
       data_layout_(target_machine_->createDataLayout()),
-      execution_session_(string_pool_),
       symbol_resolver_(llvm::orc::createLegacyLookupResolver(
           [this](const std::string& name) -> llvm::JITSymbol {
             return this->ResolveRuntimeSymbol(name);
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
index aaeff2de8785b99d271f13b261c63118bcf7bd4a..f4260a95bc45557b6cd969f7d3fff01c8b392575 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
@@ -102,7 +102,6 @@ class SimpleOrcJIT {
   std::unique_ptr<llvm::TargetMachine> target_machine_;
   const Disassembler disassembler_;
   const llvm::DataLayout data_layout_;
-  llvm::orc::SymbolStringPool string_pool_;
   llvm::orc::ExecutionSession execution_session_;
   std::shared_ptr<llvm::orc::SymbolResolver> symbol_resolver_;
   ObjLayerT object_layer_;
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
index ecda5288ee17a3856ce95f0caa327c3524fd180b..240faebe62f5cee4f61b3c36b5e8f653cfd6db8e 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
@@ -35,6 +35,12 @@ class HloInstruction;
 // DfsHloVisitor with default action based on the HloInstruction being visited.
 // Users should not use this class directly, but use the type aliases
 // DfsHloVisitorWithDefault/ConstDfsHloVisitorWithDefault instead.
+//
+// Do *not* add an override to this class if the opcode is covered by
+// HandleElementwiseUnary/Binary. These opcode handlers dispatch to
+// HandleElementwiseUnary/Binary in DfsHloVisitorBase. Adding such a handler
+// here will break passes which rely on the HandleElementwiseUnary/Binary
+// handling these opcodes.
 template <typename HloInstructionPtr>
 class DfsHloVisitorWithDefaultBase
     : public DfsHloVisitorBase<HloInstructionPtr> {
@@ -70,12 +76,6 @@ class DfsHloVisitorWithDefaultBase
   Status HandleConcatenate(HloInstructionPtr concatenate) override {
     return DefaultAction(concatenate);
   }
-  Status HandleConvert(HloInstructionPtr convert) override {
-    return DefaultAction(convert);
-  }
-  Status HandleCopy(HloInstructionPtr copy) override {
-    return DefaultAction(copy);
-  }
   Status HandleSelect(HloInstructionPtr select) override {
     return DefaultAction(select);
   }
@@ -91,9 +91,6 @@ class DfsHloVisitorWithDefaultBase
   Status HandleCrossReplicaSum(HloInstructionPtr crs) override {
     return DefaultAction(crs);
   }
-  Status HandleCompare(HloInstructionPtr compare) override {
-    return DefaultAction(compare);
-  }
   Status HandleRng(HloInstructionPtr random) override {
     return DefaultAction(random);
   }
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default_test.cc b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..825e1436f0ec6d49b555e5e3e9c2c7a19fb7b062
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default_test.cc
@@ -0,0 +1,90 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
+
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_runner.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+namespace {
+
+class DfsHloVisitorWithDefaultTest : public HloTestBase {};
+
+TEST_F(DfsHloVisitorWithDefaultTest, DefaultElementwiseTest) {
+  // Verify that HandleElementwiseBinary and HandleElementwiseUnary are called
+  // on the appropriate HLO ops (elementwise binary/unary ops).
+
+  class ElementwiseTestVisitor : public DfsHloVisitorWithDefault {
+   public:
+    Status DefaultAction(HloInstruction* hlo) override {
+      // The HLO should be neither an elementwise unary nor binary op. These
+      // cases are handled in HandleElementwiseBinary/Unary.
+      TF_RET_CHECK(!(hlo->IsElementwise() && hlo->operand_count() == 2))
+          << hlo->ToString();
+      TF_RET_CHECK(!(hlo->IsElementwise() && hlo->operand_count() == 1))
+          << hlo->ToString();
+      return Status::OK();
+    }
+
+    Status HandleElementwiseBinary(HloInstruction* hlo) override {
+      // HLO should be elementwise binary.
+      TF_RET_CHECK(hlo->IsElementwise() && hlo->operand_count() == 2)
+          << hlo->ToString();
+      return Status::OK();
+    }
+    Status HandleElementwiseUnary(HloInstruction* hlo) override {
+      // HLO should be elementwise unary.
+      TF_RET_CHECK(hlo->IsElementwise() && hlo->operand_count() == 1)
+          << hlo->ToString();
+      return Status::OK();
+    }
+  };
+
+  // HLO module contains are arbitrary mix of elementwise and non-elementwise
+  // operations.
+  const string& hlo_string = R"(
+HloModule TestModule
+
+ENTRY TestComputation {
+  arg = f32[] parameter(0)
+  tuple = (f32[]) tuple(arg)
+  gte = f32[] get-tuple-element(tuple), index=0
+  abs = f32[] abs(arg)
+  add = f32[] add(arg, gte)
+  broadcast = f32[42] broadcast(add), dimensions={}
+  slice = f32[0] slice(broadcast), slice={[1:2]}
+  copy = f32[] copy(arg)
+  eq = pred[] equal-to(arg, gte)
+  neg = f32[] negate(arg)
+  ROOT convert = f64[] convert(f32[] arg)
+})";
+  std::unique_ptr<HloModule> module =
+      HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest())
+          .ConsumeValueOrDie();
+  ElementwiseTestVisitor visitor;
+  TF_EXPECT_OK(module->entry_computation()->Accept(&visitor));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc
index be92b1629a2d8dae57b315751bd4f7f9ccddf171..471d2fd6cebcd7a00dfea4aca08da08af534b05f 100644
--- a/tensorflow/compiler/xla/service/executable.cc
+++ b/tensorflow/compiler/xla/service/executable.cc
@@ -80,6 +80,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> Executable::ExecuteOnStreamWrapper(
 
   StatusOr<std::unique_ptr<ShapedBuffer>> return_value =
       ExecuteOnStream(run_options, arguments, profile_ptr.get());
+  TF_RETURN_IF_ERROR(return_value.status());
 
   if (profile != nullptr) {
     VLOG(1) << "enqueueing 'stop timer' and blocking host until done...";
diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h
index 0aee535ee780ef000bc5e9963ff48786b3a61eb2..a157235f8af6ea64a488510e427bbae502c46ca6 100644
--- a/tensorflow/compiler/xla/service/executable.h
+++ b/tensorflow/compiler/xla/service/executable.h
@@ -22,7 +22,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
-#include "tensorflow/compiler/xla/service/hlo_cost_analysis.h"
 #include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
 #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
@@ -109,14 +108,6 @@ class Executable {
     return execution_profile_;
   }
 
-  // Returns Status::ok() if the two executables are equal to each other.
-  //
-  // An error status is returned otherwise.
-  virtual const Status EqualOrFail(const Executable& executable) {
-    return Unimplemented(
-        "Equality test on this executable is not implemented.");
-  }
-
   const HloProfilePrinterData& hlo_profile_printer_data() const {
     CHECK(hlo_profiling_enabled());
     return *hlo_profile_printer_data_;
diff --git a/tensorflow/compiler/xla/service/gather_expander.cc b/tensorflow/compiler/xla/service/gather_expander.cc
index a133d810675814f6be7da23a2335fb19f3ff47fc..221ff7900f398166c193c495848a2afcfd4edc81 100644
--- a/tensorflow/compiler/xla/service/gather_expander.cc
+++ b/tensorflow/compiler/xla/service/gather_expander.cc
@@ -39,7 +39,7 @@ static StatusOr<HloInstruction*> TransposeIndexVectorDimToLast(
     }
   }
   permutation.push_back(index_vector_dim);
-  return CreateTransposeHlo(gather_indices, permutation);
+  return MakeTransposeHlo(gather_indices, permutation);
 }
 
 // If the gather_indices holds scalar indices (i.e. gather_indices has rank N
@@ -53,9 +53,14 @@ static StatusOr<HloInstruction*> DeScalarizeGatherIndices(
     return gather_indices;
   }
 
-  int64 last_index = gather_indices_shape.dimensions(
-      gather_indices_shape.dimensions_size() - 1);
-  return ExpandLastDimIntoNDims(gather_indices, {last_index, 1});
+  DCHECK_EQ(index_vector_dim, gather_indices_shape.dimensions_size());
+
+  std::vector<int64> result_shape_dims;
+  c_copy(gather_indices_shape.dimensions(),
+         std::back_inserter(result_shape_dims));
+  result_shape_dims.push_back(1);
+
+  return MakeReshapeHlo(result_shape_dims, gather_indices);
 }
 
 // Canonicalizes the gather_indices tensors so that we only have deal with some
@@ -81,16 +86,17 @@ static StatusOr<HloInstruction*> CanonicalizeGatherIndices(
   // all of the non-index-vector dimensions.
   const Shape& shape = transposed_gather_indices->shape();
   if (shape.dimensions_size() == 1) {
-    return ExpandFirstDimIntoNDims(gather_indices, {1, shape.dimensions(0)});
+    return ExpandFirstDimIntoNDims(transposed_gather_indices,
+                                   {1, shape.dimensions(0)});
   } else {
     return CollapseFirstNDims(transposed_gather_indices,
                               shape.dimensions_size() - 1);
   }
 }
 
-// Expands out the gather dimensions in the accumulator produced by the while
-// loop.
-static StatusOr<HloInstruction*> ExpandGatherDimsInAccumulator(
+// Expands out or contracts away the gather dimensions in the accumulator
+// produced by the while loop.
+static StatusOr<HloInstruction*> AdjustGatherDimsInAccumulator(
     const Shape& gather_indices_shape, HloInstruction* accumulator,
     int64 index_vector_dim) {
   std::vector<int64> output_gather_dim_bounds;
@@ -103,9 +109,14 @@ static StatusOr<HloInstruction*> ExpandGatherDimsInAccumulator(
 
   if (output_gather_dim_bounds.empty()) {
     // If output_gather_dim_bounds is empty we must be lowering a (effectively)
-    // dynamic-slice.
+    // dynamic-slice.  In that case, there is a leading degenerate gather
+    // dimension that we added to make this special case play well with the
+    // general while loop which we need to remove now.
     CHECK_EQ(accumulator->shape().dimensions(0), 1);
-    return CollapseFirstNDims(accumulator, 2);
+    ArraySlice<int64> reshaped_dim_sizes =
+        AsInt64Slice(accumulator->shape().dimensions());
+    reshaped_dim_sizes.remove_prefix(1);
+    return MakeReshapeHlo(reshaped_dim_sizes, accumulator);
   }
 
   return ExpandFirstDimIntoNDims(accumulator, output_gather_dim_bounds);
@@ -133,16 +144,16 @@ static StatusOr<HloInstruction*> ExpandIndexVectorIntoOperandSpace(
         dim_numbers.gather_dims_to_operand_dims_size()) {
       TF_ASSIGN_OR_RETURN(
           HloInstruction * component_to_concat,
-          CreateSliceHlo(
-              index_vector, /*start_indices=*/{index_vector_dim_index},
-              /*limit_indices=*/{index_vector_dim_index + 1}, /*strides=*/{1}));
+          MakeSliceHlo(index_vector, /*start_indices=*/{index_vector_dim_index},
+                       /*limit_indices=*/{index_vector_dim_index + 1},
+                       /*strides=*/{1}));
       expanded_index_components.push_back(component_to_concat);
     } else {
       expanded_index_components.push_back(zero);
     }
   }
 
-  return CreateConcatHlo(expanded_index_components, /*dimension=*/0);
+  return MakeConcatHlo(expanded_index_components, /*dimension=*/0);
 }
 
 // This generates the body of the while that implements the main data movement
@@ -159,8 +170,8 @@ static StatusOr<std::vector<HloInstruction*>> GatherLoopBody(
 
   TF_ASSIGN_OR_RETURN(
       HloInstruction * induction_var_as_vector,
-      CreateBroadcastHlo(induction_var, /*broadcast_dimensions=*/{},
-                         /*result_shape_bounds=*/{1}));
+      MakeBroadcastHlo(induction_var, /*broadcast_dimensions=*/{},
+                       /*result_shape_bounds=*/{1}));
 
   TF_ASSIGN_OR_RETURN(
       HloInstruction * index_into_gather_indices,
@@ -169,8 +180,8 @@ static StatusOr<std::vector<HloInstruction*>> GatherLoopBody(
 
   TF_ASSIGN_OR_RETURN(
       HloInstruction * index_vector_2d,
-      CreateDynamicSliceHlo(gather_indices, index_into_gather_indices,
-                            {1, index_vector_size}));
+      MakeDynamicSliceHlo(gather_indices, index_into_gather_indices,
+                          {1, index_vector_size}));
 
   TF_ASSIGN_OR_RETURN(HloInstruction * index_vector,
                       ElideDegenerateDims(index_vector_2d, {0}));
@@ -181,8 +192,8 @@ static StatusOr<std::vector<HloInstruction*>> GatherLoopBody(
                           operand->shape().dimensions_size()));
 
   TF_ASSIGN_OR_RETURN(HloInstruction * gathered_slice,
-                      CreateDynamicSliceHlo(operand, gathered_slice_start,
-                                            gather.gather_window_bounds()));
+                      MakeDynamicSliceHlo(operand, gathered_slice_start,
+                                          gather.gather_window_bounds()));
 
   TF_ASSIGN_OR_RETURN(
       HloInstruction * gathered_slice_for_update,
@@ -197,8 +208,8 @@ static StatusOr<std::vector<HloInstruction*>> GatherLoopBody(
 
   TF_ASSIGN_OR_RETURN(
       HloInstruction * updated_accumulator,
-      CreateDynamicUpdateSliceHlo(output_accumulator, gathered_slice_for_update,
-                                  index_vector_into_accumulator));
+      MakeDynamicUpdateSliceHlo(output_accumulator, gathered_slice_for_update,
+                                index_vector_into_accumulator));
 
   // New loop state -- only the accumulator has changed.  The
   // WhileUtil::MakeCountedLoop functions takes care of the induction variable
@@ -250,7 +261,7 @@ static StatusOr<HloInstruction*> PermuteGatherAndWindowDims(
     }
   }
 
-  return CreateTransposeHlo(accumulator, permutation);
+  return MakeTransposeHlo(accumulator, permutation);
 }
 
 // High Level Algorithm
@@ -290,21 +301,38 @@ static StatusOr<HloInstruction*> PermuteGatherAndWindowDims(
 
 StatusOr<HloInstruction*> GatherExpander::ExpandGather(
     HloInstruction* gather_instr) {
+  CHECK(!ShapeUtil::HasZeroElements(gather_instr->shape()));
+
   HloComputation* computation = gather_instr->parent();
   HloInstruction* operand = gather_instr->mutable_operand(0);
   HloInstruction* gather_indices = gather_instr->mutable_operand(1);
+  const Shape& gather_indices_shape = gather_indices->shape();
   const Shape& output_shape = gather_instr->shape();
   int64 output_rank = output_shape.dimensions_size();
 
   const GatherDimensionNumbers& dim_numbers =
       gather_instr->gather_dimension_numbers();
 
+  int64 gather_loop_trip_count = 1;
+  for (int64 i = 0, e = gather_indices_shape.dimensions_size(); i < e; i++) {
+    if (i != dim_numbers.index_vector_dim()) {
+      gather_loop_trip_count *= gather_indices_shape.dimensions(i);
+    }
+  }
+
+  if (!IsInt32(gather_loop_trip_count)) {
+    return Unimplemented(
+        "Gather operations with more than 2147483647 gather indices are not "
+        "supported. This error occurred for %s.",
+        gather_instr->ToString().c_str());
+  }
+
   TF_ASSIGN_OR_RETURN(HloInstruction * canonical_gather_indices,
                       CanonicalizeGatherIndices(
                           gather_indices, dim_numbers.index_vector_dim()));
 
-  const int64 gather_loop_trip_count =
-      canonical_gather_indices->shape().dimensions(0);
+  CHECK_EQ(gather_loop_trip_count,
+           canonical_gather_indices->shape().dimensions(0));
 
   TF_ASSIGN_OR_RETURN(
       HloInstruction * accumulator_init,
@@ -331,7 +359,7 @@ StatusOr<HloInstruction*> GatherExpander::ExpandGather(
 
   TF_ASSIGN_OR_RETURN(
       HloInstruction * accumulator_with_output_gather_dims_decanonicalized,
-      ExpandGatherDimsInAccumulator(gather_indices->shape(),
+      AdjustGatherDimsInAccumulator(gather_indices->shape(),
                                     accumulator_with_window_dims_elided,
                                     dim_numbers.index_vector_dim()));
 
@@ -341,12 +369,17 @@ StatusOr<HloInstruction*> GatherExpander::ExpandGather(
 }
 
 StatusOr<bool> GatherExpander::Run(HloModule* module) {
+  auto is_nontrivial_gather = [](HloInstruction* inst) {
+    return inst->opcode() == HloOpcode::kGather &&
+           // Avoid expanding gather ops that produce zero sized tensors,
+           // instead punt these to ZeroSizedHloElimination.
+           !ShapeUtil::HasZeroElements(inst->shape());
+  };
+
   std::vector<HloInstruction*> gather_instrs;
   for (HloComputation* computation : module->MakeNonfusionComputations()) {
     c_copy_if(computation->instructions(), std::back_inserter(gather_instrs),
-              [](HloInstruction* inst) {
-                return inst->opcode() == HloOpcode::kGather;
-              });
+              is_nontrivial_gather);
   }
 
   for (HloInstruction* inst : gather_instrs) {
diff --git a/tensorflow/compiler/xla/service/gather_expander_test.cc b/tensorflow/compiler/xla/service/gather_expander_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ba41ee8428cbe7132103df24d552565a8dc2f9f6
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gather_expander_test.cc
@@ -0,0 +1,51 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gather_expander.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
+
+namespace xla {
+namespace {
+TEST(GatherExpanderTest, ErrorStatusOnTooManyIndices) {
+  const string hlo_text = R"(
+HloModule TensorFlowGatherMultipleBatchDims
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2147483647,5] parameter(1)
+  ROOT gather = s32[2147483647,3,5] gather(operand, indices),
+      output_window_dims={1},
+      elided_window_dims={1},
+      gather_dims_to_operand_dims={1},
+      index_vector_dim=2,
+      window_bounds={3, 1}
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          tools::Parse(hlo_text));
+
+  Status status = GatherExpander{}.Run(module.get()).status();
+  EXPECT_EQ(status.code(), tensorflow::error::UNIMPLEMENTED);
+
+  ASSERT_THAT(
+      status.error_message(),
+      ::testing::HasSubstr("Gather operations with more than 2147483647 gather "
+                           "indices are not supported."));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index a3b7e10ae8df080879ce98b02b83f246bb19204b..f1707442fe3354d5183d905468810f3871146ff5 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -241,6 +241,7 @@ cc_library(
         "gpu_executable.cc",
         "infeed_thunk.cc",
         "kernel_thunk.cc",
+        "memset_thunk.cc",
         "sequential_thunk.cc",
         "thunk_schedule.cc",
         "tuple_thunk.cc",
@@ -257,6 +258,7 @@ cc_library(
         "gpu_executable.h",
         "infeed_thunk.h",
         "kernel_thunk.h",
+        "memset_thunk.h",
         "sequential_thunk.h",
         "thunk.h",
         "thunk_schedule.h",
@@ -273,6 +275,7 @@ cc_library(
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:shape_tree",
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
@@ -293,6 +296,7 @@ cc_library(
         "//tensorflow/core/platform/default/build_config:cudnn_plugin",
         "//tensorflow/core/platform/default/build_config:cufft_plugin",
         "//tensorflow/core/platform/default/build_config:stream_executor_cuda",  # build_cleaner: keep
+        "//tensorflow/stream_executor",
     ],
 )
 
@@ -696,17 +700,3 @@ tf_cc_test(
         "//tensorflow/core:test",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
index c67b552abbdc971351f99ec89536af78479b87c1..07be2a0cf90c326af6e41764e79950db546e43e4 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -671,6 +671,8 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
 
   if (module->config().hlo_profiling_enabled()) {
     HloCostAnalysis cost_analysis(ShapeSizeBytesFunction());
+    cost_analysis.set_bytes_per_second(
+        stream_exec->GetDeviceDescription().memory_bandwidth());
     TF_RETURN_IF_ERROR(module->entry_computation()->Accept(&cost_analysis));
     profile_index_map = MakeUnique<HloProfileIndexMap>(*module);
     profile_printer =
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index 04b37d913e0bc8f8226057f107da05fd1e675010..28f93447953b90d8a7fa4386e2355066c0405aec 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -267,16 +267,22 @@ StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteOnStream(
        ++i) {
     const BufferAllocation& allocation = assignment_->GetAllocation(i);
     if (allocation.is_entry_computation_parameter()) {
-      // The caller must give us a buffer for ShapeIndex {} of every parameter.
-      // It can optionally give us a buffer for other ShapeIndices, but we
-      // ignore them: Because we can't rely on these sub-buffers' addresses
-      // being available, our generated code can't use them.  Instead, it must
-      // chase pointers starting at the tuple root.
-      if (allocation.param_shape_index().empty()) {
-        auto param_no = allocation.parameter_number();
-        buffer_allocations_builder.RegisterBuffer(
-            i, arguments[param_no]->root_buffer());
+      auto param_no = allocation.parameter_number();
+      se::DeviceMemoryBase buffer =
+          arguments[param_no]->buffer(allocation.param_shape_index());
+
+      // All top-level buffers and sub-buffers must have an explicit, non-null
+      // pointer, except for zero-sized buffers, which may be null.
+      if (buffer.is_null() && buffer.size() > 0) {
+        return FailedPrecondition(
+            "Cannot run XLA computation because pointer to (sub-)buffer at "
+            "index %s of parameter %lld was null.  All pointers to "
+            "(sub-)buffers must not be null, unless the (sub-)buffer has zero "
+            "elements.",
+            allocation.param_shape_index().ToString().c_str(), param_no);
       }
+
+      buffer_allocations_builder.RegisterBuffer(i, buffer);
     }
   }
   se::StreamExecutor* executor = run_options->stream()->parent();
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
index b19cfd43debd0a5490495d176fa2f1fcd625da07..dcb3991f41a31db84d8e9e555ae7d13c3ac84b97 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
@@ -83,11 +83,6 @@ class GpuExecutable : public Executable {
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) override;
 
-  const Status EqualOrFail(const Executable& executable) {
-    // TODO(b/62952745) Implement equality test on GPU executable.
-    return Unimplemented("Equality test on GPU executable is not implemented.");
-  }
-
  private:
   // If `block_host_until_done` is false, execution will not block the host
   // until the kernels have completed. This is used as an optimization for
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index 2381d7a7d59ba2777e711138779b4493b8037f3d..d29cc21ab1c697f8481ed1e94846d4df5ec5c1dc 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <algorithm>
+#include <cstring>
 #include <memory>
 #include <string>
 #include <vector>
@@ -44,6 +46,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emitter_context.h"
 #include "tensorflow/compiler/xla/service/gpu/kernel_thunk.h"
+#include "tensorflow/compiler/xla/service/gpu/memset_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h"
 #include "tensorflow/compiler/xla/service/gpu/partition_assignment.h"
 #include "tensorflow/compiler/xla/service/gpu/sequential_thunk.h"
@@ -142,37 +145,6 @@ void UpdateLaunchDimensions(const LaunchDimensions& launch_dims, Thunk* thunk,
        llvm::ConstantAsMetadata::get(threads_per_block_ir_value)}));
 }
 
-// Tries to get a Slice for the given instruction at the given index, but
-// returns nullopt if we might not know the slice's address at runtime without
-// dereferencing a containing tuple.
-//
-// In particular, when XLA accepts a parameter of tuple type, the caller has the
-// option of telling XLA what are the values inside of the tuple, or just giving
-// XLA a pointer to the top-level tuple and letting us chase the pointers on the
-// GPU.  We therefore cannot rely having these pointers to parameter sub-buffers
-// being present when we run the program.
-optional<BufferAllocation::Slice> GetKnownAtRuntimeSlice(
-    const HloInstruction* instr, const ShapeIndex& index,
-    const BufferAssignment& buffer_assn) {
-  auto maybe_slice = buffer_assn.GetUniqueSlice(instr, index);
-  if (!maybe_slice.ok()) {
-    return nullopt;
-  }
-  // BufferAllocation gives a slice and alloc to every buffer accessed by XLA,
-  // but we don't necessarily know the runtime address of sub-buffers of input
-  // parameters.
-  const BufferAllocation::Slice& slice = maybe_slice.ValueOrDie();
-  const BufferAllocation* alloc = slice.allocation();
-  if (alloc->IsInputOrOutput() && !alloc->maybe_live_out() &&
-      !alloc->param_shape_index().empty()) {
-    return nullopt;
-  }
-
-  // Otherwise, we will know the address of this slice at runtime without having
-  // to dereference a tuple.
-  return slice;
-}
-
 }  // namespace
 
 IrEmitterUnnested::IrEmitterUnnested(const HloModuleConfig& hlo_module_config,
@@ -203,7 +175,7 @@ bool ImplementedAsHostToDeviceMemcpy(const BufferAssignment& buffer_assignment,
   return hlo.opcode() == HloOpcode::kCopy &&
          hlo.operand(0)->opcode() == HloOpcode::kConstant &&
          ShapeUtil::Equal(hlo.operand(0)->shape(), hlo.shape()) &&
-         GetKnownAtRuntimeSlice(&hlo, {}, buffer_assignment).has_value();
+         buffer_assignment.GetUniqueTopLevelSlice(&hlo).ok();
 }
 
 bool ImplementedAsDeviceToDeviceMemcpy(
@@ -213,13 +185,13 @@ bool ImplementedAsDeviceToDeviceMemcpy(
   //
   // 1. `hlo` is a kCopy instruction.
   // 2. `hlo` and its operand have the same shape (thus the same layout too).
-  // 3. The operand to `hlo` has a buffer assignment (constants do not, for
-  //    instance) which means the source buffer also resides on the device.
+  // 3. `hlo` and its operand have a statically-known buffer assignment
+  //     (constants do not, for instance), which means the source buffer also
+  //     resides on the device.
   return hlo.opcode() == HloOpcode::kCopy &&
          ShapeUtil::Equal(hlo.operand(0)->shape(), hlo.shape()) &&
-         GetKnownAtRuntimeSlice(&hlo, {}, buffer_assignment).has_value() &&
-         GetKnownAtRuntimeSlice(hlo.operand(0), {}, buffer_assignment)
-             .has_value();
+         buffer_assignment.GetUniqueTopLevelSlice(&hlo).ok() &&
+         buffer_assignment.GetUniqueTopLevelSlice(hlo.operand(0)).ok();
 }
 }  // namespace
 
@@ -498,12 +470,11 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) {
     switch (root->opcode()) {
       case HloOpcode::kReduce: {
         VLOG(3) << "Emitting fused reduction to vector: " << fusion->ToString();
+        TF_ASSIGN_OR_RETURN(std::unique_ptr<Thunk> initializer_thunk,
+                            BuildInitializerThunk(fusion));
         std::vector<std::unique_ptr<Thunk>> thunks;
-        thunks.emplace_back(BuildKernelThunk(fusion));
-        TF_RETURN_IF_ERROR(EmitInitializer(
-            fusion, static_cast<KernelThunk*>(thunks.back().get())));
-        bindings_.UnbindAllLocalIrValues();
-        thunks.emplace_back(BuildKernelThunk(fusion));
+        thunks.push_back(std::move(initializer_thunk));
+        thunks.push_back(BuildKernelThunk(fusion));
         thunk_sequence_->emplace_back(
             MakeUnique<SequentialThunk>(std::move(thunks), fusion));
         std::vector<llvm_ir::IrArray> parameter_arrays;
@@ -1635,14 +1606,14 @@ Status IrEmitterUnnested::HandleReduce(HloInstruction* reduce) {
   if (IsReductionToVector(*reduce) &&
       // NVPTX backend can't do atomic cmpxchg any narrower than 32 bits
       32 <= primitive_util::BitWidth(reduce->shape().element_type())) {
+    TF_ASSIGN_OR_RETURN(std::unique_ptr<Thunk> initializer_thunk,
+                        BuildInitializerThunk(reduce));
     std::vector<std::unique_ptr<Thunk>> thunks;
-    thunks.emplace_back(BuildKernelThunk(reduce));
-    TF_RETURN_IF_ERROR(EmitInitializer(
-        reduce, static_cast<KernelThunk*>(thunks.back().get())));
-    bindings_.UnbindAllLocalIrValues();
-    thunks.emplace_back(BuildKernelThunk(reduce));
+    thunks.push_back(std::move(initializer_thunk));
+    thunks.push_back(BuildKernelThunk(reduce));
     thunk_sequence_->emplace_back(
         MakeUnique<SequentialThunk>(std::move(thunks), reduce));
+
     return EmitReductionToVector(
         reduce, input->shape(),
         [&](const llvm_ir::IrArray::Index& index) {
@@ -1706,16 +1677,13 @@ Status IrEmitterUnnested::HandleSelectAndScatter(
   CHECK_EQ(rank, ShapeUtil::Rank(source->shape()));
   CHECK_EQ(rank, window.dimensions_size());
 
-  {
-    std::vector<std::unique_ptr<Thunk>> thunks;
-    thunks.emplace_back(BuildKernelThunk(select_and_scatter));
-    TF_RETURN_IF_ERROR(EmitInitializer(
-        select_and_scatter, static_cast<KernelThunk*>(thunks.back().get())));
-    bindings_.UnbindAllLocalIrValues();
-    thunks.emplace_back(BuildKernelThunk(select_and_scatter));
-    thunk_sequence_->emplace_back(
-        MakeUnique<SequentialThunk>(std::move(thunks), select_and_scatter));
-  }
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<Thunk> initializer_thunk,
+                      BuildInitializerThunk(select_and_scatter));
+  std::vector<std::unique_ptr<Thunk>> thunks;
+  thunks.push_back(std::move(initializer_thunk));
+  thunks.push_back(BuildKernelThunk(select_and_scatter));
+  thunk_sequence_->emplace_back(
+      MakeUnique<SequentialThunk>(std::move(thunks), select_and_scatter));
 
   // TODO(b/31410564): Implement dilation rate for select-and-scatter.
   if (window_util::HasDilation(window)) {
@@ -1960,38 +1928,54 @@ GetHloBufferSlices(const HloInstruction* hlo,
       -> optional<std::pair<BufferAllocation::Slice, ShapeIndex>> {
     // Simple, common case: Is the buffer for instr known at runtime?  If so,
     // we're done.
-    auto slice = GetKnownAtRuntimeSlice(instr, index, buffer_assn);
-    if (slice.has_value()) {
-      return {{*slice, ShapeIndex()}};
+    auto slice = buffer_assn.GetUniqueSlice(instr, index);
+    if (slice.ok()) {
+      return {{slice.ValueOrDie(), ShapeIndex()}};
     }
 
-    // If we don't know the buffer for instr at index, see if we know the buffer
-    // for instr at index without its last element.  If so, we can dynamically
-    // find the buffer for instr by dereferencing a pointer in that buffer.
-    // Continue looking this way until we run out of elements in 'index'.
-    ShapeIndex new_index = index;
-    ShapeIndex gte_indices;
-    while (!new_index.empty()) {
-      gte_indices.push_front(new_index.back());
-      new_index.pop_back();
-      auto slice = GetKnownAtRuntimeSlice(instr, new_index, buffer_assn);
-      if (slice.has_value()) {
-        return {{*slice, gte_indices}};
+    // If that didn't work, walk up any bitcasts that we might see.  These must
+    // appear before any GTE instructions, because it's illegal to bitcast to a
+    // tuple type.
+    const HloInstruction* parent = instr;
+    while (parent->opcode() == HloOpcode::kBitcast) {
+      parent = parent->operand(0);
+
+      auto slice = buffer_assn.GetUniqueSlice(parent, {});
+      if (slice.ok()) {
+        return {{slice.ValueOrDie(), ShapeIndex()}};
       }
     }
 
-    // If *that* didn't work, check whether instr is a GTE instruction.  If it
-    // is, see if we can get a buffer for its parent, and continue walking up
-    // parents until we find a defined buffer or we hit something that's not a
-    // GTE.
-    const HloInstruction* parent = instr;
+    // Check whether instr is a GTE instruction.  If it is, see if we can get a
+    // buffer for its parent, and continue walking up parents until we find a
+    // defined buffer or we hit something that's not a GTE.
+    ShapeIndex gte_indices;
     while (parent->opcode() == HloOpcode::kGetTupleElement) {
       gte_indices.push_front(parent->tuple_index());
       parent = parent->operand(0);
 
-      auto slice = GetKnownAtRuntimeSlice(parent, {}, buffer_assn);
-      if (slice.has_value()) {
-        return {{*slice, gte_indices}};
+      auto slice = buffer_assn.GetUniqueSlice(parent, {});
+      if (slice.ok()) {
+        return {{slice.ValueOrDie(), gte_indices}};
+      }
+    }
+
+    // Finally, if we don't know the buffer for instr at index, see if we know
+    // the buffer for instr at index without its last element.  If so, we can
+    // dynamically find the buffer for instr by dereferencing a pointer in that
+    // buffer.  Continue looking this way until we run out of elements in
+    // 'index'.
+    //
+    // We can almost always get a buffer without resorting to this.  The only
+    // exception is for cases where the relevant sub-buffer is truly unknowable,
+    // for example the sub-buffer of a tuple-shaped select.
+    ShapeIndex new_index = index;
+    while (!new_index.empty()) {
+      gte_indices.push_front(new_index.back());
+      new_index.pop_back();
+      auto slice = buffer_assn.GetUniqueSlice(instr, new_index);
+      if (slice.ok()) {
+        return {{slice.ValueOrDie(), gte_indices}};
       }
     }
 
@@ -2036,7 +2020,7 @@ Status IrEmitterUnnested::HandleGather(HloInstruction* gather) {
   return Unimplemented("Gather is not implemented on GPUs.");
 }
 
-std::unique_ptr<Thunk> IrEmitterUnnested::BuildKernelThunk(
+std::unique_ptr<KernelThunk> IrEmitterUnnested::BuildKernelThunk(
     const HloInstruction* inst) {
   const BufferAssignment& buffer_assn =
       ir_emitter_context_->buffer_assignment();
@@ -2260,37 +2244,87 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildFftThunk(
                               /*output_shape=*/inst->shape(), inst);
 }
 
-Status IrEmitterUnnested::EmitInitializer(const HloInstruction* hlo,
-                                          KernelThunk* thunk) {
+StatusOr<std::unique_ptr<Thunk>> IrEmitterUnnested::BuildInitializerThunk(
+    const HloInstruction* hlo) {
   bool fused = HloOpcode::kFusion == hlo->opcode();
-
   const HloInstruction* inst = fused ? hlo->fused_expression_root() : hlo;
-  CHECK(inst->opcode() == HloOpcode::kSelectAndScatter ||
-        inst->opcode() == HloOpcode::kReduce);
-  const HloInstruction* init_value = nullptr;
-  switch (inst->opcode()) {
-    case HloOpcode::kSelectAndScatter:
-      init_value = inst->operand(2);
-      break;
-    case HloOpcode::kReduce:
-      init_value = inst->operand(1);
-      break;
-    default:
-      LOG(FATAL) << "Opcode " << inst->opcode()
-                 << " should not need an initializer.";
-  }
+  const HloInstruction* init_value = [&] {
+    switch (inst->opcode()) {
+      case HloOpcode::kSelectAndScatter:
+        return inst->operand(2);
+      case HloOpcode::kReduce:
+        return inst->operand(1);
+      default:
+        LOG(FATAL) << "Opcode " << inst->opcode()
+                   << " should not need an initializer.";
+    }
+  }();
 
   if (fused && init_value->opcode() == HloOpcode::kParameter) {
     init_value = hlo->operand(init_value->parameter_number());
   }
 
-  return EmitTargetElementLoopInThunk(
+  // In the common case, the initializer is a constant.  In this case, emit a
+  // device-memset call if we can.  Currently StreamExecutor only supports
+  // zeroing and 32-bit memsets.
+  if (init_value->IsConstant()) {
+    CHECK(ShapeUtil::IsScalar(init_value->shape()));
+    int64 num_bytes = ShapeUtil::ByteSizeOfElements(init_value->shape());
+    const auto& literal = init_value->literal();
+
+    // Are all the bytes of this scalar equal to 0?  If so, we can create a
+    // MemzeroThunk.
+    ArraySlice<uint8> literal_bytes(
+        reinterpret_cast<const uint8*>(literal.untyped_data()), num_bytes);
+    if (c_all_of(literal_bytes, [](uint8 byte) { return byte == 0; })) {
+      return {MakeUnique<MemzeroThunk>(GetAllocationSlice(*hlo), hlo)};
+    }
+
+    // If the literal is 8 or 16 bits wide, we can emit a 32-bit memset by
+    // repeating the literal 4 or 2 times, so long as the destination buffer is
+    // an even multiple of 32 bits long.
+    if ((num_bytes == 1 || num_bytes == 2) &&
+        ShapeUtil::ByteSizeOf(hlo->shape()) % 4 == 0) {
+      uint16 pattern16;
+      if (num_bytes == 1) {
+        uint8 b = literal_bytes.front();
+        pattern16 = uint16{b} | (uint16{b} << 8);
+      } else {
+        pattern16 = literal_bytes.front();
+      }
+      uint32 pattern32 = uint32{pattern16} | (uint32{pattern16} << 16);
+      return {MakeUnique<Memset32BitValueThunk>(pattern32,
+                                                GetAllocationSlice(*hlo), hlo)};
+    }
+
+    // If the literal is an even multiple of 32 bits wide, we can emit a 32-bit
+    // memset so long as all 32-bit words of the scalar are equal to each other.
+    if (num_bytes >= 4 && num_bytes % 4 == 0 &&
+        memcmp(literal_bytes.data(), literal_bytes.data() + 4,
+               literal_bytes.size() - 4) == 0) {
+      uint32 word;
+      memcpy(&word, literal_bytes.data(), sizeof(word));
+      return {MakeUnique<Memset32BitValueThunk>(word, GetAllocationSlice(*hlo),
+                                                hlo)};
+    }
+  }
+
+  // Otherwise fall back to our slow initializer code.
+  std::unique_ptr<KernelThunk> kernel_thunk = BuildKernelThunk(hlo);
+  TF_RETURN_IF_ERROR(EmitTargetElementLoopInThunk(
       *hlo,
       [=](const llvm_ir::IrArray::Index& index) {
         return GetIrArray(*init_value, *hlo)
             .EmitReadArrayElement(index, &ir_builder_);
       },
-      thunk);
+      kernel_thunk.get()));
+
+  // Clean up state left behind by emitting the loop above.  (This is normally
+  // done in IrEmitterUnnested::Postprocess().)
+  bindings_.UnbindAllLocalIrValues();
+
+  // Convert unique_ptr<KernelThunk> to StatusOr<unique_ptr<Thunk>>.
+  return {std::move(kernel_thunk)};
 }
 
 namespace {
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
index b83a2337e2decd9d4fba3d40fcf33f131fca8a3c..66c62e2d2de3ed1668271a21943dc73ed3d77651 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
@@ -148,13 +148,10 @@ class IrEmitterUnnested : public IrEmitter {
       tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce,
       HloComputation* reducer);
 
-  // Emits code to initialize buffer of `inst` in given `thunk`.
-  Status EmitInitializer(const HloInstruction* inst, KernelThunk* thunk);
-
   // Returns a KernelThunk that invokes the kernel emitted for `inst`. The
   // caller needs to make sure `inst` outlives the lifetime of the returned
   // Thunk object.
-  std::unique_ptr<Thunk> BuildKernelThunk(const HloInstruction* inst);
+  std::unique_ptr<KernelThunk> BuildKernelThunk(const HloInstruction* inst);
 
   // Returns a FftThunk that calls cuFFT to implement `inst`.
   std::unique_ptr<Thunk> BuildFftThunk(const HloInstruction* inst);
@@ -163,6 +160,11 @@ class IrEmitterUnnested : public IrEmitter {
   // to make sure `inst` outlives the lifetime of the returned Thunk object.
   std::unique_ptr<Thunk> BuildGemmThunk(const HloInstruction* inst);
 
+  // Returns a thunk that, given a reduce or select-and-scatter op, initializes
+  // its memory to the appropriate initial value.
+  StatusOr<std::unique_ptr<Thunk>> BuildInitializerThunk(
+      const HloInstruction* hlo);
+
   // Returns a thunk that calls host-to-device cuMemcpy to implement `inst`.
   std::unique_ptr<Thunk> BuildHostToDeviceCopyThunk(const HloInstruction* inst);
 
diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD
index f4c4dcdafd6cc0cd64da5a8d1f23c8c0e7b2a9cb..86c4ac18b0501c38aaaae5a007bddcf261ca338f 100644
--- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD
@@ -68,17 +68,3 @@ tf_cc_test(
         "@llvm//:support",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/xla/service/gpu/memset_thunk.cc b/tensorflow/compiler/xla/service/gpu/memset_thunk.cc
new file mode 100644
index 0000000000000000000000000000000000000000..18e673542c5b47cb90d31a8eff62a5e4adb78d1d
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/memset_thunk.cc
@@ -0,0 +1,39 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/memset_thunk.h"
+#include "tensorflow/stream_executor/stream_executor.h"
+
+namespace xla {
+namespace gpu {
+
+namespace se = ::perftools::gputools;
+
+Status MemzeroThunk::ExecuteOnStream(
+    const BufferAllocations& buffer_allocations, se::Stream* stream) {
+  se::DeviceMemoryBase dest_data = buffer_allocations.GetDeviceAddress(dest_);
+  stream->ThenMemZero(&dest_data, dest_data.size());
+  return Status::OK();
+}
+
+Status Memset32BitValueThunk::ExecuteOnStream(
+    const BufferAllocations& buffer_allocations, se::Stream* stream) {
+  se::DeviceMemoryBase dest_data = buffer_allocations.GetDeviceAddress(dest_);
+  stream->ThenMemset32(&dest_data, value_, dest_data.size());
+  return Status::OK();
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/memset_thunk.h b/tensorflow/compiler/xla/service/gpu/memset_thunk.h
new file mode 100644
index 0000000000000000000000000000000000000000..b4bb74d1dd6dc9d09c5e4d439d57dfe8b57c2ed9
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/memset_thunk.h
@@ -0,0 +1,65 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_MEMSET_THUNK_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_MEMSET_THUNK_H_
+
+#include "tensorflow/compiler/xla/service/buffer_assignment.h"
+#include "tensorflow/compiler/xla/service/gpu/thunk.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/status.h"
+#include "tensorflow/stream_executor/stream_executor.h"
+
+// This file contains thunks that set a buffer's elements to a particular value.
+// This can be faster than emitting a kernel to set the elements.
+
+namespace xla {
+namespace gpu {
+
+// Thunk that zeroes out a given chunk of memory.
+class MemzeroThunk : public Thunk {
+ public:
+  explicit MemzeroThunk(const BufferAllocation::Slice& dest,
+                        const HloInstruction* hlo)
+      : Thunk(Kind::kMemzero, hlo), dest_(dest) {}
+
+  Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
+                         perftools::gputools::Stream* stream) override;
+
+ private:
+  const BufferAllocation::Slice dest_;
+};
+
+// Thunk that sets a given chunk of memory to a particular 32-bit value.  The
+// destination chunk must have size divisible by 32 bits.
+class Memset32BitValueThunk : public Thunk {
+ public:
+  explicit Memset32BitValueThunk(uint32 value,
+                                 const BufferAllocation::Slice& dest,
+                                 const HloInstruction* hlo)
+      : Thunk(Kind::kMemset32BitValue, hlo), value_(value), dest_(dest) {}
+
+  Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
+                         perftools::gputools::Stream* stream) override;
+
+ private:
+  uint32 value_;
+  const BufferAllocation::Slice dest_;
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_MEMSET_THUNK_H_
diff --git a/tensorflow/compiler/xla/service/gpu/pad_insertion.cc b/tensorflow/compiler/xla/service/gpu/pad_insertion.cc
index fa405b9329a327a70161821212db4d3213e834b7..7bda4e2fcd469bd430e5ef1846251c8504225383 100644
--- a/tensorflow/compiler/xla/service/gpu/pad_insertion.cc
+++ b/tensorflow/compiler/xla/service/gpu/pad_insertion.cc
@@ -69,7 +69,7 @@ HloInstruction* MaybePaddedAndSlicedInput(
     HloInstruction* padding =
         computation->AddInstruction(HloInstruction::CreateConstant(
             MakeUnique<Literal>(Literal::Zero(element_type))));
-    input = CreatePadHlo(input, padding, padding_config).ValueOrDie();
+    input = MakePadHlo(input, padding, padding_config).ValueOrDie();
   }
 
   if (window_util::HasNegativePadding(conv_window)) {
@@ -92,8 +92,8 @@ HloInstruction* MaybePaddedAndSlicedInput(
           std::max<int64>(0LL, -conv_window.dimensions(i).padding_high());
     }
 
-    input = CreateSliceHlo(input, start_indices, limit_indices, strides)
-                .ValueOrDie();
+    input =
+        MakeSliceHlo(input, start_indices, limit_indices, strides).ValueOrDie();
   }
 
   return input;
@@ -126,7 +126,7 @@ HloInstruction* MaybePaddedKernel(const Window& conv_window,
   HloInstruction* padding =
       computation->AddInstruction(HloInstruction::CreateConstant(
           MakeUnique<Literal>(Literal::Zero(element_type))));
-  return CreatePadHlo(kernel, padding, padding_config).ValueOrDie();
+  return MakePadHlo(kernel, padding, padding_config).ValueOrDie();
 }
 }  // namespace
 
@@ -238,7 +238,7 @@ bool PadInsertion::CanonicalizeBackwardFilterConvolution(
       computation->AddInstruction(HloInstruction::CreateConstant(
           MakeUnique<Literal>(Literal::Zero(input->shape().element_type()))));
   HloInstruction* padded_input =
-      CreatePadHlo(input, padding, input_padding_config).ValueOrDie();
+      MakePadHlo(input, padding, input_padding_config).ValueOrDie();
 
   // The shape of the backward_conv CustomCall is a tuple (conv_result,
   // scratch_buffer).  Extract out the shape of conv_result.
diff --git a/tensorflow/compiler/xla/service/gpu/thunk.h b/tensorflow/compiler/xla/service/gpu/thunk.h
index 2c3032d79be221e8cacb178ffb1817459b603cc0..9eea958d1214b131d49cb4e28f1944860408d3a8 100644
--- a/tensorflow/compiler/xla/service/gpu/thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/thunk.h
@@ -51,6 +51,8 @@ class Thunk {
     kGemm,
     kInfeed,
     kKernel,
+    kMemset32BitValue,
+    kMemzero,
     kSequential,
     kTuple,
     kWhile,
diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto
index bf903d6a390fe2951d33942dfc2e124868c9fdb5..0b446c654779db410ebbd91ef9a5bab14d08a278 100644
--- a/tensorflow/compiler/xla/service/hlo.proto
+++ b/tensorflow/compiler/xla/service/hlo.proto
@@ -13,13 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// DO NOT USE THESE PROTO MESSAGES FOR ANYTHING OTHER THAN DEBUGGING.
-//
-// Don't use these protos in the real compilation or execution codepaths. The
-// data format is meant for debugging only, and may change without notice.
+// This proto file defines messages which represent the HLO module. This is a
+// full fidelity serialization of the c++ HLO constructs.
 //
 // Many of the protos below are simple 1-to-1 serializations of the
-// corresponding C++ classes.
+// corresponding C++ classes, e.g., HloModule, HloComputation, and
+// HloInstruction.
 //
 // FIELD NAMES ARE IMPORTANT
 //
@@ -38,16 +37,19 @@ option cc_enable_arenas = true;
 message HloInstructionProto {
   reserved 10;
   reserved "parameter_name";
+  reserved 12;
+  reserved "fused_instructions_computation";
+  reserved 4;
+  reserved "operand_names";
+  reserved 5;
+  reserved "control_predecessor_names";
+  reserved 6;
+  reserved "called_computation_names";
 
   string name = 1;
   string opcode = 2;
   xla.Shape shape = 3;
 
-  // TODO(b/67782397): Replace instruction names with HloInstruction ids.
-  repeated string operand_names = 4;
-  repeated string control_predecessor_names = 5;
-  repeated string called_computation_names = 6;
-
   xla.OpMetadata metadata = 7;
 
   // Literal, only present for kConstant.
@@ -58,7 +60,6 @@ message HloInstructionProto {
 
   // Fusion state, only present for kFusion.
   string fusion_kind = 11;
-  HloComputationProto fused_instructions_computation = 12;
 
   // Index for kGetTupleElement.
   int64 tuple_index = 13;
@@ -136,30 +137,40 @@ message HloInstructionProto {
 
   // The id of this instruction.
   int64 id = 35;
+
+  repeated int64 operand_ids = 36;
+  repeated int64 control_predecessor_ids = 37;
+  repeated int64 called_computation_ids = 38;
+
+  xla.OpSharding sharding = 40;
 }
 
 // Serialization of HloComputation.
 message HloComputationProto {
+  reserved 3;
+  reserved "root_name";
+
   string name = 1;
 
   // The array of instructions is always in a valid dependency order, where
   // operands appear before their users.
   repeated HloInstructionProto instructions = 2;
 
-  // The name of the root of the computation.
-  string root_name = 3;
-
   // The program shape (with layout) of this computation.
   xla.ProgramShape program_shape = 4;
 
   // The id of this computation.
   int64 id = 5;
+
+  // The id of the root of the computation.
+  int64 root_id = 6;
 }
 
 // Serialization of HloModule.
 message HloModuleProto {
   string name = 1;
   string entry_computation_name = 2;
+  int64 entry_computation_id = 6;
 
   // The array of computations is always in a valid dependency order, where
   // callees appear before their callers.
diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis.cc b/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
index 30e32a46d7dd0923f738939c33407ac7484b5bbe..a88283ed9a6459b4fa9310e160b59c77d51f1027 100644
--- a/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
@@ -171,24 +171,21 @@ class BufferValueMap {
     return value_to_buffer_number_.at(&value);
   }
 
-  // Compute and return a vector of buffers that the given value must be
-  // contained in due to HLO aliasing rules.
-  std::vector<BufferNumber> ComputeAliasedBuffers(const HloValue& value) {
+  void ComputeWhileAliasedBuffers(const HloValue& value,
+                                  std::vector<BufferNumber>* aliased_buffers) {
+    VLOG(3) << "Compute kWhile aliases";
     // Value is init of a while (use is while).
-    std::vector<BufferNumber> aliased_buffers;
     for (const HloUse& use : value.uses()) {
-      VLOG(2) << "use of value " << value.ToShortString() << ": " << use;
       if (use.instruction->opcode() == HloOpcode::kWhile) {
         // Determine the while value that this shares a buffer with.
         const HloValue& while_value =
             dataflow_.GetUniqueValueAt(use.instruction, use.operand_index);
-        aliased_buffers.push_back(GetBufferForValue(while_value));
+        aliased_buffers->push_back(GetBufferForValue(while_value));
         VLOG(3) << "  value is init value to a while; must share buffer with "
                    "while value "
                 << while_value.ToShortString();
       }
     }
-
     // Value is a parameter of a while body/condition.
     if (value.defining_instruction()->opcode() == HloOpcode::kParameter) {
       const HloComputation* computation =
@@ -205,11 +202,10 @@ class BufferValueMap {
           VLOG(3) << "  value is parameter value of the body or condition of a "
                      "while; must share buffer with while value "
                   << while_value.ToShortString();
-          aliased_buffers.push_back(GetBufferForValue(while_value));
+          aliased_buffers->push_back(GetBufferForValue(while_value));
         }
       }
     }
-
     // Value is the root of a while body.
     for (const HloPosition& position : value.positions()) {
       const HloComputation* computation = position.instruction->parent();
@@ -224,27 +220,71 @@ class BufferValueMap {
 
             const HloValue& while_value = dataflow_.GetUniqueValueAt(
                 callsite.instruction(), position.index);
-            VLOG(3) << "  value is root the body computation of a while; must "
-                       "share buffer with while value "
+            VLOG(3) << "  value @ " << position << " is root of "
+                    << callsite.instruction()->name()
+                    << "; body root and while value root must share buffer "
+                       "among them : "
                     << while_value.ToShortString();
-            aliased_buffers.push_back(GetBufferForValue(while_value));
+            aliased_buffers->push_back(GetBufferForValue(while_value));
           }
         }
       }
     }
-
     // Value is the output of the while instruction itself.
     if (value.defining_instruction()->opcode() == HloOpcode::kWhile) {
       VLOG(3) << "  value is output of a while instruction";
-      aliased_buffers.push_back(GetBufferForValue(value));
+      aliased_buffers->push_back(GetBufferForValue(value));
+    }
+  }
+
+  void ComputeConditionalAliasedBuffers(
+      const HloValue& value, std::vector<BufferNumber>* aliased_buffers) {
+    VLOG(3) << "Compute kConditional aliases";
+    // Aliases the buffers of the true/false computations roots, with the one of
+    // the conditional.
+    for (const HloPosition& position : value.positions()) {
+      const HloComputation* computation = position.instruction->parent();
+      const CallGraphNode& call_graph_node =
+          dataflow_.call_graph().GetNode(computation);
+      if (position.instruction == computation->root_instruction()) {
+        for (const CallSite& callsite : call_graph_node.caller_callsites()) {
+          if (callsite.instruction()->opcode() == HloOpcode::kConditional) {
+            // Call graph must have been flattened.
+            CHECK_EQ(call_graph_node.caller_callsites().size(), 1);
+
+            const HloValue& cond_value = dataflow_.GetUniqueValueAt(
+                callsite.instruction(), position.index);
+            VLOG(3)
+                << "  value @ " << position << " is root of "
+                << callsite.instruction()->name()
+                << "; true/false branch roots must share buffer among them : "
+                << cond_value.ToShortString();
+            aliased_buffers->push_back(GetBufferForValue(cond_value));
+          }
+        }
+      }
+    }
+    // Value is the output of the conditional instruction itself.
+    if (value.defining_instruction()->opcode() == HloOpcode::kConditional) {
+      VLOG(3) << "  value is output of a conditional instruction";
+      aliased_buffers->push_back(GetBufferForValue(value));
     }
+  }
 
+  // Compute and return a vector of buffers that the given value must be
+  // contained in due to HLO aliasing rules.
+  std::vector<BufferNumber> ComputeAliasedBuffers(const HloValue& value) {
+    for (const HloUse& use : value.uses()) {
+      VLOG(2) << "Use of value " << value.ToShortString() << ": " << use;
+    }
+    std::vector<BufferNumber> aliased_buffers;
+    ComputeWhileAliasedBuffers(value, &aliased_buffers);
+    ComputeConditionalAliasedBuffers(value, &aliased_buffers);
     // Uniquify aliased buffers.
     std::sort(aliased_buffers.begin(), aliased_buffers.end());
     aliased_buffers.erase(
         std::unique(aliased_buffers.begin(), aliased_buffers.end()),
         aliased_buffers.end());
-
     return aliased_buffers;
   }
 
diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc
index f99c7cf5e495eaf83e0dda859ef31a7487bc6ffe..594413e88fb26e86b198d08b2e4db77fad671348 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation.cc
@@ -65,6 +65,7 @@ HloComputation::HloComputation(
     std::vector<std::unique_ptr<HloInstruction>>* instructions,
     HloInstruction* root_instruction, HloInstruction* fusion_instruction)
     : name_(name),
+      unique_id_(-1),
       root_instruction_(root_instruction),
       fusion_instruction_(fusion_instruction) {
   param_instructions_.resize(parameter_count, nullptr);
@@ -101,7 +102,7 @@ HloInstruction* HloComputation::AddInstructionInternal(
     instruction->UniquifyName(&parent()->instruction_name_uniquer());
     instruction->SetUniqueId(parent()->NewUniqueInstructionId());
   }
-  Reparent(instruction.get());
+  instruction->set_parent(this);
   HloInstruction* pinst = instruction.get();
   instruction_iterators_[pinst] =
       instructions_.insert(instructions_.end(), std::move(instruction));
@@ -158,10 +159,6 @@ Status HloComputation::RemoveParameter(int64 param_no) {
   return Status::OK();
 }
 
-void HloComputation::Reparent(HloInstruction* instruction) {
-  instruction->set_parent(this);
-}
-
 bool HloComputation::IsRemovable(const HloInstruction* instruction) {
   // If the instruction has control predecessors or successors then we cannot
   // remove the instruction without violating ordering constraints (added, for
@@ -307,19 +304,15 @@ void ComputeComputationPostOrder(
     HloComputation* computation,
     tensorflow::gtl::FlatSet<HloComputation*>* visited,
     std::list<HloComputation*>* post_order) {
-  if (visited->count(computation) > 0) {
-    return;
-  }
-
-  for (auto* instruction : computation->instructions()) {
-    for (HloComputation* called_computation :
-         instruction->called_computations()) {
-      ComputeComputationPostOrder(called_computation, visited, post_order);
+  if (visited->insert(computation).second) {
+    for (auto* instruction : computation->instructions()) {
+      for (HloComputation* called_computation :
+           instruction->called_computations()) {
+        ComputeComputationPostOrder(called_computation, visited, post_order);
+      }
     }
+    post_order->push_back(computation);
   }
-
-  visited->insert(computation);
-  post_order->push_back(computation);
 }
 
 }  // namespace
@@ -393,12 +386,16 @@ string HloComputation::ToString(const HloPrintOptions& options) const {
 
 HloComputationProto HloComputation::ToProto() const {
   HloComputationProto proto;
+  CHECK(unique_id_ != -1)
+      << "This computation does not have a valid id. Please make sure the "
+         "computation is inside a module before dumping it.";
+  proto.set_id(unique_id_);
   proto.set_name(name_);
   for (const HloInstruction* instruction : MakeInstructionPostOrder()) {
     HloInstructionProto instruction_proto = instruction->ToProto();
     proto.add_instructions()->Swap(&instruction_proto);
   }
-  proto.set_root_name(root_instruction()->name());
+  proto.set_root_id(root_instruction()->unique_id());
   *proto.mutable_program_shape() = ComputeProgramShape();
   return proto;
 }
@@ -406,31 +403,29 @@ HloComputationProto HloComputation::ToProto() const {
 /* static */ StatusOr<std::unique_ptr<HloComputation>>
 HloComputation::CreateFromProto(
     HloModule* module, const HloComputationProto& proto,
-    const tensorflow::gtl::FlatMap<string, HloComputation*>& computation_map,
-    const std::function<void(std::unique_ptr<HloComputation>)>&
-        add_fused_computation,
-    HloInstruction* fusion_instruction) {
+    const tensorflow::gtl::FlatMap<int64, HloComputation*>& computation_map) {
   std::vector<std::unique_ptr<HloInstruction>> instructions;
-  tensorflow::gtl::FlatMap<string, HloInstruction*> instruction_map;
+  tensorflow::gtl::FlatMap<int64, HloInstruction*> instruction_map;
   int64 parameter_count = 0;
   for (const HloInstructionProto& instruction_proto : proto.instructions()) {
-    TF_ASSIGN_OR_RETURN(std::unique_ptr<HloInstruction> instruction,
-                        HloInstruction::CreateFromProto(
-                            module, instruction_proto, instruction_map,
-                            computation_map, add_fused_computation));
+    TF_ASSIGN_OR_RETURN(
+        std::unique_ptr<HloInstruction> instruction,
+        HloInstruction::CreateFromProto(module, instruction_proto,
+                                        instruction_map, computation_map));
     if (instruction->opcode() == HloOpcode::kParameter) {
       parameter_count++;
     }
-    TF_RET_CHECK(!ContainsKey(instruction_map, instruction->name()));
-    instruction_map[instruction->name()] = instruction.get();
+    TF_RET_CHECK(!ContainsKey(instruction_map, instruction_proto.id()));
+    instruction_map[instruction_proto.id()] = instruction.get();
     instructions.push_back(std::move(instruction));
   }
 
-  TF_RET_CHECK(!proto.root_name().empty());
-  TF_RET_CHECK(ContainsKey(instruction_map, proto.root_name()));
-  HloInstruction* root = instruction_map.at(proto.root_name());
-  return WrapUnique(new HloComputation(
-      proto.name(), parameter_count, &instructions, root, fusion_instruction));
+  TF_RET_CHECK(proto.root_id() != -1);
+  TF_RET_CHECK(ContainsKey(instruction_map, proto.root_id()));
+  HloInstruction* root = instruction_map.at(proto.root_id());
+  return WrapUnique(new HloComputation(proto.name(), parameter_count,
+                                       &instructions, root,
+                                       /*fusion_instruction=*/nullptr));
 }
 
 void HloComputation::FuseInstructionsInto(
diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h
index dd9d346999f0eae448d74628278c802ccd3f51b4..9d3f6e9a2c2efd97681a22b6b0f6d929afc553de 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.h
+++ b/tensorflow/compiler/xla/service/hlo_computation.h
@@ -160,20 +160,12 @@ class HloComputation {
   //   module: the module which will contain the computation. The newly created
   //     computation is *not* added to the module, however.
   //   proto: the proto to convert from.
-  //   computation_map: a map from computation name to HloComputation*. This map
+  //   computation_map: a map from computation id to HloComputation*. This map
   //     must contain all computations which the newly constructed computation
   //     calls.
-  //   add_fused_computation: A function to call to add a fused
-  //     computation. Used only when the instruction is a fusion instruction.
-  //   fusion_instruction: if non-null then the newly created computation will
-  //     be constructed as a fused computation with this instruction as its
-  //     fusion parent.
   static StatusOr<std::unique_ptr<HloComputation>> CreateFromProto(
       HloModule* module, const HloComputationProto& proto,
-      const tensorflow::gtl::FlatMap<string, HloComputation*>& computation_map,
-      const std::function<void(std::unique_ptr<HloComputation>)>&
-          add_fused_computation,
-      HloInstruction* fusion_instruction = nullptr);
+      const tensorflow::gtl::FlatMap<int64, HloComputation*>& computation_map);
 
   // Gets the instructions in this computation.
   //
@@ -342,6 +334,15 @@ class HloComputation {
     fusion_instruction_ = fusion_instruction;
   }
 
+  // The id of this computation should be unique within the module.
+  void SetUniqueId(int64 id) {
+    CHECK_EQ(unique_id_, -1);
+    CHECK_GE(id, 0);
+    unique_id_ = id;
+  }
+
+  int64 unique_id() const { return unique_id_; }
+
  private:
   explicit HloComputation(
       const string& name, int parameter_count,
@@ -352,10 +353,6 @@ class HloComputation {
   HloInstruction* AddInstructionInternal(
       std::unique_ptr<HloInstruction> instruction);
 
-  // Helper for setting the parent of instructions that are added to this
-  // computation.
-  void Reparent(HloInstruction* instruction);
-
   // Fuses HLOs in instructions_to_fuse into fusion_instruction.
   //
   // Pre-condition: fusion_instruction's opcode is kFusion.
@@ -373,6 +370,7 @@ class HloComputation {
   std::vector<HloInstruction*> CollectUnreachableRoots() const;
 
   string name_;
+  int64 unique_id_;
   HloInstruction* root_instruction_;
 
   // If this computation is a fusion computation, this field points to the
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
index 4ec2ef27bf59b0c877ec38e55ef5c12debeec227..44e4f75f75b275653e1a07111943843fc6f78b33 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/window_util.h"
 #include "tensorflow/core/lib/core/bits.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
@@ -379,20 +380,101 @@ Status HloCostAnalysis::HandleTranspose(const HloInstruction*) {
 }
 
 Status HloCostAnalysis::HandleConvolution(const HloInstruction* convolution) {
-  auto rhs_instruction = convolution->operand(1);
+  auto lhs = convolution->operand(0);
+  auto rhs = convolution->operand(1);
+  Window window = convolution->window();
+  const auto& result_shape = convolution->shape();
+  const Shape& lhs_shape = lhs->shape();
+  const Shape& rhs_shape = rhs->shape();
+
   const auto& dnums = convolution->convolution_dimension_numbers();
-  const int64 output_features =
-      convolution->shape().dimensions(dnums.output_feature_dimension());
-
-  // For each output element, we do one fma per element in the kernel at some
-  // given output feature index.
-  const int64 fmas_per_output_element =
-      output_features > 0
-          ? ShapeUtil::ElementsIn(rhs_instruction->shape()) / output_features
-          : 0;
-  const int64 output_elements = ShapeUtil::ElementsIn(convolution->shape());
-  current_properties_[kFlopsKey] =
-      output_elements * fmas_per_output_element * kFmaFlops;
+
+  const int64 input_batch_dim = dnums.input_batch_dimension();
+  const int64 input_feature_dim = dnums.input_feature_dimension();
+  const int64 output_feature_dim = dnums.output_feature_dimension();
+  const int64 input_feature =
+      ShapeUtil::GetDimension(lhs_shape, input_feature_dim);
+  const int64 output_feature =
+      ShapeUtil::GetDimension(result_shape, output_feature_dim);
+  const int64 batch = ShapeUtil::GetDimension(lhs_shape, input_batch_dim);
+
+  DimensionVector kernel_limits;
+  DimensionVector output_limits;
+  DimensionVector input_limits;
+  if (window.dimensions().empty()) {
+    window = window_util::MakeWindow({1});
+    kernel_limits.push_back(1);
+    output_limits.push_back(1);
+    input_limits.push_back(1);
+  } else {
+    for (int64 spatial_dimension = 0;
+         spatial_dimension < window.dimensions_size(); ++spatial_dimension) {
+      // Spatial dimension number for kernel (rhs).
+      const int64 kernel_spatial_dim =
+          dnums.kernel_spatial_dimensions(spatial_dimension);
+      const int64 kernel_limit = rhs_shape.dimensions(kernel_spatial_dim);
+      kernel_limits.push_back(kernel_limit);
+
+      // Spatial dimension number for output.
+      const int64 output_spatial_dim =
+          dnums.output_spatial_dimensions(spatial_dimension);
+      const int64 output_limit = result_shape.dimensions(output_spatial_dim);
+      output_limits.push_back(output_limit);
+
+      // Spatial dimension number for input (lhs).
+      const int64 input_spatial_dim =
+          dnums.input_spatial_dimensions(spatial_dimension);
+      const int64 input_limit = lhs_shape.dimensions(input_spatial_dim);
+      input_limits.push_back(input_limit);
+    }
+  }
+
+  DimensionVector valid_position_counts;
+
+  // Loop over each spatial dimension.
+  for (int64 spatial_dimension = 0;
+       spatial_dimension < window.dimensions_size(); ++spatial_dimension) {
+    int64 valid_position_count = 0;
+    // Loop over each point in the kernel.
+    for (int64 kernel_idx = 0; kernel_idx < kernel_limits[spatial_dimension];
+         ++kernel_idx) {
+      // Loop over each point in the output.
+      for (int64 output_idx = 0; output_idx < output_limits[spatial_dimension];
+           ++output_idx) {
+        // Calculate lhs (input) index without taking base dilation into
+        // account.
+        const auto& window_dim = window.dimensions(spatial_dimension);
+        const int64 undilated_index = output_idx * window_dim.stride() -
+                                      window_dim.padding_low() +
+                                      kernel_idx * window_dim.window_dilation();
+
+        // Calculate the actual lhs (input) index after dilation. Avoid the
+        // division as an optimization.
+        const int64 lhs_spatial_index =
+            window_dim.base_dilation() > 1
+                ? undilated_index / window_dim.base_dilation()
+                : undilated_index;
+
+        // Skip if the lhs (input) index is to be dilated.
+        if (undilated_index != lhs_spatial_index * window_dim.base_dilation()) {
+          continue;
+        }
+
+        // Skip if input index is not in bound.
+        if (lhs_spatial_index < 0 ||
+            lhs_spatial_index >= input_limits[spatial_dimension]) {
+          continue;
+        }
+
+        valid_position_count += 1;
+      }
+    }
+    valid_position_counts.push_back(valid_position_count);
+  }
+
+  const int64 fma_count =
+      input_feature * output_feature * batch * Product(valid_position_counts);
+  current_properties_[kFlopsKey] = fma_count * kFmaFlops;
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
index 3b289c240a45e8f3df8156ed89e879da2132d01a..3d055b327ee920dac9c0904c69e1461206b31203 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
@@ -186,12 +186,14 @@ TEST_F(HloCostAnalysisTest, Map) {
 TEST_F(HloCostAnalysisTest, Convolution) {
   ComputationBuilder builder(client_, "convolution");
   auto input = builder.Parameter(
-      0, ShapeUtil::MakeShape(F32, {/*p_dim=*/1, /*z_dim=*/1, /*y_dim=*/10,
-                                    /*x_dim=*/20}),
+      0,
+      ShapeUtil::MakeShape(F32, {/*p_dim=*/1, /*z_dim=*/1, /*y_dim=*/10,
+                                 /*x_dim=*/20}),
       "input");
   auto kernel = builder.Parameter(
-      1, ShapeUtil::MakeShape(F32, {/*p_dim=*/1, /*z_dim=*/1, /*y_dim=*/3,
-                                    /*x_dim=*/3}),
+      1,
+      ShapeUtil::MakeShape(F32, {/*p_dim=*/1, /*z_dim=*/1, /*y_dim=*/3,
+                                 /*x_dim=*/3}),
       "kernel");
   auto result = builder.Conv(input, kernel, {1, 1}, Padding::kValid);
 
@@ -440,5 +442,32 @@ TEST_F(HloCostAnalysisTest, TupleCost) {
   EXPECT_EQ(analysis.bytes_accessed(), kPointerSize * 2);
 }
 
+TEST_F(HloCostAnalysisTest, BaseDilatedConvolution) {
+  ComputationBuilder builder(client_, "BaseDilatedConvolution");
+  auto input = builder.Parameter(
+      0,
+      ShapeUtil::MakeShape(F32, {/*p_dim=*/1, /*z_dim=*/1, /*y_dim=*/10,
+                                 /*x_dim=*/20}),
+      "input");
+  auto kernel = builder.Parameter(
+      1,
+      ShapeUtil::MakeShape(F32, {/*p_dim=*/1, /*z_dim=*/1, /*y_dim=*/3,
+                                 /*x_dim=*/3}),
+      "kernel");
+
+  auto result = builder.ConvGeneralDilated(
+      input, kernel, /*window_strides=*/{1, 1}, /*padding=*/{{1, 1}, {1, 1}},
+      /*lhs_dilation=*/{3, 5}, /*rhs_dilation=*/{7, 11},
+      ComputationBuilder::CreateDefaultConvDimensionNumbers(2));
+
+  // Run HLO cost analysis.
+  auto hlo_module = BuildHloGraph(&builder);
+  HloCostAnalysis analysis(ShapeSize);
+  ASSERT_IS_OK(
+      hlo_module->entry_computation()->root_instruction()->Accept(&analysis));
+
+  EXPECT_EQ(analysis.flop_count(), 1472);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils.cc b/tensorflow/compiler/xla/service/hlo_creation_utils.cc
index 4585bffa42fd001c7375b778c5e1c42e58d17692..b186767ce792cd89ae77fe9a03b3a2ecf296b804 100644
--- a/tensorflow/compiler/xla/service/hlo_creation_utils.cc
+++ b/tensorflow/compiler/xla/service/hlo_creation_utils.cc
@@ -23,8 +23,8 @@ namespace xla {
 using tensorflow::gtl::ArraySlice;
 using tensorflow::strings::StrCat;
 
-StatusOr<HloInstruction*> CreateBinaryHlo(HloOpcode opcode, HloInstruction* lhs,
-                                          HloInstruction* rhs) {
+StatusOr<HloInstruction*> MakeBinaryHlo(HloOpcode opcode, HloInstruction* lhs,
+                                        HloInstruction* rhs) {
   HloComputation* computation = lhs->parent();
   CHECK_EQ(computation, rhs->parent());
   TF_ASSIGN_OR_RETURN(Shape binary_op_shape,
@@ -33,9 +33,9 @@ StatusOr<HloInstruction*> CreateBinaryHlo(HloOpcode opcode, HloInstruction* lhs,
       HloInstruction::CreateBinary(binary_op_shape, opcode, lhs, rhs));
 }
 
-StatusOr<HloInstruction*> CreatePadHlo(HloInstruction* operand,
-                                       HloInstruction* padding_value,
-                                       const PaddingConfig& padding_config) {
+StatusOr<HloInstruction*> MakePadHlo(HloInstruction* operand,
+                                     HloInstruction* padding_value,
+                                     const PaddingConfig& padding_config) {
   HloComputation* computation = operand->parent();
   CHECK_EQ(computation, padding_value->parent());
   TF_ASSIGN_OR_RETURN(
@@ -46,10 +46,10 @@ StatusOr<HloInstruction*> CreatePadHlo(HloInstruction* operand,
       pad_shape, operand, padding_value, padding_config));
 }
 
-StatusOr<HloInstruction*> CreateSliceHlo(HloInstruction* operand,
-                                         ArraySlice<int64> start_indices,
-                                         ArraySlice<int64> limit_indices,
-                                         ArraySlice<int64> strides) {
+StatusOr<HloInstruction*> MakeSliceHlo(HloInstruction* operand,
+                                       ArraySlice<int64> start_indices,
+                                       ArraySlice<int64> limit_indices,
+                                       ArraySlice<int64> strides) {
   HloComputation* computation = operand->parent();
   TF_ASSIGN_OR_RETURN(Shape slice_shape, ShapeInference::InferSliceShape(
                                              operand->shape(), start_indices,
@@ -58,7 +58,7 @@ StatusOr<HloInstruction*> CreateSliceHlo(HloInstruction* operand,
       slice_shape, operand, start_indices, limit_indices, strides));
 }
 
-StatusOr<HloInstruction*> CreateConvolveHlo(
+StatusOr<HloInstruction*> MakeConvolveHlo(
     HloInstruction* lhs, HloInstruction* rhs, const Window& window,
     const ConvolutionDimensionNumbers& dimension_numbers) {
   HloComputation* computation = lhs->parent();
@@ -70,8 +70,8 @@ StatusOr<HloInstruction*> CreateConvolveHlo(
       convolve_shape, lhs, rhs, window, dimension_numbers));
 }
 
-StatusOr<HloInstruction*> CreateTransposeHlo(HloInstruction* operand,
-                                             ArraySlice<int64> dimensions) {
+StatusOr<HloInstruction*> MakeTransposeHlo(HloInstruction* operand,
+                                           ArraySlice<int64> dimensions) {
   HloComputation* computation = operand->parent();
   TF_ASSIGN_OR_RETURN(
       Shape transpose_shape,
@@ -80,23 +80,23 @@ StatusOr<HloInstruction*> CreateTransposeHlo(HloInstruction* operand,
       HloInstruction::CreateTranspose(transpose_shape, operand, dimensions));
 }
 
-StatusOr<HloInstruction*> CreateReshapeHlo(const Shape& result_shape,
-                                           HloInstruction* operand) {
+StatusOr<HloInstruction*> MakeReshapeHlo(const Shape& result_shape,
+                                         HloInstruction* operand) {
   HloComputation* computation = operand->parent();
   return computation->AddInstruction(
       HloInstruction::CreateReshape(result_shape, operand));
 }
 
-StatusOr<HloInstruction*> CreateReshapeHlo(
+StatusOr<HloInstruction*> MakeReshapeHlo(
     ArraySlice<int64> result_shape_dim_bounds, HloInstruction* operand) {
   Shape new_shape = ShapeUtil::MakeShape(operand->shape().element_type(),
                                          result_shape_dim_bounds);
-  return CreateReshapeHlo(new_shape, operand);
+  return MakeReshapeHlo(new_shape, operand);
 }
 
-StatusOr<HloInstruction*> CreateDynamicSliceHlo(HloInstruction* operand,
-                                                HloInstruction* start_indices,
-                                                ArraySlice<int64> slice_sizes) {
+StatusOr<HloInstruction*> MakeDynamicSliceHlo(HloInstruction* operand,
+                                              HloInstruction* start_indices,
+                                              ArraySlice<int64> slice_sizes) {
   HloComputation* computation = operand->parent();
   CHECK_EQ(computation, start_indices->parent());
   TF_ASSIGN_OR_RETURN(
@@ -107,7 +107,7 @@ StatusOr<HloInstruction*> CreateDynamicSliceHlo(HloInstruction* operand,
       dynamic_slice_shape, operand, start_indices, slice_sizes));
 }
 
-StatusOr<HloInstruction*> CreateDynamicUpdateSliceHlo(
+StatusOr<HloInstruction*> MakeDynamicUpdateSliceHlo(
     HloInstruction* operand, HloInstruction* update,
     HloInstruction* start_indices) {
   HloComputation* computation = operand->parent();
@@ -121,7 +121,7 @@ StatusOr<HloInstruction*> CreateDynamicUpdateSliceHlo(
       dynamic_update_slice_shape, operand, update, start_indices));
 }
 
-StatusOr<HloInstruction*> CreateBroadcastHlo(
+StatusOr<HloInstruction*> MakeBroadcastHlo(
     HloInstruction* operand, ArraySlice<int64> broadcast_dimensions,
     ArraySlice<int64> result_shape_bounds) {
   HloComputation* computation = operand->parent();
@@ -132,8 +132,8 @@ StatusOr<HloInstruction*> CreateBroadcastHlo(
       broadcast_shape, operand, broadcast_dimensions));
 }
 
-StatusOr<HloInstruction*> CreateGetTupleElementHlo(HloInstruction* operand,
-                                                   int64 index) {
+StatusOr<HloInstruction*> MakeGetTupleElementHlo(HloInstruction* operand,
+                                                 int64 index) {
   HloComputation* computation = operand->parent();
 
   TF_ASSIGN_OR_RETURN(
@@ -143,8 +143,8 @@ StatusOr<HloInstruction*> CreateGetTupleElementHlo(HloInstruction* operand,
       HloInstruction::CreateGetTupleElement(gte_shape, operand, index));
 }
 
-StatusOr<HloInstruction*> CreateConcatHlo(ArraySlice<HloInstruction*> operands,
-                                          int64 dimension) {
+StatusOr<HloInstruction*> MakeConcatHlo(ArraySlice<HloInstruction*> operands,
+                                        int64 dimension) {
   CHECK_GT(operands.size(), 0);
 
   HloComputation* computation = operands[0]->parent();
@@ -181,7 +181,7 @@ StatusOr<HloInstruction*> CollapseFirstNDims(HloInstruction* operand, int64 n) {
   Shape output_shape =
       ShapeUtil::MakeShape(operand_shape.element_type(), new_shape_dims);
 
-  return CreateReshapeHlo(output_shape, operand);
+  return MakeReshapeHlo(output_shape, operand);
 }
 
 StatusOr<HloInstruction*> ExpandFirstDimIntoNDims(
@@ -198,25 +198,7 @@ StatusOr<HloInstruction*> ExpandFirstDimIntoNDims(
             std::back_inserter(expanded_shape_dim_bounds));
   Shape new_shape = ShapeUtil::MakeShape(operand->shape().element_type(),
                                          expanded_shape_dim_bounds);
-  return CreateReshapeHlo(new_shape, operand);
-}
-
-StatusOr<HloInstruction*> ExpandLastDimIntoNDims(
-    HloInstruction* operand, ArraySlice<int64> expanded_dims) {
-  CHECK_GT(operand->shape().dimensions_size(), 0);
-  CHECK_EQ(operand->shape().dimensions(operand->shape().dimensions_size() - 1),
-           Product(expanded_dims));
-
-  std::vector<int64> expanded_shape_dim_bounds;
-  expanded_shape_dim_bounds.reserve(expanded_dims.size() +
-                                    operand->shape().dimensions_size() - 1);
-  std::copy(operand->shape().dimensions().begin(),
-            operand->shape().dimensions().end() - 1,
-            std::back_inserter(expanded_shape_dim_bounds));
-  c_copy(expanded_dims, std::back_inserter(expanded_shape_dim_bounds));
-  Shape new_shape = ShapeUtil::MakeShape(operand->shape().element_type(),
-                                         expanded_shape_dim_bounds);
-  return CreateReshapeHlo(new_shape, operand);
+  return MakeReshapeHlo(new_shape, operand);
 }
 
 StatusOr<HloInstruction*> ElideDegenerateDims(HloInstruction* operand,
@@ -241,7 +223,7 @@ StatusOr<HloInstruction*> ElideDegenerateDims(HloInstruction* operand,
   c_reverse(new_shape_dim_bounds);
   Shape output_shape =
       ShapeUtil::MakeShape(input_shape.element_type(), new_shape_dim_bounds);
-  return CreateReshapeHlo(output_shape, operand);
+  return MakeReshapeHlo(output_shape, operand);
 }
 
 StatusOr<HloInstruction*> PadVectorWithZeros(HloInstruction* operand,
@@ -258,7 +240,7 @@ StatusOr<HloInstruction*> PadVectorWithZeros(HloInstruction* operand,
   HloInstruction* zero =
       computation->AddInstruction(HloInstruction::CreateConstant(
           MakeUnique<Literal>(Literal::Zero(operand->shape().element_type()))));
-  return CreatePadHlo(operand, zero, padding_config);
+  return MakePadHlo(operand, zero, padding_config);
 }
 
 StatusOr<HloInstruction*> BroadcastZeros(
@@ -267,8 +249,8 @@ StatusOr<HloInstruction*> BroadcastZeros(
   HloInstruction* zero =
       computation->AddInstruction(HloInstruction::CreateConstant(
           MakeUnique<Literal>(Literal::Zero(element_type))));
-  return CreateBroadcastHlo(zero, /*broadcast_dimensions=*/{},
-                            /*result_shape_bounds=*/broadcast_dimensions);
+  return MakeBroadcastHlo(zero, /*broadcast_dimensions=*/{},
+                          /*result_shape_bounds=*/broadcast_dimensions);
 }
 
 StatusOr<std::unique_ptr<HloComputation>> CreateComputationWithSignature(
diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils.h b/tensorflow/compiler/xla/service/hlo_creation_utils.h
index 2b03a849cff35008a96eaedd212ab1aa24695822..d99e32a737e6aaa2ff746cf6c00d4300cf62f4e1 100644
--- a/tensorflow/compiler/xla/service/hlo_creation_utils.h
+++ b/tensorflow/compiler/xla/service/hlo_creation_utils.h
@@ -28,73 +28,73 @@ namespace xla {
 
 // Creates a binary HLO instruction and adds it to the computation containing
 // `lhs` and `rhs` (`lhs` and `rhs` must be in the same computation).
-StatusOr<HloInstruction*> CreateBinaryHlo(HloOpcode opcode, HloInstruction* lhs,
-                                          HloInstruction* rhs);
+StatusOr<HloInstruction*> MakeBinaryHlo(HloOpcode opcode, HloInstruction* lhs,
+                                        HloInstruction* rhs);
 
 // Creates a pad HLO instruction and adds it to the computation containing
 // `operand` and `padding_value` (`operand` and `padding_value` must be in the
 // same computation).
-StatusOr<HloInstruction*> CreatePadHlo(HloInstruction* operand,
-                                       HloInstruction* padding_value,
-                                       const PaddingConfig& padding_config);
+StatusOr<HloInstruction*> MakePadHlo(HloInstruction* operand,
+                                     HloInstruction* padding_value,
+                                     const PaddingConfig& padding_config);
 
 // Creates a slice HLO instruction and adds it to the computation containing
 // `operand`.
-StatusOr<HloInstruction*> CreateSliceHlo(
+StatusOr<HloInstruction*> MakeSliceHlo(
     HloInstruction* operand, tensorflow::gtl::ArraySlice<int64> start_indices,
     tensorflow::gtl::ArraySlice<int64> limit_indices,
     tensorflow::gtl::ArraySlice<int64> strides);
 
 // Creates a convolution HLO instruction and adds it to the computation
 // containing `lhs` and `rhs` (`lhs` and `rhs` must be in the same computation).
-StatusOr<HloInstruction*> CreateConvolveHlo(
+StatusOr<HloInstruction*> MakeConvolveHlo(
     HloInstruction* lhs, HloInstruction* rhs, const Window& window,
     const ConvolutionDimensionNumbers& dimension_numbers);
 
 // Creates a transpose HLO instruction and adds it to the computation containing
 // `operand`.
-StatusOr<HloInstruction*> CreateTransposeHlo(
+StatusOr<HloInstruction*> MakeTransposeHlo(
     HloInstruction* operand, tensorflow::gtl::ArraySlice<int64> dimensions);
 
 // Creates a reshape HLO instruction and adds it to the computation containing
 // `operand`.
-StatusOr<HloInstruction*> CreateReshapeHlo(const Shape& result_shape,
-                                           HloInstruction* operand);
+StatusOr<HloInstruction*> MakeReshapeHlo(const Shape& result_shape,
+                                         HloInstruction* operand);
 
-StatusOr<HloInstruction*> CreateReshapeHlo(
+StatusOr<HloInstruction*> MakeReshapeHlo(
     tensorflow::gtl::ArraySlice<int64> result_shape_dim_bounds,
     HloInstruction* operand);
 
 // Creates a dynamic-slice HLO instruction and adds it to the computation
 // containing `operand` and `start_indices` (`operand` and `start_indices` must
 // be in the same computation).
-StatusOr<HloInstruction*> CreateDynamicSliceHlo(
+StatusOr<HloInstruction*> MakeDynamicSliceHlo(
     HloInstruction* operand, HloInstruction* start_indices,
     tensorflow::gtl::ArraySlice<int64> slice_sizes);
 
 // Creates a dynamic-update-slice HLO instruction and adds it to the computation
 // containing `operand`, `update` and `start_indices` (`operand`, `update` and
 // `start_indices` must be in the same computation).
-StatusOr<HloInstruction*> CreateDynamicUpdateSliceHlo(
+StatusOr<HloInstruction*> MakeDynamicUpdateSliceHlo(
     HloInstruction* operand, HloInstruction* update,
     HloInstruction* start_indices);
 
 // Creates a broadcast HLO instruction and adds it to the computation containing
 // `operand`.
-StatusOr<HloInstruction*> CreateBroadcastHlo(
+StatusOr<HloInstruction*> MakeBroadcastHlo(
     HloInstruction* operand,
     tensorflow::gtl::ArraySlice<int64> broadcast_dimensions,
     tensorflow::gtl::ArraySlice<int64> result_shape_bounds);
 
 // Creates a GetTupleElement HLO instruction and adds it to the computation
 // containing `operand`.
-StatusOr<HloInstruction*> CreateGetTupleElementHlo(HloInstruction* operand,
-                                                   int64 index);
+StatusOr<HloInstruction*> MakeGetTupleElementHlo(HloInstruction* operand,
+                                                 int64 index);
 
 // Creates a Concatenate HLO instruction and adds it to the computation
 // containing `operands` (`operands` must be non-empty and every element must be
 // contained in the same computation).
-StatusOr<HloInstruction*> CreateConcatHlo(
+StatusOr<HloInstruction*> MakeConcatHlo(
     tensorflow::gtl::ArraySlice<HloInstruction*> operands, int64 dimension);
 
 // -----------------------------------------------------------------------------
@@ -119,16 +119,6 @@ StatusOr<HloInstruction*> CollapseFirstNDims(HloInstruction* operand, int64 n);
 StatusOr<HloInstruction*> ExpandFirstDimIntoNDims(
     HloInstruction* operand, tensorflow::gtl::ArraySlice<int64> expanded_dims);
 
-// Expands (via reshape) the last (logical) dimension of `operand` into a
-// sequence of `expanded_dims` dimensions.  `operand` must at least be of rank 1
-// and the number of elements in its last dimension must be equal to the
-// product of `expanded_dims`.
-//
-// For instance if `operand` has shape f32[9,7,200] and expanded_dims is
-// {2,5,20} the result is `operand` reshaped to [9,7,2,5,20].
-StatusOr<HloInstruction*> ExpandLastDimIntoNDims(
-    HloInstruction* operand, tensorflow::gtl::ArraySlice<int64> expanded_dims);
-
 // Elides (via reshape) a set of degenerate dimensions (dimensions containing
 // exactly one element), `dims_to_elide` from `operand`.  Every dimension in
 // `dims_to_elide` must be a degenerate dimension.  `dims_to_elide` must be
diff --git a/tensorflow/compiler/xla/service/hlo_cse.cc b/tensorflow/compiler/xla/service/hlo_cse.cc
index 279edd4ba8772a9c576f76f554de8ec68631b953..cd7cbbdd71706fddb64855f631eb09de35da52e8 100644
--- a/tensorflow/compiler/xla/service/hlo_cse.cc
+++ b/tensorflow/compiler/xla/service/hlo_cse.cc
@@ -109,6 +109,11 @@ StatusOr<bool> HloCSE::Run(HloModule* module) {
         continue;
       }
 
+      // Skip instructions which have side effects.
+      if (instruction->HasSideEffect()) {
+        continue;
+      }
+
       // An instruction is considered to be equivalent to another only if they
       // share the exact same set of operands. So to find equivalent
       // instructions, we just search among instructions which share operand(0)
@@ -118,7 +123,7 @@ StatusOr<bool> HloCSE::Run(HloModule* module) {
       tensorflow::gtl::InlinedVector<HloInstruction*, 8>
           equivalent_instructions;
       for (HloInstruction* user : operand->users()) {
-        if (user != instruction &&
+        if (user != instruction && !user->HasSideEffect() &&
             user->Identical(*instruction, eq_instructions, eq_computations,
                             is_layout_sensitive_)) {
           equivalent_instructions.push_back(user);
diff --git a/tensorflow/compiler/xla/service/hlo_cse_test.cc b/tensorflow/compiler/xla/service/hlo_cse_test.cc
index 3601a790c4428ee39c264b217a4b9a991ad8456c..df8853f34f6a72c52d1cde7332ada3809d2f3d96 100644
--- a/tensorflow/compiler/xla/service/hlo_cse_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_cse_test.cc
@@ -414,8 +414,7 @@ TEST_F(HloCseTest, DoNotCombineRng) {
   EXPECT_THAT(root, op::Add(rng1, rng2));
 }
 
-// TODO(b/28245743): Handle impure functions correctly in CSE.
-TEST_F(HloCseTest, DISABLED_DoNotCombineCallsToImpureFunctions) {
+TEST_F(HloCseTest, DoNotCombineCallsToImpureFunctions) {
   // Test that two calls to an impure function are not commoned. RNG
   // is the source of the impurity.
 
@@ -458,14 +457,16 @@ TEST_F(HloCseTest, DISABLED_DoNotCombineCallsToImpureFunctions) {
   HloInstruction* root = computation->root_instruction();
   EXPECT_THAT(root, op::Add(op::Map(), op::Map()));
 
+  VLOG(3) << "before: " << module->ToString();
+
   HloCSE cse(/*is_layout_sensitive=*/false);
-  EXPECT_TRUE(cse.Run(module.get()).ValueOrDie());
+  EXPECT_FALSE(cse.Run(module.get()).ValueOrDie());
+
+  VLOG(3) << "after: " << module->ToString();
 
   EXPECT_EQ(4, computation->instruction_count());
   root = computation->root_instruction();
-  auto operand = root->operand(0)->operand(0);
-  EXPECT_THAT(operand, op::Map());
-  EXPECT_THAT(root, op::Add(operand, operand));
+  EXPECT_THAT(root, op::Add(op::Map(op::Constant()), op::Map(op::Constant())));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
index 934e43ba4879628362009267c671ec4cb0d79c52..0c37a8d75f38dabaad886cc9d4adce8ab29ddf18 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
@@ -368,11 +368,11 @@ bool HloDataflowAnalysis::UpdateConditionalValueSet(
           conditional->true_computation()->root_instruction()),
       &GetInstructionValueSet(
           conditional->false_computation()->root_instruction())};
-  // A phi-node is not defined for a kConditional instruction even though it
-  // represents a join point. This is because the current approach is to define
-  // a phi-node only for kWhile to account for the dataflow through back-edges
-  // and deal with the ambiguity in other cases.
-  return GetInstructionValueSet(conditional).AssignUnionOf(inputs);
+  if (ssa_form_) {
+    return Phi(conditional, inputs);
+  } else {
+    return GetInstructionValueSet(conditional).AssignUnionOf(inputs);
+  }
 }
 
 bool HloDataflowAnalysis::UpdateCopyValueSet(HloInstruction* copy) {
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
index 7bf3a1a06045c79621d75b653bf42220705a69d4..07f69b8e1339fed636e4eb54791941b85e09fd17 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
@@ -1602,11 +1602,17 @@ TEST_P(HloDataflowAnalysisTest, ConditionalWithIdentity) {
   EXPECT_THAT(analysis.GetValueDefinedAt(constant2).uses(),
               ElementsAre(HloUse{conditional, 2, {}}));
 
-  EXPECT_EQ(analysis.values().size(), 3);
-  EXPECT_FALSE(analysis.ValueIsDefinedAt(conditional));
-  EXPECT_THAT(HloValuesAt(conditional),
-              UnorderedElementsAre(analysis.GetValueDefinedAt(constant1),
-                                   analysis.GetValueDefinedAt(constant2)));
+  bool ssa_form = GetParam();
+  if (ssa_form) {
+    EXPECT_EQ(analysis.values().size(), 4);
+    EXPECT_TRUE(analysis.ValueIsDefinedAt(conditional));
+  } else {
+    EXPECT_EQ(analysis.values().size(), 3);
+    EXPECT_FALSE(analysis.ValueIsDefinedAt(conditional));
+    EXPECT_THAT(HloValuesAt(conditional),
+                UnorderedElementsAre(analysis.GetValueDefinedAt(constant1),
+                                     analysis.GetValueDefinedAt(constant2)));
+  }
 }
 
 TEST_P(HloDataflowAnalysisTest, ConditionalTakingTupleOperand) {
@@ -1713,11 +1719,17 @@ TEST_P(HloDataflowAnalysisTest, ConditionalTakingTupleOperand) {
                   HloUse{true_x, 0, {}}, HloUse{true_y, 0, {}},
                   HloUse{false_x, 0, {}}, HloUse{false_y, 0, {}}));
 
-  EXPECT_EQ(analysis.values().size(), 6);
-  EXPECT_FALSE(analysis.ValueIsDefinedAt(conditional));
-  EXPECT_THAT(HloValuesAt(conditional),
-              UnorderedElementsAre(analysis.GetValueDefinedAt(add),
-                                   analysis.GetValueDefinedAt(sub)));
+  bool ssa_form = GetParam();
+  if (ssa_form) {
+    EXPECT_EQ(analysis.values().size(), 7);
+    EXPECT_TRUE(analysis.ValueIsDefinedAt(conditional));
+  } else {
+    EXPECT_EQ(analysis.values().size(), 6);
+    EXPECT_FALSE(analysis.ValueIsDefinedAt(conditional));
+    EXPECT_THAT(HloValuesAt(conditional),
+                UnorderedElementsAre(analysis.GetValueDefinedAt(add),
+                                     analysis.GetValueDefinedAt(sub)));
+  }
 }
 
 TEST_P(HloDataflowAnalysisTest, NestedConditionals) {
@@ -1834,20 +1846,27 @@ TEST_P(HloDataflowAnalysisTest, NestedConditionals) {
   EXPECT_EQ(analysis.GetUniqueValueAt(false_operand_cond),
             analysis.GetValueDefinedAt(constant2));
 
-  EXPECT_EQ(analysis.values().size(), 9);
-  EXPECT_FALSE(analysis.ValueIsDefinedAt(inner_conditional));
-  EXPECT_FALSE(analysis.ValueIsDefinedAt(conditional));
-  EXPECT_THAT(
-      HloValuesAt(inner_conditional),
-      UnorderedElementsAre(
-          analysis.GetValueDefinedAt(computation1->root_instruction()),
-          analysis.GetValueDefinedAt(computation2->root_instruction())));
-  EXPECT_THAT(
-      HloValuesAt(conditional),
-      UnorderedElementsAre(
-          analysis.GetValueDefinedAt(computation1->root_instruction()),
-          analysis.GetValueDefinedAt(computation2->root_instruction()),
-          analysis.GetValueDefinedAt(computation3->root_instruction())));
+  bool ssa_form = GetParam();
+  if (ssa_form) {
+    EXPECT_EQ(analysis.values().size(), 11);
+    EXPECT_TRUE(analysis.ValueIsDefinedAt(inner_conditional));
+    EXPECT_TRUE(analysis.ValueIsDefinedAt(conditional));
+  } else {
+    EXPECT_EQ(analysis.values().size(), 9);
+    EXPECT_FALSE(analysis.ValueIsDefinedAt(inner_conditional));
+    EXPECT_FALSE(analysis.ValueIsDefinedAt(conditional));
+    EXPECT_THAT(
+        HloValuesAt(inner_conditional),
+        UnorderedElementsAre(
+            analysis.GetValueDefinedAt(computation1->root_instruction()),
+            analysis.GetValueDefinedAt(computation2->root_instruction())));
+    EXPECT_THAT(
+        HloValuesAt(conditional),
+        UnorderedElementsAre(
+            analysis.GetValueDefinedAt(computation1->root_instruction()),
+            analysis.GetValueDefinedAt(computation2->root_instruction()),
+            analysis.GetValueDefinedAt(computation3->root_instruction())));
+  }
 }
 
 INSTANTIATE_TEST_CASE_P(HloDataflowAnalysisInstantiation,
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
index 91341b5d35d85b904715fb5a059f51fff13ac4da..9d7251b6ae94c8ffd14db980f18df077c9767ae7 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -1520,14 +1520,12 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
       arg_dim_counts[dim] = arg_dimensions[dim];
     }
 
-    // Create mapping from result index to arg index.
-    const int64 result_rank = ShapeUtil::Rank(result->shape());
-    int64 result_dim = 0;
-    std::vector<int64> result_to_arg_index(result_rank);
+    // Map each dimension in the result to a dimension in arg that isn't
+    // being reduced.
+    std::vector<int64> result_to_arg_index;
     for (int64 i = 0; i < arg_dimensions.size(); ++i) {
       if (arg_dim_steps[i] == 0) {
-        result_to_arg_index[result_dim] = i;
-        ++result_dim;
+        result_to_arg_index.push_back(i);
       }
     }
 
@@ -1542,6 +1540,20 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
             base[result_to_arg_index[i]] = multi_index[i];
           }
 
+          // When the reduction is addition of floats, accumulate in a double
+          // for better precision. Also, avoid creating Literals for the
+          // intermediate results; it's much faster.
+          if (ShapeUtil::ElementIsFloating(init_literal.shape()) &&
+              IsScalarAdd(function)) {
+            double computed_result = 0;
+            auto func = [&](ArraySlice<int64> input_index) {
+              computed_result += arg_literal.Get<float>(input_index);
+              return true;
+            };
+            ShapeUtil::ForEachIndex(arg_literal.shape(), base, arg_dim_counts,
+                                    arg_dim_steps, func);
+            return static_cast<ReturnT>(computed_result);
+          }
           auto func = [&](ArraySlice<int64> input_index) {
             auto curr_val = arg_literal.Get<ReturnT>(input_index);
 
@@ -1554,19 +1566,17 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
             std::unique_ptr<Literal> computed_result =
                 embedded_evaluator.Evaluate<const Literal*>(*function, args)
                     .ConsumeValueOrDie();
-            // Clear visit states so that the we can use the evaluate again on
+            // Clear visit states so that we can use the evaluator again on
             // the same computation.
             embedded_evaluator.ResetVisitStates();
-
             // Assign computed result to result_val.
             result_val = computed_result->Get<ReturnT>({});
-
             return true;
           };
-
+          // Computes one element of the result, reducing all dimensions that
+          // contribute to that element.
           ShapeUtil::ForEachIndex(arg_literal.shape(), base, arg_dim_counts,
                                   arg_dim_steps, func);
-
           return result_val;
         }));
 
@@ -1574,6 +1584,20 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
+  bool IsScalarAdd(HloComputation* computation) {
+    HloInstruction* instruction = computation->root_instruction();
+    if (instruction->opcode() == HloOpcode::kAdd &&
+        computation->num_parameters() == 2) {
+      const HloInstruction* lhs = instruction->operand(0);
+      const HloInstruction* rhs = instruction->operand(1);
+      return lhs->opcode() == HloOpcode::kParameter &&
+             ShapeUtil::IsScalar(lhs->shape()) &&
+             rhs->opcode() == HloOpcode::kParameter &&
+             ShapeUtil::IsScalar(rhs->shape()) && lhs != rhs;
+    }
+    return false;
+  }
+
   Status HandleSelectAndScatter(HloInstruction* select_and_scatter) override {
     auto operand = select_and_scatter->operand(0);
     auto source = select_and_scatter->operand(1);
@@ -2771,6 +2795,8 @@ Status HloEvaluator::HandleGather(HloInstruction* gather) {
       gather->gather_dimension_numbers(), /*input_shape=*/operand.shape(),
       /*output_shape=*/shape);
 
+  const Shape& operand_shape = operand.shape();
+
   auto gather_inner_loop_body =
       [&](ArraySlice<int64> output_window_index,
           ArraySlice<int64> input_gather_index,
@@ -2780,9 +2806,16 @@ Status HloEvaluator::HandleGather(HloInstruction* gather) {
         output_window_index_to_input_index(output_window_index));
     for (int i = 0, e = output_index.size(); i < e; i++) {
       output_index[i] = output_gather_index[i] + output_window_index[i];
+      DCHECK_LT(output_index[i], shape.dimensions(i));
     }
     for (int i = 0, e = input_index.size(); i < e; i++) {
-      input_index[i] = input_gather_index[i] + input_window_index[i];
+      // TODO(b/74360564): We should implement whatever out of bounds behavior
+      // we decide for dynamic-slice here as well.
+      input_index[i] = (input_gather_index[i] + input_window_index[i]) %
+                       operand_shape.dimensions(i);
+      if (input_index[i] < 0) {
+        input_index[i] += operand_shape.dimensions(i);
+      }
     }
     TF_RETURN_IF_ERROR(
         result->CopyElementFrom(operand, input_index, output_index));
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
index 685cacd7f74c00789296dee16f0a6a94c35a4393..dd14dd38537a83d0ee16cff9e3c22a38f544e208 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
@@ -40,6 +40,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
@@ -1205,6 +1206,80 @@ TEST_P(HloEvaluatorTest,
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
+class HloEvaluatorPreciseReduceTest : public HloVerifiedTestBase {};
+
+// Tests that Reduce doesn't lose precision when adding many numbers (because
+// it accumulates its result in a double).
+TEST_F(HloEvaluatorPreciseReduceTest, AddReductionPrecisionTest) {
+  HloComputation::Builder b(TestName());
+
+  constexpr int kNumElements = 1 << 25;  // float += 1 saturates at 1<<24
+  std::vector<float> v(kNumElements, 1.0f);
+  HloInstruction* arg_instruction = b.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR1<float>(v)));
+  HloInstruction* init_value = b.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(0.f)));
+
+  HloComputation::Builder add_computation("add");
+  Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
+  auto param_lhs = add_computation.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape, "lhs"));
+  auto param_rhs = add_computation.AddInstruction(
+      HloInstruction::CreateParameter(1, scalar_shape, "rhs"));
+  add_computation.AddInstruction(HloInstruction::CreateBinary(
+      scalar_shape, HloOpcode::kAdd, param_lhs, param_rhs));
+  auto add_func = module().AddEmbeddedComputation(add_computation.Build());
+
+  HloInstruction* reduce_instruction = b.AddInstruction(
+      HloInstruction::CreateReduce(scalar_shape, arg_instruction, init_value,
+                                   /*dimensions_to_reduce=*/{0}, add_func));
+  module().AddEntryComputation(b.Build());
+
+  HloEvaluator hlo_eval;
+  std::unique_ptr<Literal> result =
+      hlo_eval.Evaluate(reduce_instruction).ConsumeValueOrDie();
+  LiteralTestUtil::ExpectR0Equal<float>(kNumElements, *result);
+}
+
+// Reducing many numbers should be fast because it doesn't create
+// intermediate Literals; the microbenchmark should finish in < 1 msec.
+void BM_ReducePrecisely(int num_iters) {
+  tensorflow::testing::StopTiming();
+  HloComputation::Builder b("BM_ReducePrecisely");
+  HloModuleConfig config;
+  config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
+  HloModule module("BM_ReducePrecisely", VersionedComputationHandle(), config);
+
+  constexpr int kNumElements = 1 << 25;  // float += 1 saturates at 1<<24
+  std::vector<float> v(kNumElements, 1.0f);
+  HloInstruction* arg_instruction = b.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR1<float>(v)));
+  auto init_value = b.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(0.f)));
+
+  HloComputation::Builder add_computation("add");
+  Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
+  auto param_lhs = add_computation.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape, "lhs"));
+  auto param_rhs = add_computation.AddInstruction(
+      HloInstruction::CreateParameter(1, scalar_shape, "rhs"));
+  add_computation.AddInstruction(HloInstruction::CreateBinary(
+      scalar_shape, HloOpcode::kAdd, param_lhs, param_rhs));
+  auto add_func = module.AddEmbeddedComputation(add_computation.Build());
+
+  HloInstruction* reduce_instruction = b.AddInstruction(
+      HloInstruction::CreateReduce(scalar_shape, arg_instruction, init_value,
+                                   /*dimensions_to_reduce=*/{0}, add_func));
+  module.AddEntryComputation(b.Build());
+
+  HloEvaluator hlo_eval;
+  tensorflow::testing::StartTiming();
+  hlo_eval.Evaluate(reduce_instruction).ConsumeValueOrDie();
+  tensorflow::testing::StopTiming();
+}
+
+BENCHMARK(BM_ReducePrecisely);
+
 TEST_P(HloEvaluatorTest, ReduceAdd) {
   HloComputation::Builder b(TestName());
 
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.cc b/tensorflow/compiler/xla/service/hlo_execution_profile.cc
index f0df93b61d29c1535d8a89fbd65e669de5b43729..c3ccbf0f0c75b569b49652807dea52faebdccc31 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile.cc
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile.cc
@@ -111,8 +111,8 @@ HloExecutionProfile::HloExecutionProfile(
     : hlo_profile_printer_data_(*hlo_profile_printer_data),
       hlo_profile_index_map_(*hlo_profile_index_map),
       profile_counters_(
-          /*count*/ hlo_profile_index_map_.total_count(),
-          /*value*/ 0) {}
+          /*count=*/hlo_profile_index_map_.total_count(),
+          /*value=*/0) {}
 
 void HloExecutionProfile::SetCyclesTakenBy(const HloInstruction* hlo,
                                            uint64 cycles_taken) {
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index 1dc72355cf179e996caab4d6b52068dc99d02244..25702dc65ea1ebd9d91b3382dcb909e606628202 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -823,7 +823,7 @@ string HloDotDumper::GetInstructionNodeInlinedOperands(
 
     // Otherwise, print e.g. "%constant.42 (s32[100])".
     string constant_name;
-    if (tensorflow::StringPiece(constant->name()).starts_with("constant")) {
+    if (tensorflow::str_util::StartsWith(constant->name(), "constant")) {
       constant_name = constant->name();
     } else {
       constant_name = StrCat("constant ", constant->name());
@@ -1041,8 +1041,8 @@ string HloDotDumper::GetInstructionNodeLabel(const HloInstruction* instr) {
 
   // The HLO instruction name contains usually the opcode, e.g. "%add.42" is
   // an add instruction.  In this case we render just the name.
-  if (tensorflow::StringPiece(instr->name())
-          .starts_with(HloOpcodeString(instr->opcode()))) {
+  if (tensorflow::str_util::StartsWith(instr->name(),
+                                       HloOpcodeString(instr->opcode()))) {
     return Printf("<b>%s</b>", HtmlLikeStringSanitize(instr->name()));
   }
   string extended_opcode =
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index d33add23d07b52cb56e4b212a29b415259af7694..fcf9ebf5f787445f5e89f126e9f2393fd3bd1790 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -37,6 +37,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/window_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
@@ -51,24 +52,22 @@ using ::tensorflow::strings::StrCat;
 /* static */
 StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
     HloModule* module, const HloInstructionProto& proto,
-    const tensorflow::gtl::FlatMap<string, HloInstruction*>& instruction_map,
-    const tensorflow::gtl::FlatMap<string, HloComputation*>& computation_map,
-    const std::function<void(std::unique_ptr<HloComputation>)>&
-        add_fused_computation) {
+    const tensorflow::gtl::FlatMap<int64, HloInstruction*>& instruction_map,
+    const tensorflow::gtl::FlatMap<int64, HloComputation*>& computation_map) {
   TF_RET_CHECK(!proto.opcode().empty());
   TF_ASSIGN_OR_RETURN(HloOpcode opcode, StringToHloOpcode(proto.opcode()));
   TF_RET_CHECK(proto.has_shape());
 
   auto instruction = WrapUnique(new HloInstruction(opcode, proto.shape()));
-  for (const string& operand_name : proto.operand_names()) {
-    TF_RET_CHECK(ContainsKey(instruction_map, operand_name))
-        << "No instruction named " << operand_name;
-    instruction->AppendOperand(instruction_map.at(operand_name));
-  }
-  for (const string& predecessor_name : proto.control_predecessor_names()) {
-    TF_RET_CHECK(ContainsKey(instruction_map, predecessor_name))
-        << "No instruction named " << predecessor_name;
-    TF_RETURN_IF_ERROR(instruction_map.at(predecessor_name)
+  for (const int64 operand_id : proto.operand_ids()) {
+    TF_RET_CHECK(ContainsKey(instruction_map, operand_id))
+        << "No instruction with id " << operand_id;
+    instruction->AppendOperand(instruction_map.at(operand_id));
+  }
+  for (const int64 predecessor_id : proto.control_predecessor_ids()) {
+    TF_RET_CHECK(ContainsKey(instruction_map, predecessor_id))
+        << "No instruction with id " << predecessor_id;
+    TF_RETURN_IF_ERROR(instruction_map.at(predecessor_id)
                            ->AddControlDependencyTo(instruction.get()));
   }
 
@@ -76,26 +75,36 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
   // HloInstructionProto and do not appear as an HloComputationProto within the
   // HloModuleProto.
   if (instruction->opcode() == HloOpcode::kFusion) {
-    TF_RET_CHECK(proto.has_fused_instructions_computation());
     TF_RET_CHECK(!proto.fusion_kind().empty());
     TF_ASSIGN_OR_RETURN(instruction->fusion_kind_,
                         StringToFusionKind(proto.fusion_kind()));
-    TF_ASSIGN_OR_RETURN(std::unique_ptr<HloComputation> fused_computation,
-                        HloComputation::CreateFromProto(
-                            module, proto.fused_instructions_computation(),
-                            computation_map, add_fused_computation,
-                            /*fusion_instruction=*/instruction.get()));
-    instruction->called_computations_.push_back(fused_computation.get());
-    add_fused_computation(std::move(fused_computation));
+
+    // Find the fused computation and set its fusion instruction.
+    TF_RET_CHECK(proto.called_computation_ids_size() == 1)
+        << "Expect 1 called computation for fusion instruction, but sees "
+        << proto.called_computation_ids_size();
+    const int64 fusion_id = proto.called_computation_ids(0);
+    auto* fused_computation = FindPtrOrNull(computation_map, fusion_id);
+    TF_RET_CHECK(fused_computation != nullptr)
+        << "No fusion computation with id " << fusion_id;
+    fused_computation->SetFusionInstruction(instruction.get());
+    instruction->called_computations_.push_back(fused_computation);
   } else {
-    for (const string& computation_name : proto.called_computation_names()) {
-      TF_RET_CHECK(ContainsKey(computation_map, computation_name))
-          << "No computation named " << computation_name;
+    for (const int64 computation_id : proto.called_computation_ids()) {
+      TF_RET_CHECK(ContainsKey(computation_map, computation_id))
+          << "No computation with id " << computation_id;
       instruction->called_computations_.push_back(
-          computation_map.at(computation_name));
+          computation_map.at(computation_id));
     }
   }
 
+  if (instruction->opcode() == HloOpcode::kTrace) {
+    TF_RET_CHECK(instruction->operands().size() == 1)
+        << "Trace instruction should have 1 operand but sees "
+        << instruction->operands().size();
+    instruction->mutable_operand(0)->set_tracing(instruction.get());
+  }
+
   TF_RET_CHECK(!proto.name().empty());
   instruction->name_ = proto.name();
 
@@ -168,6 +177,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       WrapUnique(new HloInstruction(HloOpcode::kTrace, ShapeUtil::MakeNil()));
   instruction->operands_.push_back(operand);
   instruction->literal_ = Literal::CreateR1U8(tag);
+  operand->set_tracing(instruction.get());
   return instruction;
 }
 
@@ -2313,14 +2323,18 @@ string HloInstruction::ToShortString() const {
 
 HloInstructionProto HloInstruction::ToProto() const {
   HloInstructionProto proto;
+  CHECK(unique_id_ != -1)
+      << "This instruction does not have a valid id. Please make sure the "
+         "instruction is inside a module before dumping it.";
+  proto.set_id(unique_id_);
   proto.set_name(name_);
   proto.set_opcode(HloOpcodeString(opcode_));
   *proto.mutable_shape() = shape_;
   for (const HloInstruction* operand : operands_) {
-    *proto.add_operand_names() = operand->name();
+    proto.add_operand_ids(operand->unique_id());
   }
   for (const HloInstruction* control : control_predecessors_) {
-    *proto.add_control_predecessor_names() = control->name();
+    proto.add_control_predecessor_ids(control->unique_id());
   }
 
   *proto.mutable_metadata() = metadata_;
@@ -2330,11 +2344,11 @@ HloInstructionProto HloInstruction::ToProto() const {
   proto.set_parameter_number(parameter_number_);
   if (opcode() == HloOpcode::kFusion) {
     proto.set_fusion_kind(xla::ToString(fusion_kind()));
-    *proto.mutable_fused_instructions_computation() =
-        fused_instructions_computation()->ToProto();
+    proto.add_called_computation_ids(
+        fused_instructions_computation()->unique_id());
   } else {
     for (const HloComputation* computation : called_computations_) {
-      *proto.add_called_computation_names() = computation->name();
+      proto.add_called_computation_ids(computation->unique_id());
     }
   }
 
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index e4c86214c2014095b2e171ff10691e1221574cb7..80f84082442798d240a0a8e11d85ceaf638a4695 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -179,20 +179,15 @@ class HloInstruction {
   //   module: the module which will contain the instruction. The newly created
   //     instruction is *not* added to the module or any computation, however.
   //   proto: the proto to convert from.
-  //   instruction_map: a map from instruction name to HloInstruction*. This map
+  //   instruction_map: a map from instruction id to HloInstruction*. This map
   //     must contain all operands of the newly constructed instruction.
-  //   computation_map: a map from computation name to HloComputation*. This map
+  //   computation_map: a map from computation id to HloComputation*. This map
   //     must contain all computations which the newly constructed instruction
   //     calls.
-  //   add_fused_computation: A function to call to add a fused
-  //     computation. Used (clearly) when the instruction is a fusion
-  //     instruction.
   static StatusOr<std::unique_ptr<HloInstruction>> CreateFromProto(
       HloModule* module, const HloInstructionProto& proto,
-      const tensorflow::gtl::FlatMap<string, HloInstruction*>& instruction_map,
-      const tensorflow::gtl::FlatMap<string, HloComputation*>& computation_map,
-      const std::function<void(std::unique_ptr<HloComputation>)>&
-          add_fused_computation);
+      const tensorflow::gtl::FlatMap<int64, HloInstruction*>& instruction_map,
+      const tensorflow::gtl::FlatMap<int64, HloComputation*>& computation_map);
 
   // Creates a parameter-retrieving instruction.
   static std::unique_ptr<HloInstruction> CreateParameter(int64 parameter_number,
@@ -933,6 +928,13 @@ class HloInstruction {
   const HloSharding& sharding_or_default(const HloSharding& default_) const {
     return sharding_ ? *sharding_ : default_;
   }
+  // Returns the sharding unique device, if any.
+  tensorflow::gtl::optional<int64> sharding_unique_device() const {
+    if (sharding_ == nullptr || !sharding_->HasUniqueDevice()) {
+      return tensorflow::gtl::optional<int64>();
+    }
+    return sharding_->UniqueDevice().ValueOrDie();
+  }
   // Sets the sharding of this operator. Should only be called by HloModule or
   // HloComputation methods.
   void set_sharding(const HloSharding& sharding) {
diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc
index cdea3d597824d155241a544d226aa18d3b0b0274..08b9a29aeda2ee612d49b0788acf8438a25eb6a3 100644
--- a/tensorflow/compiler/xla/service/hlo_module.cc
+++ b/tensorflow/compiler/xla/service/hlo_module.cc
@@ -83,6 +83,11 @@ HloComputation* HloModule::AddComputationInternal(
   for (auto* instruction : computation->instructions()) {
     instruction->SetUniqueId(NewUniqueInstructionId());
   }
+  // Set unique id to this computation.
+  CHECK_NE(computation->root_instruction()->unique_id(), -1)
+      << "Root has no valid id: " << computation->ToString();
+  computation->SetUniqueId(computation->root_instruction()->unique_id());
+
   computation->set_parent(this);
   computations_.push_back(std::move(computation));
   return computations_.back().get();
@@ -204,14 +209,11 @@ string HloModule::ToString(const HloPrintOptions& options) const {
 
 HloModuleProto HloModule::ToProto() const {
   HloModuleProto proto;
+  proto.set_id(unique_id_);
   proto.set_name(name_);
   proto.set_entry_computation_name(entry_computation_->name());
+  proto.set_entry_computation_id(entry_computation_->unique_id());
   for (const HloComputation* computation : MakeComputationPostOrder()) {
-    // Fusion computations are added when the fusion instructions are created by
-    // HloInstruction::CreateFromProto.
-    if (computation->IsFusionComputation()) {
-      continue;
-    }
     HloComputationProto computation_proto = computation->ToProto();
     if (computation->name() == entry_computation_->name()) {
       *proto.mutable_program_shape() = computation_proto.program_shape();
@@ -235,8 +237,8 @@ StatusOr<std::unique_ptr<HloModule>> HloModule::CreateFromProto(
   for (int i = 0; i < expected_program_shape.parameters_size(); ++i) {
     const Shape& parameter_shape =
         module_config.entry_computation_layout().parameter_layout(i).shape();
-    TF_RET_CHECK(
-        ShapeUtil::Equal(expected_program_shape.parameters(i), parameter_shape))
+    TF_RET_CHECK(ShapeUtil::Compatible(expected_program_shape.parameters(i),
+                                       parameter_shape))
         << "HloModuleConfig has different shape for parameter " << i
         << " than the HLO module. Expected: "
         << ShapeUtil::HumanStringWithLayout(
@@ -245,7 +247,8 @@ StatusOr<std::unique_ptr<HloModule>> HloModule::CreateFromProto(
   }
   const Shape& result_shape =
       module_config.entry_computation_layout().result_layout().shape();
-  TF_RET_CHECK(ShapeUtil::Equal(expected_program_shape.result(), result_shape))
+  TF_RET_CHECK(
+      ShapeUtil::Compatible(expected_program_shape.result(), result_shape))
       << "HloModuleConfig has different result shape than the HLO module. "
          "Expected: "
       << ShapeUtil::HumanStringWithLayout(expected_program_shape.result())
@@ -254,26 +257,20 @@ StatusOr<std::unique_ptr<HloModule>> HloModule::CreateFromProto(
   auto module = MakeUnique<HloModule>(proto.name(), entry_computation_handle,
                                       module_config);
 
-  tensorflow::gtl::FlatMap<string, HloComputation*> computation_map;
+  tensorflow::gtl::FlatMap<int64, HloComputation*> computation_map;
   for (const HloComputationProto& computation_proto : proto.computations()) {
-    TF_ASSIGN_OR_RETURN(
-        std::unique_ptr<HloComputation> computation,
-        HloComputation::CreateFromProto(
-            module.get(), computation_proto, computation_map,
-            /*add_fused_computation=*/
-            [&module](std::unique_ptr<HloComputation> fused_computation) {
-              module->AddComputationInternal(std::move(fused_computation),
-                                             /*is_entry=*/false,
-                                             /*uniquify_names=*/false);
-            }));
+    TF_ASSIGN_OR_RETURN(std::unique_ptr<HloComputation> computation,
+                        HloComputation::CreateFromProto(
+                            module.get(), computation_proto, computation_map));
     CHECK_NE(computation.get(), nullptr);
-    TF_RET_CHECK(!ContainsKey(computation_map, computation->name()));
-    string computation_name = computation->name();
+    int64 computation_id = computation_proto.id();
+    TF_RET_CHECK(computation_id != -1);
+    TF_RET_CHECK(!ContainsKey(computation_map, computation_id));
     // Don't uniquify names because we want names to be stable across
     // serialization and deserialization.
-    computation_map[computation_name] = module->AddComputationInternal(
+    computation_map[computation_id] = module->AddComputationInternal(
         std::move(computation),
-        /*is_entry=*/proto.entry_computation_name() == computation_name,
+        /*is_entry=*/proto.entry_computation_id() == computation_id,
         /*uniquify_names=*/false);
   }
   TF_RET_CHECK(module->entry_computation_ != nullptr);
@@ -283,10 +280,6 @@ StatusOr<std::unique_ptr<HloModule>> HloModule::CreateFromProto(
   tensorflow::gtl::FlatSet<string> computation_names;
   tensorflow::gtl::FlatSet<string> instruction_names;
   for (HloComputation* computation : module->computations()) {
-    if (computation->IsFusionComputation()) {
-      continue;
-    }
-
     TF_RET_CHECK(!ContainsKey(computation_names, computation->name()))
         << "Computation name is not unique: " << computation->name();
     computation_names.insert(computation->name());
@@ -302,12 +295,13 @@ StatusOr<std::unique_ptr<HloModule>> HloModule::CreateFromProto(
 
 /* static */
 StatusOr<HloModuleConfig> HloModule::CreateModuleConfigFromProto(
-    const HloModuleProto& module) {
+    const HloModuleProto& module, const DebugOptions& debug_options) {
   TF_RET_CHECK(module.has_program_shape())
       << "No program shape found in the proto";
   const auto& program_shape = module.program_shape();
 
   HloModuleConfig module_config(program_shape);
+  module_config.set_debug_options(debug_options);
 
   // The module config is constructed with default layouts regardless of what is
   // passed in via the ProgramShape. Set the layouts to the appropriate values.
diff --git a/tensorflow/compiler/xla/service/hlo_module.h b/tensorflow/compiler/xla/service/hlo_module.h
index 755bbd359f7b95e7f3f3cbee1b46df85908202c6..9f7f25202ba42b14e995ed5c47d1012dabc69332 100644
--- a/tensorflow/compiler/xla/service/hlo_module.h
+++ b/tensorflow/compiler/xla/service/hlo_module.h
@@ -172,7 +172,7 @@ class HloModule {
   // Creates and returns an HloModuleConfig with an appropriate program shape
   // for the HLO module in the given proto.
   static StatusOr<HloModuleConfig> CreateModuleConfigFromProto(
-      const HloModuleProto& module);
+      const HloModuleProto& module, const DebugOptions& debug_options);
 
   // Outlines the given expression from the given computation.
   // instructions_to_outline contains the instructions that form the expression.
diff --git a/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc b/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
index fa5dcb0b369d17c70c64c67b9f11640c93fb4278..54c34ce116651608e6d91cdcba9c708ca3a5f75e 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
@@ -313,6 +313,27 @@ Status HloModuleGroupMetadata::VerifyChannelInstructions() {
     if (!ShapeUtil::Compatible(send_shape, recv_shape)) {
       return FailedPrecondition("send/recv shapes do not match");
     }
+    const HloModule* send_module = channel.send->parent()->parent();
+    const HloModule* send_done_module = channel.send_done->parent()->parent();
+    if (send_module != send_done_module) {
+      return FailedPrecondition(
+          "send and send-done (channel=%lld) must be on the same device: %lld "
+          "vs. %lld",
+          channel.id, GetModuleId(send_module), GetModuleId(send_done_module));
+    }
+    const HloModule* recv_module = channel.recv->parent()->parent();
+    const HloModule* recv_done_module = channel.recv_done->parent()->parent();
+    if (recv_module != recv_done_module) {
+      return FailedPrecondition(
+          "recv and recv-done (channel=%lld) must be on the same device: %lld "
+          "vs. %lld",
+          channel.id, GetModuleId(recv_module), GetModuleId(recv_done_module));
+    }
+    if (send_module == recv_module) {
+      return FailedPrecondition(
+          "send and recv (channel=%lld) must be on different devices: %lld",
+          channel.id, GetModuleId(send_module));
+    }
   }
 
   // Check if channel instructions are used only in allowed computations.
diff --git a/tensorflow/compiler/xla/service/hlo_ordering.cc b/tensorflow/compiler/xla/service/hlo_ordering.cc
index 1b24d8da9e832e6847cb6f405e15af3c455f695a..e89d94bede6c437ca1131a1b1b0098390d58c0d9 100644
--- a/tensorflow/compiler/xla/service/hlo_ordering.cc
+++ b/tensorflow/compiler/xla/service/hlo_ordering.cc
@@ -66,6 +66,28 @@ bool HloOrdering::ExecutesBefore(const HloInstruction* a,
     }
   }
 
+  // If the common ancestor is a conditional instruction, even though the true
+  // and false computations are not really ordered per-se, we define the true
+  // computation to be ordered before the false one.
+  // This ensures that buffers can still be shared among the two computations
+  // as they will forcibly have disjoint liveness.
+  if (a_ancestor == b_ancestor &&
+      a_ancestor->opcode() == HloOpcode::kConditional) {
+    const HloComputation* true_computation = a_ancestor->true_computation();
+    const HloComputation* false_computation = a_ancestor->false_computation();
+    if (call_graph_->InstructionIsNestedIn(a, true_computation) &&
+        call_graph_->InstructionIsNestedIn(b, false_computation)) {
+      return true;
+    }
+    // If 'b' is the conditional ancestor, and 'a' is within the true or false
+    // computations, 'a' executes before 'b'.
+    if (b == a_ancestor &&
+        (call_graph_->InstructionIsNestedIn(a, true_computation) ||
+         call_graph_->InstructionIsNestedIn(a, false_computation))) {
+      return true;
+    }
+  }
+
   return ExecutesBeforeInSameComputation(a_ancestor, b_ancestor);
 }
 
@@ -118,7 +140,18 @@ bool HloOrdering::IsDefinedBefore(const HloValue& a, const HloValue& b) const {
            b.defining_instruction()->while_condition()))) {
     return true;
   }
-
+  // If 'b' is a conditional phi and 'a' is in the true or false computation,
+  // then 'a' executes before 'b'.
+  if (b.is_phi() &&
+      b.defining_instruction()->opcode() == HloOpcode::kConditional &&
+      (call_graph_->InstructionIsNestedIn(
+           a.defining_instruction(),
+           b.defining_instruction()->true_computation()) ||
+       call_graph_->InstructionIsNestedIn(
+           a.defining_instruction(),
+           b.defining_instruction()->false_computation()))) {
+    return true;
+  }
   return ExecutesBefore(a.defining_instruction(), b.defining_instruction());
 }
 
@@ -212,18 +245,17 @@ bool HloOrdering::LiveRangeStrictlyBefore(
   VLOG(4) << "LiveRangeStrictlyBefore(a = " << a.ToShortString()
           << ", b = " << b.ToShortString() << ")";
   if (!IsDefinedBefore(a, b)) {
-    VLOG(4) << "a not defined before b";
+    VLOG(4) << a << " not defined before " << b;
     return false;
   }
-
   // All uses of 'a' must be before 'b' is defined.
   for (const HloUse& use : a.uses()) {
     if (!UseIsBeforeValueDefinition(use, b, dataflow)) {
-      VLOG(4) << "use of a (" << use << ") not before b is defined";
+      VLOG(4) << "use of " << a << " (" << use << ") not before " << b
+              << " is defined";
       return false;
     }
   }
-
   return true;
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_ordering_test.cc b/tensorflow/compiler/xla/service/hlo_ordering_test.cc
index a989fce63234cb860d08c48b02462e96bec879bc..37a7fbad97cea2f34798efecc2489e57d1374f35 100644
--- a/tensorflow/compiler/xla/service/hlo_ordering_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_ordering_test.cc
@@ -34,53 +34,6 @@ namespace {
 
 class HloOrderingTest : public HloTestBase {};
 
-TEST_F(HloOrderingTest, LastUseScheduledFirst) {
-  // Tests scheduling of the following HLO code:
-  //
-  //   %ab = abs(%param)
-  //   %exp = exp(%param)
-  //   %add = add(%ab, %exp)
-  //   %negate = negate(%exp)
-  //   %sub = subtract(%add, %negate)
-  //
-  // %add should be scheduled before %negate because %add is the last (and only)
-  // use of %ab. Scheduling %add first then frees up %ab's buffer.
-  const Shape vec = ShapeUtil::MakeShape(xla::F32, {42});
-  auto builder = HloComputation::Builder(TestName());
-  auto param =
-      builder.AddInstruction(HloInstruction::CreateParameter(0, vec, "param"));
-  auto ab = builder.AddInstruction(
-      HloInstruction::CreateUnary(vec, HloOpcode::kAbs, param));
-  auto exp = builder.AddInstruction(
-      HloInstruction::CreateUnary(vec, HloOpcode::kExp, param));
-
-  auto add = builder.AddInstruction(
-      HloInstruction::CreateBinary(vec, HloOpcode::kAdd, ab, exp));
-  auto negate = builder.AddInstruction(
-      HloInstruction::CreateUnary(vec, HloOpcode::kNegate, exp));
-  auto sub = builder.AddInstruction(
-      HloInstruction::CreateBinary(vec, HloOpcode::kSubtract, add, negate));
-
-  auto module = CreateNewModule();
-  module->AddEntryComputation(builder.Build());
-
-  TF_ASSERT_OK_AND_ASSIGN(
-      SequentialHloOrdering::HloModuleSequence sequence,
-      CreateMemoryMinimizingSequence(*module, [](const LogicalBuffer& buffer) {
-        return ShapeUtil::ByteSizeOf(buffer.shape());
-      }));
-  // Verify that all instructions are in the sequence.
-  EXPECT_EQ(module->entry_computation()->instruction_count(),
-            sequence.at(module->entry_computation()).size());
-
-  // The first instruction should be the parameter and the last the root "sub".
-  EXPECT_EQ(param, sequence.at(module->entry_computation()).front());
-  EXPECT_EQ(sub, sequence.at(module->entry_computation()).back());
-
-  SequentialHloOrdering ordering(module.get(), sequence);
-  EXPECT_TRUE(ordering.ExecutesBefore(add, negate));
-}
-
 TEST_F(HloOrderingTest, InstructionsInDifferentComputations) {
   // Tests the ordering of instructions in different computations using the
   // following HLO code:
@@ -362,5 +315,66 @@ ENTRY while.v11 {
   ordering.ToString();  // Shouldn't crash.
 }
 
+TEST_F(HloOrderingTest, ConditionalInstructionOrdering) {
+  const char* module_str = R"(
+HloModule test_conditional_module
+
+true_branch {
+  param.1 = (s32[], s32[]) parameter(0)
+  get-tuple-element.1 = s32[] get-tuple-element(param.1), index=0
+  get-tuple-element.2 = s32[] get-tuple-element(param.1), index=1
+  add.1 = s32[] add(get-tuple-element.1, get-tuple-element.2)
+  ROOT tuple.1 = (s32[], s32[]) tuple(add.1, get-tuple-element.1)
+}
+
+false_branch {
+  param.2 = (s32[], s32[]) parameter(0)
+  get-tuple-element.3 = s32[] get-tuple-element(param.2), index=0
+  get-tuple-element.4 = s32[] get-tuple-element(param.2), index=1
+  add.2 = s32[] add(get-tuple-element.3, get-tuple-element.4)
+  ROOT tuple.2 = (s32[], s32[]) tuple(add.2, get-tuple-element.4)
+}
+
+ENTRY root {
+  param.3 = (pred[], (s32[], s32[])) parameter(0)
+  pred.1 = pred[] get-tuple-element(param.3), index=0
+  cond_arg.1 = (s32[], s32[]) get-tuple-element(param.3), index=1
+  conditional = (s32[], s32[]) conditional(pred.1, cond_arg.1, cond_arg.1), true_computation=true_branch, false_computation=false_branch
+  cond_res.1 = s32[] get-tuple-element(conditional), index=0
+  cond_res.2 = s32[] get-tuple-element(conditional), index=1
+  add.3 = s32[] add(cond_res.1, cond_res.2)
+  ROOT result = (s32[], s32[], s32[]) tuple(add.3, cond_res.1, cond_res.2)
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          tools::Parse(module_str));
+  TF_ASSERT_OK_AND_ASSIGN(auto dataflow,
+                          HloDataflowAnalysis::Run(*module, /*ssa_form=*/true));
+  DependencyHloOrdering ordering(module.get());
+
+  // Even though the true and false branches has no ordering, since they do not
+  // interfere (as they are mutually exclusive), we define the true computation
+  // to be before the false one.
+  // Similarly, any instruction in the true or false branches are considered
+  // before the conditional instruction. The roots are effectively "at the same
+  // time" WRT the conditional, but they are Phi-ed anyway.
+  HloInstruction* add_1 = FindInstruction(module.get(), "add.1");
+  HloInstruction* add_2 = FindInstruction(module.get(), "add.2");
+  HloInstruction* add_3 = FindInstruction(module.get(), "add.3");
+  HloInstruction* conditional = FindInstruction(module.get(), "conditional");
+  EXPECT_TRUE(ordering.IsDefinedBefore(dataflow->GetValueDefinedAt(add_1),
+                                       dataflow->GetValueDefinedAt(add_2)));
+  EXPECT_TRUE(
+      ordering.IsDefinedBefore(dataflow->GetValueDefinedAt(add_2),
+                               dataflow->GetValueDefinedAt(conditional)));
+  EXPECT_TRUE(
+      ordering.IsDefinedBefore(dataflow->GetValueDefinedAt(add_1),
+                               dataflow->GetValueDefinedAt(conditional)));
+  EXPECT_TRUE(ordering.IsDefinedBefore(dataflow->GetValueDefinedAt(add_1),
+                                       dataflow->GetValueDefinedAt(add_3)));
+  EXPECT_TRUE(ordering.IsDefinedBefore(dataflow->GetValueDefinedAt(add_2),
+                                       dataflow->GetValueDefinedAt(add_3)));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.cc b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
index 98b8d34be1f331aaeac94e952deeae1e76379861..b0632448933df4b7681a0704c58d697b5ec68a1f 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization.cc
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
@@ -1320,7 +1320,7 @@ StatusOr<bool> HloRematerialization::Run(
 /* static */ StatusOr<bool> HloRematerialization::RematerializeAndSchedule(
     const HloRematerialization::ShapeSizeFunction& size_function,
     int64 memory_limit_bytes, HloModule* hlo_module,
-    SchedulerAlgorithm scheduler_algorithm,
+    MemorySchedulerAlgorithm scheduler_algorithm,
     SequentialHloOrdering::HloModuleSequence* sequence,
     RematerializationSizes* sizes) {
   HloRematerialization remat(scheduler_algorithm, size_function);
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.h b/tensorflow/compiler/xla/service/hlo_rematerialization.h
index 52553439033a3bcfa4b472f13f9cd4b1ecf5ed96..2ee2dd0571ae8c6604e4ca722351fd48a913bda5 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization.h
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization.h
@@ -66,12 +66,12 @@ class HloRematerialization {
   // code generation.
   static StatusOr<bool> RematerializeAndSchedule(
       const ShapeSizeFunction& size_function, int64 memory_limit_bytes,
-      HloModule* hlo_module, SchedulerAlgorithm scheduler_algorithm,
+      HloModule* hlo_module, MemorySchedulerAlgorithm scheduler_algorithm,
       SequentialHloOrdering::HloModuleSequence* sequence,
       RematerializationSizes* sizes = nullptr);
 
  protected:
-  HloRematerialization(SchedulerAlgorithm scheduler_algorithm,
+  HloRematerialization(MemorySchedulerAlgorithm scheduler_algorithm,
                        const ShapeSizeFunction& size_function)
       : scheduler_algorithm_(scheduler_algorithm),
         size_function_(size_function) {}
@@ -108,7 +108,7 @@ class HloRematerialization {
       const HloInstruction* instruction) const;
 
   // Selects an algorithm to use for HLO scheduling.
-  SchedulerAlgorithm scheduler_algorithm_;
+  MemorySchedulerAlgorithm scheduler_algorithm_;
 
   // Function which computes the size of the top-level buffer of a shape.
   const ShapeSizeFunction size_function_;
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
index 1b7d26dde501a6a0955d62ea0938e0683a32d49d..83de54f3fa56ee660b79d8c366dbc0b52f9fde87 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
@@ -162,7 +162,7 @@ TEST_F(HloRematerializationTest, SingleComputation) {
                           HloRematerialization::RematerializeAndSchedule(
                               ByteSizeOf,
                               /*memory_limit_bytes=*/14 * 1024, module.get(),
-                              SchedulerAlgorithm::kAuto, &sequence));
+                              DefaultMemoryScheduler, &sequence));
   EXPECT_TRUE(changed);
 
   // Root should not have changed.
@@ -195,7 +195,7 @@ TEST_F(HloRematerializationTest, SingleComputationNoRematerialization) {
                           HloRematerialization::RematerializeAndSchedule(
                               ByteSizeOf,
                               /*memory_limit_bytes=*/20 * 1024, module.get(),
-                              SchedulerAlgorithm::kAuto, &sequence));
+                              DefaultMemoryScheduler, &sequence));
 
   // No instructions should have been materialized.
   EXPECT_FALSE(changed);
@@ -236,7 +236,7 @@ TEST_F(HloRematerializationTest, RematerializeAroundWhile) {
                           HloRematerialization::RematerializeAndSchedule(
                               ByteSizeOf,
                               /*memory_limit_bytes=*/17 * 1024, module.get(),
-                              SchedulerAlgorithm::kAuto, &sequence));
+                              DefaultMemoryScheduler, &sequence));
   EXPECT_TRUE(changed);
 
   // Only the entry computation should have a rematerialized instruction added.
@@ -272,7 +272,7 @@ TEST_F(HloRematerializationTest, RematerializeEntryAndWhileBody) {
                           HloRematerialization::RematerializeAndSchedule(
                               ByteSizeOf,
                               /*memory_limit_bytes=*/15 * 1024, module.get(),
-                              SchedulerAlgorithm::kAuto, &sequence));
+                              DefaultMemoryScheduler, &sequence));
   EXPECT_TRUE(changed);
 
   // Both computations should have a rematerialized instruction added.
@@ -314,7 +314,7 @@ TEST_F(HloRematerializationTest, RematerializeNestedComputations) {
                           HloRematerialization::RematerializeAndSchedule(
                               ByteSizeOf,
                               /*memory_limit_bytes=*/13 * 1024, module.get(),
-                              SchedulerAlgorithm::kAuto, &sequence));
+                              DefaultMemoryScheduler, &sequence));
   EXPECT_TRUE(changed);
 
   // All computations should have a rematerialized instruction added.
@@ -385,7 +385,7 @@ TEST_F(HloRematerializationTest, RngNotRematerialized) {
       bool changed, HloRematerialization::RematerializeAndSchedule(
                         ByteSizeOf,
                         /*memory_limit_bytes=*/4 * ByteSizeOf(vec1024_shape_),
-                        module.get(), SchedulerAlgorithm::kAuto, &sequence));
+                        module.get(), DefaultMemoryScheduler, &sequence));
   EXPECT_TRUE(changed);
   // The rng should not have been rematerialized.
   EXPECT_EQ(count_rngs(entry_computation), 1);
@@ -480,7 +480,7 @@ TEST_F(HloRematerializationTest, InstructionRematerializedMultipleTimes) {
                           HloRematerialization::RematerializeAndSchedule(
                               ByteSizeOf,
                               /*memory_limit_bytes=*/22 * 1024, module.get(),
-                              SchedulerAlgorithm::kAuto, &sequence));
+                              DefaultMemoryScheduler, &sequence));
   EXPECT_TRUE(changed);
 
   // The broadcast should have been rematerialized 3 times.
@@ -577,7 +577,7 @@ TEST_P(IndirectUseTest, IndirectUseNotRematerialized) {
                           HloRematerialization::RematerializeAndSchedule(
                               ByteSizeOf,
                               /*memory_limit_bytes=*/22 * 1024, module.get(),
-                              SchedulerAlgorithm::kAuto, &sequence));
+                              DefaultMemoryScheduler, &sequence));
   // Rematerialization should only occur if the rematerializable instruction has
   // no indirect uses.
   if (indirectly_used) {
diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc
index e5b1c2efa3fc25d23531df298e125521c002dba1..ec7d8210a70ad7498f77fe807abd53544d4b0487 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.cc
+++ b/tensorflow/compiler/xla/service/hlo_runner.cc
@@ -52,10 +52,9 @@ namespace {
 // Creates an HloModule from the given proto.
 StatusOr<std::unique_ptr<HloModule>> HloProtoToModule(
     const HloProto& proto, const DebugOptions& debug_options) {
-  TF_ASSIGN_OR_RETURN(
-      HloModuleConfig config,
-      HloModule::CreateModuleConfigFromProto(proto.hlo_module()));
-  config.set_debug_options(debug_options);
+  TF_ASSIGN_OR_RETURN(HloModuleConfig config,
+                      HloModule::CreateModuleConfigFromProto(proto.hlo_module(),
+                                                             debug_options));
   TF_ASSIGN_OR_RETURN(auto module,
                       HloModule::CreateFromProto(proto.hlo_module(), config));
   return std::move(module);
diff --git a/tensorflow/compiler/xla/service/hlo_scheduling.cc b/tensorflow/compiler/xla/service/hlo_scheduling.cc
index da448ed71ab470e0c4d72e234bf1f1087d3ea7b4..1a767628f6e2d33df353366974fb866e89f0df5a 100644
--- a/tensorflow/compiler/xla/service/hlo_scheduling.cc
+++ b/tensorflow/compiler/xla/service/hlo_scheduling.cc
@@ -103,10 +103,11 @@ class ListScheduler {
     for (auto* instruction : computation.instructions()) {
       tensorflow::gtl::FlatSet<const LogicalBuffer*> instr_uses;
       for (auto* operand : instruction->operands()) {
-        for (const LogicalBuffer* buffer :
-             points_to_analysis.GetBuffersDefinedByInstruction(operand)) {
-          instr_uses.insert(buffer);
-        }
+        points_to_analysis.GetPointsToSet(operand).ForEachElement(
+            [&](const ShapeIndex& /*index*/,
+                const PointsToSet::BufferList& buffers) {
+              instr_uses.insert(buffers.begin(), buffers.end());
+            });
       }
       buffer_uses_[instruction] = std::vector<const LogicalBuffer*>(
           instr_uses.begin(), instr_uses.end());
@@ -339,7 +340,33 @@ int64 SumLogicalBufferSizes(
   return size;
 }
 
-StatusOr<std::vector<const HloInstruction*>> RunDFSMemoryScheduler(
+StatusOr<int64> MinimumMemoryForComputation(
+    const HloComputation& computation,
+    const std::vector<const HloInstruction*>& sequence,
+    const TuplePointsToAnalysis& points_to_analysis,
+    const LogicalBuffer::SizeFunction& size_function) {
+  TF_ASSIGN_OR_RETURN(
+      HeapSimulator::Result result,
+      HeapSimulator::Run(MakeUnique<NoFragmentationStatsHeap>(), computation,
+                         sequence, points_to_analysis, size_function));
+  return result.heap_size;
+}
+
+StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
+    const HloComputation& computation,
+    const TuplePointsToAnalysis& points_to_analysis,
+    const LogicalBuffer::SizeFunction& size_function,
+    const MemorySchedulerAlgorithm& algorithm) {
+  VLOG(2) << "Computation: " << computation.name();
+  if (algorithm) {
+    return algorithm(computation, points_to_analysis, size_function);
+  }
+  return DefaultMemoryScheduler(computation, points_to_analysis, size_function);
+}
+
+}  // namespace
+
+StatusOr<std::vector<const HloInstruction*>> DFSMemoryScheduler(
     const HloComputation& computation,
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_function) {
@@ -396,32 +423,17 @@ StatusOr<std::vector<const HloInstruction*>> RunDFSMemoryScheduler(
   return sequence;
 }
 
-StatusOr<int64> MinimumMemoryForComputation(
+StatusOr<std::vector<const HloInstruction*>> ListMemoryScheduler(
     const HloComputation& computation,
-    const std::vector<const HloInstruction*>& sequence,
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_function) {
-  TF_ASSIGN_OR_RETURN(
-      HeapSimulator::Result result,
-      HeapSimulator::Run(MakeUnique<NoFragmentationStatsHeap>(), computation,
-                         sequence, points_to_analysis, size_function));
-  return result.heap_size;
+  return ListScheduler::Run(computation, points_to_analysis, size_function);
 }
 
-StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
+StatusOr<std::vector<const HloInstruction*>> DefaultMemoryScheduler(
     const HloComputation& computation,
     const TuplePointsToAnalysis& points_to_analysis,
-    const LogicalBuffer::SizeFunction& size_function,
-    SchedulerAlgorithm algorithm) {
-  VLOG(2) << "Computation: " << computation.name();
-  if (algorithm == SchedulerAlgorithm::kListSchedule) {
-    return ListScheduler::Run(computation, points_to_analysis, size_function);
-  }
-  if (algorithm == SchedulerAlgorithm::kDfsSchedule) {
-    return RunDFSMemoryScheduler(computation, points_to_analysis,
-                                 size_function);
-  }
-
+    const LogicalBuffer::SizeFunction& size_function) {
   // We try both a list-scheduler based ordering and a DFS based ordering, and
   // choose whichever returns a lower min-memory, not accounting for
   // fragmentation.
@@ -431,7 +443,7 @@ StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
   // within the caller's context. But it's good enough for now.
   TF_ASSIGN_OR_RETURN(
       std::vector<const HloInstruction*> list_sequence,
-      ListScheduler::Run(computation, points_to_analysis, size_function));
+      ListMemoryScheduler(computation, points_to_analysis, size_function));
   TF_ASSIGN_OR_RETURN(
       const int64 list_memory,
       MinimumMemoryForComputation(computation, list_sequence,
@@ -440,7 +452,7 @@ StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
 
   TF_ASSIGN_OR_RETURN(
       std::vector<const HloInstruction*> dfs_sequence,
-      RunDFSMemoryScheduler(computation, points_to_analysis, size_function));
+      DFSMemoryScheduler(computation, points_to_analysis, size_function));
   TF_ASSIGN_OR_RETURN(
       const int64 dfs_memory,
       MinimumMemoryForComputation(computation, dfs_sequence, points_to_analysis,
@@ -458,12 +470,10 @@ StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
   }
 }
 
-}  // namespace
-
 StatusOr<SequentialHloOrdering::HloModuleSequence>
 CreateMemoryMinimizingSequence(const HloModule& module,
                                const LogicalBuffer::SizeFunction& size_function,
-                               SchedulerAlgorithm algorithm) {
+                               const MemorySchedulerAlgorithm& algorithm) {
   SequentialHloOrdering::HloModuleSequence sequence;
   TF_ASSIGN_OR_RETURN(std::unique_ptr<TuplePointsToAnalysis> points_to_analysis,
                       TuplePointsToAnalysis::Run(&module));
@@ -479,7 +489,7 @@ CreateMemoryMinimizingSequence(const HloModule& module,
 StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
     const HloComputation& computation,
     const LogicalBuffer::SizeFunction& size_function,
-    SchedulerAlgorithm algorithm) {
+    const MemorySchedulerAlgorithm& algorithm) {
   CHECK(!computation.IsFusionComputation());
   TF_ASSIGN_OR_RETURN(std::unique_ptr<TuplePointsToAnalysis> points_to_analysis,
                       TuplePointsToAnalysis::Run(computation.parent()));
diff --git a/tensorflow/compiler/xla/service/hlo_scheduling.h b/tensorflow/compiler/xla/service/hlo_scheduling.h
index 1d1eb1e064f75c2220b39e84b010e720a0c37880..068e68383deb170ded1c9b09a8b7ceb8c4c0ab4b 100644
--- a/tensorflow/compiler/xla/service/hlo_scheduling.h
+++ b/tensorflow/compiler/xla/service/hlo_scheduling.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_ordering.h"
 #include "tensorflow/compiler/xla/service/logical_buffer.h"
+#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 
@@ -33,28 +34,48 @@ StatusOr<int64> MinimumMemoryForSequence(
     const SequentialHloOrdering::HloModuleSequence& module_sequence,
     const LogicalBuffer::SizeFunction& size_function);
 
-enum class SchedulerAlgorithm {
-  kListSchedule,
-  kDfsSchedule,
+// A memory scheduler computes an execution sequence for the HLO instructions in
+// 'computation' that minimizes peak memory, given a points-to analysis result
+// that describes buffer aliasing, together with a target-specific size function
+// that maps a tensor's logical size to its padded size.
+typedef std::function<StatusOr<std::vector<const HloInstruction*>>(
+    const HloComputation&, const TuplePointsToAnalysis&,
+    const LogicalBuffer::SizeFunction&)>
+    MemorySchedulerAlgorithm;
 
-  // Selects the available scheduler algorithm that had the minimum memory in
-  // the resulting sequence (a la MinimumMemoryForSequence).
-  kAuto,
-};
+// List scheduler
+StatusOr<std::vector<const HloInstruction*>> ListMemoryScheduler(
+    const HloComputation& computation,
+    const TuplePointsToAnalysis& points_to_analysis,
+    const LogicalBuffer::SizeFunction& size_function);
+
+// DFS-order scheduler
+StatusOr<std::vector<const HloInstruction*>> DFSMemoryScheduler(
+    const HloComputation& computation,
+    const TuplePointsToAnalysis& points_to_analysis,
+    const LogicalBuffer::SizeFunction& size_function);
+
+// The default scheduling algorithm. Runs both the list scheduler
+// and the DFS scheduler, and chooses whichever returns a lower min-memory,
+// not accounting for fragmentation.
+StatusOr<std::vector<const HloInstruction*>> DefaultMemoryScheduler(
+    const HloComputation& computation,
+    const TuplePointsToAnalysis& points_to_analysis,
+    const LogicalBuffer::SizeFunction& size_function);
 
 // Returns an HloModuleSequence which seeks to minimize the memory required for
 // the computation. size_function is the function returning the number of bytes
 // required for a LogicalBuffer.
 StatusOr<SequentialHloOrdering::HloModuleSequence>
-CreateMemoryMinimizingSequence(
-    const HloModule& module, const LogicalBuffer::SizeFunction& size_function,
-    SchedulerAlgorithm algorithm = SchedulerAlgorithm::kAuto);
+CreateMemoryMinimizingSequence(const HloModule& module,
+                               const LogicalBuffer::SizeFunction& size_function,
+                               const MemorySchedulerAlgorithm& algorithm = {});
 
 // Overload of above that computes the sequence for a single computation.
 StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
     const HloComputation& computation,
     const LogicalBuffer::SizeFunction& size_function,
-    SchedulerAlgorithm algorithm = SchedulerAlgorithm::kAuto);
+    const MemorySchedulerAlgorithm& algorithm = {});
 
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/hlo_scheduling_test.cc b/tensorflow/compiler/xla/service/hlo_scheduling_test.cc
index 7fb338e7042ce19ac9647e23719e738f3ef42c7c..74544c4a67a819d341056aba4cf6b321a5a86c0a 100644
--- a/tensorflow/compiler/xla/service/hlo_scheduling_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_scheduling_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_ordering.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
@@ -89,5 +90,105 @@ TEST_F(MinimumMemoryForSequenceTest, MultiComputation) {
             MinimumMemoryForSequence(module_sequence, size_fn).ValueOrDie());
 }
 
+class HloSchedulingTest : public HloTestBase {};
+
+TEST_F(HloSchedulingTest, LastUseScheduledFirst) {
+  // Tests scheduling of the following HLO code:
+  //
+  //   %ab = abs(%param)
+  //   %exp = exp(%param)
+  //   %add = add(%ab, %exp)
+  //   %negate = negate(%exp)
+  //   %sub = subtract(%add, %negate)
+  //
+  // %add should be scheduled before %negate because %add is the last (and only)
+  // use of %ab. Scheduling %add first then frees up %ab's buffer.
+  const Shape vec = ShapeUtil::MakeShape(xla::F32, {42});
+  auto builder = HloComputation::Builder(TestName());
+  auto param =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, vec, "param"));
+  auto ab = builder.AddInstruction(
+      HloInstruction::CreateUnary(vec, HloOpcode::kAbs, param));
+  auto exp = builder.AddInstruction(
+      HloInstruction::CreateUnary(vec, HloOpcode::kExp, param));
+
+  auto add = builder.AddInstruction(
+      HloInstruction::CreateBinary(vec, HloOpcode::kAdd, ab, exp));
+  auto negate = builder.AddInstruction(
+      HloInstruction::CreateUnary(vec, HloOpcode::kNegate, exp));
+  auto sub = builder.AddInstruction(
+      HloInstruction::CreateBinary(vec, HloOpcode::kSubtract, add, negate));
+
+  auto module = CreateNewModule();
+  module->AddEntryComputation(builder.Build());
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      SequentialHloOrdering::HloModuleSequence sequence,
+      CreateMemoryMinimizingSequence(*module, [](const LogicalBuffer& buffer) {
+        return ShapeUtil::ByteSizeOf(buffer.shape());
+      }));
+  // Verify that all instructions are in the sequence.
+  EXPECT_EQ(module->entry_computation()->instruction_count(),
+            sequence.at(module->entry_computation()).size());
+
+  // The first instruction should be the parameter and the last the root "sub".
+  EXPECT_EQ(param, sequence.at(module->entry_computation()).front());
+  EXPECT_EQ(sub, sequence.at(module->entry_computation()).back());
+
+  SequentialHloOrdering ordering(module.get(), sequence);
+  EXPECT_TRUE(ordering.ExecutesBefore(add, negate));
+}
+
+TEST_F(HloSchedulingTest, ListSchedulerHandlesAliasing) {
+  const char* module_str = R"(
+HloModule test_aliasing_module
+
+ENTRY root {
+  param = s32[1000] parameter(0)
+  p0 = s32[1000] copy(param)
+  p1 = s32[1000] copy(param)
+  t = (s32[1000], s32[1000]) tuple(p0, p1)
+  a = s32[1000] get-tuple-element(t), index=0
+  b = s32[1000] get-tuple-element(t), index=1
+  c = s32[1000] add(a, b)
+  d = s32[1000] add(c, b)
+  e = s32[1000] add(c, c)
+  f = s32[1000] add(e, e)
+  ROOT result = (s32[1000], s32[1000], s32[1000]) tuple(d, e, f)
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          tools::Parse(module_str));
+
+  auto size_fn = [](const LogicalBuffer& buffer) {
+    return ShapeUtil::ByteSizeOf(buffer.shape(), /*pointer_size=*/8);
+  };
+  TF_ASSERT_OK_AND_ASSIGN(
+      SequentialHloOrdering::HloModuleSequence sequence,
+      CreateMemoryMinimizingSequence(*module, size_fn, ListMemoryScheduler));
+  // Verify that all instructions are in the sequence.
+  EXPECT_EQ(module->entry_computation()->instruction_count(),
+            sequence.at(module->entry_computation()).size());
+
+  std::unordered_map<string, const HloInstruction*> instructions_by_name;
+  for (const HloInstruction* instruction :
+       sequence.at(module->entry_computation())) {
+    instructions_by_name[instruction->name()] = instruction;
+  }
+
+  // The first instruction should be the parameter and the last the root.
+  EXPECT_EQ(instructions_by_name.at("param"),
+            sequence.at(module->entry_computation()).front());
+  EXPECT_EQ(instructions_by_name.at("result"),
+            sequence.at(module->entry_computation()).back());
+
+  // Instructions "d" and "e" will both be schedulable at the same time, but
+  // instruction "d" allows us to free the buffer of "p1", so the list scheduler
+  // should prefer it.
+  SequentialHloOrdering ordering(module.get(), sequence);
+  EXPECT_TRUE(ordering.ExecutesBefore(instructions_by_name.at("d"),
+                                      instructions_by_name.at("e")));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_sharding.cc b/tensorflow/compiler/xla/service/hlo_sharding.cc
index aa9ff89e983aa5d35a18906afca1c6e8eeaefa06..e8e45f1ee968992901988e8b85d4e9ae28f2abe9 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding.cc
@@ -20,6 +20,7 @@ limitations under the License.
 
 namespace xla {
 
+using ::tensorflow::str_util::Join;
 using ::tensorflow::strings::StrCat;
 
 HloSharding HloSharding::AssignDevice(int64 device_id) {
@@ -57,8 +58,9 @@ string HloSharding::ToString() const {
     return StrCat(
         "{maximal device=", static_cast<int64>(*tile_assignment_.begin()), "}");
   } else {
-    return StrCat("{", ShapeUtil::HumanString(tile_shape_), " ",
-                  "devices=", VectorString(tile_assignment_), "}");
+    return StrCat("{", ShapeUtil::HumanString(tile_shape_), " ", "devices=[",
+                  Join(tile_assignment_.dimensions(), ","), "]",
+                  Join(tile_assignment_, ","), "}");
   }
 }
 
@@ -374,4 +376,9 @@ HloSharding HloSharding::TransformShardedTileShape(
   return HloSharding::Tile(new_tile_shape, tile_assignment());
 }
 
+std::ostream& operator<<(std::ostream& out, const HloSharding& sharding) {
+  out << sharding.ToString();
+  return out;
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_sharding.h b/tensorflow/compiler/xla/service/hlo_sharding.h
index e715dff9a0b8fcc2301a1581919dba384206923c..06204acbca30648e73382cb4641139e852664b77 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.h
+++ b/tensorflow/compiler/xla/service/hlo_sharding.h
@@ -94,6 +94,10 @@ class HloSharding {
   // Create a new sharding from a protobuf OpSharding.
   static StatusOr<HloSharding> FromProto(const OpSharding& proto);
 
+  // Checks whether device is a reserved device number. A reserved device number
+  // has usually a special meaning, with dedicated handling logic.
+  static bool IsReservedDevice(int64 device) { return device < 0; }
+
   OpSharding ToProto() const;
   string ToString() const;
 
@@ -173,7 +177,7 @@ class HloSharding {
 
   bool operator==(const HloSharding& other) const {
     return replicated_ == other.replicated_ && maximal_ == other.maximal_ &&
-           protobuf_util::ProtobufEquals(tile_shape_, other.tile_shape_) &&
+           ShapeUtil::Compatible(tile_shape_, other.tile_shape_) &&
            tile_assignment_ == other.tile_assignment_ &&
            tuple_elements_ == other.tuple_elements_;
   }
@@ -207,6 +211,13 @@ class HloSharding {
   // REQUIRES: !IsReplicated() && !IsTuple()
   const Array<int64>& tile_assignment() const { return tile_assignment_; }
 
+  // Returns the flattened list of all the leaf shardings in a tuple shape, by
+  // pre-order walk (ShapeTree iterator order).
+  // REQUIRES: IsTuple().
+  const std::vector<HloSharding>& tuple_elements() const {
+    return tuple_elements_;
+  }
+
   // Return a new sharding that can apply to the given new shape.
   // If this sharding is tile-maximal, the returned sharding will be the same as
   // this sharding. If this sharding is not tile-maximal, the returned
@@ -262,6 +273,8 @@ class HloSharding {
   std::vector<HloSharding> tuple_elements_;
 };
 
+std::ostream& operator<<(std::ostream& out, const HloSharding& sharding);
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_SHARDING_H_
diff --git a/tensorflow/compiler/xla/service/hlo_sharding_test.cc b/tensorflow/compiler/xla/service/hlo_sharding_test.cc
index 07fc4687cc1c0518b3ab2a86c62464fc54082a01..69ea4233e45c2e59c8d1541a0517a007f4bbf42f 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding_test.cc
@@ -282,5 +282,44 @@ TEST_F(HloShardingTest, TransformShardedTileShapeTest) {
   EXPECT_EQ(result, expected);
 }
 
+TEST_F(HloShardingTest, ToStringReplicatedTest) {
+  HloSharding sharding = HloSharding::Replicate();
+  EXPECT_EQ(sharding.ToString(), "{replicated}");
+}
+
+TEST_F(HloShardingTest, ToStringAssignDeviceTest) {
+  HloSharding sharding = HloSharding::AssignDevice(7);
+  EXPECT_EQ(sharding.ToString(), "{maximal device=7}");
+}
+
+TEST_F(HloShardingTest, ToStringTiledTest) {
+  HloSharding sharding =
+      HloSharding::Tile(ShapeUtil::MakeShape(S32, {7, 11, 13}),
+                        Array3D<int64>({{{2, 3}}, {{5, 7}}}));
+  EXPECT_EQ(sharding.ToString(), "{s32[7,11,13] devices=[2,1,2]2,3,5,7}");
+}
+
+TEST_F(HloShardingTest, ToStringTupleTest) {
+  HloSharding sharding = HloSharding::Tuple(
+      ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {3, 5}),
+                                 ShapeUtil::MakeShape(U32, {7, 25}),
+                                 ShapeUtil::MakeShape(S32, {9, 11})}),
+      {HloSharding::Replicate(),
+       HloSharding::Tile(ShapeUtil::MakeShape(U32, {7, 13}),
+                         Array2D<int64>({{3, 5}})),
+       HloSharding::AssignDevice(3)});
+  EXPECT_EQ(sharding.ToString(),
+            "{{replicated}, {u32[7,13] devices=[1,2]3,5}, {maximal device=3}}");
+}
+
+TEST_F(HloShardingTest, OstreamTest) {
+  HloSharding sharding =
+      HloSharding::Tile(ShapeUtil::MakeShape(F32, {3, 5, 7, 11}),
+                        Array4D<int64>({{{{0, 1}, {2, 3}}}}));
+  std::ostringstream oss;
+  oss << sharding;
+  EXPECT_EQ(oss.str(), "{f32[3,5,7,11] devices=[1,1,2,2]0,1,2,3}");
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/interpreter/BUILD b/tensorflow/compiler/xla/service/interpreter/BUILD
index 0819ab3b90b2360c6b0b2afaa89f322afe566eb3..45505484951abfcee93a62fec7a99e86cbb9150c 100644
--- a/tensorflow/compiler/xla/service/interpreter/BUILD
+++ b/tensorflow/compiler/xla/service/interpreter/BUILD
@@ -63,10 +63,7 @@ cc_library(
     name = "platform_id",
     srcs = ["platform_id.cc"],
     hdrs = ["platform_id.h"],
-    deps = [
-        "@nsync//:nsync_headers",
-        "//tensorflow/core:stream_executor_headers_lib",
-    ] + if_static(
+    deps = ["//tensorflow/core:stream_executor_headers_lib"] + if_static(
         ["@protobuf_archive//:protobuf"],
         ["@protobuf_archive//:protobuf_headers"],
     ),
@@ -123,14 +120,3 @@ cc_library(
         "//tensorflow/core:stream_executor_no_cuda",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/compiler/xla/service/llvm_ir/BUILD b/tensorflow/compiler/xla/service/llvm_ir/BUILD
index 37261ed1e665ebed9685751161a412ad114a9e96..f1e7fc29532ce7e6841010a5258f4000a7c70383 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/BUILD
+++ b/tensorflow/compiler/xla/service/llvm_ir/BUILD
@@ -169,17 +169,3 @@ cc_library(
         "@llvm//:core",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
index 2a282f3be79f847a6569416794d1a2a3fcd69148..ec04239b4f9112134ba876fdfbb3905a3baf1f72 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
@@ -34,6 +34,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
@@ -762,7 +763,7 @@ void InitializeLLVMCommandLineOptions(const HloModuleConfig& config) {
     fake_argv_storage.push_back("");
     for (const auto& it : options) {
       // Skip options the XLA backend itself consumes.
-      if (!tensorflow::StringPiece(it.first).starts_with("xla_")) {
+      if (!tensorflow::str_util::StartsWith(it.first, "xla_")) {
         if (it.second.empty()) {
           fake_argv_storage.push_back(it.first);
         } else {
diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc
index 07f989d4faea199e812e54d2ae74d3ff9e7fa19a..499f280211aacd00e79b3ca0ddb3413f933b02da 100644
--- a/tensorflow/compiler/xla/service/local_service.cc
+++ b/tensorflow/compiler/xla/service/local_service.cc
@@ -69,6 +69,68 @@ LocalService::LocalService(const ServiceOptions& options,
                            std::unique_ptr<Backend> execute_backend)
     : Service(options, std::move(execute_backend)) {}
 
+namespace {
+
+// Retrieves the parameter metadata for the given computation and parameter
+// number.
+//
+// If the parameter number is invalid for this computation, nullopt is
+// returned. When the return value has_value(), nullptr will never be
+// the held value.
+tensorflow::gtl::optional<const OpMetadata*> ParameterMetadata(
+    const XlaComputation& computation, int parameter_number) {
+  for (const HloComputationProto& comp : computation.proto().computations()) {
+    if (comp.id() == computation.proto().entry_computation_id()) {
+      for (const HloInstructionProto& instr : comp.instructions()) {
+        if (instr.opcode() == HloOpcodeString(HloOpcode::kParameter) &&
+            instr.parameter_number() == parameter_number) {
+          if (!instr.has_metadata()) {
+            return tensorflow::gtl::nullopt;
+          }
+          return &instr.metadata();
+        }
+      }
+    }
+  }
+  return tensorflow::gtl::nullopt;
+}
+
+ExecutionOptions CreateExecutionOptions(
+    const ExecutableBuildOptions& build_options,
+    const ProgramShape* program_shape) {
+  ExecutionOptions execution_options = CreateDefaultExecutionOptions();
+  if (build_options.hlo_profile().has_value()) {
+    execution_options.mutable_debug_options()->set_xla_hlo_profile(
+        *build_options.hlo_profile());
+  }
+  if (build_options.generate_hlo_graph().has_value()) {
+    execution_options.mutable_debug_options()->set_xla_generate_hlo_graph(
+        build_options.generate_hlo_graph().value());
+  }
+  if (build_options.dump_optimized_hlo_proto_to().has_value()) {
+    execution_options.mutable_debug_options()
+        ->set_xla_dump_optimized_hlo_proto_to(
+            build_options.dump_optimized_hlo_proto_to().value());
+  }
+  if (build_options.dump_per_pass_hlo_proto_to().has_value()) {
+    execution_options.mutable_debug_options()
+        ->set_xla_dump_per_pass_hlo_proto_to(
+            build_options.dump_per_pass_hlo_proto_to().value());
+  }
+  if (build_options.result_layout() != nullptr) {
+    *execution_options.mutable_shape_with_output_layout() =
+        *build_options.result_layout();
+  } else {
+    *execution_options.mutable_shape_with_output_layout() =
+        program_shape->result();
+    LayoutUtil::SetToDefaultLayout(
+        execution_options.mutable_shape_with_output_layout());
+  }
+  return execution_options;
+}
+
+}  // namespace
+
 StatusOr<std::unique_ptr<Executable>> LocalService::CompileExecutable(
     const ComputationHandle& computation,
     const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
@@ -118,30 +180,78 @@ StatusOr<std::unique_ptr<Executable>> LocalService::CompileExecutable(
         *build_options.result_layout(), program_shape->result()));
   }
 
-  ExecutionOptions execution_options = CreateDefaultExecutionOptions();
-  if (build_options.generate_hlo_graph().has_value()) {
-    execution_options.mutable_debug_options()->set_xla_generate_hlo_graph(
-        build_options.generate_hlo_graph().value());
+  ExecutionOptions execution_options =
+      CreateExecutionOptions(build_options, program_shape.get());
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
+                      CreateModuleConfig(*program_shape, argument_layouts,
+                                         &execution_options, user_computation));
+
+  TF_ASSIGN_OR_RETURN(
+      se::StreamExecutor * executor,
+      execute_backend_->stream_executor(build_options.device_ordinal()));
+
+  return BuildExecutable(versioned_handle, std::move(module_config),
+                         execute_backend_.get(), executor,
+                         build_options.device_allocator());
+}
+
+StatusOr<std::unique_ptr<Executable>> LocalService::CompileExecutable(
+    const XlaComputation& computation,
+    const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
+    const ExecutableBuildOptions& build_options) {
+  const HloModuleProto& proto = computation.proto();
+  TF_RET_CHECK(proto.has_program_shape());
+  const ProgramShape& program_shape = proto.program_shape();
+
+  // Validate incoming layouts.
+  if (argument_layouts.size() != program_shape.parameters_size()) {
+    return InvalidArgument(
+        "Invalid number of arguments for computation: expected %d, got %zu.",
+        program_shape.parameters_size(), argument_layouts.size());
+  }
+
+  for (int i = 0; i < argument_layouts.size(); ++i) {
+    const Shape& argument_shape = *argument_layouts[i];
+    TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(argument_shape));
+    if (!ShapeUtil::Compatible(argument_shape, program_shape.parameters(i))) {
+      tensorflow::gtl::optional<const OpMetadata*> metadata =
+          ParameterMetadata(computation, /*parameter_number=*/i);
+      auto metadata_string = [&metadata]() -> string {
+        if (!metadata.has_value()) {
+          return "";
+        }
+        CHECK(metadata.value() != nullptr);
+        const OpMetadata& m = *metadata.value();
+        if (!m.source_file().empty()) {
+          return tensorflow::strings::Printf(
+              " (%s:%d)", m.source_file().c_str(), m.source_line());
+        }
+        return "";
+      };
+      return InvalidArgument(
+          "Invalid argument shape for argument %d%s, expected %s, got %s.", i,
+          metadata_string().c_str(),
+          ShapeUtil::HumanString(program_shape.parameters(i)).c_str(),
+          ShapeUtil::HumanString(argument_shape).c_str());
+    }
   }
   if (build_options.result_layout() != nullptr) {
-    *execution_options.mutable_shape_with_output_layout() =
-        *build_options.result_layout();
-  } else {
-    *execution_options.mutable_shape_with_output_layout() =
-        program_shape->result();
-    LayoutUtil::SetToDefaultLayout(
-        execution_options.mutable_shape_with_output_layout());
+    TF_RETURN_IF_ERROR(ValidateResultShapeWithLayout(
+        *build_options.result_layout(), program_shape.result()));
   }
+
+  ExecutionOptions execution_options =
+      CreateExecutionOptions(build_options, &program_shape);
+
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<HloModuleConfig> module_config,
-      CreateModuleConfig(*program_shape, argument_layouts, &execution_options,
-                         *user_computation));
+      CreateModuleConfig(program_shape, argument_layouts, &execution_options));
 
   TF_ASSIGN_OR_RETURN(
       se::StreamExecutor * executor,
       execute_backend_->stream_executor(build_options.device_ordinal()));
 
-  return BuildExecutable(versioned_handle, std::move(module_config),
+  return BuildExecutable(proto, std::move(module_config),
                          execute_backend_.get(), executor,
                          build_options.device_allocator());
 }
diff --git a/tensorflow/compiler/xla/service/local_service.h b/tensorflow/compiler/xla/service/local_service.h
index 15e120685e1be9190d49fdaf5ed6706bdf991a6c..06567cabd6eb28aae53881613cd6beb78e25e222 100644
--- a/tensorflow/compiler/xla/service/local_service.h
+++ b/tensorflow/compiler/xla/service/local_service.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/compiler/xla/client/executable_build_options.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/service/backend.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
@@ -50,6 +51,18 @@ class LocalService : public Service {
       const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
       const ExecutableBuildOptions& options);
 
+  // Builds an Executable with the given XlaComputation, argument layouts and
+  // options. If result_layout is non-null, then the executable is compiled to
+  // produce a result of the given layout.  If device_allocator is non-null,
+  // then the compiler may use it to allocate temp space on the device.  The
+  // compiler is responsible for freeing any memory it allocates this way.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  StatusOr<std::unique_ptr<Executable>> CompileExecutable(
+      const XlaComputation& computation,
+      const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
+      const ExecutableBuildOptions& build_options);
+
   // Returns the device ordinal that corresponds to the given replica number.
   //
   // This returns an error if there is not a one-to-one correspondence of
diff --git a/tensorflow/compiler/xla/service/reshape_mover.cc b/tensorflow/compiler/xla/service/reshape_mover.cc
index e62bafc50b0e1270702621c9ea7b2ee43e001fe0..49ec38eb62c7b51c7a2d301d882cef032b288036 100644
--- a/tensorflow/compiler/xla/service/reshape_mover.cc
+++ b/tensorflow/compiler/xla/service/reshape_mover.cc
@@ -53,8 +53,8 @@ bool IsReshapeOrTranspose(const HloInstruction* instruction) {
          instruction->opcode() == HloOpcode::kTranspose;
 }
 
-// Returns true iff `instruction` can change its shape simply by adjusting
-// metadata.
+// Returns true if `instruction` can change its shape simply by adjusting
+// metadata or if `instruction` is a broadcast of a scalar value.
 bool CanTriviallyChangeShape(const HloInstruction* instruction) {
   // NOTE: Technically a sequence of reshape(reshape(constant)) is also
   // trivially reshapable, so we might be tempted to simply recurse if
@@ -88,19 +88,31 @@ bool CanTriviallyChangeShape(const HloInstruction* instruction) {
       instruction->user_count() == 1) {
     return true;
   }
+
+  // A broadcase of scalar can trivially change its shape.
+  if (instruction->opcode() == HloOpcode::kBroadcast &&
+      ShapeUtil::IsScalar(instruction->operand(0)->shape())) {
+    return true;
+  }
+
   return false;
 }
 
-// Finds the first non-scalar operand of an instruction that is a non-trivial
-// reshape or transpose. Returns the operand if it is found or nullptr if not
-// found.
+// Returns true iff `instruction` is a reshape/transpose instruction for which
+// a shape change is nontrivial.
+bool IsNontrivialReshape(const HloInstruction* instruction) {
+  return !ShapeUtil::IsScalar(instruction->shape()) &&
+         IsReshapeOrTranspose(instruction) &&
+         !CanTriviallyChangeShape(instruction->operand(0));
+}
+
+// Finds the first operand of an instruction that is a non-trivial reshape or
+// transpose. Returns such an operand or nullptr if not found.
 HloInstruction* FirstNonScalarAndNonTrivialReshapeOperand(
     const HloInstruction* hlo) {
   for (HloInstruction* operand : hlo->operands()) {
-    if (!ShapeUtil::IsScalar(operand->shape()) &&
-        IsReshapeOrTranspose(operand) &&
-        !CanTriviallyChangeShape(operand->operand(0))) {
-      VLOG(5) << "Found first non-scalar and non-trivial reshape operand of "
+    if (IsNontrivialReshape(operand)) {
+      VLOG(5) << "Found first non-trivial reshape operand of "
               << hlo->ToString(HloPrintOptions().set_print_metadata(false))
               << ":\n\t"
               << operand->ToString(HloPrintOptions().set_print_metadata(false));
@@ -110,7 +122,7 @@ HloInstruction* FirstNonScalarAndNonTrivialReshapeOperand(
   return nullptr;
 }
 
-// Returns whether `a` and `b` are equivalent for the purposes of this pass.
+// Returns whether `a` and `b` are equivalent reshapes/transposes.
 bool AreEquivalentReshapes(const HloInstruction* a, const HloInstruction* b) {
   if (a->opcode() != b->opcode() ||
       !ShapeUtil::SameDimensions(a->shape(), b->shape())) {
@@ -127,71 +139,14 @@ bool AreEquivalentReshapes(const HloInstruction* a, const HloInstruction* b) {
   }
 }
 
-// Returns true if all operands of `instruction` can easily change shape.
-// Operands can easily change shape if they are all reshapes/transposes to and
-// from the same shape. Additionally, operands like constant, rng, and any
-// scalar change shape with only an adjustment of metadata.
-bool AllOperandsHaveEasyShapeChanges(
-    const HloInstruction* instruction,
-    const HloInstruction* first_reshape_operand) {
-  auto print_no_metadata = HloPrintOptions().set_print_metadata(false);
-  VLOG(3) << "** Checking whether all operands have easy shape changes: "
-          << instruction->ToString(print_no_metadata);
-  // Check whether all operands:
-  //    0. Have the same dimensions as the output -- if not, it may be
-  //       implicitly broadcast, which can confound the movement's
-  //       correctness.
-  //
-  // And one of the following:
-  //    1. Are reshapes or transposes that have the same input and
-  //       output shapes as all other reshaped or transposed operands.
-  //     or
-  //    2. Are one of kConstant, kRng, and scalars that can change shape
-  //    trivially,
-  for (const HloInstruction* operand : instruction->operands()) {
-    if (!ShapeUtil::SameDimensions(operand->shape(), instruction->shape())) {
-      VLOG(5) << "Operand shape differs from output shape; may be "
-                 "implicitly broadcast, so preventing "
-                 "movement\n\toperand: "
-              << operand->ToString(print_no_metadata) << "\n\tinstruction: "
-              << instruction->ToString(print_no_metadata);
-      return false;
-    }
-
-    if (AreEquivalentReshapes(first_reshape_operand, operand)) {
-      VLOG(5) << "Are equivalent reshapes:\n\tfirst_reshape_operand: "
-              << first_reshape_operand->ToString(print_no_metadata)
-              << "\n\toperand: " << operand->ToString(print_no_metadata);
-      continue;
-    }
-
-    if (CanTriviallyChangeShape(operand)) {
-      VLOG(5) << "Operand can trivially change shape: "
-              << operand->ToString(print_no_metadata);
-      continue;
-    }
-
-    // TODO(someone): Look into supporting general ops for the operands as
-    // well.
-    VLOG(5) << "Operand is neither equalivant to the first Reshape operand"
-               "nor can trivially change shape: "
-            << operand->ToString(print_no_metadata);
-    return false;
-  }
-
-  VLOG(3) << "All operands have easy shape changes: "
-          << instruction->ToString(print_no_metadata);
-  return true;
-}
-
 // This function is called once we've decided to sink reshape/transpose operands
 // across an instruction. It returns an updated `operand` with a shape that
 // plays nicely with `new_operand_shape`; either it has the same shape (of the
 // correct type), or it is a scalar that may be implicitly broadcast.
-HloInstruction* UpdateOperand(HloComputation* computation,
-                              const HloInstruction* first_reshape_operand,
+HloInstruction* UpdateOperand(const HloInstruction* first_reshape_operand,
                               const Shape& new_operand_shape,
                               HloInstruction* operand) {
+  HloComputation* computation = operand->parent();
   const PrimitiveType element_type = operand->shape().element_type();
   const Shape new_shape =
       ShapeUtil::ChangeElementType(new_operand_shape, element_type);
@@ -222,36 +177,24 @@ HloInstruction* UpdateOperand(HloComputation* computation,
       VLOG(5) << "Using existing operand of kReshape or kTranspose";
       return operand->mutable_operand(0);
     }
+    case HloOpcode::kBroadcast: {
+      CHECK(ShapeUtil::IsScalar(operand->operand(0)->shape()));
+      HloInstruction* inst = computation->AddInstruction(
+          operand->CloneWithNewOperands(new_shape, operand->operands()));
+      VLOG(5) << "Changing broadcast from " << operand->ToString() << " to "
+              << inst->ToString();
+      return inst;
+    }
+
     default:
       LOG(FATAL) << "Unexpected operand opcode during update: " << operand;
   }
 }
 
-// Try to sink any reshape or transpose operands of `instruction` across it. We
-// do so if `instruction` is elementwise and all operands are either equivalent
-// reshapes/transposes or are trivially reshapable.
-StatusOr<bool> TrySinkReshapeOrTranspose(HloComputation* computation,
-                                         HloInstruction* instruction) {
-  // Only perform sinks for live elementwise instructions with operands.
-  const bool is_dead = instruction->user_count() == 0 &&
-                       instruction != computation->root_instruction();
-  if (!instruction->IsElementwise() || instruction->operands().empty() ||
-      is_dead) {
-    return false;
-  }
-
-  // Only perform sinks if there are any nontrivial reshape/transpose operands.
-  const HloInstruction* first_reshape_operand =
-      FirstNonScalarAndNonTrivialReshapeOperand(instruction);
-  if (!first_reshape_operand) {
-    return false;
-  }
-
-  // Only perform sinks if all operands can easily change shape.
-  if (!AllOperandsHaveEasyShapeChanges(instruction, first_reshape_operand)) {
-    return false;
-  }
-
+// Actually performs the reshape-move transformation -- that is, sinks the
+// reshape or transpose operands of `instruction` across it.
+StatusOr<bool> PerformSinkReshapeOrTranspose(
+    HloInstruction* instruction, const HloInstruction* first_reshape_operand) {
   auto print_no_metadata = HloPrintOptions().set_print_metadata(false);
   // At this point we've decided to sink reshape/transpose operands.
   const Shape& new_operand_shape = first_reshape_operand->operand(0)->shape();
@@ -272,8 +215,8 @@ StatusOr<bool> TrySinkReshapeOrTranspose(HloComputation* computation,
     }
     VLOG(3) << "Updating operand #" << i << ": "
             << operands[i]->ToString(print_no_metadata);
-    operands[i] = UpdateOperand(computation, first_reshape_operand,
-                                new_operand_shape, operands[i]);
+    operands[i] =
+        UpdateOperand(first_reshape_operand, new_operand_shape, operands[i]);
   }
   if (HloOpcode::kFusion == instruction->opcode()) {
     // Here we already know `instruction` is elementwise, and no operand is
@@ -285,6 +228,7 @@ StatusOr<bool> TrySinkReshapeOrTranspose(HloComputation* computation,
       *shape->mutable_layout() = new_operand_shape.layout();
     }
   }
+  HloComputation* computation = instruction->parent();
   HloInstruction* new_elementwise =
       computation->AddInstruction(instruction->CloneWithNewOperands(
           // `instruction` may change the element type, e.g., from
@@ -319,6 +263,141 @@ StatusOr<bool> TrySinkReshapeOrTranspose(HloComputation* computation,
   return true;
 }
 
+// Returns true if the instruction is a reshape-move candidate.
+//
+// An instruction is a reshape-move candidate if the instruction is elementwise,
+// has at least one nontrivial reshape/transpose operand, and its operands are
+// either trivially reshapable or are equivalent nontrivial reshapes/transposes.
+bool IsReshapeMoveCandidate(HloInstruction* instruction) {
+  auto print_no_metadata = HloPrintOptions().set_print_metadata(false);
+  VLOG(5) << "** Checking instruction: "
+          << instruction->ToString(print_no_metadata);
+
+  // Only perform reshape-move for live elementwise instructions with operands.
+  const bool is_dead = instruction->user_count() == 0 &&
+                       instruction != instruction->parent()->root_instruction();
+  if (!instruction->IsElementwise() || instruction->operands().empty() ||
+      is_dead) {
+    return false;
+  }
+
+  // Check whether all operands:
+  //    0. Have the same dimensions as the output -- if not, they may be
+  //       implicitly broadcast, which can confound the movement's
+  //       correctness.
+  //
+  // And one of the following:
+  //    1. Are reshapes or transposes that have the same input and
+  //       output shapes as all other reshaped or transposed operands.
+  //     or
+  //    2. Are one of kConstant, kRng, broadcast of a scalar value, and scalars
+  //     that can change shape trivially.
+  const HloInstruction* first_reshape_operand = nullptr;
+  for (const HloInstruction* operand : instruction->operands()) {
+    if (!ShapeUtil::SameDimensions(operand->shape(), instruction->shape())) {
+      VLOG(5) << "Operand shape differs from output shape; may be "
+                 "implicitly broadcast, so preventing "
+                 "movement\n\toperand: "
+              << operand->ToString(print_no_metadata) << "\n\tinstruction: "
+              << instruction->ToString(print_no_metadata);
+      return false;
+    }
+
+    if (CanTriviallyChangeShape(operand)) {
+      VLOG(5) << "Operand can trivially change shape: "
+              << operand->ToString(print_no_metadata);
+      continue;
+    }
+
+    if (!IsNontrivialReshape(operand)) {
+      VLOG(5) << "Operand can't trivially change shape: "
+              << operand->ToString(print_no_metadata);
+      return false;
+    }
+
+    if (first_reshape_operand == nullptr) {
+      first_reshape_operand = operand;
+      VLOG(5) << "First reshape operand "
+              << operand->ToString(print_no_metadata);
+    } else if (AreEquivalentReshapes(first_reshape_operand, operand)) {
+      VLOG(5)
+          << "Operand is an equivalent reshape of the first reshape operand "
+          << operand->ToString(print_no_metadata);
+    } else {
+      // TODO(someone): Look into supporting general ops for the operands as
+      // well.
+      VLOG(5) << "Operand is a reshape but is not equivalent to the first "
+                 "Reshape operand"
+              << operand->ToString(print_no_metadata);
+      return false;
+    }
+  }
+
+  if (first_reshape_operand) {
+    VLOG(5) << "All operands have easy shape changes: "
+            << instruction->ToString(print_no_metadata);
+  }
+
+  return first_reshape_operand != nullptr;
+}
+
+// Reshape-moves all qualifying instructions in reshape_candidates.  Returns
+// true if it makes changes.
+//
+// `reshape_candidates` is a set of HloInstructions with nontrivial reshape
+// operands, and a instruction in the set can be reshape-moved iff all the users
+// of its nontrivial reshape operands can also be reshaped-moved.
+//
+// The algorithm here iteratively finds the nontrivial operands with users that
+// are outside the set of `reshape_candidates`, and removes their users from
+// `reshape_candidates`, until either `reshape_candidates` becomes empty or none
+// of the remaining nontrivial operands have users outside `reshape_candidates`.
+// In the later case, all the remaining instructions in `reshape_candidates`
+// are reshape-moved and the routine returns true.
+StatusOr<bool> TryReshapeMoveOnCandidates(
+    HloInstructionSet* reshape_candidates) {
+  bool removed = true;
+  while (!reshape_candidates->empty() && removed) {
+    if (VLOG_IS_ON(5)) {
+      for (const HloInstruction* instruction : *reshape_candidates) {
+        VLOG(5) << "candidate " << instruction->ToString();
+      }
+    }
+    ConstHloInstructionSet nontrivial_operands;
+    for (const HloInstruction* instruction : *reshape_candidates) {
+      for (const auto* operand : instruction->operands()) {
+        if (IsNontrivialReshape(operand)) {
+          nontrivial_operands.insert(operand);
+        }
+      }
+    }
+
+    removed = false;
+    for (auto operand : nontrivial_operands) {
+      if (c_any_of(operand->users(), [&](HloInstruction* user) {
+            return !reshape_candidates->count(user);
+          })) {
+        for (auto* user : operand->users()) {
+          removed |= reshape_candidates->erase(user) > 0;
+        }
+      }
+    }
+  }
+
+  if (reshape_candidates->empty()) {
+    return false;
+  }
+  for (HloInstruction* instruction : *reshape_candidates) {
+    const HloInstruction* first_reshape_operand =
+        FirstNonScalarAndNonTrivialReshapeOperand(instruction);
+    TF_ASSIGN_OR_RETURN(
+        bool did_change,
+        PerformSinkReshapeOrTranspose(instruction, first_reshape_operand));
+    CHECK(did_change);
+  }
+  return true;
+}
+
 }  // namespace
 
 StatusOr<bool> ReshapeMover::Run(HloModule* module) {
@@ -326,11 +405,15 @@ StatusOr<bool> ReshapeMover::Run(HloModule* module) {
   VLOG(2) << "Pre ReshapeMover HLO:";
   XLA_VLOG_LINES(2, module->ToString());
   for (auto* comp : module->MakeNonfusionComputations()) {
-    for (HloInstruction* instruction : comp->MakeInstructionPostOrder()) {
-      TF_ASSIGN_OR_RETURN(bool did_change,
-                          TrySinkReshapeOrTranspose(comp, instruction));
-      changed |= did_change;
+    HloInstructionSet reshape_candidates;
+    for (HloInstruction* instruction : comp->instructions()) {
+      if (IsReshapeMoveCandidate(instruction)) {
+        reshape_candidates.insert(instruction);
+      }
     }
+    TF_ASSIGN_OR_RETURN(bool did_change,
+                        TryReshapeMoveOnCandidates(&reshape_candidates));
+    changed |= did_change;
   }
   VLOG(2) << "Post ReshapeMover HLO:";
   XLA_VLOG_LINES(2, module->ToString());
diff --git a/tensorflow/compiler/xla/service/reshape_mover_test.cc b/tensorflow/compiler/xla/service/reshape_mover_test.cc
index aac8638a54f744f0c230ec6c5ca071c1daf45ab2..094f7319f462a71f4bfe972771a1de4aedbb8ee3 100644
--- a/tensorflow/compiler/xla/service/reshape_mover_test.cc
+++ b/tensorflow/compiler/xla/service/reshape_mover_test.cc
@@ -560,5 +560,95 @@ TEST_F(ReshapeMoverTest, MultiplePasses) {
       op::Reshape(op::Add(param2, op::Reshape(op::Add(param0, param1)))));
 }
 
+TEST_F(ReshapeMoverTest, SinkTransposeAcrossBroadcastScalar) {
+  const string hlo_string = R"(
+    HloModule TransposeMulInversedTransposeModule
+    ENTRY TransposeMulInversedTranspose {
+      src0 = f32[20,8]{1,0} parameter(0)
+      transpose0 = f32[8,20]{1,0} transpose(src0), dimensions={1,0}
+      src1 = f32[] parameter(1)
+      broadcast0 = f32[8,20]{1,0} broadcast(src1), dimensions={}
+      ROOT multiply0 = f32[8,20]{1,0} multiply(transpose0, broadcast0)
+    }
+  )";
+
+  ParseAndVerifyModule(hlo_string);
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, ReshapeMover().Run(&module()));
+  EXPECT_TRUE(changed);
+
+  EXPECT_THAT(module().entry_computation()->root_instruction(),
+              op::Transpose(op::Multiply()));
+}
+
+TEST_F(ReshapeMoverTest, ReshapeWithUsersOutsideCandidatesNotSink) {
+  const string hlo_string = R"(
+    HloModule ReshapeWithUsersOutsideCandidates
+    ENTRY ReshapeWithMultipleUsers {
+      param0 = f32[20,8]{1,0} parameter(0)
+      reshape0 = f32[8,20]{1,0} reshape(param0)
+      param1 = f32[] parameter(1)
+      broadcast0 = f32[8,20]{1,0} broadcast(param1), dimensions={}
+      param2 = f32[20,8]{1,0} parameter(2)
+      reshape1 = f32[8,20]{1,0} reshape(param2)
+      param3 = f32[20,8]{1,0} parameter(3)
+      reshape2 = f32[8,20]{1,0} reshape(param3)
+      param4 = f32[8,20]{1,0} parameter(4)
+      add0 = f32[8,20]{1,0} add(reshape0, broadcast0)
+      add1 = f32[8,20]{1,0} add(reshape0, reshape1)
+      add2 = f32[8,20]{1,0} add(reshape1, param4)
+      ROOT tuple = (f32[8,20]{1,0},f32[8,20]{1,0},
+        f32[8,20]{1,0}) tuple(add0, add1, add2)
+    }
+  )";
+
+  ParseAndVerifyModule(hlo_string);
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, ReshapeMover().Run(&module()));
+  EXPECT_FALSE(changed);
+}
+
+TEST_F(ReshapeMoverTest, ReshapeNoUsersOutsideCandidatesSink1) {
+  const string hlo_string = R"(
+    HloModule ReshapeNoUsersOutsideCandidates1
+    ENTRY ReshapeWithMultipleUsers1 {
+      param0 = f32[20,8]{1,0} parameter(0)
+      reshape0 = f32[8,20]{1,0} reshape(param0)
+      param1 = f32[] parameter(1)
+      broadcast0 = f32[8,20]{1,0} broadcast(param1), dimensions={}
+      param2 = f32[20,8]{1,0} parameter(2)
+      reshape1 = f32[8,20]{1,0} reshape(param2)
+      param3 = f32[20,8]{1,0} parameter(3)
+      reshape2 = f32[8,20]{1,0} reshape(param3)
+      add0 = f32[8,20]{1,0} add(reshape0, broadcast0)
+      add1 = f32[8,20]{1,0} add(reshape0, reshape1)
+      add2 = f32[8,20]{1,0} add(reshape1, reshape2)
+      ROOT tuple = (f32[8,20]{1,0},f32[8,20]{1,0},
+        f32[8,20]{1,0}) tuple(add0, add1, add2)
+    }
+  )";
+
+  ParseAndVerifyModule(hlo_string);
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, ReshapeMover().Run(&module()));
+  EXPECT_TRUE(changed);
+  EXPECT_THAT(module().entry_computation()->root_instruction(),
+              op::Tuple(op::Reshape(), op::Reshape(), op::Reshape()));
+}
+
+TEST_F(ReshapeMoverTest, ReshapeNoUsersOutsideCandidatesSink2) {
+  const string hlo_string = R"(
+    HloModule ReshapeNoUsersOutsideCandidates2
+    ENTRY ReshapeWithMultipleUsers2 {
+      param0 = f32[20,8]{1,0} parameter(0)
+      reshape0 = f32[8,20]{1,0} reshape(param0)
+      ROOT add0 = f32[8,20]{1,0} add(reshape0, reshape0)
+    }
+  )";
+
+  ParseAndVerifyModule(hlo_string);
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, ReshapeMover().Run(&module()));
+  EXPECT_TRUE(changed);
+  EXPECT_THAT(module().entry_computation()->root_instruction(),
+              op::Reshape(op::Add()));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index 0becc9d8f8ed22b2d7174b76ce775efec4b646f5..ec883a6cf3ce9546ac54f5c2524a8eda53bad33f 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -272,7 +272,7 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
     const ProgramShape& program_shape,
     tensorflow::gtl::ArraySlice<const Shape*> argument_shapes,
     const ExecutionOptions* execution_options,
-    const UserComputation& user_computation) {
+    const UserComputation* user_computation) {
   auto config = MakeUnique<HloModuleConfig>(program_shape);
   auto* computation_layout = config->mutable_entry_computation_layout();
 
@@ -286,8 +286,15 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
     // ProgramShape.
     if (!ShapeUtil::Compatible(*argument_shapes[i],
                                program_shape.parameters(i))) {
+      if (user_computation == nullptr) {
+        return InvalidArgument(
+            "Argument does not match shape of computation parameter %d: want "
+            "%s, got %s",
+            i, ShapeUtil::HumanString(program_shape.parameters(i)).c_str(),
+            ShapeUtil::HumanString(*argument_shapes[i]).c_str());
+      }
       return InvalidParameterArgument(
-          *user_computation.ParameterMetadata(i).value(),
+          *user_computation->ParameterMetadata(i).value(),
           "Argument does not match shape of computation parameter %d: want %s, "
           "got %s",
           i, ShapeUtil::HumanString(program_shape.parameters(i)).c_str(),
@@ -330,7 +337,7 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
     const ProgramShape& program_shape,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     const ExecutionOptions& execution_options,
-    const UserComputation& user_computation) {
+    const UserComputation* user_computation) {
   std::vector<const Shape*> argument_shapes;
   for (const auto* arg : arguments) {
     argument_shapes.push_back(&arg->on_host_shape());
@@ -402,6 +409,37 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
   return std::move(executables);
 }
 
+StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
+    const std::vector<const HloModuleProto*>& module_protos,
+    std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
+    Backend* backend,
+    std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors,
+    DeviceMemoryAllocator* device_allocator) {
+  VLOG(1) << Printf("BuildExecutable on service %p", this);
+
+  VLOG(1) << "Computations:";
+  for (const HloModuleProto* proto : module_protos) {
+    VLOG(1) << proto->name();
+  }
+
+  CHECK_EQ(module_protos.size(), module_configs.size());
+  std::vector<std::unique_ptr<HloModule>> modules;
+  for (int64 i = 0; i < module_protos.size(); ++i) {
+    const HloModuleProto* proto = module_protos[i];
+    const HloModuleConfig& config = *module_configs[i];
+    TF_ASSIGN_OR_RETURN(auto module,
+                        HloModule::CreateFromProto(*proto, config));
+    modules.push_back(std::move(module));
+  }
+
+  TF_ASSIGN_OR_RETURN(
+      std::vector<std::unique_ptr<Executable>> executables,
+      backend->compiler()->Compile(std::move(modules), std::move(executors),
+                                   device_allocator));
+
+  return std::move(executables);
+}
+
 StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
     const VersionedComputationHandle& versioned_handle,
     std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
@@ -696,6 +734,47 @@ tensorflow::Status Service::SetReturnValue(const SetReturnValueRequest* arg,
   return computation->SetReturnValue(arg->operand());
 }
 
+StatusOr<std::vector<perftools::gputools::StreamExecutor*>>
+Service::GetExecutors(const ExecutionOptions& execution_options,
+                      int64 requests_size, int64 request_index) const {
+  if (execution_options.device_handles().empty()) {
+    return FailedPrecondition(
+        "device handles must be given to execute parallel computations");
+  }
+  if (requests_size > 1 && execution_options.device_handles_size() > 1) {
+    return InvalidArgument(
+        "Parallel requests with multiple device handles is not supported. "
+        "Found %lld parallel requests, with request %lld containing %d device "
+        "handles.",
+        requests_size, request_index, execution_options.device_handles_size());
+  }
+  std::vector<perftools::gputools::StreamExecutor*> executors;
+  for (const auto& device_handle : execution_options.device_handles()) {
+    TF_ASSIGN_OR_RETURN(auto replicas,
+                        Replicas(*execute_backend_, device_handle));
+    se::StreamExecutor* executor = replicas[0];
+    CHECK(executor != nullptr);
+    executors.push_back(executor);
+  }
+  return executors;
+}
+
+StatusOr<std::vector<std::vector<const ShapedBuffer*>>> Service::GetArguments(
+    const ExecutionOptions& execution_options,
+    tensorflow::gtl::ArraySlice<const GlobalDataHandle*> arguments) {
+  // Resolve the allocations for the arguments of the computation, and create
+  // a vector of device memory offsets for the arguments from the allocations.
+  // In the case of partitioned computations, assume all arguments go on the
+  // zeroth core.
+  TF_ASSIGN_OR_RETURN(
+      auto replicas,
+      Replicas(*execute_backend_, execution_options.device_handles(0)));
+  TF_ASSIGN_OR_RETURN(
+      std::vector<std::vector<const ShapedBuffer*>> replicated_arguments,
+      ResolveAndValidateArguments(arguments, replicas));
+  return replicated_arguments;
+}
+
 tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
                                             ExecuteParallelResponse* result) {
   VLOG(1) << "running execute-parallel request: " << arg->ShortDebugString();
@@ -724,26 +803,10 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
     // is one of the executors to run the replicated computation.
     const ExecutionOptions& execution_options =
         arg->requests(i).execution_options();
-    if (execution_options.device_handles().empty()) {
-      return FailedPrecondition(
-          "device handles must be given to execute parallel computations");
-    }
-    if (arg->requests_size() > 1 &&
-        execution_options.device_handles_size() > 1) {
-      return InvalidArgument(
-          "Parallel requests with multiple device handles is not supported. "
-          "Found %d parallel requests, with request %lld containing %d device "
-          "handles.",
-          arg->requests_size(), i, execution_options.device_handles_size());
-    }
-    std::vector<perftools::gputools::StreamExecutor*> executors;
-    for (const auto& device_handle : execution_options.device_handles()) {
-      TF_ASSIGN_OR_RETURN(auto replicas,
-                          Replicas(*execute_backend_, device_handle));
-      se::StreamExecutor* executor = replicas[0];
-      CHECK(executor != nullptr);
-      executors.push_back(executor);
-    }
+
+    // Get the executors.
+    TF_ASSIGN_OR_RETURN(auto executors, GetExecutors(execution_options,
+                                                     arg->requests_size(), i));
 
     // Resolve the UserComputation object associated with the requested
     // computation and compute the program shape.
@@ -760,16 +823,9 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
         std::shared_ptr<const ProgramShape> program_shape,
         user_computation->ComputeProgramShape(versioned_handle.version));
 
-    // Resolve the allocations for the arguments of the computation, and create
-    // a vector of device memory offsets for the arguments from the allocations.
-    // In the case of partitioned computations, assume all arguments go on the
-    // zeroth core.
-    TF_ASSIGN_OR_RETURN(
-        auto replicas,
-        Replicas(*execute_backend_, execution_options.device_handles(0)));
-    TF_ASSIGN_OR_RETURN(
-        std::vector<std::vector<const ShapedBuffer*>> replicated_arguments,
-        ResolveAndValidateArguments(request.arguments(), replicas));
+    // Get the replicated arguments.
+    TF_ASSIGN_OR_RETURN(auto replicated_arguments,
+                        GetArguments(execution_options, request.arguments()));
 
     // Create an HloModuleConfig object for the computation, given the shape of
     // the program and the argument allocations. Here, we care only about the
@@ -778,7 +834,7 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
     TF_ASSIGN_OR_RETURN(
         std::unique_ptr<HloModuleConfig> module_config,
         CreateModuleConfig(*program_shape, replicated_arguments.front(),
-                           request.execution_options(), *user_computation));
+                           request.execution_options(), user_computation));
     VLOG(3) << "ExecuteParallel created HloModuleConfig computation layout: "
             << module_config->entry_computation_layout().ToString();
 
@@ -830,6 +886,107 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
   return tensorflow::Status::OK();
 }
 
+tensorflow::Status Service::ExecuteGraphParallel(
+    const ExecuteGraphParallelRequest* arg, ExecuteParallelResponse* result) {
+  VLOG(1) << "running execute-graph-parallel request";
+
+  std::vector<std::vector<std::vector<const ShapedBuffer*>>> all_arguments;
+  std::vector<std::vector<perftools::gputools::StreamExecutor*>> all_executors;
+  std::vector<const HloModuleProto*> module_protos;
+  std::vector<std::unique_ptr<HloModuleConfig>> module_configs;
+  std::vector<string> computation_names;
+  std::vector<DeviceHandle> device_handles;
+
+  int num_requested_devices =
+      std::accumulate(arg->requests().begin(), arg->requests().end(), 0,
+                      [](int a, const ExecuteGraphRequest& r) -> int {
+                        return a + r.execution_options().device_handles_size();
+                      });
+  if (num_requested_devices * options_.number_of_replicas() >
+      execute_backend_->device_count()) {
+    return FailedPrecondition(
+        "there are not enough stream executors to execute %d computations",
+        num_requested_devices);
+  }
+
+  for (int64 i = 0; i < arg->requests_size(); ++i) {
+    // Get the stream executor for the i'th computation. This stream executor
+    // is one of the executors to run the replicated computation.
+    const ExecutionOptions& execution_options =
+        arg->requests(i).execution_options();
+    const ExecuteGraphRequest& request = arg->requests(i);
+    TF_RET_CHECK(request.has_computation()) << "computations may not be empty";
+    TF_RET_CHECK(request.computation().has_program_shape())
+        << "programe shape may not be empty";
+
+    // Get the executors.
+    TF_ASSIGN_OR_RETURN(auto executors, GetExecutors(execution_options,
+                                                     arg->requests_size(), i));
+
+    // Get the replicated arguments.
+    TF_ASSIGN_OR_RETURN(auto replicated_arguments,
+                        GetArguments(execution_options, request.arguments()));
+
+    // Create an HloModuleConfig object for the computation, given the shape of
+    // the program and the argument allocations. Here, we care only about the
+    // shapes of the arguments, so, it is sufficient to use the arguments of
+    // replica 0.
+    TF_ASSIGN_OR_RETURN(
+        std::unique_ptr<HloModuleConfig> module_config,
+        CreateModuleConfig(request.computation().program_shape(),
+                           replicated_arguments.front(),
+                           request.execution_options(),
+                           /*user_computation=*/nullptr));
+    VLOG(3)
+        << "ExecuteGraphParallel created HloModuleConfig computation layout: "
+        << module_config->entry_computation_layout().ToString();
+
+    // Adds to the vectors to build and execute the computations after the loop.
+    all_arguments.push_back(replicated_arguments);
+    all_arguments.insert(all_arguments.end(), executors.size() - 1, {{}});
+    module_protos.push_back(&request.computation());
+    module_configs.push_back(std::move(module_config));
+    computation_names.insert(computation_names.end(), executors.size(),
+                             request.computation().name());
+    all_executors.push_back(executors);
+    device_handles.insert(device_handles.end(),
+                          execution_options.device_handles().begin(),
+                          execution_options.device_handles().end());
+  }
+
+  // Build the HloModules and compile to generate the executables.
+  //
+  // TODO(jlebar): There's currently no way to pass a device allocator to
+  // ExecuteGraphParallel, so we have to pass a null device_allocator below.
+  TF_ASSIGN_OR_RETURN(std::vector<std::unique_ptr<Executable>> executables,
+                      BuildExecutables(module_protos, std::move(module_configs),
+                                       execute_backend_.get(), all_executors,
+                                       /*device_allocator=*/nullptr));
+  std::vector<Executable*> executable_ptrs;
+  executable_ptrs.reserve(executables.size());
+  for (const auto& executable : executables) {
+    executable_ptrs.push_back(executable.get());
+  }
+
+  // Execute the generated executables in parallel and return the device
+  // handles for each computation's output.
+  ExecutionProfile profile;
+  TF_ASSIGN_OR_RETURN(
+      std::vector<GlobalDataHandle> outputs,
+      ExecuteParallelAndRegisterResult(executable_ptrs, all_arguments,
+                                       execute_backend_.get(), device_handles,
+                                       computation_names, &profile));
+  for (const GlobalDataHandle& output : outputs) {
+    ExecuteResponse response;
+    *response.mutable_output() = output;
+    *response.mutable_profile() = profile;
+    *result->add_responses() = response;
+  }
+
+  VLOG(1) << "successfully completed 'execute-graph-parallel' request";
+  return tensorflow::Status::OK();
+}
+
 tensorflow::Status Service::GetDeviceHandles(const GetDeviceHandlesRequest* arg,
                                              GetDeviceHandlesResponse* result) {
   const int64 available_device_count = execute_backend_->device_count();
@@ -854,6 +1011,47 @@ tensorflow::Status Service::GetDeviceHandles(const GetDeviceHandlesRequest* arg,
   return tensorflow::Status::OK();
 }
 
+tensorflow::Status Service::ExecuteOneToN(const ExecuteRequest* arg,
+                                          ExecuteResponse* result) {
+  ExecuteParallelRequest parallel_arg;
+  *parallel_arg.add_requests() = *arg;
+  ExecuteParallelResponse parallel_result;
+  TF_RETURN_IF_ERROR(ExecuteParallel(&parallel_arg, &parallel_result));
+  return PickParallelResponse(parallel_result, result);
+}
+
+tensorflow::Status Service::ExecuteOneToN(const ExecuteGraphRequest* arg,
+                                          ExecuteResponse* result) {
+  ExecuteGraphParallelRequest parallel_arg;
+  *parallel_arg.add_requests() = *arg;
+  ExecuteParallelResponse parallel_result;
+  TF_RETURN_IF_ERROR(ExecuteGraphParallel(&parallel_arg, &parallel_result));
+  return PickParallelResponse(parallel_result, result);
+}
+
+tensorflow::Status Service::PickParallelResponse(
+    const ExecuteParallelResponse& parallel_result, ExecuteResponse* result) {
+  // The "result device" selection is a bit hacky, but better than assuming it
+  // is device 0. We have b/76035356 for restructuring the client API to clean
+  // up the current asymmetries and support more functionalities.
+  for (int64 i = 0; i < parallel_result.responses_size(); ++i) {
+    TF_ASSIGN_OR_RETURN(const ShapedBuffer* buffer,
+                        allocation_tracker_.ResolveForReplica(
+                            parallel_result.responses(i).output(), 0));
+    const Shape& shape = buffer->on_host_shape();
+    if (!ShapeUtil::IsEmptyTuple(shape)) {
+      *result = parallel_result.responses(i);
+      VLOG(3) << "Fetching result from device " << i << ": "
+              << ShapeUtil::HumanString(shape);
+      return Status::OK();
+    }
+  }
+  TF_RET_CHECK(parallel_result.responses_size() > 0);
+  *result = parallel_result.responses(0);
+  VLOG(1) << "Defaulting to device 0 result";
+  return Status::OK();
+}
+
 tensorflow::Status Service::Execute(const ExecuteRequest* arg,
                                     ExecuteResponse* result) {
   VLOG(1) << "running execute request: " << arg->ShortDebugString();
@@ -870,13 +1068,7 @@ tensorflow::Status Service::Execute(const ExecuteRequest* arg,
 
   // If we received multiple device handles, we must partition the module.
   if (arg->execution_options().device_handles_size() > 1) {
-    ExecuteParallelRequest parallel_arg;
-    *parallel_arg.add_requests() = *arg;
-    ExecuteParallelResponse parallel_result;
-    TF_RETURN_IF_ERROR(ExecuteParallel(&parallel_arg, &parallel_result));
-    TF_RET_CHECK(parallel_result.responses_size() > 0);
-    *result = parallel_result.responses(0);
-    return Status::OK();
+    return ExecuteOneToN(arg, result);
   }
 
   TF_ASSIGN_OR_RETURN(
@@ -894,7 +1086,7 @@ tensorflow::Status Service::Execute(const ExecuteRequest* arg,
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<HloModuleConfig> module_config,
       CreateModuleConfig(*program_shape, replicated_arguments.front(),
-                         arg->execution_options(), *user_computation));
+                         arg->execution_options(), user_computation));
 
   VLOG(3) << "Execute created HloModuleConfig computation layout: "
           << module_config->entry_computation_layout().ToString();
@@ -935,9 +1127,72 @@ tensorflow::Status Service::Execute(const ExecuteRequest* arg,
   return tensorflow::Status::OK();
 }
 
-tensorflow::Status Service::ExecuteGraph(const ExecuteGraphRequest* /*arg*/,
-                                         ExecuteResponse* /*result*/) {
-  return Unimplemented("execute-graph is not yet implemented");
+StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
+    const HloModuleProto& module_proto,
+    std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
+    se::StreamExecutor* executor, DeviceMemoryAllocator* device_allocator) {
+  VLOG(1) << Printf(
+      "BuildExecutable on service %p with serialized module proto: %s", this,
+      module_proto.name().c_str());
+
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
+                      HloModule::CreateFromProto(module_proto, *module_config));
+
+  TF_RETURN_IF_ERROR(MaybeDumpHloModule(*module));
+
+  TF_ASSIGN_OR_RETURN(
+      module, backend->compiler()->RunHloPasses(std::move(module), executor,
+                                                device_allocator));
+
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> executable,
+                      backend->compiler()->RunBackend(
+                          std::move(module), executor, device_allocator));
+
+  return std::move(executable);
+}
+
+tensorflow::Status Service::ExecuteGraph(const ExecuteGraphRequest* arg,
+                                         ExecuteResponse* result) {
+  VLOG(1) << "running execute-graph request";
+
+  if (!arg->has_computation()) {
+    return InvalidArgument("computations may not be empty");
+  }
+  if (!arg->computation().has_program_shape()) {
+    return InvalidArgument("programe shape may not be empty");
+  }
+
+  // If we received multiple device handles, we must partition the module.
+  if (arg->execution_options().device_handles_size() > 1) {
+    return ExecuteOneToN(arg, result);
+  }
+
+  TF_ASSIGN_OR_RETURN(auto replicas, Replicas(*execute_backend_,
+                                              SingleComputationDeviceHandle()));
+  TF_ASSIGN_OR_RETURN(
+      std::vector<std::vector<const ShapedBuffer*>> replicated_arguments,
+      ResolveAndValidateArguments(arg->arguments(), replicas));
+
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
+                      CreateModuleConfig(arg->computation().program_shape(),
+                                         replicated_arguments.front(),
+                                         arg->execution_options()));
+
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<Executable> executable,
+      BuildExecutable(arg->computation(), std::move(module_config),
+                      execute_backend_.get(),
+                      execute_backend_->default_stream_executor(),
+                      /*device_allocator=*/nullptr));
+
+  TF_ASSIGN_OR_RETURN(
+      *result->mutable_output(),
+      ExecuteAndRegisterResult(
+          executable.get(), replicated_arguments, execute_backend_.get(),
+          "result of " + arg->computation().name(), result->mutable_profile()));
+
+  VLOG(1) << "successfully completed 'execute-graph' request";
+  return tensorflow::Status::OK();
 }
 
 tensorflow::Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg,
@@ -967,7 +1222,7 @@ tensorflow::Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg,
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<HloModuleConfig> module_config,
       CreateModuleConfig(*program_shape, replicated_arguments.front(),
-                         arg->execution_options(), *user_computation));
+                         arg->execution_options(), user_computation));
 
   VLOG(3) << "ExecuteAsync created HloModuleConfig computation layout: "
           << module_config->entry_computation_layout().ToString();
@@ -1268,7 +1523,7 @@ tensorflow::Status Service::ComputeConstant(const ComputeConstantRequest* arg,
 
   TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
                       CreateModuleConfig(program_shape, {}, execution_options,
-                                         *user_computation));
+                                         user_computation));
 
   // Exclude dead parameter instructions for the purpose of computing constants.
   TF_ASSIGN_OR_RETURN(
@@ -1360,6 +1615,29 @@ tensorflow::Status Service::GetComputationStats(
   return tensorflow::Status::OK();
 }
 
+tensorflow::Status Service::GetComputationGraphStats(
+    const ComputationGraphStatsRequest* arg, ComputationStatsResponse* result) {
+  HloModuleConfig config;
+  config.set_debug_options(arg->debug_options());
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
+                      HloModule::CreateFromProto(arg->computation(), config));
+
+  hlo_graph_dumper::MaybeDumpHloModule(*module,
+                                       "computation statistics subject");
+
+  // Run HLO analysis to get the computation statistics.
+  HloCostAnalysis analysis(
+      execute_backend_->compiler()->ShapeSizeBytesFunction());
+
+  TF_RETURN_IF_ERROR(module->entry_computation()->Accept(&analysis));
+
+  ComputationStats stats;
+  stats.set_flop_count(analysis.flop_count());
+  stats.set_transcendental_count(analysis.transcendental_count());
+  *result->mutable_stats() = stats;
+  return tensorflow::Status::OK();
+}
+
 template <typename RequestT, typename ResponseT>
 tensorflow::Status Service::AddInstruction(
     const RequestT* arg, ResponseT* result,
diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h
index 96352d9096e6aeeb33f84c7b6fc42c28820e5e84..e09d58bbe7691b4854538ca5a99bd4c0b8d53c3b 100644
--- a/tensorflow/compiler/xla/service/service.h
+++ b/tensorflow/compiler/xla/service/service.h
@@ -115,6 +115,8 @@ class Service : public ServiceInterface {
   // Executes a computation with the provided global data passed as
   // immutable arguments. The request contains the whole computation graph.
   // Returns global data output and execution timing.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
   tensorflow::Status ExecuteGraph(const ExecuteGraphRequest* arg,
                                   ExecuteResponse* result) override;
 
@@ -124,6 +126,15 @@ class Service : public ServiceInterface {
   tensorflow::Status ExecuteParallel(const ExecuteParallelRequest* arg,
                                      ExecuteParallelResponse* result) override;
 
+  // Executes one or more computations in parallel with the provided global data
+  // passed as immutable arguments. Returns global data output for each
+  // computation.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  tensorflow::Status ExecuteGraphParallel(
+      const ExecuteGraphParallelRequest* arg,
+      ExecuteParallelResponse* result) override;
+
   // Requests one or more device handles from the target.
   //
   // When N device handles are requested and the number of replicas is R, at
@@ -222,6 +233,13 @@ class Service : public ServiceInterface {
       const ComputationStatsRequest* arg,
       ComputationStatsResponse* result) override;
 
+  // Retrieves the statistics of a computation.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  tensorflow::Status GetComputationGraphStats(
+      const ComputationGraphStatsRequest* arg,
+      ComputationStatsResponse* result) override;
+
   // Snapshots the current state of a computation handle into a serializable
   // protocol buffer form, so it can be loaded via
   // LoadComputationSnapshot.
@@ -258,7 +276,21 @@ class Service : public ServiceInterface {
       const ProgramShape& program_shape,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       const ExecutionOptions& execution_options,
-      const UserComputation& user_computation);
+      const UserComputation* user_computation = nullptr);
+
+  // Picks a parallel response and fills the result.
+  Status PickParallelResponse(const ExecuteParallelResponse& parallel_result,
+                              ExecuteResponse* result);
+
+  // Prepare the executors for executing parallel.
+  StatusOr<std::vector<perftools::gputools::StreamExecutor*>> GetExecutors(
+      const ExecutionOptions& execution_options, int64 requests_size,
+      int64 request_index) const;
+
+  // Prepare the arguments for executing parallel.
+  StatusOr<std::vector<std::vector<const ShapedBuffer*>>> GetArguments(
+      const ExecutionOptions& execution_options,
+      tensorflow::gtl::ArraySlice<const GlobalDataHandle*> arguments);
 
  protected:
   friend class LocalExecutable;
@@ -286,7 +318,7 @@ class Service : public ServiceInterface {
       const ProgramShape& program_shape,
       tensorflow::gtl::ArraySlice<const Shape*> argument_shapes,
       const ExecutionOptions* execution_options,
-      const UserComputation& user_computation);
+      const UserComputation* user_computation = nullptr);
 
   // Builds an Executable for the given parameters.
   //
@@ -299,6 +331,15 @@ class Service : public ServiceInterface {
       perftools::gputools::StreamExecutor* executor,
       DeviceMemoryAllocator* device_allocator = nullptr);
 
+  // Builds an Executable for the given HLO module proto.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  StatusOr<std::unique_ptr<Executable>> BuildExecutable(
+      const HloModuleProto& module_proto,
+      std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
+      perftools::gputools::StreamExecutor* executor,
+      DeviceMemoryAllocator* device_allocator = nullptr);
+
   // Same as BuildExecutable() above, but builds a list of Executables for the
   // given computations that may interact with each other.
   StatusOr<std::vector<std::unique_ptr<Executable>>> BuildExecutables(
@@ -307,6 +348,12 @@ class Service : public ServiceInterface {
       Backend* backend,
       std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors,
       DeviceMemoryAllocator* device_allocator);
+  StatusOr<std::vector<std::unique_ptr<Executable>>> BuildExecutables(
+      const std::vector<const HloModuleProto*>& module_protos,
+      std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
+      Backend* backend,
+      std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors,
+      DeviceMemoryAllocator* device_allocator);
 
   // Similar to BuildExecutable, but look in the compilation cache for the
   // executable first. If the executable is not in the cache, it is built and
@@ -346,6 +393,14 @@ class Service : public ServiceInterface {
       const std::function<StatusOr<ComputationDataHandle>(UserComputation*)>&
           adder);
 
+  // Executes a single computation which has more than one target device.
+  // The N devices are expected to all return an empty tuple, but one, which
+  // will be the result of this computation.
+  tensorflow::Status ExecuteOneToN(const ExecuteRequest* arg,
+                                   ExecuteResponse* result);
+  tensorflow::Status ExecuteOneToN(const ExecuteGraphRequest* arg,
+                                   ExecuteResponse* result);
+
   // Convenience function which checks whether the given shape_with_layout
   // (presumably passed by the client to set the result layout) is valid for the
   // given computation result shape.
diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc
index 8c8bd6d73ad41db7d609ac91c7bdfc4703f364e1..77e12d36024dae56003ad4e59b54f9934dfc2c58 100644
--- a/tensorflow/compiler/xla/service/shape_inference.cc
+++ b/tensorflow/compiler/xla/service/shape_inference.cc
@@ -304,12 +304,17 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
 
 /* static */ StatusOr<Shape> ShapeInference::InferUnaryOpShape(
     HloOpcode opcode, const HloInstruction* operand) {
+  return InferUnaryOpShape(opcode, operand->shape());
+}
+
+/* static */ StatusOr<Shape> ShapeInference::InferUnaryOpShape(
+    HloOpcode opcode, const Shape& shape) {
   // There is no copy operation at the proto level, so handle copy explicitly.
   if (opcode == HloOpcode::kCopy) {
-    return operand->shape();
+    return shape;
   }
 
-  return InferUnaryOpShape(OpcodeToUnaryOperation(opcode), operand->shape());
+  return InferUnaryOpShape(OpcodeToUnaryOperation(opcode), shape);
 }
 
 /* static */ StatusOr<Shape> ShapeInference::InferUnaryOpShape(
@@ -1033,8 +1038,12 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
 /* static */ StatusOr<Shape> ShapeInference::InferTernaryOpShape(
     HloOpcode opcode, const HloInstruction* lhs, const HloInstruction* rhs,
     const HloInstruction* ehs) {
-  return InferTernaryOpShape(OpcodeToTernaryOperation(opcode), lhs->shape(),
-                             rhs->shape(), ehs->shape());
+  return InferTernaryOpShape(opcode, lhs->shape(), rhs->shape(), ehs->shape());
+}
+
+/* static */ StatusOr<Shape> ShapeInference::InferTernaryOpShape(
+    HloOpcode opcode, const Shape& lhs, const Shape& rhs, const Shape& ehs) {
+  return InferTernaryOpShape(OpcodeToTernaryOperation(opcode), lhs, rhs, ehs);
 }
 
 /* static */ StatusOr<Shape> ShapeInference::InferTernaryOpShape(
@@ -1061,6 +1070,12 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   for (const HloInstruction* operand : operands) {
     operand_shapes.push_back(&operand->shape());
   }
+  return InferVariadicOpShape(opcode, operand_shapes);
+}
+
+/* static */ StatusOr<Shape> ShapeInference::InferVariadicOpShape(
+    HloOpcode opcode,
+    tensorflow::gtl::ArraySlice<const Shape*> operand_shapes) {
   return InferVariadicOpShape(OpcodeToVariadicOperation(opcode),
                               operand_shapes);
 }
diff --git a/tensorflow/compiler/xla/service/shape_inference.h b/tensorflow/compiler/xla/service/shape_inference.h
index 085fdac60c6de161c457dff672175e82f4f4da51..9da2c99b4177f08ece8daabaf2922ddd7e947a1b 100644
--- a/tensorflow/compiler/xla/service/shape_inference.h
+++ b/tensorflow/compiler/xla/service/shape_inference.h
@@ -48,6 +48,8 @@ class ShapeInference {
   // given input shape.
   static StatusOr<Shape> InferUnaryOpShape(UnaryOperation operation,
                                            const Shape& arg);
+  static StatusOr<Shape> InferUnaryOpShape(HloOpcode opcode,
+                                           const Shape& shape);
   static StatusOr<Shape> InferUnaryOpShape(HloOpcode opcode,
                                            const HloInstruction* operand);
 
@@ -68,6 +70,9 @@ class ShapeInference {
   static StatusOr<Shape> InferTernaryOpShape(TernaryOperation operation,
                                              const Shape& lhs, const Shape& rhs,
                                              const Shape& ehs);
+  static StatusOr<Shape> InferTernaryOpShape(HloOpcode opcode, const Shape& lhs,
+                                             const Shape& rhs,
+                                             const Shape& ehs);
   static StatusOr<Shape> InferTernaryOpShape(HloOpcode opcode,
                                              const HloInstruction* lhs,
                                              const HloInstruction* rhs,
@@ -78,6 +83,9 @@ class ShapeInference {
   static StatusOr<Shape> InferVariadicOpShape(
       VariadicOperation operation,
       tensorflow::gtl::ArraySlice<const Shape*> operand_shapes);
+  static StatusOr<Shape> InferVariadicOpShape(
+      HloOpcode opcode,
+      tensorflow::gtl::ArraySlice<const Shape*> operand_shapes);
   static StatusOr<Shape> InferVariadicOpShape(
       HloOpcode opcode,
       tensorflow::gtl::ArraySlice<const HloInstruction*> operands);
diff --git a/tensorflow/compiler/xla/service/user_computation.cc b/tensorflow/compiler/xla/service/user_computation.cc
index 0dca30a804005c6f536aca5b54af24eb08d4560b..532f7fd5bfc1dffa86638a6bc51832beebd74e1d 100644
--- a/tensorflow/compiler/xla/service/user_computation.cc
+++ b/tensorflow/compiler/xla/service/user_computation.cc
@@ -1284,8 +1284,8 @@ StatusOr<ComputationDataHandle> UserComputation::AddCustomCallInstruction(
     TF_RETURN_IF_ERROR(LookUpRequest(handle).status());
   }
 
-  if (tensorflow::StringPiece(custom_call_request.call_target_name())
-          .starts_with("$")) {
+  if (tensorflow::str_util::StartsWith(custom_call_request.call_target_name(),
+                                       "$")) {
     return InvalidArgument(
         "Invalid custom_call_target \"%s\": Call targets that start with '$' "
         "are reserved for internal use.",
@@ -3491,7 +3491,6 @@ void ComputationLowerer::Visit(
       HloInstruction* operand = lookup_instruction(trace_request.operand());
       hlo_instruction = add_instruction(
           HloInstruction::CreateTrace(trace_request.tag(), operand));
-      operand->set_tracing(hlo_instruction);
       break;
     }
 
diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier.h b/tensorflow/compiler/xla/service/while_loop_simplifier.h
index d3d55634c97bbdf3f81321d8089bb808c411340b..3d3e1d60f294c3a2574513c1c2f071805a341ad1 100644
--- a/tensorflow/compiler/xla/service/while_loop_simplifier.h
+++ b/tensorflow/compiler/xla/service/while_loop_simplifier.h
@@ -25,7 +25,7 @@ namespace xla {
 // HLO pass that makes the following transformations on while loops:
 //
 //  - A while loop with static trip count of 0 is deleted.
-//  - A while loops with static trip count of 1 is replaced by its body (sans
+//  - A while loop with static trip count of 1 is replaced by its body (sans
 //    loop).
 //  - Elements of a while loop's tuple that the loop doesn't use are removed
 //    from the tuple.
diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc b/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc
index f1fea6d7634f2060abe18c0fdd51a3391dcb5ae3..619e87caa5b6d0f6ec3c3b1489b0d4f50ef29963 100644
--- a/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc
@@ -68,7 +68,7 @@ void WhileLoopSimplifierTest::MakeModuleWithSimpleLoop(int num_iters) {
       hlo_string_template, "{{LOOP_BOUND}}",
       tensorflow::strings::StrCat(42 + num_iters),
       /*replace_all=*/true);
-  ParseAndVerifyModule(hlo_string.c_str());
+  ParseAndVerifyModule(hlo_string);
 }
 
 void WhileLoopSimplifierTest::MakeModuleWithSimpleLoopTupleElementLoopBound(
@@ -107,7 +107,7 @@ void WhileLoopSimplifierTest::MakeModuleWithSimpleLoopTupleElementLoopBound(
       hlo_string_template, "{{LOOP_BOUND}}",
       tensorflow::strings::StrCat(42 + num_iters),
       /*replace_all=*/true);
-  ParseAndVerifyModule(hlo_string.c_str());
+  ParseAndVerifyModule(hlo_string);
 }
 
 TEST_F(WhileLoopSimplifierTest, LoopWithZeroIterationSimiplified) {
@@ -235,7 +235,7 @@ TEST_F(WhileLoopSimplifierTest, NonTupleShapedLoopNotSimplified) {
   }
   )";
 
-  ParseAndVerifyModule(hlo_string.c_str());
+  ParseAndVerifyModule(hlo_string);
   EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
 }
 
@@ -267,7 +267,7 @@ TEST_F(WhileLoopSimplifierTest, LoopSwappingTupleElementsNotSimplified) {
   }
   )";
 
-  ParseAndVerifyModule(hlo_string.c_str());
+  ParseAndVerifyModule(hlo_string);
   EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
 }
 
@@ -296,7 +296,7 @@ TEST_F(WhileLoopSimplifierTest,
   }
   )";
 
-  ParseAndVerifyModule(hlo_string.c_str());
+  ParseAndVerifyModule(hlo_string);
   EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
 }
 
@@ -319,7 +319,7 @@ TEST_F(WhileLoopSimplifierTest, LoopWithEmptyTupleNotSimplified) {
   }
   )";
 
-  ParseAndVerifyModule(hlo_string.c_str());
+  ParseAndVerifyModule(hlo_string);
   EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
 }
 
@@ -347,7 +347,7 @@ TEST_F(WhileLoopSimplifierTest, LoopWithElemUsedTwiceNotSimplified) {
   }
   )";
 
-  ParseAndVerifyModule(hlo_string.c_str());
+  ParseAndVerifyModule(hlo_string);
   EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
 }
 
@@ -389,7 +389,7 @@ TEST_F(WhileLoopSimplifierTest, RemoveUnusedLoopOperands) {
   }
   )";
 
-  ParseAndVerifyModule(hlo_string.c_str());
+  ParseAndVerifyModule(hlo_string);
   HloModule* the_module = &module();
   EXPECT_TRUE(WhileLoopSimplifier().Run(the_module).ValueOrDie());
 
@@ -439,7 +439,7 @@ TEST_F(WhileLoopSimplifierTest, LoopWithNonTupleBodyShapeNotSimplified) {
   }
   )";
 
-  ParseAndVerifyModule(hlo_string.c_str());
+  ParseAndVerifyModule(hlo_string);
   EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
 }
 
@@ -472,7 +472,7 @@ TEST_F(WhileLoopSimplifierTest,
   }
   )";
 
-  ParseAndVerifyModule(hlo_string.c_str());
+  ParseAndVerifyModule(hlo_string);
   EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
 }
 
@@ -504,7 +504,7 @@ TEST_F(WhileLoopSimplifierTest, LoopWithArrayConstantNotSimplified) {
   }
   )";
 
-  ParseAndVerifyModule(hlo_string.c_str());
+  ParseAndVerifyModule(hlo_string);
   EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
 }
 
diff --git a/tensorflow/compiler/xla/service/while_util.cc b/tensorflow/compiler/xla/service/while_util.cc
index 7441a7ad395bf185cd31de7d4b57beae66cc3063..bd0794184328b7926543c4275b3b915f51e7b812 100644
--- a/tensorflow/compiler/xla/service/while_util.cc
+++ b/tensorflow/compiler/xla/service/while_util.cc
@@ -142,23 +142,23 @@ WhileUtil::MakeInstructionsLiveIn(
 
 static StatusOr<std::unique_ptr<HloComputation>>
 MakeCountedLoopConditionComputation(const Shape& loop_state_shape,
-                                    int64 trip_count) {
+                                    int32 trip_count) {
   Shape scalar_pred = ShapeUtil::MakeShape(PRED, {});
-  Shape scalar_s64 = ShapeUtil::MakeShape(S64, {});
 
   TF_ASSIGN_OR_RETURN(std::unique_ptr<HloComputation> cond_computation,
                       CreateComputationWithSignature(
                           {&loop_state_shape}, scalar_pred, "while_cond"));
 
   HloInstruction* trip_count_constant = cond_computation->AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0<int64>(trip_count)));
+      HloInstruction::CreateConstant(Literal::CreateR0<int32>(trip_count)));
 
   HloInstruction* param = cond_computation->parameter_instruction(0);
-  TF_ASSIGN_OR_RETURN(HloInstruction * counter,
-                      CreateGetTupleElementHlo(param, 0));
+  TF_ASSIGN_OR_RETURN(HloInstruction * indvar,
+                      MakeGetTupleElementHlo(param, 0));
+
   TF_ASSIGN_OR_RETURN(
       HloInstruction * compare,
-      CreateBinaryHlo(HloOpcode::kLt, counter, trip_count_constant));
+      MakeBinaryHlo(HloOpcode::kLt, indvar, trip_count_constant));
   cond_computation->set_root_instruction(compare);
   return std::move(cond_computation);
 }
@@ -171,18 +171,17 @@ static StatusOr<std::unique_ptr<HloComputation>> MakeCountedLoopBodyComputation(
                       CreateComputationWithSignature(
                           {&loop_state_shape}, loop_state_shape, "while_body"));
   HloInstruction* one = body_computation->AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0<int64>(1)));
-
+      HloInstruction::CreateConstant(Literal::CreateR0<int32>(1)));
   HloInstruction* param = body_computation->parameter_instruction(0);
   TF_ASSIGN_OR_RETURN(HloInstruction * indvar,
-                      CreateGetTupleElementHlo(param, 0));
+                      MakeGetTupleElementHlo(param, 0));
   TF_ASSIGN_OR_RETURN(HloInstruction * next_indvar,
-                      CreateBinaryHlo(HloOpcode::kAdd, indvar, one));
+                      MakeBinaryHlo(HloOpcode::kAdd, indvar, one));
 
   std::vector<HloInstruction*> loop_body_generator_args;
   for (int64 i = 1, e = loop_state_shape.tuple_shapes_size(); i < e; i++) {
     TF_ASSIGN_OR_RETURN(HloInstruction * tuple_element,
-                        CreateGetTupleElementHlo(param, i));
+                        MakeGetTupleElementHlo(param, i));
     loop_body_generator_args.push_back(tuple_element);
   }
   TF_ASSIGN_OR_RETURN(std::vector<HloInstruction*> next_state,
@@ -200,7 +199,7 @@ static StatusOr<HloInstruction*> MakeInitTupleFromInitValues(
   std::vector<HloInstruction*> init_values_with_indvar;
   init_values_with_indvar.reserve(init_values.size() + 1);
   HloInstruction* zero = computation->AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0<int64>(0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<int32>(0)));
   init_values_with_indvar.push_back(zero);
   c_copy(init_values, std::back_inserter(init_values_with_indvar));
   return computation->AddInstruction(
@@ -210,16 +209,18 @@ static StatusOr<HloInstruction*> MakeInitTupleFromInitValues(
 static Shape MakeLoopStateShape(const WhileUtil::LoopStateTy& init_values) {
   std::vector<Shape> loop_state_shape_components;
   loop_state_shape_components.reserve(init_values.size() + 1);
-  loop_state_shape_components.push_back(ShapeUtil::MakeShape(S64, {}));
+  loop_state_shape_components.push_back(ShapeUtil::MakeShape(S32, {}));
   c_transform(init_values, std::back_inserter(loop_state_shape_components),
               [](HloInstruction* instr) { return instr->shape(); });
   return ShapeUtil::MakeTupleShape(loop_state_shape_components);
 }
 
 /*static*/ StatusOr<WhileUtil::LoopStateTy> WhileUtil::MakeCountedLoop(
-    HloComputation* computation, int64 trip_count,
+    HloComputation* computation, int32 trip_count,
     const WhileUtil::LoopStateTy& init_values,
     const WhileUtil::LoopBodyGeneratorTy& loop_body_generator) {
+  CHECK_GE(trip_count, 0);
+
   Shape loop_state_shape = MakeLoopStateShape(init_values);
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<HloComputation> cond,
@@ -238,7 +239,7 @@ static Shape MakeLoopStateShape(const WhileUtil::LoopStateTy& init_values) {
   std::vector<HloInstruction*> result;
   for (int64 i = 0, e = init_values.size(); i < e; i++) {
     TF_ASSIGN_OR_RETURN(HloInstruction * user_state,
-                        CreateGetTupleElementHlo(while_instr, i + 1));
+                        MakeGetTupleElementHlo(while_instr, i + 1));
     result.push_back(user_state);
   }
   return result;
diff --git a/tensorflow/compiler/xla/service/while_util.h b/tensorflow/compiler/xla/service/while_util.h
index 80f7e16e64f4d1b1faa73f4fb9b4dd6443bf488b..1688d4674269c36c5b356f262dbd5d958572e101 100644
--- a/tensorflow/compiler/xla/service/while_util.h
+++ b/tensorflow/compiler/xla/service/while_util.h
@@ -71,7 +71,7 @@ class WhileUtil {
   //    return loop_state;
   //  }
   static StatusOr<LoopStateTy> MakeCountedLoop(
-      HloComputation* computation, int64 trip_count,
+      HloComputation* computation, int32 trip_count,
       const LoopStateTy& init_values,
       const LoopBodyGeneratorTy& loop_body_generator);
 };
diff --git a/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.h b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.h
index 063e312df66ce9cba0fa9f49c2fc6026ba6b74aa..8763e588c484011ba2ccbc7cad8f29817347a605 100644
--- a/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.h
+++ b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.h
@@ -19,7 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
 
-// HLO pass that replaces zero sized Hlos with an zero sized constant literal.
+// HLO pass that replaces zero sized Hlos with a zero sized constant literal.
 namespace xla {
 class ZeroSizedHloElimination : public HloPassInterface {
  public:
diff --git a/tensorflow/compiler/xla/service_interface.h b/tensorflow/compiler/xla/service_interface.h
index d8235113dd800f7bab5ceb70272a598b9dcb1fbe..32aae64973dbd7ac2f8d403d8fbd155d432642f9 100644
--- a/tensorflow/compiler/xla/service_interface.h
+++ b/tensorflow/compiler/xla/service_interface.h
@@ -60,6 +60,10 @@ class ServiceInterface {
   virtual tensorflow::Status ExecuteParallel(
       const ExecuteParallelRequest* arg, ExecuteParallelResponse* result) = 0;
 
+  virtual tensorflow::Status ExecuteGraphParallel(
+      const ExecuteGraphParallelRequest* arg,
+      ExecuteParallelResponse* result) = 0;
+
   virtual tensorflow::Status ExecuteAsync(const ExecuteAsyncRequest* arg,
                                           ExecuteAsyncResponse* result) = 0;
 
@@ -72,6 +76,10 @@ class ServiceInterface {
   virtual tensorflow::Status GetComputationStats(
       const ComputationStatsRequest* arg, ComputationStatsResponse* result) = 0;
 
+  virtual tensorflow::Status GetComputationGraphStats(
+      const ComputationGraphStatsRequest* arg,
+      ComputationStatsResponse* result) = 0;
+
   virtual tensorflow::Status GetComputationShape(
       const GetComputationShapeRequest* arg,
       GetComputationShapeResponse* result) = 0;
diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc
index 4f604e6f7cb18c1aaf844967d54e3b0e07e54b34..6825d2476587d037aace043230168f78f4e46344 100644
--- a/tensorflow/compiler/xla/shape_util.cc
+++ b/tensorflow/compiler/xla/shape_util.cc
@@ -502,11 +502,11 @@ namespace {
 StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
   tensorflow::str_util::RemoveLeadingWhitespace(s);
 
-  if (s->Consume("(")) {  // Tuple.
+  if (tensorflow::str_util::ConsumePrefix(s, "(")) {  // Tuple.
     std::vector<Shape> shapes;
     bool must_end = false;
     while (true) {
-      if (s->Consume(")")) {
+      if (tensorflow::str_util::ConsumePrefix(s, ")")) {
         break;
       } else if (must_end) {
         return InvalidArgument("Expected end of tuple; got: \"%s\"",
@@ -515,7 +515,7 @@ StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
       shapes.emplace_back();
       TF_ASSIGN_OR_RETURN(shapes.back(), ParseShapeStringInternal(s));
       tensorflow::str_util::RemoveLeadingWhitespace(s);
-      must_end = !s->Consume(",");
+      must_end = !tensorflow::str_util::ConsumePrefix(s, ",");
     }
     return ShapeUtil::MakeTupleShape(shapes);
   }
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 025ac129d7040a007493cbb222d07c6cf323567f..6f58c20f34e30324ca36dbc7fa78ebb82a4b435d 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -190,6 +190,7 @@ cc_library(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/core:lib",
@@ -346,10 +347,10 @@ xla_test(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:client_library",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -386,6 +387,7 @@ xla_test(
     deps = [
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -596,6 +598,7 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -676,7 +679,9 @@ xla_test(
     name = "gather_operation_test",
     srcs = ["gather_operation_test.cc"],
     deps = [
+        ":client_library_test_base",
         ":hlo_test_base",
+        "//tensorflow/compiler/xla:execution_options_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -932,8 +937,8 @@ xla_test(
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:reference_util",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -972,9 +977,8 @@ xla_test(
         "//tensorflow/compiler/xla:reference_util",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla/client:client_library",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/service:computation_placer",
         "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:local_service",
@@ -1006,7 +1010,10 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
@@ -1369,6 +1376,7 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1435,9 +1443,9 @@ xla_test(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1557,6 +1565,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
@@ -1803,9 +1813,8 @@ tf_cc_test(
     deps = [
         ":local_client_test_base",
         "//tensorflow/compiler/xla:test_helpers",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/service:computation_tracker",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/service:cpu_plugin",
         "//tensorflow/compiler/xla/service:local_service",
         "//tensorflow/core:test_main",
@@ -1952,17 +1961,3 @@ tf_cc_test(
         "//tensorflow/core:test",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
index 6e21dda25d8e5151b31b8c2328253260595a94c4..03c91745b978f80801e0da5ac44d31959659b20c 100644
--- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -50,28 +51,28 @@ class ArrayElementwiseOpTestParamCount
       public ::testing::WithParamInterface<int> {};
 
 XLA_TEST_F(ArrayElementwiseOpTest, NegConstantZeroElementF32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({});
-  auto result = builder.Neg(a);
+  builder.Neg(a);
 
   ComputeAndCompareR1<float>(&builder, {}, {}, error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, NegConstantF32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({-2.5f, 3.14f, 2.25f, -10.0f, 6.0f});
-  auto result = builder.Neg(a);
+  builder.Neg(a);
 
   ComputeAndCompareR1<float>(&builder, {2.5f, -3.14f, -2.25f, 10.0f, -6.0f}, {},
                              error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, NegConstantS32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>({-1, 0, 1, 324,
                                       std::numeric_limits<int32>::min(),
                                       std::numeric_limits<int32>::max()});
-  auto result = builder.Neg(a);
+  builder.Neg(a);
 
   // -min == min for int32 due to an overflow. In C++ it is undefined behavior
   // to do this calculation. For XLA we have not specified that, so it
@@ -83,18 +84,18 @@ XLA_TEST_F(ArrayElementwiseOpTest, NegConstantS32) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, NegConstantZeroElementC64) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<complex64>({});
-  auto result = builder.Neg(a);
+  builder.Neg(a);
 
   ComputeAndCompareR1<complex64>(&builder, {}, {}, error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, NegConstantC64) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<complex64>(
       {{-2.5f, 1.0f}, {0.0f, 3.14f}, {2.25f, -1.0f}, {-10.0f, 0.0f}});
-  auto result = builder.Neg(a);
+  builder.Neg(a);
 
   ComputeAndCompareR1<complex64>(
       &builder, {{2.5f, -1.0f}, {0.0f, -3.14f}, {-2.25f, 1.0f}, {10.0f, 0.0f}},
@@ -102,7 +103,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, NegConstantC64) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, NegConstantS64) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int64>({
       -1,
       1,
@@ -112,7 +113,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, NegConstantS64) {
       static_cast<int64>(0x8000000000000000LL),
       static_cast<int64>(0x8000000000000001LL),
   });
-  auto result = builder.Neg(a);
+  builder.Neg(a);
   LOG(INFO) << -static_cast<int64>(0x7FFFFFFFFFFFFFFFLL);
 
   ComputeAndCompareR1<int64>(&builder,
@@ -129,9 +130,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, NegConstantS64) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, IsFiniteZeroElementF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({});
-  auto result = builder.IsFinite(a);
+  builder.IsFinite(a);
 
   ComputeAndCompareR1<bool>(&builder, {}, {});
 }
@@ -140,64 +141,63 @@ XLA_TEST_F(ArrayElementwiseOpTest, IsFiniteZeroElementF32s) {
 static const float kNonCanonicalNaN = tensorflow::bit_cast<float>(0x7FD01234);
 
 XLA_TEST_F(ArrayElementwiseOpTest, IsFiniteScalarF32) {
-  ComputationBuilder builder(client_, TestName());
-  auto result = builder.IsFinite(builder.ConstantR0<float>(NAN));
+  XlaBuilder builder(TestName());
+  builder.IsFinite(builder.ConstantR0<float>(NAN));
   ComputeAndCompareR0<bool>(&builder, false, {});
 
   EXPECT_TRUE(std::isnan(kNonCanonicalNaN));
-  auto result_non_canonical =
-      builder.IsFinite(builder.ConstantR0<float>(kNonCanonicalNaN));
+  builder.IsFinite(builder.ConstantR0<float>(kNonCanonicalNaN));
   ComputeAndCompareR0<bool>(&builder, false, {});
 
   const float inf = std::numeric_limits<float>::infinity();
-  auto result_inf = builder.IsFinite(builder.ConstantR0<float>(inf));
+  builder.IsFinite(builder.ConstantR0<float>(inf));
   ComputeAndCompareR0<bool>(&builder, false, {});
 
-  auto result_neg_inf = builder.IsFinite(builder.ConstantR0<float>(-inf));
+  builder.IsFinite(builder.ConstantR0<float>(-inf));
   ComputeAndCompareR0<bool>(&builder, false, {});
 
-  auto result_zero = builder.IsFinite(builder.ConstantR0<float>(0.0f));
+  builder.IsFinite(builder.ConstantR0<float>(0.0f));
   ComputeAndCompareR0<bool>(&builder, true, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, IsFiniteR1F32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   const float inf = std::numeric_limits<float>::infinity();
   EXPECT_TRUE(std::isnan(kNonCanonicalNaN));
   auto a = builder.ConstantR1<float>(
       {{NAN, 7.0f, kNonCanonicalNaN, -1.0f, inf, -inf}});
-  auto result = builder.IsFinite(a);
+  builder.IsFinite(a);
 
   ComputeAndCompareR1<bool>(&builder, {false, true, false, true, false, false},
                             {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, AddTwoConstantF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({-2.5f, 3.14f, 2.25f, -10.0f, 6.0f});
   auto b = builder.ConstantR1<float>({100.0f, 3.13f, 2.75f, 10.5f, -999.0f});
-  auto add = builder.Add(a, b);
+  builder.Add(a, b);
 
   ComputeAndCompareR1<float>(&builder, {97.5f, 6.27f, 5.0f, 0.5f, -993.0f}, {},
                              error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, AddTwoConstantZeroElementF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({});
   auto b = builder.ConstantR1<float>({});
-  auto add = builder.Add(a, b);
+  builder.Add(a, b);
 
   ComputeAndCompareR1<float>(&builder, {}, {}, error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, AddTwoConstantC64s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<complex64>(
       {{-2.5f, 0.0f}, {0.0f, 3.14f}, {2.25f, 0.0f}, {1.0f, -10.0f}});
   auto b = builder.ConstantR1<complex64>(
       {{100.0f, 0.0f}, {3.13f, 0.0f}, {2.75f, 1.0f}, {-2.0f, 10.5f}});
-  auto add = builder.Add(a, b);
+  builder.Add(a, b);
 
   ComputeAndCompareR1<complex64>(
       &builder, {97.5f, {3.13f, 3.14f}, {5.0f, 1.0f}, {-1.0f, 0.5f}}, {},
@@ -205,10 +205,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddTwoConstantC64s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, AddTwoConstantZeroElementC64s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<complex64>({});
   auto b = builder.ConstantR1<complex64>({});
-  auto add = builder.Add(a, b);
+  builder.Add(a, b);
 
   ComputeAndCompareR1<complex64>(&builder, {}, {}, error_spec_);
 }
@@ -244,7 +244,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddTwoConstantU64s) {
   std::unique_ptr<GlobalData> rhs_data =
       client_->TransferToServer(*rhs_literal).ConsumeValueOrDie();
 
-  auto add = b.Add(lhs_param, rhs_param);
+  b.Add(lhs_param, rhs_param);
 
   std::vector<uint64> expected(lhs.size());
   for (int64 i = 0; i < lhs.size(); ++i) {
@@ -295,7 +295,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantS64s) {
 
 TEST_P(ArrayElementwiseOpTestParamCount, AddManyValues) {
   const int count = GetParam();
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::vector<float> a_values;
   std::vector<float> b_values;
   for (int i = 0; i < count; ++i) {
@@ -334,49 +334,49 @@ TEST_P(ArrayElementwiseOpTestParamCount, AddManyValues) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({-2.5f, 3.14f, 2.25f, -10.0f, 6.0f});
   auto b = builder.ConstantR1<float>({100.0f, 3.13f, 2.75f, 10.5f, -999.0f});
-  auto add = builder.Sub(a, b);
+  builder.Sub(a, b);
 
   ComputeAndCompareR1<float>(&builder, {-102.5f, 0.01f, -0.5f, -20.5f, 1005.0f},
                              {}, error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantZeroElementF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({});
   auto b = builder.ConstantR1<float>({});
-  auto add = builder.Sub(a, b);
+  builder.Sub(a, b);
 
   ComputeAndCompareR1<float>(&builder, {}, {}, error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantS32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>({-1, 0, 2, 1000000000});
   auto b = builder.ConstantR1<int32>({-1, 2, 1, -1});
-  auto add = builder.Sub(a, b);
+  builder.Sub(a, b);
 
   ComputeAndCompareR1<int32>(&builder, {0, -2, 1, 1000000001}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantZeroElementS32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>({});
   auto b = builder.ConstantR1<int32>({});
-  auto add = builder.Sub(a, b);
+  builder.Sub(a, b);
 
   ComputeAndCompareR1<int32>(&builder, {}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantC64s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<complex64>(
       {{-2.5f, 0.0f}, {0.0f, 3.14f}, {3.0f, 2.25f}});
   auto b = builder.ConstantR1<complex64>(
       {{0.0f, 10.0f}, {3.13f, 0.0f}, {2.75f, -0.25f}});
-  auto add = builder.Sub(a, b);
+  builder.Sub(a, b);
 
   ComputeAndCompareR1<complex64>(
       &builder, {{-2.5f, -10.0f}, {-3.13f, 3.14f}, {0.25f, 2.5f}}, {},
@@ -384,29 +384,29 @@ XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantC64s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantZeroElementC64s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<complex64>({});
   auto b = builder.ConstantR1<complex64>({});
-  auto add = builder.Sub(a, b);
+  builder.Sub(a, b);
 
   ComputeAndCompareR1<complex64>(&builder, {}, {}, error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, DivTwoConstantF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({-2.5f, 25.5f, 2.25f, -10.0f, 6.0f});
   auto b = builder.ConstantR1<float>({10.0f, 5.1f, 1.0f, 10.0f, -6.0f});
-  auto add = builder.Div(a, b);
+  builder.Div(a, b);
 
   ComputeAndCompareR1<float>(&builder, {-0.25f, 5.0f, 2.25f, -1.0f, -1.0f}, {},
                              error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, DivTwoConstantZeroElementF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({});
   auto b = builder.ConstantR1<float>({});
-  auto add = builder.Div(a, b);
+  builder.Div(a, b);
 
   ComputeAndCompareR1<float>(&builder, {}, {}, error_spec_);
 }
@@ -436,9 +436,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, DivS32s) {
   }
 
   {
-    ComputationBuilder builder(client_, TestName());
-    ComputationDataHandle dividend;
-    ComputationDataHandle divisor;
+    XlaBuilder builder(TestName());
+    XlaOp dividend;
+    XlaOp divisor;
     auto dividend_data =
         CreateR1Parameter<int32>(dividends, 0, "dividend", &builder, &dividend);
     auto divisor_data =
@@ -451,8 +451,8 @@ XLA_TEST_F(ArrayElementwiseOpTest, DivS32s) {
 
   // Test with a compile-time constant divisor.
   {
-    ComputationBuilder builder(client_, TestName());
-    ComputationDataHandle dividend;
+    XlaBuilder builder(TestName());
+    XlaOp dividend;
     auto dividend_data =
         CreateR1Parameter<int32>(dividends, 0, "dividend", &builder, &dividend);
     builder.Div(dividend, builder.ConstantR1<int32>(divisors));
@@ -461,9 +461,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, DivS32s) {
   }
 
   {
-    ComputationBuilder builder(client_, TestName());
-    ComputationDataHandle dividend;
-    ComputationDataHandle divisor;
+    XlaBuilder builder(TestName());
+    XlaOp dividend;
+    XlaOp divisor;
     auto dividend_data =
         CreateR1Parameter<int32>(dividends, 0, "dividend", &builder, &dividend);
     auto divisor_data =
@@ -476,8 +476,8 @@ XLA_TEST_F(ArrayElementwiseOpTest, DivS32s) {
 
   // Test with a compile-time constant divisor.
   {
-    ComputationBuilder builder(client_, TestName());
-    ComputationDataHandle dividend;
+    XlaBuilder builder(TestName());
+    XlaOp dividend;
     auto dividend_data =
         CreateR1Parameter<int32>(dividends, 0, "dividend", &builder, &dividend);
     builder.Rem(dividend, builder.ConstantR1<int32>(divisors));
@@ -507,9 +507,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, DivU32s) {
   }
 
   {
-    ComputationBuilder builder(client_, TestName());
-    ComputationDataHandle dividend;
-    ComputationDataHandle divisor;
+    XlaBuilder builder(TestName());
+    XlaOp dividend;
+    XlaOp divisor;
     auto dividend_data = CreateR1Parameter<uint32>(dividends, 0, "dividend",
                                                    &builder, &dividend);
     auto divisor_data =
@@ -521,8 +521,8 @@ XLA_TEST_F(ArrayElementwiseOpTest, DivU32s) {
   }
 
   {
-    ComputationBuilder builder(client_, TestName());
-    ComputationDataHandle dividend;
+    XlaBuilder builder(TestName());
+    XlaOp dividend;
     auto dividend_data = CreateR1Parameter<uint32>(dividends, 0, "dividend",
                                                    &builder, &dividend);
     builder.Div(dividend, builder.ConstantR1<uint32>(divisors));
@@ -531,9 +531,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, DivU32s) {
   }
 
   {
-    ComputationBuilder builder(client_, TestName());
-    ComputationDataHandle dividend;
-    ComputationDataHandle divisor;
+    XlaBuilder builder(TestName());
+    XlaOp dividend;
+    XlaOp divisor;
     auto dividend_data = CreateR1Parameter<uint32>(dividends, 0, "dividend",
                                                    &builder, &dividend);
     auto divisor_data =
@@ -545,8 +545,8 @@ XLA_TEST_F(ArrayElementwiseOpTest, DivU32s) {
   }
 
   {
-    ComputationBuilder builder(client_, TestName());
-    ComputationDataHandle dividend;
+    XlaBuilder builder(TestName());
+    XlaOp dividend;
     auto dividend_data = CreateR1Parameter<uint32>(dividends, 0, "dividend",
                                                    &builder, &dividend);
     builder.Rem(dividend, builder.ConstantR1<uint32>(divisors));
@@ -556,33 +556,33 @@ XLA_TEST_F(ArrayElementwiseOpTest, DivU32s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, DivTwoConstantC64s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<complex64>(
       {{-2.5f, 1.0f}, {-25.5f, 0.0f}, {2.0f, -1.0f}});
   auto b = builder.ConstantR1<complex64>(
       {{10.0f, 0.0f}, {0.0f, 1.0f}, {2.0f, -1.0f}});
-  auto div = builder.Div(a, b);
+  builder.Div(a, b);
 
   ComputeAndCompareR1<complex64>(
       &builder, {{-0.25f, 0.1f}, {0.0f, 25.5f}, {1.0f, 0.0f}}, {}, error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, DivTwoConstantZeroElementC64s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<complex64>({});
   auto b = builder.ConstantR1<complex64>({});
-  auto div = builder.Div(a, b);
+  builder.Div(a, b);
 
   ComputeAndCompareR1<complex64>(&builder, {}, {}, error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, RemF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>(
       {-2.5f, 25.5f, 2.25f, -10.0f, 6.0f, 3.0f, 3.0f, -1.0f, -8.0f});
   auto b = builder.ConstantR1<float>(
       {10.0f, 5.1f, 1.0f, 10.0f, -6.0f, 2.0f, -2.0f, 7.0f, -4.0f});
-  auto add = builder.Rem(a, b);
+  builder.Rem(a, b);
 
   ComputeAndCompareR1<float>(
       &builder, {-2.5f, 0.0f, 0.25f, 0.0f, -0.0f, 1.0f, 1.0f, -1.0f, -0.0f}, {},
@@ -590,21 +590,21 @@ XLA_TEST_F(ArrayElementwiseOpTest, RemF32s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, RemZeroElementF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({});
   auto b = builder.ConstantR1<float>({});
-  auto add = builder.Rem(a, b);
+  builder.Rem(a, b);
 
   ComputeAndCompareR1<float>(&builder, {}, {}, error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, RemF64s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<double>(
       {-2.5, 25.5, 2.25, -10.0, 6.0, 3.0, 3.0, -1.0, -8.0});
   auto b = builder.ConstantR1<double>(
       {10.0, 5.1, 1.0, 10.0, -6.0, 2.0, -2.0, 7.0, -4.0});
-  auto add = builder.Rem(a, b);
+  builder.Rem(a, b);
 
   ComputeAndCompareR1<double>(
       &builder, {-2.5, 0.0, 0.25, 0.0, -0.0, 1.0, 1.0, -1.0, -0.0}, {},
@@ -612,20 +612,20 @@ XLA_TEST_F(ArrayElementwiseOpTest, RemF64s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, MulTwoConstantF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({-2.5f, 25.5f, 2.25f, -10.0f, 6.0f});
   auto b = builder.ConstantR1<float>({10.0f, 5.0f, 1.0f, 10.0f, -6.0f});
-  auto add = builder.Mul(a, b);
+  builder.Mul(a, b);
 
   ComputeAndCompareR1<float>(&builder, {-25.0f, 127.5f, 2.25f, -100.0f, -36.0f},
                              {}, error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, MulTwoConstantZeroElementF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({});
   auto b = builder.ConstantR1<float>({});
-  auto add = builder.Mul(a, b);
+  builder.Mul(a, b);
 
   ComputeAndCompareR1<float>(&builder, {}, {}, error_spec_);
 }
@@ -648,19 +648,19 @@ XLA_TEST_F(ArrayElementwiseOpTest, MulTwoConstantS32s) {
     }
   }
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>(a_data);
   auto b = builder.ConstantR1<int32>(b_data);
-  auto add = builder.Mul(a, b);
+  builder.Mul(a, b);
 
   ComputeAndCompareR1<int32>(&builder, expected, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, MulTwoConstantZeroElementS32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>({});
   auto b = builder.ConstantR1<int32>({});
-  auto add = builder.Mul(a, b);
+  builder.Mul(a, b);
 
   ComputeAndCompareR1<int32>(&builder, {}, {});
 }
@@ -679,21 +679,21 @@ XLA_TEST_F(ArrayElementwiseOpTest, MulTwoConstantU32s) {
     }
   }
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<uint32>(a_data);
   auto b = builder.ConstantR1<uint32>(b_data);
-  auto add = builder.Mul(a, b);
+  builder.Mul(a, b);
 
   ComputeAndCompareR1<uint32>(&builder, expected, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, MulTwoConstantC64s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<complex64>(
       {{-2.5f, 0.0f}, {0.0f, 25.5f}, {2.0f, -10.0f}});
   auto b = builder.ConstantR1<complex64>(
       {{0.0f, 10.0f}, {5.0f, 1.0f}, {10.0f, -6.0f}});
-  auto add = builder.Mul(a, b);
+  builder.Mul(a, b);
 
   ComputeAndCompareR1<complex64>(
       &builder, {{0.0f, -25.0f}, {-25.5f, 127.5f}, {-40.0f, -112.0}}, {},
@@ -701,264 +701,264 @@ XLA_TEST_F(ArrayElementwiseOpTest, MulTwoConstantC64s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, MulTwoConstantZeroElementC64s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<complex64>({});
   auto b = builder.ConstantR1<complex64>({});
-  auto add = builder.Mul(a, b);
+  builder.Mul(a, b);
 
   ComputeAndCompareR1<complex64>(&builder, {}, {}, error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, AndPredR1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<bool>({false, false, true, true});
   auto b = builder.ConstantR1<bool>({false, true, false, true});
-  auto out = builder.And(a, b);
+  builder.And(a, b);
 
   ComputeAndCompareR1<bool>(&builder, {false, false, false, true}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, AndPredR2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR2<bool>({{false, false}, {true, true}});
   auto b = builder.ConstantR2<bool>({{false, true}, {false, true}});
-  auto out = builder.And(a, b);
+  builder.And(a, b);
 
   Array2D<bool> expected_array({{false, false}, {false, true}});
   ComputeAndCompareR2<bool>(&builder, expected_array, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, AndZeroElementPredR1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<bool>({});
   auto b = builder.ConstantR1<bool>({});
-  auto out = builder.And(a, b);
+  builder.And(a, b);
 
   ComputeAndCompareR1<bool>(&builder, {}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, AndS32R1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>({0, -1, -8});
   auto b = builder.ConstantR1<int32>({5, -7, 12});
-  auto out = builder.And(a, b);
+  builder.And(a, b);
 
   ComputeAndCompareR1<int32>(&builder, {0, -7, 8}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, AndS32R2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR2<int32>({{0, -5}, {-1, 5}});
   auto b = builder.ConstantR2<int32>({{1, -6}, {4, 5}});
-  auto out = builder.And(a, b);
+  builder.And(a, b);
 
   Array2D<int32> expected_array({{0, -6}, {4, 5}});
   ComputeAndCompareR2<int32>(&builder, expected_array, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, AndZeroElementS32R1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>({});
   auto b = builder.ConstantR1<int32>({});
-  auto out = builder.And(a, b);
+  builder.And(a, b);
 
   ComputeAndCompareR1<int32>(&builder, {}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, AndU32R1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>({0, 1, 8});
   auto b = builder.ConstantR1<int32>({5, 7, 12});
-  auto out = builder.And(a, b);
+  builder.And(a, b);
 
   ComputeAndCompareR1<int32>(&builder, {0, 1, 8}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, AndU32R2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR2<uint32>({{0, 1}, {3, 8}});
   auto b = builder.ConstantR2<uint32>({{1, 0}, {7, 6}});
-  auto out = builder.And(a, b);
+  builder.And(a, b);
 
   Array2D<uint32> expected_array({{0, 0}, {3, 0}});
   ComputeAndCompareR2<uint32>(&builder, expected_array, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, AndZeroElementU32R1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<uint32>({});
   auto b = builder.ConstantR1<uint32>({});
-  auto out = builder.And(a, b);
+  builder.And(a, b);
 
   ComputeAndCompareR1<uint32>(&builder, {}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, OrPredR1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<bool>({false, false, true, true});
   auto b = builder.ConstantR1<bool>({false, true, false, true});
-  auto out = builder.Or(a, b);
+  builder.Or(a, b);
 
   ComputeAndCompareR1<bool>(&builder, {false, true, true, true}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, OrPredR2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR2<bool>({{false, false}, {true, true}});
   auto b = builder.ConstantR2<bool>({{false, true}, {false, true}});
-  auto out = builder.Or(a, b);
+  builder.Or(a, b);
 
   Array2D<bool> expected_array({{false, true}, {true, true}});
   ComputeAndCompareR2<bool>(&builder, expected_array, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, OrZeroElementPredR1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<bool>({});
   auto b = builder.ConstantR1<bool>({});
-  auto out = builder.Or(a, b);
+  builder.Or(a, b);
 
   ComputeAndCompareR1<bool>(&builder, {}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, OrS32R1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>({0, -1, 8});
   auto b = builder.ConstantR1<int32>({5, -7, 4});
-  auto out = builder.Or(a, b);
+  builder.Or(a, b);
 
   ComputeAndCompareR1<int32>(&builder, {5, -1, 12}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, OrS32R2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR2<int32>({{0, -1}, {8, 8}});
   auto b = builder.ConstantR2<int32>({{5, -7}, {4, 1}});
-  auto out = builder.Or(a, b);
+  builder.Or(a, b);
 
   Array2D<int32> expected_array({{5, -1}, {12, 9}});
   ComputeAndCompareR2<int32>(&builder, expected_array, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, OrZeroElementS32R1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>({});
   auto b = builder.ConstantR1<int32>({});
-  auto out = builder.Or(a, b);
+  builder.Or(a, b);
 
   ComputeAndCompareR1<int32>(&builder, {}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, OrU32R1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<uint32>({0, 1, 8});
   auto b = builder.ConstantR1<uint32>({5, 7, 4});
-  auto out = builder.Or(a, b);
+  builder.Or(a, b);
 
   ComputeAndCompareR1<uint32>(&builder, {5, 7, 12}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, OrU32R2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR2<uint32>({{0, 1}, {8, 8}});
   auto b = builder.ConstantR2<uint32>({{5, 7}, {4, 1}});
-  auto out = builder.Or(a, b);
+  builder.Or(a, b);
 
   Array2D<uint32> expected_array({{5, 7}, {12, 9}});
   ComputeAndCompareR2<uint32>(&builder, expected_array, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, OrZeroElementU32R1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<uint32>({});
   auto b = builder.ConstantR1<uint32>({});
-  auto out = builder.Or(a, b);
+  builder.Or(a, b);
 
   ComputeAndCompareR1<uint32>(&builder, {}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, NotPredR1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<bool>({false, true, true, false});
-  auto out = builder.Not(a);
+  builder.Not(a);
 
   ComputeAndCompareR1<bool>(&builder, {true, false, false, true}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, NotPredR2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR2<bool>({{false, true}, {true, false}});
-  auto out = builder.Not(a);
+  builder.Not(a);
 
   Array2D<bool> expected_array({{true, false}, {false, true}});
   ComputeAndCompareR2<bool>(&builder, expected_array, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, NotZeroElementPredR1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<bool>({});
-  auto out = builder.Not(a);
+  builder.Not(a);
 
   ComputeAndCompareR1<bool>(&builder, {}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, NotS32R1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>({-1, 0, 1});
-  auto out = builder.Not(a);
+  builder.Not(a);
 
   ComputeAndCompareR1<int32>(&builder, {0, -1, -2}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, NotS32R2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR2<int32>({{-1, 0}, {1, 8}});
-  auto out = builder.Not(a);
+  builder.Not(a);
 
   Array2D<int32> expected_array({{0, -1}, {-2, -9}});
   ComputeAndCompareR2<int32>(&builder, expected_array, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, NotZeroElementS32R1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>({});
-  auto out = builder.Not(a);
+  builder.Not(a);
 
   ComputeAndCompareR1<int32>(&builder, {}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, NotU32R1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<uint32>({0, 4294967295});
-  auto out = builder.Not(a);
+  builder.Not(a);
 
   ComputeAndCompareR1<uint32>(&builder, {4294967295, 0}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, NotU32R2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR2<uint32>({{0, 4294967295}, {1, 4294967294}});
-  auto out = builder.Not(a);
+  builder.Not(a);
 
   Array2D<uint32> expected_array({{4294967295, 0}, {4294967294, 1}});
   ComputeAndCompareR2<uint32>(&builder, expected_array, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, NotZeroElementU32R1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<uint32>({});
-  auto out = builder.Not(a);
+  builder.Not(a);
 
   ComputeAndCompareR1<uint32>(&builder, {}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, ShiftLeftS32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>({static_cast<int32>(0x12345678),
                                       static_cast<int32>(0xF0001000), 1, 3, 77,
                                       1, -3, 77});
   auto b = builder.ConstantR1<int32>({4, 8, 2, 7, 15, 32, 100, -1});
-  auto out = builder.ShiftLeft(a, b);
+  builder.ShiftLeft(a, b);
 
   ComputeAndCompareR1<int32>(&builder,
                              {static_cast<int32>(0x23456780), 0x00100000, 0x4,
@@ -967,12 +967,12 @@ XLA_TEST_F(ArrayElementwiseOpTest, ShiftLeftS32) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, ShiftRightArithmeticS32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>({static_cast<int32>(0x92345678),
                                       static_cast<int32>(0x10001000), 1, 3, 77,
                                       1, -3, 77});
   auto b = builder.ConstantR1<int32>({4, 8, 2, 7, 2, 32, 100, -1});
-  auto out = builder.ShiftRightArithmetic(a, b);
+  builder.ShiftRightArithmetic(a, b);
 
   ComputeAndCompareR1<int32>(
       &builder,
@@ -982,45 +982,45 @@ XLA_TEST_F(ArrayElementwiseOpTest, ShiftRightArithmeticS32) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, ShiftRightLogicalS32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>({static_cast<int32>(0x92345678),
                                       static_cast<int32>(0x10001000), 1, 3, 77,
                                       1, -3, 77});
   auto b = builder.ConstantR1<int32>({4, 8, 2, 7, 5, 32, 100, -1});
-  auto out = builder.ShiftRightLogical(a, b);
+  builder.ShiftRightLogical(a, b);
 
   ComputeAndCompareR1<int32>(&builder,
                              {0x09234567, 0x00100010, 0, 0, 2, 0, 0, 0}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, ShiftLeftU32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<uint32>(
       {0x12345678, 0xF0001000, 1, 3, 77, 1, ~3u, 77});
   auto b = builder.ConstantR1<uint32>({4, 8, 2, 7, 15, 32, 100, ~0u});
-  auto out = builder.ShiftLeft(a, b);
+  builder.ShiftLeft(a, b);
 
   ComputeAndCompareR1<uint32>(
       &builder, {0x23456780, 0x00100000, 0x4, 0x180, 2523136, 0, 0, 0}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, ShiftRightArithmeticU32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<uint32>(
       {0x92345678, 0x10001000, 1, 3, 77, 1, ~3u, 77});
   auto b = builder.ConstantR1<uint32>({4, 8, 2, 7, 2, 32, 100, ~0u});
-  auto out = builder.ShiftRightArithmetic(a, b);
+  builder.ShiftRightArithmetic(a, b);
 
   ComputeAndCompareR1<uint32>(
       &builder, {0xF9234567, 0x00100010, 0, 0, 19, 0, ~0u, 0}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, ShiftRightLogicalU32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<uint32>(
       {0x92345678, 0x10001000, 1, 3, 77, 1, ~3u, 77});
   auto b = builder.ConstantR1<uint32>({4, 8, 2, 7, 5, 32, 100, ~0u});
-  auto out = builder.ShiftRightLogical(a, b);
+  builder.ShiftRightLogical(a, b);
 
   ComputeAndCompareR1<uint32>(&builder,
                               {0x09234567, 0x00100010, 0, 0, 2, 0, 0, 0}, {});
@@ -1028,59 +1028,59 @@ XLA_TEST_F(ArrayElementwiseOpTest, ShiftRightLogicalU32) {
 
 XLA_TEST_F(ArrayElementwiseOpTest, CompareEqF32s) {
   SetFastMathDisabled(true);
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<float>({-2.5f, 25.5f, 2.25f, NAN, 6.0f});
   auto rhs = builder.ConstantR1<float>({10.0f, 5.0f, 2.25f, 10.0f, NAN});
-  auto compare = builder.Eq(lhs, rhs);
+  builder.Eq(lhs, rhs);
 
   ComputeAndCompareR1<bool>(&builder, {false, false, true, false, false}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, CompareEqZeroElementF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<float>({});
   auto rhs = builder.ConstantR1<float>({});
-  auto compare = builder.Eq(lhs, rhs);
+  builder.Eq(lhs, rhs);
 
   ComputeAndCompareR1<bool>(&builder, {}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, CompareGeF32s) {
   SetFastMathDisabled(true);
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<float>({-2.5f, 25.5f, 2.25f, NAN, 6.0f});
   auto rhs = builder.ConstantR1<float>({10.0f, 5.0f, 1.0f, 10.0f, NAN});
-  auto compare = builder.Ge(lhs, rhs);
+  builder.Ge(lhs, rhs);
 
   ComputeAndCompareR1<bool>(&builder, {false, true, true, false, false}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, CompareGtF32s) {
   SetFastMathDisabled(true);
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<float>({-2.5f, 25.5f, 2.25f, NAN, 6.0f});
   auto rhs = builder.ConstantR1<float>({10.0f, 5.0f, 1.0f, 10.0f, NAN});
-  auto compare = builder.Gt(lhs, rhs);
+  builder.Gt(lhs, rhs);
 
   ComputeAndCompareR1<bool>(&builder, {false, true, true, false, false}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, CompareLeF32s) {
   SetFastMathDisabled(true);
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<float>({-2.5f, 5.0f, 2.25f, NAN, 6.0f});
   auto rhs = builder.ConstantR1<float>({10.0f, 5.0f, 1.0f, 10.0f, NAN});
-  auto compare = builder.Le(lhs, rhs);
+  builder.Le(lhs, rhs);
 
   ComputeAndCompareR1<bool>(&builder, {true, true, false, false, false}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, CompareLtF32s) {
   SetFastMathDisabled(true);
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<float>({-2.5f, 25.5f, 2.25f, NAN, 6.0f});
   auto rhs = builder.ConstantR1<float>({10.0f, 5.0f, 1.0f, 10.0f, NAN});
-  auto compare = builder.Lt(lhs, rhs);
+  builder.Lt(lhs, rhs);
 
   ComputeAndCompareR1<bool>(&builder, {true, false, false, false, false}, {});
 }
@@ -1088,10 +1088,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareLtF32s) {
 XLA_TEST_F(ArrayElementwiseOpTest, CompareEqS32s) {
   const int32 min = std::numeric_limits<int32>::min();
   const int32 max = std::numeric_limits<int32>::max();
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<int32>({min, min, min, 0, 0, 0, max, max, max});
   auto rhs = builder.ConstantR1<int32>({min, 0, max, -1, 0, 1, min, 0, max});
-  auto compare = builder.Eq(lhs, rhs);
+  builder.Eq(lhs, rhs);
 
   ComputeAndCompareR1<bool>(
       &builder, {true, false, false, false, true, false, false, false, true},
@@ -1099,17 +1099,17 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareEqS32s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, CompareEqZeroElementS32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<int32>({});
   auto rhs = builder.ConstantR1<int32>({});
-  auto compare = builder.Eq(lhs, rhs);
+  builder.Eq(lhs, rhs);
 
   ComputeAndCompareR1<bool>(&builder, {}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, CompareEqC64s) {
   SetFastMathDisabled(true);
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<complex64>({{-2.5f, 10.0f},
                                             {1.0f, 25.5f},
                                             {2.25f, -3.0f},
@@ -1120,16 +1120,16 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareEqC64s) {
                                             {2.25f, -3.0f},
                                             {10.0f, 0.0f},
                                             {1.0f, NAN}});
-  auto compare = builder.Eq(lhs, rhs);
+  builder.Eq(lhs, rhs);
 
   ComputeAndCompareR1<bool>(&builder, {false, false, true, false, false}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, CompareEqZeroElementC64s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<complex64>({});
   auto rhs = builder.ConstantR1<complex64>({});
-  auto compare = builder.Eq(lhs, rhs);
+  builder.Eq(lhs, rhs);
 
   ComputeAndCompareR1<bool>(&builder, {}, {});
 }
@@ -1138,7 +1138,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareNeC64s) {
   // Disable fast-math because we're operating on NaNs.
   SetFastMathDisabled(true);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<complex64>({{-2.5f, 10.0f},
                                             {1.0f, 25.5f},
                                             {2.25f, -3.0f},
@@ -1149,7 +1149,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareNeC64s) {
                                             {2.25f, -3.0f},
                                             {10.0f, 0.0f},
                                             {1.0f, NAN}});
-  auto compare = builder.Ne(lhs, rhs);
+  builder.Ne(lhs, rhs);
 
   ComputeAndCompareR1<bool>(&builder, {true, true, false, true, true}, {});
 }
@@ -1158,10 +1158,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareNeF32s) {
   // Disable fast-math because we're operating on NaNs.
   SetFastMathDisabled(true);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<float>({-2.5f, 25.5f, 2.25f, NAN, 6.0f});
   auto rhs = builder.ConstantR1<float>({10.0f, 25.5f, 1.0f, 10.0f, NAN});
-  auto compare = builder.Ne(lhs, rhs);
+  builder.Ne(lhs, rhs);
 
   ComputeAndCompareR1<bool>(&builder, {true, false, true, true, true}, {});
 }
@@ -1169,10 +1169,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareNeF32s) {
 XLA_TEST_F(ArrayElementwiseOpTest, CompareNeS32s) {
   const int32 min = std::numeric_limits<int32>::min();
   const int32 max = std::numeric_limits<int32>::max();
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<int32>({min, min, min, 0, 0, 0, max, max, max});
   auto rhs = builder.ConstantR1<int32>({min, 0, max, -1, 0, 1, min, 0, max});
-  auto compare = builder.Ne(lhs, rhs);
+  builder.Ne(lhs, rhs);
 
   ComputeAndCompareR1<bool>(
       &builder, {false, true, true, true, false, true, true, true, false}, {});
@@ -1181,10 +1181,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareNeS32s) {
 XLA_TEST_F(ArrayElementwiseOpTest, CompareGeS32s) {
   const int32 min = std::numeric_limits<int32>::min();
   const int32 max = std::numeric_limits<int32>::max();
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<int32>({min, min, min, 0, 0, 0, max, max, max});
   auto rhs = builder.ConstantR1<int32>({min, 0, max, -1, 0, 1, min, 0, max});
-  auto compare = builder.Ge(lhs, rhs);
+  builder.Ge(lhs, rhs);
 
   ComputeAndCompareR1<bool>(
       &builder, {true, false, false, true, true, false, true, true, true}, {});
@@ -1193,10 +1193,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareGeS32s) {
 XLA_TEST_F(ArrayElementwiseOpTest, CompareGtS32s) {
   const int32 min = std::numeric_limits<int32>::min();
   const int32 max = std::numeric_limits<int32>::max();
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<int32>({min, min, min, 0, 0, 0, max, max, max});
   auto rhs = builder.ConstantR1<int32>({min, 0, max, -1, 0, 1, min, 0, max});
-  auto compare = builder.Gt(lhs, rhs);
+  builder.Gt(lhs, rhs);
 
   ComputeAndCompareR1<bool>(
       &builder, {false, false, false, true, false, false, true, true, false},
@@ -1206,10 +1206,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareGtS32s) {
 XLA_TEST_F(ArrayElementwiseOpTest, CompareLeS32s) {
   const int32 min = std::numeric_limits<int32>::min();
   const int32 max = std::numeric_limits<int32>::max();
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<int32>({min, min, min, 0, 0, 0, max, max, max});
   auto rhs = builder.ConstantR1<int32>({min, 0, max, -1, 0, 1, min, 0, max});
-  auto compare = builder.Le(lhs, rhs);
+  builder.Le(lhs, rhs);
 
   ComputeAndCompareR1<bool>(
       &builder, {true, true, true, false, true, true, false, false, true}, {});
@@ -1218,10 +1218,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareLeS32s) {
 XLA_TEST_F(ArrayElementwiseOpTest, CompareLtS32s) {
   const int32 min = std::numeric_limits<int32>::min();
   const int32 max = std::numeric_limits<int32>::max();
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<int32>({min, min, min, 0, 0, 0, max, max, max});
   auto rhs = builder.ConstantR1<int32>({min, 0, max, -1, 0, 1, min, 0, max});
-  auto compare = builder.Lt(lhs, rhs);
+  builder.Lt(lhs, rhs);
 
   ComputeAndCompareR1<bool>(
       &builder, {false, true, true, false, false, true, false, false, false},
@@ -1230,10 +1230,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareLtS32s) {
 
 XLA_TEST_F(ArrayElementwiseOpTest, CompareEqU32s) {
   const uint32 max = std::numeric_limits<uint32>::max();
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<uint32>({0, 0, 0, 5, 5, 5, max, max, max});
   auto rhs = builder.ConstantR1<uint32>({0, 1, max, 4, 5, 6, 0, 1, max});
-  auto compare = builder.Eq(lhs, rhs);
+  builder.Eq(lhs, rhs);
 
   ComputeAndCompareR1<bool>(
       &builder, {true, false, false, false, true, false, false, false, true},
@@ -1242,10 +1242,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareEqU32s) {
 
 XLA_TEST_F(ArrayElementwiseOpTest, CompareNeU32s) {
   const uint32 max = std::numeric_limits<uint32>::max();
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<uint32>({0, 0, 0, 5, 5, 5, max, max, max});
   auto rhs = builder.ConstantR1<uint32>({0, 1, max, 4, 5, 6, 0, 1, max});
-  auto compare = builder.Ne(lhs, rhs);
+  builder.Ne(lhs, rhs);
 
   ComputeAndCompareR1<bool>(
       &builder, {false, true, true, true, false, true, true, true, false}, {});
@@ -1253,10 +1253,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareNeU32s) {
 
 XLA_TEST_F(ArrayElementwiseOpTest, CompareGeU32s) {
   const uint32 max = std::numeric_limits<uint32>::max();
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<uint32>({0, 0, 0, 5, 5, 5, max, max, max});
   auto rhs = builder.ConstantR1<uint32>({0, 1, max, 4, 5, 6, 0, 1, max});
-  auto compare = builder.Ge(lhs, rhs);
+  builder.Ge(lhs, rhs);
 
   ComputeAndCompareR1<bool>(
       &builder, {true, false, false, true, true, false, true, true, true}, {});
@@ -1264,10 +1264,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareGeU32s) {
 
 XLA_TEST_F(ArrayElementwiseOpTest, CompareGtU32s) {
   const uint32 max = std::numeric_limits<uint32>::max();
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<uint32>({0, 0, 0, 5, 5, 5, max, max, max});
   auto rhs = builder.ConstantR1<uint32>({0, 1, max, 4, 5, 6, 0, 1, max});
-  auto compare = builder.Gt(lhs, rhs);
+  builder.Gt(lhs, rhs);
 
   ComputeAndCompareR1<bool>(
       &builder, {false, false, false, true, false, false, true, true, false},
@@ -1276,10 +1276,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareGtU32s) {
 
 XLA_TEST_F(ArrayElementwiseOpTest, CompareLeU32s) {
   const uint32 max = std::numeric_limits<uint32>::max();
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<uint32>({0, 0, 0, 5, 5, 5, max, max, max});
   auto rhs = builder.ConstantR1<uint32>({0, 1, max, 4, 5, 6, 0, 1, max});
-  auto compare = builder.Le(lhs, rhs);
+  builder.Le(lhs, rhs);
 
   ComputeAndCompareR1<bool>(
       &builder, {true, true, true, false, true, true, false, false, true}, {});
@@ -1287,10 +1287,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareLeU32s) {
 
 XLA_TEST_F(ArrayElementwiseOpTest, CompareLtU32s) {
   const uint32 max = std::numeric_limits<uint32>::max();
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<uint32>({0, 0, 0, 5, 5, 5, max, max, max});
   auto rhs = builder.ConstantR1<uint32>({0, 1, max, 4, 5, 6, 0, 1, max});
-  auto compare = builder.Lt(lhs, rhs);
+  builder.Lt(lhs, rhs);
 
   ComputeAndCompareR1<bool>(
       &builder, {false, true, true, false, false, true, false, false, false},
@@ -1299,12 +1299,12 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareLtU32s) {
 
 XLA_TEST_F(ArrayElementwiseOpTest, PowF32s) {
   SetFastMathDisabled(true);
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs =
       builder.ConstantR1<float>({4.0f, 2.0f, 2.0f, NAN, 6.0f, -2.0f, -2.0f});
   auto rhs =
       builder.ConstantR1<float>({2.0f, -2.0f, 3.0f, 10.0f, NAN, 3.0f, 4.0f});
-  auto minimum = builder.Pow(lhs, rhs);
+  builder.Pow(lhs, rhs);
 
   ComputeAndCompareR1<float>(
       &builder, {16.0f, 0.25f, 8.0f, NAN, NAN, -8.0f, 16.0f}, {}, error_spec_);
@@ -1312,20 +1312,20 @@ XLA_TEST_F(ArrayElementwiseOpTest, PowF32s) {
 
 XLA_TEST_F(ArrayElementwiseOpTest, PowNonIntegerF32s) {
   SetFastMathDisabled(true);
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<float>({-2.0f, -0.6f, -0.6f, 0.0f});
   auto rhs = builder.ConstantR1<float>({0.5f, 0.6f, -0.6f, -0.6f});
-  auto minimum = builder.Pow(lhs, rhs);
+  builder.Pow(lhs, rhs);
 
   ComputeAndCompareR1<float>(&builder, {NAN, NAN, NAN, INFINITY}, {},
                              error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, PowZeroElementF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<float>({});
   auto rhs = builder.ConstantR1<float>({});
-  auto minimum = builder.Pow(lhs, rhs);
+  builder.Pow(lhs, rhs);
 
   ComputeAndCompareR1<float>(&builder, {}, {}, error_spec_);
 }
@@ -1599,14 +1599,14 @@ XLA_TEST_F(ArrayElementwiseOpTest, Div4F32) {
 
 TEST_P(ArrayElementwiseOpTestParamCount, SquareManyValues) {
   const int count = GetParam();
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::vector<float> values;
   values.reserve(count);
   for (int i = 0; i < count; ++i) {
     values.push_back(i / static_cast<float>(count));
   }
   auto x = builder.ConstantR1<float>(values);
-  auto exp = builder.Pow(x, builder.ConstantR0<float>(2.0f));
+  builder.Pow(x, builder.ConstantR0<float>(2.0f));
 
   std::vector<float> expected;
   expected.reserve(values.size());
@@ -1618,7 +1618,7 @@ TEST_P(ArrayElementwiseOpTestParamCount, SquareManyValues) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, SquareIn4D) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array4D<float> values(2, 2, 2, 2);
 
   std::vector<float> values_vector;
@@ -1632,77 +1632,77 @@ XLA_TEST_F(ArrayElementwiseOpTest, SquareIn4D) {
   Array4D<float> expected(2, 2, 2, 2, expected_vector);
 
   auto x = builder.ConstantR4FromArray4D<float>(values);
-  auto exp = builder.Pow(x, builder.ConstantR0<float>(2.0f));
+  builder.Pow(x, builder.ConstantR0<float>(2.0f));
 
   ComputeAndCompareR4<float>(&builder, expected, {}, error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, SquareIn4DZeroElements) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array4D<float> values(2, 2, 0, 2);
   Array4D<float> expected(2, 2, 0, 2);
 
   auto x = builder.ConstantR4FromArray4D<float>(values);
-  auto exp = builder.Pow(x, builder.ConstantR0<float>(2.0f));
+  builder.Pow(x, builder.ConstantR0<float>(2.0f));
 
   ComputeAndCompareR4<float>(&builder, expected, {}, error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, MinF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   SetFastMathDisabled(true);
   auto lhs = builder.ConstantR1<float>({1.0f, 1.0f, 2.25f, NAN, 6.0f});
   auto rhs = builder.ConstantR1<float>({2.0f, -5.0f, 1.0f, 10.0f, NAN});
-  auto minimum = builder.Min(lhs, rhs);
+  builder.Min(lhs, rhs);
 
   ComputeAndCompareR1<float>(&builder, {1.0f, -5.0f, 1.0f, NAN, NAN}, {},
                              error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, MinZeroElementF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<float>({});
   auto rhs = builder.ConstantR1<float>({});
-  auto minimum = builder.Min(lhs, rhs);
+  builder.Min(lhs, rhs);
   ComputeAndCompareR1<float>(&builder, {}, {}, error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, MinF64s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   SetFastMathDisabled(true);
   auto lhs = builder.ConstantR1<double>({1.0, 1.0, 2.25, NAN, 6.0});
   auto rhs = builder.ConstantR1<double>({2.0, -5.0, 1.0, 10.0, NAN});
-  auto minimum = builder.Min(lhs, rhs);
+  builder.Min(lhs, rhs);
 
   ComputeAndCompareR1<double>(&builder, {1.0, -5.0, 1.0, NAN, NAN}, {},
                               error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, MaxF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   SetFastMathDisabled(true);
   auto lhs = builder.ConstantR1<float>({1.0f, 1.0f, 2.25f, NAN, 6.0f});
   auto rhs = builder.ConstantR1<float>({2.0f, -5.0f, 1.0f, 10.0f, NAN});
-  auto maximum = builder.Max(lhs, rhs);
+  builder.Max(lhs, rhs);
 
   ComputeAndCompareR1<float>(&builder, {2.0f, 1.0f, 2.25f, NAN, NAN}, {},
                              error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, MaxZeroElementF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR1<float>({});
   auto rhs = builder.ConstantR1<float>({});
-  auto minimum = builder.Max(lhs, rhs);
+  builder.Max(lhs, rhs);
   ComputeAndCompareR1<float>(&builder, {}, {}, error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, MaxF64s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   SetFastMathDisabled(true);
   auto lhs = builder.ConstantR1<double>({1.0, 1.0, 2.25, NAN, 6.0});
   auto rhs = builder.ConstantR1<double>({2.0, -5.0, 1.0, 10.0, NAN});
-  auto maximum = builder.Max(lhs, rhs);
+  builder.Max(lhs, rhs);
 
   ComputeAndCompareR1<double>(&builder, {2.0, 1.0, 2.25, NAN, NAN}, {},
                               error_spec_);
@@ -1711,7 +1711,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, MaxF64s) {
 XLA_TEST_F(ArrayElementwiseOpTest, MaxS32s) {
   const int32 min = std::numeric_limits<int32>::min();
   const int32 max = std::numeric_limits<int32>::max();
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<int32>(
       {min, min, min, -1, -1, 0, 0, 0, 1, 1, max, max, max});
   auto y = builder.ConstantR1<int32>(
@@ -1726,7 +1726,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, MaxS32s) {
 XLA_TEST_F(ArrayElementwiseOpTest, MinS32s) {
   const int32 min = std::numeric_limits<int32>::min();
   const int32 max = std::numeric_limits<int32>::max();
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<int32>(
       {min, min, min, -1, -1, 0, 0, 0, 1, 1, max, max, max});
   auto y = builder.ConstantR1<int32>(
@@ -1740,7 +1740,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, MinS32s) {
 
 XLA_TEST_F(ArrayElementwiseOpTest, MaxU32s) {
   const uint32 max = std::numeric_limits<uint32>::max();
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<uint32>({0, 0, 1, 1, 1, max, max, max});
   auto y = builder.ConstantR1<uint32>({0, 1, 0, 1, 10, 0, 234234, max});
   builder.Max(x, y);
@@ -1751,7 +1751,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, MaxU32s) {
 
 XLA_TEST_F(ArrayElementwiseOpTest, MinU32s) {
   const uint32 max = std::numeric_limits<uint32>::max();
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<uint32>({0, 0, 1, 1, 1, max, max, max});
   auto y = builder.ConstantR1<uint32>({0, 1, 0, 1, 10, 0, 234234, max});
   builder.Min(x, y);
@@ -1761,7 +1761,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, MinU32s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, MaxTenF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<float>(
       {-0.0, 1.0, 2.0, -3.0, -4.0, 5.0, 6.0, -7.0, -8.0, 9.0});
   auto y = builder.ConstantR1<float>(
@@ -1774,7 +1774,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, MaxTenF32s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, MaxR1S1AndR1S0F32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto u = builder.ConstantR1<float>({3.5});
   auto v = builder.ConstantR1<float>({});
   builder.Max(u, v);
@@ -1784,7 +1784,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, MaxR1S1AndR1S0F32s) {
 
 XLA_TEST_F(ArrayElementwiseOpTest, MaxR1S0AndR2S0x2F32s) {
   for (int broadcast_dim : {0, 1}) {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     auto u = builder.ConstantR1<float>({3.5});
     auto v = builder.ConstantR2FromArray2D<float>(Array2D<float>(0, 2));
     builder.Max(u, v, /*broadcast_dimensions=*/{broadcast_dim});
@@ -1794,7 +1794,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, MaxR1S0AndR2S0x2F32s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, Max1DAnd2DF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto v = builder.ConstantR1<float>({2.0f, 3.0f, 4.0f});
   auto m =
       builder.ConstantR2<float>({{-2.5f, 3.14f, 1.0f}, {2.25f, -10.0f, 3.33f}});
@@ -1805,7 +1805,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Max1DAnd2DF32s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, Max1DAnd2DZeroElementF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto v = builder.ConstantR1<float>({});
   auto m = builder.ConstantR2<float>({{}, {}});
   builder.Max(v, m, /*broadcast_dimensions=*/{1});
@@ -1815,7 +1815,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Max1DAnd2DZeroElementF32s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, Max3DAndScalarS32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto scalar = builder.ConstantR0<int32>(2);
   Array3D<int32> a_3d({{{3, 9, -1}, {2, -10, 3}}, {{-2, 2, 8}, {12, 10, 4}}});
   auto array = builder.ConstantR3FromArray3D<int32>(a_3d);
@@ -1826,7 +1826,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Max3DAndScalarS32s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, Max3DAndScalarZeroElementS32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto scalar = builder.ConstantR0<int32>(2);
   Array3D<int32> a_3d(2, 0, 3);
   auto array = builder.ConstantR3FromArray3D<int32>(a_3d);
@@ -1837,7 +1837,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Max3DAndScalarZeroElementS32s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, Min2DTo1DF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto m =
       builder.ConstantR2<float>({{-10.4f, 64.0f, 6.0f}, {0.1f, 32.0f, 16.1f}});
   auto v = builder.ConstantR1<float>({-10.2f, 16.4f});
@@ -1848,7 +1848,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Min2DTo1DF32s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, Min2DTo1DZeroElementF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto m = builder.ConstantR2<float>({{}, {}});
   auto v = builder.ConstantR1<float>({-10.2f, 16.4f});
   builder.Min(m, v, /*broadcast_dimensions=*/{0});
@@ -1858,7 +1858,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Min2DTo1DZeroElementF32s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, Min2DTo4DF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto array2d =
       builder.ConstantR2<float>({{-12.2f, 64.3f, 6.1f}, {0.0f, 32.2f, 2.5f}});
   auto array4d = builder.ConstantR4FromArray4D<float>(
@@ -1873,7 +1873,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Min2DTo4DF32s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, Min2DTo4DZeroElementF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto array2d =
       builder.ConstantR2<float>({{-12.2f, 64.3f, 6.1f}, {0.0f, 32.2f, 2.5f}});
   Array4D<float> arg(2, 2, 0, 3);
@@ -1885,7 +1885,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Min2DTo4DZeroElementF32s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, MinTenS32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<int32>({0, 1, 2, 3, 4, 5, 6, 7, 8, 9});
   auto y = builder.ConstantR1<int32>({9, 8, 7, 6, 5, 4, 3, 2, 1, 0});
   builder.Min(x, y);
@@ -1895,7 +1895,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, MinTenS32s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, MaxTenS32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<int32>({0, 1, 2, 3, 4, 5, 6, 7, 8, 9});
   auto y = builder.ConstantR1<int32>({9, 8, 7, 6, 5, 4, 3, 2, 1, 0});
   builder.Max(x, y);
@@ -1905,110 +1905,107 @@ XLA_TEST_F(ArrayElementwiseOpTest, MaxTenS32s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, RemTwoConstantS32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>({-3, 26, 2, -1, 1});
   auto b = builder.ConstantR1<int32>({10, 5, 1, 10, -10});
-  auto add = builder.Rem(a, b);
+  builder.Rem(a, b);
 
   ComputeAndCompareR1<int32>(&builder, {-3, 1, 0, -1, 1}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, NonNanClampF32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto minimum = builder.ConstantR1<float>({1.0f, -6.5f, 1.0f, 2.25f, 0.0f});
   auto argument = builder.ConstantR1<float>({2.0f, 10.0f, -5.0f, 1.0f, 10.0f});
   auto maximum = builder.ConstantR1<float>({3.0f, 0.5f, 25.5f, 5.0f, 123.0});
-  auto clamp = builder.Clamp(minimum, argument, maximum);
+  builder.Clamp(minimum, argument, maximum);
 
   ComputeAndCompareR1<float>(&builder, {2.0f, 0.5f, 1.0f, 2.25f, 10.0f}, {},
                              error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, ClampF32Scalar) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto minimum = builder.ConstantR0<float>(0.0f);
   auto argument = builder.ConstantR1<float>({2.0f, 10.0f, -5.0f, 1.0f, 4.0f});
   auto maximum = builder.ConstantR0<float>(5.0f);
-  auto clamp = builder.Clamp(minimum, argument, maximum);
+  builder.Clamp(minimum, argument, maximum);
 
   ComputeAndCompareR1<float>(&builder, {2.0f, 5.0f, 0.0f, 1.0f, 4.0f}, {},
                              error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, ClampF32ScalarVector) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto min_scalar = builder.ConstantR0<float>(0.0f);
   auto min_vector = builder.ConstantR1<float>({1.0f, -6.5f, 1.0f, 2.25f, 0.0f});
   auto arg_vector = builder.ConstantR1<float>({2.0f, 10.0f, -5.0f, 1.0f, 4.0f});
   auto max_scalar = builder.ConstantR0<float>(3.0f);
   auto max_vector = builder.ConstantR1<float>({3.0f, 0.5f, 25.5f, 5.0f, 123.0});
   // Perform clamp with broadcasted scalar and vector.
-  auto clamp = builder.Add(
-      builder.Add(builder.Clamp(min_vector, arg_vector, max_scalar),
-                  builder.Clamp(min_scalar, arg_vector, max_vector)),
-      builder.Add(builder.Clamp(min_vector, arg_vector, max_vector),
-                  builder.Clamp(min_scalar, arg_vector, max_scalar)));
+  builder.Add(builder.Add(builder.Clamp(min_vector, arg_vector, max_scalar),
+                          builder.Clamp(min_scalar, arg_vector, max_vector)),
+              builder.Add(builder.Clamp(min_vector, arg_vector, max_vector),
+                          builder.Clamp(min_scalar, arg_vector, max_scalar)));
 
   ComputeAndCompareR1<float>(&builder, {8.0f, 7.0f, 2.0f, 6.5f, 14.0f}, {},
                              error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, ClampS32Vector) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto min_vector = builder.ConstantR1<int32>({1, -6, 1, 2, 0, -5});
   auto arg_vector = builder.ConstantR1<int32>({2, 10, -5, 1, 4, 10});
   auto max_vector = builder.ConstantR1<int32>({3, 0, 25, 5, 123, -1});
-  auto clamp = builder.Clamp(min_vector, arg_vector, max_vector);
+  builder.Clamp(min_vector, arg_vector, max_vector);
 
   ComputeAndCompareR1<int32>(&builder, {2, 0, 1, 2, 4, -1}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, ClampS32ScalarVector) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto min_scalar = builder.ConstantR0<int32>(0);
   auto min_vector = builder.ConstantR1<int32>({1, -6, 1, 2, 0});
   auto arg_vector = builder.ConstantR1<int32>({2, 10, -5, 1, 4});
   auto max_scalar = builder.ConstantR0<int32>(3);
   auto max_vector = builder.ConstantR1<int32>({3, 1, 25, 5, 123});
   // Perform clamp with broadcasted scalar and vector.
-  auto clamp = builder.Add(
-      builder.Add(builder.Clamp(min_vector, arg_vector, max_scalar),
-                  builder.Clamp(min_scalar, arg_vector, max_vector)),
-      builder.Add(builder.Clamp(min_vector, arg_vector, max_vector),
-                  builder.Clamp(min_scalar, arg_vector, max_scalar)));
+  builder.Add(builder.Add(builder.Clamp(min_vector, arg_vector, max_scalar),
+                          builder.Clamp(min_scalar, arg_vector, max_vector)),
+              builder.Add(builder.Clamp(min_vector, arg_vector, max_vector),
+                          builder.Clamp(min_scalar, arg_vector, max_scalar)));
 
   ComputeAndCompareR1<int32>(&builder, {8, 8, 2, 6, 14}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, ClampU32Vector) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto min_vector = builder.ConstantR1<uint32>({1, 2, 1, 2, 0, ~0u - 4});
   auto arg_vector = builder.ConstantR1<uint32>({2, 10, 5, 1, 4, 10});
   auto max_vector = builder.ConstantR1<uint32>({3, 5, 25, 5, 123, ~0u});
-  auto clamp = builder.Clamp(min_vector, arg_vector, max_vector);
+  builder.Clamp(min_vector, arg_vector, max_vector);
 
   ComputeAndCompareR1<uint32>(&builder, {2, 5, 5, 2, 4, ~0u - 4}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, ClampU32ScalarVector) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto min_scalar = builder.ConstantR0<uint32>(0);
   auto min_vector = builder.ConstantR1<uint32>({1, 0, 1, 2, 0});
   auto arg_vector = builder.ConstantR1<uint32>({2, 10, 0, 1, 4});
   auto max_scalar = builder.ConstantR0<uint32>(3);
   auto max_vector = builder.ConstantR1<uint32>({3, 1, 25, 5, 123});
   // Perform clamp with broadcasted scalar and vector.
-  auto clamp = builder.Add(
-      builder.Add(builder.Clamp(min_vector, arg_vector, max_scalar),
-                  builder.Clamp(min_scalar, arg_vector, max_vector)),
-      builder.Add(builder.Clamp(min_vector, arg_vector, max_vector),
-                  builder.Clamp(min_scalar, arg_vector, max_scalar)));
+  builder.Add(builder.Add(builder.Clamp(min_vector, arg_vector, max_scalar),
+                          builder.Clamp(min_scalar, arg_vector, max_vector)),
+              builder.Add(builder.Clamp(min_vector, arg_vector, max_vector),
+                          builder.Clamp(min_scalar, arg_vector, max_scalar)));
 
   ComputeAndCompareR1<uint32>(&builder, {8, 8, 2, 6, 14}, {});
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, AddTwoParametersF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR1<float>({1.1f, 2.2f, 3.3f, 5.5f});
@@ -2022,7 +2019,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddTwoParametersF32s) {
 
   auto p0 = builder.Parameter(0, param0_literal->shape(), "param0");
   auto p1 = builder.Parameter(1, param1_literal->shape(), "param1");
-  auto add = builder.Add(p0, p1);
+  builder.Add(p0, p1);
 
   ComputeAndCompareR1<float>(&builder, {8.3f, 4.5f, 6.7f, 11.1f},
                              {param0_data.get(), param1_data.get()},
@@ -2030,7 +2027,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddTwoParametersF32s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, AddTwoParametersZeroElementF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR3FromArray3D<float>(Array3D<float>(0, 7, 0));
@@ -2044,7 +2041,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddTwoParametersZeroElementF32s) {
 
   auto p0 = builder.Parameter(0, param0_literal->shape(), "param0");
   auto p1 = builder.Parameter(1, param1_literal->shape(), "param1");
-  auto add = builder.Add(p0, p1);
+  builder.Add(p0, p1);
 
   Array3D<float> expected(0, 7, 0);
   ComputeAndCompareR3<float>(
@@ -2052,7 +2049,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddTwoParametersZeroElementF32s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, AddParameterToConstantF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR1<float>({1.1f, 2.2f, 3.3f, 5.5f});
@@ -2061,35 +2058,35 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddParameterToConstantF32s) {
 
   auto a = builder.ConstantR1<float>({1.1f, 2.2f, 3.3f, 4.4f});
   auto p = builder.Parameter(0, param0_literal->shape(), "param0");
-  auto add = builder.Add(a, p);
+  builder.Add(a, p);
 
   ComputeAndCompareR1<float>(&builder, {2.2f, 4.4f, 6.6f, 9.9f},
                              {param0_data.get()}, error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, CosF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({3.14159f, 0.0f, 1.570796f, -0.78539f});
-  auto result = builder.Cos(a);
+  builder.Cos(a);
 
   ComputeAndCompareR1<float>(&builder, {-1.0f, 1.0f, 0.0f, 0.707107f}, {},
                              error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, SinF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({3.14159f, 0.0f, 1.570796f, -0.78539f});
-  auto result = builder.Sin(a);
+  builder.Sin(a);
 
   ComputeAndCompareR1<float>(&builder, {0.0f, 0.0f, 1.0f, -0.707107f}, {},
                              error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, Atan2F32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({0.0f, 5.0f, 0.0f, -3.0f, 2.0f, -8.0f});
   auto b = builder.ConstantR1<float>({6.0f, 0.0f, -4.0f, 0.0f, 2.0f, 8.0f});
-  auto atan = builder.Atan2(a, b);
+  builder.Atan2(a, b);
 
   ComputeAndCompareR1<float>(
       &builder,
@@ -2098,9 +2095,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, Atan2F32s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, TanhF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({-2.5f, 3.14f, 2.25f});
-  auto result = builder.Tanh(a);
+  builder.Tanh(a);
 
   ComputeAndCompareR1<float>(&builder, {-0.986614f, 0.996260f, 0.978026}, {},
                              error_spec_);
@@ -2110,7 +2107,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, TanhF32sVector) {
   // This is like the test ArrayElementwiseOpTest.TanhF32s above, except that
   // the input tensor is large enough to exercise the vectorized tanh
   // implementation on XLA CPU.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateR1<float>(
       {1.02,  -0.32, 0.85,  0.90,  1.23,  -0.91, -0.49, 0.80,  -0.67, 0.16,
        -0.07, 0.39,  -0.41, 0.04,  1.36,  1.25,  0.41,  0.65,  -1.08, 0.32,
@@ -2149,7 +2146,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, TanhF32sVector) {
 XLA_TEST_F(ArrayElementwiseOpTest, ExpF32sVector) {
   // The input tensor is large enough to exercise the vectorized exp
   // implementation on XLA CPU.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   // Just to help make sense of the scales here -- exp(89) saturates float32 and
   // exp(-10) is smaller than our error spec.
@@ -2185,7 +2182,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, ExpF32sVector) {
 XLA_TEST_F(ArrayElementwiseOpTest, LogF32sVector) {
   // The input tensor is large enough to exercise the vectorized exp
   // implementation on XLA CPU.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> input_literal = Literal::CreateR1<float>(
       {-1.29,    -1.41,    -1.25,    -13.5,    -11.7,    -17.9,    -198,
@@ -2225,14 +2222,14 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddChainFoldLeft) {
   //         /               /
   // b -----/               /
   // c---------------------/
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto a = builder.ConstantR1<float>({1.1f, 2.2f, 3.3f, 4.4f});
   auto b = builder.ConstantR1<float>({2.1f, 3.2f, 4.3f, 5.4f});
   auto c = builder.ConstantR1<float>({-3.3f, -15.5f, -7.7f, -29.9f});
 
   auto add = builder.Add(a, b);
-  auto add2 = builder.Add(add, c);
+  builder.Add(add, c);
 
   ComputeAndCompareR1<float>(&builder, {-0.1f, -10.1f, -0.1f, -20.1f}, {},
                              error_spec_);
@@ -2243,14 +2240,14 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddChainFoldRight) {
   //         /               /
   // c -----/               /
   // a---------------------/
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto a = builder.ConstantR1<float>({91.1f, 2.2f, 3.3f, 4.4f});
   auto b = builder.ConstantR1<float>({2.1f, 3.2f, 4.3f, 5.4f});
   auto c = builder.ConstantR1<float>({-3.3f, -15.5f, -7.7f, -29.9f});
 
   auto add = builder.Add(b, c);
-  auto add2 = builder.Add(a, add);
+  builder.Add(a, add);
 
   ComputeAndCompareR1<float>(&builder, {89.9f, -10.1f, -0.1f, -20.1f}, {},
                              error_spec_);
@@ -2260,14 +2257,14 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddWithNeg) {
   // a ----- (neg) ----- (add)
   //                    /
   // b ----- (neg) ----/
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto a = builder.ConstantR1<float>({91.1f, 2.2f, 3.3f, 4.4f});
   auto b = builder.ConstantR1<float>({2.1f, 3.2f, 4.3f, 5.4f});
 
   auto neg_a = builder.Neg(a);
   auto neg_b = builder.Neg(b);
-  auto result = builder.Add(neg_a, neg_b);
+  builder.Add(neg_a, neg_b);
 
   ComputeAndCompareR1<float>(&builder, {-93.2f, -5.4f, -7.6f, -9.8f}, {},
                              error_spec_);
@@ -2281,7 +2278,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddChainTwoSide) {
   // c ------ (add) ------------/
   //         /
   // d -----/
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto a = builder.ConstantR1<float>({91.1f, 2.2f, 3.3f, 4.4f});
   auto b = builder.ConstantR1<float>({2.1f, 3.2f, 4.3f, 5.4f});
@@ -2290,19 +2287,19 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddChainTwoSide) {
 
   auto add_ab = builder.Add(a, b);
   auto add_cd = builder.Add(c, d);
-  auto add_all = builder.Add(add_ab, add_cd);
+  builder.Add(add_ab, add_cd);
 
   ComputeAndCompareR1<float>(&builder, {70.9f, -0.1f, -40.1f, 0.1f}, {},
                              error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, 2DBinaryOpF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a =
       builder.ConstantR2<float>({{-2.5f, 3.14f, 1.0f}, {2.25f, -10.0f, 3.33f}});
   auto b =
       builder.ConstantR2<float>({{-1.5f, 8.14f, 42.0}, {-1.0f, -4.0f, 5.55f}});
-  auto add = builder.Add(a, b);
+  builder.Add(a, b);
 
   Array2D<float> expected_array(
       {{-4.0f, 11.28f, 43.0f}, {1.25f, -14.0f, 8.88f}});
@@ -2311,11 +2308,11 @@ XLA_TEST_F(ArrayElementwiseOpTest, 2DBinaryOpF32s) {
 
 XLA_TEST_F(ArrayElementwiseOpTest, ScalarPlus2DF32) {
   // Add a scalar + matrix.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a =
       builder.ConstantR2<float>({{-2.5f, 3.14f, 1.0f}, {2.25f, -10.0f, 3.33f}});
   auto scalar = builder.ConstantR0<float>(3.0f);
-  auto add = builder.Add(scalar, a);
+  builder.Add(scalar, a);
 
   Array2D<float> expected_array({{0.5f, 6.14f, 4.0f}, {5.25f, -7.0f, 6.33f}});
   ComputeAndCompareR2<float>(&builder, expected_array, {}, error_spec_);
@@ -2323,11 +2320,11 @@ XLA_TEST_F(ArrayElementwiseOpTest, ScalarPlus2DF32) {
 
 XLA_TEST_F(ArrayElementwiseOpTest, 2DPlusScalarF32) {
   // Add a matrix + scalar.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a =
       builder.ConstantR2<float>({{-2.5f, 3.14f, 1.0f}, {2.25f, -10.0f, 3.33f}});
   auto scalar = builder.ConstantR0<float>(3.0f);
-  auto add = builder.Add(a, scalar);
+  builder.Add(a, scalar);
 
   Array2D<float> expected_array({{0.5f, 6.14f, 4.0f}, {5.25f, -7.0f, 6.33f}});
   ComputeAndCompareR2<float>(&builder, expected_array, {}, error_spec_);
@@ -2336,14 +2333,14 @@ XLA_TEST_F(ArrayElementwiseOpTest, 2DPlusScalarF32) {
 XLA_TEST_F(ArrayElementwiseOpTest, Add1DTo2DF32) {
   // Test simple broadcasting of a R1F32 over R2F32. The vector's size matches
   // only dim 0 of the matrix.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto v = builder.ConstantR1<float>({20.0f, 40.0f, 60.0f});
   // clang-format off
   auto m = builder.ConstantR2<float>({
     {-2.5f, 3.14f, 1.0f},
     {2.25f, -10.0f, 3.33f}});
   // clang-format on
-  auto add = builder.Add(v, m, /*broadcast_dimensions=*/{1});
+  builder.Add(v, m, /*broadcast_dimensions=*/{1});
   Array2D<float> expected_array(
       {{17.5f, 43.14f, 61.0f}, {22.25f, 30.0f, 63.33f}});
   ComputeAndCompareR2<float>(&builder, expected_array, {}, error_spec_);
@@ -2369,10 +2366,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Eq) {
 
 XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Ne) {
   // Test broadcasting in Ne comparison.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto v = builder.ConstantR1<int32>({42, 73});
   auto m = builder.ConstantR2<int32>({{42, 73}, {42, 52}});
-  auto cmp = builder.Ne(v, m, /*broadcast_dimensions=*/{1});
+  builder.Ne(v, m, /*broadcast_dimensions=*/{1});
 
   const string expected = R"(pred[2,2] {
   { 00 },
@@ -2383,10 +2380,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Ne) {
 
 XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Ge) {
   // Test broadcasting in Ge comparison.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto v = builder.ConstantR1<int32>({1, 2, 3, 4});
   auto m = builder.ConstantR2<int32>({{1, 0, 5, 6}, {42, 52, 10, 4}});
-  auto cmp = builder.Ge(v, m, /*broadcast_dimensions=*/{1});
+  builder.Ge(v, m, /*broadcast_dimensions=*/{1});
 
   const string expected = R"(pred[2,4] {
   { 1100 },
@@ -2397,10 +2394,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Ge) {
 
 XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Gt) {
   // Test broadcasting in Gt comparison.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto v = builder.ConstantR1<int32>({1, 2, 3, 4});
   auto m = builder.ConstantR2<int32>({{1, 0, 5, 6}, {42, 52, 10, 4}});
-  auto cmp = builder.Gt(v, m, /*broadcast_dimensions=*/{1});
+  builder.Gt(v, m, /*broadcast_dimensions=*/{1});
 
   const string expected = R"(pred[2,4] {
   { 0100 },
@@ -2411,10 +2408,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Gt) {
 
 XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Le) {
   // Test broadcasting in Le comparison.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto v = builder.ConstantR1<int32>({1, 2, 3, 4});
   auto m = builder.ConstantR2<int32>({{1, 0, 5, 6}, {42, 52, 10, 4}});
-  auto cmp = builder.Le(v, m, /*broadcast_dimensions=*/{1});
+  builder.Le(v, m, /*broadcast_dimensions=*/{1});
 
   const string expected = R"(pred[2,4] {
   { 1011 },
@@ -2425,10 +2422,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Le) {
 
 XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Lt) {
   // Test broadcasting in Lt comparison.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto v = builder.ConstantR1<int32>({1, 2, 3, 4});
   auto m = builder.ConstantR2<int32>({{1, 0, 5, 6}, {42, 52, 10, 4}});
-  auto cmp = builder.Lt(v, m, /*broadcast_dimensions=*/{1});
+  builder.Lt(v, m, /*broadcast_dimensions=*/{1});
 
   const string expected = R"(pred[2,4] {
   { 0011 },
@@ -2440,24 +2437,24 @@ XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Lt) {
 XLA_TEST_F(ArrayElementwiseOpTest, Mul2Dby1DF32) {
   // Test simple broadcasting of a R1F32 over R2F32 when the order of binary op
   // arguments is reversed.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto m = builder.ConstantR2<float>({{1.5f, 2.5f, 3.5f}, {4.5f, 5.5f, 6.5f}});
   auto v = builder.ConstantR1<float>({2.0f, 4.0f, 6.0f});
-  auto add = builder.Mul(m, v, /*broadcast_dimensions=*/{1});
+  builder.Mul(m, v, /*broadcast_dimensions=*/{1});
   Array2D<float> expected_array({{3.0f, 10.0f, 21.0f}, {9.0f, 22.0f, 39.0f}});
   ComputeAndCompareR2<float>(&builder, expected_array, {}, error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, Add2DTo2DWithDegenerateDim1) {
   // Tests broadcasting for arrays with degenerate (size == 1) dimensions.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   // m's shape in XLA notation is {3, 2}
   // md's shape in XLA notation is {3, 1}
   // The result has shape {3, 2}, where md is broadcast over m
   auto m =
       builder.ConstantR2<float>({{-2.5f, 3.14f, 1.0f}, {2.25f, -10.0f, 3.33f}});
   auto md = builder.ConstantR2<float>({{10.0f, 20.0f, 30.0f}});
-  auto add = builder.Add(m, md);
+  builder.Add(m, md);
   Array2D<float> expected_array(
       {{7.5f, 23.14f, 31.0f}, {12.25f, 10.0f, 33.33f}});
   ComputeAndCompareR2<float>(&builder, expected_array, {}, error_spec_);
@@ -2465,14 +2462,14 @@ XLA_TEST_F(ArrayElementwiseOpTest, Add2DTo2DWithDegenerateDim1) {
 
 XLA_TEST_F(ArrayElementwiseOpTest, Add2DTo2DWithDegenerateDim0) {
   // Tests broadcasting for arrays with degenerate (size == 1) dimensions.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   // m's shape in XLA notation is {3, 2}
   // md's shape in XLA notation is {1, 2}
   // The result has shape {3, 2}, where md is broadcast over m
   auto m =
       builder.ConstantR2<float>({{-2.5f, 3.14f, 1.0f}, {2.25f, -10.0f, 3.33f}});
   auto md = builder.ConstantR2<float>({{10.0f}, {20.0f}});
-  auto add = builder.Add(m, md);
+  builder.Add(m, md);
   Array2D<float> expected_array(
       {{7.5f, 13.14f, 11.0f}, {22.25f, 10.0f, 23.33f}});
   ComputeAndCompareR2<float>(&builder, expected_array, {}, error_spec_);
@@ -2483,13 +2480,13 @@ XLA_TEST_F(ArrayElementwiseOpTest, Add2DsWithDegenerateDimsOuterProduct) {
   // effectively creates an "outer product" operation.
   // This is taken from the Numpy docs example at:
   // http://docs.scipy.org/doc/numpy-1.10.1/user/basics.broadcasting.html
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   // a's shape in XLA notation is {1, 4}
   // b's shape in XLA notation is {3, 1}
   // The result has shape {3, 4}.
   auto a = builder.ConstantR2<float>({{0.0f}, {10.0f}, {20.0f}, {30.0f}});
   auto b = builder.ConstantR2<float>({{1.0f, 2.0f, 3.0f}});
-  auto add = builder.Add(a, b);
+  builder.Add(a, b);
   Array2D<float> expected_array({{1.0f, 2.0f, 3.0f},
                                  {11.0f, 12.0f, 13.0f},
                                  {21.0f, 22.0f, 23.0f},
@@ -2500,10 +2497,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, Add2DsWithDegenerateDimsOuterProduct) {
 XLA_TEST_F(ArrayElementwiseOpTest, Add1DTo2DF32TwoWaysOver1) {
   // Add together a (2,2) array and a (2) array, using dimension 0 for
   // broadcasting (though there are two ways to broadcast these shapes).
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto v = builder.ConstantR1<float>({20.0f, 40.0f});
   auto m = builder.ConstantR2<float>({{10.0f, 50.0f}, {77.0f, 88.0f}});
-  auto add = builder.Add(v, m, /*broadcast_dimensions=*/{1});
+  builder.Add(v, m, /*broadcast_dimensions=*/{1});
   Array2D<float> expected_array({{30.0f, 90.0f}, {97.0f, 128.0f}});
   ComputeAndCompareR2<float>(&builder, expected_array, {}, error_spec_);
 }
@@ -2511,17 +2508,17 @@ XLA_TEST_F(ArrayElementwiseOpTest, Add1DTo2DF32TwoWaysOver1) {
 XLA_TEST_F(ArrayElementwiseOpTest, Add1DTo2DF32TwoWaysOver0) {
   // Add together a (2,2) array and a (2) array, using dimension 1 for
   // broadcasting (though there are two ways to broadcast these shapes).
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto v = builder.ConstantR1<float>({20.0f, 40.0f});
   auto m = builder.ConstantR2<float>({{10.0f, 50.0f}, {77.0f, 88.0f}});
-  auto add = builder.Add(v, m, /*broadcast_dimensions=*/{0});
+  builder.Add(v, m, /*broadcast_dimensions=*/{0});
   Array2D<float> expected_array({{30.0f, 70.0f}, {117.0f, 128.0f}});
   ComputeAndCompareR2<float>(&builder, expected_array, {}, error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, 3DBinaryOpF32s) {
   // Binary add of two R3s together
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array3D<float> a_3d({{{1.0f, 2.0f}, {3.0f, 4.0f}, {5.0f, 6.0f}},
                        {{7.0f, 8.0f}, {9.0f, 10.0f}, {11.0f, 12.0f}}});
   auto a = builder.ConstantR3FromArray3D<float>(a_3d);
@@ -2529,7 +2526,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, 3DBinaryOpF32s) {
   Array3D<float> b_3d({{{2.0f, 4.0f}, {6.0f, 8.0f}, {10.0f, 12.0f}},
                        {{14.0f, 16.0f}, {18.0f, 20.0f}, {22.0f, 24.0f}}});
   auto b = builder.ConstantR3FromArray3D<float>(b_3d);
-  auto add = builder.Add(a, b);
+  builder.Add(a, b);
 
   Array3D<float> expected_3d(
       {{{3.0f, 6.0f}, {9.0f, 12.0f}, {15.0f, 18.0f}},
@@ -2540,7 +2537,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, 3DBinaryOpF32s) {
 XLA_TEST_F(ArrayElementwiseOpTest, Add1DTo3DTwoWaysOver2) {
   // Add together a (2, 3, 2) array with a (2) array, using dimension 0 for
   // broadcasting (though there are two ways to broadcast these shapes).
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   // clang-format off
   Array3D<float> a_3d({
     {{1.0f, 2.0f},
@@ -2553,7 +2550,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Add1DTo3DTwoWaysOver2) {
   // clang-format on
   auto a = builder.ConstantR3FromArray3D<float>(a_3d);
   auto v = builder.ConstantR1<float>({10.0f, 20.0f});
-  auto add = builder.Add(a, v, /*broadcast_dimensions=*/{2});
+  builder.Add(a, v, /*broadcast_dimensions=*/{2});
 
   Array3D<float> expected_3d(
       {{{11.0f, 22.0f}, {13.0f, 24.0f}, {15.0f, 26.0f}},
@@ -2564,7 +2561,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Add1DTo3DTwoWaysOver2) {
 XLA_TEST_F(ArrayElementwiseOpTest, Add1DTo3DTwoWaysOver0) {
   // Add together a (2, 3, 2) array with a (2) array, using dimension 2 for
   // broadcasting (though there are two ways to broadcast these shapes).
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   // clang-format off
   Array3D<float> a_3d({
     {{1.0f, 2.0f},
@@ -2577,7 +2574,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Add1DTo3DTwoWaysOver0) {
   // clang-format on
   auto a = builder.ConstantR3FromArray3D<float>(a_3d);
   auto v = builder.ConstantR1<float>({10.0f, 20.0f});
-  auto add = builder.Add(a, v, /*broadcast_dimensions=*/{0});
+  builder.Add(a, v, /*broadcast_dimensions=*/{0});
 
   // clang-format off
   Array3D<float> expected_3d({
@@ -2595,7 +2592,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Add1DTo3DTwoWaysOver0) {
 XLA_TEST_F(ArrayElementwiseOpTest, Add2DTo3D) {
   // Add together a (2, 3, 2) array with a (3, 2) array, using dimensions {1,2}
   // for broadcasting.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   // clang-format off
   Array3D<float> a_3d({
     {{1.0f, 2.0f},
@@ -2610,7 +2607,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Add2DTo3D) {
     {10.0f, 20.0f, 30.0f},
     {40.0f, 50.0f, 60.0f},
   });
-  auto add = builder.Add(a, m, /*broadcast_dimensions=*/{0, 1});
+  builder.Add(a, m, /*broadcast_dimensions=*/{0, 1});
 
   Array3D<float> expected_3d({
     {{11.0f, 12.0f},
@@ -2627,7 +2624,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Add2DTo3D) {
 XLA_TEST_F(ArrayElementwiseOpTest, CompareGtR3F32sWithDegenerateDim2) {
   // Comparison between two 3D arrays of compatible shapes:
   // (2, 3, 2) and (2, 3, 1): expected to produce a (2, 3, 2) shape of PREDs.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array3D<float> a_3d({{{1.0f, 2.0f}, {3.0f, 4.0f}, {5.0f, 6.0f}},
                        {{7.0f, 8.0f}, {9.0f, 10.0f}, {11.0f, 12.0f}}});
   auto a = builder.ConstantR3FromArray3D<float>(a_3d);
@@ -2635,7 +2632,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareGtR3F32sWithDegenerateDim2) {
   Array3D<float> b_3d({{{7.0f, 1.0f}, {3.0f, 10.0f}, {15.0f, 6.0f}}});
   auto b = builder.ConstantR3FromArray3D<float>(b_3d);
 
-  auto compare = builder.Gt(a, b);
+  builder.Gt(a, b);
 
   Array3D<int> expected_3d(
       {{{0, 1}, {0, 0}, {0, 0}}, {{0, 1}, {1, 0}, {0, 1}}});
@@ -2651,7 +2648,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareGtR3F32sWithDegenerateDim2) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, 4DBinaryOpF32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Array4D<float>> operand_a_4d(new Array4D<float>(2, 3, 4, 5));
   std::unique_ptr<Array4D<float>> operand_b_4d(new Array4D<float>(2, 3, 4, 5));
@@ -2672,13 +2669,13 @@ XLA_TEST_F(ArrayElementwiseOpTest, 4DBinaryOpF32s) {
 
   auto a = builder.ConstantR4FromArray4D<float>(*operand_a_4d);
   auto b = builder.ConstantR4FromArray4D<float>(*operand_b_4d);
-  auto add = builder.Add(a, b);
+  builder.Add(a, b);
 
   ComputeAndCompareR4<float>(&builder, *expected_4d, {}, error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, R4PlusR1InDim1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Array4D<float>> operand_a_4d(new Array4D<float>(2, 3, 4, 5));
   std::unique_ptr<Array4D<float>> expected_4d(new Array4D<float>(2, 3, 4, 5));
@@ -2700,7 +2697,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, R4PlusR1InDim1) {
 
   auto a = builder.ConstantR4FromArray4D<float>(*operand_a_4d);
   auto b = builder.ConstantR1<float>(operand_b_1d);
-  auto add = builder.Add(a, b, {1});
+  builder.Add(a, b, {1});
 
   ComputeAndCompareR4<float>(&builder, *expected_4d, {}, error_spec_);
 }
@@ -2715,7 +2712,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, R4_16x16x2x2_Plus_R1_16) {
   std::vector<float> r1(d1);
   std::iota(r1.begin(), r1.end(), 1.0);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> a_literal = Literal::CreateR4FromArray4DWithLayout(
       r4, LayoutUtil::MakeLayout({0, 1, 2, 3}));
   auto a = builder.ConstantLiteral(*a_literal);
@@ -2736,11 +2733,11 @@ XLA_TEST_F(ArrayElementwiseOpTest, R4_16x16x2x2_Plus_R1_16) {
 
 // Show that we can't add two opaques.
 XLA_TEST_F(ArrayElementwiseOpTest, CannotAddOpaques) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto shape = ShapeUtil::MakeOpaqueShape();
   auto x = builder.Parameter(0, shape, "x");
-  auto concatenated = builder.Add(x, x);
-  StatusOr<Computation> computation_status = builder.Build();
+  builder.Add(x, x);
+  auto computation_status = builder.Build();
   ASSERT_FALSE(computation_status.ok());
   EXPECT_THAT(computation_status.status().ToString(),
               ::testing::ContainsRegex(
@@ -2748,12 +2745,12 @@ XLA_TEST_F(ArrayElementwiseOpTest, CannotAddOpaques) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, IdentityBroadcastOfSameRankIsAllowed) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a =
       builder.ConstantR2<float>({{-2.5f, 3.14f, 1.0f}, {2.25f, -10.0f, 3.33f}});
   auto b =
       builder.ConstantR2<float>({{-1.5f, 8.14f, 42.0}, {-1.0f, -4.0f, 5.55f}});
-  auto add = builder.Add(a, b, /*broadcast_dimensions=*/{0, 1});
+  builder.Add(a, b, /*broadcast_dimensions=*/{0, 1});
 
   Array2D<float> expected_array(
       {{-4.0f, 11.28f, 43.0f}, {1.25f, -14.0f, 8.88f}});
@@ -2761,14 +2758,14 @@ XLA_TEST_F(ArrayElementwiseOpTest, IdentityBroadcastOfSameRankIsAllowed) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, NonIdentityBroadcastOfSameRankIsDisallowed) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a =
       builder.ConstantR2<float>({{-2.5f, 3.14f, 1.0f}, {2.25f, -10.0f, 3.33f}});
   auto b =
       builder.ConstantR2<float>({{-1.5f, 8.14f, 42.0}, {-1.0f, -4.0f, 5.55f}});
-  auto add = builder.Add(a, b, /*broadcast_dimensions=*/{1, 0});
+  builder.Add(a, b, /*broadcast_dimensions=*/{1, 0});
 
-  StatusOr<Computation> computation_status = builder.Build();
+  auto computation_status = builder.Build();
   ASSERT_FALSE(computation_status.ok());
   EXPECT_THAT(computation_status.status().error_message(),
               ::testing::ContainsRegex("must.*be the identity"));
diff --git a/tensorflow/compiler/xla/tests/axpy_simple_test.cc b/tensorflow/compiler/xla/tests/axpy_simple_test.cc
index 3f6fd7c65d3360a622dbf754833009fb20410535..ec3b46acfec0ee0ff514a862ce5b1ca74279efa8 100644
--- a/tensorflow/compiler/xla/tests/axpy_simple_test.cc
+++ b/tensorflow/compiler/xla/tests/axpy_simple_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
@@ -28,11 +29,11 @@ namespace {
 class AxpySimpleTest : public ClientLibraryTestBase {};
 
 TEST_F(AxpySimpleTest, AxTenValues) {
-  ComputationBuilder builder(client_, "ax_10");
+  XlaBuilder builder("ax_10");
   auto alpha = builder.ConstantR0<float>(3.1415926535);
   auto x = builder.ConstantR1<float>(
       {-1.0, 1.0, 2.0, -2.0, -3.0, 3.0, 4.0, -4.0, -5.0, 5.0});
-  auto ax = builder.Mul(alpha, x);
+  builder.Mul(alpha, x);
 
   std::vector<float> expected = {
       -3.14159265, 3.14159265,  6.28318531,   -6.28318531,  -9.42477796,
@@ -46,7 +47,7 @@ XLA_TEST_F(AxpySimpleTest, AxpyZeroValues) {
   auto x = builder.ConstantR1<float>({});
   auto y = builder.ConstantR1<float>({});
   auto ax = builder.Mul(alpha, x);
-  auto axpy = builder.Add(ax, y);
+  builder.Add(ax, y);
 
   std::vector<float> expected = {};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
@@ -60,7 +61,7 @@ TEST_F(AxpySimpleTest, AxpyTenValues) {
   auto y = builder.ConstantR1<float>(
       {5.0, -5.0, -4.0, 4.0, 3.0, -3.0, -2.0, 2.0, 1.0, -1.0});
   auto ax = builder.Mul(alpha, x);
-  auto axpy = builder.Add(ax, y);
+  builder.Add(ax, y);
 
   TF_ASSERT_OK_AND_ASSIGN(ProgramShape shape, builder.GetProgramShape());
 
diff --git a/tensorflow/compiler/xla/tests/batch_normalization_test.cc b/tensorflow/compiler/xla/tests/batch_normalization_test.cc
index 28ab9654997728fbafd6610af840e721e72cce5a..af8af99c791e2a40cfcfa2291b786b33e5652267 100644
--- a/tensorflow/compiler/xla/tests/batch_normalization_test.cc
+++ b/tensorflow/compiler/xla/tests/batch_normalization_test.cc
@@ -69,6 +69,17 @@ class BatchNormalizationTest
     CHECK_EQ(kY, input_array_.width());
   }
 
+  ComputationDataHandle CheckShape(ComputationBuilder* b,
+                                   const ComputationDataHandle& operand,
+                                   const Shape& expected_shape) const {
+    std::unique_ptr<Shape> actual_shape =
+        b->GetShape(operand).ConsumeValueOrDie();
+    CHECK(ShapeUtil::Equal(expected_shape, *actual_shape))
+        << "want " << ShapeUtil::HumanString(expected_shape) << " got "
+        << ShapeUtil::HumanString(*actual_shape);
+    return operand;
+  }
+
   static constexpr int64 kSamples = 3;
   static constexpr int64 kX = 1;
   static constexpr int64 kY = 1;
@@ -164,14 +175,15 @@ XLA_TEST_P(BatchNormalizationTest, VarianceToStddev) {
 XLA_TEST_P(BatchNormalizationTest, SpecComparisonForward) {
   ComputationBuilder builder(client_, "batch_normalize_per_spec");
   auto input_activations =
-      builder.CheckShape(builder.ConstantLiteral(input_literal_),
-                         ShapeUtil::MakeShape(F32, {3, 2, 1, 1}));
+      CheckShape(&builder, builder.ConstantLiteral(input_literal_),
+                 ShapeUtil::MakeShape(F32, {3, 2, 1, 1}));
   auto gamma = builder.ConstantR1<float>({1.0, 1.0});
   auto beta = builder.ConstantR1<float>({0.0, 0.0});
   Computation add = CreateScalarAddComputation(F32, &builder);
   // Reduce all dimensions except dimension 1.
   Shape TwoElementVectorF32 = ShapeUtil::MakeShape(F32, {2});
-  auto sum = builder.CheckShape(
+  auto sum = CheckShape(
+      &builder,
       builder.Reduce(input_activations, builder.ConstantR0<float>(0.0f), add,
                      /*dimensions_to_reduce=*/{0, 2, 3}),
       TwoElementVectorF32);
@@ -187,14 +199,16 @@ XLA_TEST_P(BatchNormalizationTest, SpecComparisonForward) {
   auto activation_deviations = builder.Sub(input_activations, set_means,
                                            /*broadcast_dimensions=*/{1});
   auto dev_squares = builder.SquareF32(activation_deviations);
-  auto sum_of_squares = builder.CheckShape(
+  auto sum_of_squares = CheckShape(
+      &builder,
       builder.Reduce(dev_squares, builder.ConstantR0<float>(0.0f), add,
                      /*dimensions_to_reduce=*/{0, 2, 3}),
       TwoElementVectorF32);
   auto variance = builder.Div(sum_of_squares, count);
   auto standard_deviation = builder.SqrtF32(variance);
-  auto standard_deviation_above_epsilon = builder.CheckShape(
-      builder.Gt(standard_deviation, epsilon), ShapeUtil::MakeShape(PRED, {2}));
+  auto standard_deviation_above_epsilon =
+      CheckShape(&builder, builder.Gt(standard_deviation, epsilon),
+                 ShapeUtil::MakeShape(PRED, {2}));
   auto gt_eps = builder.Select(standard_deviation_above_epsilon,
                                standard_deviation, epsilon2);
   auto normalization_factors = builder.ReciprocalF32(gt_eps);
diff --git a/tensorflow/compiler/xla/tests/build_defs.bzl b/tensorflow/compiler/xla/tests/build_defs.bzl
index 610302ac1256a57db6ed6e18016a4136973e3891..eac2eb286c3f7a1cd33aed03686e99ef753b773a 100644
--- a/tensorflow/compiler/xla/tests/build_defs.bzl
+++ b/tensorflow/compiler/xla/tests/build_defs.bzl
@@ -137,7 +137,8 @@ def xla_test(name,
       backend_deps += ["//tensorflow/compiler/xla/tests:test_macros_gpu"]
       this_backend_tags += ["requires-gpu-sm35"]
     elif backend in plugins:
-      backend_deps = plugins[backend]["deps"]
+      backend_deps = []
+      backend_deps += plugins[backend]["deps"]
       this_backend_copts += plugins[backend]["copts"]
       this_backend_tags += plugins[backend]["tags"]
       this_backend_args += plugins[backend]["args"]
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.cc b/tensorflow/compiler/xla/tests/client_library_test_base.cc
index a677986cd926cc0054d8f36abc98ccac33dc043d..17c6a83c1a3153f78da7f5f6c9b76542bc564203 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.cc
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.cc
@@ -95,6 +95,20 @@ StatusOr<std::unique_ptr<Literal>> ClientLibraryTestBase::ExecuteAndTransfer(
                                      &execution_options);
 }
 
+StatusOr<std::unique_ptr<Literal>> ClientLibraryTestBase::ExecuteAndTransfer(
+    const XlaComputation& computation,
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments,
+    const Shape* shape_with_output_layout) {
+  ExecutionOptions execution_options = execution_options_;
+  if (shape_with_output_layout != nullptr) {
+    *execution_options.mutable_shape_with_output_layout() =
+        *shape_with_output_layout;
+  }
+  return client_->ExecuteAndTransfer(computation, arguments,
+                                     &execution_options);
+}
+
+template <>
 StatusOr<std::unique_ptr<Literal>> ClientLibraryTestBase::ExecuteAndTransfer(
     ComputationBuilder* builder,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments,
@@ -104,6 +118,15 @@ StatusOr<std::unique_ptr<Literal>> ClientLibraryTestBase::ExecuteAndTransfer(
   return ExecuteAndTransfer(computation, arguments, shape_with_output_layout);
 }
 
+template <>
+StatusOr<std::unique_ptr<Literal>> ClientLibraryTestBase::ExecuteAndTransfer(
+    XlaBuilder* builder, tensorflow::gtl::ArraySlice<GlobalData*> arguments,
+    const Shape* shape_with_output_layout) {
+  // Build the computation, as a convenience.
+  TF_ASSIGN_OR_RETURN(auto computation, builder->Build());
+  return ExecuteAndTransfer(computation, arguments, shape_with_output_layout);
+}
+
 std::unique_ptr<GlobalData> ClientLibraryTestBase::ExecuteOrDie(
     ComputationBuilder* builder,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
@@ -116,14 +139,31 @@ std::unique_ptr<Literal> ClientLibraryTestBase::ExecuteAndTransferOrDie(
   return ExecuteAndTransfer(builder, arguments).ConsumeValueOrDie();
 }
 
+string ClientLibraryTestBase::ExecuteToString(
+    XlaBuilder* builder, tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
+  auto computation_status = builder->Build();
+  if (!computation_status.ok()) {
+    return computation_status.status().ToString();
+  }
+  auto computation = computation_status.ConsumeValueOrDie();
+
+  auto result =
+      client_->ExecuteAndTransfer(computation, arguments, &execution_options_);
+  if (!result.ok()) {
+    return result.status().ToString();
+  } else {
+    return result.ValueOrDie()->ToString();
+  }
+}
+
 string ClientLibraryTestBase::ExecuteToString(
     ComputationBuilder* builder,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
-  StatusOr<Computation> computation_status = builder->Build();
+  auto computation_status = builder->Build();
   if (!computation_status.ok()) {
     return computation_status.status().ToString();
   }
-  Computation computation = computation_status.ConsumeValueOrDie();
+  auto computation = computation_status.ConsumeValueOrDie();
 
   auto result =
       client_->ExecuteAndTransfer(computation, arguments, &execution_options_);
@@ -142,16 +182,18 @@ void ClientLibraryTestBase::ComputeAndCompareR1(
                                                   arguments);
 }
 
+template <typename BuilderT>
 void ClientLibraryTestBase::ComputeAndCompareLiteral(
-    ComputationBuilder* builder, const Literal& expected,
+    BuilderT* builder, const Literal& expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments,
     const Shape* shape_with_layout) {
   EXPECT_IS_OK(ComputeAndCompareLiteralWithStatus(builder, expected, arguments,
                                                   shape_with_layout));
 }
 
+template <typename BuilderT>
 void ClientLibraryTestBase::ComputeAndCompareLiteral(
-    ComputationBuilder* builder, const Literal& expected,
+    BuilderT* builder, const Literal& expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error,
     const Shape* shape_with_layout) {
   EXPECT_IS_OK(ComputeAndCompareLiteralWithStatus(builder, expected, arguments,
@@ -249,8 +291,28 @@ ClientLibraryTestBase::ComputeAndCompareLiteralWithAllInputLayouts(
   return choose(0);
 }
 
+tensorflow::Status
+ClientLibraryTestBase::ComputeAndCompareLiteralWithAllOutputLayouts(
+    const xla::XlaComputation& /*computation*/, const Literal& /*expected*/,
+    tensorflow::gtl::ArraySlice<GlobalData*> /*arguments*/,
+    const std::function<void(const Literal& actual,
+                             const string& error_message)>& /*verify_output*/) {
+  return Unimplemented("not yet implemented for XlaComputation");
+}
+
+tensorflow::Status
+ClientLibraryTestBase::ComputeAndCompareLiteralWithAllInputLayouts(
+    const xla::XlaComputation& /*computation*/, const Literal& /*expected*/,
+    tensorflow::gtl::ArraySlice<GlobalData*> /*arguments*/,
+    const std::function<void(const Literal& actual,
+                             const string& error_message)>& /*verify_output*/,
+    const Shape* /*output_with_layout*/) {
+  return Unimplemented("not yet implemented for XlaComputation");
+}
+
+template <typename BuilderT>
 tensorflow::Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus(
-    ComputationBuilder* builder, const Literal& expected,
+    BuilderT* builder, const Literal& expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments_passed_in,
     const Shape* shape_with_layout) {
   std::vector<GlobalData*> arguments(arguments_passed_in.begin(),
@@ -307,8 +369,9 @@ tensorflow::Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus(
   return tensorflow::Status::OK();
 }
 
+template <typename BuilderT>
 tensorflow::Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus(
-    ComputationBuilder* builder, const Literal& expected,
+    BuilderT* builder, const Literal& expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments_passed_in,
     ErrorSpec error, const Shape* shape_with_layout) {
   std::vector<GlobalData*> arguments(arguments_passed_in.begin(),
@@ -378,8 +441,9 @@ void ClientLibraryTestBase::ComputeAndCompareR1U8(
   EXPECT_EQ(expected, actual->GetR1U8AsString());
 }
 
+template <typename BuilderT>
 void ClientLibraryTestBase::ComputeAndCompareTuple(
-    ComputationBuilder* builder, const Literal& expected,
+    BuilderT* builder, const Literal& expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
   auto actual_status = ExecuteAndTransfer(builder, arguments);
   EXPECT_IS_OK(actual_status.status());
@@ -390,8 +454,9 @@ void ClientLibraryTestBase::ComputeAndCompareTuple(
   LiteralTestUtil::ExpectEqual(expected, *actual);
 }
 
+template <typename BuilderT>
 void ClientLibraryTestBase::ComputeAndCompareTuple(
-    ComputationBuilder* builder, const Literal& expected,
+    BuilderT* builder, const Literal& expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error) {
   auto actual_status = ExecuteAndTransfer(builder, arguments);
   EXPECT_IS_OK(actual_status.status());
@@ -522,33 +587,6 @@ ClientLibraryTestBase::CreatePatternedMatrixWithZeroPadding(int rows, int cols,
   return array;
 }
 
-std::unique_ptr<GlobalData>
-ClientLibraryTestBase::CreateParameterAndTransferLiteral(
-    int64 parameter_number, const Literal& literal, const string& name,
-    ComputationBuilder* builder, ComputationDataHandle* data_handle) {
-  return CreateParameterAndTransferLiteral(parameter_number, literal, name,
-                                           nullptr, builder, data_handle);
-}
-
-std::unique_ptr<GlobalData>
-ClientLibraryTestBase::CreateParameterAndTransferLiteral(
-    int64 parameter_number, const Literal& literal, const string& name,
-    const DeviceHandle* device_handle, ComputationBuilder* builder,
-    ComputationDataHandle* data_handle) {
-  const Literal* param_literal = &literal;
-  std::unique_ptr<Literal> converted_literal;
-  if (use_bfloat16_) {
-    converted_literal = LiteralTestUtil::ConvertF32ToBF16(literal);
-    param_literal = converted_literal.get();
-  }
-  std::unique_ptr<GlobalData> data =
-      client_->TransferToServer(*param_literal, device_handle)
-          .ConsumeValueOrDie();
-  *data_handle =
-      builder->Parameter(parameter_number, param_literal->shape(), name);
-  return data;
-}
-
 ComputationDataHandle ClientLibraryTestBase::AddParam(
     const Literal& argument, ComputationBuilder* builder) {
   ComputationDataHandle data_handle;
@@ -563,4 +601,46 @@ ComputationDataHandle ClientLibraryTestBase::CreateConstantFromLiteral(
       use_bfloat16_ ? *LiteralTestUtil::ConvertF32ToBF16(literal) : literal);
 }
 
+XlaOp ClientLibraryTestBase::CreateConstantFromLiteral(const Literal& literal,
+                                                       XlaBuilder* builder) {
+  return builder->ConstantLiteral(
+      use_bfloat16_ ? *LiteralTestUtil::ConvertF32ToBF16(literal) : literal);
+}
+
+template void ClientLibraryTestBase::ComputeAndCompareLiteral(
+    ComputationBuilder* builder, const Literal& expected,
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments,
+    const Shape* shape_with_layout);
+
+template void ClientLibraryTestBase::ComputeAndCompareLiteral(
+    XlaBuilder* builder, const Literal& expected,
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments,
+    const Shape* shape_with_layout);
+
+template void ClientLibraryTestBase::ComputeAndCompareLiteral(
+    ComputationBuilder* builder, const Literal& expected,
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error,
+    const Shape* shape_with_layout);
+
+template void ClientLibraryTestBase::ComputeAndCompareLiteral(
+    XlaBuilder* builder, const Literal& expected,
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error,
+    const Shape* shape_with_layout);
+
+template void ClientLibraryTestBase::ComputeAndCompareTuple(
+    ComputationBuilder* builder, const Literal& expected,
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments);
+
+template void ClientLibraryTestBase::ComputeAndCompareTuple(
+    XlaBuilder* builder, const Literal& expected,
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments);
+
+template void ClientLibraryTestBase::ComputeAndCompareTuple(
+    ComputationBuilder* builder, const Literal& expected,
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error);
+
+template void ClientLibraryTestBase::ComputeAndCompareTuple(
+    XlaBuilder* builder, const Literal& expected,
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error);
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h
index ba0319990bc04196386e6812b0a03671676698ec..52f31b06698a424929df0ea1425ca66b5ac96a18 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.h
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.h
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -94,15 +95,25 @@ class ClientLibraryTestBase : public ::testing::Test {
   StatusOr<std::unique_ptr<GlobalData>> Execute(
       ComputationBuilder* builder,
       tensorflow::gtl::ArraySlice<GlobalData*> arguments);
+
+  // TODO(b/74197823): Remove the template type 'BuilderT' in all methods once
+  // the migration to XlaBuilder is complete.
+
+  template <typename BuilderT>
   StatusOr<std::unique_ptr<Literal>> ExecuteAndTransfer(
-      ComputationBuilder* builder,
-      tensorflow::gtl::ArraySlice<GlobalData*> arguments,
+      BuilderT* builder, tensorflow::gtl::ArraySlice<GlobalData*> arguments,
       const Shape* shape_with_output_layout = nullptr);
+
   StatusOr<std::unique_ptr<Literal>> ExecuteAndTransfer(
       const Computation& computation,
       tensorflow::gtl::ArraySlice<GlobalData*> arguments,
       const Shape* shape_with_output_layout = nullptr);
 
+  StatusOr<std::unique_ptr<Literal>> ExecuteAndTransfer(
+      const XlaComputation& computation,
+      tensorflow::gtl::ArraySlice<GlobalData*> arguments,
+      const Shape* shape_with_output_layout = nullptr);
+
   // Convenience OrDie variants of above methods.
   std::unique_ptr<GlobalData> ExecuteOrDie(
       ComputationBuilder* builder,
@@ -113,29 +124,31 @@ class ClientLibraryTestBase : public ::testing::Test {
 
   // Run a computation and return its value as a string. If an error
   // occurs, then instead return the error as a string.
+  string ExecuteToString(XlaBuilder* builder,
+                         tensorflow::gtl::ArraySlice<GlobalData*> arguments);
   string ExecuteToString(ComputationBuilder* builder,
                          tensorflow::gtl::ArraySlice<GlobalData*> arguments);
 
   // Convenience methods for building and running a computation, transferring
   // the result, and comparing it to the expected value(s). Methods are
   // templated on the native host type which maps to specific XLA types (See
-  // ComputationBuilder for details). For each rank, two forms are provided: one
-  // for floating point types with an ErrorSpec parameter, and one for integral
-  // types without the ErrorSpec parameter.
-  template <typename NativeT>
-  void ComputeAndCompareR0(ComputationBuilder* builder, NativeT expected,
+  // ComputationBuilder/XlaBuilder for details). For each rank, two forms are
+  // provided: one for floating point types with an ErrorSpec parameter, and one
+  // for integral types without the ErrorSpec parameter.
+  template <typename NativeT, typename BuilderT>
+  void ComputeAndCompareR0(BuilderT* builder, NativeT expected,
                            tensorflow::gtl::ArraySlice<GlobalData*> arguments);
-  template <typename NativeT>
-  void ComputeAndCompareR0(ComputationBuilder* builder, NativeT expected,
+  template <typename NativeT, typename BuilderT>
+  void ComputeAndCompareR0(BuilderT* builder, NativeT expected,
                            tensorflow::gtl::ArraySlice<GlobalData*> arguments,
                            ErrorSpec error);
 
-  template <typename NativeT>
-  void ComputeAndCompareR1(ComputationBuilder* builder,
+  template <typename NativeT, typename BuilderT>
+  void ComputeAndCompareR1(BuilderT* builder,
                            tensorflow::gtl::ArraySlice<NativeT> expected,
                            tensorflow::gtl::ArraySlice<GlobalData*> arguments);
-  template <typename NativeT>
-  void ComputeAndCompareR1(ComputationBuilder* builder,
+  template <typename NativeT, typename BuilderT>
+  void ComputeAndCompareR1(BuilderT* builder,
                            tensorflow::gtl::ArraySlice<NativeT> expected,
                            tensorflow::gtl::ArraySlice<GlobalData*> arguments,
                            ErrorSpec error);
@@ -146,55 +159,53 @@ class ClientLibraryTestBase : public ::testing::Test {
                            const tensorflow::core::Bitmap& expected,
                            tensorflow::gtl::ArraySlice<GlobalData*> arguments);
 
-  template <typename NativeT>
-  void ComputeAndCompareR2(ComputationBuilder* builder,
-                           const Array2D<NativeT>& expected,
+  template <typename NativeT, typename BuilderT>
+  void ComputeAndCompareR2(BuilderT* builder, const Array2D<NativeT>& expected,
                            tensorflow::gtl::ArraySlice<GlobalData*> arguments);
-  template <typename NativeT>
-  void ComputeAndCompareR2(ComputationBuilder* builder,
-                           const Array2D<NativeT>& expected,
+  template <typename NativeT, typename BuilderT>
+  void ComputeAndCompareR2(BuilderT* builder, const Array2D<NativeT>& expected,
                            tensorflow::gtl::ArraySlice<GlobalData*> arguments,
                            ErrorSpec error);
 
-  template <typename NativeT>
-  void ComputeAndCompareR3(ComputationBuilder* builder,
-                           const Array3D<NativeT>& expected,
+  template <typename NativeT, typename BuilderT>
+  void ComputeAndCompareR3(BuilderT* builder, const Array3D<NativeT>& expected,
                            tensorflow::gtl::ArraySlice<GlobalData*> arguments);
-  template <typename NativeT>
-  void ComputeAndCompareR3(ComputationBuilder* builder,
-                           const Array3D<NativeT>& expected,
+  template <typename NativeT, typename BuilderT>
+  void ComputeAndCompareR3(BuilderT* builder, const Array3D<NativeT>& expected,
                            tensorflow::gtl::ArraySlice<GlobalData*> arguments,
                            ErrorSpec error);
 
-  template <typename NativeT>
-  void ComputeAndCompareR4(ComputationBuilder* builder,
-                           const Array4D<NativeT>& expected,
+  template <typename NativeT, typename BuilderT>
+  void ComputeAndCompareR4(BuilderT* builder, const Array4D<NativeT>& expected,
                            tensorflow::gtl::ArraySlice<GlobalData*> arguments);
-  template <typename NativeT>
-  void ComputeAndCompareR4(ComputationBuilder* builder,
-                           const Array4D<NativeT>& expected,
+  template <typename NativeT, typename BuilderT>
+  void ComputeAndCompareR4(BuilderT* builder, const Array4D<NativeT>& expected,
                            tensorflow::gtl::ArraySlice<GlobalData*> arguments,
                            ErrorSpec error);
 
   // Build and run the computation and compare the result with the given
   // literal. shape_with_layout indicates the result layout to request when
   // calling Execute.
+  template <typename BuilderT>
   void ComputeAndCompareLiteral(
-      ComputationBuilder* builder, const Literal& expected,
+      BuilderT* builder, const Literal& expected,
       tensorflow::gtl::ArraySlice<GlobalData*> arguments,
       const Shape* shape_with_layout = nullptr);
+  template <typename BuilderT>
   void ComputeAndCompareLiteral(
-      ComputationBuilder* builder, const Literal& expected,
+      BuilderT* builder, const Literal& expected,
       tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error,
       const Shape* shape_with_layout = nullptr);
 
   // ComputeAndCompare variant which returns an error status.
+  template <typename BuilderT>
   tensorflow::Status ComputeAndCompareLiteralWithStatus(
-      ComputationBuilder* builder, const Literal& expected,
+      BuilderT* builder, const Literal& expected,
       tensorflow::gtl::ArraySlice<GlobalData*> arguments,
       const Shape* shape_with_layout = nullptr);
+  template <typename BuilderT>
   tensorflow::Status ComputeAndCompareLiteralWithStatus(
-      ComputationBuilder* builder, const Literal& expected,
+      BuilderT* builder, const Literal& expected,
       tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error,
       const Shape* shape_with_layout = nullptr);
 
@@ -206,11 +217,13 @@ class ClientLibraryTestBase : public ::testing::Test {
 
   // Convenience method for running a built computation, transferring the
   // result, and comparing it to the expected tuple literal.
+  template <typename BuilderT>
   void ComputeAndCompareTuple(
-      ComputationBuilder* builder, const Literal& expected,
+      BuilderT* builder, const Literal& expected,
       tensorflow::gtl::ArraySlice<GlobalData*> arguments);
+  template <typename BuilderT>
   void ComputeAndCompareTuple(
-      ComputationBuilder* builder, const Literal& expected,
+      BuilderT* builder, const Literal& expected,
       tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error);
 
   // Convenience method for running a built computation and comparing the result
@@ -266,17 +279,19 @@ class ClientLibraryTestBase : public ::testing::Test {
   // server, then stores into "data_handle" the global handle for that
   // parameter. When the use_bfloat16 flag is set but the literal has F32
   // elements, the literal will be converted to BF16 before being transferred.
+  template <typename BuilderT, typename HandleT>
   std::unique_ptr<GlobalData> CreateParameterAndTransferLiteral(
       int64 parameter_number, const Literal& literal, const string& name,
-      ComputationBuilder* builder, ComputationDataHandle* data_handle);
+      BuilderT* builder, HandleT* data_handle);
 
   // As above, but the caller can specify the device that the literal is
   // transferred to. If device_handle is nullptr, the literal will be
   // transferred to the default device.
+  template <typename BuilderT, typename HandleT>
   std::unique_ptr<GlobalData> CreateParameterAndTransferLiteral(
       int64 parameter_number, const Literal& literal, const string& name,
-      const DeviceHandle* device_handle, ComputationBuilder* builder,
-      ComputationDataHandle* data_handle);
+      const DeviceHandle* device_handle, BuilderT* builder,
+      HandleT* data_handle);
 
   // Creates a parameter instruction and sets the value that will be passed to
   // the computation as specified. This function must be used for all parameters
@@ -297,6 +312,7 @@ class ClientLibraryTestBase : public ::testing::Test {
   // will be converted to BF16s.
   ComputationDataHandle CreateConstantFromLiteral(const Literal& literal,
                                                   ComputationBuilder* builder);
+  XlaOp CreateConstantFromLiteral(const Literal& literal, XlaBuilder* builder);
 
   // Creates a constant instruction with the given array. When the use_bfloat16
   // flag is set but the array has float elements, the elements will be
@@ -307,6 +323,12 @@ class ClientLibraryTestBase : public ::testing::Test {
     return CreateConstantFromLiteral(*Literal::CreateFromArray(array), builder);
   }
 
+  template <typename NativeT>
+  XlaOp CreateConstantFromArray(const Array<NativeT>& array,
+                                XlaBuilder* builder) {
+    return CreateConstantFromLiteral(*Literal::CreateFromArray(array), builder);
+  }
+
   // Same as CreateConstantFromArray, but for scalars.
   template <typename NativeT>
   ComputationDataHandle CreateConstantFromScalar(NativeT value,
@@ -315,6 +337,12 @@ class ClientLibraryTestBase : public ::testing::Test {
                                      builder);
   }
 
+  template <typename NativeT>
+  XlaOp CreateConstantFromScalar(NativeT value, XlaBuilder* builder) {
+    return CreateConstantFromLiteral(*Literal::CreateR0<NativeT>(value),
+                                     builder);
+  }
+
   // Creates a parameter instruction that wraps a given value and then stores
   // into "data_handle" the global handle for that parameter.
   //
@@ -323,10 +351,12 @@ class ClientLibraryTestBase : public ::testing::Test {
   //
   // When the use_bfloat16 flag is set but NativeT is float, the data will be
   // converted to bfloat16.
-  template <typename NativeT>
-  std::unique_ptr<GlobalData> CreateR0Parameter(
-      NativeT value, int64 parameter_number, const string& name,
-      ComputationBuilder* builder, ComputationDataHandle* data_handle);
+  template <typename NativeT, typename BuilderT, typename HandleT>
+  std::unique_ptr<GlobalData> CreateR0Parameter(NativeT value,
+                                                int64 parameter_number,
+                                                const string& name,
+                                                BuilderT* builder,
+                                                HandleT* data_handle);
 
   // Creates a parameter instruction that wraps the given values and then stores
   // into "data_handle" the global handle for that parameter.
@@ -336,11 +366,10 @@ class ClientLibraryTestBase : public ::testing::Test {
   //
   // When the use_bfloat16 flag is set but NativeT is float, the data will be
   // converted to bfloat16.
-  template <typename NativeT>
+  template <typename NativeT, typename BuilderT, typename HandleT>
   std::unique_ptr<GlobalData> CreateR1Parameter(
       tensorflow::gtl::ArraySlice<NativeT> values, int64 parameter_number,
-      const string& name, ComputationBuilder* builder,
-      ComputationDataHandle* data_handle);
+      const string& name, BuilderT* builder, HandleT* data_handle);
 
   // Creates a parameter instruction that wraps the given constant array
   // "array_2d" and then stores to "data_handle" the global handle for that
@@ -351,11 +380,10 @@ class ClientLibraryTestBase : public ::testing::Test {
   //
   // When the use_bfloat16 flag is set but NativeT is float, the data will be
   // converted to bfloat16.
-  template <typename NativeT>
+  template <typename NativeT, typename BuilderT, typename HandleT>
   std::unique_ptr<GlobalData> CreateR2Parameter(
       const Array2D<NativeT>& array_2d, int64 parameter_number,
-      const string& name, ComputationBuilder* builder,
-      ComputationDataHandle* data_handle);
+      const string& name, BuilderT* builder, HandleT* data_handle);
 
   // Creates a parameter instruction that wraps the given constant array
   // "array_3d" and then stores to "data_handle" the global handle for that
@@ -366,11 +394,10 @@ class ClientLibraryTestBase : public ::testing::Test {
   //
   // When the use_bfloat16 flag is set but NativeT is float, the data will be
   // converted to bfloat16.
-  template <typename NativeT>
+  template <typename NativeT, typename BuilderT, typename HandleT>
   std::unique_ptr<GlobalData> CreateR3Parameter(
       const Array3D<NativeT>& array_3d, int64 parameter_number,
-      const string& name, ComputationBuilder* builder,
-      ComputationDataHandle* data_handle);
+      const string& name, BuilderT* builder, HandleT* data_handle);
 
   // Getter and setter for the use_bfloat16 flag, which indicates whether to run
   // tests with all float-type input/output converted to bfloat16.
@@ -399,6 +426,18 @@ class ClientLibraryTestBase : public ::testing::Test {
                                const string& error_message)>& verify_output,
       const Shape* output_with_layout = nullptr);
 
+  tensorflow::Status ComputeAndCompareLiteralWithAllOutputLayouts(
+      const xla::XlaComputation& computation, const Literal& expected,
+      tensorflow::gtl::ArraySlice<GlobalData*> arguments,
+      const std::function<void(const Literal& actual,
+                               const string& error_message)>& verify_output);
+  tensorflow::Status ComputeAndCompareLiteralWithAllInputLayouts(
+      const xla::XlaComputation& computation, const Literal& expected,
+      tensorflow::gtl::ArraySlice<GlobalData*> arguments,
+      const std::function<void(const Literal& actual,
+                               const string& error_message)>& verify_output,
+      const Shape* output_with_layout = nullptr);
+
   // Executes the computation and calculates the expected reference value using
   // the HloEvaluator. Returns two literal in the order of (expected, actual).
   StatusOr<std::pair<std::unique_ptr<Literal>, std::unique_ptr<Literal>>>
@@ -414,9 +453,9 @@ class ClientLibraryTestBase : public ::testing::Test {
   std::vector<std::unique_ptr<GlobalData>> arguments_;
 };
 
-template <typename NativeT>
+template <typename NativeT, typename BuilderT>
 void ClientLibraryTestBase::ComputeAndCompareR0(
-    ComputationBuilder* builder, NativeT expected,
+    BuilderT* builder, NativeT expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
   std::unique_ptr<Literal> expected_literal =
       Literal::CreateR0<NativeT>(expected);
@@ -424,9 +463,9 @@ void ClientLibraryTestBase::ComputeAndCompareR0(
                                                   arguments);
 }
 
-template <typename NativeT>
+template <typename NativeT, typename BuilderT>
 void ClientLibraryTestBase::ComputeAndCompareR0(
-    ComputationBuilder* builder, NativeT expected,
+    BuilderT* builder, NativeT expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error) {
   static_assert(std::is_same<NativeT, float>::value ||
                     std::is_same<NativeT, double>::value ||
@@ -440,9 +479,9 @@ void ClientLibraryTestBase::ComputeAndCompareR0(
                                                   arguments, error);
 }
 
-template <typename NativeT>
+template <typename NativeT, typename BuilderT>
 void ClientLibraryTestBase::ComputeAndCompareR1(
-    ComputationBuilder* builder, tensorflow::gtl::ArraySlice<NativeT> expected,
+    BuilderT* builder, tensorflow::gtl::ArraySlice<NativeT> expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
   std::unique_ptr<Literal> expected_literal =
       Literal::CreateR1<NativeT>(expected);
@@ -450,9 +489,9 @@ void ClientLibraryTestBase::ComputeAndCompareR1(
                                                   arguments);
 }
 
-template <typename NativeT>
+template <typename NativeT, typename BuilderT>
 void ClientLibraryTestBase::ComputeAndCompareR1(
-    ComputationBuilder* builder, tensorflow::gtl::ArraySlice<NativeT> expected,
+    BuilderT* builder, tensorflow::gtl::ArraySlice<NativeT> expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error) {
   static_assert(std::is_same<NativeT, float>::value ||
                     std::is_same<NativeT, double>::value ||
@@ -466,9 +505,9 @@ void ClientLibraryTestBase::ComputeAndCompareR1(
                                                   arguments, error);
 }
 
-template <typename NativeT>
+template <typename NativeT, typename BuilderT>
 void ClientLibraryTestBase::ComputeAndCompareR2(
-    ComputationBuilder* builder, const Array2D<NativeT>& expected,
+    BuilderT* builder, const Array2D<NativeT>& expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
   std::unique_ptr<Literal> expected_literal =
       Literal::CreateR2FromArray2D<NativeT>(expected);
@@ -476,9 +515,9 @@ void ClientLibraryTestBase::ComputeAndCompareR2(
                                                   arguments);
 }
 
-template <typename NativeT>
+template <typename NativeT, typename BuilderT>
 void ClientLibraryTestBase::ComputeAndCompareR2(
-    ComputationBuilder* builder, const Array2D<NativeT>& expected,
+    BuilderT* builder, const Array2D<NativeT>& expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error) {
   static_assert(std::is_same<NativeT, float>::value ||
                     std::is_same<NativeT, double>::value ||
@@ -492,9 +531,9 @@ void ClientLibraryTestBase::ComputeAndCompareR2(
                                                   arguments, error);
 }
 
-template <typename NativeT>
+template <typename NativeT, typename BuilderT>
 void ClientLibraryTestBase::ComputeAndCompareR3(
-    ComputationBuilder* builder, const Array3D<NativeT>& expected,
+    BuilderT* builder, const Array3D<NativeT>& expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
   std::unique_ptr<Literal> expected_literal =
       Literal::CreateR3FromArray3D<NativeT>(expected);
@@ -502,9 +541,9 @@ void ClientLibraryTestBase::ComputeAndCompareR3(
                                                   arguments);
 }
 
-template <typename NativeT>
+template <typename NativeT, typename BuilderT>
 void ClientLibraryTestBase::ComputeAndCompareR3(
-    ComputationBuilder* builder, const Array3D<NativeT>& expected,
+    BuilderT* builder, const Array3D<NativeT>& expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error) {
   static_assert(std::is_same<NativeT, float>::value ||
                     std::is_same<NativeT, double>::value ||
@@ -518,9 +557,9 @@ void ClientLibraryTestBase::ComputeAndCompareR3(
                                                   arguments, error);
 }
 
-template <typename NativeT>
+template <typename NativeT, typename BuilderT>
 void ClientLibraryTestBase::ComputeAndCompareR4(
-    ComputationBuilder* builder, const Array4D<NativeT>& expected,
+    BuilderT* builder, const Array4D<NativeT>& expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
   std::unique_ptr<Literal> expected_literal =
       Literal::CreateR4FromArray4D<NativeT>(expected);
@@ -528,9 +567,9 @@ void ClientLibraryTestBase::ComputeAndCompareR4(
                                                   arguments);
 }
 
-template <typename NativeT>
+template <typename NativeT, typename BuilderT>
 void ClientLibraryTestBase::ComputeAndCompareR4(
-    ComputationBuilder* builder, const Array4D<NativeT>& expected,
+    BuilderT* builder, const Array4D<NativeT>& expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error) {
   static_assert(std::is_same<NativeT, float>::value ||
                     std::is_same<NativeT, double>::value ||
@@ -544,10 +583,10 @@ void ClientLibraryTestBase::ComputeAndCompareR4(
                                                   arguments, error);
 }
 
-template <typename NativeT>
+template <typename NativeT, typename BuilderT, typename HandleT>
 std::unique_ptr<GlobalData> ClientLibraryTestBase::CreateR0Parameter(
     NativeT value, int64 parameter_number, const string& name,
-    ComputationBuilder* builder, ComputationDataHandle* data_handle) {
+    BuilderT* builder, HandleT* data_handle) {
   std::unique_ptr<Literal> literal = Literal::CreateR0(value);
   if (use_bfloat16_ && literal->shape().element_type() == F32) {
     literal = LiteralTestUtil::ConvertF32ToBF16(*literal);
@@ -558,11 +597,10 @@ std::unique_ptr<GlobalData> ClientLibraryTestBase::CreateR0Parameter(
   return data;
 }
 
-template <typename NativeT>
+template <typename NativeT, typename BuilderT, typename HandleT>
 std::unique_ptr<GlobalData> ClientLibraryTestBase::CreateR1Parameter(
     tensorflow::gtl::ArraySlice<NativeT> values, int64 parameter_number,
-    const string& name, ComputationBuilder* builder,
-    ComputationDataHandle* data_handle) {
+    const string& name, BuilderT* builder, HandleT* data_handle) {
   std::unique_ptr<Literal> literal = Literal::CreateR1(values);
   if (use_bfloat16_ && literal->shape().element_type() == F32) {
     literal = LiteralTestUtil::ConvertF32ToBF16(*literal);
@@ -573,11 +611,10 @@ std::unique_ptr<GlobalData> ClientLibraryTestBase::CreateR1Parameter(
   return data;
 }
 
-template <typename NativeT>
+template <typename NativeT, typename BuilderT, typename HandleT>
 std::unique_ptr<GlobalData> ClientLibraryTestBase::CreateR2Parameter(
     const Array2D<NativeT>& array_2d, int64 parameter_number,
-    const string& name, ComputationBuilder* builder,
-    ComputationDataHandle* data_handle) {
+    const string& name, BuilderT* builder, HandleT* data_handle) {
   std::unique_ptr<Literal> literal = Literal::CreateR2FromArray2D(array_2d);
   if (use_bfloat16_ && literal->shape().element_type() == F32) {
     literal = LiteralTestUtil::ConvertF32ToBF16(*literal);
@@ -588,11 +625,10 @@ std::unique_ptr<GlobalData> ClientLibraryTestBase::CreateR2Parameter(
   return data;
 }
 
-template <typename NativeT>
+template <typename NativeT, typename BuilderT, typename HandleT>
 std::unique_ptr<GlobalData> ClientLibraryTestBase::CreateR3Parameter(
     const Array3D<NativeT>& array_3d, int64 parameter_number,
-    const string& name, ComputationBuilder* builder,
-    ComputationDataHandle* data_handle) {
+    const string& name, BuilderT* builder, HandleT* data_handle) {
   std::unique_ptr<Literal> literal = Literal::CreateR3FromArray3D(array_3d);
   if (use_bfloat16_ && literal->shape().element_type() == F32) {
     literal = LiteralTestUtil::ConvertF32ToBF16(*literal);
@@ -628,6 +664,37 @@ std::unique_ptr<Array2D<NativeT>> ClientLibraryTestBase::CreatePseudorandomR2(
   return result;
 }
 
+template <typename BuilderT, typename HandleT>
+std::unique_ptr<GlobalData>
+ClientLibraryTestBase::CreateParameterAndTransferLiteral(int64 parameter_number,
+                                                         const Literal& literal,
+                                                         const string& name,
+                                                         BuilderT* builder,
+                                                         HandleT* data_handle) {
+  return CreateParameterAndTransferLiteral(parameter_number, literal, name,
+                                           nullptr, builder, data_handle);
+}
+
+template <typename BuilderT, typename HandleT>
+std::unique_ptr<GlobalData>
+ClientLibraryTestBase::CreateParameterAndTransferLiteral(
+    int64 parameter_number, const Literal& literal, const string& name,
+    const DeviceHandle* device_handle, BuilderT* builder,
+    HandleT* data_handle) {
+  const Literal* param_literal = &literal;
+  std::unique_ptr<Literal> converted_literal;
+  if (use_bfloat16_) {
+    converted_literal = LiteralTestUtil::ConvertF32ToBF16(literal);
+    param_literal = converted_literal.get();
+  }
+  std::unique_ptr<GlobalData> data =
+      client_->TransferToServer(*param_literal, device_handle)
+          .ConsumeValueOrDie();
+  *data_handle =
+      builder->Parameter(parameter_number, param_literal->shape(), name);
+  return data;
+}
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_TESTS_CLIENT_LIBRARY_TEST_BASE_H_
diff --git a/tensorflow/compiler/xla/tests/client_test.cc b/tensorflow/compiler/xla/tests/client_test.cc
index 045148cdd11da94ae4789a753efca95c6aaa1f27..32e2f2c0848407ec46a5ac52e2668ef27b92c426 100644
--- a/tensorflow/compiler/xla/tests/client_test.cc
+++ b/tensorflow/compiler/xla/tests/client_test.cc
@@ -19,6 +19,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -109,14 +111,14 @@ XLA_TEST_F(ClientTest, ExecuteWithTupleLayout) {
 
 XLA_TEST_F(ClientTest,
         DISABLED_ON_CPU_PARALLEL(DISABLED_ON_GPU(ExecuteParallel))) {
-  Computation add_with_one_arg, mul_with_two_args, dot_with_one_arg;
+  XlaComputation add_with_one_arg, mul_with_two_args, dot_with_one_arg;
   Shape shape = ShapeUtil::MakeShape(S32, {2, 2});
 
   TF_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<GlobalData> const_arg,
       client_->TransferToServer(*Literal::CreateR2<int32>({{5, 6}, {7, 8}})));
 
-  ComputationBuilder b(client_, TestName() + ".add");
+  XlaBuilder b(TestName() + ".add");
   b.Add(b.Parameter(0, shape, "param_0"),
         b.ConstantR2<int32>({{1, 2}, {3, 4}}));
   TF_ASSERT_OK_AND_ASSIGN(add_with_one_arg, b.Build());
@@ -124,14 +126,14 @@ XLA_TEST_F(ClientTest,
   // We can't really test parallel execution on CPU since all of the cores in a
   // CPU are presented as a single device.  So for now we test "parallel"
   // execution on a single device.
-  std::vector<Client::ComputationInstance> computation_instances;
+  std::vector<Client::XlaComputationInstance> computation_instances;
   TF_ASSERT_OK_AND_ASSIGN(std::vector<xla::DeviceHandle> devices,
                           client_->GetDeviceHandles(1));
   ASSERT_EQ(devices.size(), 1);
 
   ExecutionOptions options = execution_options_;
   *options.add_device_handles() = devices[0];
-  computation_instances.push_back(Client::ComputationInstance(
+  computation_instances.push_back(Client::XlaComputationInstance(
       add_with_one_arg, {const_arg.get()}, options, nullptr));
 
   TF_ASSERT_OK_AND_ASSIGN(auto results,
diff --git a/tensorflow/compiler/xla/tests/compute_constant_test.cc b/tensorflow/compiler/xla/tests/compute_constant_test.cc
index ec2c580670cfac14ba42e8c9a836c86551af4b89..e5a03b49ad259a64b9cbbc88c31d8c6558289d1b 100644
--- a/tensorflow/compiler/xla/tests/compute_constant_test.cc
+++ b/tensorflow/compiler/xla/tests/compute_constant_test.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
@@ -167,8 +168,8 @@ TEST_F(ComputeConstantTest, DirectParamMissing) {
     EXPECT_FALSE(IsConstant(computation, &b));
 
     auto value = ComputeConstantScalar<float>(client, computation, &b);
-    EXPECT_TRUE(tensorflow::StringPiece(value.status().ToString())
-                    .contains("depends on a parameter"))
+    EXPECT_TRUE(tensorflow::str_util::StrContains(value.status().ToString(),
+                                                  "depends on a parameter"))
         << value.status();
   }
 }
@@ -183,8 +184,8 @@ TEST_F(ComputeConstantTest, IndirectParamMissing) {
     EXPECT_FALSE(IsConstant(computation, &b));
 
     auto value = ComputeConstantScalar<float>(client, computation, &b);
-    EXPECT_TRUE(tensorflow::StringPiece(value.status().ToString())
-                    .contains("depends on a parameter"))
+    EXPECT_TRUE(tensorflow::str_util::StrContains(value.status().ToString(),
+                                                  "depends on a parameter"))
         << value.status();
   }
 }
diff --git a/tensorflow/compiler/xla/tests/concat_test.cc b/tensorflow/compiler/xla/tests/concat_test.cc
index fb0e9c724a69b61801e6e0c2d07ef75b63a00465..a4c8a83eb15f7cc279b6c8f1bf1394c0afb9f7cf 100644
--- a/tensorflow/compiler/xla/tests/concat_test.cc
+++ b/tensorflow/compiler/xla/tests/concat_test.cc
@@ -18,9 +18,9 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array3d.h"
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test.h"
@@ -38,9 +38,9 @@ using ::testing::HasSubstr;
 
 // Concatenate expects at least one argument.
 XLA_TEST_F(ConcatTest, Concat_Nothing) {
-  ComputationBuilder builder(client_, TestName());
-  auto concatenated = builder.ConcatInDim({}, 0);
-  StatusOr<Computation> computation_status = builder.Build();
+  XlaBuilder builder(TestName());
+  builder.ConcatInDim({}, 0);
+  StatusOr<XlaComputation> computation_status = builder.Build();
   ASSERT_FALSE(computation_status.ok());
   EXPECT_THAT(computation_status.status().ToString(),
               HasSubstr("Concatenate expects at least one argument"));
@@ -48,18 +48,18 @@ XLA_TEST_F(ConcatTest, Concat_Nothing) {
 
 // Concatenate with one argument works.
 XLA_TEST_F(ConcatTest, Concat_R1_With_Nothing) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({42.0, 64.0});
-  auto concatenated = builder.ConcatInDim({a}, 0);
+  builder.ConcatInDim({a}, 0);
 
   std::vector<float> expected = {42, 64};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat_R1_L0_With_Nothing) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({});
-  auto concatenated = builder.ConcatInDim({a}, 0);
+  builder.ConcatInDim({a}, 0);
 
   std::vector<float> expected = {};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
@@ -68,51 +68,51 @@ XLA_TEST_F(ConcatTest, Concat_R1_L0_With_Nothing) {
 // Show that we can't concatenate R0 with R0 because we can't name the dimension
 // to concatenate on.
 XLA_TEST_F(ConcatTest, CannotConcatR0WithR0) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR0<float>(42.0);
   auto b = builder.ConstantR0<float>(64.0);
-  auto concatenated = builder.ConcatInDim({a, b}, 0);
-  StatusOr<Computation> computation_status = builder.Build();
+  builder.ConcatInDim({a, b}, 0);
+  StatusOr<XlaComputation> computation_status = builder.Build();
   ASSERT_FALSE(computation_status.ok());
   EXPECT_THAT(computation_status.status().ToString(),
               HasSubstr("out of bounds: 0"));
 }
 
 XLA_TEST_F(ConcatTest, Concat_R1_L0_With_R1_L0) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({});
   auto b = builder.ConstantR1<float>({});
-  auto concatenated = builder.ConcatInDim({a, b}, 0);
+  builder.ConcatInDim({a, b}, 0);
 
   std::vector<float> expected = {};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat_R1_L0_With_R1_L1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({});
   auto b = builder.ConstantR1<float>({256.0});
-  auto concatenated = builder.ConcatInDim({a, b}, 0);
+  builder.ConcatInDim({a, b}, 0);
 
   std::vector<float> expected = {256};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat_R1_L2_With_R1_L0) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({42.0, 64.0});
   auto b = builder.ConstantR1<float>({});
-  auto concatenated = builder.ConcatInDim({a, b}, 0);
+  builder.ConcatInDim({a, b}, 0);
 
   std::vector<float> expected = {42, 64};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat_R1_L2_With_R1_L1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({42.0, 64.0});
   auto b = builder.ConstantR1<float>({256.0});
-  auto concatenated = builder.ConcatInDim({a, b}, 0);
+  builder.ConcatInDim({a, b}, 0);
 
   std::vector<float> expected = {42, 64, 256};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
@@ -129,20 +129,20 @@ XLA_TEST_F(ConcatTest, Concat_R1_L253_With_R1_L7) {
     expected[253 + i] = rhs[i] = 253 + i + 1;
   }
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>(lhs);
   auto b = builder.ConstantR1<float>(rhs);
-  auto concatenated = builder.ConcatInDim({a, b}, 0);
+  builder.ConcatInDim({a, b}, 0);
 
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat_0x0_With_0x0) {
   for (int dim : {0, 1}) {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     auto a = builder.ConstantR2FromArray2D(Array2D<float>(0, 0));
     auto b = builder.ConstantR2FromArray2D(Array2D<float>(0, 0));
-    auto concatenated = builder.ConcatInDim({a, b}, dim);
+    builder.ConcatInDim({a, b}, dim);
 
     ComputeAndCompareR2<float>(&builder, Array2D<float>(0, 0), {},
                                ErrorSpec(0.0001));
@@ -150,26 +150,27 @@ XLA_TEST_F(ConcatTest, Concat_0x0_With_0x0) {
 }
 
 XLA_TEST_F(ConcatTest, Concat_1x1_With_1x1_InDim0) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a_array = CreatePatternedMatrix(1, 1);
   auto b_array = CreatePatternedMatrix(1, 1, /*offset=*/64.0);
   auto a = builder.ConstantR2FromArray2D(*a_array);
   auto b = builder.ConstantR2FromArray2D(*b_array);
-  auto concatenated = builder.ConcatInDim({a, b}, 0);
+  builder.ConcatInDim({a, b}, 0);
 
   Array2D<float> expected({
-      {0}, {64},
+      {0},
+      {64},
   });
   ComputeAndCompareR2<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat_1x1_With_1x1_InDim1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a_array = CreatePatternedMatrix(1, 1);
   auto b_array = CreatePatternedMatrix(1, 1, /*offset=*/64.0);
   auto a = builder.ConstantR2FromArray2D(*a_array);
   auto b = builder.ConstantR2FromArray2D(*b_array);
-  auto concatenated = builder.ConcatInDim({a, b}, 1);
+  builder.ConcatInDim({a, b}, 1);
 
   Array2D<float> expected({
       {0, 64},
@@ -178,22 +179,22 @@ XLA_TEST_F(ConcatTest, Concat_1x1_With_1x1_InDim1) {
 }
 
 XLA_TEST_F(ConcatTest, Concat2x0With2x5) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto b_array = CreatePatternedMatrix(2, 5, /*offset=*/64.0);
   auto a = builder.ConstantR2FromArray2D(Array2D<float>(2, 0));
   auto b = builder.ConstantR2FromArray2D(*b_array);
-  auto concatenated = builder.ConcatInDim({a, b}, 1);
+  builder.ConcatInDim({a, b}, 1);
 
   ComputeAndCompareR2<float>(&builder, *b_array, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat2x3With2x5) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a_array = CreatePatternedMatrix(2, 3);
   auto b_array = CreatePatternedMatrix(2, 5, /*offset=*/64.0);
   auto a = builder.ConstantR2FromArray2D(*a_array);
   auto b = builder.ConstantR2FromArray2D(*b_array);
-  auto concatenated = builder.ConcatInDim({a, b}, 1);
+  builder.ConcatInDim({a, b}, 1);
 
   Array2D<float> expected({
       {0, 1, 2, 64, 65, 66, 67, 68},
@@ -203,22 +204,22 @@ XLA_TEST_F(ConcatTest, Concat2x3With2x5) {
 }
 
 XLA_TEST_F(ConcatTest, Concat3x2With0x2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a_array = CreatePatternedMatrix(3, 2);
   auto a = builder.ConstantR2FromArray2D(*a_array);
   auto b = builder.ConstantR2FromArray2D(Array2D<float>(0, 2));
-  auto concatenated = builder.ConcatInDim({a, b}, 0);
+  builder.ConcatInDim({a, b}, 0);
 
   ComputeAndCompareR2<float>(&builder, *a_array, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat3x2With5x2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a_array = CreatePatternedMatrix(3, 2);
   auto b_array = CreatePatternedMatrix(5, 2, /*offset=*/64.0);
   auto a = builder.ConstantR2FromArray2D(*a_array);
   auto b = builder.ConstantR2FromArray2D(*b_array);
-  auto concatenated = builder.ConcatInDim({a, b}, 0);
+  builder.ConcatInDim({a, b}, 0);
 
   Array2D<float> expected({
       {0, 1},
@@ -234,16 +235,16 @@ XLA_TEST_F(ConcatTest, Concat3x2With5x2) {
 }
 
 XLA_TEST_F(ConcatTest, Concat_R3_3x0x2_3x0x1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR3FromArray3D(Array3D<float>(3, 0, 2));
   auto b = builder.ConstantR3FromArray3D(Array3D<float>(3, 0, 1));
-  auto concatenated = builder.ConcatInDim({a, b}, 2);
+  builder.ConcatInDim({a, b}, 2);
   ComputeAndCompareR3<float>(&builder, Array3D<float>(3, 0, 3), {},
                              ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat_R3_3x1x2_3x1x1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array3D<float> a_array({
       // 3x1x2
       {{0, 1}},
@@ -258,27 +259,29 @@ XLA_TEST_F(ConcatTest, Concat_R3_3x1x2_3x1x1) {
   });
   auto a = builder.ConstantR3FromArray3D(a_array);
   auto b = builder.ConstantR3FromArray3D(b_array);
-  auto concatenated = builder.ConcatInDim({a, b}, 2);
+  builder.ConcatInDim({a, b}, 2);
 
   Array3D<float> expected({
-      {{0, 1, 6}}, {{2, 3, 7}}, {{4, 5, 8}},
+      {{0, 1, 6}},
+      {{2, 3, 7}},
+      {{4, 5, 8}},
   });
   ComputeAndCompareR3<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat_R1_1x1_1x1_1x1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({42.0});
   auto b = builder.ConstantR1<float>({64.0});
   auto c = builder.ConstantR1<float>({256.0});
-  auto concatenated = builder.ConcatInDim({a, b, c}, 0);
+  builder.ConcatInDim({a, b, c}, 0);
 
   std::vector<float> expected = {42, 64, 256};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat_R3_3x1x2_3x1x1_3x1x1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array3D<float> a_array({
       // 3x1x2
       {{0, 1}},
@@ -300,35 +303,35 @@ XLA_TEST_F(ConcatTest, Concat_R3_3x1x2_3x1x1_3x1x1) {
   auto a = builder.ConstantR3FromArray3D(a_array);
   auto b = builder.ConstantR3FromArray3D(b_array);
   auto c = builder.ConstantR3FromArray3D(c_array);
-  auto concatenated = builder.ConcatInDim({a, b, c}, 2);
+  builder.ConcatInDim({a, b, c}, 2);
 
   Array3D<float> expected({
-      {{0, 1, 2, 3}}, {{4, 5, 6, 7}}, {{8, 9, 10, 11}},
+      {{0, 1, 2, 3}},
+      {{4, 5, 6, 7}},
+      {{8, 9, 10, 11}},
   });
   ComputeAndCompareR3<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, DoubleConcatLeftAssociative) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({42.0});
   auto b = builder.ConstantR1<float>({64.0});
   auto c = builder.ConstantR1<float>({256.0});
   // concatenated = (a concat b) concat c
-  auto concatenated =
-      builder.ConcatInDim({builder.ConcatInDim({a, b}, 0), c}, 0);
+  builder.ConcatInDim({builder.ConcatInDim({a, b}, 0), c}, 0);
 
   std::vector<float> expected = {42, 64, 256};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, DoubleConcatRightAssociative) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({42.0});
   auto b = builder.ConstantR1<float>({64.0});
   auto c = builder.ConstantR1<float>({256.0});
   // concatenated = a concat (b concat c)
-  auto concatenated =
-      builder.ConcatInDim({a, builder.ConcatInDim({b, c}, 0)}, 0);
+  builder.ConcatInDim({a, builder.ConcatInDim({b, c}, 0)}, 0);
 
   std::vector<float> expected = {42, 64, 256};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
@@ -342,7 +345,7 @@ XLA_TEST_F(ConcatTest, Concat_1x1024_With_1x1024_InDim0) {
     rhs(0, i) = i + 1024;
   }
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR2FromArray2D<float>(lhs);
   auto b = builder.ConstantR2FromArray2D<float>(rhs);
   builder.ConcatInDim({a, b}, 0);
@@ -363,7 +366,7 @@ XLA_TEST_F(ConcatTest, Concat_1x1024_With_1x1024_InDim1) {
     rhs(0, i) = i + 1024;
   }
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR2FromArray2D<float>(lhs);
   auto b = builder.ConstantR2FromArray2D<float>(rhs);
   builder.ConcatInDim({a, b}, 1);
@@ -388,7 +391,7 @@ XLA_TEST_F(ConcatTest, Concat_64x64_With_64x2) {
     }
   }
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR2FromArray2D<float>(lhs);
   auto b = builder.ConstantR2FromArray2D<float>(rhs);
   builder.ConcatInDim({a, b}, 1);
@@ -404,13 +407,13 @@ XLA_TEST_F(ConcatTest, Concat_64x64_With_64x2) {
 
 // Show that we can't concatenate with an opaques.
 XLA_TEST_F(ConcatTest, CannotConcatOpaques) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto opaque_shape = ShapeUtil::MakeOpaqueShape();
   auto r1f32 = xla::ShapeUtil::MakeShape(xla::F32, {1});
   auto x = builder.Parameter(0, r1f32, "x");
   auto y = builder.Parameter(1, opaque_shape, "y");
-  auto concatenated = builder.ConcatInDim({x, y}, 0);
-  StatusOr<Computation> computation_status = builder.Build();
+  builder.ConcatInDim({x, y}, 0);
+  StatusOr<XlaComputation> computation_status = builder.Build();
   ASSERT_FALSE(computation_status.ok());
   EXPECT_THAT(
       computation_status.status().ToString(),
@@ -418,23 +421,23 @@ XLA_TEST_F(ConcatTest, CannotConcatOpaques) {
 }
 
 XLA_TEST_F(ConcatTest, ConcatSeveralBoxedPredicates) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto p0 = builder.ConstantR1<bool>({true});
   auto p1 = builder.ConstantR1<bool>({false});
   auto p2 = builder.ConstantR1<bool>({true});
-  auto concatenated = builder.ConcatInDim({p0, p1, p2}, 0);
+  builder.ConcatInDim({p0, p1, p2}, 0);
 
   bool expected[] = {true, false, true};
   ComputeAndCompareR1<bool>(&builder, expected, {});
 }
 
 XLA_TEST_F(ConcatTest, ConcatSeveralR1S32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a0 = builder.ConstantR1<int32>({1});
   auto a1 = builder.ConstantR1<int32>({2, 3});
   auto a2 = builder.ConstantR1<int32>({4, 5, 6});
   auto a3 = builder.ConstantR1<int32>({7, 8, 9, 10});
-  auto concatenated = builder.ConcatInDim({a0, a1, a2, a3}, 0);
+  builder.ConcatInDim({a0, a1, a2, a3}, 0);
 
   std::vector<int32> expected(10);
   std::iota(expected.begin(), expected.end(), 1);
@@ -442,7 +445,7 @@ XLA_TEST_F(ConcatTest, ConcatSeveralR1S32s) {
 }
 
 XLA_TEST_F(ConcatTest, ConcatR3WeirdDims) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array3D<float> arr0(9, 17, 1);
   arr0.Fill(1);
@@ -462,14 +465,14 @@ XLA_TEST_F(ConcatTest, ConcatR3WeirdDims) {
     }
   }
 
-  ComputationDataHandle h0;
+  XlaOp h0;
   auto p0 = CreateR3Parameter<float>(arr0, /*parameter_number=*/0, "p0",
                                      &builder, &h0);
-  ComputationDataHandle h1;
+  XlaOp h1;
   auto p1 = CreateR3Parameter<float>(arr1, /*parameter_number=*/1, "p1",
                                      &builder, &h1);
 
-  auto concatenated = builder.ConcatInDim({h0, h1}, 2);
+  builder.ConcatInDim({h0, h1}, 2);
 
   ComputeAndCompareR3<float>(&builder, expected, {p0.get(), p1.get()});
 }
@@ -495,7 +498,7 @@ TEST_P(ConcatR2BinaryTest, DoIt) {
   Array2D<int32> rhs(spec.rhs_dim0, spec.rhs_dim1);
   rhs.FillUnique(1000);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a0 = builder.ConstantR2FromArray2D<int32>(lhs);
   auto a1 = builder.ConstantR2FromArray2D<int32>(rhs);
   builder.ConcatInDim({a0, a1}, spec.concat_dimension);
@@ -521,7 +524,7 @@ XLA_TEST_F(ConcatTest, ConcatOperandsOfSameOperand) {
   auto x_data = client_->TransferToServer(*x_literal).ConsumeValueOrDie();
   auto y_data = client_->TransferToServer(*y_literal).ConsumeValueOrDie();
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, f32_scalar, "x");
   auto y = builder.Parameter(1, f32_scalar, "y");
   auto mul = builder.Mul(x, y);
@@ -545,7 +548,7 @@ XLA_TEST_F(ConcatTest, ConcatBroadcastArgument) {
   auto y_data = client_->TransferToServer(*y_literal).ConsumeValueOrDie();
   auto z_data = client_->TransferToServer(*z_literal).ConsumeValueOrDie();
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, x_literal->shape(), "x");
   auto y = builder.Parameter(1, f32_scalar, "y");
   auto z = builder.Parameter(2, f32_scalar, "z");
@@ -573,7 +576,7 @@ XLA_TEST_F(ConcatTest, ConcatBroadcastArgumentR3) {
   auto y_data = client_->TransferToServer(*y_literal).ConsumeValueOrDie();
   auto z_data = client_->TransferToServer(*z_literal).ConsumeValueOrDie();
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, x_literal->shape(), "x");
   auto y = builder.Parameter(1, f32_scalar, "y");
   auto z = builder.Parameter(2, f32_scalar, "y");
diff --git a/tensorflow/compiler/xla/tests/conditional_test.cc b/tensorflow/compiler/xla/tests/conditional_test.cc
index bc821674820fb128823786d7149037fc59b22ab6..b917dee77b5400db8f2c0a6a86258fee64723d71 100644
--- a/tensorflow/compiler/xla/tests/conditional_test.cc
+++ b/tensorflow/compiler/xla/tests/conditional_test.cc
@@ -571,5 +571,56 @@ XLA_TEST_F(ConditionalOpTest, ShapeMismatch) {
                                    "only parameter of true_computation"));
 }
 
+XLA_TEST_F(ConditionalOpTest, SwappedInputsInSequentialConditionals) {
+  Shape tuple_shape = ShapeUtil::MakeTupleShape({r0f32_, r0f32_});
+  Computation swapper;
+  {
+    ComputationBuilder builder(client_, TestName() + ".swapper");
+    auto param0 = builder.Parameter(0, tuple_shape, "sp0");
+    auto x = builder.GetTupleElement(param0, 0);
+    auto y = builder.GetTupleElement(param0, 1);
+    builder.Tuple({y, x});
+    swapper = builder.Build().ConsumeValueOrDie();
+  }
+  Computation forwarder;
+  {
+    ComputationBuilder builder(client_, TestName() + ".forwarder");
+    auto param0 = builder.Parameter(0, tuple_shape, "fp0");
+    auto x = builder.GetTupleElement(param0, 0);
+    auto y = builder.GetTupleElement(param0, 1);
+    builder.Tuple({x, y});
+    forwarder = builder.Build().ConsumeValueOrDie();
+  }
+  Computation main;
+  {
+    ComputationBuilder builder(client_, TestName() + ".main");
+    auto param0 = builder.Parameter(0, tuple_shape, "mp0");
+    auto x = builder.GetTupleElement(param0, 0);
+    auto y = builder.GetTupleElement(param0, 1);
+    auto lt_pred = builder.Lt(x, y);
+    auto res = builder.Conditional(lt_pred, param0, forwarder, param0, swapper);
+    auto ge_pred = builder.Ge(x, y);
+    builder.Conditional(ge_pred, res, swapper, res, forwarder);
+    main = builder.Build().ConsumeValueOrDie();
+  }
+
+  auto test_swap = [&](float a, float b) {
+    ComputationBuilder builder(client_, TestName());
+    auto x = builder.ConstantR0<float>(a);
+    auto y = builder.ConstantR0<float>(b);
+    auto tuple_operand = builder.Tuple({x, y});
+    builder.Call(main, {tuple_operand});
+
+    ComputeAndCompareTuple(
+        &builder,
+        *Literal::MakeTuple({Literal::CreateR0<float>(a).get(),
+                             Literal::CreateR0<float>(b).get()}),
+        {}, error_spec_);
+  };
+
+  test_swap(3.11f, 9.4f);
+  test_swap(11.24f, 5.55f);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/convert_test.cc b/tensorflow/compiler/xla/tests/convert_test.cc
index 59d6d7a4153be1b76ed8195a12a90cb103baa422..0842a8918bcfec037ab0f9aa24014c7d8296cdf8 100644
--- a/tensorflow/compiler/xla/tests/convert_test.cc
+++ b/tensorflow/compiler/xla/tests/convert_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/casts.h"
+#include "tensorflow/core/lib/math/math_util.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
@@ -177,6 +178,24 @@ XLA_TEST_F(ConvertTest, ConvertR1U32ToR1F32) {
   ComputeAndCompareR1<float>(&builder, expected, {arg_data.get()});
 }
 
+XLA_TEST_F(ConvertTest, ConvertR1F32ToR1U32) {
+  ComputationBuilder builder(client_, TestName());
+  std::vector<float> arg{0.0f,        1.0f,          16777216.0f,
+                         16777218.0f, 2147483647.0f, 4294967040.0f};
+  std::unique_ptr<Literal> arg_literal = Literal::CreateR1<float>({arg});
+  auto arg_param = builder.Parameter(0, arg_literal->shape(), "arg_param");
+  std::unique_ptr<GlobalData> arg_data =
+      client_->TransferToServer(*arg_literal).ConsumeValueOrDie();
+
+  builder.ConvertElementType(arg_param, U32);
+
+  std::vector<uint32> expected(arg.size());
+  for (int64 i = 0; i < arg.size(); ++i) {
+    expected[i] = static_cast<uint32>(arg[i]);
+  }
+  ComputeAndCompareR1<uint32>(&builder, expected, {arg_data.get()});
+}
+
 XLA_TEST_F(ConvertTest, ConvertR1U32ToR1S64) {
   ComputationBuilder builder(client_, TestName());
   std::vector<uint32> arg{0, 1, 0x1000, 0x7fffffff, 0x80000082, 0xFFFFFFFF};
@@ -211,6 +230,43 @@ XLA_TEST_F(ConvertTest, ConvertR1S32ToR1S64) {
   ComputeAndCompareR1<int64>(&builder, expected, {arg_data.get()});
 }
 
+XLA_TEST_F(ConvertTest, ConvertR1F32ToR1S64) {
+  ComputationBuilder builder(client_, TestName());
+  // Test cases from compiler_rt library.
+  std::vector<float> arg{0.0f,
+                         0.5f,
+                         0.99f,
+                         1.0f,
+                         1.5f,
+                         1.99f,
+                         2.0f,
+                         2.01f,
+                         2147483648.f,
+                         -0.5f,
+                         -0.99f,
+                         -1.0f,
+                         -1.5f,
+                         -1.99f,
+                         -2.0f,
+                         -2.01f,
+                         0x1.FFFFFEp+62F,
+                         0x1.FFFFFCp+62F,
+                         -0x1.FFFFFEp+62F,
+                         -0x1.FFFFFCp+62F};
+  std::unique_ptr<Literal> arg_literal = Literal::CreateR1<float>({arg});
+  auto arg_param = builder.Parameter(0, arg_literal->shape(), "arg_param");
+  std::unique_ptr<GlobalData> arg_data =
+      client_->TransferToServer(*arg_literal).ConsumeValueOrDie();
+
+  builder.ConvertElementType(arg_param, S64);
+
+  std::vector<int64> expected(arg.size());
+  for (int64 i = 0; i < arg.size(); ++i) {
+    expected[i] = static_cast<int64>(arg[i]);
+  }
+  ComputeAndCompareR1<int64>(&builder, expected, {arg_data.get()});
+}
+
 XLA_TEST_F(ConvertTest, ConvertR1U8ToR1F32) {
   ComputationBuilder builder(client_, TestName());
   auto a = builder.ConstantR1<uint8_t>({32, 64});
@@ -366,5 +422,44 @@ XLA_TEST_F(ConvertTest, ConvertR1F32ToR1F16) {
 
   ComputeAndCompareR1<half>(&builder, expected_output, {dot_lhs_handle.get()});
 }
+
+XLA_TEST_F(ConvertTest, ConvertC64ToC64) {
+  ComputationBuilder builder(client_, TestName());
+  std::vector<complex64> x = {{42.0f, 64.0f}};
+  builder.ConvertElementType(builder.ConstantR1<complex64>(x), C64);
+  ComputeAndCompareR1<complex64>(&builder, x, {}, ErrorSpec(0.0001));
+}
+
+XLA_TEST_F(ConvertTest, ConvertS64S64) {
+  ComputationBuilder builder(client_, TestName());
+  std::vector<int64> x = {{-42, 64}};
+  builder.ConvertElementType(builder.ConstantR1<int64>(x), S64);
+  ComputeAndCompareR1<int64>(&builder, x, {});
+}
+
+XLA_TEST_F(ConvertTest, ConvertU64U64) {
+  ComputationBuilder builder(client_, TestName());
+  std::vector<uint64> x = {{42, 64}};
+  builder.ConvertElementType(builder.ConstantR1<uint64>(x), U64);
+  ComputeAndCompareR1<uint64>(&builder, x, {});
+}
+
+XLA_TEST_F(ConvertTest, ConvertU64S64) {
+  ComputationBuilder builder(client_, TestName());
+  std::vector<uint64> unsigned_x = {{42, UINT64_MAX}};
+  builder.ConvertElementType(builder.ConstantR1<uint64>(unsigned_x), S64);
+  std::vector<int64> signed_x = {{42, -1}};
+  ComputeAndCompareR1<int64>(&builder, signed_x, {});
+}
+
+XLA_TEST_F(ConvertTest, ConvertS64U64) {
+  ComputationBuilder builder(client_, TestName());
+  std::vector<int64> signed_x = {{42, -1, INT64_MIN}};
+  builder.ConvertElementType(builder.ConstantR1<int64>(signed_x), U64);
+  std::vector<uint64> unsigned_x = {
+      {42, UINT64_MAX, tensorflow::MathUtil::IPow<uint64>(2, 63)}};
+  ComputeAndCompareR1<uint64>(&builder, unsigned_x, {});
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc
index 09b1dd283e4d026a2f0007240d88cd9ac38acb19..7b994a4c172cafee53ede9bfd4f30b0e0c9888d5 100644
--- a/tensorflow/compiler/xla/tests/dot_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc
@@ -54,6 +54,25 @@ using TypesF16F32F64CF64 =
 #error "Situation not handled yet"
 #endif
 
+// Check that we can safely pass an input tuple's elements to a dot operation.
+TEST_F(DotOperationTest, DotOfInputTupleElem) {
+  ComputationBuilder builder(client_, TestName());
+
+  ComputationDataHandle param;
+  auto param_data = CreateParameterAndTransferLiteral(
+      0,
+      *Literal::MakeTuple({Literal::CreateR2<float>({{1, 2}, {3, 4}}).get(),
+                           Literal::CreateR2<float>({{5, 6}, {7, 8}}).get()}),
+      "arg0", &builder, &param);
+  auto lhs = builder.GetTupleElement(param, 0);
+  auto rhs = builder.GetTupleElement(param, 1);
+  builder.Dot(lhs, rhs);
+
+  ComputeAndCompareLiteral(&builder,
+                           *Literal::CreateR2<float>({{19, 22}, {43, 50}}),
+                           {param_data.get()});
+}
+
 template <typename T>
 class DotOperationTest_F16F32F64CF64 : public DotOperationTest {};
 TYPED_TEST_CASE(DotOperationTest_F16F32F64CF64, TypesF16F32F64CF64);
diff --git a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
index 4f354e6aefe70a51c09be1c0ca151af2bb9f0a2c..5f00c34002803553b9c17b4fce0abafda7369796 100644
--- a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
@@ -18,9 +18,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/local_service.h"
@@ -112,10 +111,8 @@ class DynamicSliceTest : public ClientLibraryTestBase {
   void TestR3Wrap() {
     // Slice at dimension boundaries, but with sizes that cause indices to wrap.
     RunR3<IndexT, DataT>(
-      {{{1, 2}, {3, 4}, {5, 6}},
-       {{7, 8}, {9, 10}, {11, 12}}},
-      {0, 2, 1}, {2, 1, 2},
-      {{{6, 5}}, {{12, 11}}});
+        {{{1, 2}, {3, 4}, {5, 6}}, {{7, 8}, {9, 10}, {11, 12}}}, {0, 2, 1},
+        {2, 1, 2}, {{{6, 5}}, {{12, 11}}});
   }
 
   template <typename IndexT, typename DataT>
@@ -137,9 +134,9 @@ class DynamicSliceTest : public ClientLibraryTestBase {
                        ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
                        .ValueOrDie());
 
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     // Initialize and transfer dynamic slice start indices parameter.
-    ComputationDataHandle starts;
+    XlaOp starts;
     std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
         slice_starts, 0, "slice_starts", &builder, &starts);
     // Build dynamic slice computation.
@@ -163,9 +160,9 @@ class DynamicSliceTest : public ClientLibraryTestBase {
                        ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
                        .ValueOrDie());
 
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     // Initialize and transfer dynamic slice start indices parameter.
-    ComputationDataHandle starts;
+    XlaOp starts;
     std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
         slice_starts, 0, "slice_starts", &builder, &starts);
     // Build dynamic slice computation.
@@ -189,9 +186,9 @@ class DynamicSliceTest : public ClientLibraryTestBase {
                        ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
                        .ValueOrDie());
 
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     // Initialize and transfer dynamic slice start indices parameter.
-    ComputationDataHandle starts;
+    XlaOp starts;
     std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
         slice_starts, 0, "slice_starts", &builder, &starts);
     // Build dynamic slice computation.
@@ -281,6 +278,15 @@ XLA_TEST_F(DynamicSliceTest, Int32R3Pred) {
 
 class DynamicUpdateSliceTest : public ClientLibraryTestBase {
  protected:
+  template <typename IndexT, typename DataT>
+  void TestR0() {
+    // Disable algebraic simplifier, otherwise the op will be replaced by a
+    // constant.
+    execution_options_.mutable_debug_options()->add_xla_disable_hlo_passes(
+        "algsimp");
+    RunR0<IndexT, DataT>(0, 123, {}, 123);
+  }
+
   template <typename IndexT, typename DataT>
   void TestR1() {
     // Slice at dimension start.
@@ -341,6 +347,35 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
         {1, 2, 1}, {{{1, 2}, {3, 4}, {5, 6}}, {{7, 15}, {9, 10}, {11, 13}}});
   }
 
+  template <typename IndexT, typename DataT>
+  void RunR0(int input_value_int, int update_value_int,
+             const std::vector<IndexT> slice_starts, int expected_value_int) {
+    Literal input_value =
+        std::move(*Literal::CreateR0(input_value_int)
+                       ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
+                       .ValueOrDie());
+    Literal update_value =
+        std::move(*Literal::CreateR0(update_value_int)
+                       ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
+                       .ValueOrDie());
+    Literal expected_value =
+        std::move(*Literal::CreateR0(expected_value_int)
+                       ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
+                       .ValueOrDie());
+
+    ComputationBuilder builder(client_, TestName());
+    // Initialize and transfer dynamic slice start indices parameter.
+    ComputationDataHandle starts;
+    std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
+        slice_starts, 0, "slice_starts", &builder, &starts);
+    // Build dynamic slice computation.
+    auto input = builder.ConstantLiteral(input_value);
+    auto update = builder.ConstantLiteral(update_value);
+    builder.DynamicUpdateSlice(input, update, starts);
+    // Run computation and compare against expected values.
+    ComputeAndCompareLiteral(&builder, expected_value, {start_data.get()});
+  }
+
   template <typename IndexT, typename DataT>
   void RunR1(tensorflow::gtl::ArraySlice<int> input_values_int,
              tensorflow::gtl::ArraySlice<int> update_values_int,
@@ -359,9 +394,9 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
                        ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
                        .ValueOrDie());
 
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     // Initialize and transfer dynamic slice start indices parameter.
-    ComputationDataHandle starts;
+    XlaOp starts;
     std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
         slice_starts, 0, "slice_starts", &builder, &starts);
     // Build dynamic slice computation.
@@ -390,9 +425,9 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
                        ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
                        .ValueOrDie());
 
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     // Initialize and transfer dynamic slice start indices parameter.
-    ComputationDataHandle starts;
+    XlaOp starts;
     std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
         slice_starts, 0, "slice_starts", &builder, &starts);
     // Build dynamic slice computation.
@@ -421,9 +456,9 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
                        ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
                        .ValueOrDie());
 
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     // Initialize and transfer dynamic slice start indices parameter.
-    ComputationDataHandle starts;
+    XlaOp starts;
     std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
         slice_starts, 0, "slice_starts", &builder, &starts);
     // Build dynamic slice computation.
@@ -474,13 +509,13 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
     }
 
     // Build dynamic slice computation.
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     // Initialize and transfer input parameter.
-    ComputationDataHandle input;
+    XlaOp input;
     std::unique_ptr<GlobalData> input_data =
         CreateR3Parameter<T>(input_values, 0, "input_values", &builder, &input);
     // Initialize and transfer update parameter.
-    ComputationDataHandle update;
+    XlaOp update;
     std::unique_ptr<GlobalData> update_data = CreateR3Parameter<T>(
         update_values, 1, "update_values", &builder, &update);
     auto starts = builder.ConstantR1<int32>({index, 0, 0});
@@ -500,6 +535,11 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
   }
 };
 
+XLA_TEST_F(DynamicUpdateSliceTest, Int32R0BF16) { TestR0<int32, bfloat16>(); }
+XLA_TEST_F(DynamicUpdateSliceTest, Int32R0) { TestR0<int32, float>(); }
+XLA_TEST_F(DynamicUpdateSliceTest, Int64R0) { TestR0<int64, float>(); }
+XLA_TEST_F(DynamicUpdateSliceTest, UInt64R0) { TestR0<uint64, float>(); }
+
 // TODO(b/71820067): The CPU parallel backend failed for this on 2018-01-10.
 XLA_TEST_F(DynamicUpdateSliceTest, DISABLED_ON_CPU_PARALLEL(Int32R1BF16)) {
   TestR1<int32, bfloat16>();
@@ -672,7 +712,7 @@ void BM_DynamicSlice(int num_iters) {
       TransferManager::GetForPlatform(platform).ValueOrDie();
   int device_ordinal = client->default_device_ordinal();
 
-  ComputationBuilder builder(client, "DynamicSlice");
+  XlaBuilder builder("DynamicSlice");
 
   // Create input as a constant: shape [1, 2, 3, 4]
   auto input_literal = Literal::CreateR4(
diff --git a/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc b/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc
index 6fe7737de7af349dca2931b52d62dbc03b14e0b3..b28fe0c15a89a1331698a29f70b966380bd3fcb9 100644
--- a/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc
+++ b/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc
@@ -71,8 +71,8 @@ XLA_TEST_P(ExhaustiveF32ElementwiseOpTest, LogF32) {
 #ifdef XLA_TEST_BACKEND_CPU
   // TODO(b/73141998): The vectorized Log implementation gives results outside
   // our error spec in this range (these numbers are bitwise representations of
-  // floats expressed as a zero extended int64):
-  std::pair<int64, int64> known_incorrect_range = {1, 8315654};
+  // floats expressed as a zero extended int64).
+  std::pair<int64, int64> known_incorrect_range = {1, 8388608};
 #else
   std::pair<int64, int64> known_incorrect_range = {0, 0};
 #endif
diff --git a/tensorflow/compiler/xla/tests/gather_operation_test.cc b/tensorflow/compiler/xla/tests/gather_operation_test.cc
index 4e2f19ade10794fd159ff89807d6ab34630dbb43..9db68ff7a6dcbd9204fb2b3a37734a9aaed35dfd 100644
--- a/tensorflow/compiler/xla/tests/gather_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/gather_operation_test.cc
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/compiler/xla/execution_options_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
@@ -31,12 +33,16 @@ class GatherOperationTest : public HloTestBase {
  protected:
   void RunTest(const string& hlo_text, Literal* operand,
                Literal* gather_indices) {
+    RunTest(hlo_text, {operand, gather_indices});
+  }
+
+  void RunTest(const string& hlo_text,
+               tensorflow::gtl::ArraySlice<Literal*> args) {
     HloModuleConfig config;
     config.set_debug_options(GetDebugOptionsForTest());
     TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                             tools::Parse(hlo_text, config));
-    EXPECT_TRUE(
-        RunAndCompare(std::move(module), {operand, gather_indices}, nullopt));
+    EXPECT_TRUE(RunAndCompare(std::move(module), args, nullopt));
   }
 };
 
@@ -259,5 +265,197 @@ ENTRY main {
   RunTest(hlo_text, operand.get(), gather_indices.get());
 }
 
+XLA_TEST_F(GatherOperationTest, OutOfBoundsIndex) {
+  // Out of bounds indices must not crash, and the indices in range should
+  // produce the same values across all backends.
+  //
+  // TODO(b/74360564): Once we have a well defined semantics for OOB accesses,
+  // we should get rid of the mask and check that backends produce the same
+  // value for OOB indices too.
+
+  const string hlo_text = R"(
+HloModule BatchDynamicSlice
+
+ENTRY main {
+  operand = s32[3,3]{1,0} parameter(0)
+  indices = s32[6,2]{1,0} parameter(1)
+  gather = s32[6,1,1]{2,1,0} gather(operand, indices),
+      output_window_dims={1,2},
+      elided_window_dims={},
+      gather_dims_to_operand_dims={0,1},
+      index_vector_dim=1,
+      window_bounds={1,1}
+  gather_reshaped = s32[6]{0} reshape(gather)
+  in_bounds_mask = s32[6]{0} parameter(2)
+  ROOT result = s32[6]{0} multiply(gather_reshaped, in_bounds_mask)
+}
+)";
+  std::unique_ptr<Literal> operand =
+      Literal::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> gather_indices = Literal::CreateR2<int32>(
+      {{2, 7}, {2, 1}, {1, 1}, {5, 1}, {2147483647, 1}, {1, 2}});
+  std::unique_ptr<Literal> in_bounds_mask =
+      Literal::CreateR1<int32>({0, 1, 1, 0, 0, 1});
+
+  RunTest(hlo_text,
+          {operand.get(), gather_indices.get(), in_bounds_mask.get()});
+}
+
+XLA_TEST_F(GatherOperationTest, NegativeIndex) {
+  // Negative indices must not crash, and the indices in range should produce
+  // the same values across all backends.
+  //
+  // TODO(b/74360564): Once we have a well defined semantics for negative
+  // accesses, we should get rid of the mask and check that backends produce the
+  // same value for negative indices too.
+
+  const string hlo_text = R"(
+HloModule BatchDynamicSlice
+
+ENTRY main {
+  operand = s32[3,3]{1,0} parameter(0)
+  indices = s32[6,2]{1,0} parameter(1)
+  gather = s32[6,1,1]{2,1,0} gather(operand, indices),
+      output_window_dims={1,2},
+      elided_window_dims={},
+      gather_dims_to_operand_dims={0,1},
+      index_vector_dim=1,
+      window_bounds={1,1}
+  gather_reshaped = s32[6]{0} reshape(gather)
+  in_bounds_mask = s32[6]{0} parameter(2)
+  ROOT result = s32[6]{0} multiply(gather_reshaped, in_bounds_mask)
+}
+)";
+  std::unique_ptr<Literal> operand =
+      Literal::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> gather_indices = Literal::CreateR2<int32>(
+      {{2, -1}, {2, 1}, {1, 1}, {-500, 1}, {-2147483648, 1}, {1, 2}});
+  std::unique_ptr<Literal> in_bounds_mask =
+      Literal::CreateR1<int32>({0, 1, 1, 0, 0, 1});
+
+  RunTest(hlo_text,
+          {operand.get(), gather_indices.get(), in_bounds_mask.get()});
+}
+
+XLA_TEST_F(GatherOperationTest, OneScalarIndex) {
+  const char* hlo_text = R"(
+HloModule OneScalarIndex
+
+ENTRY main {
+  operand = s32[2,3,2]{2,1,0} parameter(0)
+  index = s32[] parameter(1)
+  ROOT gather = s32[1,3,2]{2,1,0} gather(operand, index),
+      output_window_dims={0,1,2},
+      elided_window_dims={},
+      gather_dims_to_operand_dims={0},
+      index_vector_dim=0,
+      window_bounds={1,3,2}
+}
+)";
+  std::unique_ptr<Literal> operand = Literal::CreateR3<int32>(
+      {{{1, 2}, {3, 4}, {5, 6}}, {{7, 8}, {9, 10}, {11, 12}}});
+  std::unique_ptr<Literal> gather_indices = Literal::CreateR0<int32>(1);
+  RunTest(hlo_text, operand.get(), gather_indices.get());
+}
+
+XLA_TEST_F(GatherOperationTest, ScalarResult) {
+  const char* hlo_text = R"(
+HloModule ScalarResult
+
+ENTRY main {
+  operand = s32[4]{0} parameter(0)
+  index = s32[] parameter(1)
+  ROOT gather = s32[] gather(operand, index),
+      output_window_dims={},
+      elided_window_dims={0},
+      gather_dims_to_operand_dims={0},
+      index_vector_dim=0,
+      window_bounds={1}
+}
+)";
+  std::unique_ptr<Literal> operand = Literal::CreateR1<int32>({1, 2, 3, 4});
+  std::unique_ptr<Literal> gather_indices = Literal::CreateR0<int32>(1);
+  RunTest(hlo_text, operand.get(), gather_indices.get());
+}
+
+XLA_TEST_F(GatherOperationTest, ZeroSizedResult) {
+  const string hlo_text = R"(
+HloModule ZeroSizedResult
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[0] parameter(1)
+  ROOT gather = s32[0,3] gather(operand, indices),
+      output_window_dims={1},
+      elided_window_dims={0},
+      gather_dims_to_operand_dims={0},
+      index_vector_dim=1,
+      window_bounds={1, 3}
+}
+)";
+  std::unique_ptr<Literal> operand =
+      Literal::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> gather_indices = Literal::CreateR1<int32>({});
+  RunTest(hlo_text, operand.get(), gather_indices.get());
+}
+
+class GatherClientLibraryTest : public ClientLibraryTestBase {};
+
+// TODO(b/30671675): Asynchronous execution on stream is not yet supported on
+// GPU and CPU_PARALLEL.
+XLA_TEST_F(GatherClientLibraryTest,
+           DISABLED_ON_CPU_PARALLEL(DISABLED_ON_GPU(Basic))) {
+  // We create this HLO, but using the ComputationBuilder API.
+  //
+  // ENTRY main {
+  //   operand = s32[3,3] parameter(0)
+  //   indices = s32[2] parameter(1)
+  //   ROOT gather = s32[2,3] gather(operand, indices),
+  //       output_window_dims={1},
+  //       elided_window_dims={0},
+  //       gather_dims_to_operand_dims={0},
+  //       index_vector_dim=1,
+  //       window_bounds={1, 3}
+  // }
+
+  ComputationBuilder builder(client_, "gather_basic");
+
+  Shape operand_shape = ShapeUtil::MakeShape(S32, {3, 3});
+  Shape indices_shape = ShapeUtil::MakeShape(S32, {2});
+
+  auto operand = builder.Parameter(0, operand_shape, "operand");
+  auto indices = builder.Parameter(1, indices_shape, "indices");
+  GatherDimensionNumbers dim_numbers;
+  dim_numbers.add_output_window_dims(1);
+  dim_numbers.add_elided_window_dims(0);
+  dim_numbers.add_gather_dims_to_operand_dims(0);
+  dim_numbers.set_index_vector_dim(1);
+  builder.Gather(operand, indices, dim_numbers, {1, 3});
+
+  std::vector<int32> expected = {};
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalData> operand_arg,
+                          client_->TransferToServer(*Literal::CreateR2<int32>(
+                              {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}})));
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<GlobalData> indices_arg,
+      client_->TransferToServer(*Literal::CreateR1<int32>({0, 2})));
+  TF_ASSERT_OK_AND_ASSIGN(std::vector<xla::DeviceHandle> devices,
+                          client_->GetDeviceHandles(1));
+  xla::ExecutionOptions execution_options = CreateDefaultExecutionOptions();
+  *execution_options.add_device_handles() = devices[0];
+  TF_ASSERT_OK_AND_ASSIGN(Computation computation, builder.Build());
+  std::vector<xla::Client::ComputationInstance> computation_instances = {
+      {computation,
+       {operand_arg.get(), indices_arg.get()},
+       execution_options,
+       /*execution_profile=*/nullptr}};
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::vector<std::unique_ptr<xla::GlobalData>> result_data,
+      client_->ExecuteParallel(computation_instances));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result_literal,
+                          client_->Transfer(*(result_data[0])));
+  LiteralTestUtil::ExpectEqual(
+      *result_literal, *Literal::CreateR2<int32>({{1, 2, 3}, {7, 8, 9}}));
+}
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/hlo_metadata_test.cc b/tensorflow/compiler/xla/tests/hlo_metadata_test.cc
index eded2077fce965ab1c729c610764afa2228ca128..cf971dd61b71ad329b20b0bb7c16166126562681 100644
--- a/tensorflow/compiler/xla/tests/hlo_metadata_test.cc
+++ b/tensorflow/compiler/xla/tests/hlo_metadata_test.cc
@@ -13,9 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/service/computation_tracker.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/service/local_service.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/local_client_test_base.h"
@@ -30,7 +29,7 @@ class HloMetadataTest : public LocalClientTestBase {
     metadata_.set_op_name("my_sum_op");
   }
 
-  void BuildAddComputation(ComputationBuilder* builder) {
+  void BuildAddComputation(XlaBuilder* builder) {
     auto x = builder->Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto y = builder->Parameter(1, ShapeUtil::MakeShape(F32, {}), "y");
     builder->Add(x, y);
@@ -40,7 +39,7 @@ class HloMetadataTest : public LocalClientTestBase {
 };
 
 TEST_F(HloMetadataTest, MetadataPropagation) {
-  ComputationBuilder builder(local_client_, "add");
+  XlaBuilder builder("add");
   builder.SetOpMetadata(metadata_);
   BuildAddComputation(&builder);
   builder.ClearOpMetadata();
@@ -61,7 +60,7 @@ TEST_F(HloMetadataTest, MetadataPropagation) {
 }
 
 TEST_F(HloMetadataTest, MetadataClearing) {
-  ComputationBuilder builder(local_client_, "add");
+  XlaBuilder builder("add");
   builder.SetOpMetadata(metadata_);
   // Some other pretend computation here.
   builder.ClearOpMetadata();
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc
index 5f62c44f25dd62b563bd8ce02477bd741f264182..e574644dea7c1ba144ba87fbeb7f28cc52312e26 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc
@@ -115,6 +115,13 @@ StatusOr<std::unique_ptr<Literal>> HloTestBase::Execute(
   return test_runner_.Execute(std::move(module), arguments);
 }
 
+StatusOr<std::unique_ptr<Literal>> HloTestBase::ExecuteNoHloPasses(
+    std::unique_ptr<HloModule> module,
+    tensorflow::gtl::ArraySlice<Literal*> arguments) {
+  return test_runner_.Execute(std::move(module), arguments,
+                              /*run_hlo_passes=*/false);
+}
+
 std::unique_ptr<Literal> HloTestBase::ExecuteAndTransfer(
     std::unique_ptr<HloModule> module,
     tensorflow::gtl::ArraySlice<Literal*> arguments) {
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h
index e375f13a44e0618f2a498325859d928c1adb830e..3e8e2360bb3a87e127920cd222803c0f7b9161f4 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.h
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.h
@@ -98,6 +98,12 @@ class HloTestBase : public ::testing::Test {
       std::unique_ptr<HloModule> module,
       tensorflow::gtl::ArraySlice<Literal*> arguments);
 
+  // Same as above, except the module will be executed without running any HLO
+  // passes on it.
+  StatusOr<std::unique_ptr<Literal>> ExecuteNoHloPasses(
+      std::unique_ptr<HloModule> module,
+      tensorflow::gtl::ArraySlice<Literal*> arguments);
+
   std::unique_ptr<Literal> ExecuteAndTransfer(
       std::unique_ptr<HloModule> module,
       tensorflow::gtl::ArraySlice<Literal*> arguments);
diff --git a/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc b/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc
index 641907acf260c099a5ac885c362d92a0b6d78a42..da4cf4ae0c31bc194cd2ec9b845df36afbde69b0 100644
--- a/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc
@@ -64,7 +64,8 @@ HloModule& HloVerifiedTestBase::module() {
   return *module_;
 }
 
-void HloVerifiedTestBase::ParseAndVerifyModule(const char* hlo_text) {
+void HloVerifiedTestBase::ParseAndVerifyModule(
+    tensorflow::StringPiece hlo_text) {
   CHECK(!module_) << "Called ParseModule when test already has a module.";
   TF_ASSERT_OK_AND_ASSIGN(module_, tools::Parse(hlo_text));
   VerifyModule();
diff --git a/tensorflow/compiler/xla/tests/hlo_verified_test_base.h b/tensorflow/compiler/xla/tests/hlo_verified_test_base.h
index c0cb12bc93f56a5cb5ebdac94488369331f0cea6..e5bb14a8839acbdef8fd2b79bb0f574c46ea3d40 100644
--- a/tensorflow/compiler/xla/tests/hlo_verified_test_base.h
+++ b/tensorflow/compiler/xla/tests/hlo_verified_test_base.h
@@ -44,7 +44,7 @@ class HloVerifiedTestBase : public HloTestBase {
   // Returns the default HloModule, lazily creating it if necessary via
   // HloTestBase::CreateNewModule().
   HloModule& module();
-  void ParseAndVerifyModule(const char* hlo_text);
+  void ParseAndVerifyModule(tensorflow::StringPiece hlo_text);
 
   // Sets the shape-size function used during hlo verification. If this isn't
   // called, a default ShapeVerifier is used instead.
diff --git a/tensorflow/compiler/xla/tests/reduce_test.cc b/tensorflow/compiler/xla/tests/reduce_test.cc
index 50d7b5074d201d2292cf90224ef4cd37efdbb8d3..d24927d22b6534b46e711cd442f19a3e5cfcebdf 100644
--- a/tensorflow/compiler/xla/tests/reduce_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_test.cc
@@ -57,6 +57,11 @@ limitations under the License.
 namespace xla {
 namespace {
 
+using FuncGeneratorForType = Computation (*)(PrimitiveType,
+                                             ComputationBuilder*);
+
+using FuncGenerator = Computation (*)(ComputationBuilder*);
+
 class ReduceTest : public ClientLibraryTestBase {
  protected:
   ReduceTest() {
@@ -755,53 +760,57 @@ XLA_TEST_F(ReduceTest, ReduceR3AmongDim2) {
 }
 
 XLA_TEST_F(ReduceTest, VectorizedReduce_Add) {
-  RunVectorizedReduceTest(CreateScalarAddComputation,
-                          [](float a, float b) { return a + b; },
-                          [](int32 a, int32 b) {
-                            return static_cast<int32>(static_cast<uint32>(a) +
-                                                      static_cast<uint32>(b));
-                          },
-                          [](uint32 a, uint32 b) { return a + b; }, 0.0, 0, 0);
+  RunVectorizedReduceTest(
+      static_cast<FuncGeneratorForType>(CreateScalarAddComputation),
+      [](float a, float b) { return a + b; },
+      [](int32 a, int32 b) {
+        return static_cast<int32>(static_cast<uint32>(a) +
+                                  static_cast<uint32>(b));
+      },
+      [](uint32 a, uint32 b) { return a + b; }, 0.0, 0, 0);
 }
 
 XLA_TEST_F(ReduceTest, VectorizedReduce_Multiply) {
-  RunVectorizedReduceTest(CreateScalarMultiplyComputation,
-                          [](float a, float b) { return a * b; },
-                          [](int32 a, int32 b) {
-                            return static_cast<int32>(static_cast<uint32>(a) *
-                                                      static_cast<uint32>(b));
-                          },
-                          [](uint32 a, uint32 b) { return a * b; }, 1.0, 1, 1);
+  RunVectorizedReduceTest(
+      static_cast<FuncGeneratorForType>(CreateScalarMultiplyComputation),
+      [](float a, float b) { return a * b; },
+      [](int32 a, int32 b) {
+        return static_cast<int32>(static_cast<uint32>(a) *
+                                  static_cast<uint32>(b));
+      },
+      [](uint32 a, uint32 b) { return a * b; }, 1.0, 1, 1);
 }
 
 XLA_TEST_F(ReduceTest, VectorizedReduce_Max) {
-  RunVectorizedReduceTest(CreateScalarMaxComputation,
-                          [](float a, float b) { return std::max(a, b); },
-                          [](int32 a, int32 b) { return std::max(a, b); },
-                          [](uint32 a, uint32 b) { return std::max(a, b); },
-                          std::numeric_limits<float>::min(),
-                          std::numeric_limits<int32>::min(),
-                          std::numeric_limits<uint32>::min());
+  RunVectorizedReduceTest(
+      static_cast<FuncGeneratorForType>(CreateScalarMaxComputation),
+      [](float a, float b) { return std::max(a, b); },
+      [](int32 a, int32 b) { return std::max(a, b); },
+      [](uint32 a, uint32 b) { return std::max(a, b); },
+      std::numeric_limits<float>::min(), std::numeric_limits<int32>::min(),
+      std::numeric_limits<uint32>::min());
 }
 
 XLA_TEST_F(ReduceTest, VectorizedReduce_Min) {
-  RunVectorizedReduceTest(CreateScalarMinComputation,
-                          [](float a, float b) { return std::min(a, b); },
-                          [](int32 a, int32 b) { return std::min(a, b); },
-                          [](uint32 a, uint32 b) { return std::min(a, b); },
-                          std::numeric_limits<float>::max(),
-                          std::numeric_limits<int32>::max(),
-                          std::numeric_limits<uint32>::max());
+  RunVectorizedReduceTest(
+      static_cast<FuncGeneratorForType>(CreateScalarMinComputation),
+      [](float a, float b) { return std::min(a, b); },
+      [](int32 a, int32 b) { return std::min(a, b); },
+      [](uint32 a, uint32 b) { return std::min(a, b); },
+      std::numeric_limits<float>::max(), std::numeric_limits<int32>::max(),
+      std::numeric_limits<uint32>::max());
 }
 
 XLA_TEST_F(ReduceTest, VectorizedReduce_BooleanAnd) {
   RunVectorizedReduceTestForType<bool>(
-      CreateScalarAndComputation, [](bool a, bool b) { return a && b; }, true);
+      static_cast<FuncGenerator>(CreateScalarAndComputation),
+      [](bool a, bool b) { return a && b; }, true);
 }
 
 XLA_TEST_F(ReduceTest, VectorizedReduce_BooleanOr) {
   RunVectorizedReduceTestForType<bool>(
-      CreateScalarOrComputation, [](bool a, bool b) { return a || b; }, false);
+      static_cast<FuncGenerator>(CreateScalarOrComputation),
+      [](bool a, bool b) { return a || b; }, false);
 }
 
 class ReduceR3ToR2Test : public ReduceTest,
@@ -884,5 +893,47 @@ XLA_TEST_F(ReduceTest, ReduceOrPredR2_64x32_To_R1) {
   RunR2ToR1PredTest</*cols=32*/ 32>(/*and_reduce=false*/ false, /*rows=64*/ 64);
 }
 
+// Tests reductions with different initial values.  There's no test macro that
+// combines TYPED_TEST and TYPED_P, so we have to do it manually.
+class ReduceInitializerTest : public ReduceTest {
+ protected:
+  template <typename T>
+  void DoTest(T initializer, int num_elems) {
+    ComputationBuilder builder(client_, TestName());
+    Computation max_fn = CreateScalarMaxComputation(
+        primitive_util::NativeToPrimitiveType<T>(), &builder);
+
+    auto init = builder.ConstantR0<T>(initializer);
+    std::vector<T> input_arr(num_elems, std::numeric_limits<T>::lowest());
+    auto input_literal = Literal::CreateR1<T>(input_arr);
+    auto input_data =
+        client_->TransferToServer(*input_literal).ConsumeValueOrDie();
+    builder.Reduce(builder.Parameter(0, input_literal->shape(), "input"), init,
+                   max_fn, {0});
+
+    ComputeAndCompareR0<T>(&builder, initializer, {input_data.get()});
+  }
+};
+
+XLA_TEST_F(ReduceInitializerTest, U8Small) { DoTest<uint8>(42, 2); }
+
+XLA_TEST_F(ReduceInitializerTest, U8BigPowerOf2) { DoTest<uint8>(42, 4096); }
+
+XLA_TEST_F(ReduceInitializerTest, U8InitializerBigNonPowerOf2) {
+  DoTest<uint8>(42, 4095);
+}
+
+XLA_TEST_F(ReduceInitializerTest, U64InitializerZero) {
+  DoTest<uint64>(0, 1024);
+}
+
+XLA_TEST_F(ReduceInitializerTest, U64InitializerOne) {
+  DoTest<uint64>(1, 1024);
+}
+
+XLA_TEST_F(ReduceInitializerTest, U64InitializerBigValue) {
+  DoTest<uint64>(1234556789123, 1024);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc
index 8b736f62f045bb913ac09add2f00d5edb0692d83..8dd24f1237136e2807cea8a261ead25f5c7adbb2 100644
--- a/tensorflow/compiler/xla/tests/reduce_window_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc
@@ -252,6 +252,48 @@ TEST_P(ReduceWindowTest, AmongMajor2DimsMediumSize) {
                            DefaultErrorSpec());
 }
 
+// Tests the super windowing logic w.r.t handling prime number of windows in a
+// major dimension with reduction.
+TEST_P(ReduceWindowTest, PrimeWindowsInReductionDimension) {
+  Array4D<float> input_array(15, 15, 4, 128);
+  input_array.FillRandom(2.f, 4.f);
+
+  int win_len = 3;
+  int win_stride = 2;
+
+  const auto input_data_handle =
+      CreateConstantFromArray(input_array, &builder_);
+
+  Padding padding = Padding::kSame;
+  // Reduce only along the x and y dimensions, according to the win_len.
+  ReduceWindowAdd(input_data_handle, {win_len, win_len, 1, 1},
+                  {win_stride, win_stride, 1, 1}, padding);
+
+  auto result = ReferenceUtil::ReduceWindow4DAdd(
+      input_array, 0.0f, {win_len, win_len, 1, 1},
+      {win_stride, win_stride, 1, 1}, padding);
+
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*result), {},
+                           DefaultErrorSpec());
+}
+
+TEST_P(ReduceWindowTest, ReduceAlongLaneDimension) {
+  Array4D<float> input_array(19, 17, 8, 256);
+  input_array.FillWithMinorDimNum();
+
+  const auto input_data_handle =
+      CreateConstantFromArray(input_array, &builder_);
+
+  Padding padding = Padding::kSame;
+  ReduceWindowAdd(input_data_handle, {1, 1, 1, 11}, {1, 1, 1, 1}, padding);
+
+  auto result = ReferenceUtil::ReduceWindow4DAdd(
+      input_array, 0.0f, {1, 1, 1, 11}, {1, 1, 1, 1}, padding);
+
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*result), {},
+                           DefaultErrorSpec());
+}
+
 // Tests a reduction function that is not a simple add/min/max/etc.
 XLA_TEST_P(ReduceWindowTest, NonstandardReduceFunction) {
   Array4D<float> input_array(1, 2, 2, 1);
@@ -1021,6 +1063,15 @@ struct R2ReduceWindowTestData {
      /*strides=*/{1, 1}, /*pad_low=*/{0, 130}, /*pad_high=*/{0, 0},
      /*layout=*/{1, 0},
      /*reducer=*/Reducer::kAdd},
+// TODO(b/76025683): These tests fail on TPU.
+#if defined(XLA_TEST_BACKEND_CPU) || defined(XLA_TEST_BACKEND_GPU)
+    {/*base_bounds=*/{4096, 4096}, /*window_bounds=*/{1, 4},
+     /*strides=*/{1, 1024}, /*pad_low=*/{0, 0}, /*pad-high=*/{0, 0},
+     /*layout=*/{1, 0}, /*reducer=*/Reducer::kAdd},
+    {/*base_bounds=*/{8, 256}, /*window_bounds=*/{1, 4},
+     /*strides=*/{1, 64}, /*pad_low=*/{0, 0}, /*pad_high=*/{0, 0},
+     /*layout=*/{1, 0}, /*reducer=*/Reducer::kAdd},
+#endif
 };
 
 string R2ReduceWindowTestDataToString(
@@ -1351,5 +1402,41 @@ ENTRY R2Window {
   EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{0.001}));
 }
 
+TEST_F(ReduceWindowTextTest, R2EffectiveScalar) {
+  const string& hlo_string = R"(
+HloModule R2Window
+mul {
+  lhs = f32[] parameter(0)
+  rhs = f32[] parameter(1)
+  ROOT mul = f32[] multiply(lhs, rhs)
+}
+ENTRY R2Window {
+  operand = f32[1,1]{1,0} parameter(0)
+  negate = f32[1,1]{1,0} negate(operand)
+  constant = f32[] constant(1)
+  ROOT reduce-window = f32[1,1]{1,0} reduce-window(negate, constant), window={size=1x1 pad=0_0x0_0}, to_apply=mul
+}
+)";
+  EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{0.001}));
+}
+
+TEST_F(ReduceWindowTextTest, R3EffectiveScalar) {
+  const string& hlo_string = R"(
+HloModule R3Window
+mul {
+  lhs = f32[] parameter(0)
+  rhs = f32[] parameter(1)
+  ROOT mul = f32[] multiply(lhs, rhs)
+}
+ENTRY R3Window {
+  operand = f32[1,1,1]{2,1,0} parameter(0)
+  negate = f32[1,1,1]{2,1,0} negate(operand)
+  constant = f32[] constant(1)
+  ROOT reduce-window = f32[1,1,1]{2,1,0} reduce-window(negate, constant), window={size=1x1x1 pad=0_0x0_0x0_0}, to_apply=mul
+}
+)";
+  EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{0.001}));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/reshape_test.cc b/tensorflow/compiler/xla/tests/reshape_test.cc
index f7b04debd4f5c40a904e32c832b6fc384a03c33b..02272d60171c70896f44b0d6b96f176ea52e686f 100644
--- a/tensorflow/compiler/xla/tests/reshape_test.cc
+++ b/tensorflow/compiler/xla/tests/reshape_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
@@ -207,9 +208,9 @@ XLA_TEST_P(ReshapeTest, Trivial3x1) {
 //
 // Splits an empty vector into an empty matrix.
 XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(R1ToR2_0_To_2x0)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateR1<float>({});
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0},
@@ -221,10 +222,10 @@ XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(R1ToR2_0_To_2x0)) {
 
 // Splits a vector into a matrix.
 XLA_TEST_P(ReshapeTest, R1ToR2_6_To_2x3) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal =
       Literal::CreateR1<float>({1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f});
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0},
@@ -241,9 +242,9 @@ XLA_TEST_P(ReshapeTest, R1ToR2_6_To_2x3) {
 //
 // Transposes a 2x0 array to a 0x2 array.
 XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Reshape0x2To2x0)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateFromArray(Array2D<float>(0, 2));
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1},
@@ -255,10 +256,10 @@ XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Reshape0x2To2x0)) {
 
 // Transposes a 2-dimensional row vector to a column vector.
 XLA_TEST_P(ReshapeTest, ReshapeRowToCol) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto simple = MakeLinspaceArray2D(1.0f, 3.0f, 1, 3);
   auto input_literal = Literal::CreateFromArray(*simple);
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1},
@@ -272,10 +273,10 @@ XLA_TEST_P(ReshapeTest, ReshapeRowToCol) {
 
 // Transposes a 2-dimensional array.
 XLA_TEST_P(ReshapeTest, TransposeAsReshape) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a4x3 = MakeLinspaceArray2D(1.0f, 12.0f, 4, 3);
   auto input_literal = Literal::CreateFromArray(*a4x3);
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 0},
@@ -291,11 +292,11 @@ XLA_TEST_P(ReshapeTest, TransposeAsReshape) {
 // does not handle zero-sized shapes correctly. Failed last on 2017-11-30
 // with an incorrect result rank.
 //
-// Transposes a 0x4 array with ComputationBuilder::Trans.
+// Transposes a 0x4 array with XlaBuilder::Transpose.
 XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Transpose0x4)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateFromArray(Array2D<float>(0, 4));
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Transpose(parameter, {1, 0});
@@ -306,10 +307,10 @@ XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Transpose0x4)) {
 
 // Transposes a 2-dimensional array with ComputationBuilder::Trans.
 XLA_TEST_P(ReshapeTest, Transpose4x3) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a4x3 = MakeLinspaceArray2D(1.0f, 12.0f, 4, 3);
   auto input_literal = Literal::CreateFromArray(*a4x3);
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Transpose(parameter, {1, 0});
@@ -327,9 +328,9 @@ XLA_TEST_P(ReshapeTest, Transpose4x3) {
 // Reshapes an empty 2-dimensional array with dimensions that are not just a
 // rearrangement of the originals (split), but no reordering (no shuffle).
 XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(ReshapeSplitNoShuffleZeroElements)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateFromArray(Array2D<float>(6, 0));
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1},
diff --git a/tensorflow/compiler/xla/tests/slice_test.cc b/tensorflow/compiler/xla/tests/slice_test.cc
index fe36df160daacc4fdfbdb0b75f8304f91e1a4245..69fbe98bd63661322d37936c90a5fe3580efc2de 100644
--- a/tensorflow/compiler/xla/tests/slice_test.cc
+++ b/tensorflow/compiler/xla/tests/slice_test.cc
@@ -19,8 +19,8 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/array2d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
@@ -41,7 +41,7 @@ TEST_F(SliceTest, Slice3x3x3_To_3x3x1_F32) {
   Array3D<float> values(3, 3, 3);
   values.FillIota(0);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR3FromArray3D<float>(values);
   builder.Slice(original, {0, 0, 0}, {3, 3, 1}, {1, 1, 1});
 
@@ -54,7 +54,7 @@ TEST_F(SliceTest, Slice3x3x3_To_3x1x3_F32) {
   Array3D<float> values(3, 3, 3);
   values.FillIota(0);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR3FromArray3D<float>(values);
   builder.Slice(original, {0, 0, 0}, {3, 1, 3}, {1, 1, 1});
 
@@ -67,7 +67,7 @@ TEST_F(SliceTest, Slice3x3x3_To_1x3x3_F32) {
   Array3D<float> values(3, 3, 3);
   values.FillIota(0);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR3FromArray3D<float>(values);
   builder.Slice(original, {0, 0, 0}, {1, 3, 3}, {1, 1, 1});
 
@@ -77,7 +77,7 @@ TEST_F(SliceTest, Slice3x3x3_To_1x3x3_F32) {
 }
 
 XLA_TEST_F(SliceTest, Slice0x0to0x0F32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR2FromArray2D<float>(Array2D<float>(0, 0));
   builder.Slice(original, {0, 0}, {0, 0}, {1, 1});
 
@@ -85,7 +85,7 @@ XLA_TEST_F(SliceTest, Slice0x0to0x0F32) {
 }
 
 XLA_TEST_F(SliceTest, Slice0x20to0x5F32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR2FromArray2D<float>(Array2D<float>(0, 20));
   builder.Slice(original, {0, 15}, {0, 20}, {1, 1});
 
@@ -93,7 +93,7 @@ XLA_TEST_F(SliceTest, Slice0x20to0x5F32) {
 }
 
 XLA_TEST_F(SliceTest, Slice3x0to2x0F32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR2FromArray2D<float>(Array2D<float>(3, 0));
   builder.Slice(original, {1, 0}, {3, 0}, {1, 1});
 
@@ -108,7 +108,7 @@ XLA_TEST_F(SliceTest, SliceQuadrantOf256x256) {
     }
   }
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR2FromArray2D<float>(values);
   builder.Slice(original, {128, 128}, {256, 256}, {1, 1});
 
@@ -126,7 +126,7 @@ TEST_F(SliceTest, Slice_1x4096_To_1x1024) {
   Array2D<float> values(1, 4096);
   std::iota(values.data(), values.data() + 4096, 0.0);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR2FromArray2D<float>(values);
   builder.Slice(original, {0, 3072}, {1, 4096}, {1, 1});
 
@@ -147,7 +147,7 @@ TEST_F(SliceTest, Slice_16x4_To_16x2) {
       }
     }
   }
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR2FromArray2D<float>(values);
   builder.Slice(original, {0, 0}, {16, 2}, {1, 1});
   ComputeAndCompareR2<float>(&builder, expected, {}, ErrorSpec(0.000001));
@@ -159,7 +159,7 @@ TEST_F(SliceTest, SliceR4ThreeDimsMiddleMinor) {
   values.FillRandom(3.14f);
   auto expected = ReferenceUtil::Slice4D(
       values, {{1, 0, 8, 0}}, {{2, 2, 16, 128}}, /*strides=*/{{1, 1, 1, 1}});
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR4FromArray4D(values);
   builder.Slice(original, {1, 0, 8, 0}, {2, 2, 16, 128}, {1, 1, 1, 1});
   ComputeAndCompareR4(&builder, *expected, {}, ErrorSpec(0.000001));
@@ -172,7 +172,7 @@ XLA_TEST_F(SliceTest, StridedSliceR4WithOutputLayout) {
                                          /*strides=*/{{1, 1, 2, 1}});
   auto expected_literal = Literal::CreateR4FromArray4DWithLayout(
       *expected, LayoutUtil::MakeLayout({0, 1, 2, 3}));
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR4FromArray4D(values);
   builder.Slice(original, {0, 0, 0, 0}, {2, 4, 6, 8}, {1, 1, 2, 1});
   ComputeAndCompareLiteral(&builder, *expected_literal, {}, ErrorSpec(0.000001),
@@ -193,15 +193,18 @@ class SliceR1Test : public ClientLibraryTestBase,
  protected:
   template <typename NativeT>
   void Run(const R1Spec& spec) {
-    std::vector<NativeT> input(spec.input_dim0);
+    // This can't be an std::vector, since you can't grab an ArraySlice of a
+    // vector<bool>.
+    tensorflow::gtl::InlinedVector<NativeT, 1> input(spec.input_dim0);
     std::iota(input.begin(), input.end(), NativeT());
 
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     auto original = builder.ConstantR1<NativeT>(input);
     builder.Slice(original, {spec.slice_start}, {spec.slice_limit},
                   {spec.slice_stride});
 
-    std::vector<NativeT> expected;
+    // Ditto.
+    tensorflow::gtl::InlinedVector<NativeT, 1> expected;
     for (int i = spec.slice_start; i < spec.slice_limit;
          i += spec.slice_stride) {
       expected.push_back(i);
@@ -211,6 +214,9 @@ class SliceR1Test : public ClientLibraryTestBase,
   }
 };
 
+// A version of SliceR1Test used to label and disable 'large' tests
+class SliceR1LargeTest : public SliceR1Test {};
+
 string SliceR1TestDataToString(const ::testing::TestParamInfo<R1Spec>& data) {
   const R1Spec& spec = data.param;
   return ::tensorflow::strings::Printf("%lld_%lld_%lld_%lld", spec.input_dim0,
@@ -230,6 +236,21 @@ XLA_TEST_P(SliceR1Test, DoIt_U64) { Run<uint64>(GetParam()); }
 
 XLA_TEST_P(SliceR1Test, DoIt_S64) { Run<int64>(GetParam()); }
 
+XLA_TEST_P(SliceR1LargeTest, DoIt_F32) { Run<float>(GetParam()); }
+
+XLA_TEST_P(SliceR1LargeTest, DoIt_F64) { Run<double>(GetParam()); }
+
+XLA_TEST_P(SliceR1LargeTest, DoIt_U32) { Run<uint32>(GetParam()); }
+
+XLA_TEST_P(SliceR1LargeTest, DoIt_S32) { Run<int32>(GetParam()); }
+
+XLA_TEST_P(SliceR1LargeTest, DoIt_U64) { Run<uint64>(GetParam()); }
+
+XLA_TEST_P(SliceR1LargeTest, DoIt_S64) { Run<int64>(GetParam()); }
+
+XLA_TEST_P(SliceR1Test, DoIt_PRED) { Run<bool>(GetParam()); }
+
+
 // Tests for R1 slice ops.
 // The format for each testcase is {input size, start, limit, stride}.
 // clang-format off
@@ -237,12 +258,6 @@ INSTANTIATE_TEST_CASE_P(
     SliceR1TestInstantiation,
     SliceR1Test,
     ::testing::Values(
-// TODO(b/69425338): This uses too much memory on GPU.
-#ifndef XLA_TEST_BACKEND_GPU
-        R1Spec{16 * 1024 * 1024, 4 * 1024 * 1024, 12 * 1024 * 1024, 1},
-        R1Spec{16 * 1024 * 1024, 4 * 1024 * 1024 + 1, 12 * 1024 * 1024 - 1, 1},
-        R1Spec{16 * 1024 * 1024, 4 * 1024 * 1024 - 1, 12 * 1024 * 1024 + 1, 1},
-#endif
         R1Spec{10, 0, 0, 1},
         R1Spec{10, 7, 7, 1},
         R1Spec{10, 0, 5, 1},
@@ -278,6 +293,20 @@ INSTANTIATE_TEST_CASE_P(
     SliceR1TestDataToString
 );
 
+// TODO(b/69425338): This uses too much memory on GPU.
+#ifndef XLA_TEST_BACKEND_GPU
+INSTANTIATE_TEST_CASE_P(
+    SliceR1TestBigSlicesInstantiation,
+    SliceR1LargeTest,
+    ::testing::Values(
+          R1Spec{16 * 1024 * 1024, 4 * 1024 * 1024, 12 * 1024 * 1024, 1},
+          R1Spec{16 * 1024 * 1024, 4 * 1024 * 1024 + 1, 12 * 1024 * 1024 - 1, 1},
+          R1Spec{16 * 1024 * 1024, 4 * 1024 * 1024 - 1, 12 * 1024 * 1024 + 1, 1}
+    ),
+    SliceR1TestDataToString
+);
+#endif
+
 INSTANTIATE_TEST_CASE_P(
     SliceStridedR1TestInstantiation,
     SliceR1Test,
@@ -334,7 +363,7 @@ XLA_TEST_P(SliceR2Test, DoIt) {
   Array2D<int32> input(spec.input_dim0, spec.input_dim1);
   input.FillUnique();
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR2FromArray2DWithLayout<int32>(
       input, LayoutUtil::MakeLayout(spec.layout));
   builder.Slice(a, spec.slice_starts, spec.slice_limits, spec.slice_strides);
@@ -424,7 +453,7 @@ class SliceR4Test : public ClientLibraryTestBase,
     values.FillRandom(3.14f);
     auto expected = ReferenceUtil::Slice4D(
         values, spec.slice_starts, spec.slice_limits, spec.slice_strides);
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     auto literal = Literal::CreateR4FromArray4DWithLayout(
         values, LayoutUtil::MakeLayout(spec.input_layout));
     auto parameter = builder.Parameter(0, literal->shape(), "p0");
diff --git a/tensorflow/compiler/xla/tests/test_macros.cc b/tensorflow/compiler/xla/tests/test_macros.cc
index 978a669bcab720bddec5c4bcd0144810ba3c8477..be35ec6c6ee4c015755622b2dc9bb92e23af7c85 100644
--- a/tensorflow/compiler/xla/tests/test_macros.cc
+++ b/tensorflow/compiler/xla/tests/test_macros.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <unordered_map>
 
 #include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/regexp.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/tests/test_utils.cc b/tensorflow/compiler/xla/tests/test_utils.cc
index 0bc7df2a65b44a76f877b6513e6bf93b99fbc1a3..821432ef7dc7249d547a2d5f8868300388dc9d37 100644
--- a/tensorflow/compiler/xla/tests/test_utils.cc
+++ b/tensorflow/compiler/xla/tests/test_utils.cc
@@ -23,14 +23,14 @@ namespace xla {
 
 namespace {
 
-template <typename FloatT>
-void PopulateWithRandomFloatingPointData(Literal* literal,
-                                         std::minstd_rand0* engine) {
+template <typename FloatT, typename GeneratorT>
+void PopulateWithRandomFloatingPointDataImpl(Literal* literal,
+                                             std::minstd_rand0* engine) {
   CHECK_EQ(literal->shape().element_type(),
            primitive_util::NativeToPrimitiveType<FloatT>());
   // Create uniform numbers between 1 and 1.125 to avoid creating denormal
   // numbers.
-  std::uniform_real_distribution<FloatT> generator(1.0f, 1.125f);
+  std::uniform_real_distribution<GeneratorT> generator(1.0f, 1.125f);
   const bool should_index_bias = ShapeUtil::ElementsIn(literal->shape()) > 1000;
   TF_CHECK_OK(literal->Populate<FloatT>(
       [&](tensorflow::gtl::ArraySlice<int64> indices) {
@@ -52,10 +52,22 @@ void PopulateWithRandomFloatingPointData(Literal* literal,
         FloatT index_bias =
             static_cast<FloatT>(index_product % 113 - negative_bias) /
             static_cast<FloatT>(256.0f);
-        return (generator(*engine) - 1.0625) + index_bias;
+        return static_cast<FloatT>(generator(*engine) - 1.0625f) + index_bias;
       }));
 }
 
+template <typename FloatT>
+void PopulateWithRandomFloatingPointData(Literal* literal,
+                                         std::minstd_rand0* engine) {
+  PopulateWithRandomFloatingPointDataImpl<FloatT, FloatT>(literal, engine);
+}
+
+template <>
+void PopulateWithRandomFloatingPointData<half>(Literal* literal,
+                                               std::minstd_rand0* engine) {
+  PopulateWithRandomFloatingPointDataImpl<half, float>(literal, engine);
+}
+
 // The standard library does not have a case for bfloat16, unsurprisingly, so we
 // handle that one specially.
 template <>
@@ -100,6 +112,9 @@ StatusOr<std::unique_ptr<Literal>> MakeFakeLiteralInternal(
     case BF16:
       PopulateWithRandomFloatingPointData<bfloat16>(literal.get(), engine);
       break;
+    case F16:
+      PopulateWithRandomFloatingPointData<half>(literal.get(), engine);
+      break;
     case F32:
       PopulateWithRandomFloatingPointData<float>(literal.get(), engine);
       break;
diff --git a/tensorflow/compiler/xla/tests/tuple_test.cc b/tensorflow/compiler/xla/tests/tuple_test.cc
index 2029312f94a14bc81706368b9ecfc2727fd9fe4c..098be6d7aabe88d0deef600716229ddbd0bcae2f 100644
--- a/tensorflow/compiler/xla/tests/tuple_test.cc
+++ b/tensorflow/compiler/xla/tests/tuple_test.cc
@@ -20,11 +20,14 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -40,7 +43,7 @@ class TupleTest : public ClientLibraryTestBase {
 
 // Tests a tuple-shaped constant.
 XLA_TEST_F(TupleTest, TupleConstant) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   const float constant_scalar = 7.3f;
   std::initializer_list<float> constant_vector = {1.1f, 2.0f, 3.3f};
@@ -53,13 +56,13 @@ XLA_TEST_F(TupleTest, TupleConstant) {
                           Literal::CreateR1<float>(constant_vector).get(),
                           Literal::CreateR2<float>(constant_matrix).get()});
 
-  auto result = builder.ConstantLiteral(*value);
+  builder.ConstantLiteral(*value);
   ComputeAndCompareTuple(&builder, *value, {}, error_spec_);
 }
 
 // Tests a tuple made of scalar constants.
 XLA_TEST_F(TupleTest, TupleScalarConstant) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   const float constant_scalar1 = 7.3f;
   const float constant_scalar2 = 1.2f;
@@ -67,13 +70,13 @@ XLA_TEST_F(TupleTest, TupleScalarConstant) {
       Literal::MakeTuple({Literal::CreateR0<float>(constant_scalar1).get(),
                           Literal::CreateR0<float>(constant_scalar2).get()});
 
-  auto result = builder.ConstantLiteral(*value);
+  builder.ConstantLiteral(*value);
   ComputeAndCompareTuple(&builder, *value, {}, error_spec_);
 }
 
 // Tests the creation of tuple data.
 XLA_TEST_F(TupleTest, TupleCreate) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   const float constant_scalar = 7.3f;
   std::initializer_list<float> constant_vector = {1.1f, 2.0f, 3.3f};
@@ -81,9 +84,9 @@ XLA_TEST_F(TupleTest, TupleCreate) {
       {1.1f, 2.2f, 3.5f},  // row 0
       {4.8f, 5.0f, 6.7f},  // row 1
   };
-  auto result = builder.Tuple({builder.ConstantR0<float>(constant_scalar),
-                               builder.ConstantR1<float>(constant_vector),
-                               builder.ConstantR2<float>(constant_matrix)});
+  builder.Tuple({builder.ConstantR0<float>(constant_scalar),
+                 builder.ConstantR1<float>(constant_vector),
+                 builder.ConstantR2<float>(constant_matrix)});
 
   auto expected =
       Literal::MakeTuple({Literal::CreateR0<float>(constant_scalar).get(),
@@ -94,9 +97,9 @@ XLA_TEST_F(TupleTest, TupleCreate) {
 
 // Tests the creation of tuple data.
 XLA_TEST_F(TupleTest, TupleCreateWithZeroElementEntry) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
-  auto result = builder.Tuple(
+  builder.Tuple(
       {builder.ConstantR0<float>(7.0), builder.ConstantR1<float>({})});
 
   auto expected = Literal::MakeTuple({Literal::CreateR0<float>(7.0).get(),
@@ -106,15 +109,15 @@ XLA_TEST_F(TupleTest, TupleCreateWithZeroElementEntry) {
 
 // Tests the creation of an empty tuple.
 XLA_TEST_F(TupleTest, EmptyTupleCreate) {
-  ComputationBuilder builder(client_, TestName());
-  auto result = builder.Tuple({});
+  XlaBuilder builder(TestName());
+  builder.Tuple({});
   auto expected = Literal::MakeTuple({});
   ComputeAndCompareTuple(&builder, *expected, {}, error_spec_);
 }
 
 // Trivial test for extracting a tuple element with GetTupleElement.
 XLA_TEST_F(TupleTest, GetTupleElement) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::initializer_list<float> constant_vector = {1.f, 2.f, 3.f};
   std::initializer_list<std::initializer_list<float>> constant_matrix = {
       {1.f, 2.f, 3.f},  // row 0
@@ -122,23 +125,23 @@ XLA_TEST_F(TupleTest, GetTupleElement) {
   };
   auto tuple_data = builder.Tuple({builder.ConstantR1<float>(constant_vector),
                                    builder.ConstantR2<float>(constant_matrix)});
-  auto matrix_element = builder.GetTupleElement(tuple_data, 1);
+  builder.GetTupleElement(tuple_data, 1);
   ComputeAndCompareR2<float>(&builder, Array2D<float>(constant_matrix), {},
                              error_spec_);
 }
 
 // Trivial test for extracting a tuple element with GetTupleElement.
 XLA_TEST_F(TupleTest, GetTupleElementWithZeroElements) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto tuple_data = builder.Tuple(
       {builder.ConstantR1<float>({}),
        builder.ConstantR2FromArray2D<float>(Array2D<float>(0, 101))});
-  auto matrix_element = builder.GetTupleElement(tuple_data, 1);
+  builder.GetTupleElement(tuple_data, 1);
   ComputeAndCompareR2<float>(&builder, Array2D<float>(0, 101), {}, error_spec_);
 }
 
 XLA_TEST_F(TupleTest, GetTupleElementOfNonTupleFailsGracefully) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto value = builder.ConstantR1<float>({4.5f});
   builder.GetTupleElement(value, 1);
   auto result_status = builder.Build();
@@ -151,7 +154,7 @@ XLA_TEST_F(TupleTest, GetTupleElementOfNonTupleFailsGracefully) {
 // Extracts both elements from a tuple with GetTupleElement and then adds them
 // together.
 XLA_TEST_F(TupleTest, AddTupleElements) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::initializer_list<float> constant_vector = {1.f, 2.f, 3.f};
   std::initializer_list<std::initializer_list<float>> constant_matrix = {
       {1.f, 2.f, 3.f},  // row 0
@@ -163,22 +166,22 @@ XLA_TEST_F(TupleTest, AddTupleElements) {
   auto matrix_element = builder.GetTupleElement(tuple_data, 1);
   auto vector_shape = builder.GetShape(vector_element).ConsumeValueOrDie();
   auto matrix_shape = builder.GetShape(matrix_element).ConsumeValueOrDie();
-  auto result = builder.Add(matrix_element, vector_element,
-                            /*broadcast_dimensions=*/{1});
+  builder.Add(matrix_element, vector_element,
+              /*broadcast_dimensions=*/{1});
 
   Array2D<float> expected({
       {2.f, 4.f, 6.f},  // row 0
       {5.f, 7.f, 9.f},  // row 1
   });
-  ASSERT_TRUE(ShapeUtil::ShapeIs(*vector_shape, F32, {3}));
-  ASSERT_TRUE(ShapeUtil::ShapeIs(*matrix_shape, F32, {/*y=*/2, /*x=*/3}));
+  ASSERT_TRUE(ShapeUtil::ShapeIs(vector_shape, F32, {3}));
+  ASSERT_TRUE(ShapeUtil::ShapeIs(matrix_shape, F32, {/*y=*/2, /*x=*/3}));
   ComputeAndCompareR2<float>(&builder, expected, {}, error_spec_);
 }
 
 // Extracts both elements from a tuple and then puts them into a new tuple in
 // the opposite order.
 XLA_TEST_F(TupleTest, TupleGTEToTuple) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::initializer_list<float> constant_vector = {1.f, 2.f, 3.f};
   std::initializer_list<std::initializer_list<float>> constant_matrix = {
       {1.f, 2.f, 3.f},  // row 0
@@ -186,8 +189,8 @@ XLA_TEST_F(TupleTest, TupleGTEToTuple) {
   };
   auto tuple_data = builder.Tuple({builder.ConstantR1<float>(constant_vector),
                                    builder.ConstantR2<float>(constant_matrix)});
-  auto new_tuple = builder.Tuple({builder.GetTupleElement(tuple_data, 1),
-                                  builder.GetTupleElement(tuple_data, 0)});
+  builder.Tuple({builder.GetTupleElement(tuple_data, 1),
+                 builder.GetTupleElement(tuple_data, 0)});
   auto expected =
       Literal::MakeTuple({Literal::CreateR2<float>(constant_matrix).get(),
                           Literal::CreateR1<float>(constant_vector).get()});
@@ -195,8 +198,8 @@ XLA_TEST_F(TupleTest, TupleGTEToTuple) {
 }
 
 XLA_TEST_F(TupleTest, SelectBetweenPredTuples) {
-  ComputationBuilder b(client_, TestName());
-  ComputationDataHandle v1, v2;
+  XlaBuilder b(TestName());
+  XlaOp v1, v2;
 
   for (bool direction : {false, true}) {
     std::unique_ptr<GlobalData> v1_data =
@@ -209,7 +212,7 @@ XLA_TEST_F(TupleTest, SelectBetweenPredTuples) {
     auto v2_gt = b.Gt(v2, v1);             // true
     auto v1_v2 = b.Tuple({v1_gt, v2_gt});  // {false, true}
     auto v2_v1 = b.Tuple({v2_gt, v1_gt});  // {true, false}
-    auto select = b.Select(direction ? v1_gt : v2_gt, v1_v2, v2_v1);
+    b.Select(direction ? v1_gt : v2_gt, v1_v2, v2_v1);
     auto expected =
         Literal::MakeTuple({Literal::CreateR0<bool>(direction).get(),
                             Literal::CreateR0<bool>(!direction).get()});
@@ -236,7 +239,7 @@ XLA_TEST_F(TupleTest, TupleGTEToTupleToGTEAdd) {
   //              \                (tuple10)--                     /
   //               \              /           \                   /
   //                -----(GTE 0)--             --(GTE 1)----------
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::initializer_list<float> constant_vector = {1.f, 2.f, 3.f};
   std::initializer_list<std::initializer_list<float>> constant_matrix = {
       {1.f, 2.f, 3.f},  // row 0
@@ -256,8 +259,8 @@ XLA_TEST_F(TupleTest, TupleGTEToTupleToGTEAdd) {
   auto addvectors = builder.Add(vector_from_01, vector_from_10);
   auto addmatrices = builder.Add(matrix_from_01, matrix_from_10);
 
-  auto result = builder.Add(addmatrices, addvectors,
-                            /*broadcast_dimensions=*/{1});
+  builder.Add(addmatrices, addvectors,
+              /*broadcast_dimensions=*/{1});
 
   Array2D<float> expected({
       {4.f, 8.f, 12.f},    // row 0
@@ -268,7 +271,7 @@ XLA_TEST_F(TupleTest, TupleGTEToTupleToGTEAdd) {
 
 XLA_TEST_F(TupleTest, DISABLED_ON_CPU_PARALLEL(SelectBetweenTuplesOnFalse)) {
   // Tests a selection between tuples with "false" path taken.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::initializer_list<float> vec1 = {1.f, 2.f, 3.f};
   std::initializer_list<float> vec2 = {2.f, 4.f, 6.f};
@@ -277,8 +280,7 @@ XLA_TEST_F(TupleTest, DISABLED_ON_CPU_PARALLEL(SelectBetweenTuplesOnFalse)) {
   auto tuple21 = builder.Tuple(
       {builder.ConstantR1<float>(vec2), builder.ConstantR1<float>(vec1)});
 
-  auto select =
-      builder.Select(builder.ConstantR0<bool>(false), tuple12, tuple21);
+  builder.Select(builder.ConstantR0<bool>(false), tuple12, tuple21);
   auto expected = Literal::MakeTuple({Literal::CreateR1<float>(vec2).get(),
                                       Literal::CreateR1<float>(vec1).get()});
   ComputeAndCompareTuple(&builder, *expected, {}, error_spec_);
@@ -313,7 +315,7 @@ XLA_TEST_F(TupleTest, TuplesInAMap) {
 
 XLA_TEST_F(TupleTest, DISABLED_ON_CPU_PARALLEL(SelectBetweenTuplesOnTrue)) {
   // Tests a selection between tuples with "true" path taken.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::initializer_list<float> vec1 = {1.f, 2.f, 3.f};
   std::initializer_list<float> vec2 = {2.f, 4.f, 6.f};
@@ -322,8 +324,7 @@ XLA_TEST_F(TupleTest, DISABLED_ON_CPU_PARALLEL(SelectBetweenTuplesOnTrue)) {
   auto tuple21 = builder.Tuple(
       {builder.ConstantR1<float>(vec2), builder.ConstantR1<float>(vec1)});
 
-  auto select =
-      builder.Select(builder.ConstantR0<bool>(true), tuple12, tuple21);
+  builder.Select(builder.ConstantR0<bool>(true), tuple12, tuple21);
   auto expected = Literal::MakeTuple({Literal::CreateR1<float>(vec1).get(),
                                       Literal::CreateR1<float>(vec2).get()});
   ComputeAndCompareTuple(&builder, *expected, {}, error_spec_);
@@ -332,7 +333,7 @@ XLA_TEST_F(TupleTest, DISABLED_ON_CPU_PARALLEL(SelectBetweenTuplesOnTrue)) {
 XLA_TEST_F(TupleTest, SelectBetweenTuplesElementResult) {
   // Tests a selection between tuples but the final result is an element of the
   // tuple, not the whole tuple.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::initializer_list<float> vec1 = {1.f, 2.f, 3.f};
   std::initializer_list<float> vec2 = {2.f, 4.f, 6.f};
@@ -343,7 +344,7 @@ XLA_TEST_F(TupleTest, SelectBetweenTuplesElementResult) {
 
   auto select =
       builder.Select(builder.ConstantR0<bool>(false), tuple12, tuple21);
-  auto element = builder.GetTupleElement(select, 0);
+  builder.GetTupleElement(select, 0);
 
   ComputeAndCompareR1<float>(&builder, vec2, {}, error_spec_);
 }
@@ -367,7 +368,7 @@ XLA_TEST_F(TupleTest, DISABLED_ON_CPU_PARALLEL(SelectBetweenTuplesCascaded)) {
   //                                /             --(GTE 1)--
   //                               /
   //                          (tuple 21)
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::initializer_list<float> vec1 = {1.f, 2.f, 3.f};
   std::initializer_list<float> vec2 = {2.f, 4.f, 6.f};
@@ -383,8 +384,8 @@ XLA_TEST_F(TupleTest, DISABLED_ON_CPU_PARALLEL(SelectBetweenTuplesCascaded)) {
       builder.Select(builder.GetTupleElement(pred_tuple, 0), tuple12, tuple21);
   auto select2 =
       builder.Select(builder.GetTupleElement(pred_tuple, 1), tuple21, select1);
-  auto result = builder.Add(builder.GetTupleElement(select2, 0),
-                            builder.GetTupleElement(select2, 1));
+  builder.Add(builder.GetTupleElement(select2, 0),
+              builder.GetTupleElement(select2, 1));
 
   ComputeAndCompareR1<float>(&builder, {3.f, 6.f, 9.f}, {}, error_spec_);
 }
@@ -393,7 +394,7 @@ XLA_TEST_F(TupleTest,
            DISABLED_ON_CPU_PARALLEL(SelectBetweenTuplesReuseConstants)) {
   // Similar to SelectBetweenTuples, but the constants are shared between the
   // input tuples.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::initializer_list<float> vec1 = {1.f, 2.f, 3.f};
   std::initializer_list<float> vec2 = {2.f, 4.f, 6.f};
@@ -402,19 +403,18 @@ XLA_TEST_F(TupleTest,
   auto tuple12 = builder.Tuple({c1, c2});
   auto tuple21 = builder.Tuple({c2, c1});
 
-  auto select =
-      builder.Select(builder.ConstantR0<bool>(false), tuple12, tuple21);
+  builder.Select(builder.ConstantR0<bool>(false), tuple12, tuple21);
+
   auto expected = Literal::MakeTuple({Literal::CreateR1<float>(vec2).get(),
                                       Literal::CreateR1<float>(vec1).get()});
   ComputeAndCompareTuple(&builder, *expected, {}, error_spec_);
 }
 
 XLA_TEST_F(TupleTest, NestedTuples) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto inner_tuple = builder.Tuple(
       {builder.ConstantR1<float>({1.0, 2.0}), builder.ConstantR0<float>(42.0)});
-  auto outer_tuple =
-      builder.Tuple({inner_tuple, builder.ConstantR1<float>({22.0, 44.0})});
+  builder.Tuple({inner_tuple, builder.ConstantR1<float>({22.0, 44.0})});
 
   auto expected_v1 = Literal::CreateR1<float>({1.0, 2.0});
   auto expected_s = Literal::CreateR0<float>(42.0);
@@ -428,7 +428,7 @@ XLA_TEST_F(TupleTest, NestedTuples) {
 }
 
 XLA_TEST_F(TupleTest, GetTupleElementOfNestedTuple) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Shape data_shape = ShapeUtil::MakeShape(F32, {3});
   Shape inner_tuple_shape = ShapeUtil::MakeTupleShape({data_shape, data_shape});
@@ -459,7 +459,7 @@ XLA_TEST_F(TupleTest, GetTupleElementOfNestedTuple) {
 }
 
 XLA_TEST_F(TupleTest, ComplexTuples) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   {
     Shape c64r0 = ShapeUtil::MakeShape(C64, {});
     Shape c64r1 = ShapeUtil::MakeShape(C64, {2});
@@ -514,5 +514,33 @@ XLA_TEST_F(TupleTest, ComplexTuples) {
                          error_spec_);
 }
 
+class TupleHloTest : public HloTestBase {};
+
+// Disabled on CPU parallel because that's broken and will be removed soon.
+// Disabled on the interpreter because bitcast doesn't exist on the interpreter.
+TEST_F(TupleHloTest,
+       DISABLED_ON_INTERPRETER(DISABLED_ON_CPU_PARALLEL(BitcastAfterGTE))) {
+  const char* testcase = R"(
+    HloModule m
+
+    ENTRY test {
+      name.1 = (f32[3]{0}) parameter(0)
+      get-tuple-element.1 = f32[3]{0} get-tuple-element(name.1), index=0
+      bitcast = f32[1,3]{1,0} bitcast(get-tuple-element.1)
+      copy = f32[1,3]{1,0} copy(bitcast)
+      ROOT tuple.4 = (f32[1,3]{1,0}) tuple(copy)
+    }
+  )";
+  auto module =
+      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
+          .ValueOrDie();
+  auto param = Literal::MakeTupleOwned(Literal::CreateR1<float>({1, 2, 3}));
+  TF_ASSERT_OK_AND_ASSIGN(auto result,
+                          ExecuteNoHloPasses(std::move(module), {param.get()}));
+  EXPECT_TRUE(LiteralTestUtil::Equal(
+      *result,
+      *Literal::MakeTupleOwned(Literal::CreateR2<float>({{1, 2, 3}}))));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/while_test.cc b/tensorflow/compiler/xla/tests/while_test.cc
index 33d457c70bac84c2da10e3cf9302c2c952cf1bc2..89ce2ce797f979b8668fbdb172a4a3abc5922b9f 100644
--- a/tensorflow/compiler/xla/tests/while_test.cc
+++ b/tensorflow/compiler/xla/tests/while_test.cc
@@ -18,10 +18,10 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -54,29 +54,28 @@ TEST_F(WhileTest, WhileWithScalarS32Result) {
   auto result_shape = ShapeUtil::MakeShape(S32, {});
 
   // Create a computation for the condition: repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     builder.Gt(builder.ConstantR0<int32>(5), prev);
     condition = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a computation for the body: add 1 to the result variable.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto input = builder.ConstantR0<int32>(1);
-    auto result = builder.Add(input, prev);
+    builder.Add(input, prev);
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto init = builder.ConstantR0<int32>(0);
-  auto result = builder.While(condition, body, init);
-  auto shape = builder.GetShape(result).ConsumeValueOrDie();
+  builder.While(condition, body, init);
 
   ComputeAndCompareR0<int32>(&builder, 5, {});
 }
@@ -91,29 +90,28 @@ TEST_F(WhileTest, WhileWithScalarS64Result) {
   auto result_shape = ShapeUtil::MakeShape(S64, {});
 
   // Create a computation for the condition: repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     builder.Gt(builder.ConstantR0<int64>(5), prev);
     condition = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a computation for the body: add 1 to the result variable.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto input = builder.ConstantR0<int64>(1);
-    auto result = builder.Add(input, prev);
+    builder.Add(input, prev);
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto init = builder.ConstantR0<int64>(0);
-  auto result = builder.While(condition, body, init);
-  auto shape = builder.GetShape(result).ConsumeValueOrDie();
+  builder.While(condition, body, init);
 
   ComputeAndCompareR0<int64>(&builder, 5, {});
 }
@@ -123,31 +121,30 @@ TEST_F(WhileTest, WhileWithScalarResultNonConstInit) {
   auto orig_shape = ShapeUtil::MakeShape(S32, {2});
 
   // Create a computation for the condition: repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     builder.Gt(builder.ConstantR0<int32>(5), prev);
     condition = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a computation for the body: add 1 to the result variable.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto input = builder.ConstantR0<int32>(1);
-    auto result = builder.Add(input, prev);
+    builder.Add(input, prev);
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto init = builder.Reduce(builder.ConstantR1<int32>(2, 1),
                              builder.ConstantR0<int32>(0),
                              CreateScalarAddComputation(S32, &builder), {0});
-  auto result = builder.While(condition, body, init);
-  auto shape = builder.GetShape(result).ConsumeValueOrDie();
+  builder.While(condition, body, init);
 
   ComputeAndCompareR0<int32>(&builder, 5, {});
 }
@@ -156,28 +153,28 @@ TEST_F(WhileTest, WhileWithPredicateResult) {
   auto result_shape = ShapeUtil::MakeShape(PRED, {});
 
   // Create a computation for the condition: run until condition is true.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     builder.Ne(builder.ConstantR0<bool>(true), prev);
     condition = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a computation for the body: or condition with true.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
-    auto result = builder.Or(prev, builder.ConstantR0<bool>(true));
+    builder.Or(prev, builder.ConstantR0<bool>(true));
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto init = builder.Ne(builder.ConstantR0<bool>(false),
                          builder.ConstantR0<bool>(true));
-  auto result = builder.While(condition, body, init);
+  builder.While(condition, body, init);
 
   ComputeAndCompareR0<bool>(&builder, true, {});
 }
@@ -194,9 +191,9 @@ TEST_F(WhileTest, DISABLED_WhileWithEmptyVectorResult) {
   Shape result_shape = ShapeUtil::MakeShape(F32, {0});
 
   // Create a computation for the reduction.
-  Computation add;
+  XlaComputation add;
   {
-    ComputationBuilder builder(client_, "add");
+    XlaBuilder builder("add");
     auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {}), "y");
     builder.Add(x, y);
@@ -205,33 +202,34 @@ TEST_F(WhileTest, DISABLED_WhileWithEmptyVectorResult) {
 
   // Create a computation for the condition.
   // Repeat until the sum of the result vector is less than 15.5f.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto sum = builder.Reduce(prev, builder.ConstantR0<float>(0.0f), add,
                               /*dimensions_to_reduce=*/{0});
-    auto test = builder.Gt(builder.ConstantR0<float>(15.5f), sum);
+    builder.Gt(builder.ConstantR0<float>(15.5f), sum);
     condition = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a computation for the body.
   // Add a constant vector of 1.f to the result vector.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto input = builder.ConstantR1<float>({});
-    auto result = builder.Add(input, prev);
+    builder.Add(input, prev);
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.ConstantR1<float>({});
   auto result = builder.While(condition, body, init);
-  VLOG(2) << "while = " << ShapeUtil::HumanString(
-                               *builder.GetShape(result).ConsumeValueOrDie());
+  VLOG(2) << "while = "
+          << ShapeUtil::HumanString(
+                 builder.GetShape(result).ConsumeValueOrDie());
 
   ComputeAndCompareR1<float>(&builder, {}, {}, ErrorSpec(0.0001));
 }
@@ -247,9 +245,9 @@ TEST_F(WhileTest, WhileWithVectorResult) {
   Shape result_shape = ShapeUtil::MakeShape(F32, {8});
 
   // Create a computation for the reduction.
-  Computation add;
+  XlaComputation add;
   {
-    ComputationBuilder builder(client_, "add");
+    XlaBuilder builder("add");
     auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {}), "y");
     builder.Add(x, y);
@@ -258,33 +256,34 @@ TEST_F(WhileTest, WhileWithVectorResult) {
 
   // Create a computation for the condition.
   // Repeat until the sum of the result vector is less than 5.5f.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto sum = builder.Reduce(prev, builder.ConstantR0<float>(0.0f), add,
                               /*dimensions_to_reduce=*/{0});
-    auto test = builder.Gt(builder.ConstantR0<float>(15.5f), sum);
+    builder.Gt(builder.ConstantR0<float>(15.5f), sum);
     condition = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a computation for the body.
   // Add a constant vector of 1.f to the result vector.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto input = builder.ConstantR1<float>(8, 0.125f);
-    auto result = builder.Add(input, prev);
+    builder.Add(input, prev);
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.ConstantR1<float>(8, 0.f);
   auto result = builder.While(condition, body, init);
-  VLOG(2) << "while = " << ShapeUtil::HumanString(
-                               *builder.GetShape(result).ConsumeValueOrDie());
+  VLOG(2) << "while = "
+          << ShapeUtil::HumanString(
+                 builder.GetShape(result).ConsumeValueOrDie());
 
   // Individual elements with increase by 1/8 each time through the loop, so
   // the sum will increase by 1.0.  It will first be >15.5 when the elements
@@ -306,9 +305,9 @@ TEST_F(WhileTest, WhileWithVectorResultIntoTuple) {
   Shape result_shape = ShapeUtil::MakeShape(F32, {8});
 
   // Create a computation for the reduction.
-  Computation add;
+  XlaComputation add;
   {
-    ComputationBuilder builder(client_, "add");
+    XlaBuilder builder("add");
     auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {}), "y");
     builder.Add(x, y);
@@ -317,34 +316,34 @@ TEST_F(WhileTest, WhileWithVectorResultIntoTuple) {
 
   // Create a computation for the condition.
   // Repeat until the sum of the result vector is less than 5.5f.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto sum = builder.Reduce(prev, builder.ConstantR0<float>(0.0f), add,
                               /*dimensions_to_reduce=*/{0});
-    auto test = builder.Gt(builder.ConstantR0<float>(15.5f), sum);
+    builder.Gt(builder.ConstantR0<float>(15.5f), sum);
     condition = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a computation for the body.
   // Add a constant vector of 1.f to the result vector.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto input = builder.ConstantR1<float>(8, 0.125f);
-    auto result = builder.Add(input, prev);
+    builder.Add(input, prev);
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.ConstantR1<float>(8, 0.f);
   auto result = builder.While(condition, body, init);
   VLOG(2) << "while = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(result).ConsumeValueOrDie());
+                 builder.GetShape(result).ConsumeValueOrDie());
   builder.Tuple({result});
 
   // Individual elements with increase by 1/8 each time through the loop, so
@@ -366,9 +365,9 @@ TEST_F(WhileTest, WhileWithPermutationAndTupleResult) {
   // Create a computation for the condition.
   // Repeat for N iterations.
   const int N = 2;
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Gt(builder.ConstantR0<int32>(N), iteration);
@@ -377,28 +376,28 @@ TEST_F(WhileTest, WhileWithPermutationAndTupleResult) {
 
   // Create a computation for the body.
   // Add 1 to the iteration variable and permute the weights.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     auto w1 = builder.GetTupleElement(prev, 1);
     auto w2 = builder.GetTupleElement(prev, 2);
     auto w3 = builder.GetTupleElement(prev, 3);
-    auto result = builder.Tuple(
+    builder.Tuple(
         {builder.Add(iteration, builder.ConstantR0<int32>(1)), w3, w1, w2});
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.Tuple(
       {builder.ConstantR0<int32>(0), builder.ConstantR1<float>(3, 1.f),
        builder.ConstantR1<float>(3, 2.f), builder.ConstantR1<float>(3, 3.f)});
   auto result = builder.While(condition, body, init);
   VLOG(2) << "result = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(result).ConsumeValueOrDie());
+                 builder.GetShape(result).ConsumeValueOrDie());
 
   auto expected_counter = Literal::CreateR0<int32>(N);
   auto expected_w1 = Literal::CreateR1<float>({1.0f, 1.0f, 1.0f});
@@ -419,9 +418,9 @@ TEST_F(WhileTest, WhileWithPermutationAndVectorResult) {
   // Create a computation for the condition.
   // Repeat for N iterations.
   const int N = 2;
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Gt(builder.ConstantR0<int32>(N), iteration);
@@ -430,21 +429,21 @@ TEST_F(WhileTest, WhileWithPermutationAndVectorResult) {
 
   // Create a computation for the body.
   // Add 1 to the iteration variable permute the weights.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     auto w1 = builder.GetTupleElement(prev, 1);
     auto w2 = builder.GetTupleElement(prev, 2);
     auto w3 = builder.GetTupleElement(prev, 3);
-    auto result = builder.Tuple(
+    builder.Tuple(
         {builder.Add(iteration, builder.ConstantR0<int32>(1)), w3, w1, w2});
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.Tuple(
       {builder.ConstantR0<int32>(0), builder.ConstantR1<float>(3, 1.f),
        builder.ConstantR1<float>(3, 2.f), builder.ConstantR1<float>(3, 3.f)});
@@ -455,7 +454,7 @@ TEST_F(WhileTest, WhileWithPermutationAndVectorResult) {
   auto result = builder.Add(add12, builder.GetTupleElement(xla_while, 3));
   VLOG(2) << "result = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(result).ConsumeValueOrDie());
+                 builder.GetShape(result).ConsumeValueOrDie());
   std::vector<float> expected = {6.f, 6.f, 6.f};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
@@ -474,9 +473,9 @@ TEST_F(WhileTest, WhileWithTupleResult) {
 
   // Create a computation for the condition.
   // Repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Gt(builder.ConstantR0<int32>(5), iteration);
@@ -486,26 +485,27 @@ TEST_F(WhileTest, WhileWithTupleResult) {
   // Create a computation for the body.
   // Add 1 to the iteration variable and add a constant vector of 1.0f to
   // the weight variable, both of which are tuple elements.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     auto weights = builder.GetTupleElement(prev, 1);
     auto input = builder.ConstantR1<float>(10, 1.f);
     auto new_weights = builder.Add(weights, input);
-    auto result = builder.Tuple(
+    builder.Tuple(
         {builder.Add(iteration, builder.ConstantR0<int32>(1)), new_weights});
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.Tuple(
       {builder.ConstantR0<int32>(0), builder.ConstantR1<float>(10, 0.f)});
   auto result = builder.While(condition, body, init);
-  VLOG(2) << "while = " << ShapeUtil::HumanString(
-                               *builder.GetShape(result).ConsumeValueOrDie());
+  VLOG(2) << "while = "
+          << ShapeUtil::HumanString(
+                 builder.GetShape(result).ConsumeValueOrDie());
 
   auto expected_counter = Literal::CreateR0<int32>(5);
   auto expected_data = Literal::CreateR1<float>(
@@ -523,9 +523,9 @@ TEST_F(WhileTest, WhileWithPredicateTupleResult) {
 
   // Create a computation for the condition.
   // Repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Gt(builder.ConstantR0<int32>(5), iteration);
@@ -534,27 +534,27 @@ TEST_F(WhileTest, WhileWithPredicateTupleResult) {
 
   // Create a computation for the body.
   // Add 1 to the iteration variable and or the predicate with true
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     auto pred = builder.GetTupleElement(prev, 1);
     auto new_pred = builder.Or(pred, builder.ConstantR0<bool>(true));
-    auto result = builder.Tuple(
+    builder.Tuple(
         {builder.Add(iteration, builder.ConstantR0<int32>(1)), new_pred});
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.Tuple({builder.ConstantR0<int32>(0),
                              builder.Ne(builder.ConstantR0<bool>(false),
                                         builder.ConstantR0<bool>(true))});
   auto result = builder.While(condition, body, init);
   VLOG(2) << "while = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(result).ConsumeValueOrDie());
+                 builder.GetShape(result).ConsumeValueOrDie());
 
   auto expected_counter = Literal::CreateR0<int32>(5);
   auto expected_predicate = Literal::CreateR0<bool>(true);
@@ -570,9 +570,9 @@ TEST_F(WhileTest, WhileWithTupleConstantScalarResult) {
 
   // Create a computation for the condition.
   // Repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Gt(builder.ConstantR0<int32>(5), iteration);
@@ -582,25 +582,24 @@ TEST_F(WhileTest, WhileWithTupleConstantScalarResult) {
   // Create a computation for the body.
   // Add 1 to the iteration variable and set the other tuple element to a
   // constant.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
-    auto result =
-        builder.Tuple({builder.Add(iteration, builder.ConstantR0<int32>(1)),
-                       builder.ConstantR0<int32>(7)});
+    builder.Tuple({builder.Add(iteration, builder.ConstantR0<int32>(1)),
+                   builder.ConstantR0<int32>(7)});
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.Tuple(
       {builder.ConstantR0<int32>(0), builder.ConstantR0<int32>(7)});
   auto result = builder.While(condition, body, init);
   VLOG(2) << "while = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(result).ConsumeValueOrDie());
+                 builder.GetShape(result).ConsumeValueOrDie());
 
   auto expected_counter = Literal::CreateR0<int32>(5);
   auto expected_data = Literal::CreateR0<int32>(7);
@@ -631,20 +630,20 @@ TEST_F(WhileTest, TwoWhileWithTupleResult) {
 
   // Create a computation for the condition.
   // Repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   const int c1 = 5;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Lt(iteration, builder.ConstantR0<int32>(c1));
     TF_ASSERT_OK_AND_ASSIGN(condition, builder.Build());
   }
 
-  Computation condition2;
+  XlaComputation condition2;
   const int c2 = 7;
   {
-    ComputationBuilder builder(client_, "condition2");
+    XlaBuilder builder("condition2");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Lt(iteration, builder.ConstantR0<int32>(c2));
@@ -654,34 +653,34 @@ TEST_F(WhileTest, TwoWhileWithTupleResult) {
   // Create a computation for the body.
   // Add 1 to the iteration variable and add a constant vector of 1.0f to
   // the weight variable, both of which are tuple elements.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     auto weights = builder.GetTupleElement(prev, 1);
     auto input = builder.ConstantR1<float>(10, 1.f);
     auto new_weights = builder.Add(weights, input);
-    auto result = builder.Tuple(
+    builder.Tuple(
         {builder.Add(iteration, builder.ConstantR0<int32>(1)), new_weights});
     TF_ASSERT_OK_AND_ASSIGN(body, builder.Build());
   }
 
-  Computation body2;
+  XlaComputation body2;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     auto weights = builder.GetTupleElement(prev, 1);
     auto input = builder.ConstantR1<float>(10, 1.f);
     auto new_weights = builder.Add(weights, input);
-    auto result = builder.Tuple(
+    builder.Tuple(
         {builder.Add(iteration, builder.ConstantR0<int32>(1)), new_weights});
     TF_ASSERT_OK_AND_ASSIGN(body2, builder.Build());
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.Tuple(
       {builder.ConstantR0<int32>(0), builder.ConstantR1<float>(10, 0.f)});
   auto while1 = builder.While(condition, body, init);
@@ -692,11 +691,11 @@ TEST_F(WhileTest, TwoWhileWithTupleResult) {
   auto while_result2 = builder.GetTupleElement(while2, 1);
   VLOG(2) << "while_result2 = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(while_result2).ConsumeValueOrDie());
+                 builder.GetShape(while_result2).ConsumeValueOrDie());
   auto result = builder.Add(while_result1, while_result2);
   VLOG(2) << "result = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(result).ConsumeValueOrDie());
+                 builder.GetShape(result).ConsumeValueOrDie());
   const float sum = c1 + c2;
   std::vector<float> expected(10, sum);
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
@@ -710,20 +709,20 @@ TEST_F(WhileTest, TwoWhileLoopsAndSharedBody) {
 
   // Create a computation for the condition.
   // Repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   const int c1 = 5;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Lt(iteration, builder.ConstantR0<int32>(c1));
     TF_ASSERT_OK_AND_ASSIGN(condition, builder.Build());
   }
 
-  Computation condition2;
+  XlaComputation condition2;
   const int c2 = 7;
   {
-    ComputationBuilder builder(client_, "condition2");
+    XlaBuilder builder("condition2");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Lt(iteration, builder.ConstantR0<int32>(c2));
@@ -733,21 +732,21 @@ TEST_F(WhileTest, TwoWhileLoopsAndSharedBody) {
   // Create a computation for the body.
   // Add 1 to the iteration variable and add a constant vector of 1.0f to
   // the weight variable, both of which are tuple elements.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     auto weights = builder.GetTupleElement(prev, 1);
     auto input = builder.ConstantR1<float>(10, 1.f);
     auto new_weights = builder.Add(weights, input);
-    auto result = builder.Tuple(
+    builder.Tuple(
         {builder.Add(iteration, builder.ConstantR0<int32>(1)), new_weights});
     TF_ASSERT_OK_AND_ASSIGN(body, builder.Build());
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.Tuple(
       {builder.ConstantR0<int32>(0), builder.ConstantR1<float>(10, 0.f)});
   auto while1 = builder.While(condition, body, init);
@@ -758,11 +757,11 @@ TEST_F(WhileTest, TwoWhileLoopsAndSharedBody) {
   auto while_result2 = builder.GetTupleElement(while2, 1);
   VLOG(2) << "while_result2 = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(while_result2).ConsumeValueOrDie());
+                 builder.GetShape(while_result2).ConsumeValueOrDie());
   auto result = builder.Add(while_result1, while_result2);
   VLOG(2) << "result = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(result).ConsumeValueOrDie());
+                 builder.GetShape(result).ConsumeValueOrDie());
   const float sum = c1 + c2;
   std::vector<float> expected(10, sum);
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
@@ -777,20 +776,20 @@ TEST_F(WhileTest, DISABLED_ON_GPU(WhileLoopsWithSharedBodyAndInit)) {
 
   // Create a computation for the condition.
   // Repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   const int c1 = 5;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Lt(iteration, builder.ConstantR0<int32>(c1));
     TF_ASSERT_OK_AND_ASSIGN(condition, builder.Build());
   }
 
-  Computation condition2;
+  XlaComputation condition2;
   const int c2 = 7;
   {
-    ComputationBuilder builder(client_, "condition2");
+    XlaBuilder builder("condition2");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Lt(iteration, builder.ConstantR0<int32>(c2));
@@ -800,21 +799,21 @@ TEST_F(WhileTest, DISABLED_ON_GPU(WhileLoopsWithSharedBodyAndInit)) {
   // Create a computation for the body.
   // Add 1 to the iteration variable and add a constant vector of 1.0f to
   // the weight variable, both of which are tuple elements.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     auto weights = builder.GetTupleElement(prev, 1);
     auto input = builder.ConstantR1<float>(10, 1.f);
     auto new_weights = builder.Add(weights, input);
-    auto result = builder.Tuple(
+    builder.Tuple(
         {builder.Add(iteration, builder.ConstantR0<int32>(1)), new_weights});
     TF_ASSERT_OK_AND_ASSIGN(body, builder.Build());
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.Tuple(
       {builder.ConstantR0<int32>(0), builder.ConstantR1<float>(10, 0.f)});
   auto while1 = builder.While(condition, body, init);
@@ -824,11 +823,11 @@ TEST_F(WhileTest, DISABLED_ON_GPU(WhileLoopsWithSharedBodyAndInit)) {
   auto while_result2 = builder.GetTupleElement(while2, 1);
   VLOG(2) << "while_result2 = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(while_result2).ConsumeValueOrDie());
+                 builder.GetShape(while_result2).ConsumeValueOrDie());
   auto result = builder.Add(while_result1, while_result2);
   VLOG(2) << "result = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(result).ConsumeValueOrDie());
+                 builder.GetShape(result).ConsumeValueOrDie());
   const float sum = c1 + c2;
   std::vector<float> expected(10, sum);
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
@@ -844,9 +843,9 @@ XLA_TEST_F(WhileTest, WhileWithDynamicUpdateSlice) {
 
   // Create a computation for the condition.
   // Repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Gt(builder.ConstantR0<int32>(5), iteration);
@@ -856,9 +855,9 @@ XLA_TEST_F(WhileTest, WhileWithDynamicUpdateSlice) {
   // Create a computation for the body.
   // Add 1 to the iteration variable and add a constant vector of 1.0f to
   // the weight variable, both of which are tuple elements.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     // TupleElement 0
     auto iteration = builder.GetTupleElement(prev, 0);
@@ -873,18 +872,18 @@ XLA_TEST_F(WhileTest, WhileWithDynamicUpdateSlice) {
     // UpdateSlice.
     auto out1 = builder.DynamicUpdateSlice(input, update, starts);
 
-    auto result = builder.Tuple({out0, out1});
+    builder.Tuple({out0, out1});
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.Tuple(
       {builder.ConstantR0<int32>(0), builder.ConstantR1<float>(10, 0.f)});
   auto result = builder.While(condition, body, init);
   VLOG(2) << "while = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(result).ConsumeValueOrDie());
+                 builder.GetShape(result).ConsumeValueOrDie());
 
   auto expected_counter = Literal::CreateR0<int32>(5);
   auto expected_data = Literal::CreateR1<float>(
@@ -915,18 +914,18 @@ TEST_F(WhileTest, DISABLED_ON_INTERPRETER(WhileWithPrngScalarResult)) {
 
   // Create a computation for the condition: repeat for count iterations.
   auto build_condition = [this, v6s32](int count) {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     auto prev = builder.Reshape(
         builder.Slice(builder.Parameter(0, v6s32, "prev"), {0}, {1}, {1}), {0},
-          {});
+        {});
     builder.Gt(builder.ConstantR0<int32>(count), prev);
     return builder.Build().ConsumeValueOrDie();
   };
 
   // Create a computation for the body: add 1 to the result variable.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, v6s32, "prev");
     auto inc = builder.ConcatInDim(
         {builder.ConstantR1<int32>({1}),
@@ -934,16 +933,15 @@ TEST_F(WhileTest, DISABLED_ON_INTERPRETER(WhileWithPrngScalarResult)) {
                             builder.ConstantR0<int32>(100),
                             ShapeUtil::MakeShape(S32, {5}))},
         0);
-    auto result = builder.Add(inc, prev);
+    builder.Add(inc, prev);
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
   auto while_loop = [this, &body, build_condition](int count) {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     auto init = builder.ConstantR1<int32>({0, 0, 0, 0, 0, 0});
-    auto result = builder.While(build_condition(count), body, init);
-    auto shape = builder.GetShape(result).ConsumeValueOrDie();
+    builder.While(build_condition(count), body, init);
     return builder.Build();
   };
 
@@ -1107,9 +1105,9 @@ XLA_TEST_F(WhileTest, NestedWhileWithScalarResult) {
   auto inner_result_shape = ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShape(S32, {}), ShapeUtil::MakeShape(S32, {})});
 
-  Computation inner_condition;
+  XlaComputation inner_condition;
   {
-    ComputationBuilder builder(client_, "inner_condition");
+    XlaBuilder builder("inner_condition");
     auto params = builder.Parameter(0, inner_result_shape, "prev");
     auto i = builder.GetTupleElement(params, 0);
     builder.Lt(i, builder.ConstantR0<int32>(7));
@@ -1118,9 +1116,9 @@ XLA_TEST_F(WhileTest, NestedWhileWithScalarResult) {
 
   // Creates a computation for the outer loop condition:
   // repeat while result < 30.
-  Computation outer_condition;
+  XlaComputation outer_condition;
   {
-    ComputationBuilder builder(client_, "outer_condition");
+    XlaBuilder builder("outer_condition");
     auto prev = builder.Parameter(0, outer_result_shape, "prev");
     builder.Lt(prev, builder.ConstantR0<int32>(30));
     outer_condition = builder.Build().ConsumeValueOrDie();
@@ -1128,34 +1126,33 @@ XLA_TEST_F(WhileTest, NestedWhileWithScalarResult) {
 
   // Creates a computation for the inner loop body: add 1 to `i`, and add 2 to
   // `result`.
-  Computation inner_body;
+  XlaComputation inner_body;
   {
-    ComputationBuilder builder(client_, "inner_body");
+    XlaBuilder builder("inner_body");
     auto params = builder.Parameter(0, inner_result_shape, "prev");
     auto i = builder.GetTupleElement(params, 0);
     auto result = builder.GetTupleElement(params, 1);
     i = builder.Add(builder.ConstantR0<int32>(1), i);
     result = builder.Add(builder.ConstantR0<int32>(2), result);
-    auto output = builder.Tuple({i, result});
+    builder.Tuple({i, result});
     inner_body = builder.Build().ConsumeValueOrDie();
   }
 
   // Creates a computation for the outer loop: run the inner loop with i = 0.
-  Computation outer_body;
+  XlaComputation outer_body;
   {
-    ComputationBuilder builder(client_, "outer_body");
+    XlaBuilder builder("outer_body");
     auto prev = builder.Parameter(0, outer_result_shape, "prev");
     auto init = builder.Tuple({builder.ConstantR0<int32>(0), prev});
     auto result = builder.While(inner_condition, inner_body, init);
-    auto output = builder.GetTupleElement(result, 1);
+    builder.GetTupleElement(result, 1);
     outer_body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto init = builder.ConstantR0<int32>(0);
-  auto result = builder.While(outer_condition, outer_body, init);
-  auto shape = builder.GetShape(result).ConsumeValueOrDie();
+  builder.While(outer_condition, outer_body, init);
 
   ComputeAndCompareR0<int32>(&builder, 42, {});
 }
@@ -1170,18 +1167,18 @@ TEST_F(WhileTest, DISABLED_ON_INTERPRETER(WhileWithCallInsideCondition)) {
   auto result_shape = ShapeUtil::MakeShape(S32, {});
 
   // Create a computation for the condition: repeat for 5 iterations.
-  Computation condition_callee;
+  XlaComputation condition_callee;
   {
-    ComputationBuilder builder(client_, "condition_callee");
+    XlaBuilder builder("condition_callee");
     auto prev = builder.Parameter(0, result_shape, "prev");
     builder.Tuple({builder.Gt(builder.ConstantR0<int32>(5), prev)});
 
     condition_callee = builder.Build().ConsumeValueOrDie();
   }
 
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto result = builder.Call(condition_callee, {prev});
     builder.GetTupleElement(result, 0);
@@ -1189,20 +1186,19 @@ TEST_F(WhileTest, DISABLED_ON_INTERPRETER(WhileWithCallInsideCondition)) {
   }
 
   // Create a computation for the body: add 1 to the result variable.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto input = builder.ConstantR0<int32>(1);
-    auto result = builder.Add(input, prev);
+    builder.Add(input, prev);
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto init = builder.ConstantR0<int32>(0);
-  auto result = builder.While(condition, body, init);
-  auto shape = builder.GetShape(result).ConsumeValueOrDie();
+  builder.While(condition, body, init);
 
   ComputeAndCompareR0<int32>(&builder, 5, {});
 }
@@ -1214,28 +1210,28 @@ TEST_F(WhileTest, WhileWithLoopInvariantOperation) {
       {scalar_s32, matrix_shape, matrix_shape, matrix_shape});
 
   // Create a computation for the condition: repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto state = builder.Parameter(0, while_shape, "state");
     builder.Gt(builder.ConstantR0<int32>(5), builder.GetTupleElement(state, 0));
     TF_ASSERT_OK_AND_ASSIGN(condition, builder.Build());
   }
 
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto state = builder.Parameter(0, while_shape, "state");
     auto indvar = builder.GetTupleElement(state, 0);
     auto input_0 = builder.GetTupleElement(state, 1);
     auto input_1 = builder.GetTupleElement(state, 2);
     auto output = builder.Tanh(builder.Dot(input_0, input_1));
     auto indvar_next = builder.Add(indvar, builder.ConstantR0<int32>(1));
-    auto tuple_result = builder.Tuple({indvar_next, input_0, input_1, output});
+    builder.Tuple({indvar_next, input_0, input_1, output});
     TF_ASSERT_OK_AND_ASSIGN(body, builder.Build());
   }
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto matrix_input = builder.Parameter(0, matrix_shape, "matrix");
   auto init = builder.Tuple(
       {builder.ConstantR0<int32>(0), matrix_input, matrix_input, matrix_input});
@@ -1268,9 +1264,9 @@ void BM_WhileLoop(int num_iters) {
 
   // Create while condition computation with 'loop_limit'.
   const int32 loop_limit = 100;
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, loop_state_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Lt(iteration, builder.ConstantR0<int32>(loop_limit));
@@ -1278,9 +1274,9 @@ void BM_WhileLoop(int num_iters) {
   }
 
   // Create while body computation with unit loop increment.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, loop_state_shape, "prev");
     // TupleElement 0
     auto iteration = builder.GetTupleElement(prev, 0);
@@ -1294,12 +1290,12 @@ void BM_WhileLoop(int num_iters) {
     auto starts = builder.ConstantR1<int32>({0, 0, 0});
     // UpdateSlice.
     auto out1 = builder.DynamicUpdateSlice(input, update, starts);
-    auto result = builder.Tuple({out0, out1});
+    builder.Tuple({out0, out1});
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While instruction.
-  ComputationBuilder builder(client, "while");
+  XlaBuilder builder("while");
   auto zero = builder.ConstantR0<float>(0.0);
   auto input = builder.Broadcast(zero, {seq_len, 1024, 1024});
   auto init = builder.Tuple({builder.ConstantR0<int32>(0), input});
diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
index 9ad2a1985331b80625dd0687ea052300bc99e440..ff3418a128eed82b730a6602d6e3faba4ad7be32 100644
--- a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
+++ b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/regexp.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
@@ -144,7 +145,7 @@ void ExecuteAndFetchProfile(string* profile_output, LocalClient* client,
   TF_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<LocalExecutable> local_executable,
       client->Compile(computation, {&lhs_arg_shape, &rhs_arg_shape},
-                      ExecutableBuildOptions()));
+                      ExecutableBuildOptions().set_hlo_profile(true)));
 
   Executable* executable = local_executable->executable();
   HloExecutionProfile hlo_execution_profile(
@@ -294,7 +295,8 @@ XLA_TEST_F(HloProfileTest,
   auto while_body_profile_start =
       std::find_if(profile_output_lines.begin(), profile_output_lines.end(),
                    [](tensorflow::StringPiece s) {
-                     return s.starts_with("Execution profile for body");
+                     return tensorflow::str_util::StartsWith(
+                         s, "Execution profile for body");
                    });
 
   ASSERT_NE(while_body_profile_start, profile_output_lines.end());
diff --git a/tensorflow/compiler/xla/tests/xla_internal_test_main.cc b/tensorflow/compiler/xla/tests/xla_internal_test_main.cc
index 92b2b1ee778f8b0f8104e7d7ff27a5c11db59768..0af40bc15a41f7c4ef6382b1a94412afe5741a86 100644
--- a/tensorflow/compiler/xla/tests/xla_internal_test_main.cc
+++ b/tensorflow/compiler/xla/tests/xla_internal_test_main.cc
@@ -12,9 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
 
 GTEST_API_ int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
@@ -25,7 +28,37 @@ GTEST_API_ int main(int argc, char** argv) {
     return 2;
   }
 
+  // If the --benchmarks flag is passed in then only run the benchmarks, not the
+  // tests.
+  for (int i = 1; i < argc; i++) {
+    tensorflow::StringPiece arg(argv[i]);
+    if (arg == "--benchmarks" || arg.starts_with("--benchmarks=")) {
+      const char* pattern = nullptr;
+      if (arg.starts_with("--benchmarks=")) {
+        pattern = argv[i] + strlen("--benchmarks=");
+      } else {
+        // Handle flag of the form '--benchmarks foo' (no '=').
+        if (i + 1 >= argc ||
+            tensorflow::StringPiece(argv[i + 1]).starts_with("--")) {
+          LOG(ERROR) << "--benchmarks flag requires an argument.";
+          return 2;
+        }
+        pattern = argv[i + 1];
+      }
+      // Unfortunately Google's internal benchmark infrastructure has a
+      // different API than Tensorflow's.
+#if defined(PLATFORM_GOOGLE)
+      base::SetFlag(&FLAGS_benchmarks, pattern);
+      RunSpecifiedBenchmarks();
+#else
+      tensorflow::testing::Benchmark::Run(pattern);
+#endif
+      return 0;
+    }
+  }
+
   testing::InitGoogleTest(&argc, argv);
+
   if (argc > 1) {
     LOG(ERROR) << "Unknown argument " << argv[1] << "\n" << usage;
     return 2;
diff --git a/tensorflow/compiler/xla/text_literal_reader.cc b/tensorflow/compiler/xla/text_literal_reader.cc
index 6fa4c48e11d1102367b21bc21d4734466495ef0e..44f874cd2ae8e6f65dc282b8675f195ec9c09415 100644
--- a/tensorflow/compiler/xla/text_literal_reader.cc
+++ b/tensorflow/compiler/xla/text_literal_reader.cc
@@ -38,7 +38,7 @@ namespace xla {
 
 StatusOr<std::unique_ptr<Literal>> TextLiteralReader::ReadPath(
     tensorflow::StringPiece path) {
-  CHECK(!path.ends_with(".gz"))
+  CHECK(!tensorflow::str_util::EndsWith(path, ".gz"))
       << "TextLiteralReader no longer supports reading .gz files";
   std::unique_ptr<tensorflow::RandomAccessFile> file;
   Status s =
@@ -115,7 +115,7 @@ StatusOr<std::unique_ptr<Literal>> TextLiteralReader::ReadAllLines() {
     tensorflow::StringPiece value_string = pieces[1];
     tensorflow::str_util::RemoveWhitespaceContext(&coordinates_string);
     tensorflow::str_util::RemoveWhitespaceContext(&value_string);
-    if (!coordinates_string.Consume("(")) {
+    if (!tensorflow::str_util::ConsumePrefix(&coordinates_string, "(")) {
       return InvalidArgument(
           "expected '(' at the beginning of coordinates: \"%s\"", line.c_str());
     }
diff --git a/tensorflow/compiler/xla/tools/BUILD b/tensorflow/compiler/xla/tools/BUILD
index 091fa0c3ec807a66449eca0bfbb141285b8eb532..0bc4045a5490319994b6cf24daf99fe856167507 100644
--- a/tensorflow/compiler/xla/tools/BUILD
+++ b/tensorflow/compiler/xla/tools/BUILD
@@ -75,6 +75,7 @@ cc_library(
     name = "replay_computation_library",
     srcs = ["replay_computation.cc"],
     deps = [
+        "//tensorflow/compiler/xla:execution_options_util",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -222,17 +223,3 @@ tf_cc_binary(
         "//tensorflow/core:lib",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/xla/tools/parser/BUILD b/tensorflow/compiler/xla/tools/parser/BUILD
index 97aacf6b39f83978e732060817cd93ede81ca782..0fa4b98d0a41a1e7c681bb2302da3b752315867b 100644
--- a/tensorflow/compiler/xla/tools/parser/BUILD
+++ b/tensorflow/compiler/xla/tools/parser/BUILD
@@ -70,17 +70,3 @@ tf_cc_test(
         "//tensorflow/core:test_main",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
index 863081d654390440aa6506bab4576b3cc5c1cbd1..adc8b1d620eb65fdca19072831360b71847abf9e 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <string>
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace xla {
@@ -894,7 +895,7 @@ class HloParserTest : public ::testing::Test,
                       public ::testing::WithParamInterface<TestData> {
  protected:
   static void ExpectHasSubstr(StringPiece s, StringPiece expected) {
-    EXPECT_TRUE(StringPiece(s).contains(expected))
+    EXPECT_TRUE(tensorflow::str_util::StrContains(s, expected))
         << "'" << s << "' does not contain '" << expected << "'";
   }
 
diff --git a/tensorflow/compiler/xla/tools/replay_computation.cc b/tensorflow/compiler/xla/tools/replay_computation.cc
index eda5effbb92db92c9317a956497a00c0ec15c27c..62a353ad09af009e4abf47664a5c5f7bd70a049e 100644
--- a/tensorflow/compiler/xla/tools/replay_computation.cc
+++ b/tensorflow/compiler/xla/tools/replay_computation.cc
@@ -40,6 +40,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/lib/testing.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/execution_options_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/session.pb.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -66,6 +67,7 @@ struct Options {
   bool use_fake_data = false;
   bool print_result = true;
   int num_runs = 1;
+  bool xla_hlo_profile_last_run = false;
 };
 
 // Invokes the given computation passing arbitrary data for every (unbound)
@@ -122,16 +124,21 @@ StatusOr<std::unique_ptr<Literal>> ReplayComputation(
   std::unique_ptr<Literal> result;
   for (int i = 0; i < opts.num_runs; ++i) {
     ExecutionProfile profile;
+    ExecutionOptions execution_options = CreateDefaultExecutionOptions();
+    if (opts.xla_hlo_profile_last_run && i == opts.num_runs - 1) {
+      execution_options.mutable_debug_options()->set_xla_hlo_profile(true);
+    }
+
     if (opts.print_result) {
-      TF_ASSIGN_OR_RETURN(result, client->ExecuteAndTransfer(
-                                      computation, execute_arguments,
-                                      /*execution_options=*/nullptr, &profile));
+      TF_ASSIGN_OR_RETURN(
+          result, client->ExecuteAndTransfer(computation, execute_arguments,
+                                             &execution_options, &profile));
     } else {
       // If we're not printing the result, execute the computation but don't
       // bother retrieving the result.  This can be a significant speedup.
       TF_RETURN_IF_ERROR(client
                              ->Execute(computation, execute_arguments,
-                                       /*execution_options=*/nullptr, &profile)
+                                       &execution_options, &profile)
                              .status());
     }
     LOG(INFO) << "Execution took "
@@ -191,6 +198,9 @@ int main(int argc, char** argv) {
                        "Number of times to run each computation"),
       tensorflow::Flag("fake_infeed_shape", &opts.fake_infeed_shape,
                        "Shape of fake data to construct for (infinite) infeed"),
+      tensorflow::Flag(
+          "xla_hlo_profile_last_run", &opts.xla_hlo_profile_last_run,
+          "Pass --xla_hlo_profile the last time we run the computation."),
   };
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   bool parse_ok = tensorflow::Flags::Parse(&argc, argv, flag_list);
diff --git a/tensorflow/compiler/xla/util.cc b/tensorflow/compiler/xla/util.cc
index dc4f7a1cb436183f5acfa360fb092795258b6a75..e43498e381b8e63543e2ddda08ca7c0df91817e4 100644
--- a/tensorflow/compiler/xla/util.cc
+++ b/tensorflow/compiler/xla/util.cc
@@ -243,8 +243,8 @@ string HumanReadableNumOps(double flops, double nanoseconds,
       static_cast<int64>(nano_flops * 1e9));
   tensorflow::StringPiece sp(throughput);
   // Use the more common "G(FLOPS)", rather than "B(FLOPS)"
-  if (sp.ends_with("B") ||  // Ends in 'B', ignoring case
-      sp.ends_with("b")) {
+  if (tensorflow::str_util::EndsWith(sp, "B") ||  // Ends in 'B', ignoring case
+      tensorflow::str_util::EndsWith(sp, "b")) {
     *throughput.rbegin() = 'G';
   }
   throughput += tensorflow::strings::StrCat(op_prefix, "OP/s");
diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h
index ff99d3728d1c3b58fc94d3eb3de78be23407edc9..2da9f9ed6f40fcf5b2512f974519df0b355da10f 100644
--- a/tensorflow/compiler/xla/util.h
+++ b/tensorflow/compiler/xla/util.h
@@ -519,6 +519,15 @@ int64 FindIndex(const C& c, Value&& value) {
   auto it = c_find(c, std::forward<Value>(value));
   return std::distance(c.begin(), it);
 }
+
+// Returns true if `x` fits in 32-bits.
+template <typename T>
+bool IsInt32(T x) {
+  // Following conversion rules: "the value is unchanged if it can be
+  // represented in the destination type (and bit-field width); otherwise, the
+  // value is implementation-defined."
+  return static_cast<int32>(x) == x;
+}
 }  // namespace xla
 
 #define XLA_LOG_LINES(SEV, STRING) \
diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto
index edf1b07af82b5d43fe67c6efdabdb0a9b4b1edea..5cb18113e5ba9c49809c4410d56ca7bb5a50dae5 100644
--- a/tensorflow/compiler/xla/xla.proto
+++ b/tensorflow/compiler/xla/xla.proto
@@ -299,6 +299,11 @@ message ComputationStatsRequest {
   DebugOptions debug_options = 2;
 }
 
+message ComputationGraphStatsRequest {
+  HloModuleProto computation = 1;
+  DebugOptions debug_options = 2;
+}
+
 message ComputationStatsResponse {
   ComputationStats stats = 1;
 }
@@ -355,6 +360,10 @@ message ExecuteParallelRequest {
   repeated ExecuteRequest requests = 1;
 }
 
+message ExecuteGraphParallelRequest {
+  repeated ExecuteGraphRequest requests = 1;
+}
+
 message ExecuteResponse {
   GlobalDataHandle output = 1;
   ExecutionProfile profile = 2;
diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index c2663c5e83352a1088166dc7581a0346c7b104a4..bf69144ad83c9b5f9a51d4c9e6fbfe61b5f16fb2 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -34,6 +34,7 @@ py_library(
         "//tensorflow/contrib/crf:crf_py",
         "//tensorflow/contrib/cudnn_rnn:cudnn_rnn_py",
         "//tensorflow/contrib/data",
+        "//tensorflow/contrib/distribute:distribute",
         "//tensorflow/contrib/deprecated:deprecated_py",
         "//tensorflow/contrib/distributions:distributions_py",
         "//tensorflow/contrib/eager/python:tfe",
@@ -73,11 +74,12 @@ py_library(
         "//tensorflow/contrib/nearest_neighbor:nearest_neighbor_py",
         "//tensorflow/contrib/nn:nn_py",
         "//tensorflow/contrib/opt:opt_py",
+        "//tensorflow/contrib/optimizer_v2:optimizer_v2_py",
         "//tensorflow/contrib/periodic_resample:init_py",
         "//tensorflow/contrib/predictor",
         "//tensorflow/contrib/quantization:quantization_py",
         "//tensorflow/contrib/quantize:quantize_graph",
-        "//tensorflow/contrib/py2tf",
+        "//tensorflow/contrib/autograph",
         "//tensorflow/contrib/receptive_field:receptive_field_py",
         "//tensorflow/contrib/reduce_slice_ops:reduce_slice_ops_py",
         "//tensorflow/contrib/remote_fused_graph/pylib:remote_fused_graph_ops_py",
@@ -108,10 +110,15 @@ py_library(
         "//tensorflow/python:util",
     ] + if_mpi(["//tensorflow/contrib/mpi_collectives:mpi_collectives_py"]) + if_tensorrt([
         "//tensorflow/contrib/tensorrt:init_py",
-    ]) + if_not_windows([
-        "//tensorflow/contrib/ffmpeg:ffmpeg_ops_py",  # unix dependency, need to fix code
+    ]) + select({
+        "//tensorflow:with_kafka_support_windows_override": [],
+        "//tensorflow:with_kafka_support": [
+            "//tensorflow/contrib/kafka",
+        ],
+        "//conditions:default": [],
+    }) + if_not_windows([
+        "//tensorflow/contrib/ffmpeg:ffmpeg_ops_py",
         "//tensorflow/contrib/lite/python:lite",  # unix dependency, need to fix code
-        "//tensorflow/contrib/kafka",  # has some linking issue on opensssl.
     ]),
 )
 
@@ -121,9 +128,7 @@ cc_library(
     deps = [
         "//tensorflow/contrib/boosted_trees:boosted_trees_kernels",
         "//tensorflow/contrib/coder:all_kernels",
-        "//tensorflow/contrib/cudnn_rnn:cudnn_rnn_kernels",
         "//tensorflow/contrib/data/kernels:dataset_kernels",
-        "//tensorflow/contrib/kafka:dataset_kernels",
         "//tensorflow/contrib/factorization/kernels:all_kernels",
         "//tensorflow/contrib/input_pipeline:input_pipeline_ops_kernels",
         "//tensorflow/contrib/layers:sparse_feature_cross_op_kernel",
@@ -136,7 +141,13 @@ cc_library(
         "//tensorflow/contrib/text:all_kernels",
     ] + if_mpi(["//tensorflow/contrib/mpi_collectives:mpi_collectives_py"]) + if_cuda([
         "//tensorflow/contrib/nccl:nccl_kernels",
-    ]),
+    ]) + select({
+        "//tensorflow:with_kafka_support_windows_override": [],
+        "//tensorflow:with_kafka_support": [
+            "//tensorflow/contrib/kafka:dataset_kernels",
+        ],
+        "//conditions:default": [],
+    }),
 )
 
 cc_library(
@@ -145,12 +156,10 @@ cc_library(
     deps = [
         "//tensorflow/contrib/boosted_trees:boosted_trees_ops_op_lib",
         "//tensorflow/contrib/coder:all_ops",
-        "//tensorflow/contrib/cudnn_rnn:cudnn_rnn_ops_op_lib",
         "//tensorflow/contrib/data:dataset_ops_op_lib",
         "//tensorflow/contrib/factorization:all_ops",
         "//tensorflow/contrib/framework:all_ops",
         "//tensorflow/contrib/input_pipeline:input_pipeline_ops_op_lib",
-        "//tensorflow/contrib/kafka:dataset_ops_op_lib",
         "//tensorflow/contrib/layers:sparse_feature_cross_op_op_lib",
         "//tensorflow/contrib/nccl:nccl_ops_op_lib",
         "//tensorflow/contrib/nearest_neighbor:nearest_neighbor_ops_op_lib",
@@ -161,17 +170,11 @@ cc_library(
         "//tensorflow/contrib/tensor_forest:tensor_forest_ops_op_lib",
         "//tensorflow/contrib/text:all_ops",
         "//tensorflow/contrib/tpu:all_ops",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
+    ] + select({
+        "//tensorflow:with_kafka_support_windows_override": [],
+        "//tensorflow:with_kafka_support": [
+            "//tensorflow/contrib/kafka:dataset_ops_op_lib",
         ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
+        "//conditions:default": [],
+    }),
 )
diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py
index 669d611b01b585d91ab48921b7ba17703dd6bc98..1c5b00f92eace598dea5f035e4954b4b2de8da0e 100644
--- a/tensorflow/contrib/__init__.py
+++ b/tensorflow/contrib/__init__.py
@@ -1,3 +1,4 @@
+# pylint: disable=g-import-not-at-top
 # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -32,6 +33,7 @@ from tensorflow.contrib import crf
 from tensorflow.contrib import cudnn_rnn
 from tensorflow.contrib import data
 from tensorflow.contrib import deprecated
+from tensorflow.contrib import distribute
 from tensorflow.contrib import distributions
 from tensorflow.contrib import estimator
 from tensorflow.contrib import factorization
@@ -85,8 +87,9 @@ from tensorflow.contrib import tpu
 from tensorflow.contrib import training
 from tensorflow.contrib import util
 from tensorflow.contrib.eager.python import tfe as eager
-if os.name != 'nt':
+if os.name != "nt":
   from tensorflow.contrib.lite.python import lite
+from tensorflow.contrib.optimizer_v2 import optimizer_v2_symbols as optimizer_v2
 from tensorflow.contrib.receptive_field import receptive_field_api as receptive_field
 from tensorflow.contrib.remote_fused_graph import pylib as remote_fused_graph
 from tensorflow.contrib.specs import python as specs
diff --git a/tensorflow/contrib/all_reduce/BUILD b/tensorflow/contrib/all_reduce/BUILD
index 8dff93b4f825277dcf0a64aa3b96bd809d36e1e9..62d1b1cf079d04d50e4899cfd9ba1d405ee1efb9 100644
--- a/tensorflow/contrib/all_reduce/BUILD
+++ b/tensorflow/contrib/all_reduce/BUILD
@@ -45,16 +45,3 @@ tf_py_test(
         "//tensorflow/python:state_ops",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "g3doc/sitemap.md",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/all_reduce/python/all_reduce.py b/tensorflow/contrib/all_reduce/python/all_reduce.py
index 6658f0d9c13f6db17b25354cde2593d57f104f17..8add2aacff1d64f1617cd24167c4c6c6706044da 100644
--- a/tensorflow/contrib/all_reduce/python/all_reduce.py
+++ b/tensorflow/contrib/all_reduce/python/all_reduce.py
@@ -38,16 +38,15 @@ def _flatten_tensors(tensors):
     shape: the original shape of each element of input tensors
 
   Raises:
-    ValueError: tensors are empty or non-isomorphic.
+    ValueError: tensors are empty or non-isomorphic or have unknown shape.
   """
   if not tensors:
     raise ValueError("tensors cannot be empty")
   shape = tensors[0].shape
   for tensor in tensors:
     shape = shape.merge_with(tensor.shape)
-  if shape.ndims is None:
-    raise ValueError("At least one of the tensors in 'tensors' must have "
-                     "statically known rank.")
+  if not shape.is_fully_defined():
+    raise ValueError("Tensors must have statically known shape.")
   if len(shape) != 1:
     reshaped = []
     for t in tensors:
diff --git a/tensorflow/contrib/all_reduce/python/all_reduce_test.py b/tensorflow/contrib/all_reduce/python/all_reduce_test.py
index 47bab0a3670a90644972b2c961954a3036b8ecba..b3f5d92259df8475b205110dd3f0cee1cb5bde6f 100644
--- a/tensorflow/contrib/all_reduce/python/all_reduce_test.py
+++ b/tensorflow/contrib/all_reduce/python/all_reduce_test.py
@@ -36,6 +36,12 @@ from tensorflow.python.platform import tf_logging
 
 class AllReduceTest(test_util.TensorFlowTestCase):
 
+  def testFlattenTensorsShapesDefined(self):
+    x = array_ops.placeholder(types_pb2.DT_FLOAT, [None])
+    with self.assertRaisesRegexp(ValueError,
+                                 "must have statically known shape"):
+      ar._flatten_tensors([x, x])
+
   def testRingPermutations(self):
     # 0 devices
     pred_by_c_d, rank_by_c_d = ar._ring_permutations(1, 0, [])
diff --git a/tensorflow/contrib/android/BUILD b/tensorflow/contrib/android/BUILD
index 4bff3c27d22c4550747a651a59909bdef80e8285..60306ebdc6cddb04e8807bfd495fa92a56e55ecd 100644
--- a/tensorflow/contrib/android/BUILD
+++ b/tensorflow/contrib/android/BUILD
@@ -38,20 +38,6 @@ cc_library(
     alwayslink = 1,
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "bin/**",
-            "gen/**",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 # JAR with Java bindings to TF.
 android_library(
     name = "android_tensorflow_inference_java",
diff --git a/tensorflow/contrib/android/asset_manager_filesystem.cc b/tensorflow/contrib/android/asset_manager_filesystem.cc
index 380a652435ad089f46f3ca80e4fd43097fd96e10..513d519eabbd54f46fde9ec0f004247c02277732 100644
--- a/tensorflow/contrib/android/asset_manager_filesystem.cc
+++ b/tensorflow/contrib/android/asset_manager_filesystem.cc
@@ -19,6 +19,7 @@ limitations under the License.
 
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/file_system_helper.h"
 
 namespace tensorflow {
 namespace {
@@ -228,9 +229,8 @@ string AssetManagerFileSystem::NormalizeDirectoryPath(const string& fname) {
 }
 
 string AssetManagerFileSystem::RemoveAssetPrefix(const string& name) {
-  string output(name);
-  StringPiece piece(output);
-  piece.Consume(prefix_);
+  StringPiece piece(name);
+  str_util::ConsumePrefix(&piece, prefix_);
   return piece.ToString();
 }
 
@@ -243,6 +243,11 @@ bool AssetManagerFileSystem::DirectoryExists(const std::string& fname) {
   return AAssetDir_getNextFileName(dir.get()) != NULL;
 }
 
+Status AssetManagerFileSystem::GetMatchingPaths(const string& pattern,
+                                                std::vector<string>* results) {
+  return internal::GetMatchingPaths(this, Env::Default(), pattern, results);
+}
+
 Status AssetManagerFileSystem::NewWritableFile(
     const string& fname, std::unique_ptr<WritableFile>* result) {
   return errors::Unimplemented("Asset storage is read only.");
diff --git a/tensorflow/contrib/android/asset_manager_filesystem.h b/tensorflow/contrib/android/asset_manager_filesystem.h
index 665304b5eef1f8a3633c8c522259e20d744b1808..a87ff42ae217c429ecf5d2458b88b3431551ad97 100644
--- a/tensorflow/contrib/android/asset_manager_filesystem.h
+++ b/tensorflow/contrib/android/asset_manager_filesystem.h
@@ -66,6 +66,9 @@ class AssetManagerFileSystem : public FileSystem {
   Status DeleteDir(const string& d) override;
   Status RenameFile(const string& s, const string& t) override;
 
+  Status GetMatchingPaths(const string& pattern,
+                          std::vector<string>* results) override;
+
  private:
   string RemoveAssetPrefix(const string& name);
 
diff --git a/tensorflow/contrib/android/cmake/CMakeLists.txt b/tensorflow/contrib/android/cmake/CMakeLists.txt
index a115d1610e2334a6626f29674f3dd195e3a3c648..ecf1a103d2981f409a4598d762fb26100217f779 100644
--- a/tensorflow/contrib/android/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/android/cmake/CMakeLists.txt
@@ -75,7 +75,6 @@ target_link_libraries(tensorflow_inference
 include_directories(
     ${PREBUILT_DIR}/proto
     ${PREBUILT_DIR}/protobuf/include
-    ${PREBUILT_DIR}/nsync/public
     ${TENSORFLOW_ROOT_DIR}/tensorflow/contrib/makefile/downloads/eigen
     ${TENSORFLOW_ROOT_DIR}
     ${CMAKE_CURRENT_SOURCE_DIR}/..)
diff --git a/tensorflow/contrib/py2tf/BUILD b/tensorflow/contrib/autograph/BUILD
similarity index 75%
rename from tensorflow/contrib/py2tf/BUILD
rename to tensorflow/contrib/autograph/BUILD
index d91220f6ddb859ff52d4e5853948cb667981009b..30dd846893c30b9205972bd5216cc1871ab03d76 100644
--- a/tensorflow/contrib/py2tf/BUILD
+++ b/tensorflow/contrib/autograph/BUILD
@@ -15,16 +15,16 @@ filegroup(
 )
 
 py_library(
-    name = "py2tf",
+    name = "autograph",
     srcs = [
         "__init__.py",
     ],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/contrib/py2tf/impl",
-        "//tensorflow/contrib/py2tf/pyct",
-        "//tensorflow/contrib/py2tf/utils",
+        "//tensorflow/contrib/autograph/impl",
+        "//tensorflow/contrib/autograph/pyct",
+        "//tensorflow/contrib/autograph/utils",
         "@gast_archive//:gast",
         "@six_archive//:six",
     ],
diff --git a/tensorflow/contrib/py2tf/README.md b/tensorflow/contrib/autograph/README.md
similarity index 87%
rename from tensorflow/contrib/py2tf/README.md
rename to tensorflow/contrib/autograph/README.md
index cd50675ad57316b9c749c137e6acd30b91c10073..7e84f237dc9a83098f142a54c48cf5b6ba35aaaa 100644
--- a/tensorflow/contrib/py2tf/README.md
+++ b/tensorflow/contrib/autograph/README.md
@@ -1,4 +1,4 @@
-# Py2TF
+# Autograph
 
 A compiler for generating TensorFlow numeric and control flow ops from Python
 code.
diff --git a/tensorflow/contrib/py2tf/__init__.py b/tensorflow/contrib/autograph/__init__.py
similarity index 59%
rename from tensorflow/contrib/py2tf/__init__.py
rename to tensorflow/contrib/autograph/__init__.py
index 6531183cb59af774299eb767cce111d2ec6f32b4..a39f44b21aa0ddf683b30c18bbe15a43262f7db2 100644
--- a/tensorflow/contrib/py2tf/__init__.py
+++ b/tensorflow/contrib/autograph/__init__.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Py2TF compiles Python code into equivalent TensorFlow code.
+"""Autograph compiles Python code into equivalent TensorFlow code.
 
 Equivalent here means that they have the same effect when executed.
 """
@@ -21,18 +21,19 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf import utils
-from tensorflow.contrib.py2tf.impl.api import convert
-from tensorflow.contrib.py2tf.impl.api import converted_call
-from tensorflow.contrib.py2tf.impl.api import graph_ready
-from tensorflow.contrib.py2tf.impl.api import to_code
-from tensorflow.contrib.py2tf.impl.api import to_graph
-from tensorflow.contrib.py2tf.pyct.transformer import PyFlowParseError
+from tensorflow.contrib.autograph import utils
+from tensorflow.contrib.autograph.impl.api import convert
+from tensorflow.contrib.autograph.impl.api import converted_call
+from tensorflow.contrib.autograph.impl.api import do_not_convert
+from tensorflow.contrib.autograph.impl.api import RunMode
+from tensorflow.contrib.autograph.impl.api import to_code
+from tensorflow.contrib.autograph.impl.api import to_graph
+from tensorflow.contrib.autograph.pyct.transformer import AutographParseError
 from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = [
-    'to_graph', 'to_code', 'convert', 'graph_ready', 'converted_call', 'utils',
-    'PyFlowParseError'
+    'utils', 'convert', 'converted_call', 'do_not_convert', 'RunMode',
+    'to_code', 'to_graph', 'AutographParseError'
 ]
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/py2tf/converters/BUILD b/tensorflow/contrib/autograph/converters/BUILD
similarity index 90%
rename from tensorflow/contrib/py2tf/converters/BUILD
rename to tensorflow/contrib/autograph/converters/BUILD
index 4bb6f76019739fc3b5bf4bf52e302a698693db5a..c5a0dc10959ccb64e090292794bcd0b4fd2dbbd2 100644
--- a/tensorflow/contrib/py2tf/converters/BUILD
+++ b/tensorflow/contrib/autograph/converters/BUILD
@@ -49,9 +49,9 @@ py_library(
     visibility = ["//tensorflow:__subpackages__"],
     deps = [
         ":converters",
-        "//tensorflow/contrib/py2tf/pyct",
-        "//tensorflow/contrib/py2tf/pyct/static_analysis",
-        "//tensorflow/contrib/py2tf/utils",
+        "//tensorflow/contrib/autograph/pyct",
+        "//tensorflow/contrib/autograph/pyct/static_analysis",
+        "//tensorflow/contrib/autograph/utils",
         "@gast_archive//:gast",
         "@six_archive//:six",
     ],
@@ -61,6 +61,7 @@ py_test(
     name = "asserts_test",
     srcs = ["asserts_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":test_lib",
         "//tensorflow/python:client_testlib",
@@ -81,7 +82,7 @@ py_test(
     name = "builtin_functions_test",
     srcs = ["builtin_functions_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
+    tags = ["no_windows"],
     deps = [
         ":test_lib",
         "//tensorflow/python:client_testlib",
@@ -90,12 +91,13 @@ py_test(
 
 py_test(
     name = "call_trees_test",
+    size = "large",
     srcs = ["call_trees_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
+    tags = ["no_windows"],
     deps = [
         ":test_lib",
-        "//tensorflow/contrib/py2tf/impl",
+        "//tensorflow/contrib/autograph/impl",
         "//tensorflow/python:client_testlib",
     ],
 )
@@ -145,7 +147,7 @@ py_test(
     srcs = ["name_scopes_test.py"],
     deps = [
         ":test_lib",
-        "//tensorflow/contrib/py2tf/pyct",
+        "//tensorflow/contrib/autograph/pyct",
         "//tensorflow/python:client_testlib",
     ],
 )
@@ -201,7 +203,7 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
-        "//tensorflow/contrib/py2tf/pyct",
+        "//tensorflow/contrib/autograph/pyct",
         "//tensorflow/python:client_testlib",
     ],
 )
@@ -212,7 +214,7 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
-        "//tensorflow/contrib/py2tf/pyct",
+        "//tensorflow/contrib/autograph/pyct",
         "//tensorflow/python:client_testlib",
     ],
 )
diff --git a/tensorflow/contrib/py2tf/converters/__init__.py b/tensorflow/contrib/autograph/converters/__init__.py
similarity index 95%
rename from tensorflow/contrib/py2tf/converters/__init__.py
rename to tensorflow/contrib/autograph/converters/__init__.py
index ca10896ee5c6c23d9b20ff23add9945de68e5bf9..e4e8eda42f655e204310eaa9defdd5c90bf06e15 100644
--- a/tensorflow/contrib/py2tf/converters/__init__.py
+++ b/tensorflow/contrib/autograph/converters/__init__.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Code converters used by Py2TF."""
+"""Code converters used by Autograph."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/py2tf/converters/asserts.py b/tensorflow/contrib/autograph/converters/asserts.py
similarity index 93%
rename from tensorflow/contrib/py2tf/converters/asserts.py
rename to tensorflow/contrib/autograph/converters/asserts.py
index 5b9b8e772bed82df2429fd6cb94dbf7b565e22b3..f011a97ade94f2979486ef6329673a0160dd9bac 100644
--- a/tensorflow/contrib/py2tf/converters/asserts.py
+++ b/tensorflow/contrib/autograph/converters/asserts.py
@@ -20,8 +20,8 @@ from __future__ import print_function
 
 import gast
 
-from tensorflow.contrib.py2tf.pyct import templates
-from tensorflow.contrib.py2tf.pyct import transformer
+from tensorflow.contrib.autograph.pyct import templates
+from tensorflow.contrib.autograph.pyct import transformer
 
 
 class AssertsTransformer(transformer.Base):
diff --git a/tensorflow/contrib/py2tf/converters/asserts_test.py b/tensorflow/contrib/autograph/converters/asserts_test.py
similarity index 90%
rename from tensorflow/contrib/py2tf/converters/asserts_test.py
rename to tensorflow/contrib/autograph/converters/asserts_test.py
index 6611f2777a93a7e819c8becfa06a09b27f4e6aaf..cc913febe8d0f411588af69b87ec52ce58f4469c 100644
--- a/tensorflow/contrib/py2tf/converters/asserts_test.py
+++ b/tensorflow/contrib/autograph/converters/asserts_test.py
@@ -20,8 +20,8 @@ from __future__ import print_function
 
 import gast
 
-from tensorflow.contrib.py2tf.converters import asserts
-from tensorflow.contrib.py2tf.converters import converter_test_base
+from tensorflow.contrib.autograph.converters import asserts
+from tensorflow.contrib.autograph.converters import converter_test_base
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/contrib/py2tf/converters/break_statements.py b/tensorflow/contrib/autograph/converters/break_statements.py
similarity index 92%
rename from tensorflow/contrib/py2tf/converters/break_statements.py
rename to tensorflow/contrib/autograph/converters/break_statements.py
index bfb709c5e32c6f19dc0fd109df61ece925d701a3..48026bccab5ff3474e9d54e365dad4a589b931fc 100644
--- a/tensorflow/contrib/py2tf/converters/break_statements.py
+++ b/tensorflow/contrib/autograph/converters/break_statements.py
@@ -20,14 +20,14 @@ from __future__ import print_function
 
 import gast
 
-from tensorflow.contrib.py2tf.pyct import anno
-from tensorflow.contrib.py2tf.pyct import templates
-from tensorflow.contrib.py2tf.pyct import transformer
-from tensorflow.contrib.py2tf.pyct.static_analysis.annos import NodeAnno
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import templates
+from tensorflow.contrib.autograph.pyct import transformer
+from tensorflow.contrib.autograph.pyct.static_analysis.annos import NodeAnno
 
 
 class BreakCanonicalizationTransformer(transformer.Base):
-  """Canonicalizes continue statements into additional conditionals."""
+  """Canonicalizes break statements into additional conditionals."""
 
   def __init__(self, context):
     super(BreakCanonicalizationTransformer, self).__init__(context)
diff --git a/tensorflow/contrib/py2tf/converters/break_statements_test.py b/tensorflow/contrib/autograph/converters/break_statements_test.py
similarity index 95%
rename from tensorflow/contrib/py2tf/converters/break_statements_test.py
rename to tensorflow/contrib/autograph/converters/break_statements_test.py
index 095fcdff07d44ecc6b9bb7f8d3e2c7c43df72a02..dd4914a022f57b3bb4a19ec132f311f12269fa9e 100644
--- a/tensorflow/contrib/py2tf/converters/break_statements_test.py
+++ b/tensorflow/contrib/autograph/converters/break_statements_test.py
@@ -18,8 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.converters import break_statements
-from tensorflow.contrib.py2tf.converters import converter_test_base
+from tensorflow.contrib.autograph.converters import break_statements
+from tensorflow.contrib.autograph.converters import converter_test_base
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/contrib/py2tf/converters/builtin_functions.py b/tensorflow/contrib/autograph/converters/builtin_functions.py
similarity index 92%
rename from tensorflow/contrib/py2tf/converters/builtin_functions.py
rename to tensorflow/contrib/autograph/converters/builtin_functions.py
index f1129ef153e6be6cbcbbf4bab63c4fe32ec77147..0349ce29ceb097fbebc36a0378b9072750772416 100644
--- a/tensorflow/contrib/py2tf/converters/builtin_functions.py
+++ b/tensorflow/contrib/autograph/converters/builtin_functions.py
@@ -20,8 +20,8 @@ from __future__ import print_function
 
 import gast
 
-from tensorflow.contrib.py2tf.pyct import templates
-from tensorflow.contrib.py2tf.pyct import transformer
+from tensorflow.contrib.autograph.pyct import templates
+from tensorflow.contrib.autograph.pyct import transformer
 
 
 class BuiltinFunctionTransformer(transformer.Base):
@@ -38,13 +38,13 @@ class BuiltinFunctionTransformer(transformer.Base):
 
   def _convert_builtin(self, node):
     template = """
-      py2tf_utils.dynamic_builtin(func, args)
+      autograph_utils.dynamic_builtin(func, args)
     """
     return templates.replace(template, func=node.func, args=node.args)[0].value
 
   def _convert_print(self, node):
     template = """
-      py2tf_utils.dynamic_print(args)
+      autograph_utils.dynamic_print(args)
     """
     return templates.replace(template, args=node.args)[0].value
 
diff --git a/tensorflow/contrib/py2tf/converters/builtin_functions_test.py b/tensorflow/contrib/autograph/converters/builtin_functions_test.py
similarity index 96%
rename from tensorflow/contrib/py2tf/converters/builtin_functions_test.py
rename to tensorflow/contrib/autograph/converters/builtin_functions_test.py
index eb60a1d8ae2b56907df8f3ffafe7604883cfc2a9..ac7e756c47c31816ad34a7ea6926917712afa6c3 100644
--- a/tensorflow/contrib/py2tf/converters/builtin_functions_test.py
+++ b/tensorflow/contrib/autograph/converters/builtin_functions_test.py
@@ -22,8 +22,8 @@ import sys
 
 import six
 
-from tensorflow.contrib.py2tf.converters import builtin_functions
-from tensorflow.contrib.py2tf.converters import converter_test_base
+from tensorflow.contrib.autograph.converters import builtin_functions
+from tensorflow.contrib.autograph.converters import converter_test_base
 from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import logging_ops
diff --git a/tensorflow/contrib/py2tf/converters/call_trees.py b/tensorflow/contrib/autograph/converters/call_trees.py
similarity index 82%
rename from tensorflow/contrib/py2tf/converters/call_trees.py
rename to tensorflow/contrib/autograph/converters/call_trees.py
index ca8726f9160d106ebd82e01e399e65fb77b02aab..61f6bfd7e733fc3e2e0bea35a955509c39d57bc9 100644
--- a/tensorflow/contrib/py2tf/converters/call_trees.py
+++ b/tensorflow/contrib/autograph/converters/call_trees.py
@@ -22,18 +22,30 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from collections import namedtuple
 import types
 
 import gast
 
-from tensorflow.contrib.py2tf.pyct import anno
-from tensorflow.contrib.py2tf.pyct import inspect_utils
-from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.contrib.py2tf.pyct import templates
-from tensorflow.contrib.py2tf.pyct import transformer
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import ast_util
+from tensorflow.contrib.autograph.pyct import inspect_utils
+from tensorflow.contrib.autograph.pyct import parser
+from tensorflow.contrib.autograph.pyct import templates
+from tensorflow.contrib.autograph.pyct import transformer
 from tensorflow.python.util import tf_inspect
 
 
+class FunctionInfo(namedtuple('FunctionInfo', ('dtype',))):
+  pass
+
+
+# TODO(mdan): Move this to config.py.
+KNOWN_NUMPY_FUNCTIONS = {
+    ('numpy', 'random', 'binomial'): FunctionInfo(dtype='tf.int64'),
+}
+
+
 class FunctionNamer(object):
   """Describes the interface for CallTreeTransformer's namer."""
 
@@ -106,6 +118,12 @@ class CallTreeTransformer(transformer.Base):
 
   def _should_compile(self, node, fqn):
     """Determines whether an entity should be compiled in the context."""
+    # TODO(mdan): Needs cleanup. We should remove the use of fqn altogether.
+    module_name = fqn[0]
+    for mod in self.uncompiled_modules:
+      if module_name.startswith(mod[0] + '.'):
+        return False
+
     for i in range(1, len(fqn)):
       if fqn[:i] in self.uncompiled_modules:
         return False
@@ -179,11 +197,27 @@ class CallTreeTransformer(transformer.Base):
     return node
 
   def _wrap_to_py_func_no_return(self, node):
-    # TODO(mdan): Properly handle varargs, kwargs, etc.
+    # TODO(mdan): Properly handle varargs, etc.
+    template = """
+      autograph_utils.wrap_py_func(func, None, (args,), kwargs, True)
+    """
+    return templates.replace(
+        template,
+        func=node.func,
+        args=node.args,
+        kwargs=ast_util.keywords_to_dict(node.keywords))
+
+  def _wrap_to_py_func_single_return(self, node, dtype):
+    # TODO(mdan): Properly handle varargs, etc.
     template = """
-      py2tf_utils.wrap_py_func(func, None, (original_args,), True)
+      autograph_utils.wrap_py_func(func, dtype, (args,), kwargs, False)
     """
-    return templates.replace(template, func=node.func, original_args=node.args)
+    return templates.replace_as_expression(
+        template,
+        func=node.func,
+        dtype=parser.parse_expression(dtype),
+        args=node.args,
+        kwargs=ast_util.keywords_to_dict(node.keywords))
 
   def _insert_dynamic_conversion(self, node):
     """Inlines a dynamic conversion for a dynamic function."""
@@ -204,10 +238,9 @@ class CallTreeTransformer(transformer.Base):
     # Before we could convert all the time though, we'd need a reasonable
     # caching mechanism.
     template = """
-      py2tf_api.converted_call(func, True, False, {}, original_args)
+      autograph_api.converted_call(func, True, False, {}, args)
     """
-    call_expr = templates.replace(
-        template, func=node.func, original_args=node.args)
+    call_expr = templates.replace(template, func=node.func, args=node.args)
     new_call = call_expr[0].value
     # TODO(mdan): Improve the template mechanism to better support this.
     new_call.keywords = node.keywords
@@ -248,10 +281,19 @@ class CallTreeTransformer(transformer.Base):
     self.generic_visit(node)
     if anno.hasanno(node.func, 'live_val'):
       target_entity = anno.getanno(node.func, 'live_val')
+      if anno.hasanno(node.func, 'fqn'):
+        target_fqn = anno.getanno(node.func, 'fqn')
+      else:
+        target_fqn = None
       if self._function_is_compilable(target_entity):
         node = self._rename_compilable_function(node)
+      elif target_fqn and target_fqn in KNOWN_NUMPY_FUNCTIONS:
+        # TODO(mdan): Should we replace these with equivalent TF ops instead?
+        node = self._wrap_to_py_func_single_return(
+            node, KNOWN_NUMPY_FUNCTIONS[target_fqn].dtype)
       else:
-        raise NotImplementedError('py_func with return values')
+        raise NotImplementedError(
+            'py_func with return values (unknown function)')
     else:
       if self.context.recursive:
         node = self._insert_dynamic_conversion(node)
diff --git a/tensorflow/contrib/py2tf/converters/call_trees_test.py b/tensorflow/contrib/autograph/converters/call_trees_test.py
similarity index 85%
rename from tensorflow/contrib/py2tf/converters/call_trees_test.py
rename to tensorflow/contrib/autograph/converters/call_trees_test.py
index d482a9ef7897388839bbf8f9e4bfc5839d42b2d7..c666dcb73b232ce443898cfe3359f74605af98f2 100644
--- a/tensorflow/contrib/py2tf/converters/call_trees_test.py
+++ b/tensorflow/contrib/autograph/converters/call_trees_test.py
@@ -18,9 +18,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.converters import call_trees
-from tensorflow.contrib.py2tf.converters import converter_test_base
+import numpy as np
+
+from tensorflow.contrib.autograph.converters import call_trees
+from tensorflow.contrib.autograph.converters import converter_test_base
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
@@ -105,6 +109,20 @@ class CallTreesTest(converter_test_base.TestCase):
         sess.run(sess.graph.get_operations()[0])
         self.assertEquals('bar', a.foo)
 
+  def test_py_func_wrap_known_function(self):
+
+    def test_fn():
+      return np.random.binomial(2, 0.5)
+
+    node = self.parse_and_analyze(test_fn, {'np': np})
+    node = call_trees.transform(node, self.ctx, (), ())
+
+    with self.compiled(node, dtypes.int64) as result:
+      result.np = np
+      with self.test_session() as sess:
+        self.assertTrue(isinstance(result.test_fn(), ops.Tensor))
+        self.assertIn(sess.run(result.test_fn()), (0, 1, 2))
+
   def test_uncompiled_modules(self):
 
     def test_fn(a):
diff --git a/tensorflow/contrib/py2tf/converters/continue_statements.py b/tensorflow/contrib/autograph/converters/continue_statements.py
similarity index 94%
rename from tensorflow/contrib/py2tf/converters/continue_statements.py
rename to tensorflow/contrib/autograph/converters/continue_statements.py
index 4069a678b118b56b59d2e5491bb80cf52efd8143..4299a8a9d59715d032222c47794bbb4393f34ce6 100644
--- a/tensorflow/contrib/py2tf/converters/continue_statements.py
+++ b/tensorflow/contrib/autograph/converters/continue_statements.py
@@ -18,10 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.pyct import anno
-from tensorflow.contrib.py2tf.pyct import templates
-from tensorflow.contrib.py2tf.pyct import transformer
-from tensorflow.contrib.py2tf.pyct.static_analysis.annos import NodeAnno
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import templates
+from tensorflow.contrib.autograph.pyct import transformer
+from tensorflow.contrib.autograph.pyct.static_analysis.annos import NodeAnno
 
 
 class ContinueCanonicalizationTransformer(transformer.Base):
diff --git a/tensorflow/contrib/py2tf/converters/continue_statements_test.py b/tensorflow/contrib/autograph/converters/continue_statements_test.py
similarity index 95%
rename from tensorflow/contrib/py2tf/converters/continue_statements_test.py
rename to tensorflow/contrib/autograph/converters/continue_statements_test.py
index a598dcd1aed29478b7e3fe27e3c1b20010247dd9..bcbb316d7459aa5a25bb0bd128cd6e359a393288 100644
--- a/tensorflow/contrib/py2tf/converters/continue_statements_test.py
+++ b/tensorflow/contrib/autograph/converters/continue_statements_test.py
@@ -18,8 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.converters import continue_statements
-from tensorflow.contrib.py2tf.converters import converter_test_base
+from tensorflow.contrib.autograph.converters import continue_statements
+from tensorflow.contrib.autograph.converters import converter_test_base
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/contrib/py2tf/converters/control_flow.py b/tensorflow/contrib/autograph/converters/control_flow.py
similarity index 93%
rename from tensorflow/contrib/py2tf/converters/control_flow.py
rename to tensorflow/contrib/autograph/converters/control_flow.py
index 762c26f0c77e13c077761ceec41cb29db9149a35..49d932026ffa9e79e7ddc640f7d3deaec0f4b8a6 100644
--- a/tensorflow/contrib/py2tf/converters/control_flow.py
+++ b/tensorflow/contrib/autograph/converters/control_flow.py
@@ -20,11 +20,11 @@ from __future__ import print_function
 
 import gast
 
-from tensorflow.contrib.py2tf.pyct import anno
-from tensorflow.contrib.py2tf.pyct import ast_util
-from tensorflow.contrib.py2tf.pyct import templates
-from tensorflow.contrib.py2tf.pyct import transformer
-from tensorflow.contrib.py2tf.pyct.static_analysis.annos import NodeAnno
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import ast_util
+from tensorflow.contrib.autograph.pyct import templates
+from tensorflow.contrib.autograph.pyct import transformer
+from tensorflow.contrib.autograph.pyct.static_analysis.annos import NodeAnno
 
 
 class SymbolNamer(object):
@@ -82,7 +82,7 @@ class ControlFlowTransformer(transformer.Base):
   def _create_cond_expr(self, results, test, body_name, orelse_name):
     if results is not None:
       template = """
-        results = py2tf_utils.run_cond(test, body_name, orelse_name)
+        results = autograph_utils.run_cond(test, body_name, orelse_name)
       """
       return templates.replace(
           template,
@@ -92,7 +92,7 @@ class ControlFlowTransformer(transformer.Base):
           orelse_name=orelse_name)
     else:
       template = """
-        py2tf_utils.run_cond(test, body_name, orelse_name)
+        autograph_utils.run_cond(test, body_name, orelse_name)
       """
       return templates.replace(
           template, test=test, body_name=body_name, orelse_name=orelse_name)
@@ -204,7 +204,7 @@ class ControlFlowTransformer(transformer.Base):
       def body_name(state_ssf):
         body
         return state_ssf,
-      state_ast_tuple = py2tf_utils.run_while(test_name, body_name, [state])
+      state_ast_tuple = autograph_utils.run_while(test_name, body_name, [state])
     """
     node = templates.replace(
         template,
diff --git a/tensorflow/contrib/py2tf/converters/control_flow_test.py b/tensorflow/contrib/autograph/converters/control_flow_test.py
similarity index 95%
rename from tensorflow/contrib/py2tf/converters/control_flow_test.py
rename to tensorflow/contrib/autograph/converters/control_flow_test.py
index b785b284a7fb7a0257551326c88b44a341b295ba..86fed51f27bee07f772633f3928ac5263bf57652 100644
--- a/tensorflow/contrib/py2tf/converters/control_flow_test.py
+++ b/tensorflow/contrib/autograph/converters/control_flow_test.py
@@ -18,8 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.converters import control_flow
-from tensorflow.contrib.py2tf.converters import converter_test_base
+from tensorflow.contrib.autograph.converters import control_flow
+from tensorflow.contrib.autograph.converters import converter_test_base
 from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.platform import test
diff --git a/tensorflow/contrib/py2tf/converters/converter_test_base.py b/tensorflow/contrib/autograph/converters/converter_test_base.py
similarity index 85%
rename from tensorflow/contrib/py2tf/converters/converter_test_base.py
rename to tensorflow/contrib/autograph/converters/converter_test_base.py
index 8c08c5492a4b10d4abb0ec3b19b39d5b17e41a0a..3ea2cfd668270a69427c24cdf1bbf11d32d66ebe 100644
--- a/tensorflow/contrib/py2tf/converters/converter_test_base.py
+++ b/tensorflow/contrib/autograph/converters/converter_test_base.py
@@ -21,15 +21,15 @@ from __future__ import print_function
 import contextlib
 import imp
 
-from tensorflow.contrib.py2tf import utils
-from tensorflow.contrib.py2tf.pyct import compiler
-from tensorflow.contrib.py2tf.pyct import context
-from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.contrib.py2tf.pyct import pretty_printer
-from tensorflow.contrib.py2tf.pyct import qual_names
-from tensorflow.contrib.py2tf.pyct.static_analysis import activity
-from tensorflow.contrib.py2tf.pyct.static_analysis import live_values
-from tensorflow.contrib.py2tf.pyct.static_analysis import type_info
+from tensorflow.contrib.autograph import utils
+from tensorflow.contrib.autograph.pyct import compiler
+from tensorflow.contrib.autograph.pyct import context
+from tensorflow.contrib.autograph.pyct import parser
+from tensorflow.contrib.autograph.pyct import pretty_printer
+from tensorflow.contrib.autograph.pyct import qual_names
+from tensorflow.contrib.autograph.pyct.static_analysis import activity
+from tensorflow.contrib.autograph.pyct.static_analysis import live_values
+from tensorflow.contrib.autograph.pyct.static_analysis import type_info
 from tensorflow.python.platform import test
 
 
@@ -75,8 +75,8 @@ class TestCase(test.TestCase):
     try:
       result, source = compiler.ast_to_object(node)
       result.tf = self.make_fake_mod('fake_tf', *symbols)
-      result.py2tf_utils = utils
-      result.py2tf_api = self.make_fake_mod('fake_api', converted_call)
+      result.autograph_utils = utils
+      result.autograph_api = self.make_fake_mod('fake_api', converted_call)
       yield result
     except Exception:  # pylint:disable=broad-except
       if source is None:
diff --git a/tensorflow/contrib/py2tf/converters/decorators.py b/tensorflow/contrib/autograph/converters/decorators.py
similarity index 96%
rename from tensorflow/contrib/py2tf/converters/decorators.py
rename to tensorflow/contrib/autograph/converters/decorators.py
index 68bf241ef33292f0581ccb3c44f313f853c92ba7..92445f31746cf94856ea43893f99a2ba60355fb5 100644
--- a/tensorflow/contrib/py2tf/converters/decorators.py
+++ b/tensorflow/contrib/autograph/converters/decorators.py
@@ -24,8 +24,8 @@ from __future__ import print_function
 
 import gast
 
-from tensorflow.contrib.py2tf.pyct import anno
-from tensorflow.contrib.py2tf.pyct import pretty_printer
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import pretty_printer
 
 
 class DecoratorsTransformer(gast.NodeTransformer):
diff --git a/tensorflow/contrib/py2tf/converters/decorators_test.py b/tensorflow/contrib/autograph/converters/decorators_test.py
similarity index 95%
rename from tensorflow/contrib/py2tf/converters/decorators_test.py
rename to tensorflow/contrib/autograph/converters/decorators_test.py
index c75e5461746f27d14a54b7ac06e7f77d868372c8..e67ab1cd6a15ceb66fe75140419c7abca9653ae4 100644
--- a/tensorflow/contrib/py2tf/converters/decorators_test.py
+++ b/tensorflow/contrib/autograph/converters/decorators_test.py
@@ -20,9 +20,9 @@ from __future__ import print_function
 
 from functools import wraps
 
-from tensorflow.contrib.py2tf.converters import converter_test_base
-from tensorflow.contrib.py2tf.converters import decorators
-from tensorflow.contrib.py2tf.pyct import compiler
+from tensorflow.contrib.autograph.converters import converter_test_base
+from tensorflow.contrib.autograph.converters import decorators
+from tensorflow.contrib.autograph.pyct import compiler
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/contrib/py2tf/converters/for_loops.py b/tensorflow/contrib/autograph/converters/for_loops.py
similarity index 67%
rename from tensorflow/contrib/py2tf/converters/for_loops.py
rename to tensorflow/contrib/autograph/converters/for_loops.py
index 4297c1cf2a3632e097973280cc985fc48da64475..4999c47bdc79ec0ea352472cfd3e97b94ebc7cce 100644
--- a/tensorflow/contrib/py2tf/converters/for_loops.py
+++ b/tensorflow/contrib/autograph/converters/for_loops.py
@@ -22,10 +22,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.pyct import anno
-from tensorflow.contrib.py2tf.pyct import templates
-from tensorflow.contrib.py2tf.pyct import transformer
-from tensorflow.contrib.py2tf.pyct.static_analysis.annos import NodeAnno
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import templates
+from tensorflow.contrib.autograph.pyct import transformer
+from tensorflow.contrib.autograph.pyct.static_analysis.annos import NodeAnno
 
 
 class ForLoopCanonicalizationTransformer(transformer.Base):
@@ -38,19 +38,19 @@ class ForLoopCanonicalizationTransformer(transformer.Base):
     self.generic_visit(node)
     body_scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
     i_var = self.context.namer.new_symbol('i', body_scope.referenced)
-    n_var = self.context.namer.new_symbol('n', body_scope.referenced)
-    iterated_var = self.context.namer.new_symbol('iterated',
-                                                 body_scope.referenced)
+    smart_loop_iter_var = self.context.namer.new_symbol('smart_loop_iter',
+                                                        body_scope.referenced)
+    cont_var = self.context.namer.new_symbol('cont', body_scope.referenced)
     # TODO(mdan): Use TensorListFromTensor(loop_iter) here.
     if anno.hasanno(node, 'extra_cond'):
       template = """
         i = 0
-        iterated = loop_iter
-        n = len(iterated)
-        while i < n and extra_cond:
-          target = iterated[i]
+        smart_loop_iter = autograph_utils.dynamic_dataset(loop_iter)
+        cont, target = autograph_utils.dynamic_for_cond(i, smart_loop_iter)
+        while cont and extra_cond:
           body
           i += 1
+          cont, target = autograph_utils.dynamic_for_cond(i, smart_loop_iter)
       """
       return templates.replace(
           template,
@@ -58,18 +58,18 @@ class ForLoopCanonicalizationTransformer(transformer.Base):
           target=node.target,
           body=node.body,
           i=i_var,
-          n=n_var,
-          iterated=iterated_var,
+          smart_loop_iter=smart_loop_iter_var,
+          cont=cont_var,
           extra_cond=anno.getanno(node, 'extra_cond'))
     else:
       template = """
         i = 0
-        iterated = loop_iter
-        n = len(iterated)
-        while i < n:
-          target = iterated[i]
+        smart_loop_iter = autograph_utils.dynamic_dataset(loop_iter)
+        cont, target = autograph_utils.dynamic_for_cond(i, smart_loop_iter)
+        while cont:
           body
           i += 1
+          cont, target = autograph_utils.dynamic_for_cond(i, smart_loop_iter)
       """
       repl = templates.replace(
           template,
@@ -77,8 +77,8 @@ class ForLoopCanonicalizationTransformer(transformer.Base):
           target=node.target,
           body=node.body,
           i=i_var,
-          n=n_var,
-          iterated=iterated_var)
+          smart_loop_iter=smart_loop_iter_var,
+          cont=cont_var)
       return repl
 
   def visit_Continue(self, node):
diff --git a/tensorflow/contrib/py2tf/converters/for_loops_test.py b/tensorflow/contrib/autograph/converters/for_loops_test.py
similarity index 93%
rename from tensorflow/contrib/py2tf/converters/for_loops_test.py
rename to tensorflow/contrib/autograph/converters/for_loops_test.py
index b6e3e8c8d8d4960977e2b72b56a3fab8329ad2a7..943f52de55a3629fdb18e6188e42269a4cb06275 100644
--- a/tensorflow/contrib/py2tf/converters/for_loops_test.py
+++ b/tensorflow/contrib/autograph/converters/for_loops_test.py
@@ -18,8 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.converters import converter_test_base
-from tensorflow.contrib.py2tf.converters import for_loops
+from tensorflow.contrib.autograph.converters import converter_test_base
+from tensorflow.contrib.autograph.converters import for_loops
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/contrib/py2tf/converters/ifexp.py b/tensorflow/contrib/autograph/converters/ifexp.py
similarity index 88%
rename from tensorflow/contrib/py2tf/converters/ifexp.py
rename to tensorflow/contrib/autograph/converters/ifexp.py
index 5fd6f348af0df81a6ff35745da603bd431130e20..bb0c0a36a7827e5c73e0fa67f09aa4f54d497a2c 100644
--- a/tensorflow/contrib/py2tf/converters/ifexp.py
+++ b/tensorflow/contrib/autograph/converters/ifexp.py
@@ -18,8 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.pyct import templates
-from tensorflow.contrib.py2tf.pyct import transformer
+from tensorflow.contrib.autograph.pyct import templates
+from tensorflow.contrib.autograph.pyct import transformer
 
 
 class IfExp(transformer.Base):
@@ -27,7 +27,7 @@ class IfExp(transformer.Base):
 
   def visit_IfExp(self, node):
     template = """
-        py2tf_utils.run_cond(test, lambda: body, lambda: orelse)
+        autograph_utils.run_cond(test, lambda: (body,), lambda: (orelse,))
     """
     desugared_ifexp = templates.replace_as_expression(
         template, test=node.test, body=node.body, orelse=node.orelse)
diff --git a/tensorflow/contrib/py2tf/converters/ifexp_test.py b/tensorflow/contrib/autograph/converters/ifexp_test.py
similarity index 86%
rename from tensorflow/contrib/py2tf/converters/ifexp_test.py
rename to tensorflow/contrib/autograph/converters/ifexp_test.py
index 9c357ef35b550833bcb79d39f0bdbc6d758d31a5..ac6849dcb4bd7dacd84bb205f5c65395d8c2f51e 100644
--- a/tensorflow/contrib/py2tf/converters/ifexp_test.py
+++ b/tensorflow/contrib/autograph/converters/ifexp_test.py
@@ -18,9 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf import utils
-from tensorflow.contrib.py2tf.converters import converter_test_base
-from tensorflow.contrib.py2tf.converters import ifexp
+from tensorflow.contrib.autograph import utils
+from tensorflow.contrib.autograph.converters import converter_test_base
+from tensorflow.contrib.autograph.converters import ifexp
 from tensorflow.python.platform import test
 
 
@@ -38,7 +38,7 @@ class IfExpTest(converter_test_base.TestCase):
       return 1 if x else 0
 
     with self.compiled_fn(test_fn) as result:
-      result.py2tf_util = utils
+      result.autograph_util = utils
       for x in [0, 1]:
         self.assertEqual(test_fn(x), result.test_fn(x))
 
@@ -52,7 +52,7 @@ class IfExpTest(converter_test_base.TestCase):
       return y
 
     with self.compiled_fn(test_fn) as result:
-      result.py2tf_util = utils
+      result.autograph_util = utils
       result.f = f
       for x in [-2, 2]:
         self.assertEqual(test_fn(x), result.test_fn(x))
@@ -63,7 +63,7 @@ class IfExpTest(converter_test_base.TestCase):
       return x * x if x > 0 else x
 
     with self.compiled_fn(test_fn) as result:
-      result.py2tf_util = utils
+      result.autograph_util = utils
       for x in [-2, 2]:
         self.assertEqual(test_fn(x), result.test_fn(x))
 
@@ -73,7 +73,7 @@ class IfExpTest(converter_test_base.TestCase):
       return x * x if x > 0 else x if x else 1
 
     with self.compiled_fn(test_fn) as result:
-      result.py2tf_util = utils
+      result.autograph_util = utils
       for x in [-2, 0, 2]:
         self.assertEqual(test_fn(x), result.test_fn(x))
 
@@ -85,7 +85,7 @@ class IfExpTest(converter_test_base.TestCase):
       return -x
 
     with self.compiled_fn(test_fn) as result:
-      result.py2tf_util = utils
+      result.autograph_util = utils
       for x in [-2, 2, 5]:
         self.assertEqual(test_fn(x), result.test_fn(x))
 
@@ -97,7 +97,7 @@ class IfExpTest(converter_test_base.TestCase):
       return x
 
     with self.compiled_fn(test_fn) as result:
-      result.py2tf_util = utils
+      result.autograph_util = utils
       for x in [-2, 2, 5]:
         self.assertEqual(test_fn(x), result.test_fn(x))
 
diff --git a/tensorflow/contrib/py2tf/converters/list_comprehension.py b/tensorflow/contrib/autograph/converters/list_comprehension.py
similarity index 93%
rename from tensorflow/contrib/py2tf/converters/list_comprehension.py
rename to tensorflow/contrib/autograph/converters/list_comprehension.py
index e8744831100e4852919b5cd1253b74acea4d790d..d7f292015164e047d054c5d1fb0b391e960bb73d 100644
--- a/tensorflow/contrib/py2tf/converters/list_comprehension.py
+++ b/tensorflow/contrib/autograph/converters/list_comprehension.py
@@ -31,9 +31,9 @@ from __future__ import print_function
 
 import gast
 
-from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.contrib.py2tf.pyct import templates
-from tensorflow.contrib.py2tf.pyct import transformer
+from tensorflow.contrib.autograph.pyct import parser
+from tensorflow.contrib.autograph.pyct import templates
+from tensorflow.contrib.autograph.pyct import transformer
 
 
 class ListCompCanonicalizationTransformer(transformer.Base):
diff --git a/tensorflow/contrib/py2tf/converters/list_comprehension_test.py b/tensorflow/contrib/autograph/converters/list_comprehension_test.py
similarity index 93%
rename from tensorflow/contrib/py2tf/converters/list_comprehension_test.py
rename to tensorflow/contrib/autograph/converters/list_comprehension_test.py
index 025fac11e41e6771fbb9b80ff3da70dc3ceec73e..4758671f5ec83c26cfa54be0ef68f5f564094f6c 100644
--- a/tensorflow/contrib/py2tf/converters/list_comprehension_test.py
+++ b/tensorflow/contrib/autograph/converters/list_comprehension_test.py
@@ -18,8 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.converters import converter_test_base
-from tensorflow.contrib.py2tf.converters import list_comprehension
+from tensorflow.contrib.autograph.converters import converter_test_base
+from tensorflow.contrib.autograph.converters import list_comprehension
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/contrib/py2tf/converters/lists.py b/tensorflow/contrib/autograph/converters/lists.py
similarity index 90%
rename from tensorflow/contrib/py2tf/converters/lists.py
rename to tensorflow/contrib/autograph/converters/lists.py
index 06e1dad8f4d652da78ed39309f5b40598e368ea6..234a0a7487d5fc9e068acf4a19af3bac84f4737e 100644
--- a/tensorflow/contrib/py2tf/converters/lists.py
+++ b/tensorflow/contrib/autograph/converters/lists.py
@@ -32,9 +32,9 @@ from __future__ import print_function
 
 import gast
 
-from tensorflow.contrib.py2tf.pyct import anno
-from tensorflow.contrib.py2tf.pyct import templates
-from tensorflow.contrib.py2tf.pyct import transformer
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import templates
+from tensorflow.contrib.autograph.pyct import transformer
 from tensorflow.python.framework import dtypes
 
 
@@ -61,17 +61,20 @@ class ListTransformer(transformer.Base):
     return templates.replace_as_expression(template, dtype_name=dtype_name)
 
   def _pre_populated_list(self, node):
-    raise NotImplementedError()
+    raise NotImplementedError('pre-populated lists')
 
   def visit_Expr(self, node):
     node = self.generic_visit(node)
     if isinstance(node.value, gast.Call):
       call_node = node.value
+
+      if not anno.hasanno(call_node.func, anno.Basic.QN):
+        return node
       qn = anno.getanno(call_node.func, anno.Basic.QN)
 
       if qn.qn[-1] == 'append' and (len(call_node.args) == 1):
         template = """
-          target = py2tf_utils.dynamic_list_append(target, element)
+          target = autograph_utils.dynamic_list_append(target, element)
         """
         node = templates.replace(
             template,
diff --git a/tensorflow/contrib/py2tf/converters/lists_test.py b/tensorflow/contrib/autograph/converters/lists_test.py
similarity index 90%
rename from tensorflow/contrib/py2tf/converters/lists_test.py
rename to tensorflow/contrib/autograph/converters/lists_test.py
index 671a1cc7b1225061a00731596c536c4403e0bdff..749ba14347314f975c5a6e1111133336e2f5c5e6 100644
--- a/tensorflow/contrib/py2tf/converters/lists_test.py
+++ b/tensorflow/contrib/autograph/converters/lists_test.py
@@ -18,9 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf import utils
-from tensorflow.contrib.py2tf.converters import converter_test_base
-from tensorflow.contrib.py2tf.converters import lists
+from tensorflow.contrib.autograph import utils
+from tensorflow.contrib.autograph.converters import converter_test_base
+from tensorflow.contrib.autograph.converters import lists
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.platform import test
diff --git a/tensorflow/contrib/py2tf/converters/logical_expressions.py b/tensorflow/contrib/autograph/converters/logical_expressions.py
similarity index 71%
rename from tensorflow/contrib/py2tf/converters/logical_expressions.py
rename to tensorflow/contrib/autograph/converters/logical_expressions.py
index 10192e6a036c4a44aa1e6f1b4a390579bd703373..3a795a315a3c2aa08ac1577a204102755b6e849c 100644
--- a/tensorflow/contrib/py2tf/converters/logical_expressions.py
+++ b/tensorflow/contrib/autograph/converters/logical_expressions.py
@@ -23,9 +23,10 @@ from __future__ import print_function
 
 import gast
 
-from tensorflow.contrib.py2tf.pyct import anno
-from tensorflow.contrib.py2tf.pyct import templates
-from tensorflow.contrib.py2tf.pyct import transformer
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import parser
+from tensorflow.contrib.autograph.pyct import templates
+from tensorflow.contrib.autograph.pyct import transformer
 
 
 # TODO(mdan): Properly extrack boolean ops according to lazy eval rules.
@@ -44,19 +45,20 @@ class LogicalExpressionTransformer(transformer.Base):
   def __init__(self, context):
     super(LogicalExpressionTransformer, self).__init__(context)
     # TODO(mdan): Look into replacing with bitwise operators instead.
+    # TODO(mdan): Skip replacing if the function is trivial.
     self.op_mapping = {
-        gast.And: 'logical_and',
-        gast.Eq: 'equal',
-        gast.Gt: 'greater',
-        gast.GtE: 'greater_equal',
-        gast.Lt: 'less',
-        gast.LtE: 'less_equal',
-        gast.Not: 'logical_not',
-        gast.NotEq: 'not_equal',
-        gast.Or: 'logical_or',
-        gast.USub: 'negative',
-        gast.Is: 'py2tf_utils.dynamic_is',
-        gast.IsNot: 'py2tf_utils.dynamic_is_not'
+        gast.And: 'tf.logical_and',
+        gast.Eq: 'tf.equal',
+        gast.Gt: 'tf.greater',
+        gast.GtE: 'tf.greater_equal',
+        gast.Lt: 'tf.less',
+        gast.LtE: 'tf.less_equal',
+        gast.Not: 'tf.logical_not',
+        gast.NotEq: 'tf.not_equal',
+        gast.Or: 'tf.logical_or',
+        gast.USub: 'tf.negative',
+        gast.Is: 'autograph_utils.dynamic_is',
+        gast.IsNot: 'autograph_utils.dynamic_is_not'
     }
 
   def _expect_simple_symbol(self, operand):
@@ -70,27 +72,19 @@ class LogicalExpressionTransformer(transformer.Base):
         '"a.x or b"; for a workaround, assign the expression to a local '
         'variable and use that instead, for example "tmp = a.x", "tmp or b"')
 
-  def _matching_tf_op(self, operator):
+  def _matching_func(self, operator):
     op_type = type(operator)
     mapped_op = self.op_mapping.get(op_type)
     if not mapped_op:
       raise NotImplementedError('operator %s is not yet supported' % op_type)
     return mapped_op
 
-  def _inline_tf_op(self, op_name, args):
-    if 'py2tf_utils' in op_name:
-      # TODO(alexbw): explicitly spelling out the attribute function name
-      # until fix for issue highlighted in cl/188931581 lands.
-      template = """
-      py2tf_utils.op_name(args)
+  def _as_function(self, func_name, args):
+    template = """
+      func_name(args)
     """
-      op_name = op_name.replace('py2tf_utils.', '')
-    else:
-      template = """
-        tf.op_name(args)
-      """
     replacement = templates.replace_as_expression(
-        template, op_name=op_name, args=args)
+        template, func_name=parser.parse_expression(func_name), args=args)
     anno.setanno(replacement, SAFE_BOOLEAN_OPERAND, True)
     return replacement
 
@@ -104,14 +98,14 @@ class LogicalExpressionTransformer(transformer.Base):
     #   a < b < c   ->   a < b and b < c
     while ops_and_comps:
       op, right = ops_and_comps.pop(0)
-      binary_comparison = self._inline_tf_op(self._matching_tf_op(op),
-                                             (left, right))
+      binary_comparison = self._as_function(
+          self._matching_func(op), (left, right))
       if isinstance(left, gast.Name) and isinstance(right, gast.Name):
         anno.setanno(binary_comparison, SAFE_BOOLEAN_OPERAND, True)
       if op_tree:
         self._expect_simple_symbol(right)
-        op_tree = self._inline_tf_op('logical_and',
-                                     (binary_comparison, op_tree))
+        op_tree = self._as_function('tf.logical_and',
+                                    (binary_comparison, op_tree))
       else:
         op_tree = binary_comparison
       left = right
@@ -120,7 +114,7 @@ class LogicalExpressionTransformer(transformer.Base):
 
   def visit_UnaryOp(self, node):
     node = self.generic_visit(node)
-    return self._inline_tf_op(self._matching_tf_op(node.op), node.operand)
+    return self._as_function(self._matching_func(node.op), node.operand)
 
   def visit_BoolOp(self, node):
     node = self.generic_visit(node)
@@ -130,7 +124,7 @@ class LogicalExpressionTransformer(transformer.Base):
     while node_values:
       left = node_values.pop()
       self._expect_simple_symbol(left)
-      right = self._inline_tf_op(self._matching_tf_op(node.op), (left, right))
+      right = self._as_function(self._matching_func(node.op), (left, right))
     return right
 
 
diff --git a/tensorflow/contrib/py2tf/converters/logical_expressions_test.py b/tensorflow/contrib/autograph/converters/logical_expressions_test.py
similarity index 92%
rename from tensorflow/contrib/py2tf/converters/logical_expressions_test.py
rename to tensorflow/contrib/autograph/converters/logical_expressions_test.py
index eb28c309a429f2267cc1ae1f6f65a8cde0ad91b8..2814060c4d831e4dddacb3dcbcbe1db42160db20 100644
--- a/tensorflow/contrib/py2tf/converters/logical_expressions_test.py
+++ b/tensorflow/contrib/autograph/converters/logical_expressions_test.py
@@ -18,8 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.converters import converter_test_base
-from tensorflow.contrib.py2tf.converters import logical_expressions
+from tensorflow.contrib.autograph.converters import converter_test_base
+from tensorflow.contrib.autograph.converters import logical_expressions
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
diff --git a/tensorflow/contrib/py2tf/converters/name_scopes.py b/tensorflow/contrib/autograph/converters/name_scopes.py
similarity index 93%
rename from tensorflow/contrib/py2tf/converters/name_scopes.py
rename to tensorflow/contrib/autograph/converters/name_scopes.py
index c702823fcf047fcad3254318bd323d2b8fddd700..2a3f474360e94635470bf9581222e4c79f46b7a1 100644
--- a/tensorflow/contrib/py2tf/converters/name_scopes.py
+++ b/tensorflow/contrib/autograph/converters/name_scopes.py
@@ -21,8 +21,8 @@ from __future__ import print_function
 
 import gast
 
-from tensorflow.contrib.py2tf.pyct import templates
-from tensorflow.contrib.py2tf.pyct import transformer
+from tensorflow.contrib.autograph.pyct import templates
+from tensorflow.contrib.autograph.pyct import transformer
 
 
 class FunctionNameScopeTransformer(transformer.Base):
diff --git a/tensorflow/contrib/py2tf/converters/name_scopes_test.py b/tensorflow/contrib/autograph/converters/name_scopes_test.py
similarity index 95%
rename from tensorflow/contrib/py2tf/converters/name_scopes_test.py
rename to tensorflow/contrib/autograph/converters/name_scopes_test.py
index a8ca341602ee5f06dbb812643a58794339d98afe..61e5db2af826d0c2238f1af0f3240411596f7429 100644
--- a/tensorflow/contrib/py2tf/converters/name_scopes_test.py
+++ b/tensorflow/contrib/autograph/converters/name_scopes_test.py
@@ -18,8 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.converters import converter_test_base
-from tensorflow.contrib.py2tf.converters import name_scopes
+from tensorflow.contrib.autograph.converters import converter_test_base
+from tensorflow.contrib.autograph.converters import name_scopes
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import test
diff --git a/tensorflow/contrib/py2tf/converters/side_effect_guards.py b/tensorflow/contrib/autograph/converters/side_effect_guards.py
similarity index 91%
rename from tensorflow/contrib/py2tf/converters/side_effect_guards.py
rename to tensorflow/contrib/autograph/converters/side_effect_guards.py
index 30976b3ec6db5a6607023ac804d9d54cfb296190..1c1293d2c411b51b563ac3965284a48725ed3278 100644
--- a/tensorflow/contrib/py2tf/converters/side_effect_guards.py
+++ b/tensorflow/contrib/autograph/converters/side_effect_guards.py
@@ -36,12 +36,12 @@ from __future__ import print_function
 
 import gast
 
-from tensorflow.contrib.py2tf.pyct import anno
-from tensorflow.contrib.py2tf.pyct import ast_util
-from tensorflow.contrib.py2tf.pyct import qual_names
-from tensorflow.contrib.py2tf.pyct import templates
-from tensorflow.contrib.py2tf.pyct import transformer
-from tensorflow.contrib.py2tf.pyct.static_analysis.annos import NodeAnno
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import ast_util
+from tensorflow.contrib.autograph.pyct import qual_names
+from tensorflow.contrib.autograph.pyct import templates
+from tensorflow.contrib.autograph.pyct import transformer
+from tensorflow.contrib.autograph.pyct.static_analysis.annos import NodeAnno
 
 
 class SymbolNamer(object):
@@ -160,8 +160,8 @@ class SideEffectGuardTransformer(transformer.Base):
               [alias_map.get(s, s).ast() for s in guarded_args], None)
 
         template = """
-          with py2tf_utils.control_dependency_on_returns(call):
-            aliased_guarded_args = py2tf_utils.alias_tensors(guarded_args)
+          with autograph_utils.control_dependency_on_returns(call):
+            aliased_guarded_args = autograph_utils.alias_tensors(guarded_args)
         """
         control_deps_guard = templates.replace(
             template,
@@ -172,7 +172,7 @@ class SideEffectGuardTransformer(transformer.Base):
         alias_map = {}
 
         template = """
-          with py2tf_utils.control_dependency_on_returns(call):
+          with autograph_utils.control_dependency_on_returns(call):
             pass
         """
         control_deps_guard = templates.replace(template, call=node.value)[-1]
diff --git a/tensorflow/contrib/py2tf/converters/side_effect_guards_test.py b/tensorflow/contrib/autograph/converters/side_effect_guards_test.py
similarity index 97%
rename from tensorflow/contrib/py2tf/converters/side_effect_guards_test.py
rename to tensorflow/contrib/autograph/converters/side_effect_guards_test.py
index 463db2e770213ba9636d2537b095a77dece5d8f6..ce0ce33243a1352107eb8121050ee76474869809 100644
--- a/tensorflow/contrib/py2tf/converters/side_effect_guards_test.py
+++ b/tensorflow/contrib/autograph/converters/side_effect_guards_test.py
@@ -18,8 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.converters import converter_test_base
-from tensorflow.contrib.py2tf.converters import side_effect_guards
+from tensorflow.contrib.autograph.converters import converter_test_base
+from tensorflow.contrib.autograph.converters import side_effect_guards
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
diff --git a/tensorflow/contrib/py2tf/converters/single_return.py b/tensorflow/contrib/autograph/converters/single_return.py
similarity index 96%
rename from tensorflow/contrib/py2tf/converters/single_return.py
rename to tensorflow/contrib/autograph/converters/single_return.py
index 1194b98f5ebeffa79a41fc3b32aa79ffd8cc407b..bcc9ca9dfeb00ef2d2e60edf6a1abfba19a1bad7 100644
--- a/tensorflow/contrib/py2tf/converters/single_return.py
+++ b/tensorflow/contrib/autograph/converters/single_return.py
@@ -20,11 +20,11 @@ from __future__ import print_function
 
 import gast
 
-from tensorflow.contrib.py2tf.pyct import anno
-from tensorflow.contrib.py2tf.pyct import ast_util
-from tensorflow.contrib.py2tf.pyct import templates
-from tensorflow.contrib.py2tf.pyct import transformer
-from tensorflow.contrib.py2tf.pyct.static_analysis.annos import NodeAnno
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import ast_util
+from tensorflow.contrib.autograph.pyct import templates
+from tensorflow.contrib.autograph.pyct import transformer
+from tensorflow.contrib.autograph.pyct.static_analysis.annos import NodeAnno
 
 
 # TODO(mdan): Move this logic into transformer_base.
@@ -232,7 +232,7 @@ class DetectReturnInUnsupportedControlFlow(gast.NodeVisitor):
   def visit_Return(self, node):
     if self.cant_return:
       raise ValueError(
-          'Pyflow currently does not support `return` statements in loops. '
+          '`return` statements are not supported in loops. '
           'Try assigning to a variable in the while loop, and returning '
           'outside of the loop')
 
diff --git a/tensorflow/contrib/py2tf/converters/single_return_test.py b/tensorflow/contrib/autograph/converters/single_return_test.py
similarity index 97%
rename from tensorflow/contrib/py2tf/converters/single_return_test.py
rename to tensorflow/contrib/autograph/converters/single_return_test.py
index 2ea7a9d6d3e25c8dafd8f211994c8fe99bd0e781..d483005a09537ea8227814f65aa7e6402c853f60 100644
--- a/tensorflow/contrib/py2tf/converters/single_return_test.py
+++ b/tensorflow/contrib/autograph/converters/single_return_test.py
@@ -18,8 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.converters import converter_test_base
-from tensorflow.contrib.py2tf.converters import single_return
+from tensorflow.contrib.autograph.converters import converter_test_base
+from tensorflow.contrib.autograph.converters import single_return
 from tensorflow.python.framework.ops import name_scope
 from tensorflow.python.platform import test
 
diff --git a/tensorflow/contrib/autograph/examples/notebooks/dev_summit_2018_demo.ipynb b/tensorflow/contrib/autograph/examples/notebooks/dev_summit_2018_demo.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..d62390494b78c415212ba91ac914cdfee324f971
--- /dev/null
+++ b/tensorflow/contrib/autograph/examples/notebooks/dev_summit_2018_demo.ipynb
@@ -0,0 +1,1919 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "name": "Dev Summit 2018 - Autograph",
+      "version": "0.3.2",
+      "views": {},
+      "default_view": {},
+      "provenance": [
+        {
+          "file_id": "1wCZUh73zTNs1jzzYjqoxMIdaBWCdKJ2K",
+          "timestamp": 1522238054357
+        },
+        {
+          "file_id": "1_HpC-RrmIv4lNaqeoslUeWaX8zH5IXaJ",
+          "timestamp": 1521743157199
+        },
+        {
+          "file_id": "1mjO2fQ2F9hxpAzw2mnrrUkcgfb7xSGW-",
+          "timestamp": 1520522344607
+        }
+      ],
+      "collapsed_sections": []
+    },
+    "kernelspec": {
+      "name": "python2",
+      "display_name": "Python 2"
+    }
+  },
+  "cells": [
+    {
+      "metadata": {
+        "id": "g7nGs4mzVUHP",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "# Experimental: TF Autograph\n",
+        "**TensorFlow Dev Summit, 2018.**\n",
+        "\n",
+        "This interactive notebook demonstrates **autograph**, an experimental source-code transformation library to automatically convert TF.Eager and Python code to TensorFlow graphs.\n",
+        "\n",
+        "**Note: this is pre-alpha software!** The notebook works best with Python 2, for now.\n",
+        "\n",
+        "> ![alt text](https://lh3.googleusercontent.com/QOvy0clmg7siaVKzwmSPAjicWWNQ0OeyaB16plDjSJMf35WD3vLjF6mz4CGrhSHw60HnlZPJjkyDCBzw5XOI0oBGSewyYw=s688)\n",
+        "\n",
+        "### Table of Contents\n",
+        "1. _Write Eager code that is fast and scalable._\n",
+        "2. _Case study: complex control flow._\n",
+        "3. _Case study: training MNIST with Keras._\n",
+        "4. _Case study: building an RNN._"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "uFcgBENZqkB2",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "# Install TensorFlow; note that Colab notebooks run remotely, on virtual\n",
+        "# instances provided by Google.\n",
+        "!pip install -U -q tf-nightly"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "Pa2qpEmoVOGe",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "import os\n",
+        "import time\n",
+        "\n",
+        "import tensorflow as tf\n",
+        "from tensorflow.contrib import autograph\n",
+        "\n",
+        "import matplotlib.pyplot as plt\n",
+        "import numpy as np\n",
+        "import six\n",
+        "\n",
+        "from google.colab import widgets"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "ZVKfj5ttVkqz",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "# 1. Write Eager code that is fast and scalable\n",
+        "\n",
+        "TF.Eager gives you more flexibility while coding, but at the cost of losing the benefits of TensorFlow graphs. For example, Eager does not currently support distributed training, exporting models, and a variety of memory and computation optimizations.\n",
+        "\n",
+        "Autograph gives you the best of both worlds: write your code in an Eager style, and we will automatically transform it into the equivalent TF graph code. The graph code can be executed eagerly (as a single op), included as part of a larger graph, or exported."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "snaZRFdWd9ym",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "For example, autograph can convert a function like this:"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "9__n8cSIeDnD",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def g(x):\n",
+        "  if x > 0:\n",
+        "    x = x * x\n",
+        "  else:\n",
+        "    x = 0\n",
+        "  return x"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "gq0eQcuReHET",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "... into a TF graph-building function:"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "sELSn599ePUF",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "output_extras": [
+            {}
+          ],
+          "base_uri": "https://localhost:8080/",
+          "height": 413
+        },
+        "outputId": "bb0c7216-1ca3-4da1-d1fb-589902cdcd1a",
+        "executionInfo": {
+          "status": "ok",
+          "timestamp": 1522345737505,
+          "user_tz": 240,
+          "elapsed": 243,
+          "user": {
+            "displayName": "Dan Moldovan",
+            "photoUrl": "//lh5.googleusercontent.com/-Rneh8xjecyk/AAAAAAAAAAI/AAAAAAAACB4/c5vwsJpbktY/s50-c-k-no/photo.jpg",
+            "userId": "112023154726779574577"
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "print(autograph.to_code(g))"
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "from __future__ import print_function\n",
+            "import tensorflow as tf\n",
+            "from tensorflow.contrib.autograph.impl import api as autograph_api\n",
+            "from tensorflow.contrib.autograph import utils as autograph_utils\n",
+            "\n",
+            "def tf__g(x):\n",
+            "  with tf.name_scope('g'):\n",
+            "\n",
+            "    def if_true():\n",
+            "      with tf.name_scope('if_true'):\n",
+            "        x_1, = x,\n",
+            "        x_1 = x_1 * x_1\n",
+            "        return x_1,\n",
+            "\n",
+            "    def if_false():\n",
+            "      with tf.name_scope('if_false'):\n",
+            "        x_1, = x,\n",
+            "        x_1 = 0\n",
+            "        return x_1,\n",
+            "    x = autograph_utils.run_cond(tf.greater(x, 0), if_true, if_false)\n",
+            "    return x\n",
+            "\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "j74n-8hEe6dk",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "You can then use the converted function as you would any regular TF op -- you can pass `Tensor` arguments and it will return `Tensor`s:"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "AkVaY0-dfEbH",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "output_extras": [
+            {}
+          ],
+          "base_uri": "https://localhost:8080/",
+          "height": 53
+        },
+        "outputId": "4ffe3757-c44d-424c-c2a8-7ddc973bfcce",
+        "executionInfo": {
+          "status": "ok",
+          "timestamp": 1522345737841,
+          "user_tz": 240,
+          "elapsed": 257,
+          "user": {
+            "displayName": "Dan Moldovan",
+            "photoUrl": "//lh5.googleusercontent.com/-Rneh8xjecyk/AAAAAAAAAAI/AAAAAAAACB4/c5vwsJpbktY/s50-c-k-no/photo.jpg",
+            "userId": "112023154726779574577"
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "tf_g = autograph.to_graph(g)\n",
+        "\n",
+        "with tf.Graph().as_default():  \n",
+        "\n",
+        "  g_ops = tf_g(tf.constant(9))\n",
+        "\n",
+        "  with tf.Session() as sess:\n",
+        "    tf_g_result = sess.run(g_ops)\n",
+        "\n",
+        "  print('g(9) = %s' % g(9))\n",
+        "  print('tf_g(9) = %s' % tf_g_result)"
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "g(9) = 81\n",
+            "tf_g(9) = 81\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "trrHQBM1VnD0",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "# 2. Case study: complex control flow\n",
+        "\n",
+        "Autograph can convert a large chunk of the Python language into graph-equivalent code, and we're adding new supported language features all the time. In this section, we'll give you a taste of some of the functionality in autograph.\n",
+        "Autograph will automatically convert most Python control flow statements into their correct graph equivalent.\n",
+        "  "
+      ]
+    },
+    {
+      "metadata": {
+        "id": "u0YG3DPgZxoW",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "We support common statements like `while`, `for`, `if`, `break`, `return` and more. You can even nest them as much as you like. Imagine trying to write the graph version of this code by hand:"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "xJYDzOcrZ8pI",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "output_extras": [
+            {}
+          ],
+          "base_uri": "https://localhost:8080/",
+          "height": 35
+        },
+        "outputId": "6c244ee4-b141-4ad6-eefa-cfffa71f33c6",
+        "executionInfo": {
+          "status": "ok",
+          "timestamp": 1522345738402,
+          "user_tz": 240,
+          "elapsed": 483,
+          "user": {
+            "displayName": "Dan Moldovan",
+            "photoUrl": "//lh5.googleusercontent.com/-Rneh8xjecyk/AAAAAAAAAAI/AAAAAAAACB4/c5vwsJpbktY/s50-c-k-no/photo.jpg",
+            "userId": "112023154726779574577"
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def sum_even(numbers):\n",
+        "  s = 0\n",
+        "  for n in numbers:\n",
+        "    if n % 2 > 0:\n",
+        "      continue\n",
+        "    s += n\n",
+        "  return s\n",
+        "\n",
+        "\n",
+        "tf_sum_even = autograph.to_graph(sum_even)\n",
+        "\n",
+        "with tf.Graph().as_default():  \n",
+        "  with tf.Session() as sess:\n",
+        "    result = sess.run(tf_sum_even(tf.constant([10, 12, 15, 20])))\n",
+        "\n",
+        "  print('Sum of even numbers: %s' % result)\n",
+        "  \n",
+        "# Uncomment the line below to print the generated graph code\n",
+        "# print(autograph.to_code(sum_even))"
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "Sum of even numbers: 42\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "_YXo4KOcbKrn",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Try replacing the `continue` in the above code with `break` -- Autograph supports that as well!"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "xHmC0rBIavW_",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "The Python code above is much more readable than the matching graph code. Autograph takes care of tediously converting every piece of Python code into the matching TensorFlow graph version for you, so that you can quickly write maintainable code, but still benefit from the optimizations and deployment benefits of graphs."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "UEHWGpBXbS7g",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Let's try some other useful Python constructs, like `print` and `assert`. We automatically convert Python `assert` statements into the equivalent `tf.Assert` code.  "
+      ]
+    },
+    {
+      "metadata": {
+        "id": "qUU57xlEbauI",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "output_extras": [
+            {}
+          ],
+          "base_uri": "https://localhost:8080/",
+          "height": 53
+        },
+        "outputId": "add3db4a-2077-4dd5-f7a7-a5b5a4529c26",
+        "executionInfo": {
+          "status": "ok",
+          "timestamp": 1522345738697,
+          "user_tz": 240,
+          "elapsed": 253,
+          "user": {
+            "displayName": "Dan Moldovan",
+            "photoUrl": "//lh5.googleusercontent.com/-Rneh8xjecyk/AAAAAAAAAAI/AAAAAAAACB4/c5vwsJpbktY/s50-c-k-no/photo.jpg",
+            "userId": "112023154726779574577"
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def f(x):\n",
+        "  assert x != 0, 'Do not pass zero!'\n",
+        "  return x * x\n",
+        "\n",
+        "tf_f = autograph.to_graph(f)\n",
+        "with tf.Graph().as_default():  \n",
+        "  with tf.Session() as sess:\n",
+        "    try:\n",
+        "      print(sess.run(tf_f(tf.constant(0))))\n",
+        "    except tf.errors.InvalidArgumentError as e:\n",
+        "      print('Got error message: %s' % e.message)\n",
+        "      \n",
+        "# Uncomment the line below to print the generated graph code\n",
+        "# print(autograph.to_code(f))"
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "Got error message: assertion failed: [Do not pass zero!]\n",
+            "\t [[Node: f/Assert/Assert = Assert[T=[DT_STRING], summarize=3, _device=\"/job:localhost/replica:0/task:0/device:CPU:0\"](f/NotEqual, f/Assert/Assert/data_0)]]\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "w5hBZaVJbck4",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "You can also use `print` functions in-graph:"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "6NdzRKLEboRv",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "output_extras": [
+            {}
+          ],
+          "base_uri": "https://localhost:8080/",
+          "height": 35
+        },
+        "outputId": "fb82dfc3-790f-4127-87f6-361805be9e9b",
+        "executionInfo": {
+          "status": "ok",
+          "timestamp": 1522345739013,
+          "user_tz": 240,
+          "elapsed": 247,
+          "user": {
+            "displayName": "Dan Moldovan",
+            "photoUrl": "//lh5.googleusercontent.com/-Rneh8xjecyk/AAAAAAAAAAI/AAAAAAAACB4/c5vwsJpbktY/s50-c-k-no/photo.jpg",
+            "userId": "112023154726779574577"
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def print_sign(n):\n",
+        "  if n >= 0:\n",
+        "    print(n, 'is positive!')\n",
+        "  else:\n",
+        "    print(n, 'is negative!')\n",
+        "  return n\n",
+        "\n",
+        "\n",
+        "tf_print_sign = autograph.to_graph(print_sign)\n",
+        "with tf.Graph().as_default():\n",
+        "  with tf.Session() as sess:\n",
+        "    sess.run(tf_print_sign(tf.constant(1)))\n",
+        "    \n",
+        "# Uncomment the line below to print the generated graph code\n",
+        "# print(autograph.to_code(print_sign))"
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "1 is positive!\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "9u_Z3i3AivLA",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "We can convert lists to TensorArray, so appending to lists also works, with a few modifications:"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "MjhCQJVuiTNR",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "output_extras": [
+            {}
+          ],
+          "base_uri": "https://localhost:8080/",
+          "height": 35
+        },
+        "outputId": "dc320b87-595b-4392-d29c-994486fd8a0a",
+        "executionInfo": {
+          "status": "ok",
+          "timestamp": 1522345744470,
+          "user_tz": 240,
+          "elapsed": 5391,
+          "user": {
+            "displayName": "Dan Moldovan",
+            "photoUrl": "//lh5.googleusercontent.com/-Rneh8xjecyk/AAAAAAAAAAI/AAAAAAAACB4/c5vwsJpbktY/s50-c-k-no/photo.jpg",
+            "userId": "112023154726779574577"
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def f(n):\n",
+        "  numbers = []\n",
+        "  # We ask you to tell us about the element dtype.\n",
+        "  autograph.utils.set_element_type(numbers, tf.int32)\n",
+        "  for i in range(n):\n",
+        "    numbers.append(i)\n",
+        "  return numbers.stack() # Stack the list so that it can be used as a Tensor\n",
+        "\n",
+        "\n",
+        "tf_f = autograph.to_graph(f)\n",
+        "with tf.Graph().as_default():\n",
+        "  with tf.Session() as sess:\n",
+        "    print(sess.run(tf_f(tf.constant(5))))\n",
+        "    \n",
+        "# Uncomment the line below to print the generated graph code\n",
+        "# print(autograph.to_code(f))"
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "[0 1 2 3 4]\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "UdG8ZFrkTAF2",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "And all of these functionalities, and more, can be composed into more complicated code:\n"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "DVs6wt8NKaGQ",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "output_extras": [
+            {}
+          ],
+          "base_uri": "https://localhost:8080/",
+          "height": 53
+        },
+        "cellView": "code",
+        "outputId": "0a4b8d08-8f65-4bbc-85ba-dc4c60563519",
+        "executionInfo": {
+          "status": "ok",
+          "timestamp": 1522345745186,
+          "user_tz": 240,
+          "elapsed": 658,
+          "user": {
+            "displayName": "Dan Moldovan",
+            "photoUrl": "//lh5.googleusercontent.com/-Rneh8xjecyk/AAAAAAAAAAI/AAAAAAAACB4/c5vwsJpbktY/s50-c-k-no/photo.jpg",
+            "userId": "112023154726779574577"
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def print_primes(n):\n",
+        "  \"\"\"Returns all the prime numbers less than n.\"\"\"\n",
+        "  assert n > 0\n",
+        "  \n",
+        "  primes = []\n",
+        "  autograph.utils.set_element_type(primes, tf.int32)\n",
+        "  for i in range(2, n):\n",
+        "    is_prime = True\n",
+        "    for k in range(2, i):\n",
+        "      if i % k == 0:\n",
+        "        is_prime = False\n",
+        "        break\n",
+        "    if not is_prime:\n",
+        "      continue\n",
+        "    primes.append(i)\n",
+        "  all_primes = primes.stack()\n",
+        "\n",
+        "  print('The prime numbers less than', n, 'are:')\n",
+        "  print(all_primes)\n",
+        "  return tf.no_op()\n",
+        "\n",
+        "    \n",
+        "tf_print_primes = autograph.to_graph(print_primes)\n",
+        "with tf.Graph().as_default():  \n",
+        "  with tf.Session() as sess:\n",
+        "    n = tf.constant(50)\n",
+        "    sess.run(tf_print_primes(n))\n",
+        "    \n",
+        "# Uncomment the line below to print the generated graph code\n",
+        "# print(autograph.to_code(print_primes))"
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "The prime numbers less than 50 are:\n",
+            "[ 2  3  5  7 11 13 17 19 23 29 31 37 41 43 47]\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "JQ8kQT99VqDk",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "# 3. Case study: training MNIST with Keras\n",
+        "\n",
+        "As we've seen, writing control flow in Autograph is easy. So running a training loop in graph should be easy as well!\n",
+        "\n",
+        "Here, we show an example of such a training loop for a simple Keras model that trains on MNIST."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "0CrtGWgwuLJr",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "import gzip\n",
+        "import shutil\n",
+        "\n",
+        "from six.moves import urllib\n",
+        "\n",
+        "\n",
+        "def download(directory, filename):\n",
+        "  filepath = os.path.join(directory, filename)\n",
+        "  if tf.gfile.Exists(filepath):\n",
+        "    return filepath\n",
+        "  if not tf.gfile.Exists(directory):\n",
+        "    tf.gfile.MakeDirs(directory)\n",
+        "  url = 'https://storage.googleapis.com/cvdf-datasets/mnist/' + filename + '.gz'\n",
+        "  zipped_filepath = filepath + '.gz'\n",
+        "  print('Downloading %s to %s' % (url, zipped_filepath))\n",
+        "  urllib.request.urlretrieve(url, zipped_filepath)\n",
+        "  with gzip.open(zipped_filepath, 'rb') as f_in, open(filepath, 'wb') as f_out:\n",
+        "    shutil.copyfileobj(f_in, f_out)\n",
+        "  os.remove(zipped_filepath)\n",
+        "  return filepath\n",
+        "\n",
+        "\n",
+        "def dataset(directory, images_file, labels_file):\n",
+        "  images_file = download(directory, images_file)\n",
+        "  labels_file = download(directory, labels_file)\n",
+        "\n",
+        "  def decode_image(image):\n",
+        "    # Normalize from [0, 255] to [0.0, 1.0]\n",
+        "    image = tf.decode_raw(image, tf.uint8)\n",
+        "    image = tf.cast(image, tf.float32)\n",
+        "    image = tf.reshape(image, [784])\n",
+        "    return image / 255.0\n",
+        "\n",
+        "  def decode_label(label):\n",
+        "    label = tf.decode_raw(label, tf.uint8)\n",
+        "    label = tf.reshape(label, [])\n",
+        "    return tf.to_int32(label)\n",
+        "\n",
+        "  images = tf.data.FixedLengthRecordDataset(\n",
+        "      images_file, 28 * 28, header_bytes=16).map(decode_image)\n",
+        "  labels = tf.data.FixedLengthRecordDataset(\n",
+        "      labels_file, 1, header_bytes=8).map(decode_label)\n",
+        "  return tf.data.Dataset.zip((images, labels))\n",
+        "\n",
+        "\n",
+        "def mnist_train(directory):\n",
+        "  return dataset(directory, 'train-images-idx3-ubyte',\n",
+        "                 'train-labels-idx1-ubyte')\n",
+        "\n",
+        "def mnist_test(directory):\n",
+        "  return dataset(directory, 't10k-images-idx3-ubyte', 't10k-labels-idx1-ubyte')"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "2zu1U9Nqir6L",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "First, we'll define a small three-layer neural network using the Keras API"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "x_MU13boiok2",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def mlp_model(input_shape):\n",
+        "  model = tf.keras.Sequential([\n",
+        "      tf.keras.layers.Dense(100, activation='relu', input_shape=input_shape),\n",
+        "      tf.keras.layers.Dense(100, activation='relu'),\n",
+        "      tf.keras.layers.Dense(10, activation='softmax')])\n",
+        "  model.build()\n",
+        "  return model"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "Wuqg3H8mi0Xj",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Let's connect the model definition (here abbreviated as `m`) to a loss function, so that we can train our model."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "W51sfbONiz_5",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def predict(m, x, y):\n",
+        "  y_p = m(x)\n",
+        "  losses = tf.keras.losses.categorical_crossentropy(y, y_p)\n",
+        "  l = tf.reduce_mean(losses)\n",
+        "  accuracies = tf.keras.metrics.categorical_accuracy(y, y_p)\n",
+        "  accuracy = tf.reduce_mean(accuracies)\n",
+        "  return l, accuracy"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "035tNWQki9tr",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Now the final piece of the problem specification (before loading data, and clicking everything together) is backpropagating the loss through the model, and optimizing the weights using the gradient."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "CsAD0ajbi9iZ",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def fit(m, x, y, opt):\n",
+        "  l, accuracy = predict(m, x, y)\n",
+        "  opt.minimize(l)\n",
+        "  return l, accuracy"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "PcVRIacKjSwb",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "These are some utility functions to download data and generate batches for training"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "RVw57HdTjPzi",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def setup_mnist_data(is_training, hp, batch_size):\n",
+        "  if is_training:\n",
+        "    ds = mnist_train('/tmp/autograph_mnist_data')\n",
+        "    ds = ds.shuffle(batch_size * 10)\n",
+        "  else:\n",
+        "    ds = mnist_test('/tmp/autograph_mnist_data')\n",
+        "  ds = ds.repeat()\n",
+        "  ds = ds.batch(batch_size)\n",
+        "  return ds\n",
+        "\n",
+        "def get_next_batch(ds):\n",
+        "  itr = ds.make_one_shot_iterator()\n",
+        "  image, label = itr.get_next()\n",
+        "  x = tf.to_float(tf.reshape(image, (-1, 28 * 28)))\n",
+        "  y = tf.one_hot(tf.squeeze(label), 10)\n",
+        "  return x, y"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "2zEJH5XNjgFz",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "This function specifies the main training loop. We instantiate the model (using the code above), instantiate an optimizer (here we'll use SGD with momentum, nothing too fancy), and we'll instantiate some lists to keep track of training and test loss and accuracy over time.\n",
+        "\n",
+        "In the loop inside this function, we'll grab a batch of data, apply an update to the weights of our model to improve its performance, and then record its current training loss and accuracy. Every so often, we'll log some information about training as well."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "UUI0566FjZPx",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def train(train_ds, test_ds, hp):\n",
+        "  m = mlp_model((28 * 28,))\n",
+        "  opt = tf.train.MomentumOptimizer(hp.learning_rate, 0.9)\n",
+        "  train_losses = []\n",
+        "  train_losses = autograph.utils.set_element_type(train_losses, tf.float32)\n",
+        "  test_losses = []\n",
+        "  test_losses = autograph.utils.set_element_type(test_losses, tf.float32)\n",
+        "  train_accuracies = []\n",
+        "  train_accuracies = autograph.utils.set_element_type(train_accuracies,\n",
+        "                                                      tf.float32)\n",
+        "  test_accuracies = []\n",
+        "  test_accuracies = autograph.utils.set_element_type(test_accuracies,\n",
+        "                                                     tf.float32)\n",
+        "  i = tf.constant(0)\n",
+        "  while i < hp.max_steps:\n",
+        "    train_x, train_y = get_next_batch(train_ds)\n",
+        "    test_x, test_y = get_next_batch(test_ds)\n",
+        "    step_train_loss, step_train_accuracy = fit(m, train_x, train_y, opt)\n",
+        "    step_test_loss, step_test_accuracy = predict(m, test_x, test_y)\n",
+        "    if i % (hp.max_steps // 10) == 0:\n",
+        "      print('Step', i, 'train loss:', step_train_loss, 'test loss:',\n",
+        "            step_test_loss, 'train accuracy:', step_train_accuracy,\n",
+        "            'test accuracy:', step_test_accuracy)\n",
+        "    train_losses.append(step_train_loss)\n",
+        "    test_losses.append(step_test_loss)\n",
+        "    train_accuracies.append(step_train_accuracy)\n",
+        "    test_accuracies.append(step_test_accuracy)\n",
+        "    i += 1\n",
+        "  return (train_losses.stack(), test_losses.stack(),  train_accuracies.stack(),\n",
+        "          test_accuracies.stack())"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "cYiUQ1ppkHzk",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Everything is ready to go, let's train the model and plot its performance!"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "K1m8TwOKjdNd",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "output_extras": [
+            {},
+            {},
+            {}
+          ],
+          "base_uri": "https://localhost:8080/",
+          "height": 988
+        },
+        "outputId": "f9d3eef3-5bea-45c1-ddf9-4edee73e4436",
+        "executionInfo": {
+          "status": "ok",
+          "timestamp": 1522345800262,
+          "user_tz": 240,
+          "elapsed": 52391,
+          "user": {
+            "displayName": "Dan Moldovan",
+            "photoUrl": "//lh5.googleusercontent.com/-Rneh8xjecyk/AAAAAAAAAAI/AAAAAAAACB4/c5vwsJpbktY/s50-c-k-no/photo.jpg",
+            "userId": "112023154726779574577"
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "with tf.Graph().as_default():\n",
+        "  hp = tf.contrib.training.HParams(\n",
+        "      learning_rate=0.05,\n",
+        "      max_steps=500,\n",
+        "  )\n",
+        "  train_ds = setup_mnist_data(True, hp, 50)\n",
+        "  test_ds = setup_mnist_data(False, hp, 1000)\n",
+        "  tf_train = autograph.to_graph(train)\n",
+        "  (train_losses, test_losses, train_accuracies,\n",
+        "   test_accuracies) = tf_train(train_ds, test_ds, hp)\n",
+        "\n",
+        "  with tf.Session() as sess:\n",
+        "    sess.run(tf.global_variables_initializer())\n",
+        "    (train_losses, test_losses, train_accuracies,\n",
+        "     test_accuracies) = sess.run([train_losses, test_losses, train_accuracies,\n",
+        "                                  test_accuracies])\n",
+        "    plt.title('MNIST train/test losses')\n",
+        "    plt.plot(train_losses, label='train loss')\n",
+        "    plt.plot(test_losses, label='test loss')\n",
+        "    plt.legend()\n",
+        "    plt.xlabel('Training step')\n",
+        "    plt.ylabel('Loss')\n",
+        "    plt.show()\n",
+        "    plt.title('MNIST train/test accuracies')\n",
+        "    plt.plot(train_accuracies, label='train accuracy')\n",
+        "    plt.plot(test_accuracies, label='test accuracy')\n",
+        "    plt.legend(loc='lower right')\n",
+        "    plt.xlabel('Training step')\n",
+        "    plt.ylabel('Accuracy')\n",
+        "    plt.show()"
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "Downloading https://storage.googleapis.com/cvdf-datasets/mnist/train-images-idx3-ubyte.gz to /tmp/autograph_mnist_data/train-images-idx3-ubyte.gz\n",
+            "Downloading https://storage.googleapis.com/cvdf-datasets/mnist/train-labels-idx1-ubyte.gz to /tmp/autograph_mnist_data/train-labels-idx1-ubyte.gz\n",
+            "Downloading https://storage.googleapis.com/cvdf-datasets/mnist/t10k-images-idx3-ubyte.gz to /tmp/autograph_mnist_data/t10k-images-idx3-ubyte.gz\n",
+            "Downloading https://storage.googleapis.com/cvdf-datasets/mnist/t10k-labels-idx1-ubyte.gz to /tmp/autograph_mnist_data/t10k-labels-idx1-ubyte.gz\n",
+            "Step 0 train loss: 2.244329 test loss: 2.2499208 train accuracy: 0.12 test accuracy: 0.161\n",
+            "Step 50 train loss: 0.64771986 test loss: 0.56013924 train accuracy: 0.82 test accuracy: 0.836\n",
+            "Step 100 train loss: 0.49011207 test loss: 0.42143965 train accuracy: 0.84 test accuracy: 0.879\n",
+            "Step 150 train loss: 0.3768609 test loss: 0.39319593 train accuracy: 0.88 test accuracy: 0.883\n",
+            "Step 200 train loss: 0.36007702 test loss: 0.37089333 train accuracy: 0.9 test accuracy: 0.881\n",
+            "Step 250 train loss: 0.182115 test loss: 0.28543878 train accuracy: 0.94 test accuracy: 0.915\n",
+            "Step 300 train loss: 0.2119576 test loss: 0.22305593 train accuracy: 0.92 test accuracy: 0.93\n",
+            "Step 350 train loss: 0.12932214 test loss: 0.29057172 train accuracy: 0.96 test accuracy: 0.906\n",
+            "Step 400 train loss: 0.22937602 test loss: 0.2200287 train accuracy: 0.92 test accuracy: 0.925\n",
+            "Step 450 train loss: 0.23444137 test loss: 0.19857481 train accuracy: 0.94 test accuracy: 0.94\n"
+          ],
+          "name": "stdout"
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAe8AAAFnCAYAAACPasF4AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAIABJREFUeJzs3XmAFNW9Pvynlt5mYdhmQMHggnGN\nS9zCD0ElKug1edUY9ZoQTYze3GuiRk1uYjRqRHNj4n5NrhKjiUYlbihGQFRUFDSoKIvgICAO6+xL\n711V5/2jlq7qZaZnpnumZ3g+/zjTXV1dXSP91PecU+dIQggBIiIiGjLkwT4AIiIi6h2GNxER0RDD\n8CYiIhpiGN5ERERDDMObiIhoiGF4ExERDTEMb6JeOOigg3DllVdmPf6rX/0KBx10kGe766+/3rPN\ne++9h9mzZwMAtm3bhkMPPdR57osvvsCPfvQjzJw5EzNnzsTZZ5+NV199FQBw0003YdasWZg1axYO\nO+wwnHLKKc7v4XDY8x7JZBLz58/v9edavXo1Lr300oK2XbBgAebMmdPn97J19/rZs2fjhRde6PO+\niYY7hjdRL3366aee0Ewmk1izZk3WditXrsQnn3xS0D6vu+46TJs2DYsXL8bixYtxyy234LrrrsPO\nnTtxyy23YNGiRVi0aBHGjRuH3//+987vVVVVnv188sknfQrUI444Ag8//HBB2y5fvhxTpkzp83vZ\n+vt6oj0Zw5uol0444QQsWbLE+f3tt9/GV77ylaztrrnmGtx+++0F7bO+vh5HHnmk8/uRRx6JxYsX\nY/z48QUfV3NzM3784x/jo48+wkUXXQTAbAF48MEHMXPmTOi6jlWrVuHcc8/FrFmzcOaZZ2L58uUA\nzFaB0047DQBw//334ze/+Q2uuOIKfP3rX8d5552HxsZG533ee+89HHzwwVnv9cEHH+Bb3/oWTjvt\nNJx//vloaGgAAOzevRsXX3wxzjzzTJx66qm4++67cx5rPu+99x7OOecczJo1C9/+9redC6Vc++3u\ncSEE/vd//xczZ87EKaecgjlz5kDXdQDAwoULcdZZZ+GMM87AN77xDbz33nsFn3eiwcDwJuqlM844\nAy+99JLz+z//+U/MmjUr53ZCCCxatKjHfU6fPh1XXnkl/va3v2HTpk0AgHHjxkGSpIKPa+zYsbjm\nmmtw1FFH4YknnnAeF0Jg8eLFUBQFv/71r3HppZdi0aJFuPzyy3HTTTfl3NeiRYtw/fXX49VXX8WY\nMWPw7LPPAgA2bdqE2tpaTJgwwfNe4XAY//mf/4lrrrkGS5Yswfe+9z1cddVVAIBHH30Uxx13HF5+\n+WUsWLAADQ0NMAwj57FmikQiuOqqq3DDDTdg0aJF+OEPf4jrrrsOhmHk3G9jY2Pex1944QUsWrQI\nzzzzDJYsWYKGhgY8+eSTAIBbbrkFDz74IBYuXIibbroJr7/+esHnnWgwMLyJeun444/Hxo0b0dLS\nglgshlWrVmHKlCk5t73++uvxhz/8AYlEott9/v73v8d3vvMdLFiwAGeddRZmzJjhBEt/nXzyyc7P\n8+fPxxlnnAEAOOaYY5zqONOxxx6LCRMmQJIkHHLIIdi5cycAYMWKFTk/6wcffIBx48Zh6tSpAICz\nzjoLX3zxBXbs2IExY8bg7bffxvvvvw+/34+77roLdXV1BR376tWrMX78eBxzzDEAgJkzZ6KtrQ3b\nt2/Pu998jy9duhTf+ta3UF1dDVVV8e1vfxuvvPIKAGDMmDF46qmnsH37dhx77LH45S9/WdjJJRok\n6mAfANFQoygKTj/9dCxcuBCjR4/GiSeeCFXN/U/psMMOw3HHHYdHHnkERx99dN59BgIBXHrppbj0\n0kvR2dmJRYsW4fbbb8fEiRMxbdq0fh3vyJEjnZ8XLFiAv/3tb4hEIjAMA/mWNqiurnZ+VhTFaV5+\n5513cMkll2Rt39nZiYaGBk8LhN/vR2trKy655BIYhoFbbrkFjY2N+M53voOf/OQnBR17a2srRowY\nkXVsLS0tefeb7/Guri48/PDDmDdvHgBA13WMHj0aAPCnP/0Jf/rTn3Duuedir732wvXXX4/jjz++\noGMkGgwMb6I+OPPMM3H33Xdj1KhRPfbZ/vSnP8W5556LiRMn5ny+tbUV69evd6rWESNG4Pzzz8ey\nZctQX1/f7/C27d69GzfccAOefvppHHLIIfj8888xc+bMgl+vaRrWrFmT8yKkrq4O+++/P5577rmc\nr7388stx+eWXY8uWLbjsssucSronY8aMQXt7u/O7EAIdHR0YM2YMVFXNud+pU6fmfLyurg4zZszA\nd7/73az3+dKXvoTf/va3MAwD8+fPx7XXXotly5YVeGaIBh6bzYn64Oijj0ZjYyM2btzYY4VWV1eH\n73znO7j//vtzPh+Px3HllVd6wmLr1q34+OOPceyxx/bquFRVRTgczllRt7a2oqKiAvvvvz80TXMq\n0EgkUtC+V69ejYMOOgh+vz/rvY488kg0NTXh448/BgA0NDTgZz/7GYQQ+PWvf4133nkHgBmSY8eO\nhSRJ3R6r7YgjjkBzczNWrVoFwBxfMH78eEycODHvfvM9/vWvfx0vvPACYrEYAOCpp57C888/j9bW\nVnz/+99HOByGLMs48sgjezXWgGgwsPIm6gNJknDaaachFotBlnu+Bv7BD36Ap59+Oudze++9N/70\npz/hvvvuw5w5cyCEQFVVFX75y196RqAX4phjjsEf/vAHTJs2DW+++abnuYMPPhjTp0/HzJkzMWbM\nGPziF7/Ahx9+iNmzZ+O///u/e9y3fYtYvve67777cOuttyISicDn8+Gqq66CJEm48MIL8etf/xq3\n3norhBCYMWMGpkyZgh07dnheryhK1ntWVFTgnnvuwa233opoNIrRo0fjrrvu6na/I0eOzPk4AGzc\nuBHnnHMOADPYb7vtNowePRrTpk3Dt771LSiKAp/Ph9tuu61X551ooElcz5uIiGhoYbM5ERHREMPw\nJiIiGmIY3kREREMMw5uIiGiIYXgTERENMUPmVrGmpq6i7m/UqAq0tUWLus89Ec9j//Ec9h/PYXHw\nPPZfsc9hbW11zsf32MpbVbPvKaXe43nsP57D/uM5LA6ex/4bqHO4x4Y3ERHRUMXwJiIiGmIY3kRE\nREMMw5uIiGiIYXgTERENMQxvIiKiIYbhTURENMQwvImIaNh6443XCt723nvvxI4d23vc7sMP38cN\nN/y8P4fVbwxvIiIalnbu3IFXX11c8PZXXXUt9t57QgmPqHiGzPSoREREvXHXXb/D+vXr8Mgjc2EY\nBnbs2I6dO3fgnnv+iN/+9jdoampELBbDD35wOaZOnYYf//hyXHPNz7F06WuIRML44out2L59G668\n8lpMmTI153u89toSzJv3dyiKgoMOOgS33XYL6us34M47fwefzwe/349bbvktdu7cnvVYdXXuqU8L\nsceGd0c4gfc3NOLYg+sG+1CIiIa9f7z+GVZuaCzqPo87uA7nz5ic9/l///fZeO65f+D7378MDz/8\nIDQthT/+8c9oa2vF8cd/DWeccRa2b9+GG2/8BaZOneZ5bWPjbvzhD/fh3XeX44UXns0Z3tFoFA89\n9AAeeeQJVFRU4Oc//yneffddvPzyyzjnnPMwa9a/4YMPVqK1tQUvv7wg6zGGdx9ceecbaO2M46ZL\njsOk8X0/gURENDQccshhAIDq6hFYv34dXnzxOUiSjM7OjqxtjzjiKABAXV0dwuFwzv01NHyBiRO/\nhIqKCgDA0Ucfg/Xr1+PEE0/CH/7wP2ho+AJf//ppmDRp35yP9cceGd5b23YiPOFNSMnD0dwRZ3gT\nEZXY+TMmd1slDwSfzwcAWLJkETo7O/HAA39GZ2cnfvjD2VnbKkp6gREhRM79SZL3OU1LQZJCOPbY\n4/HnP/8Ny5cvw5w5N+PHP74652Nf/eqxff4se2R4f7ztCyjVbTBG70RLZ3ywD4eIiEpAlmXoup71\neHt7O/baa2/Isow333wdqVSqT/vfZ59J2LbtC0SjEVRUVGLVqg9x1VU/xrPPzsOUKSfi9NPPgBAC\n9fUbsGXLpqzHGN69dPykA7G4CZArO9DSwfAmIhqOJk3aD59+ugH33XcnKiurnMdPPnkGfvGLa/DJ\nJ2vxb//2TdTV1eGRR+b2ev+hUAhXXHEVrr32J5AkGUcccRSOPfZY7NzZghtv/AWqqqrg8/lw/fU3\nob7+06zH+kMS+doDykxTU1dR93fjit+ipTOCQyLn4yfnHlHUfe9Jamuri/632dPwHPYfz2Fx8Dz2\nX7HPYW1t7m7dPfY+7y+P2Q+SL4mmcOtgHwoREVGv7LHhPbFmPACgLdk2yEdCRETUO3tseI8JjQIA\nxBFGPKkN8tEQEREVbs8N74rRAADJH+egNSIiGlL22PAeW2FW3pI/xtvFiIhoSNljw3uME96svImI\naGjZY8M75AvCLwcg+eNoZuVNRDQs9WZJUNtHH32ItjbvnUjlsAyo2x4b3gAwMlDDypuIaJjq7ZKg\ntn/+88Ws8C43e+QMa7a6ijFojDWiqSt7UnoiIhra3EuCXnDBRbj99lvQ1dUFXddx9dU/w+TJB+Lx\nxx/Fm28uhSzLmDp1Gg455FAsW/YGtmzZjDlz7sD48eOz9pu5DOjVV1/nLANaWRkCIJdkGVC3PTy8\nxwItQKfWPtiHQkQ0rD332UtY1bimqPs8uu4rOHfyWXmfdy8J+uijf8YJJ/w/fOMbZ2PLls24994/\n4J57/oinnnoc8+cvgqIomD//WRx33NcwefKXcc01P88Z3LmWAf3ww/fx1ltLcc4552H27AuxaNHr\nJVkG1G2PDu/a0FgAQAysvImIhrM1a1ajvb0Nixe/DABIJMzu0pNP/jquvvq/cNpps3D66bN63E+u\nZUDr6zc4S362tOzClCknlWQZULc9OrzrKszwTildMISALEmDfERERMPTuZPP6rZKLjWfT8VPf/oz\nHH64dy2L6677JbZu/Ryvv74EP/nJf+Chh/7a7X5yLQMaCAScJT/XrFlZsmVA3fboAWt25Y1gFNE4\nZ1kjIhpO3EuCHnro4XjrrTcAAFu2bMZTTz2OcDiMRx6Zi0mT9sX3v38ZqqtrEI1G8i4lCniXAQWA\nVas+xEEHHYpnn52Hzs4OfPOb38QFF1yE+voNzmOnn36G81ix7NGV96hgDSQhQw5EEYmnUBXyDfYh\nERFRkbiXBP3hD3+E2267Gf/1Xz+EYRi4+urrUFVVhfb2Nlx22fcQClXg8MOPwIgRNTjqqK/ihhv+\nG7/97Z3Yf/8DPPvMtQzokUcehVgsihtv/AVGjaoBIJdkGVC3PXZJUHvZtp++fgvicQM/O+pa7L/3\niKK+x56ASwj2H89h//EcFgfPY/9xSdABEpCCkNQUIvHUYB8KERFRQfb48A4qIUiqhs4oJ2ohIqKh\nYY8P7wo1BABoj4YH+UiIiIgKs8eHd6XPvFevIxEZ5CMhIiIqzB4f3iMClQCAjjjDm4iIhoY9PrxH\nVZgj+Xa1tw3ykRARERVmjw/v0RXm7WE7OjrQHk4M8tEQERH1bI8P70qfOWBNUpNYvallkI+GiIio\nZwxvn9nnDSWFpvbY4B4MERFRAUo6Peodd9yBDz74AJqm4T/+4z9w+umnO88tX74cd911FxRFwfTp\n03HFFVeU8lDysm8Vk9QUWjvZbE5EROWvZOH97rvvYuPGjZg3bx7a2tpwzjnneMJ7zpw5ePjhhzFu\n3Dh897vfxcyZMzF58uRSHU5eITVo/qBoaOviRC1ERFT+Shbexx13HI44wlx6bcSIEYjFYtB1HYqi\noKGhATU1Ndhrr70AACeddBJWrFgxKOHtV/wAAJ9foK2NlTcREZW/koW3oijOYuXPPPMMpk+fDkVR\nAABNTU0YPXq0s+3o0aPR0NDQ7f5GjaqAqipFPcba2mqM1M3K2+8XaI8kMXZsFSSu690r+SbOp8Lx\nHPYfz2Fx8Dz230Ccw5IvCfrqq6/imWeewV/+8pd+7aetLVqkIzLZK78IISBLMiTFQCKpY+u2NlQG\nuTRoobgKUf/xHPYfz2Fx8Dz237BYVWzZsmX4v//7P8ydOxfV1ekDqKurQ3Nzs/P77t27UVdXV8pD\nyUuSJPhlP2TVXHi9jYPWiIiozJUsvLu6unDHHXfgwQcfxMiRIz3PTZw4EeFwGNu2bYOmaVi6dCmm\nTp1aqkPpkV/xAbIZ3h2R5KAdBxERUSFK1mz+8ssvo62tDVdffbXz2AknnICDDjoIp512Gm6++WZc\ne+21AIAzzzwT++23X6kOpUd+xY9kyhxpHo5xXW8iIipvJQvvCy64ABdccEHe54877jjMmzevVG/f\nKwHFjw6YS4IyvImIqNzt8TOsAYBf9kMXZmhHGN5ERFTmGN4w+7wNGIBksPImIqKyx/BGeqIWyDrC\ncYY3ERGVN4Y3zD5vAGZ4s/ImIqIyx/CG2ecNAKrPYJ83ERGVPYY3rPu8AYRCEitvIiIqewxvpPu8\nQyEgHNMG+WiIiIi6x/BGus87GABiCQ26YQzyEREREeXH8Ea68g5YS3tH46y+iYiofDG8Afhls89b\nVc2KO57UB/NwiIiIusXwRrryllUBwGw6JyIiKlcMbwABJQAAzrKgrLyJiKicMbwBhFQzvCXVrLjj\nSVbeRERUvhjeAIKKNVJNNu/xjiVYeRMRUflieAMIWpW3IZkVd4yVNxERlTGGN4CQGgIAGJJZecdZ\neRMRURljeAMIWgPWdCQBsM+biIjKG8MbgCqrUCQFmhXe7PMmIqJyxvAGIEkSgmoAKWGFNytvIiIq\nYwxvS1AJImkkAABxTtJCRERljOFtCaoBJHQrvDlJCxERlTGGtyWkBpHQk1BkNpsTEVF5Y3hbgkoQ\nAgKBoOCtYkREVNYY3hZ7opZgCIiyz5uIiMoYw9sSVM0pUitCQCSWGuSjISIiyo/hbQlZ85sHQwJJ\nzUAixaZzIiIqTwxvi115B4IGAFbfRERUvhjeFrvP2+c3wzvM8CYiojLF8LbYzeYqw5uIiMocw9ti\nV96yz+zrZngTEVG5YnhbglblLavmbWIMbyIiKlcMb4tdeUNheBMRUXljeFtC1mhzQzJDm+FNRETl\niuFtCTK8iYhoiGB4W+w+b3tNb85vTkRE5YrhbfHJKmRJdtb0TunGIB8RERFRbgxviyRJCClBZ01v\nneFNRERliuHtElQDiGlxKLLEypuIiMoWw9slqAYR1xJQFRmaJgb7cIiIiHJieLsErWZzRQE0Vt5E\nRFSmGN4uITUAAQHVLxjeRERUthjeLj7Fb/5XNRjeRERUthjeLn7ZBwCQVYGUzj5vIiIqTwxvF5+s\nAgAU1YCm9b/ybutK4MEX16G5I9bvfREREdkY3i4+xay8FaU4fd5PvFqP9z7Zjb8u3NDvfREREdkY\n3i4+u9ncZ0ArQrN5PKl7/ktERFQMDG8Xu89bUQwYQsAw2O9NRETlh+HtYjebS4rZZM5Z1oiIqBwx\nvF2c0eayGdq8XYyIiMoRw9vF7vO2K+9i9HsTEREVG8PbxWk2tyvvItwuRkREVGwlDe/6+nqceuqp\nePzxx7OemzFjBi666CLMnj0bs2fPxu7du0t5KAWxK2/I5ujwfjebC1buRERUfGqpdhyNRnHrrbdi\nypQpebeZO3cuKisrS3UIvebPCG8OWCMionJUssrb7/dj7ty5qKurK9VbFF1Ws3mxwlsqzm6IiIiA\nElbeqqpCVbvf/U033YTt27fjmGOOwbXXXgtJGtyUs6dHFZLdbM5mbyIiKj8lC++eXHnllZg2bRpq\nampwxRVXYPHixZg1a1be7UeNqoCqKkU9htraas/vcf9IAIBqLi6Gqqpg1ja94fObp9enKv3aT7kb\nzp9toPAc9h/PYXHwPPbfQJzDQQvvs88+2/l5+vTpqK+v7za829qiRX3/2tpqNDV1eR4Lx1IAgJSW\nBAA0t4TRVBPo83ukkpq1Pz3rvYaLXOeReofnsP94DouD57H/in0O810IDMqtYl1dXbj00kuRTJoh\nuXLlShx44IGDcSge9mhzQ+KANSIiKl8lq7zXrl2L3/3ud9i+fTtUVcXixYsxY8YMTJw4Eaeddhqm\nT5+OCy64AIFAAIceemi3VfdA8St2n7dZMevs8yYiojJUsvA+/PDD8dhjj+V9/uKLL8bFF19cqrfv\nE6fyBitvIiIqX5xhzUWRFEiQYMCsvDnDGhERlSOGt4skSfApPqfy7uk+7x3hXXjsk38grsUH4vCI\niIgADOJo83Lll33QhTVKvIc+7/s+eghdyTDGVdTi9H1PGYjDIyIiYuWdKagEkDQSAAC9m8p7W2MY\nXckwACBpJAfk2IiIiACGd5bairGIGREEv/oqtic3593ulfcbnJ8lzn9KREQDiOGdYXyFORe7pGpY\nrb2af0N3i/ogT+tKRER7FoZ3hnGV6YVUVPjzbifAe8CJiGhwMLwzjK+oTf8iCquoZTabExHRAGJ4\nZxhfOc75OYEINEPLvaGn8GZ4ExHRwGF4Z6j2V+EHX/4h9I4xgCTQGm/r8TXs8iYiooHE8M5h/5pJ\nMLpGAQCaYq05t/GMV8tTebNXnIiISoHhnYOqSBApc7BaLJV7KVLhSmbeKkZERAOJ4Z2DqsiAYU4+\nl8g7AYsnvYmIiAYMwzsHVZEhDAUAkNBzh3chzeZERESlwPDOQVUkQDfDO5knvHuD4U5ERMXE8M5B\nkiQo1pot21o6cm8kvNsTERENFIZ3Hgp8AICV9TuwsyWS9TxHkhMR0WBheOdhhzdkHZ2R7pvO2SxO\nREQDieGdhyqlwzsX4bpXzBD5lw4lIiIqNoZ3HnZ4S0qe6VFd3EFORERUagzvPHyKz5yIRdaR1Lqv\nrA2w8iYiooHD8M5DlRXAUCApOpKp7KZzd7HNZnMiIhpIDO88fKp1r7esI5nqofJmszkREQ0ghnce\n5ixrKiRFQ0LLUXm7f2blTUREA4jhnYeqyAVX3nqe8GZBTkREpcDwzkOWJXN+c1lHIpljxLn7VjEO\nWCMiogHE8M7DMIQ5YE0WSGiprOe9zeY9lNicw4WIiIqI4Z2HYQhAN+c3j2mJ7rdlnzcREQ0ghnce\nuiGcZUFjqXj2Bp5bxdi5TUREA6eg8F67di2WLl0KALj77rtx8cUX4/333y/pgQ023RAQyQAAIKKH\nu92Wfd5ERDSQCgrvOXPmYL/99sP777+PNWvW4MYbb8R9991X6mMbVIYhIBIhAEBMdGU9z1vFiIho\nsBQU3oFAAPvuuy9ee+01nH/++Zg8eTJkeXi3uJuVdzfh7VmYhM3mREQ0cApK4FgshoULF+LVV1/F\niSeeiPb2dnR2dpb62AaVIQREMggASCLXet7pwK7f1pZzxDkXLCEiolIoKLyvueYaLFiwAD/96U9R\nVVWFxx57DJdcckmJD21w6a5m86Scq8873VS+uy2CpvZY9hZ2djPDiYioiNRCNvra176Gww8/HFVV\nVWhubsaUKVPw1a9+tdTHNqgMwwAMFUJToSvRrOfdlTckkQ5q9zZW5c0KnIiIiqmgyvvWW2/FwoUL\n0d7ejgsvvBCPP/44br755hIf2uD60rhqAIBIhKCrkawAzry3O3ezub1taY6RiIj2TAWF9yeffIJv\nf/vbWLhwIc455xzcc8892Lp1a6mPbVBdcsbB+N7Mg+DTqwFZR0cyo49fSieyxMqbiIgGUEHhbYfP\nG2+8gRkzZgAAkslk6Y6qDFQGfTj56AkIiBEAgMZok+d54b63WxI5VyGxA53ZTURExVRQeO+33344\n88wzEYlEcMghh2D+/Pmoqakp9bGVhZAwP+fOsDe8vROziJwBzcqbiIhKoaABa3PmzEF9fT0OOOAA\nAMDkyZNxxx13lPTAykW1MgotALZ37fY8nll557rXO+lrARSJfd5ERFRUBYV3PB7H66+/jnvvvReS\nJOGoo47C5MmTS31sZWGkbzSAXM3mmaPNvQm9qf1ztO31OvwVYyBaTy71YRIR0R6koGbzG2+8EeFw\nGBdeeCHOP/98NDc344Ybbij1sZWFmmAVhACi1uIkH3zaiBfe3gJkNJvruje869s2AQCUmhb2eRMR\nUVEVVHk3Nzfjrrvucn4/5ZRTMHv27JIdVDmpCKpAVIJm6ACAB55fCwA4cLLrukcS6EqGcdt7D+Oc\nyf+GQ8cchNZ4KwBApHzs8yYioqIqeHrUWCw9g1g0GkUi0f0a18NFZVAFhAxN1z2Pp9y/S8DqjlXY\nEdmFBz5+GADQEm8DAIhkiH3eRERUVAVV3hdccAHOOOMMHH744QCAdevW4aqrrirpgZWLiqAPEBI0\n4Q3vpK65fhMQGQndaoe3prLyJiKioioovM877zxMnToV69atgyRJuPHGG/HYY4+V+tjKgll5S9AN\n74xqKS0d3lLGgDUhhFN5QzYY3kREVFQFhTcA7LXXXthrr72c31evXl2SAyo3duWti+6azYUnoBN6\n0pk+VZJ1DlgjIqKi6vOi3HtKNVkZVCGEbC5U4pIy3GEunAFtABDX4+mnFH2POVdERDQw+hzekiQV\n8zjKVkVQBSBBhze8tYzKO2GkB/DFtHR4S7LOAWtERFRU3Tabn3TSSTlDWgiBtra2kh1UOamw+ryF\n8PZdp3QNAdd2yTzhzT5vIiIqtm7D+4knnhio4yhbiixDEjIMpKC5J2KR3TOsGXkrb7DPm4iIiqzb\n8J4wYcJAHUdZkyUJAgZSWrqpXJJdt4pJQMod3qmoazsDBpjeRERUPH3u8y5EfX09Tj31VDz++ONZ\nzy1fvhznnXceLrjgAjzwwAOlPIx+kyADEEhprn5v1R3eAkmRDu+2RIfn9ULSQEREVCwlC+9oNIpb\nb70VU6ZMyfn8nDlzcP/99+PJJ5/EO++8g88++6xUh9JviiRDSAaSrvD2VN4QSBnp9c2d8DbM0ysk\n721mRERE/VGy8Pb7/Zg7dy7q6uqynmtoaEBNTQ322msvyLKMk046CStWrCjVofSbLCkABOJJVwgr\n3klaUsIV3vF28wctCAAQYOVNRETFU7LwVlUVwWAw53NNTU0YPXq08/vo0aPR1NSUc9tyoMgyJFmg\nI5JuGpdUb+Wtwd1sboV3yhwf+2w3AAAgAElEQVSPzsqbiIiKqeAZ1gbbqFEVUFWlqPusra0uaDtV\nMU+TUFzXOlblLTQVkj/puQu8PWk1m1vhDVkv+L2GouH82QYKz2H/8RwWB89j/w3EORyU8K6rq0Nz\nc7Pz++7du3M2r7u1tUW7fb63amur0dTUVdC2spABCdi2M31vu2SHt+5zqvB9qvZGQ3gHuhJh87lU\nABIAQ9IKfq+hpjfnkXLjOew/nsPi4Hnsv2Kfw3wXAiUdbZ7PxIkTEQ6HsW3bNmiahqVLl2Lq1KmD\ncSgFUWTzNHVE0/3akDXz/m3NvP6pwlhMm+AdnCecZnP2eRMRUfGUrPJeu3Ytfve732H79u1QVRWL\nFy/GjBkzMHHiRJx22mm4+eabce211wIAzjzzTOy3336lOpR+UxUF0IHOqGvaU1UDdBWQzHu4fQhA\nldOn0y/7ENet5nb2eRMRURGVLLwPP/zwbpcNPe644zBv3rxSvX1RqbIV3rH0oDQoGoSuOn3fighA\nkdN98j7Fh6iumE0bDG8iIiqiQWk2H2pUK5S7oq7R5opZeUtOePuhSunwViU1fZ+3zGZzIiIqHoZ3\nAXyKGcrhuN3nLZzK2x6spghvs7kqqxCaFeYyK28iIioehncB/NatYl0xK7xlA5IkzD5v2A/5PM3m\nqqxA6NbvisaVxYiIqGgY3gXwWfeX68KqoJ3bxNzh7Tebyi2qpMIwrN9lnUuTEBFR0TC8C1Dh91k/\nmRHszGvuCm9J+Jy+cQBmFW5V3pJVeXdGk7j/2dXY1hgekOMmIqLhieFdAL/PCm/JmkdNsSpwwzXj\nm65mNJur6cpcMdf0/ufyrVi1sRn3Pbt6AI6aiIiGK4Z3ARTJOk2SgCSlK293szkMNavZ3A53STYr\nb3s98GSKA9iIiKjvGN4FUOxbwCSByqAPvoDVg+2qvCXd22yuuprNoegw2OlNRERFMmQWJhlMslV5\nS5JAZciHsGrAACB0BYmNR0EZ2QRZqvbcKqZIKgAZQpedypuIiKgYWHkXIN1sbuDEr4yHL2D1fRsq\njLbxSG35CoRhB7b9GsXZxu7zdkjSwBw4ERENSwzvAshWEP/7qZNx5tcmweczk9i5jxuArouMZnPV\n2UbKvM+bVTgREfUDw7sAduU9fmwIkiRB8dmVtyu8hchoNndV3jL7vImIqHgY3gWQrSVBDWGGtqxa\no8Vdo811XXjmNreb0IWuAIoGwzDSO2SzORER9QPDuwB2Fa1b4S1Z93m7m80NQzgD2wCkg1xXIUlA\nyuDiJEREVBwcbV4AO5S/6NyGz9o3A0rKfMJwVd6GAclVUctOs7n537iWXguceuetj3dgQm0lDti7\nZrAPhYioLDC8C2D3eS/e+joAQLZWGfMMWDPSk7CYr/Fuc+fqe3AELhqQ4x1OYgkNjy7cAAD4yy9m\nDPLREBGVBzabF0B29WUDgJCyJ2nRDYHHXql3frfDW1LNKj2hJyBghntnJIkHnlsDg6POe6TpRs8b\nERHtYRjeBVAk72kSMMy7vQxvn/fGhnbXa8zntN2T0tsgXZl/UN+Enc2REh0xERENZwzvAmSGNwAr\nuNN93PGk7unztkebG51jobWMN3+Gd05znfeP9YhniIgoG8O7ALKsZD+oe4cLRGIpz+8KXK+xKnQD\n3hHnDO+esWeBiCgbw7sAco7KWxgZ/eAwB1c5r5HdK45Z94lL3srbYHj3iOdoePvX+t247I6l2N0a\nHexDIRpSGN4FyN9s7hV2Vd+K69TaQS/YbN5rDO/h7c8vfQLdEFi2eudgHwrRkMLwLkDmaHMATjXt\n5g7j5vZk+glhbtssbYLkT1cYKY6k7hFH5A9v/PMS9Q3DuwC5Ku9RVaFuX7P4vW3pX6yg362uQ/Co\nt5yHUymGd08Y3kRE2RjeBcjV5z1+VBWqQj4AQCjQw1w3OZrYAVbehWCzORFRNoZ3AZQczeaqrDrL\nfFZX+Lp9vcjRxA4AyZSe83FKY3jvGbhWD1HvMLwLIOf4ZnEv/1kdyg7vQyeNxrUXHoWvHTouu/KW\nNQACb3e8jHe2vwcAWPDOFsxd8ElRj3s4YHYTEWVjeBcgksq+jcW9/Gd1hT/r+ZHVfhy272jzOeE9\nzZI/DskfxxfJT/HEp88CAJ5ftgUr1u0q8pEPnLWbW7BibfGPn5U3EVE2hncBJo3YBwDw5VGTncfM\nZnPz5xGV6fAWmlmFj6kYCQCQ5exmcykQg+RPrzJmrxMOwGmKz2f+ss34+LPmPnyK0rrrHx9j7kvF\nbznggDUiomwM7wJU+6vwwIw7cOa+pzqPqa5Z1/yq7Axei6+ZisTGo3DUhP0BABUBNavZXArEIAVi\nzu+NkXQYdxdWndEkXnznc9z7zOr+faAS6unio7fKObxffGeLs+IZEdFAYnj3guIKbFVWPfNu71NX\nBQCoCYzAdbPOwKTx1QCAiqAv655wSUlB8qfD+4GP/+KsEa7p3YR3JJn3uXKR1Io7gr6cm83nL9uC\ntz7eMdiHMaSV8bUZUVljePeCu59blVQ4y2ZIwMGTRgEAamtCzs8AUBlSs/q8IQlP5d2aaIW692YA\ngN7N7WPhaCrvc+Wi2CPoyzm8iYgGC8O7F7Iqbye7JZxxwpfwzan74rJvHOp5TWXQlzUPOmTDCe+v\n7zPd3F9tAyBrWZV3Uk9i4ZZX0Z7oQGe0/CvvRLHD23U6/ufvH6KxPZZ/40HCC4y+4y1iRH3D8O4F\n9/3e7j5vSQJURcbZ0/ZH7UjvzGuVOZrNIRmQ/HEoIoBzDzwLB4a+AknVIPnj0DIq73/Uv4CXtryC\nFzctQke4/MM7WeRZ49x93vUN7Zj32sai7r8YONlO37HZnKhvGN69oHbT551PZUjN7vOWDEi+JFQj\nCAAwdOt5SUBzVXHtiQ6s2LnS+b0jo897xY6VWLBpUS8/RWkVu/IWGVVtOS7mknnBRURUagzvXvBU\n3pKCQtK7MuiDENnN5lBSkI0AAEDT0o+7+7zvWzU3/RJJdgashQLm/h7f8DQWbX0dulE+M7UVu887\nM6zLsYk6VeRBekREPWF490L2aHMzSLrrtzNvFcucpCUBSQIk3QzvlDUOTZIM6Fafd2ckieZYi/Oa\nqBZzKu+qjBndolr59AMnSthsnuv3cqAxvPuNfd9EvcPw7gXPaHO5h8VILLIsZd/n7TMnaJE0c3IX\n3S5WJQOaYQbBtX98C7rQceDIAwAAsVQMHZEEAMCvKp77qbuS4d5/mCJyH0vxR5tn/l4e4a27Dox9\n3n0nCup8IqJMDO9eUFyBrcqq606xHsoG4X1e8pshbM/GJgzreUkgkdTxxqrt0GWzyq5UK+BX/Ihq\nMcQTZjDqhkBcT8/QFklF+vyZisHdtF30Pu/MyrtMwlvT0sfR3b35ROXglZUNWL+1bbAPg4qI4d0L\n7nW9PQPWemzyywhvnxnMwqq8Dd16Xjbw7Fub8bfFn0JSzI5wvxxEhRpCTIs5FZ4hBLqS6cAO55h7\nfSC5w9tdeacMDXd/+Ces2LEy18sKkt1s3uddFZW72uaANSpniZSOp17biN8/uWqwD4WKiOHdC1kD\n1iw9ZXfdqFDOx42kWXk74S0Z2N5kNoFLqtkRHrDCO6rFnYFRhiEQTrnDe5Arb1d4ufu8t3Y24LP2\nLXh8w9N933eZjjZ3BzYHrFE5K5fWKiouhncvSK5RNYprkpae3Pz94zC+60QkPj3G83gqYTbD233e\nkiTgXApY06X65QBCaghxLY6UZm5oCOFpKh/sZnMtT+WtGVquzXsl84unXAasuQepsc+7H6w/Z+bY\nBiLqHsO7j3yyAvf0qN0J+lVMCnwZRkcthKv/OxaVYRgCuqvytkmqGXw+KYAKXxACAklh9pUbRmaz\n+WBX3q4+by0d3rmWUu2tzLDOvO97sLgvWDjavP/K5aJsOOK5HZ4Y3n2UOT1qTxTFOtVGeluR8iMS\nT0HX0n3e6RdYlbcUQIVaYT1vPmYIIJxKjzAPJ7NDMvMfbCKl4911u5zqvZjczebJZPrnLtcxLnrv\niz7tO/N7Rx/gLyJNN7BkZQOice+88u7AZp93/7Fpt3TKpauJiovh3UfmwiSmQu5R9dnh7VqkROgq\nWjsTcPLUGm0OpPu8VRFASA1ab2pW45l93pnN5l/s7sIPf7cUb3603XnsuTc346EFn2D+si0FfT63\ndVta8doH2/I+7xlt7ro4CLtuYVv4Xu/fF8jRbD7AX0TPv7UZT762EX9fUu953N1Uzmbz/mN4l065\ntFZRcTG8+6jQ+7xtimIlvHuFMV3BLY+uRDhid3obTsVsh7cCPypUc8CbZFXjhiE8TdI7Irs8s6wt\nX7sLAPDU6585jzU0dgEANu3o7NVxA8Cd8z7C35fU521+y9fn7b7/3JD7tiJa1gxrA/w9tHFbBwCg\ntTPheZwD1orD/nOyabd0mN3DE8O7j1RZ6dWiCnblLQz7vxKc028FuiS5m83NKlsRfgTUgPVYesBa\nXDPv8z669itoT3Rgbct656V2FSP7EqhvMwPcp5qj4/vTbJ6vOvKMNk+6wtvVIiAUb/gV/J5Z93kP\nbFC2dZnHPbI64Hnc22wuEI2nsO7z1gE9tuGEAVM6bNUYnhjefaRIhU2PalNVO6itjQ1X5S7Sk7TY\n7CpbNgLpJnopfatYzArv0yedAgB4a9sK57VOv/A+a3DvqofwcdNa+K33T/ajSszXt5tvkhZ35d3X\n8M5s8hvoUcntYfO4R1T4PY+nXIP0NM3AnfM+xp1PfYT6hvYBPb6hzv6nM9AXZW5CCCz9cBt2tQ7u\nfAmlwlaN4Ynh3UeqrOLkoycAAA760qgCtvc2m8vCHd7Wn8E1YE3yJyAMCbLwwWc10UtyepKWmBaD\nLBQ89sIuHDhyf2xo24hdkd3m7uzAC5lBMn/Ty/D5zPdI9WPu8XwDX9yjzd39v+5BdYbct+VMM9/S\nEAKabmTNvFYq9mfOfD8tY5KWLTvN7oimMlxvvJw5zeaD2POweWcnHnulHr+a++7gHUQJsfIenhje\nfeSTFXzntC/jjh9NwWH7ju5xe6fytprNZeSqvN3hHYNIhqDrrv512Wo2N4CYFofQfdi8vRMnjDfv\nH/+0bZP5vN1vnjJHqTdGmyGpZgWZ7EezuZ5jGlAhhGeeb/e0oRHXKHhD6Wt4Zw9Yu/z3b2DO3z7o\n0/56w90FkDkoTcszYI0LbPRNb6vD9zc04sEX1xWlqozEzC6q4VqgsvIenhjefaTKKmRJwtiRuWdP\ny9o+Y7S5e7S63Q9uN5srqg7Jn4RIhKDpBnyKtYqY5K6844BuTtEaUioBAM+/vRHN7bF0FaOkB4nF\nVXOFMvfgqriW7hMvRGbl3RZvx/ee+ylWtb0PqGY4u0MtYbgCW+1bs3n2gDXzd7vSLaXWrvT88cmM\nFgv3eXT/LGekdyyhIZbo/2Q1A03TDWzd1TVg79fb6vCP89fivU92Y3cBTd2vf7itV5/FMATunPeR\n526NoYyV9/DE8O4j91SphVCt0eb2JC2q5FrW0x6wZjWLT5xo7lskQkhpRlazOWCGt6GZj8swt4+m\n4nhx+efpK20lHRqblXcANenp835iwzO4d9VD+KhpbUGfQc+oPjd1fI6ElsCyliUIHvkG4Is74W0I\nA5qhQYHVV9zHyjuzz3sgFwGx108Huq+8NU/l7Q3vK+5+C1fc/VaJjrB0HlrwCW55dOWA9eH3tTp0\n5k/Io7E9hsdfqcctj+afXz+ztaSxPYZ1W1rx10Wf9umYyg2ze3gqaXjffvvtuOCCC3DhhRdi9erV\nnudmzJiBiy66CLNnz8bs2bOxe/fuUh5K0Vz+le/hrP1metb2LoRdeUtWda1KKn71PWu61IxmcyVo\n9puKRAU03Ug3m9vN6rIBXegwUnZ4p5/fvKPTudIWcgp1FWMBAElEoY773FMl2iPUN1rN7T3pbrIH\nSTGgjGx0giypm8EXRJV1Avp2q1jSSHpaEIq95Gh3uqLp982cRU3zDFhzDTTsY7N5S6wVN7xzOza0\nbuzbDors/Q2NAFBQZVsMfa0OMy8oMxXy/0vmn2y4dX2w8h6eShbe//rXv7B161bMmzcPt912G267\n7basbebOnYvHHnsMjz32GMaNG1eqQymqI2sPxxn7fb3Xr3Oaza3wliDjgL1roMiS0w/ujDb3m1+Y\nduWdsjPErrytMBO6VZFb64VLio4dzRGzipEMQNYxOjAKFx30LfP5jACt9JnN7YVOr6plfAnYI95t\nysgmZxR2QrdmiDMqrffuW+W9UnseoWNegz20qbezRdW3fYZH1j2BVB/mWe+KuirvjLECqTxzm/e1\ngnz1izfRlmjH3DWPFbS9EAJPLKnHui2lvT2tKuTreaMi6Gu+9HSPfUE5LHX765DHPu/hqWThvWLF\nCpx66qkAgAMOOAAdHR0Ih8M9vGr4UuzR5s7tZVbftyJD2KPN7T5t1aq8k0GkdAML3ramFrUGrNnL\nhcIKbwjF83wkrjkBH1KDOGj0gZ7nbVU+c0BbOJk7vNvi7Xh03ZOQrIuJzCrHvtf8lJFnw4hXQK5u\ncyrUlNXfrYgghC47y6D2VhhmOMk1zX16/b2rHsL7uz/CxwV2Dbh1uirvzJDwDNJzN6FrfWz+tVpy\nNFHYRcb2pghe/WAb7pz3UZ/erzvukfWZF2yl4q4OOyNJrN3ckndbz/H11I3ShzJ6uGXdnlh5G0Lg\nd3//EP9c8flgH0rJ9G6asF5obm7GYYcd5vw+evRoNDU1oaqqynnspptuwvbt23HMMcfg2muvzeov\ndBs1qgKq2rum6p7U1lYXdX/dGdlsNT/azeaKitraavhUGYmUt887FJKBlFlZ+/wqWtpTwCjXJC5W\neAvdrIpGjrA+hxXOmiGchU1GVY/AXnXWrWzW/u3PXREIAl1AzIjmPBfLPnkbK3evQuAIGfH3T0f1\niJBnu+QXZrhVh6ogkgHIwSg0w0BtbTVi7eaAMkX2QST9gJrM+R7vbVuFUcEafHns/p7Ho/EUKoLp\nqk8Zux1GR61nm978/Xyh3v+9U64vPUOSPK/3B9LHJrv6XYMVfmc7dytBT+8dCJj/FDVDK+g4O+Lp\nC7Fi/3/c1pluUQm5Pk8pqT7FeZ9fPLQEja1R3Hftydhv75qsbSOx9EVVVXXQeV2u44y7rrnyfY6a\n1phnm5he+N9tKGgKpy+cC/k8w+Ezh6NJfNrQjk8b2nHJN78y4O8/IP9mSv4Olsz7ZK+88kpMmzYN\nNTU1uOKKK7B48WLMmjUr7+vb2orb91ZbW42mpoEbTdvVZX1BWAEqdKCpqQuyLOWYpMX6YhYyOrvi\nUCUFCddrneZva8BaW6s5ktsO/65Iup9Y0hR0tVnPWzO0NTZ2QpIkdMbMintXuAlf7GyCX/Z5+vI7\nwzFnv1IwjJaWCJpC5nsmkjpefOdTqOOARFQ4rQApI4mmpi7s6jAHOukpCdD8kIKRrPNtCAN3vvMQ\nAOCBGXc4j2/Y2oY7nlyF807e32yokAClphkpyfBML9ubv19LR5dn+22NYUgSMKG2yrPdZ9s7cO/T\nH+Pq849EY0u6RSIWT3le3+EKuIireb2tPepsF0+mq+jujvWtxmVY9Nkbzu/23ydTMqXj3U9247iD\n69DWnv73UKz/j1es24VdLVEctl/61sfWtuiA/DuJu85vo9XPvnFLC6p82Y2Dja576Ztawmiq9uf9\n99zSkm7ty/c5OjLOZVNzz68ZSlpb0/8f9/R5Bvp7sVQiroWEBvrzFPsc5rsQKFmzeV1dHZqb002d\njY2NqK1NV05nn302xowZA1VVMX36dNTX1+fazbDh3ELk6vMGkNHnbQ1YU+1FjmVougG/Yo3Ylg0E\nfIrTbG73ecPwNpvHk5qzTYUagk/2eZ5/7q3NeGfNTqfPOqEncd1bv8b1Cx/CZ9s7nGN2z58uBSOe\npuKuWNJpAZAMn3MsQtagG4bTbA5dgdB8kBQdCc3bdJ7Qc98+ttIaLLVw5RanA1JSNchVfR/5HE15\nJ0/59V/+hRsf/lfWds8s/QyRuIZnlm5yms1HVPg8zea6YeCL3el/nO4+b/e98O6R/d01Xc5bu8Dz\ne0TLfaG6YPnneHThBjz56saSNO3OXfAJFiz/HM0d6XM1UPO25+qXzddk7668e1qOtZAxEpnvM9xW\n4ervx+mIJPHBp03FOZgBMtz+hrmULLynTp2KxYsXAwDWrVuHuro6p8m8q6sLl156KZJJ88t85cqV\nOPDAA0t1KGUhff+vNe847D5vKV1NWuEtK1Z1bshIaQYCavo+74BPTt8CZjWb6xrM6t0K51hCd6rz\nkBqCIiuQhAzJev6fK7bi4X+uzxpwFg5twd3/SPehulcFU0Y1oiWRHhxlGMK5QJCFz6m8JUWDpgkk\nrNHmwlAgUubFR1vcezUaTaXf372wiv1FLqvmY0I3L07kEd5+UPMiQcsK5lw6k7mvhA1hYHc0/cVk\nT6aj6Qa6oklUhXwI+BVPiK1YuxtrXQPFtDyD19yz2el5phAzRPbjLbHcg9B2tZih/vmu0t7j3tKR\n/ruUMrzdrXG5Lm7yjST3hHcPo80LGayV+d7D7YvffQ76MjPh//z9Qzzw/JohNfVvrgmlhpuShfdX\nv/pVHHbYYbjwwgsxZ84c3HTTTXjuueewZMkSVFdXY/r06c5tZKNHj+62yXw4kK0Ba3a/tV15T6yt\nAmA1nctWVW6FNwwFKV0gqKbv8w74FSek7cldNN2AJBSn2Tye0JyAt5cTlaB41wuHQEJPoNJeKxyA\n0FTPVbq78lZrt+PvDQ8imdJR39BuTlpih7er8oaiIaUbSOr2iHgZ0Mzwrm/x3pIW1dKh61772/4y\ntS8OjC6zGVcOeQc8aprAw2sfw8+W3YRIKuoJccMQePHtLc5kOJ3J3IG3cMur+M27v3fudbfvCtB0\nga5oCtUVPvhUb3hv3tHh2UfKM2DNtba5PUJd0pHQcg9Ei2vZrQ/5Rv/b/w/phijpl1OLq0uglMud\nunMkZ3jnCdGwK7x7Or5CgjgrvHvY519eXo///r/lPe63XLg/X19Gntu3Cw6lqX/zXSwPJyXt877u\nuus8vx988MHOzxdffDEuvvjiUr59WXEqbzugrdHml5xxMPbfewdeiSsw7AFp9n+FDE0zYOj23Oe6\n2Wwup8MdsKoj4Qp1pG/NspcTlYXqHW2uaBAQ2K9mknO/t4hXpudgR+4Q+cvL6/Gv9Y046/9NgqRo\nELoC3YCn8tZ1A0nDWr5UV2AkzGOYt+kZHD/hSAStVdJirvDuSHRiZMAcnGR/v8jWoDsRr4DQFUhB\n7/GkdANrms1j//mymwEAPzjsIhwz7ij8a/1uzH97C0LHCEABOhPp4HdXa+/sMCfvWLV7DY6qPdwJ\n70RKRziWwoSxlYgndU94B/2utdxlAy0j/gUlPgJSMIIW3Q/AHHyXTBmAZCB45FuYv6kT3z3sW1nn\nM7P1AzAHreVi37FgGKLHirM/8lXeiZSOrmgSY2sKm1WwJ+4gyZWx+YI3Ek+fn54uYgoZaZ35Pj0F\n/turd1rbGVDk8p/nyhPehkAP89rkNZRaJIbSsfZV+f+fN0yMqQlaP3mbzasr/Pi3KftClc1wkkc2\nIpwKQ7Kq8ZRuIJGy/keUzD5vJ9ytyjulGea93q5wloLm1fLY0Bjzd6E4zeZAetWyoBLElV/5sfmg\nrHtmrAqnIhgd9C668q/1Zn/0xoYOc1CcrkI3BISRWXlbzea6DL3xSxBJM7Cjrv5cd3gv3rrUqdad\nudntixFdhYhXWp/JfZtQdoDtipjHZ37BC2cZ1Q5X5e2e6tReS317s9msbs+E12atJmZW3rInxGLW\nQLTbLjsBoTFtSFR/Dv8Bq+GbsAmrxItOU3hS0yH545D8CWzsyD0NbVw3gzKoBHDKPicCQM570qOp\nmPP31Q0BrciVhbs5tdm1drm7JeGOJ1bh539a4Zl5rj/0HirCfBcoUdd0sz1V3oWFd+ZtgIV98Wd2\nKQgh8PFnzWU3Ha773PYn1IbSLWdsNqeiGVUdwG2XneBUywq8k18okgJJ1RD48ofYEdllzaomIaUZ\nSKYEhCFDkg34fa6QFq5lPo2McA5EASFhTMhscpZyVN4AsGZjJ3738GdQjRAgG051J4RAOBVBlTWR\nS6akpluVt2p++en2RDEaNF044W1oMiBk6O3mYEU7oAHvILKPm9bipS3mGIms6V11FUas0hz17k+/\nRtMMyJL3f2FP8Lmmh+1IdDrv51621J4AJ2m9zl533V6UpLrSD58qw3AtwBK3ngv61Zwz7dmfMakZ\nkHxmOLfEWz2f3WZX3idNnIqJVXtnfwaYs9XNee9ObAq+CkBYK6sV98vJHUSeytsVjvZ88u5m6/5w\nh0GuUMn3GfU83RQ5t+1L5a27jyv//jOPb/naXbj3mdX466INPb7nQHJ/hP5c8w2lanYoXWj0FcN7\nAO01phIHpk6D3joOU+r+n+e5zBHGftkHVTFHmyeSulllywZURXaazYWr2VwYCiRfylkARA5GIeuh\n9LzoIqMyt5qk7XlzzIsD3ak8E3oSmqGhUq1AcrN5n6Q9hzoAJDTdDEddhaYJT5/3X15ej664GZS6\nZq+mZr7WDnXAW3kDwEeNa8xNnT5vb+UNAFIo3XSe0DSnYjyq1jzGlGEHp56ezAaAgMD7uz92nks/\nYU9ba430z2hTHFFhhjeQDji7sgr6FShq9rdh0khCNwxsbwxD8iec998VzZ4C2J7oJqQGnb9VKiPk\nP2hcjY5kJ8LyLsjVbVafd3Er77hrBTXPimnWZ3avsFasL3F3tZ85h735Prk/o2dq2j40my98dytW\nb2rJu02+VfIyZVbe9iDGTdtLv2hOb3gGBvbjNoVi/z9XSkPpQqOvGN4D7IpZU3H1cT/AtMO+1O12\nqqzCp5qVdyJlhndNtYq6kaH0wDO72Vw3zIFhAEJfXQqoCUj+BJRUFYQQ+OPzaxCNCUiygNPsbM8X\n7p5iVU734dn93RVqJTsCVIYAACAASURBVPTmCdA7R8OADsBuEk5Bks3Qjic1T5/3Z9s6sGqTGVS6\nZlXyVmVu94UDQDSjv7cl3obGaHO6/9OpvBWIlNns7p5mNZyMQkDgyNrD8c39Z5rnwqpaw7GU83q9\nrQ5CAG82LEdjW8QTRPY99kKyBt9l3F49wmo2B9Jf1vGEBglAwK84I+LdknoKTy/dhKde/wzwpZug\nd4R3ZW1rV95BNQjVuqVPM7zhvXLXh87PytjtVp934V9OWzq+wJ/XPo5wMpJ3tHE8zxzg9mduaDSv\n8tQJG/HytpcKfu/u9NRsnm+kuztce2w2z9hvNK7h6Tc24Z6nP05v002fd+b+3ecvc8rcZmtA11in\ni6w86D20cBS8nyE09Vyxu5XKEcN7gPl9Cg6eNKrb2eQAwCer8Cmy1WyuQ5FVCDkJQ04BkvWl4fR5\n604VDgDKSPP+eilZiVhCw/ufNmXdCy75zdCwQ1FYfeZ25b0zYgbNCL81QYDzeiu8EXFeH4lrnsob\nSFfYuqZ4Xp9wVd6fN5mVynXH/Bhn7GtOpdueaE9/Qcqu+9l17/EDQJc1rWuVr8IJPrvyjsRS6dHq\nsSroLXthV2wXfvXss3jh7S3pE23tLwnzizfzy7raVXl/vH0zVu/YjFhSRzCgQJYkyEqu8E5imTWo\nya68ge7DO6QE0pV3RrN5S7wN1b4qyEKFXNmRNWBtV2Q33tuZf33zf9TPx6rG1bjx5b/i9sdzbxfP\n009rn4+GRnNMgG/CJqxu/xDhWApPLKnvVxO6O0dyZUrmMqw276IwvWs2D8ey++u7azbPvIBwH1Pm\nc01Wd8OoEYFuj2mgGT3cklfwfoZQNbsn9HkP2Axr1DuqYt5fHEtqSGoGAlAQTnXhXflvgGwu4iKE\nq9lcl5wFFeRqs0lQSoUQtkfmWkHv+9IGpD4/DFLAXrnMHDls6GZzvH070spdq8ztIxMAtDmVM2Qd\nMFTEpU4oMEeoR42Uq/K2mrqtCwwtZVW2OZrN12zdBXUsMMJf5dzSFtPi6S8J2T52NT0JnSss7TnZ\nK32V8Cne4AvHNE+fubbjAKhjd0Ie0YpVG9OTB9n3w8eMMO54/37ElNEA9nKe19ROrK94BsrY/fBk\nwyLz/RJnOyPOJVflbUSrIFeEkdCTqAgoiCU0p88bAHZEssM77qq884V3OBnBmNAoSMlKdIR2Q0fK\nEzi3vncnAKAmMAL71UxCwJ7Uxz4uawBdomIbNm34ctYxAN5mczc7HKMJLf33ADD3pbVYs6kNmiHw\nvZkH5XxtT/L1ecuSBEOIvCuCefq8ezlgzT1ffa73BrxVW+b+3bPmuS/0hBDOQL5yC7nM0eZ9NZQC\nkc3mNGh8soqAT0VXxPyySfc3Cydw7C/7ZMqAcC2bKVeYVZIwZIStLys7PNW6bZBrmiFb4W3Ezfu8\nDatvWlENGMLA6uZPEMIIvPCKNWGIYc+/bo149pnNqCJe4am81boGyKN2Oc3Qeko2mxFzhLd7xLs7\nvJ2Kxj2TnD0Lnavytu9Dr/RVOLPI2f3F4VjK2b/QVAgtPdGNh7WNhhS2djag0f+x5+k2YxeSUgT+\n/dMLm8STOkKBdDcBACTWHwe9dbzzGYP281blXaFU5q68dbvPO2Teiw/vrWIpQ0Ncj6PaV4VqqRaS\nBBiBzpxNyvd/NBd3rLwPQgi8/uE2ayY2gdZ4m3ksqubchZApka/Z3AooTRee127aaf5/YfSjedId\nJO4+b7v1J6nluaDoplk7U+aXeFeOkfLdNptrmeGt53zOfftavhYDIQSefXOTZxbDgeAZbd6Ppu+B\nWqSmGPaE+7wZ3mXi9Emn4OBRBzr3OvtkFUG/4vzDE3L6S8eerGREyAy8aELzNM/KlWZ4G7qcbtbU\nXaOiJQEpEDOraWsCFfteclk2oBk6UkYKWjQEZ35Su9ncqnxl64vciFeiPZxIr3AGwDfhMwgrZFMp\nCaOqA1AlMzyT1rSpumFO8iKEOUNb0BXedsUl5HSfd2azPwBENKvyViucCxk7+CKxFKCmK3e4lk1N\nnwcjPSFOPkp2c3IslUDQnx5dDwBC8zvvsaO1AxV2ePviEJoPtYFx6Eh2eia+AbwD1pZ+YIb79tb0\ngCd7lrsqfyWqYN72JwKdeQcP7Yo2YmP7Jjz+Sj2WvN+AjmSXZzIcuTJ7MFVcS2Bly/KsVeeAdEBp\nugHZdZ99NGVdlAT7vmSonmcglWKHd54Q9FbehQ9YE0KgM9q7ZvPsyts1sM89IY/r4iffRcfnu7rw\nzxVbcftj+bs4SsGd17kGBvbEPb/AUFGs1oZyxvAuE//fAWfgJ0dfZt0iBqiy2WxuS0npL2A7qGsq\nrfCOa5B82TN1CUN2+vjcfeKSrEMKRK0mc8nZFgBk1XACUNcl1768fd72hCkiXoH2cDIdrjCb0u3K\nW+gKVEXGiKDZPG/fLhWOpszPofmh6cKpvONaHAnNACCQ8rUDwgxGu9nefTucHUqVvgrzVjtIzoC4\ncCzlDG4TKX+Oyl3Af9BKz2fPJQVr/vdPj4HeYYYnanYh4LcvqlyD6qxz8MTrG8zKXElBCkZhRKsw\n2mfeKpdZfcdc4b16o1khN7anJ5SxZ56r9lVBMsygFJLebRW0rvlT5+ftHebAwUlV5gBJKZQ9Teyz\nG1/EB13L4Nvn06zn7PBKaYZ5+6HFvmjpzz3NIk+zuT1oMpmnP7uvfd66Yc6alykz4LuvvHM3m7tb\nLqJGFz5sXJ31Pok8XROl1t8Ba3Z4r9ncUvKpeYuluwuw7vx10QZnEp5yx/AuM4p137JPVhH0eatl\nN2HIGGmFdyyhQW8dl7Uvs/K2vmxEOoilQBSSqkEkXTNluSpbu8/VHinuft4OTykQMydesSZnCfpV\nnDzCmkFMNiAk3ZqaVIJPlVFTYb5XJGmGVWs4BikQgxGvQDJleJrNkykdck0z9EA7KhP7mHO4Z1T+\nABC1Ku8qfyUkSYJPVhFJxLHgnS1WeNvN5n4AMoQhpcPfl4AywgxLvWW8s09ZT48U3m+vaucWPpEM\nQsTMufn9B6xG20irerIH1bmqe8jmjGxyVbvZzN01GmN85t9nc8fnnr+RHd6bGiLOHPbuPm+7X7/K\nX+lZtz39hZT9ZRxOpPvZd0fN/v0vjzBnN5QrO7NGnO+KmhPb5Ap2O7xSuuFtclfSa8c/99YmvPbB\ntqzX9iTfaPN05e0Nu2ff3ISXln+OsNTsdH/0ps9b13NX3lpGuOVbqx0AYnmazd2tBLvHvYSH1z6O\n19au97z2b1/8H/wHZy+GUyhDCM/FQ8Gv62cVav89GhrD+M2j72PTjg7c98xqRON9v3ArplxjI/py\nwZJM6Xjzox34y8vre964DDC8y4w9baoqKZ7KO4sho6bKbPKOJjSkPj8MX459A0YsPamK0CWn8naW\nEQUgWc3u9qxn9v4As9nZvlXJXXlnjVZXXCPMAVQEVewd2MfaRoMOzemHVhUZFX4zFCNWsOzobIYk\nCYh4BZKajpCSEd6VZr9gZXQ/81hzNJt3psyFEsYEzYrYp/iwszWM55dtMf/B+lyVN2BeaNjH75rn\nXa4I43jpAnNbpB8/cOJIp5lbaH7rIsDU5WswH5fTg+Ls1gF17834rKUBcrV5cWCER2Kczzw3G9q8\nM63t6mqF0GX88dkNCPnM/bv7vO1b9qp9Va7WAyNdfarZYbR5d5vzc7u1GEyNOhpGIgg51JX1ZaZI\n9oWZAXWfDVDqvjB/l1zN5prhad2RrM8djafw0vKt+PuSeuyM7Ma8T5/POV97Lkae+7ztqYTXbmnF\nR5+lBxf+c8VWvLBuGT6vfhnq3uY8+T32eXtmFzN6rLx1XXQ72txTeWu5K2+7p+mJ1z51LpQMYaAj\n1QZlRO5FZwpx3zOr8V93vdVta4emG57lMM337t993plTwP7+yVX46LNmvPnR9l7vq9iefXMTfnTn\nm9jZ4p06ubtBh24bt7Wjrcv8/zVfyLd0xEs6HXFfMbzLjF15GxCe8E6PJbcYCmoqrfCOpwChoEau\n9TRf64bkDFjzfPFat4l5mrqtn9uqV6fvv3Y1J2eFp6x7Xl8RUOFTFXMOckWH4QlvCRX/P3vfGW9H\nVa/9TN/19H5OzknvIR0SEjpEulIFiShYLyI2BEQR9PpD5aJX5d5XQbHAtYAIypULWABpIXRIg5De\nc0pO3XXKej+sMmv2npOQkJAE5vlAOHvKXrNm9jzr356/wfTMWax0xyDt5EUKSdiOhxjTYM+5eRSY\nJjgdgxEYq+w2H3D7YGkmKkxqERuqESB33eR9z/k51OD4GZztI2F5FehIjxDu/mRMxylzRvgxascA\nsf34rgGLndIG8VhnODZGNZaFPvlpkZvgZSphKnG0pVqwrm+DiGl7xENPoQskT5vT8LwDuXXqoBTz\nlhdQPO6rsAXKmMqRmBc/EwCwTYqZ9xeY7CuJg2QroJhF9GT7AxKnQo42MQijeQN0Rt5xU4fteMg5\nebylPwa10idSYXnnfCL58Su348mtS/D0tucwHAghIg8j2DDD30f+/Cf3Bd3PWgO18DU2Fsfx8Pra\nnmFL1uRzOR4JlXYtzXrfXZ33cAlroXFuhYhEtqCG/b4RAReWkRvHlOJbv34Bn//RUwGyGS488Xah\nlogfcC/Du9Uudnd4aMlGAMCK9cFFUVAlL/yaO3uz+O7/vIyb734RQDjJ9/Tn8dWfPov/vPe1sm0H\nGxF5H2Lgcp8e8QJu85OSl+CyKR8RWeeEqEjFDWiqItxXhq4G4rfE8RPWvLxvkYsabznWy+uwY9vx\n5zUPBT4D4Mufqi4AAqjB2vKERevS4eo0EU3xyds0NCTMIHl35ujLl1reXiDmXXRcEVsnpCRhTvXd\nxUNuH+pitejpz+P7v30Zjh20qDXTptYwczcTT2rqwv51drbD3dXC+qYbgOohldBx2xePRW1lDBk7\nA0MxAaIGLG9DYeSt+AI1gfkC/GQ3x4DjemhPt8EhDr551xMAgK5cD4jiwcumEDM1DGR4jbwjXLfC\n8jZTUtzft7x5XH989RjUaC1iO8cga4X6qwfXw8vSmv2/L1+BL972NJ5bSePv/YX+wHGinaylwXE9\nPLVlCfr1jZClCbjl3ZcpAIoL64h/iYXGa10rUIqubA8Gi0P4zSNv4KofP4Wt3ZlhNbdLX7YD2SJW\n71oLrWETVOba91gI47W1PfjRH1/Dd38d7o4W51IdvLTzFQxk/UUst4qD3+1hlf009DYa/y+zvAsS\nebthbnP/M0X1hFUnt9flHqF9xe6M561d9HmRPQHDLYzeLrRS5aJDEKW6GYFF2zBW86ad9J70MC3/\nsORH3tt+1cbesm0HGxF5H2KQyduSyLs53YA5jTNgKiwm66mIWzoqkqZI7DE0FfUVPkm7riLI29ky\nDsUNkwFIwiEy2Uj//0bvWwDoAsHfTv/fHLNMxHJlyzwRM6DrKrW8VQdEcaEpPB6uIcXc5isHX8fD\n6/+B3iJzKRcSKNgutnXmoCoqNnX3omh7Qq5UfAdhMWtOiEYBLhxk+k1c87MleHNzHwaGXMhtTxW9\n6LvMAboA4W5zVodOHJal7hIYGvMU6P6POGNnYaksN0Amb7Yw8VAU4QNSQt6KXqTKdVDgekR0U4Pq\nghCC7Sx5jeTSKBRdZLJ+jTx/6QvL20j6fd+lmDe3vFNGCiopDy2IVquOKch7xY4NAIA7HlwJQgj6\nCiWlSxqXf6WWt0tCrErNRUXSxMBQEUosCzVGX3KqomJd/4aApekRD//x4m345Yrf4cnXaDLQ+m0D\nw8a8S8l73dYB/PjV22GOXClkfVHiiXpdcq/L4C9xo/0N3Lf+TxhM+fFM/rIujXFvJstgtKwXf3O8\n2rUcAwU/L6DUba5YGcSP/Jv/5ap/H4ekKgO5MmRf8HZ6cpcuSDj25DbfuGMA/3X/soDrfTjyPpRy\nuEuH+HZi3p0lLU7DKjhMQyv77FBBRN6HGHi3MZd4Abd5OkGJQybvmKkFpBh1XUVrbYX4m3gaeofY\nKp9ocDtHiAYn/Bz+viGPQpjbHIA1eSn9n5KYNz1GBzQXRHVE85WYqSNp+eP86/q/+brmtonnV+7E\nt3/zIlxbw2CBveTYGF23xDvAPuelal2d0o+LaDSBTKUdxVy1ECBcriInn58vWlzPg6XSfXVDJu8M\nYiodu6gVB+ApLC9AsUXSXqAcDzSWbjHCdlxPiKcomgvXI9gyRInMy6XYi1AR96efuXeHbE7eKSGB\nC9XzrT5meadNSu6EoKScboiGDYgKkqXPhpyYtq2/F45EzqpnsnI6D6ahwnY9IfIiQzc8NFbHA+1n\nTxt5MmY3TAfgl8ABtAFNxsliTd86keCn6wrk05JhyAYA1m4rr4tWNV8NcHcQOvkshGEnt4ltnHxl\nK01uHSvvs7p3LX6+7C68YD8ItXon9Kb1cFwPHgsDFG0XetPGkkF6tIwS/n0EEBDuKRsv8fZIzm8n\nbC1n4e+N5X3lfzyOl1d34bkV5Tr85eMoP9ef1/zfbtX+9hV7Gnep5e0GLO/wY3mf8mSMl5mW73co\nl5lF5H2IgVvepJS848wFzcmbqIgZQfI2NEVYhAAATxUPKIXix39RalmHrDBD3OoyFATd5lTpTYei\nuVAU1gwFQNzUUBEP9oC2XZ6lreI1Fssjju5b1oxcbVt+80iWM3P9J9W0fz2uCkUliM/5BxQzD6J4\nAVc37bxGaDy9pDOb6xKYjFw1g373uv4NsD0HMS3BxufPnY0C8k4BLmxhvYfNkaHQc7oegcoFDVUX\nRdtFX44nDkpa2GyBMsAWXZt6ekA8BU7RD4koiivkTLmLO2kkqTEqhwYA5NwMVI/OPfdCFIlv+e0c\npLHCZHYU8svno4KwzHvdEfK8A5LL18vTcyUTCpK8xpuPQU+IOZRlcLn17xFPJPFpqor/emCZf97d\nSHgOZotlOR+q7ore67sDf4nzygoS8/MBuFXtegSKlYU17Um83hOMsfMXf3+BHpdVemGNewVG+5so\n2g4efHo9rvrxU1i5cRfUip7AsUrA8vYTqoazvPNOAf/+3K349crf7/aa9qTbrVZ24U3mPQPefsxb\nnvdk3F+YD7eYKP246Nr4+6YncNeqe3Y7vr2F63n42h1LcO9j4W11AZQ6YgJW9HCaCNt76LuxtoL+\n/sLc6283Uc0jBKs29r6jxi97i4i8DzHwhDW3JObNLW9D5a5XD5apo7bSJ0VdV8vIuxQyAQXIhoQ8\nCnsgd/m7EjFK3rL1yWO0MVNDMhaU7LSJLb6DJxEpngHD8jCiIeWXAknhQSK3PWX/5vNAR2MaJ8xs\nDYxXYdnqPMnMMjV/PjSnrDOb43rCbc47hf34lTsAUPUzOugY7E1UCtT2CljeQ12w3mBN4FwyTGbN\nP7NsOx54YpMYe9HxROa9fJ9UaIDioj9TxKadg+jNDQKOibVbB0RCG1RPlCxx8t64Nc+6z0neBcVD\nkRRgEDZ+fq3En1SejY5CEqZdA5PF8hXNgc403Tlx0fmk280YEd4WbnnHtLjwLshKev1539LnBNfd\nnwsmj9VtwD82/QuATzAXnzyOnstxxaLW3jyOnch5W+QtkvGYkp6iEtF5T7a89Za1UONZPLL1kcDx\nolQupMd63inikefpPX3hra1Q48GMZyhyzFsi72Es78c2P4nOXDde3PkqVvS8OSxp7l6m1IM14SX8\nz9q7sXGAVkS83WzzjTv9+/R2Er5K8wGyUmfEMG/NviKTc9DVl8fGnYPIOTlsGiwvS1R3Y3kPN/4u\nFs/mW12XAEYe8SMfwWObnsTTr2/H7//xVuixpXjxjU6ahf9WePjmQCAi70MMgZh3wG1OicVQfOvN\nKnWba6qwfADfsm6o9gl+WPIOURIjw7jNOUbUVfqHqwomtlcFysc42cRMXciJcnDxEz6GuKVjYmsD\nHGLjcxeNRnUFHWdBTiJ2Zbc3V3BTkU4YaKyOB65HJOUxy7syYYpriM96TKjQnbuQkoHjEphsMdLX\n9Dh6cr2iZGt2zZHivM6OUXD7a2ATGy/upPrvXBa11G0OACZbbK3fPuhnzGsOirYrVMrkudVVHVA9\nPL9qJ2761QtQDBq394h0P1TXLxdi5L1lexH5okv34d4JVmGQGeT3UQFxNbjwJ7UvT4nZKRiIW5oY\nLzQbpk7H1S/FefkCSTdcn7wNej5L8cm74PrWZU/Od3trjLxL1dOMjlV4YM1DcDwXhAAT26tw1GRa\nG19winCJC7evDs72MaKigTeMCUPfUAGPv7JVkJDcjc4cuRJqRXfA8pZDQNLFis5hgYQzBqphH1zA\naH3t0gLDldzmEnlb4eQtJ/r9v9fuxLLulaH7DVceV3RtaHV+WODxzc8AKPdqEELws9d/jf9dG1yo\n9PTnWRc8EiDm4cgvb7t4+LmNWL2ZlmxmbT+GvCvfF3rMvoCX5xUdF//96p34/gs/KRM7KvXWBGr3\nh1ns8NACfw4c1xNVDH9a81f86onnsKnTv++7s8K3sERBfr/fDUTkfYhBVcOzzbmVoTGZUUV1ETM0\n1ErkPba1UsiE0pPQ40c2paXPJHeYbJmXan5Lx7O9yzaPb6kVC4yBrI3KlIXjjmgX23lTkpipiZcc\nh6uxlxnLJm+pTeDoFkqSd6+6V4xHdpvLMWthgbsa0gkDDdWJwPWUlsNVJM3A9ei1NN5cnaIuccfz\nhCeBqDbuXf0AvcaqMWhPjQheuEv3W9e3EaYSA8nR+f33y+eXzRG3vAH4Cxtmeedt+sKvTib8/TUD\niurhjU19gOJC0VwQx4TrefA8iJg4J29uUaowaRmT7DYXjVmkBZur0wx5Bp4QZ+cNxC1dkLeiOeLe\n9hcly7tAnzdV84TbnBOXqcZD3ea9WXo8IUy6Vy+KEkYK/9nryXK3ugKTkXPe4wsxQ1wDNCcQ81ZT\nvbjnzT8LBb8f3PMq7n70TSxdxWK3hpSAVbMT1sQXBQm6HvGrGmRIuQWDdjl5F11byMNyD4ilxv3K\nDtUT3gWZvLmGQSl6MoMwSAIN8ToAwxPgcKpyj21+MqDBz/NKZPL9xV9XYe2urVjWvRKPbHxMfH7/\nk2tx9zPPIT7zceitawLqdjIxquke4bnY3p3BH59Yi+/9lraslWV4d2a7Qse4L8ixDP9C0cP6Aerp\nKG3y44sJ2VjbtwHLi0/AGE1DII7r4anXt+FP/1obOIaHRGQJYPkdEZu6BErCf/YzuxGl6WFW/HCS\nvgcCEXkfYjihbSEA4JSO42GZ5daAcFUzy7u1jr4oJnVUY1RzhbAeAQh3LHe5A74rm26XasI7R8Dp\nbIPT2eZvl9zQJJeCN1SJRNHvuGVqJs4/kVoZU0ZS17HIqAZQLPjkHbf0gIAMjAIjW7pPU20Ccxpn\nYFrdJKzr34AhbTs7h3TxcsyaK615GtIJk1qBsopcSUb9rPH1qEun/HMxxboYUzVzXSL01wFfxtXQ\nDJhG8GfCSSTjZGEp/uKptT6FUpiaLITj66sXbQ95FhNorvGTDGO6IVnOvshMNu/QlzBbwMiWNyGA\n6rG+6l65d0K27ImniVp2wE+kKuboPbKYWA50h1U7EAzZQ1DsONA5GvYW1pVMc0SiD0+aMxCDxa5X\nJm/umvcGqJiOVrsNg3JrTql0atsQJVtVVYVlXfAYKTAvCl3EObAMSU9/1HI8ufVZPLLhn3A9V5RM\n9bA2nUqImI3sNlfCyrcUV7yMB/dgeXMPSEz1PUCK6olqD368O1gFNZ4RLm0ZWTuHQk7DB0efAcDv\nA5ArOPjt31eL/Rw3PKntjV1BFy8PXcge7KGcjd89/1TZsX99diNyBiVEo3VtoFc5J38lNgRr0guw\nJtM6/lK1uqyUUf+/ax8WvyEZL+54Bc9L/elLUXSL5fr/kuXN8asVv8OPX75dhMe4Vfzwhn/ihy//\nP2zxVkKv2waoDhzPw6/+7w08tGRjwAshpH+55e2RMg+kKpP3btrfdrPnbLgGPwcCEXkfYphcOwE/\nOf67mNVwBCyj/PYYnGBUDzFTQ1XKwq1XHI0vf5hm+fK4LQBBvgH3ouwelC1vosHeMBVeVs5WD24v\nrJyPKUnfhWxqBj588nh89zPzMGMctRZiElnlmfEbs6jbvLBsIYobJ4rtinR+njRycvvx9OsUjyXE\nlMfdrSnPCsubeNTytgzNT3aDH1fk1xC3NMwd7y88eDcxUzOggCa1aFKHXMI8Dbyvugw5NGCqscC2\ns0afiqq+2eLvQHvOgHyqi6JXBCEKWup8z0jcNH0vCCccx0Qmb9MsbE+lMe+Cr3QH1wAhCLjNdU3y\nTngl91+aJy7/6hQMxE0NMS3OzmvT5iu6DZe4UPKViO+a5hOo6kiWN53LJ1/y+8bLMW+e8ObsGAli\nGzDa30Rfwbcqq6sly3Dlb2BNfRquPgRNVaEqCoqk3PImqitCSfLcPrrxMVz1xNegN9FSL9cjwoPh\nZYOLq0DCWgi5Q/WEBSqTN/GYfKtr+6ED9jwljARkHf3BnA3Xc7FhYBNMLyUWMLe8eBs2D/ou7oJt\nU8liR0c2xyxCRn6PLN0kyc8SbM9tx5cfvwlPrH8hMNwEy81wulqhQRM6/4E4t+KhR6WJX2rp619a\n5PXYdBHVP1TwNQXYgpiXBZaSmWx5bx7ahqU7yrPOf7Xy9/jNyj+Iv4u2G9B8v/Wl/8Y1T92EXfle\n/HrFH7Az0yme9VIZ1NV9a6HX00UQJ+A1fesD+6iJwYDbnH+XJ2nYc0+G63riPnLIev6lynUyIvKO\nAADQVPYjUspdedzyVlRPuNJrKmJCwjBgeTOrNpDMEaKqxlGZNANxW0teCDA0VfrkbmoGFEVBY7Xv\n9pXJm7+EYqbGXKBKILNazlavTtPjWpK+znhpkhx3hauJId8t7lLL2zTUACnpsWLgHKauBRc2jNhM\nzYSmqXC8oOXNyVtX9fLYqpQ3YKlWYNOpI09ERW6cv13zr5d7PbTa7XijawOKjg14KtobZcvbpN4F\nkDLL2/OISNrjpMFnvQAAIABJREFULwlFt0EcAx4jb3gaFIVlC3P3eWkSIRfaAc1GB2huQNzS/fun\nOYiZmp87UIhTS5yoILaBIjJSzLsI4mpYsqwbdz9MXZM524/r8nixl6mE09kORSEYcCh5X/Ghqaiu\nCU6vmhhCwaRuV8NQy8ibXoODdFJ+PoOWqNEuNVlhiwsu7AJQFz63vF3J8k5qKcQd2kRGUT3YbJ4H\n7SHEtBgaN58HZ/toAIBDbL8Gmn1Hykj4vyvFQ6Ho4q3e9cg5eaSdVnj9dWIMXUO+KtiWHuZKdw0M\nZei4RJMdRhp6y1rE5vwdD+/6LYrI4c8rngxc85Cdpde1fip01RALKNntrbe+BcegnhAufyyseOn3\ns9T5E17dvAFf+q9nxCLHigWJqbQ3Oq9l93rpb3ht34bAdtlb4HgObNfG13/+HP7th/8Sn29l5ZM3\nPPtdvLDzZfxr6xLkmOVdCHNJMw8cH2NNrDqwWUkMBkrF1vZuxuceuwbLu94Qn8ltb6EHr4mrJAI0\ncY4QgnvefACvdvnhCdvx0McSE4frQX8gEJH3IYzm2gROmNWKL15whPgskE0eAqOEcDVVCcgbkuEs\nbwC1lbEAoafiQWICgNYaP0lNjudyWJLb3M821/06TEk0hYu4AEBVih4XsFRLM+Cl97OaYGVWsuWt\ny+QddJsbuio0vGWYqgFNU6h2t7SY4Mk3pmqUkbec9CeTsxibNN+xEMtbjWXxt/7f04Q1TxPudtNQ\nYeq+Z0V0RXNMZPIOtSCY5e1fqA04BmzH893mAGJxlLnNP3bqBEHufFvOy9JnytMRs3RhvampPjqn\njLy9giU8EKQYR8YblFzGRX9O2Hf15XwrLONkaEzZMcR+3EqLWRo8zd83zix/T6X3z9BUOGD3UnwH\n/d5kXAqTlHTVcwerpG1sIWeb8DIVYpz8he95HqAX4RXi+FjHlTAddqzqsg531PJOm0nYDoSHpugV\nxQKAex8qrKT4XXHPx7Iu6vKOF5vhDVXD3joGAPDcm742+NZeupghjo6BQWZpMsubX6Wa3hUoAyxk\ng7+/jJ1hc6RAgyFCF778bT/05vVAMQEvkxZiQaVqfRxLNvtlc+NHVOH8U/zcDzW1C/IP8ub/eQld\ng9TFbO8YgYQex/qSJjy25xPj7ct+gy8/eQN2ubQpztbuTKjVammmkKQNI0au9Oc4dCxyxjtA3d6y\nbsCDq/8BAPjjW38Wn/FjHdcLvEMAQElI5J230Vvow5Nbl+Dny+4Sn+8azIuZiCzvCACoxfzRRRNw\nxBh/tT4hdQTcwSoU3pwdekwpucdMLaiQJJM3CZKZkDdlkBOpONpqq6T9yxcSlaZvRfKXWMyULT//\n/LpE3tzy1lTNJ9mSxUVx3RHwWMIUb0nKY96moQlXOOCX9nDi0jU1IBwiX4OuKtjUOYTfPbpOfM7L\no/RQ8vYXKLy0SoYWIG95MVOSw6C6UIkuLNiKhCnlNDjCCiCOQd3mhJM3LwVzoai0tj5XcFDgbnMA\nlim7zTUoACxDCyTNAUCB5JDQaC5CwtIRN+j86rU70KmsgWKypKd8TMwDKcThEgdEp5nJil70xXDY\ngi3LuscRQtDv7PLbz7Lvz7t0e6ezCTuTVNr0kxMvw1ktF9Ahs/71pqHCUYKVA2JRKB5PAhhFjEx3\n4OjmuXRqpC588iKosHIezGKNyDsAaLKiYhQB2/QXSACgUMvbIx6G7AzSZgq27Sc2Op4jkTf9jsp4\nWlLCo9v6cixbv0jnmeTpwAekBc6OPhZGcA08vIS6yGWyo99B5X7zrx1L50hx8M+XttA+5ZkiuocG\nxBzpii5i5tzw1iq7oShAYeN4EMeEogDZYkHEkkvj/tttP8FLU5WA0Iw1+XmYE18AJ/A1W/rx6rrt\nbJ4NjKrsQHd+FwaKfqWCHMte2fMmrftnv+MbfrEUP/uLb81yFN2i0DRwPSJCFv7AWNWJ68sJK0RF\n7oVF9JqsbMBtvnEbnfNdhV6YE16A3voWc6F71I3O3iFpvQLENpnbnC0M8g5eWeu3C/WIhy2dQ3h+\nVac/3ihhLcJwiGtxFFfNg9dfH7q9lLwtUyuxvOWENXr7501uxPc/O5+2/pMs7/GtJf5MMMuCn1sr\nt7wbEv5Cg1tIPGv5e5+Zh7PmjQ0dK7e8AYiM5dIMYJJP4SOTzg1+oWR5u92tZePxX8R+9q0MQzWk\nemH/+3gs2ND0snri0fX+NRoh5C3PtxJI6C8JA6gudMUQZXQVSRNtKRqX16q6AuSbZZa3r89OAuSe\nLTjCbQ4Ahkl8kvc0aBpLAJOS5gACm+TAeSNmakjr/uIrhz5R1uQWLDEPXoH1ZlcGac9yzRPhEL5Y\n4q1fu3O7YKMgLF5ueXsKJYo3B/3yqE1bbdz10Dq2ncdXM7Br3gKgCNLjC4BYjL2U9SIUBUjoSVw0\n4TwoTgyKbqOphu4vXP/FGEBUmAodP7f+O6uepIsgT4Preb4YDot5d2W74REP9fE62K4nFp02sSWl\nO2Z5mwkpt4Fu4yEEl1Vf8DnK2QUUbRe/fGgVVm1hjXpcXWyX8wb4dRLHFGI7ikoT2VZv7sMdf10B\nWymI+VWhS25ztsBgdegkmxZz2DkwhKLtQW9eC60mqKrW7/o1y6qqBKRhAdAOaZK1nnPZ78s10Jyg\nZX49OT80kA35/cmu+tfWlau6FdyicJtD8VhIyQfPc+GLqEwxA89mioKuCkVzgqV10m9Qq+wRTXgc\nhzDLm97Hj7Z/Bl6mIuClsl0Pv3/CL9+7c/n/4DtLbsMDT/niMZHlHWFYqHu4Y2aJNWwZWlD3N1Aq\nRh/kptoE6qvi0FQ1QO6TO2rF/08fU4svXzjdj8ejNL5OURvzCZ94Kl08MJd5Q3UCR030CVa4iAGk\nErIrmrfwLL/YMXV+TJx386pImNA1Bc7WsdQqIeUxfuIRVFoVpaeDqRmi5EgJqXU3VCMgvfjZD07B\n5R+YIf7Ww8hb2t+TpEcntNUFd9RtqIqOuGR5z2+eCxAFesNmP6Pe1ZHJ2zR2yaw6a+ozfptXx0Au\nHyTvbGIjI2h6nzVNoeTLXtqJlEvdpooHz6afxS0dST2Jwlv0+lzFFpa3V4gL0RauVpbxBkUSk8hl\nYN+/dscu/Owvy/HEm5ScSYaFW3jZGrdwTN+789yrg4J8XOY2R7IXUF2MVuaCFBOB73hsgJYUcosx\nriawoycLt6hDt1xRiREgb0BkxOftIjziIR9jFmM+IfIKAFoOt2pjL/64lNbzt6aaqQyqwlu32uVu\n81hSkD/XyOdVBbativtJPy/gsZe34ull27F5Fy2Rq02m/aQ/j7vNFYgcCFsqeWT3dzBrY8122mKX\ne0Dyeep2J4SAe43VWAbEU0AKcXGN37l7KdZvH4AxgvUzcHTkXliEpNMIBzZ4GZ+mKgErmkP+zdhM\nuY84hig5zEreroxdImID3+1N57A8abDgFkTCGkqSyVJGUhCrIyzvrK+q6OqA7gT7juvhSWe268F1\nCfNuqMgXJE8Zu8ai7QbG8GrXcmjpXvEbEfu8S4jI+z2GcLe5VPIVEvMOWJYSudek/Bfr/KlNmDra\nJ3MAAUEYDpnc4WkBlzk9p2+5VyV88RiZ8MQChBGVSGarS6IuLnkDPA26ptDEKkUBoIAUEtAhuarZ\nS8ojBIs6TkBV/wwa72OQyZk37pARqJsHVZKrkhYBCb085i27zT2pfj5VojKnKABcFQ3VCUwbXYu5\nExtQHauC5VZBiQ8FMuppqZhfh6omhkQmLHENDOVtFGzfbd6XXI7p09l99VToqsK6zrH5GPMMVNZb\nmj8TBduFoijwhmhoxEYOipmHQhTAlmLezPLut/tgxIPkzc/Vn83h+VWd+PsKSt6lljePLTpMMCb/\n+kLs7CmIuHavthHfff5HUHU6B5br51qIlynJQzFzIt5tKQlk8g6Ia8BVCjBYtUYpefNcjbybD1iD\nzrYxrByPC9FQ8n19K81gbk01U8ubu80JJe+EpQuXdtKK+d4PI0jeXPdAdPDzCsiKen3672lzx4rf\nnS2XWqksROKYoGI7qng+sgUnQJwAkMl6ICBwPMePeceyIIUEAFXyDri49wnfclR0ByAqFM8MzLWm\nKqHlcmD3UYkPQq3sogtqT4NC6Dhkb1fGLre8lVjWJz+jnFjzbkGUivE5cnc14gOpy2CqlvjMcT04\nnoO8mxeqisTVoaiOmGN6fSXfwd3ujkcXAJoNuAZts+zySgJ/n7LjpTkChkmqO0CIyPsww5566IZa\n3oGYd3mdNycbz/MCCWvyQqBUfrB0eygUglhJrbpsrVt6+PG8QQh/iR49tQkfP20irr5oBkzNpCtu\nNv50wixrSiCTN38RVqUs6KqO6sKkQHtUUzVEOdCYmjZ888jrMK5q9LDXaGhqoJZ9XHMdPnDkCNx0\n2Vz/slUFhVVzoearsKDZF27hgh4yKhMJ6JqKL104HfOnUq+CiQR9YRh+0h0tFSOB8h7enAWOgf6h\nYHY9APSxzm10kaMGLG8A0OtYwhT7bOG0ZurZYQRAyTEPnSQAKNA1BQumNWFEFQ3Z7Mr3wUqweHKJ\n5a0YBVimImWrJ6Brip/YptlQFF+qlQghGVVoxW8Z2ibmwHN8qV23zw8ZKUZRWN4WErSch32HbrJY\nrsVkMJnHIGXRf9/Y0oVfPkL7NDudbSDFOAtNcPJmI0pQi7Ml2URj3sxtTsnbRdzSka70UBmjrV1L\nyZ+7r4u8RxD7DRb1XRhwugHFhcZEgyqsBGrScRAiuc0VOW4vJe0x0ti4Y1C4r0lJ7kHRsyl560W6\nwGChB+Fh01ykkiE04AYXWSqzvHUvjuKGSbC3jaLbmSWqN9MFDsnR3vSKS8chk3e2pH4bAPSGLYjN\noNnmhuWTYFKpggIFBafot2FlY/EKcShOnC7CJLc5j6kHLG/NoUTMUGrdK6oHKJS4HZewcj0ahvIX\nOPQ7MnlbkLfT3QJnRzubA7o9bmmR5R1heBT3QN56iaVoGcGENeLJ2xXpv1wmskSqkyGsLWCY5Q0A\nY6voD5vYFsa0BF3VMtEamoFvf+JI3HrF0SXnpeTI5V1jpoZjp7eIuHiVVcmuRRMNW2QYEnlfcfZ0\nfP7caRjTSo/RNZW9YPh1aSJO1VSTQGOqBhWmbJkH57M0/p00E/jwiePQ3ugfo6kKvMFaJDedgOqY\n/7nc7IGjpabc2rcU+oJVOem4GnIFF7miCy3mZ1VziyWQxyBZ+l051vCFuc0NPRgWEYlVnobPnD0F\nNRUxen+IBuJqKHg5KLotXsS6ruITZ0zGNefT+9WT3wU9zsnbEucCAK1yF5qmv+W77l0dLbVJsVDQ\n67Yj1rLJF3MJkZYFAM9gCnBF+twkYjq83iZUDbIKDL0o9NKTajWyeceP+zJLTjGZNeZpuOXf5iPN\nyHv11h68vtFPsgJo8ppX4vZWrBwsNYZ7/rYJBP4C1CU0YU2zCsg4Q+iobGGeJhWEKFA1Rt6eDVM1\nYLOsZrHAqejBC7gPWsNmaJXsGow4aiuo0EvOLhey4fFuIkkFr9vZi9gUKpzCFy5y3NzziEgMEwtX\nISTjoChJ2Y6qYhnlJeENVSXoLw4grqThdnb4izUtaBUX3jgyMA5ZMjUjZYIHcmMAQPEEeRc3TMIM\n71xYmomiWxAxb1GD7eooFF0YqinKHh2X+Cp2IrFRh6J5yBSkeWTkrQ42wB1gZWWqC9vxqKdDs0Fc\ng3lwuOVNv38wa4v5cLvaUB1jybvsGY9behTzjjA89mR5lwovWKaGie30ITthVmvoS5KngHgEQQlR\nibiUEPIu7fTEccX0T6Bi2wkgmUqcefTIYcdqqDra6lOoqYiVfS6j1Hr33dYkKNTBj5dUz2pSScwc\n71tqhqYG6n0BX7iBZ30nDD9cUGZ5l2Sex8JKxbgbnhDELR1nzO/AvMmNWDituWxfSyuPmXPyVrhl\nzd2sRRfElPpCM3KXQx2Vdf6LWGQrexp0VaVubzlhx2KuVtfXntfZfSaOgSFniMqzshc5d5vH9TgS\nehy7cr2iJI+/zD+4YIw4f6eyRri947qJz35oqug0BwBoXYmCW4ACJZDMJ5frOBq1evM5Rt5snKpD\nCXj2tCTMup3wCnGkvUYqYcleuq/iL4DqUPJm46urjPueE83P6OcvfM8jovWqxvu6aw5yWQVLWJtM\ngy1aHTh0MZ2gNdod6XamSgfAU6FoLJud2DA1U2RNBxfQ/iKNz21N2gI8Ddtz23HHsrsgMvqlccLT\nxMKoM+tnO8ulcIBP3rwlqli4Sm5z3oa3Wm3CN46/KuCh4ffC1bLwiIdxDS04YVYrxrXUse22P5eA\nOG57Fx1vgLyZZXzVjE/jqhmfDswBvRd8gRLDUMaFpZk0YU3EvNn8cfJWTJFQ5rgeduWpp8kX86H/\nDhX8MSi6DS+TRmbVLH8Bwo7f5qyHogDeUGXAbS5yC3K2mA/iGJg7voWek2kimHpkeUfYDfbU67fU\nhZyMGWitT+G2Lx6DxaeMLy9XAkRrPyrmIFnGEonK33v6yJNRH68NxH5lWJqJq886ATd8bI7I+JXB\nm6+UeglKt/NyH8sILjgqmeWt6DbSyZBac0XOXA9+h6YpActbBpf7TOp+LH5PlndY9yQeo+eqcecd\nNwafPnsKmmuTqDQqA/uGldvFWekWfzEeN82vr1WkF7/O483SgqxdnVZ2PuL6lrdcIy7I3/W158e3\nV2H+lEY0pqtEaRBPaNOlhUtNrBo9+V7U1zOyZy/CU+YEdeCJ4oB4KmaOa0RTTQLfuuyowPa8W4Cl\nmaiuKF8EAUBRoyV7OUbeosENK9dzk53wFAfurkbs7M0hm7eF29RGHlrtdroAkcSBYixPQdHcMnf0\n8nW70DfAXMUa70jmBBZIwuOkuFTVLk5Jo6OizV9oen5M2iU2DCk8M5yXAQDqE3WoqYgJ1/1rXcuR\nJf2wJlBJUd/y1oU7l3sv7O0j4Q0wi5aR8zPbn4dLiBAb4QtX/syYY19FhtDx1+ktSFspGLoKj7e5\nZZamrdHjm5J1+OiiCWirpgaBYmVhjnsZWsUudk56n555lWaqb+rpFdfG3ea5IR1JI/heUHQbnsEs\nZ9tAf6YIS7NQCIl5wzGQLzq+qJLqoug6uH/NX+k1MhU7fo0i1q7QOm6xuJcaBdmOh60OFW5xu1tD\nLe+hrB2o8EjH6Hvig8eOwHc/PQ+WoUUx7wjDY97kJswaX4+vLZ4Vup1nVNdZ9Zg2uhZnL6Qu7GSM\nJWaFkTezvUvbBcqiJvKmM0Yvwk3zrw0mp5WgtjKGUc3h5M7JebiYOd8u9MdLkt7SJn0BKbqDdLyc\nvGXLu1RIxtBUv+SoBJwYZMtbLyFXTmAfnXQhxlaNwsiKkqYlAM6Y34HT5rXjU2dNLtv2hWlXBWr0\nwzL2E5rk1lc0jGzyCb+m+xhfI54n+kj3tEFvxw+O/ffgCT1VinlL99RgbnfPz3jXVBWfOmsK6lL+\nvXOKLAFLWrjUxqphezZ67R7qvuS92y3//GkjhYq0CsXTce6xNI9A04KLy135PliaiY+dOhFnzO8o\nmwvCwgCdPVRHnN8jTmI7MszqtC1s685Qy1tWA+WhBdvCSbOobn/CYG1Nx7wu+otzwn9pdZcIJ2ga\nK8nTnMACSSgPcnI26QJjRLoVlsmS+jwNHmhfe9uz0dMnJToNoxxYvXURLM2kSoeyVCk2+vMhW95C\n598RcyD2Y8f/c9OT6NPXQIlTD0ap5a1oHvQxNJuee5Fk8halWCo9vi5OibE6Sc9jtKyHVs3ugfQc\ncm8N11bo7suhM0sJ/Sf3vFn221fTPSC1G0CKFrxsBdZvH0BPn4OcUxAiLVzbgdgx5G0XGgwxxgIZ\nQme2G9PrpooWvdzyzhTZ74Qt1OIaj/v7mgeO6yGDXpCiBZJL0wx1fs/ZImkoZ0uuewOVTGggFgcq\nUxZMQ0XRdvdoYO0vhJs+EQ5ZWKaGK88tt644UkYS35p/HSrMVHhMOqS1J3/Zlbb+k6340pZ77wS+\n5R1O/pogb/riLiXvuJThHeY214gpvqd0gUHJV8XE+Cx0NFQFtnELf3duc+5Wntc8B/Oa54SOP27p\nuOD4saHbUrFYwPIPu0dJPQk4/vbqtH+9llcJe/0UWJOfB1GZFSBZhZahIaZbSOoJP8bo6dBUBTFL\ng9vTgqJuw+zw5SHh6mVd32TLyCkyYpeItyZO44V9hX5U6JXg7RsURUFh9UxY41+B7dlIWAZqrKQI\njWglnouck0M6UYdpo2sxbXQtHlqyEYWVR6FuyhoMEhazJ4Bjq+hoTQjPByfvXqaRbqoxbO/JImbq\nouc44MtbLpzSjounUNnadExanNWzpL1Aq1z6Ha45CKj1rCpAmmON11mzeD57oSeNhL/wJCo8uEhY\nOlzVLbG2gwsYTt5xRp7phBH4neaJn+XtDXBi8suYeLlVIJ9B+v+COgjVGoKXjwsPguyB4z9z/rsy\nNBW9/QRWo+/9Kap0DPUsVl2TKM/VCHw/I/8iyeP1tT348f8+g9gR6+EO1ACuIQiZQ6vugqIAxS3j\nAU9HvujCLChQzaLoC6ym+kEIdWsXii7i4HF5FzZT4RvoKxeEGsxnAZjQ0rS6okKvQhcQ0DywHQ8O\n8ZUCs3lb/K54eCJrboNVxWrfPRWVcbqIzjssVBUbgDbiDeSKp5XNzYFAZHm/B1EXrxk2mexblx9V\n9hmnZbIbgi61yt8JRFx+mFOqSnncXkZCcmvL7U55rJlnm4dZtfzlf3TNSTh7zKmh35PYjdv8ncLU\ntYALN8z7kNb9a7JUU7jhAZoMN29C0NqX3eomW4BUxZi1ThSAKNBUBcmYgfqqONydI6ES2UrSAhYz\nAD+jHxAvYpng5Xr+ilgSs8bXi0XloglzUUmakXcLyDm5gJiPripCHpQjVhL394aqMdM8xf/A1QEo\naK5Jipp8txict7pUBTp7cxjMFuF2t+KktuMB+PKWDekKUXWRNMt/G7LkbYJtH4qvg9FOFzky2QkJ\nYJ6Mp9iIaVbwuXV05L0cYpVDVMSmpLJDBifvGHvuUnED8o8jD2r1FlbPFORbEaf7ajXbpQ575RoO\nAPWsKUYRhCWr3fjxuThnQfnikqvrmYYq7jl3mxdUujyrZ5Z3Q0U5eZeqNxJPRc7JY9naHiEA43bS\nZ3XZup7gHHDPgOwV83QoCi+1I1CT/SC5FFRCyZ9b3takpbAVapWv3iDVkrt8AcF6rTdSQZZRFvOI\nSZZ10XHhoCg8BnLuBPds8GeBjRhxg3fQo+SdSa6F0bwBm3qD7UoPFCLyfp9hREMKN867BjcvuEF8\nxt08Lvt3XP+5uOGoqwPH7U/y5pa1S8KTO9QSy7s05t2UpOpN9bHaQO05fzlzyzuMGPnLP6xjG/+e\nZIjlffHJ4zB+RFVACW5fQL9fETHN/kJ5b2ceFgAASw+St6oquPC4oDu+QrIkeQ9s/pKloQefMK5f\nPBvzpjQibUrk7Oplcyxn3E8f2YxPnDEJx83wBXZ4xj9A5+vKc6dhFksMvPCEsRjZQL8/5+QDSXma\npsLZOg7FdVP9awxJ2otp/vg4cTbX+Za36ypI6v51N1VVwiME67cPQFVUnNi+AIBfTiff0zE1I/zs\neAauugbQOefQG1g3L4mYDFWjiyJmeXuqLeLoHPa2MSDwUGxaRj/gmvNmubdJMQsgRKEd5cDIW/N/\nG0VlqGwMHfXU82GOXAU11ReYJ/n7AKDIiJfY1LXb0ZTG+NagZgPgaxYQ4ru9eYJWERkYqiEWdfXp\n8pBYaSIeHAMFL49ETIfCeoDzkM+ytUHyVpmSn+w14Za8Yuap7oHmwstUwjI1arm7vuVcTG1mx0jN\nhQIxawIt3Y8RqTbUWLWB8Sqai6ydp78VtmjpzxQDx1tSCaCzg4Z3+D1/bPNTeHzz037mPQn3KO5v\nROT9PkRDog6VVvnKmbvGLSWFpmRDYFtIXtY+QxXkHX5SlcfaWcy71FoZXdmBzx7xcXxp9hXB47ik\nNCPv0pp3gFs1VIq0FMJtHmJ5nzJnBK67ZFawZn4foCgKrr5oBmbWURWz0sQdAEiYMVHrbGkmkjFd\nkLKmBUkLAJKmFONn19DMFjgcfOlVmbLw6bOmoDImddjytLJEx3qplOeoiW1YMK1ZzB2AwPOTCLsG\naQ5ly5vfS/klHUbeimv6nhNO3jVJcbzreUhLY2itoZ6ATN5BIqajwkoHqiHkMVZYaehvLkL+9YXi\nM/k+xIzdh5scj8BQLKiJQSjxAXhKOXl7fY1I6Wm4Zl/g+NLKCQFXF2JKybgRVC5TWaxXImdu9QF+\nXH+4RLiiTscwsq4ON1xKQz1hiZIJk96zwWxRsjqZ2xw5VJgp8ZzIz5x8DYCfsEk8DXllANvct0SN\nNo9Db9hRrtYG0JJD/qyLOTviaRH+IPkkYqaGfNHBUM6fI0/3NQ9Kx2N0rGJzRFATr0KMe5mE5e0K\nsR5O/rmCFPPWHDom1QOxTdibJtFxSc/tfW89KMJYils+twcCEXm/j/GlC6ejrT6JY46gJQ+cvGWC\nGsvqoxtq4uUn2EfwOHRYpjbgW+Ya++2kQmq5p9VNLluAcLe5RuiPKszyPml2G7568UyMaPDJ6+On\nTUR7QwpjWqk1EbS8939ayOSRNbjsiPNxycTzsajjxLLtluRaNzUqQiMatygKNFULvDiC5E3nroy8\nSxwnSSNoeZeiPu6Tt0zEHBVSA5rQBUiAvP2xcs+HHDoI08h3XCI8LPwl3taQFDK6iZiBasn676j3\n3fiJmA5VUZHQ/WssXfAYugqST6Hw5mwU101FXaU/3ngIecvEWSi6mKgfDUVzoTdtgIci4iELkLRR\nCaIEO7uVhif880slmpoaIG9P9ZOkOJRA1QDvXS/FsaUua45OiW/e+A7RwS6szDNlMNlbqVaeyt8S\n2MghLXljShd79BqY5rzJa8jpta8k//TVANl5t3aXS6USxwCIhoZqdi8kmeOWDno9Zx45HjGTajP0\nZf2ySRKKQnq/AAAbb0lEQVQbCJxfHo+i29CbN7BrTIjx8Xtijl6OdblV9CBXFzkAckKbodN7Egif\nlNxzy6I/suaaYEXJgUJE3u9jTBtdi29/4ihhhXLXuKym9qULp+Nri2dhTMv+eyC55T0cefPt6YSO\nb19+JCrfpqv6iDHUHdZWzVyKIdZFzNQxqaM68PI5dnoLbrr8SBh6iOUdco79AV3VcXTLkaFjNAzV\n713NSJeTN19Y8aoCUzUQtyTVOmF5S33R6ZkCf6UMyfIOJW/frZowysm7UnqRlxJj6THyNXLrkkus\nAggo1vE6esfxfCEPRkS1FTGcMa8DC6Y14XPnTEW15Sccjm32a/m5cE9ausbSaxC1+P31cLvbxLMD\nhJO3vMAp2C7aY+MB0HI7opAyyxsAKqQmL9yKa2sIL1NEqbWmlv825PvUW/Sbhvglf/52N6RxkRyO\naU+34YNjTgsQZMqS480avEwaWsUuaPVbQBQvcDwAjKroQEqpFp2+xjbV4rR57SK0YW8eL/bV0n1C\nOnU48FBGQzUTKUr6IaW8QRu3jKqvQ8LSkc07yG0ZAQyy6xS96/05mDLC9x7yTPWkkfS9H9K+y7LP\n0jE4BkawBQ4kt7mha9QL4egY1ZzGly+cXrbodPUcTM1EOvHOQmtvFxF5RxDglrfspo5bOsa1VQ13\nyD5B3UPMW1jm8IZ/2YXg46dNxFXnHYGFU6hs4R7lW4eBoRnCZbuv53gnMHVNvMy5vCTPOOfZ2jUx\npg6lKIhJ8WruNi9VsCq1vNvSkmBMyAtVJtQwy1te1ISRe3wYy1tkrEtWolySyMvRbMcT18jjoYqi\nIBEz8IkzJqO5NonqmL+gbK3zn9EjxtJrr5LIPVGywCgNxUwf689XmEuYuJqwVfNFF0nTAnE1EVMP\nI++URN7cyjv32NE4//gxZfsSV99ziZFENos6ThL/z/UQZOudZCqRe/HkQGy/QiJfRVGwqOMEJGzf\nQ5OWyRsKiutoAqLesLnseAC4es7nME+7UCwARjdX4YLjx4pyPrenFflXj5PGT3uNDwfujeGWt73N\nn6ch0Bh52kxhVEsFXI+gp9dD88CxwXOwRe8lp4zHF844DpW5CQAANUkt85SRRJznHYQtJFzd98rx\nksGqbrg1a2jioWvgklMmYOro2jLvQ09uV6gH5kAhIu8IAsfPpAlJs8aHtxvdXxhTORJAmHVIMb1u\nCh1P24K9Om/M1DFjXB3SZhKWZgaSrvYWPEZ6INzme4IpWd5claqmgvc7py8MTmxFtwhLiqNazHug\nqzqumvFptPfRspVSWhhd6ddUf2NxeQWCjHgIecsItbyHiXnLXp20Qq1dXv8L+Ja37bpoSNDnkBBg\nzsRgDgYAVPMFDILiOVzJri3V4o+x1PKWyPu4GS1oqfWvIWUl8K3510IfkhY4ro6PnEItyUVzR9De\n6LYpuqrFJaW9tnrqrm9IBpvoANSDcvq8Dnz9yC/jxBHHBM4vo3YoqONAXE2QCQBMrh+Lr5TkfJSF\nPzxdlNQBCP09aNLzLSc+AsBXPngs4GqC+NIhxxMoYlw8h+Wy0yYKWWRSjAnLXHZpA0DlllNwztgz\n/HOxPIiKBPME9jYF8hIA6k2Z2O7f95baCmiuf2/5d+SLDlRFRYfH+ruzkreUZHmHClY5BkYIqWMF\nbj99RovJbfQjVy+rfvHnItwDc6AQ1XlHEDh9XgcWTmt+227qfcWF4z+ICdVjMatxeuj2SbXj8b2F\n3wyWK+0FNFXDNXM+H3AN7y0Sehx9hf6DYnkbuubXm7JabRHzZqRTKxGX/DIxpSz6CTVjYbkZAD1l\n7D0i5WeOj24O96x8ceZnsHFwS5m7tBRhZYmyNR6WkAYA7eZErCg8g+aUb/0J8nY8zG2cgXV9GxDL\njMTZx00qO77GCo77psvmYiBbFHM1qmoEsCV8DHweZ42vx8dOnRgcu6WjLl6LOSNH47lupn3u6Zg/\npQknzaZCL6+s7mJSpdQzInsqvrZ4Nrr6cuhVN4nvT5oWvvrJo0TYoCXVhIUtR+GxzU/R87s6IHHC\nCGUatrxYg/icf9APSohGVYKJi6RE2lh87vj3Jox8PdZ61elpQoKFX266bC5Wb+7D5JE1UFdUwovv\nYseXPwfHTm/GP5dyOWBK3o01CXz90jlYuWEX/vPe10DsGHXtl1xDtVGHk9tnYkzlKNz94t+xsZMu\ndqaMqsH0TbUY0ZjCX5esByGK8C6kzBTGj/AXXifPbsPmt6rQyYVYuDgMlzw2LZCiJRZZKTOJmDK8\n5U1cHUdPbcIf/klbpBbfnIPYnL/BNQbE9pgxvOt/uGf9QCCyvCMIKIpywIkboC7Go5pn79aqTUuZ\nrfuCpmQjUua+kT9A3c5pI1VWc/5uwNJV1roRqGSJYaUxb9EUAcGyt9KSLz6DpIS9Dc3AhOqxGF9V\n7sLlGFc9Bie3Hzfsdo6w8IdcBx6WkAYAU5Nz8NkjPo6zR/v19kdNpkQ+sb0auqrjkknn47w5c0Q+\nggzZbQ4A7Y1pTB3lx65HVUqysiXPkio66ZW7qrkVP76+zf/Q1f0sZQCmqQUy5mXhoLilo70xHcgb\nOG56G1rqgs9jXPYGuDpOnOV/XzpBVdZ4YlmYlRhIOvR8adLW+iTOOYY1B5LqpsPCHzWDM+H2NqCt\nOE/McXtjGiczmVut6Lv+wyz3uso4KhL02r2S52DyyBpceMJYv1lKyTV091PCHVXZjqnG8aLne1NN\nAl+4YDqV2iWqyI8wVB2WZiIVN/CJMybhyxdOR3tjGpMa/C6AHz91Mlrrk2KR1TdUCCRHUstbCx0P\nAMAxkIobuP6js3Hy7DYACohtwVN5A52g5X3JxPOFpxAID58cKESWd4QIIfjY5ItQcIvvaAGxr9B1\nFW5nG2yjgCvPOx8AUJP21a+AYDKWXH5klpK3SJ0t/56rZn66/MO9wHnjzsIDax7CxJpxZdsaE37o\nZTjytkwd0+qCNevnHDMacyc2BKoBhgOPaZdm1nPwpD7e31lGW30SmzuHUFc1/MtWvi7iagGXv6Vr\nAZd02Eu7OdmI5mQjtmd2oqmy3LshW84nzhyB8SP8fToa0wAUmEocBZIVVutpR7ULyeOEEYcChS7M\nJCI6YWYrTpzVhhNnt+Hz/1WEmuqDSozQZ/mCo+bi0aWNWPyhCaFzoBerwIVd08N4ssZVjcZLna/5\n1QESqtIWyFZekkUt81HNVP50guT+lhdn3PuSZImH3kAN1FgWtudn4C+QmvzMbZyJf215BgBNPj12\nuh8uaapJYHl3HGqKJr8ljSRivN+BY+L85o/h3rcegJryLWuAVtmMba3EkZMace+W5diapS4U4hgB\nsaKjW47ElNqJeK2b9q1/N2PeEXlHiBACUzOHVak70DCYhKuzdRzqE9R6a29M4ZxjRokOaTweXB+v\nDVjbZoj4DDCsmN07wokjjgnGbSXIRCFaNZYgbKyqqgTaq+4Opmbg5gXfGHZxAAC5l06iL9sPBD//\n6AcmoK0hxayr0rHTfyulxjtfuXB2YB/TUAPkHQ/pLqcoCq6Z83m81rUC0+unlm3XVA03zbsWf1n3\nMOY1B88/aSQlNiPXgEJsA5QEJRdNU8X9VhUVcT2GrJMrkTv1O7DpJIbCiqNRlQ4nlTEtlbjinOHl\nluOZDgwZW2FU9pZpP3AsnnQBptVNxuyQMFgypkvtR6llfvyMFpxz7KhABYvrlmfX88WS21/ni+WE\nYGTFCDQmGtCSKs+h+dAxo5B5eRJeGKKqZykjAVPKjxhd3Q4vlxbkXZr1P7atEvW91YK8DcUs03pI\nmynoqg7HcyLLO0KE9zOSMQOXfmBCwPpUFAVnLRgl/q6NV+PauVehxqoWtdMAy1QPwbvUKyGARR0n\n4G8bH0d7upwggXIvwb6gcpjOdhy3f+kUhDlPYqaO044qb4RSio9Nvggv7HwFExpaA5/HTC0QTx7u\npW1qJuY2zRz2/PWJWnxy6uKyz6tSFlrrkujcUgN97Aa/R3XJjeTiIh111Th/8Ww8+vwmHD2Fkpii\nKHBcD4CCUU27n6fhoCkGiqtnY+KY6mFzH3Z3jRPaq1D7Vgp96BYKZJapBcIbANA7SGPSFSG9Cnij\nkXFVo8u2AfQ6bzjqK6GeBUPXcMmcU/DCE4/T79aswH6WqQXaKB83pTyMJHdPTJrhXRJrrCp05roD\nuQ8HGhF5R4hwCIJn/u8OYaSol3Tt8t9T7z57nz36VBzVNCvUnQr4mfEHEqX913eHay6eiQeeWheY\n+yObZuHIpvIOfg3VCZw4bQyeHqB61/EDYHGdd/wY/PTPWRQ3TII3SC3x0kXYpJrxWLVrNU4ffRLG\n1ldibFu4FT1hxL6Ve/L5U/YxPUpTVZw//Rj8YvkGuF10XsMWmJUpujiZPKqmbBscE1+c/CWMqKsu\n38awu/CWoer4ztHXI+fky/bTVQWktxVesg+jnWNw6YfKEyPlBWKlVU7eAK3+6Mx1v6sJaxF5R4jw\nHkLpy4n/fRAMbyiKMixxA8O7+A8WJnZU42sds/e8I8MHpx8JrO2GSzyMZuWP+xMzxtZhbGsVVm30\nPQSliYeXT/kICm4xkMAYhpHDtOfdEy47bRLufvRNXHRSeV7D28XMhmn45lFfxdeefx1AeF+Bs44e\nicqkhWOOaA58fvnpk/Dy6i6MaWh6R9LE1bEqhFG/rqtQMrUoLF+A6qnhYYFKSU2wKhHufeClm2Hh\nkwOFiLwjRHg/4GCw9x5Qmhl/uCFhxHHxxPMO6Hc01iSwamOv+LvU8k4YiVBteY5vXDoHb27uxbi2\nfVNIbKlL4tpLyj0Pe4vGZD14NnxYuMTQNZEhLmPhEc1YWELo+xO6poqSjPgwuvNT6yahVZuADVuK\n+MAJ4eWtnLwjt3mECBH2Cj/43AK4XnnSz26SzQ869kfM+72OxupgedecCeHW4XAY3VKB0S37ZnUf\nKBxKizZFAbhBP5znPWkkcN2xl9Me4lY4ZY6pot6RltSBW2iUIiLvCBHeA6geJpt4yqgavPRmF2aN\nrwvdfjBxqLnND0U0VvtW9e1XH79XMfxDFeYwCmXvJj555iS8sbGPlX3tObSkKsqwxA0A46vH4gfH\n/ntkeUeIEGH/4NjpLRjVVPG26qbfbZjvASI60JgyqhqTOqpx9NSm9wRxA76lezBx9NRmHD2VWsn7\nK6fz3SRuICLvCBHe01AVBR1N+67xfiDwqbMmY/32gVDVtAhBGLqGr148fKnZ4YTT53Xg2eXbUfUu\nqDjuDfzQ0qEYXBoeB3Qpd/PNN+PDH/4wLrroIrz++uuBbc8++yzOP/98fPjDH8Z///d/H8hhRIgQ\n4RDC/ClN+MjJ4/e8Y4T3FM4/fgx+eOXCQBOZQwEXn0wz6WVltsMBB8zyfv7557Fx40bcc889WLt2\nLa6//nrcc889Yvt3vvMd3HnnnWhsbMTixYvxgQ98AGPHjj1Qw4kQIUKECBHKILvQDyccsCXQkiVL\ncPLJJwMAxowZg/7+fgwNDQEANm/ejMrKSjQ3N0NVVRx33HFYsmTJgRpKhAgRIkSI8J7CAbO8u7u7\nMWWK322lpqYGXV1dSKVS6OrqQk1NTWDb5s2bd3u+6uoE9P0cI6uvP7RigYcronl854jm8J0jmsP9\ng2ge3znejTl81xLWSjV59xa9vdn9NBKK+vo0uroG9+s534+I5vGdI5rDd45oDvcPonl859jfczjc\nQuCAuc0bGhrQ3d0t/u7s7ER9fX3otp07d6KhYe/EByJEiBAhQoT3Kw4YeS9YsACPPvooAGDFihVo\naGhAKkVrTdva2jA0NIQtW7bAcRw8/vjjWLBgwYEaSoQIESJEiPCewgFzm8+aNQtTpkzBRRddBEVR\ncOONN+L+++9HOp3GKaecgptuuglf+cpXAACnn346Ro0atYczRogQIUKECBEAQCHvNBj9LmF/x2Gi\n2M7+QTSP7xzRHL5zRHO4fxDN4zvHYR/zjhAhQoQIESIcGETkHSFChAgRIhxmiMg7QoQIESJEOMwQ\nkXeECBEiRIhwmCEi7wgRIkSIEOEww2GTbR4hQoQIESJEoIgs7wgRIkSIEOEwQ0TeESJEiBAhwmGG\niLwjRIgQIUKEwwwReUeIECFChAiHGSLyjhAhQoQIEQ4zROQdIUKECBEiHGY4YF3FDmXcfPPNeO21\n16AoCq6//nocccQRB3tIhzRWr16NK664Ah//+MexePFibN++Hddccw1c10V9fT3+4z/+A6Zp4sEH\nH8RvfvMbqKqKCy+8EBdccMHBHvohg1tuuQUvvfQSHMfBZz7zGUybNi2aw71ALpfDddddh56eHhQK\nBVxxxRWYOHFiNIf7iHw+jzPPPBNXXHEF5s+fH83jXmDp0qX4whe+gHHjxgEAxo8fj09+8pPv/hyS\n9xmWLl1KPv3pTxNCCFmzZg258MILD/KIDm1kMhmyePFi8o1vfIPcfffdhBBCrrvuOvJ///d/hBBC\nfvCDH5Df/va3JJPJkEWLFpGBgQGSy+XIGWecQXp7ew/m0A8ZLFmyhHzyk58khBCya9cuctxxx0Vz\nuJd46KGHyB133EEIIWTLli1k0aJF0Ry+A/zwhz8k5557LvnTn/4UzeNe4rnnniOf//znA58djDl8\n37nNlyxZgpNPPhkAMGbMGPT392NoaOggj+rQhWma+PnPf46Ghgbx2dKlS3HSSScBAE444QQsWbIE\nr732GqZNm4Z0Oo1YLIZZs2bh5ZdfPljDPqQwd+5c/PjHPwYAVFRUIJfLRXO4lzj99NPxqU99CgCw\nfft2NDY2RnO4j1i7di3WrFmD448/HkD0e94fOBhz+L4j7+7ublRXV4u/a2pq0NXVdRBHdGhD13XE\nYrHAZ7lcDqZpAgBqa2vR1dWF7u5u1NTUiH2iefWhaRoSiQQA4L777sOxxx4bzeE+4qKLLsLVV1+N\n66+/PprDfcT3v/99XHfddeLvaB73HmvWrMFnP/tZXHzxxXjmmWcOyhy+L2PeMkikDvuOMNz8RfNa\njn/84x+477778Mtf/hKLFi0Sn0dz+Pbxhz/8AatWrcJXv/rVwPxEc/j28Oc//xkzZszAiBEjQrdH\n87hnjBw5EldeeSVOO+00bN68GZdeeilc1xXb3605fN+Rd0NDA7q7u8XfnZ2dqK+vP4gjOvyQSCSQ\nz+cRi8Wwc+dONDQ0hM7rjBkzDuIoDy089dRT+NnPfoZf/OIXSKfT0RzuJZYvX47a2lo0Nzdj0qRJ\ncF0XyWQymsO9xBNPPIHNmzfjiSeewI4dO2CaZvQs7iUaGxtx+umnAwDa29tRV1eHZcuWvetz+L5z\nmy9YsACPPvooAGDFihVoaGhAKpU6yKM6vHD00UeLOfzb3/6GY445BtOnT8eyZcswMDCATCaDl19+\nGXPmzDnIIz00MDg4iFtuuQW33347qqqqAERzuLd48cUX8ctf/hIADX1ls9loDvcBP/rRj/CnP/0J\n9957Ly644AJcccUV0TzuJR588EHceeedAICuri709PTg3HPPfdfn8H3ZVezWW2/Fiy++CEVRcOON\nN2LixIkHe0iHLJYvX47vf//72Lp1K3RdR2NjI2699VZcd911KBQKaGlpwXe/+10YhoFHHnkEd955\nJxRFweLFi3H22Wcf7OEfErjnnntw2223YdSoUeKz733ve/jGN74RzeHbRD6fx9e//nVs374d+Xwe\nV155JaZOnYprr702msN9xG233YbW1lYsXLgwmse9wNDQEK6++moMDAzAtm1ceeWVmDRp0rs+h+9L\n8o4QIUKECBEOZ7zv3OYRIkSIECHC4Y6IvCNEiBAhQoTDDBF5R4gQIUKECIcZIvKOECFChAgRDjNE\n5B0hQoQIESIcZnjfibREiHC44ZZbbsGyZctQKBSwcuVKzJw5EwBw3nnn4UMf+tDbOscdd9yB8ePH\nCz3rMHz0ox/Fr3/9a2iatj+GHcDOnTuxbt06zJ8/f7+fO0KE9yOiUrEIEQ4TbNmyBR/5yEfw5JNP\nHuyh7DUefPBBrF27Fl/60pcO9lAiRHhPILK8I0Q4jHHbbbdhy5Yt2LZtG6699lrk83nceuutME0T\n+XweN954I6ZMmYLrrrsOs2fPxvz58/Fv//ZvWLhwIV5//XVkMhncfvvtaGxsxIQJE7BixQr89Kc/\nRV9fH3bs2IGNGzfiqKOOwg033IBCoYBrr70WW7duRVNTEzRNw4IFCwI9ijOZDL7yla9gYGAAjuPg\nhBNOwJlnnokf/ehHIISgqqoKl1xyCb797W9j48aNyGQyOPPMM3H55Zfj/vvvx9///ncoioKdO3di\n9OjRuPnmm2EYxkGc4QgRDk1EMe8IEQ5zbNmyBXfddRemTp2Kvr4+3HTTTbjrrrtw6aWX4vbbby/b\nf+3atTj33HPx29/+FpMmTcLDDz9cts/KlSvxk5/8BPfddx/uv/9+9Pf348EHH4TjOPjjH/+Ib37z\nm3jmmWfKjnv22WfhOA5+97vf4Q9/+AMSiQRaW1txzjnn4Oyzz8Zll12Gu+66Cw0NDbj77rvxxz/+\nEQ899BDeeOMNAMCyZctw66234r777sO2bdsOSy9DhAjvBiLLO0KEwxzTp0+HoigAgLq6Otxyyy0o\nFAoYHBxEZWVl2f7V1dUYN24cAKClpQV9fX1l+8yePRuapkHTNFRXV6O/vx+rVq3CkUceCQCor6/H\n7Nmzy46bNWsWfvKTn+ALX/gCjjvuOFxwwQVQ1aCNsHTpUuzYsQMvvPACAKBYLGLTpk3ieN4+debM\nmVi7dq3okxwhQgQfEXlHiHCYQ3YrX3PNNfjWt76F+fPn4/HHHxfNPGSUJqSFpb2E7eN5XoCIS0kZ\noL2M//KXv+CVV17BP//5T5x33nl44IEHAvuYponPfe5zOPXUUwOf33///fA8b7fjihAhAkXkNo8Q\n4T2E7u5ujBs3Dq7r4pFHHkGxWNxv5x49ejReeeUVAEBPTw9eeun/t3eHOAoDYRTHHyGYJlwAMAjg\nAFROSC0STCWCIJCYBhwOwxEqegIkuqLBbRN0LQaBxkBZsdkaDJutmeb/05PJ517eZCbz9bYmSRLF\ncazhcKggCOQ4jm63m2q1mh6Ph6SfVv97VJ/nuXa7XdH+z+ez7ve7Xq+X0jTVYDAobX6gSmjeQIUs\nFgvNZjO1Wi3N53MFQaAoikrZezqdKo5j+b6vTqcj13XfGnq329V6vVYYhqrX6zLGqN1uy3VdrVYr\nNRoNLZdLZVkm3/f1fD7leV7xVWq/39dms9HlclGv15MxppTZgarhqRiAj1yvV6VpqvF4rDzPNZlM\ntN1ui3fn/3U4HHQ6nbTf70vZD6gymjeAjzSbTR2Px+J/4tFoVFpwA/gbmjcAAJbhwhoAAJYhvAEA\nsAzhDQCAZQhvAAAsQ3gDAGAZwhsAAMt8AxJ5C+54P8QOAAAAAElFTkSuQmCC\n",
+            "text/plain": [
+              "<matplotlib.figure.Figure at 0x7f72fab5e290>"
+            ]
+          },
+          "metadata": {
+            "tags": []
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAe8AAAFnCAYAAACPasF4AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAIABJREFUeJzsvXe8XVWZ///e5dTba3pCQiAJCSWE\nIJGmoSSgjsg4gmCb4Tf+dCwURUdEQXGs41gYFQvDiIyIiKIIJIAgEBJCgJBKertpt59z76m7fv9Y\nu55zboiQBCL783rllXt2WXvttfden6et55Fs27aJECFChAgRIhw1kF/vDkSIECFChAgR/jZE5B0h\nQoQIESIcZYjIO0KECBEiRDjKEJF3hAgRIkSIcJQhIu8IESJEiBDhKENE3hEiRIgQIcJRhoi8I7yp\nMW3aND796U9Xbf/iF7/ItGnTQsfdcMMNoWOWL1/OBz/4QQB2797NCSec4O3btWsXH/vYx1iwYAEL\nFizgkksu4bHHHgPgpptuYuHChSxcuJCZM2fy9re/3fudy+VC19A0jfvvv/9vvq/Vq1dz1VVXHdSx\nDzzwAF/72tde9bVcvNbz3wi46667+P73v/96dyNChFeE+np3IEKE1xsbN24kl8tRX18PCBJas2ZN\n1XErVqxg/fr1IZIeCZ/97Gd597vfzW233QbAqlWr+PCHP8zDDz/MV77yFe+4+fPn8+1vf5vTTjut\nZjvr16/n/vvv55JLLvmb7umkk07i9ttvP6hjly5dyvnnn/+qr+XitZ7/RsAHPvCB17sLESIcFCLN\nO8KbHm95y1t49NFHvd9LlizhxBNPrDruuuuu4+tf//pBtblp0yZOPvlk7/fJJ5/M4sWLGT169EH3\nq6+vj09+8pO89NJLXHHFFYCwAPz0pz9lwYIFmKbJypUrufTSS1m4cCEXX3wxS5cuBYRV4IILLgDg\n1ltv5atf/Sqf+MQnOO+883jve99LT0+Pd53ly5czffr0qmu98MIL/OM//iMXXHAB73vf++jq6gKg\nu7ubD3/4w1x88cWcf/75fO9736vZ18p7ueqqq1i4cCHz58/njjvu8PatXbuWSy+9lAULFvCBD3zA\nu85I26dNm8b+/fu9893fy5cv5/LLL+fqq6/mM5/5DAD33nsvF110ERdeeCFXXnkle/bsAcC2bb7x\njW8wf/58FixYwC9+8QtvrL74xS8CsH///pD15MknnwTAMAy++MUvsmDBAi644AI++clPVllMIkQ4\n3IjIO8KbHhdddBF//vOfvd8PPvggCxcurHmcbdssWrToFds855xz+PSnP82dd97J1q1bARg1ahSS\nJB10v9rb27nuuus45ZRT+PWvf+1tt22bxYsXoygKX/7yl7nqqqtYtGgRH/3oR7nppptqtrVo0SJu\nuOEGHnvsMdra2rjvvvsA2Lp1Kx0dHYwbNy50rVwux8c//nGuu+46Hn30UT70oQ9x9dVXA/C///u/\nzJ07l4ceeogHHniArq4uLMuq2VcXP/nJTxg/fjyLFi3il7/8Jd/97nfZt28fIISiq6++msWLF3P+\n+edzyy23HHD7gbB+/Xouv/xyvvvd79Lf389Xv/pV7rjjDh555BEmTpzIj3/8YwD+9Kc/sXr1ahYv\nXsx9993HXXfdxerVq0Ntff7zn2f69OksXryYn/3sZ3zuc59jcHCQJUuWsHv3bhYtWsQjjzzC1KlT\nWbly5Sv2LUKEQ4mIvCO86XH66aezefNm+vv7KRaLrFy5knnz5tU89oYbbuA///M/KZfLB2zzO9/5\nDldeeSUPPPAA73znO5k/fz533333Ienv2972Nu/v+++/n4suugiAOXPmeNppJU477TTGjRuHJEnM\nmDHDI85ly5bVvNcXXniBUaNGceaZZwLwzne+k127drF3717a2tpYsmQJzz//PPF4nP/6r/+is7Pz\ngH2+8cYb+dKXvgTAhAkT6OjoYPfu3Wzfvp3BwUHOPfdcQJitb7311hG3vxKSyaR3P21tbbzwwgue\nteO0007zxuepp55iwYIFxGIx6uvreeihh0LWlkKhwPLly/nIRz4CwKRJk5gzZw5PPvkkra2tbN26\nlUcffZRiscg111zD2Wef/Yp9ixDhUCLyeUd400NRFC688EIefvhhWltbOeuss1DV2p/GzJkzmTt3\nLnfccQezZ88esc1EIsFVV13FVVddxdDQEIsWLeLrX/8648ePf80TfXNzs/f3Aw88wJ133kk+n8ey\nLEYqVdDQ0OD9rSgKpmkC8Mwzz3gEFcTQ0BBdXV0hC0Q8HmdgYICPfOQjWJbFV77yFXp6erjyyiv5\n1Kc+dcA+r1mzxtO2ZVmmt7cXy7IYHBwM9U1VVVRVHXH7K6Gpqcn72zRNfvjDH/L4449jmib5fJ7J\nkycDMDg4SGNjo3dsOp0OtTM8PIxt21x++eXetkKhwBlnnMFJJ53EjTfeyK9+9Ss+//nPM3/+fG66\n6aZQexEiHG5E5B0hAnDxxRfzve99j5aWlpo+2yCuvfZaLr30UsaPH19z/8DAAC+//LKntTY2NvK+\n972Pp59+mk2bNh0yLa27u5sbb7yRe++9lxkzZrBjxw4WLFhw0OcbhsGaNWtqCiGdnZ1MmTKF3//+\n9zXP/ehHP8pHP/pRtm/fzr/+678yZ86cA17r+uuv58Mf/jDvf//7kSTJG4OWlhYymQyWZSHLMrqu\n093dPeL28ePHI8uyJ3xks9kRr/nQQw/x+OOPc9ddd9Ha2spvf/tbHnjgAe+6g4OD3rF9fX0kk0nv\nd1tbG4qicN9991FXV1fVtrs6IJPJcMMNN3D77bdz7bXXHnAMIkQ4lIjM5hEiALNnz6anp4fNmzdz\n+umnH/DYzs5OrrzyyhHNuKVSiU9/+tM8/fTT3radO3eyatWqEaPKR4KqquRyuZoa9cDAAOl0milT\npmAYBvfccw8A+Xz+oNpevXo106ZNIx6PV13r5JNPpre3l1WrVgHQ1dXF9ddfj23bfPnLX+aZZ54B\nYOLEibS3tyNJ0gH72t/fz6xZs5AkiT/84Q8Ui0UKhQLHHHMMo0eP5pFHHgHgd7/7HV/+8pdH3A7Q\n0dHBhg0bALjvvvuQ5drTWH9/P+PGjaO1tZXBwUEefvhhb2zmz5/Pgw8+iKZpFAoFrrjiCjZt2hQa\n93PPPZff/OY3ABSLRb7whS+wb98+7rvvPn70ox8BwgoyZcqUgxrvCBEOJSLyjhABkCSJCy64gLe+\n9a0jkkEQ//Iv/4Ku6zX3jR07lp/85CdeVPiFF17Itddeyxe+8IVQBPrBYM6cOfT09HD22Wd72qaL\n6dOnc84557BgwQIuu+wy5s+fzymnnOKtPX8lLF26NOTvDl4rFovxwx/+kFtuuYWLLrqIT3ziEyxc\nuBBJkrj88sv53ve+50W4z549m3nz5h2wr1dffTWf+MQneNe73kWhUOCyyy7jS1/6El1dXfzgBz/g\ntttu48ILL+TPf/4zN998M5Ik1dwOwvJx88038+53v5tUKuUt8avEO9/5TjKZDBdccAGf+cxnuOaa\na9i/fz/f/OY3ufjiiznrrLO48MILec973sN73/teTj311ND5N998MytWrGDhwoW85z3vYcKECYwZ\nM4bzzjuPdevWceGFF3LRRRexZcsW/vmf//mgxjxChEMFKarnHSFChAgRIhxdiDTvCBEiRIgQ4ShD\nRN4RIkSIECHCUYaIvCNEiBAhQoSjDBF5R4gQIUKECEcZIvKOECFChAgRjjIcNUlaenuHD2l7LS1p\nBgcLh7TNNyOicXztiMbwtSMaw0ODaBxfOw71GHZ0NNTc/qbVvFVVeb278HeBaBxfO6IxfO2IxvDQ\nIBrH144jNYZvWvKOECFChAgRjlZE5B0hQoQIESIcZYjIO0KECBEiRDjKEJF3hAgRIkSIcJQhIu8I\nESJEiBDhKENE3hEiRIgQIcJRhoi8I0SIECFChKMMEXlHiBAhQoQIRxkOK3lv2rSJ888/n7vuuqtq\n39KlS3nve9/LZZddxo9+9KPD2Y0IESJEiBDh7wqHjbwLhQK33HIL8+bNq7n/a1/7Grfeeit33303\nzzzzDFu2bDlcXYkQIUKECBH+rnDYyDsej/Pzn/+czs7Oqn1dXV00NTUxZswYZFnm3HPPZdmyZYer\nKxEivGmhGxZL1+6jWDZe76542NuXZ822/te7G0cNXtjYy879wyxduw/Lsl/v7rxq9GWKrN8x8Hp3\nA4D9AwVWbekDoKyZPPdyN7Y98tjmSzovbOw54DFHGoetMImqqqhq7eZ7e3tpbW31fre2ttLV1XXA\n9lpa0oc8Z+xICd8j/G2IxvG143CN4d2PbOTXizdw3twc11x+6mG5xt+Kf/nm4wDc/+13oSiHTn/4\ne3wP9/Tm+NEf1ni/48k4F8075rBe83CNo/vcf3XzQpobEoflGn9rX+79+jv4+d0vsmzNPmRV4aK3\nTq55/I9/8SzPv9zNdVecytvnTHjF9o/Eu3jUVBU71JVuOjoaDnmlsjcjonF87TicY7hhu9BwN+wY\neMM9p737syTjh2YK+nt9D7dWaKobt/dz2tS2w3a9IzGOXXsz6K3pw3qNg0V3zzArN/YAsGnnAKcd\n117zuA3Oc3hh/X5mTWw+YJuHegzfUFXFOjs76evr8353d3fXNK9HiBDhtcE180lIr3NPqqEZ1uvd\nhTc8SroZ+m2aR/+YvZFcOJZtY5jiG1EPYAVqrheWgsHh8hHp18HgdSHv8ePHk8vl2L17N4Zh8MQT\nT3DmmWe+Hl2JEOHvGq6LTnrjcTdGRN6viHIFeRtHsc/bRb6kv95d8GBaticQqcrIH0mLY+bP5N44\n5H3YzOZr167lW9/6Fnv27EFVVRYvXsz8+fMZP348F1xwATfffDOf+cxnALj44ouZPLm2ryFChAiv\nHW9E8tYj8n5FaHp4jEzz6CfvQun11byDQWeWZeP+UuSRddn6VAyAzAE072x5iKZE4yHp48HgsJH3\nrFmz+NWvfjXi/rlz53LPPfccrstHiPCGwf6BAo3pOOmk+Nx6MkXSCdWbEGqhe6BAQzpGOukf0z1Y\noLk+QSJWHbiZzZUxLZvWxmRou+Wazd+A7H0kzOYDQyUUWaKp/rUHSFm2TVd3jgmj6pEliZ7BAk11\nCRLx8PMoayZ9QyXGtde9pusVSjq7e3OhbYPDJbJ5jaa6uLetN1MkGVdoSMcrm6BYNtiyJ8u49rqq\ndwOEANWXLTKmrbqvA0Ml4jGF/mzJu+dK2LZNV0+Ose11ntnZtm329OUZ116H5IxTXeBdz5cM9vbl\n6WxJeedYts2W3VniMZljRjfSkynSmI6FYiJ2dQ8zpq2OmFqbZGudUwslzbdmmJZV9bdl2WzenSGV\nUEknVOJxxfuOhgo6fZkidakYqYR/naV7V/B/G+7lIye8n4s7zjng9Q8VjpqAtQgRjkaUNZMbfvYs\njXVxvv+pswD499uWIQG3//v8mufohsXNd6xg9nHtfPQfZgLQny1x48+X8455k7jk7ClV51z7388A\n8D8VbbpKhvw6cLdpmWzL7mBq85SawsOR0Lw/++OlQPW4vBosfm4X9z6xlcvPO45Tj2vn33/6LBOm\nZ7A7N3LdnH+jOdEEwDf/70V2dg/z7Y/No7059aqvd/MdK+jLlkLbNuzKcO2tS0L3c8Mf7sYup/nF\nxy6vauOexzezZNdKmuoVvnvlZVX7n1i5h3v+spmvXHU64zvqve2mZXljB3DlBcdz3pzxVeev2TbA\n9+9dxZknjuaqd5wAwOMv7uH/Ht3E+88/jtOmdfLvP32W9iZfcFi5uZdfLd7I208dxwcvnAbAqi19\n3HqfiKq//v2z+c7dK5k+sZnPXSFWSGzcNci3fr2SudM7+fgls6r6kc2V+ffbljF1XBM3fHBOjdH0\nEdT8g0vvXFJ/YVMvP7l/rbddSuZomLUSuXkaVqaTz922jHHtddzy/73FO+avu5cAsKJ7JRefeGTI\nO0qPGiHCYYRmiAlhKK8BYDj+tQMZP4tlg7Juhvxre/pymJb9N/vcfBPhkWfvezf/ie+v/CkrulfW\n3K8bZs3tbyT8cevDfGHJLWimxsrNIsh21ZY+9vYXIFair/FZ+kuDdA3v8c7Z2S0ijQcOMrhJt2qb\nkSuJuxZ2De0hPmkDieNfrLm/qzdH4riXKI15ofY1MkVsoKsnrOFXmrbXjrAuf9veLADPrNnvbXtx\nUy8AK17uYSivIaWHyE+7H7mpx2lLRG4/8aI/Zv2Be3VzAGzYlfHb3LsRKV5gxYaemv3oHiwCsGVP\ntuZ+0zIxLfG+BX3uZoC8NSe+oC9bDJ0rN/Wjy3kxxoo4d09fPnRM2RDPOqkcuSVwEXlHiHAYURlf\ndDDapghSssnGtzGsiUm1NyMmt6DPc1XvWnoLB0524pL366F5P71HJF7al+/2tlkBf+PR4PN+ZOcT\nDGnD9BbD45zJlVEa/WVcRaOaaJUDBEC52DW8m2v+egNL9jxb+wDJIjZlNXJzd2iz+1yf6FpywPZ7\nC/6qHsuuHm83IK43EyasSvIeyVRdy6Li3rdhWpiWTWzsVtHGpA0j9jN4vf4KoaW/OMCSwu+Jz3hu\nxPMrz6nEf734E7723HeBcLR7Tisi1WWIH/8Cw8ZwVV8AJFXz/pbragsHZVMck1CqXReHCxF5R4hw\nGFG5tEc/iKU+Zd1Ead9LpvU5/mfdrwF/cnU1hf35Hn625k7+47n/8jQGoCoDl6d315hkNw5sYU3f\n+oO+l1eLxri/TjVI2G/0pWLuhAygW+EI6d5MESnuE0ZBD5MfgHIQEtOjO/8KwIPbH625X2ndj9q+\nl8TxYeuFu7xpW3YnALZR7QEtaQZFxSfvWn0cibzzAQKT6rIMpNbXJH+5xj2qTuCXYdqifcVpq0Yf\na12vp6Ivq3qFCVtOjEzQwf6XtDD5dg3vYcfQLnoKfWim7l9L0fn+y98mOfNZlOZetiUfreoLgBQr\nB/7WqIWyeeSj0CPyjvC64I2UZvBwwqwgU10/OPKWG4RWtze3D/AnJ3epUG9RTMq6pYcmm0rh4EBW\n8x++9DNuW/2/r9ifkfBiz2q+8dz3KRrVpBCc6IPEFyTv16p5W7bFrSt/7hFg9X4bpW0vcsv+mvtf\nCbuGdnt/l4zw5NybKSLFAuRtVCeRMi2bIW2Y32/+MzktX7U/eI2JDeOq+g4gN4nnbNvhB1jWTWzb\nJlN2TMuKUUWufZkScr2vKeZr9NGNZu/LhImx4JiWpbosyZnL2Bd/gd3De6vOryWfuEuuTMuirJtI\nqmjLNqsDNF3BsxAwZe9zTNKphAgEfMkhb9saWRjakFvtPefKe1m+z3cZ5PScZzavJGJNyZLT86G+\nACEhDTV8zu83/5lfrL0LzXnHa1lgDhci8n6TwPUvWrZdMYGaNY97pW2vBat61/LJJz7PtuyOmvst\n2+Kl3rUU9dIBr23VEAAOVV9/uf433LT0m6/6fLcfQfI2TCtEriMJMJpmIsUFIbYmW4Cg2VycP1jy\n/YHByaaSED2zeeC3bdvkdJ9MKv3ouiGIwbKtmtqWi9vX3sXu3F5W9673zgHxXILm/JAGG+hfppzh\nzvX30F3oxbQsBofL2LZd9QxFX6rHqrfYz4bBzdy/9aGq4yzbxjBMYsesIzaxtrm2ss18SQ+ZTHcN\n++RdOSn3ZkpIcX/cCjUEGNO0+c7z/81fup7imb3Lvevphskftz7MZ578Mn0lIaTJkh+xXiwb7O4f\nRG7sQ2l0yLscDnzTdJMhLYdhi/5KEhR1v49lzWT7/iGkhE/Y+RoCxLDdR+yYtfQMD4W254o6iZlL\nSc70a04MlAZDxzyzdzm7zZeRm3tInb6IPbl92LbtRZAXk7tZMfCMr3lXQB2zleuXfImCXgwJoK5F\nJp2IYds2e/Ou8CXh2pJ0w2JgqESuqDNYyrIz/gyJ414C/JgDwzK4e+PveaFntX9fWt5/xnJ1v7Kl\noWqzeby25m1ZNn/peoqVgfaPJHlH0eZvAnQPFvjCT5/l4jMm8fLOQbbvG+J//n0+Dy7bwX1PbuPm\nf57LxFEN/PWlPdy5aCPXv382MyYJ0vjNXzbzyIouvvWxeXS8hsjZIO7fIibbv3Y9w5SmY6r2L937\nHHdv/D2N+jF0r5zOj649J7QsA2BTV4Zv/t+LfPySWcydLrLzPfp8F3c/tpkbPjgHPdVNS6KZ0XWv\nLnPfc/tFAJBhGajy3/aZvLxzkO/cvZIPLZzGceP9VIr5khEycRumTUyt1iZKuomUEGSwfVeZNW39\nXhCNaxbvCfgyQ5p3gBwf2LaY4eQQ0IYkSZR1k49/90nOOGEU557lB9Zc96On+MAFM5h/6niG8hrX\n3LqEc04eiz3xRV7u38R/nHUjsQOMgaZb/P//+SRnnTiGf3nHDD73k6VkpN0kRCBxyKToE7PFnwZv\nB6A50cSG5Z1s2JUhEVcoayafuewUZk5u5cWe1WzdrPDw091V0dvd+XDw0n/d8xLb9g3xb+85ke/+\n5iWuuGgikmKCbGFaJorsE+SqLX384Herue59JzNrShvb9w1xyy+fB+CbH5tHZ3OKVV27vON/8dBq\nxqnT/WsnXkJp9f3QtUzSmfKgR3j3P72de34Dn79iNt/69UpSpz8ROvalbfvZ2TlMc32cz922DOnY\n5SSm+wKQVKHxLVu3n98/v5LkTH9btpynLp6mpBlc/+Ol5EsGiVk+mQxr1Zp3X/ol1NR+8oqBbvhR\n0kPlAnJdmNAHy2F/76833AdAfLLQqH+y5EFSPafQ0SKeUXncc7yUAzlZ+x5iEzZj2LBpcAu5cnXf\nDNPi9kWrKDrmckm2QBZC4pduX06PE6TWcEwXeJ+5ze0PvsykUQ3stzZXxRIM6znyJdFfqYZQ8Z3f\nPUeb7FtB5MZ+5PQw2BJIdugeilr1+bWEuMOFSPN+E2CjE7X50LM72b5PfJCWbXPfk9sAPzr0waXC\nf7Z07T7v3EdWiIIxm7p8Te+1wtXmRlp77EbuZhFmuv6hamn2ryvFMfc+4ZeSve+vIjBm+cbd/PdL\nv+CW5f/5mvsa1BoPFktWi34/tGxnyOddKOkhzbsye5aLkmYguf492eLhZ3d6y1hcTb7HMZvHlXhI\nU3DJ0bAMFu34CwPNKwDxvIediPdn13fTlfMjfVEML3p2l6O1PLVqN893v0TeKJAp1Q7ScbEvI/Yv\nWbMPy7YZGCqHTI1lI0jezrNP+pO1bulelHDZuc+la/ezYWAzt6+9i8cHBUm8vCus+e2vIO91OwYp\nlk1+/dgG1NHbWbRGmFslSZivg1i0XBDzn5buAMS6eq/d/gLL973Attxmb5tml0NLBOzOTeJ/UwgE\ntczmPaVe729TEtaRxc+NUIBJ0dm0O0NXTw7dsFCawgFykmqA5L87f1ixKqQVA2RLeedehCY7arRN\nLOW/vzm9uo+u8UFp2093xo84HypWa+lBzTtoNZJi4t4yWYOd3TnvGVYimbb454unM2/mKN4+2yfI\nn6/9FXvG/L7KvVHWTZZt3hHaJsU0hgs6PYNFL9+BlvYtJA2NYoy27xtClqvzIeS0PANDzvtYg7yF\nZu5bshLTxfcjmXHn+v67nCtWZ4orRWbzCIcSlZGiUrzAQCHrmbfcCdUNsKn000LtwJRXCzenkSzV\nfv08U63j56t15ZST8CS47MM1t9nqa5N+g6biVxOIogdyJQfHslAyQj5vbQTyHtZySJK7QNsMCS8e\neRd8Yqg1BtlymKzcyF8Xe3P+RCkpJprmulWcyzb4wlqmXE3ewTEKEpebgSpE3gEBSDMsUDXkQKT2\nYDHnBWC5SMQVT4iT04JUKpPT7A1EsQ+X/D70JlcTm7iR4lh/nfJAKSx8uglz+lIr+dnqX4aEqoHC\nEHe+fA92zH+PJMW3mtiYge2Oz7aW5q0NBI4TRDE40lI/xaA3U/RiG1yhIISA1hcb5QsBVkEEBA47\nhNubKSI39DM0cTGmFCDvcjUhm5Lfn64B35ozXK6+n6DmXWt5m6aJ96w0AnnbisbZJ43lX981k7NO\nGlO1X2kJC2NlKUd8imOSdueCWJk9TuKaGcc0ITf1hPz6l54nhILebBHd9L8LN2ZgWM95Yyyp1fcg\nqToZR8gNCktSsQXbkpDrhjyXVqYQHs/GeEOkeUc4tIhVJNxPnvIUNy3/ukfq7oTvEnStmsG1siu9\nWnjBOCO8fqZDDF6QTsW1C3qRVervUNp3UyxXTxSGUjs46GAR/AC1V6F5uzm7K8k7XzIOSvMeChCv\npBj0Z/0J1jQtbNv2JlLN1MgXq33Kg+UwWelGONZhMKhNK4bXF89H3uATT7YGebtL2AAKZoA43Ykx\noKGEzeYWyROXED/Gj3LPFKsrMCVisqfpuYFKlQUt9hd88t7W7U/87uQaxEDRHw/TMulveB65foBy\n0xZW9a1DM/yJfqhUYwJWDIYdTasQ8C3bloRqJ8g774zhPV+bQS2gPTvrg0dapy8pOn2ZkhfbYBuB\n4C6XuFS/j5Ll77cKIrmKaxbvzRRDhKZaooJXvkLz3jS4FSvhH7c34z/zXMDEbmbbkGyZTDDOooal\nwRVkhgvV38yY5Hh0S/e+p2DSFu/8eFhrjU3Y4AluFB33k6qxs1tsax6TJTFNuLdkW4yHkhTj25sp\nUTQDz0lLOPeV9yPTlWrNWVI133IQ0MylvTPBjCHFyyROehqAgYL/DZzVeiGtyRaKRumIBeNG5P1m\nQ0CajKsVmrdyZDXvkczmXiCRM2lVLrfaNbybItmQ9haEJudqbg9ix9CuUDRxELkAMb0as7k7gcdU\nudpsHiDQkTSUIT3Qf8UMBVaZlk1eL2AENJ+hgJbktl+pLZtWOFguUwoICLJBWQ8njwlOpBkt7PuE\nsHBQNH1hyU0sEgzyqQxYq4zyHa6hESZiCrudSHsMYbKsDCQKmnF39PmWCKxqrTWoea8f2EivuoHE\nCc+BLO47GImdr0HekmJ4VoVcYLy1DXNRiFN0iNHVzmPHrmJDYZV/vqPlDeVqvE+2BIpBT7ZAr5sg\nJBCZHdNF/Elw3EzE3+ZQC9aQKBE6rPmad5D8k5YgviB59xT6+MHKn3r3D9CdC0SmOwKKOdSKtuUU\nVCsVGsOagVmOcDGU10LzjLZsY2voAAAgAElEQVRtFi0J0Qc3ULJWamC5gryDEfbGUKM3Bjv2i/dR\nSfnHj5VEPIIuFVBkib5MMWzCdsZzqJwj4zyDWj5vggKSs9/oHYdeSHrjLzlj1p0V42V0T6TdmEZK\nTWLaZkjjP5yIyPvvEAOlQTQzaEoNkETg5Yx55C32u2ZzzSpXSemHMsmHa3IdSZu3bLe/Yn/l8ifX\nZOx+XJU+tqLtE9NI0dI/X/Mrfvly7dz6w4Go3FdjNvfIW5FCVox8yXCehY0yagd7ctVLbwAKhk/e\nUkVErGnZVcQ8FCC/2uQttO6gmX446ANWzCrNO6g51zKbBzX3vOFf39e8S9imgm0qlMxqn3cQtZaa\nxWOyPz6KDtgVS+L00Du6dzAgyFnV01qwv7XKoxYD1oOc5vfX1WpRDE+wcTXvuvxUrFwrshX3rDWu\nEKS2+W4J28bT8mwIERtA0m5Ckm36snl6B4uOUO2/N0nTIe9gwJfzHWtbT8Z2hJu8JvrQmy2FiClh\nC7N60KKU06sF3IGCP0Yll7wHRoMZQzHTDGnDXpayWm4C1zIwVNA9rdUcGIXZN576mMidviWzXRxb\n49sXAl8wsMA/xsoJ8pcSBU/zjiXEOOp7jmVS8ngAslqWtqYkvZli2IK2XUT2DRQDgmhgjLRts5x7\nEGOcSqj+flMV30dIKLTodSL0bSNGb6ZIWhWBevkaY3M4EEWbH4XY1T3ML/68nk9eeiKdLeGi9nm9\nwJeWfoPx9WP5wunX8H+PbOIvL/oaZnACiFVq3g5Db2m+l889bdO89VLv2B/9YS3nzxnPFRccf1B9\nfOCZ7by8c5Dr3z/b+1DveXwz2ZxGLqWBAiDxg3tXsdkpSPCpfzyJyWMasQhr3pWlI7tdf6/zcXUP\nFkKBa3nTn4SeXtPFI8v3ceOHTvMi1otGkUw5S4MzET350h4ef3EPLQ0Jpk9soXNyteb98PKdrHi5\nhxs/dNorWiFcYUOpMJs//uJu9vUXkFI54pM2cHfXBo4ZfQ0dHdNC5xfMvC9WKz7hqopMMbGXb6y4\nN3R8vkLzfu7lbh7ZvAncVNWySV+2xH/+5iVQyySmr6Bo+WSlxk1PAHIzuAXJ+4k1W2nMdHHh3An8\n/qmtDA6VGTfL13SDxPenZ3aI8+MlbC2JpOrsGxxC003iMSUsSOL4CZ3+j++oZ3dvDqV1L08Ul1G2\nXQ1JRBkPFzRu+eXzDAyVuPyiseLWJBnLtugeGgScSHTJH3NbjyPFNJ7esI1ZiX5mTWmrGVQUFCCW\nb9hLYgZItoK2+VSSJz8VIkPN0kgACScVpmzF0S2DsqlVuUJsSwJLDWt5FRpfPpNAaRVBcXv6ZEa1\npukPHJM0WxlmK/Gpqyi91Iytpfz2jJj4B+zoHeCWX66gN1MiMcrC/WpiOLWoy4M8u24/v35sM23j\nstDqdlJEUvfkMlzzvb/S2ZRkz2CWeDNgim9GNtPY2GTKQ7SlWnjmZT8S37+voNbqru0W53em2wG4\nc/09nNwxq3a8i2yKNtzgMMdaUVpzJraewDZU1NE76Fk3FmjwrmFmOmlKNEFJBLt2NI9l3fYBVm7d\nBzEorT4Lu1SHjEJ/UVhrmuvj5J0xHNf/DrYMlGHKWi/4rqkuTlmvuIf++fR0OMl0VIP+vAZpMUZ9\nmRId44UrIK8XSODniT9ciDTvoxD//fs17O7Nc/+S7VX7smUhDe52tJYgcYP/UUE1eXuk5Ex++/rD\n2vdjL4i2Hti6iJuWfjNkuq3EH57ezoZdmdBktvi5Lp5d3+1V7zEsg1Vb+ymUDTI5jXXbhfZkOaTq\nkXeFGd9dJuVOYNv2DbFuh29CzZm+VnnnY2vZ11/w2ga8NchlS5DDLxdtpKsnx+qt/fz2iS0hs7mr\nzdz7xFZ27B9mYFhM/I/tepIvLf1GTbO6YYj+xlQ51Hd3PIPEuLXGWveiJTRZ24gJE51kkUooJOMK\nQx3LveMkXZBVIUBGmmFy2x/XMRQ0dct+pLrascf3IzqIxy3vOXmacUwTpk5bwlSKvLBR+JT/vHQn\nz6zd7yWPUWWVklXh/5QNpJiOrSWxTRXd0ti0W5hcS3p4vNpTbRhogM2oVnE/8amrKdhhbV+KldnX\nX2D7viGyeY2N+4VmO8FJbtJfEM9/zrQOkulAycfhZmxLwlIK/OB3Ivgp6Au1ikIjdCO1g+M1Sj8Z\nu5wS5tsa5OvmsVY1oRWu69/gCUFuxrPyunnYhhr2V1eQt2vilhQD07LpaEqScuSQs8fOo8mY6B3b\n0C76Kak6tiWDrXiad1e+i+37hsgVdU8rbUk0M8Y6EXO4ha58F89u3k6uqLN70DeB1xtCECrbBbbu\nzrJsXbfXx6ljBOm675rrLtm63/fne/0P3CMO8br7zh53BhMbxmNjM1QerhKgZEMoIak5j3uBeZKq\nYVsSdrEejDj67uORZBu5LksqoVK2xHc0a2In582ayrFNk9kwuJlp08XzH8yL91wkh5EYk5jAgN6L\nlMgzujXtPceGRJpPvWc2tiV5yk1bU9IXnB3yTlvtTFBO8PqWLfrfaaGkM7vzJE5sn0FnXTtHAhF5\nH4UYcgJC6pPVfiP7gCUvCJnN3UxIru9VqTRlSbVNzot2Pk5faaAqorkWatbudYSDsiHuo9NZu+ul\nAK2INh9Z8xb34i6BclG2AxOxc7/BW3Ozk2mmVm1Wl02yAZPycCk8ybhc/IctDzJQGqyZdcowLZAN\nCsmuqr7Hj3+exPTnvd+1AuLKtiBDu5T2+pSIKSiyhGwEAn1KQrovB8hINyykZA65MbBGOKC9h9Jo\n6qItJRYgb9MCbKRYmeZEI7aeQIqXqtJn7sntJ6HEmdQwAc0uhd6Vjsni2tZwC5gKyCb5ongPKrN8\n1cXS4n1QjFCZSxcJSbwbUkwjmw8kRTHFxDyrbTqqpNDPDuIxiX+7ZBbHT/K1HqtUJywA8ZJnBXH9\ntbGhiRyfOA2A/ny1sCNZCiCBWaE5O/uTagJFlogPC3J9dt/zvrAqmyT0Nuxio/C31iB/M9sqtErX\nv+1sb29OolllpjQdw+XT30NCrqO8+RQAzjhFVC5D1T2N2y6lMTPtKI0DyM3i25Ad8rzm1I+RUJKY\nvULI2Ws6Firn2zH6RzPJnOe0GXgXnb5ceubxpBMqaOJdcZMDlZx3Ttt6EqUX345VrAuR97hRTh4B\nh/iS8RjHNYtqeHkjXxWVPaV9lD+8DQNCaFUMMGOeddF2+iCpGsm44rXxrxefQjKhcsGkc0UDDb1M\n7KzHkp3+OO/8xLiwcClt+xjVmvb6m5QTzD6uQ5i9nW1j2tJV1gNVkZkxfpTTB92L1bDNGLppM731\nOD520j8TV0Yu9XsoEZH3UQg3pWFDuka6wVcIsAp+YO5yD9eXqCgyECAb+cDZygyrOjBDMzVRLEEO\ntx3KmuWQd8kh77FO3WOXICqXihkBn7duGV6gkrf8JlS9yaZsB5f4iD4G/es9gexfwSUvUqJA6rRH\nWbTjL962XLmCvKtyh9fI8mZaxI9byZ66p9haeDm0T2nuC/2u5VPXKGDbYDlZtSTFEOStSEiaX3fZ\ndLRGzfKfeX+5j8TMZUiq4S83CvrNAwFKdllMikrM94drugmqjiTbNMQbsEsppHiJTD6Q7U6y6Cn2\nMrZuNC1JQSZBa4LWuB3bkjB6JmBbigjGGhSknXeimG1TYczwOYK8Ee9lMmVWvXMtsrOkKFYmGxDS\nipYg77H1Yzix/QSM2BAtHWUkSfKIxb1HW0tCrOwJGG4msobisXTUif5nQwF8jvbs+DhtUw2Rr/ve\nJeQE8ZiCVaynM93OjuwuQd6ShSTbWIZ/vhCgrND5Vq4Fu9jgkYvSIN7rlqYYNjZJ1dHsFckjLtcq\nIyl6IChNwth/DBAonOFcI6UmUWQJa0jYyPP0e+MNYPaOp0FtAFsKPUM1bnrnx2Iylkve5QyWbVGS\nRTu2HgdkQdJObAKAGjP8sUMQn/usl+xZ7uVkd3FSu59tJnHcS6RmPYuk6khmjNaGROBaQEwnHlMo\nODEPKUX0rTMlNN5sOYuqytiyLtwWtqC5MbFjkWwFdew28vWbkRQD25KIqWIcG2PNSIkCclOvqG8e\n8Hm7z8H1a8tNfdhjnRUThloVVHskEJH3UYxaUeFBM26tJQth8hbHFsoOwcmSZ+6CEaIxAyjVIJ4l\ne5fzu81/Iu6UKHQTHlQm+we8oLr6VIzm+rgXqeznwq4OWOst9PmEqRiA7ZVelOoyJE56GjsogDj3\nIAX81K7mDWHylOurE9G4ZnMXlZp0Lf+pYfpJNoaMAye3qWV216WiiLB2J+eA5m1LgQxteYe8bf8e\n9pZ3ICkmetfxGN2TgLDmHXz+linGRFZNz/pSMjSSJz0FQL1aj1VOIUli+ZUXSZ7MY9kWY+pGe+lb\ng9HphprHLtWBkRBaqwQ9Q4Jsi6azpGr/MSQK47wJXU6UeEL/XxLTnwtZB1plx7edyntCK0DRsa40\nJxqZ2ijiMFIteeceAuRddDRvyRcw3ICidCzFqAZB3iUr8Jwd8rZ0550xYk6kcfC9EwlyknGFsm7S\nnmwjbxTIlYre+YYhuwMi/ne/rQpSsDVBCLGJGyFWoqlRXNclJUWWQBcEVrByoh+q7hEj+OlTvWVy\nAdO+KsvYWgoZGSvmm91BmHx1A5JyGjk9jNK2l8TMZ5AdQSKlJokpMmbJ1byz/HHrwxjNwuftWg2E\ni8dGaXfW5scCPnkH7rNetm8Fd738W4KYN2Yu7516iX8/ySFQdFRJCEiiLdcXrpGIyRSMIkkl4WXO\na3LqqWfKQyKHhaO5e5kiTJVYYTSSbLFOe1ospzNVYoo4/8KxFyFJoI7ewdi2tDf/ueMcU2XqnMC7\n2Litfl/N2EEVHDrUiMj7KEN/IUPylCeQW/bXXCccJCOj1gtVg7xdYrVtO+QTr6V5u/5qqC7WAHjL\nJJTGAZANr22fvG1PA3LN5om4Qkdziv6hEoZp+ZHyjoYeLIPZEyBeSRZtuZp3/NhVyE7mLjcgxtMw\nAm0EyTtoqQhOhi4KevgeNcMK+fprJWUIErxsB9s8sLAFIg7AUHJYpTS25ZyrGCTiCrIsYztadHnT\nqZ42plt+H12t08o3CpO1c76LEHk7yT0kxcS0bAzTIqsNeoFCti152rmUKLK/wmffnGyixZkw3XSu\nSBaWpHt+WDdCtzebc8bL0byNGLph0eFoS7HxIpuZXJ8N9bfDnoptg9wUWAoGDNvid0uiBVsT10qk\nxHlFowSWjLZtltBuXXOrI2C4qTjr4knGNAvhQx29E6VjlzceAGXNyXtQSiPJlne+uz+pCGIp6xYt\nSeH3HigP+uTtnO8SnEsGleZYs38Mck6YY+Vkgfp6cZ6vecvYegJsWJ9ZizJqJ5IEiuW7GWwthW0H\nnoOsE1fiKLLiLAGVSMuNSIkCx09o8oPLzBjDeY3ZTWcgqQbxY1cj1w1jJ7NOH5IidqMk+rJjaCeP\n7XrSfxCGK4CIMY5PWRsao+A35Uac10JKTXJK58zQNkm2ScpJLzmPa2lQO/YwNGoJBb1ISvXT5SbV\nBEklSaacJaZIQrM2VRJxcb5hWpT2jQ9dwzZjnvvw2JaJIj4hVqalIeG9h+51Fdm3HoTa0OO159rD\njIi8jzI8vnOZSBRw3Es10xAGyeBHf1hbtT84eWu2+LusmTy8fKfIchUMOlGq28+X/PaD5kkXwQpS\nUsqv4OOlHJRsQbrgVeJJxAR52zbc9sd1dGcdE6ZD8oWSwc/+tI7t+4ZYvlVIvHaAmAbcDGSBicI1\nobkfYFk3uOuRDdz57OPszPjpX3/1WKAkZg0ff7DYA8AdD7/MMxt2+PsDWt7zG3q4/+ltXoY1gBUv\nB7JG1bBkDBULWJbN/zz0Mt/9zUrW7u0CyRZBOs49SorQvFVZwpZ1UnIdVqbTm1SCfmQ3iMc2Yz75\nB4Uwl0D2noDpaeZim6abFAMa6JT6qb5Glyiw28ls5b5DxbzM48sdM6yreTt+U9fE6fZxy0AXS9fu\n8zVcI0ZXT47lS2JYxXqkej/gUJIgZTfz2TmfADOOlWsRVhGnbbmhnyFpPzNaj6cp0YBWEtey4jl+\nt/lPDJYzKHoDZt94QAqR992PbWbzfiG8NSTqGN/a4l03Ptl5FxzXgusxsYvChy4lnVgKR4BKqkkS\nMZmybnoWiKyWDWhsYdLxfMoBUhjVmgYkFIe8pXiJtMNHSVX0W5Yc06/zPONOXeykEqg1YMvCwuCQ\ntyXrYc0dyA6oSDGd3WN+i5zyg62GCjpnj52HbVbTQVJJEFcV8jlQibNreE9ovyuY6Hum+hslv5JY\ncL16LeKb0Xo8lxx7MZIk0ZRoqNqfjqX9zHqm6iXsKSf3M1jOkI6F6y00JxrZm9/PQMOLYi4zVS+W\n4p7Ht1AeaGFy9p3e8dZwi5dpsi6pYhtxJFVHVWTf8uhp3lLoHqxcE+WNp4KexDBtNnVl+PkD6w95\nIaeREJH3UQYvwtGSR9C8fXJdvbWvan9wOYcZINpHV3RhmFY4KrZm1R0/GKyW5h3URCXVr9Lk19AN\nrNX1yFtm1mThk3txU6+fwcohnadW7eXZ9d3c8svneXHnDsDRLBHLSTzLgeFrIp3pDq8PAC9s7OXJ\nrmdZXlgUShm5bmcgM1egb66/2fXLu9jTm+dXj/vJN4JLjH58/1r+9MyOUCYwSxJ/nzdnfDga10Hf\ncJ6+TJElq/exbscgK7YJ4cQq1vtai2z4ZnPZICY596m7+Zb9PnqWF1MNkH+15l3YO9Zr31svr1ue\nyVnfNY2J6SkhzXt3r/PsHRLq2qvRtdsJ7nK1UvcenWdh9ApNRx23lb+8sMeL5G9K1lPWTdZvz2L2\nj64al3pzFJObJqEbFtZwC5IEckoIdW5ynrdPOFvcS05MY7uNjSLeApDxScMn7yKPPt/FcKmIbcP0\n8e00pfzJ2DYVLjhtgvfeubGK9YrQqpV0nqb6uDdeKVVohZpmehaIgVLGF5Zcn7kTeCincsSOXYXa\nIVZttNbVMXl0g9NH8b6NHS15edBd8vVQIfx1NjaGfttaCjlRIjZlFSYaKZf8HfK2HdO7jS0sHDbU\nxVNccf5xjG2vJ2ZWL29SZMVZlSJR6vOjqMubT0HvOs57zu3pZibEnWWkqk5RFXPP5I4OLxVqXQ3N\n+x+mLOSCSW8T/ayxfGx0U6OnOYPvv3aRVivJ2zGdpzYiyRa2odJQkRBmzqQp3t9mpsPLQJlMqCTk\nJHJcFwKPa4Gq4bcHMHomYGU7aUzHMEyLp1ftZdm6/fRnj0x+84i8jzJ4ZGHEvIQQQYQCoCo0ye99\n8kzGjvJfZMM2mDymkUmjGiiUDQzDqliPWi0cZEv+MqNaPu9g6khJMTzy9uoDy7XIW+GMmaOZM80h\nXOe6biajYPUeOT0szLmOyTc4oU0d3eH9PaqCvLsHi6GsX36DZu2/HXPgLv1lnt+/MnxOILCndi7j\ngHnc6d+0Cc186rJpVUdqZjkkhA3oTgnIYr0n8UuqCNBRFAkUHQXXzxj0AYoJzrUE2IbqTToN9YHP\nXNHF0idLEe3bEpYsyLikGb7mbsQo66YnxMjJgle8xBUWTC0WIka3L+75AP/90X9gUuMElPpBhu0e\n9sbEWH78nbO5/v2zAbDyTVXjIluOS8AwPdLxVg441291TNXZrF1V79pSfWuEbz1wBQwDhRjzZo5B\nkiTU3aeKZUKKiT12LRPGiOuVyqKm9LX/cBYA8+c1M3/2uEAwWIJETMEGGmMueQ/6AW+u5u1o7kpr\nN2rbPuQ6IYTc8E9v83IPWI5PefrxKe+7cjVvL6eMHo7GnzVugvf3tz8+jwmdghzV9n0YUtkjb1fz\ndp+Vi3Qsxa1Xn8Ox45qIqTIzx4r2bEOl05jBW8ecDvhLSs0+EX9QrzZgDY5G6fdzPnz742+lNS2+\nydiY7QzYuzm+ZSpffN+5/MvFM4BqzbshVs/ExrAZ28qFBZJxLS2hnPZSxZyUrqHNB2FrqVBFwpnH\ntPD2U8czPjlZXG+oDdW5P1mSOG5MBzYW31/zQz/4L0DeDTFfwDH7x5KMKzTVJzBMS9R4l6Ct6dBU\nX3wlROR9lMElC3dyrUTIh1rxosdUORSJbdg6MVUmnVTRdKeggHJgzXsoQN7lGpp3KFtWgLx9zdvv\nk0fejmTtfaTudR3ydgPzpGQOuW4IK9vmE1egv2ogAdKYOmfpiUMm/dlSyKzuJVEIEHZQcLED2ZTu\nWH936B6DwVmVZnWlfTdKu798zG1TUYTJOwjbktCscMrUYVMEuNmlOo+0pFiZZFxBloVA4+ZxxlKw\nLRkppnnpJstWwIXg3INhB56pE8ErGEEiIdWhSeKZarqFZrmFMWLifdATyCjIyYDP2yFRvaSCGcM2\nFc9c6xKrazaPqTJtyRaQID9KVGhS+o9lcvNEjxRcK0pojB0TsW5YHmkpDYNI8aInILgTaX8mXPEL\nwFQC5K255F1EqsuKwCzbJ8J0cRJG9zEAPLN/GT3SJm98VUVmVLodWZJZ2buGPnmrFxOQiiW9dzet\nOMVB9Kz/jjvjbzlL+jwyQBBZc6LJS0lslcWzzpQy3mqKSh+xvP2tnNJxovd7Zsdx3t8xVWFWy6zQ\n8UmPvMU4G/smc0LsLG99eqXw3ZRwnoNkc4w1jytnvFcc5xatGWrj7JaFvHvM+wHoqCCphrgjPIze\nSVxOcOnUd4b2B8n7golv4wunX0MlyhtOp7R2nve7M91BIjYyTbkCigvTDs95drE+RN5u8NtFoy6l\n+OLbwYyhBoJZ61RxDz3FXuRkwUmyI85RFZn6eB0fPuFyJmbeAbZMQzqGqsjohk1vtkRrQ7KqENTh\nQkTebzDolsEL3atGXPJVMv3JtbbPO1A4voJ8Y6rirSEGQLZEBKVTYSlb0MKm3Rqad1+gwENNzTto\nNlcM8k4ke9Eh76CJ17CdJTfOB5WIK0h1Wc8n7loO3OVZSpvwVZt943yTslJtogY/o5N7P2U9LJgk\nqHP6GPQHB9ZDl/2JqXISDd5DZWrP+JS1xKesCbTpkLcsoVNhTrNUdFsLZR3TLFdzjnvFFKR4mURM\n8QOeLH+JkK3HQfXJW7c1Z3mM4pnNTcIJQkLEJTWiSQWQRIpUL3LdUB3BSyItNSKlh4jPWYSUzHkC\nUbkozKlWoQEplUNp24OccM+Pe/fdFHdIIZHHzHQwpnwasiT7BXMMv7b4jBZhnVBNJ5LesDxBQB29\nk+QpTwrLhy15/s7eTMl7Z+aOOhWAMaXT/HE2Y9iGitLc65fRDFilEjEFK+dr/xa+5qwqMnElzsJj\nzmNYy/F88REUZy11OuYHU6WoR5UUitKQJxDalkIqoYAR9zK9uVAlX5sDMHSFhBJnsJxlq5NCdErT\nJIKQyg28a8qF3u+JjX5lrrgqc+boed56cPDJ0hUQsFSmpWbzDqeNyY1+8hcARXIEViksCfnr6yVa\n9anETfE8O5rDxOmSN8C5o89hQsPY0H41UBP+H45d6AsLQVgqdqGJy6dcwTWzP8bcUbOrqskFMXfU\n7NDv9x1/ifftg1jn71aQA0g6wlZSjXvvnRog2/p4WJMXFj4xfm5g2+mjT0XVxfuSTsaIKRKGKQJn\nK8fkcCIi7zcYFu94nP9Z93/cv/Xhmvu9IDFbqql5B3OaV5KvLNuUrTC5xlWZdDIG2BhNO/xoVaiK\nNpcb+1jc/cfqvgQQIrMKn7c6ertXHxfAQiz18j5OtRSuUSyHydsNGDKHW3yTskNoqiJj4pN3a7KV\nhJIIpYN1NSZzqIVOyzFhBwQc19xp5ZrQd87wtlea+4Jthgs01Fiap7h542VPsDGHm/nQ8R/ANhVM\nWw/lHNesssiFbckhzTsekz1BwF0/DIARR1I16p01/yaaFyTkBqwFyRtVJyb5ZFkvi0lIbhhk7eAa\nj7xtM0beqaLVoDoR5bKN2tnlkVCx4JiF841IEsSPXYM6QQRTeZYRSQpN0ma2jQ4nKU88oFG17lnI\nl97yWT4y/QOUN84hVRJJRXTDCsUyiPHQUOwEsiRjWlaoZOqxzZP477d/i3GcGDonKIyBsxzPQTyu\nYA2OQt80h45Um3+Q5QsYFx9zPh+Y8T5/V66JZEz1a0obNu2pdqz4MI0Nzn2ZKumEeBZuJjcXHzxB\ntOUSgmHatCRb6C8OsGlwK82JJi8ILvhadaTamdQwgYXHnIcs++MXU2WSCVUkxnFwaufJ4hoBzTKm\nysyfcDafnfMJPjDjn0J9mtYqgs7M3rApOzPsv++9mZIXhNpWURmsMemblMc3d3IgjFQO2MVJ7TM4\nrmUKkiQRj/vvu7Z9JlY5yWzlHXz5LZ9leutxofPG1o/m+jmf8n7bxbqQ5u0+r2CKYzVQddHVvF2Y\nw63e30GN2nUDphNqiPzbm4+MyRwi8j6ieOz5Lrp6wqkphwoaDyz1g5y2O8kLdgyJZSuPrOjinsc3\ne8uhvCQNstCUlq3bz12PbOS3T2xhqKCFfd4V5Js3CuGkIgHNW27qJT55HWpnIA96KEDGJhYo4wjV\nAWuWbYfK5EmKEYo2V8eJ7E5mthUz60ySkuV9nHvl1aH2RE1rvw61p7kYcZ/YHD92XVL1lr5JG9/G\njq4yacXPmAR45KdvO9ExHVdq3k7U9daTwIxTeukcZDPBkBZ+ZkENasPgZm596o/c+9ctxBM1stsF\nNG+3kIax5zjmjj0RLAXN0vnLCr82s47uCCaSuE9bglhZWCVc4UMPrO/V40iK5UUoIxu+VcJdR+ya\n6yUTSbZIyP6k26AKv3HsmHX8pe8BhmNOX4yY9+wanWPASTiiashWjGLJoqUhUdNnbet+bEWQvO1S\n2iPvYKnaJI2MruskEVOxsh1YTlSxHtC8XcjJAoolnv/9T2/HtGxithCwOlLtSJJUpa25goxVrMPM\ntnGccoa3TxwrYWY7mBYkA0vxtFZJkpg35jRa4+K9NbonElMV7zovbupF1uqRFJN0Y9k737VqeX57\nQNtyMjNahb/YNWnbNqT0jnwAACAASURBVExrOZaSWaZgFJnaPLlm8Q5FVvjc3E/xrikLKrZLwrxs\nJLC1BAoqJ7YLAVRRwiQPMLlpkhfU6eLE9hOIbTsXfdf00PZgVbvebJGCM1e11CdCxzUHyLsj3Uot\n/MeZX+Rrb72h5r4g4oHnlwz8bfZOoLzqbYxPTmZUXW0BIRiBbpdTNc3mSlCgCZJ3haBuF/0I+CDJ\nu0pJXSoW2t4RkfffH/b05vj1Y5u56X+eC23/5cMb+MNT2/ijk6fc9dmokkJfpshv/rKZxc+JZTa6\nqfumV4e871y0kcdf3MOi5bt4fkNPyOddGdzh1om2yk6QkWx6Pu9ahelD/uBE0VtD7aLSbL5++wAl\no+RPtorOcN5J0lLWQRITsbb5VH8Nsmx6H+cwvdiWRGnVOZhZ5+OXA+StaiKBhy37EbxOn+rTMXRL\nx9bjFLJJfvC71aTUdCi5hr+EJ4ahy1X36Js7nWIMRh2y1iisCZIFWMSnP4fS0uMtWQHYYDzDw8/u\n8pa+ubBtKeTzdjNC2UYMWZbEGnDZ4PHnffK2EMk3Jo6qByTQ48LnHTCbG5r/2bpaaSJtihSwiu6T\ntvMcOjqcNe+uoBPQLprjzc44OlYBxe+jm9K0PuaTr6Tqjt88TqFk0NaYrE3eAW3ZM5sjfPluLedY\nYFJWHRLz/LPOulndsJDNMEkAyGaSkmbw4DIh7F7UcQVXTv8nprUI7TFoKhX37rgjSmm0jXOZnvTN\n6u77Z9swJu2n6cSWQxM7wPunXIm2YwZm/1hiqkzKuc79T29n5y4np3Z6nTjdUkgnVc49ZayXZAXC\na59PmSpMvP9w5jGcMdrv07njzwx0vur2PbQ42cckSfJIpLT2TK4c93FPuw1mF4yrI5ugAS47ay7Y\nMmec4I/DhXP9wLjeTNEjrvGdgqynTxTvUFOAvNuStcm7OdHkrYmvhfEd4t0MCl+1zOYH8oMDTJFP\nQ983GZBFeteKtoKat+dWIOxDNzPtmAP+OAQ17wWnC5fD204ZGyJvNxvckUBUVewIoViuvfZv/4CY\nLF3Tn0veiqzSE8gnPVzQ6Q/UL0a2KGtmyHReLBuUpaDmHSbkYUeDtMtpSJRANompCnXJcO7lifHj\n2aVtCpO/G6S07xhGG7Pon/DnqoC1fFlDUkyxbjemISkG/UMlLMumZJSRZJvpbZP5+GfO56tPbKef\nHpAt74PSEZnF7HLaXx8qW77ZPKb564fLKSRkSAhLREdTim5LCwWaqXZSRKzLplgj6yWmUCmXbVER\nyCHsKWMbSU2qZ1sOT7CoS6pCY0oBqoYkWSL5DAifbkX0uhtjEJfjDL94BvETlgc0b9kb/1s+IqKX\nZTuGpYhc4u4MLSkGthHnuHHNfPby2Vz/2FKkZE5MHE77Wjkwm7sJJGI6LQ0xioqF5Wb0slQScoJU\nyuCmj8zllj8+AAjTopsfqjXRChUp6t1o9JyjeadUn4ileAlUDb2QwrJt0kmVn3ziHfz48TgbSi+i\nNGRoUBspBsgqpHlrqZqatxfxK0tIkh+kqBsWsRqEI5kJT7iYPKaBK+efSl+fbyFprwim0ndNI3Hc\nS+h7BbknAqbYoJY3OqTNSSGTKMC4pk7MHuGLjqkyHQHTsV1hGscU39aHFkyjdetuFu0Sgkaw1vak\n0Q386NpzhGUFeNv4M2lPtYX93QcoV/Ctj83zAh49Td2Ih4g0SE6V91OJd501hVMmt4a01ffNn8q7\nz5rMt3+9kr39ec+d0tqQ4NZrziYVF8cGY0Nqrek+GHz5I3PRdCtErkGzubftAH5wgOPUuazrEgpR\nkLxdn/dImncwT4W2KRA3QVggPPeUsZw+YxTppMpTq/wA1WT8yFFqpHkfIVg1UpUCVaYxw3KLhMih\nYhCFkkFf0c/JLSt+SktX8ivr1oE1b6fghuf/U0zH5+1XParPT+WczvnO/mCqVD/pQn3MCc6p8Hkv\nyQo/va0lhd9WFVWSBoZLXtnIpkQ9qiLTXu8kvohp3sdZtovexGa7NZklV/O2QdWwveAmmTq5Ednx\ng7c3J0WQn+l/1JYernYkMi4JE2nZ4V3h47dJxhUM1zfsCACphIrlraUuh0zwthFnsiYKIXjJLZzx\nPnPs6SiWWKftEroiS2TKQ0hIjKpvce6gVhIVE0yVeFymPhUThUEUC1s2sJVgoJjTDyeoTZcLtLW4\n5nKfHBpiDWS1Idqakshp8fyntvqaVGstDckQZnt3kp7deiqzW+eIcUgNI8m2l9WsLqkSjym0SZPR\nd8yEgQl8aMpVBNXFUGCSLdf0eYeIXJFFwiBElbRa0buSmfDM+lPGNFV9R5WBQ9bgaN7ffg22YyUI\nam5BIvdWKQT6EkRdYAKPq3LIxxn0N4MTsJZUkSSJ9nTAOmGE1x2nEiqyJCFJEv90/Lt5+4Szqu53\nJKiKHCLaWvcUJKr4K5C3JElV7cnOto7mJLphsddZMphOxqhLxjyiDa7jrmXyPxioilxlNamteR+Y\nvINCyiuZzYPHnth+AnVqmium/2NVm3WBQlCSJHn9DL67ifiRo9SIvI8QauUZr7Xd07wlhd6MT475\nUrXm7aIuJV4iTTdDRSrCmrfN5sw28Zdjcg6bzZ3lN6UpXuIKKWg293Ihq6QSKkk1GdK8dctgW2GD\nc7AFZswj/L5Myat85Urnk5pEQJJclyURU9AtQ0RKuyZ3JxmDu9YbVRdm4YD/s0FpEfV3FZ2OppQg\n74DmrZXcQLhAZivHZFly5CK1bT9Kx26x3MPSPHJXFYlEXMEoOwJATAv5usHGGhjDhORkp9604Res\nUBNiQgsUtFBkiaw2RH28zsvFbLqme1dIkiyRWMJUfFOuQ85lCuiKIF+9GCAMxyeXNftobnL8pwGz\nbGO8kbxeYG+xy8vHPGOUr9U1JeqIyeIeZdstpOFkbnPMo+lEgg/NfC9WOemZ122nIlk66QpbNnax\nAXXvKbSkwlHESSXBuPix6LunoioyTfV+JLoLNaAdKrLkFXoQmncN8jYSnvm2crKH2r7HukCyjrBZ\n1m+/MR7O8hXsl/gd9h8Hr2NraYrPBXzRAZ93Q9zXhG0zTN6viFfBg0HNVKkIbHu1cO91pxO3U1dJ\nskqck9pnVvnjXytqEXWlUHWg/alEtQl+pIC1hng93z7nZs4c+5aqNmu9ZxAm/1cSKg4lIvI+QhiB\nu6vglsNUZYW+rK9590gb+O2m+/0DA8TqlgYtaWaIUIOat9zcy7J9ItLbKjnLpGJlYorsmM2dNddy\niua0Y/JSqoO9MEVO6sZ4AwOlQTQnA1le9zOvGb0TsE0VJSau35sp8v/Yu/P4qMqzf/yfs81MJpls\nkAAJ+yabICgo4i5Qt69WWxUXcKlaRVu1daFUpbUPuFT9Wbva1trqQ12hllddeLpp1YLWlcUVtAjI\nkkD2zHaW3x9nmXMmM5mQZCYZ5vP+h8xkZnLmJMx1rvu+7uuOWevL7eA9uXqMeVyl+/CrDx7Gqzv+\nbZ6npJ7Y/knrzKBmNUZxFy/ZhVRCoB0DyvxQDc0zbG533rKDrjkkbZ6rcLsrWJTvRWNwM3a173Iy\nd0kU4VckaBFX8PZUrsdR3xSBz96yUo45vxO/5IMkCOb6Z8mcKxdFc7cjuwMUAGiqfYFiPq+4OLGk\nx+nnbL3fiNaGmGiNnESCieYeVrOaFr0egZB1fK7gXR4wA+nKj58xHx8NYGBxYs4x4JfNoXMAJdoQ\nCLGg01TEzmz9PslsEqO5A5V5UWF/gNt75IiC0GGeWBAEnFJ9DtQvx6KqPODMwbqzMzkp8/YOm4uY\nV3YhYlumIbLpaMS3j4PSNNK5uEgOIgBQXtJx7tGdOaWbUxUEAQtGXYzohzM7HFcyRRZR2mFnP8HZ\nIcuI+Z2LG89FQYoe+r3NfUHiHjbvjeAdjWnmErqkQCUIAr459RKcMvLkbv+MVFLOb2e4oHFfdLmH\nsv0phs2TL9DSCaYY4QAS9RoAg/dByT1s/t6n9dANsxfuftd2llu/bEJMtTJcw0BdY9jJAPeXJ/aA\n1ttLrAIq8zXNDy8De/WtqI/sT/xQV/C1h5dlQYbeWG0uMSpuhqJ4h819QgClwSLo4SDEUKPzGu7M\ne2d9G0q1oYjpcTyx/jWs/2C3s7etumeY9fqJIri6pjBiVqFdsbWOcnRlrbmOdsBufNGyHau2/MU8\nUCt428PmghL3NOZwF0KVyFaLVCWC0lJ7eU7iP09Ts13oZm1VKCUqsdvaBGCH+SErVdShrug980nW\n/2NZMiuW7QsdqWq7N/OW4tjXHIEMa3jWmuMHzEzTzLytD3YljrgRQVxXUe4aQraXfCnDzKYgpSF7\nIwvZmUqwq5TbjVZExCYYutnD2mn5GPfDiPuwH19ik/oP8xQ0JqqI7S077SmX2JbEOmDAXPs/sMgM\n3pE2H9o3HO08xh42d9bhC4lhUfu47OBk/32bc9YdPwztAJuuGtcdJCVR8BSs+WQRgwODoe0fAqO9\nFOquMdBVn7MbXjDFvvbuzMo5Blfm7Q48/qQ51YmV46C3mFXlyRciboospXyvV0y5GPH35gGaL2Xm\nndziMxv8nmJAd/DufnAZ6JqKSHXBlC2ZsuxMz3FPz9gXAuky784Up/g7AwBZTrwWg/dByJ15P7Rq\nA155dyfuXvmOp9HK8sfeRn2zOTcc0+PY1xxFZWkAJQEFQsRd9GP9J7IztiIFUtUObCsyd/s5acDp\nAOBtB2oF4UvGLwIMEXpbGUR/GIZoNfiQzbaZPsmHoF+GVl8LQdQhVe72PB+agoaWKN57y/xD/vN7\nr+PXaz7Axm3m4+zgamgydMEMmvubo4gLVvC2Mm9REJ0Mz3OeUvTrdg9ZuzPvErnEeZ8lQSvwuTJv\nLe7q2Caaeyy7s57wl0M9VePun2suvZGgt1ZgQukkSKFGSFWJZXSxz6bCMIC4nZl7Mm+/uYey9f7E\nYDPaNHsLy0TWO7bYWspTuQcQVZSUJC5A7A+BgUHz8a/sfx5hcb815SGgusIOgmaTFA0xRIw2xHeM\nxfTBiTXqdvAGzPXtRpv5evaccNAvozpoBqq2Fsks7LOCS0u7+Tu3i3wG+BJroO3gbVfXjq01f85h\nYwc6owLuoFhZGoAAYGhVx9854B16lCUhkXlrZuadnDFqmp5YrpNuODPpQzlV4RLQ8QPXPUfaWYGX\nnbFVliay/AGl5haVMsy/02CK4J343XXN6CHm//3Dxg3M8MhERul+T6LYu5k3kH4IORvsn+WTRQyz\nKtyryjpvhuK+6PLMSSuJkbVU3+/KcSSTPXPeuQverDbPESczKWmAPPQTfLjTu7ymTdgLsbzOmeON\najFEYqq5jlY30KaJEACE6meiWbaWFllV1MGA7GzacP74ryKyx9zooXaIjMtPnAXdMPBKXQPW7/3M\nyXy11lKIZXUIi/sQ9I81s1PV3NtWFAV8e958/PKjTzF1qoh3/55ocFLiC6IZifWPdr/o3U1WW0+7\nGMfOOiUVMVWHjihEeCtSJ9YOxsdNiZaR5vPNDz17pACAOd/tt+daXQ1GrOB95IwifN76mfVzXWug\nnUYuGgTr/BiajOKAbA25CmZBmL9jsxnJNSw4o2w2Pmr+wNmJKbLhWHO/agDtLQJQYgV915y3JEWc\nJVRicTNaVfPnuzPvb596PJa/vBX75E8hKDFUlgWxwzpGe8574UmH4uebXI1rrO5XQwYU47yTxiIU\n9OGdLwdgXcM/UOoL4ZRDvo5h1SFcEDYb5OwT/us89fCRI3HWCWbryTsunYn9zebWh9VNZlCwh8JP\nnjEUf39nBzTdQHFAdoYd506agj98bG7KcsnJhyEkDMCU0WbWfszUIRhUUYTRNWaf7GWXzkSFK6hV\nlRfh9kuPwODK1FXInjlvSUQsrsEwDKfaPDljVDU9MSef5kP1vmuPxusbduGZl825fk+Rmiu4JQc0\nd5CXU2Tw9187B63huJN1L7t0JprazIs+ewcrO4ja2Zq7u9hti7xVzJnMnjIYA8sCGF2ToiNZkvsW\nH43G1ljSnHfXC9Y6M6DU3BfdMHIbvAM+GcsunYnykB+KJODLfe2oTXMRaHNfdCkpRlnSFay53X/t\nHLz18V488Tdzu9p0GXqqi4NcYPDOEbswTSzdB6m0Ac2tewAkrh63la6Fuyg3psUQi5vLqEQB2Cuo\nKJaLIDQMhVi1y1xcJOowAAQUGYJsZn2TB0zA3z7ZZ3bv8oedtZhavVWQ5jOvnu250jZhPwRBgCDH\nYaiJhgMTBtdC/FhE3JpntTPvimAJmvfHzbXWmuQUpe1vbzH/muxqcSdwxs0GNIpVze5aQlJRFAK8\nsdvJrGPbJiIweb35GnIMorVlpN6ayFxDivke3t3/Nt7d/7Z5pyvzdobQ5Rj8483vG9EihII+TB0z\nEOs273aCfak+BOMGV2P9f8zzJEuCk50FjUrobaUQi5s9xwgATc0wg7ccd4oI/ZIPoiA4PbvF4ia0\nqOZzy1xz3n6fhOpQGfaFzfqD8jLRXLalJ4bNq0PeavD4DrOJSHFAdrLYE8dNw4mY5nmcX5FQWQpU\nxkc79w0vH4RqK3sqtiqFAWDW4MPx7KsfQ9s/BJNHVngyQ3e2NW5AotBt+qihngsxURBwyPBEtfWI\nwR23dxw5OH3w6ThsbjhD58mZt98nQdONRJerNMOZpUGf50PeHdB8nmHljnP0smQeQ6oP9oqQ31lf\nDQChoA+hoLeRjP1+3BcCK+bcDkkUUaIcWMFa8rntTFmJH2VJ8/2pmrR0hyyJqAz5sa85mnYIOVvc\nf0/2KE9n5DRLwVIWrKW4QAPM3/PIFH/Hydw1BRw2Pwg5w+ZWT+WInmKHKxd7yZdfkcwPJ1GDIvoQ\njWuQkpYYybIASUlsU1jXGIERC6BdS6x7tVtzhvzmB669WUMM7TAMwwrePkSsHbxkUUaFvxx14X0Q\ny/dCHmAPi7u2WlQVZ/mUvduYMyft7GGsojUchVhsZuYhV+FOyrWg9rB7WzmiH1vLk5QoxFAj9EgR\nEHd1B/OluPp27fhlX0BIFXsgKHFIrYOh7jSDn7OUyPp9+EQ/Lp9yEcT95m5DdsEaYFZdq9aOSoYu\neLL7BmsBgFi6z5mXtzd+QDwAI+aHEGzGjlZzyL0maSlSib1LkRxDaYld7Z0YNncXOk0UToTeYI6q\ndDXzce+6lG7tbUD2I/blKECXMLC8KG27R3exXbHcvXW86aQqWItZ65cVyRu8Az4JqmZkHDYHktY4\np8mQUs2P2wVv7u1dD4Q9kuD+PZX5Qx365OdCb2XeQOJiLpeZd3d4Mu8U1eBdybzNx2U+X+6Lg1R/\nS9nC4J0jTsGatYGCs/uTLWlLQ7tHud9ndmkSJA2KYG4Dam9q4ARvSXSGtQNyAPWNYQhqAO1qO1Td\nvD+shiEKIoKKGbTsIdKI0WbuYiQYgKZ4CuiqigagOdYC//h3nPvawq41yZriFGm1WNXmRofMW0Wj\n8hnE4hZUxMd4AkiqYOLeYcoZQg81QJDjHdbRFrn28lWsYUnPPLp1DGKRtca8bZIzn+tklFa2LMIq\nHrP+I8qS4BS6tEfi0PbVmIFb9cFd6mq0l0JvLYNUXg+p2mxp65f8zsWa3h6C6I9g475NKJKLMCxU\n63kPIcVV+e/XneO2P2R8kmvNtpgYdTiQzOeiCeeiSC7C5AET0j4mZm0vW1bs82Qi7vXSgiDg4gnn\n4uyxp3d7HW86SoqlYnbzEZ8ieoJOQJGg6ZmHzYH0WZV7Pa6U4jF2Zt3Y0vlFdjr2h36uM9RUpG4U\nZ6VjX8wV+/v+fXXGezHoyox9nS8VS9aF2J2x8U22MHhnQTiqoqXduyuY0yXMyvTiRlJ3LtVbgOEE\nb8Xa9UtUAV1GLK5Bttbl2vPjih28NRkCBNQ1heEXzMBoN2Zpj4dRJAcgSaK5VCfuh2EIaFWb8ceP\nVgEAtP2DPB9WA4OuTRosEVenOMOqKPcrIiL2Bh3OnLcVOBUVMdnMumvh3bIwOXifNOxYs2DKZjVZ\nEcvMPa6T23C650EXTVqAG2dcA3XXqMTx6e75bxHlYiLrtT+c7WAfMMzXtocYJUl0/qO3RVRA9SH+\n3ymIb0/sya3IImCIiG01h6ztna38kh/2SgB7HXZYi2B8+egOGzKU+q3aASXmbCBiaHLK5TElUuLi\npegAMp+ja2bivuN+2GlbSltxkeL5MEquDp9dMxNzhx/f5Z/dZUnLxlQtfebt90nQNHPY3C4sTCdd\n5uS+v7Pg3dDazeCdIvPuK+5h855edOVL5q2kec+pMu/OCtbc2/Wm09MLou5i8M6C6x96Ddc/9Jrn\nPrt6Fk7w9gZ3Q9CgR4LmGlZRcTbZ8CsSSopkCJKOPfti5iYMgt061GroIgnOhhRtERXhqIZia3/h\npqg51xpWwwhamar5QWj2zd7eth0fNXyKQfIIaPW1GDwgEVA9OyxZhg8yX3dAqd8pShs62Oc0QnEy\nb6dtpwpVtIbsFe/8kXsI8VuHXdlh/99xQ8xWlfb/PfcmAYD3P+DQkhqMLR8FGCnmvGF1RLOqdodV\nlzgZROyzqYhvH4ehhrk8ys4AZVFwisbs4VmtvhbaPnP4fNSQUqdHtxEtcvrFA4Bf9jkdLY32xEhA\nqsy3LJAI3s7fhC55AtL/G30Kjhx8OAJS9pbq2PPcQyqDnkrsQTnaaMHdrEiWBOiG4azEUBTJO2yu\nmHPebRHVHJXqJCBJXVjDW2o1jXFn92NqzIu5IQO6N8ztVyQU+eU++2B3S3Vx0l2DrL+T0mJfhkf2\nrXS/d/v3ka63eTK7F7zYyd9YV9eJ97b+ffmUp+xCG8MwnA+WRPA2/9XQMXhDC0DdNQbV46LYGfkC\nsLbLnDV5IF54C04wcipXreCtSCIgxWFEfE5L1XJ/KRoANESbMArmnLddLFUe8mPP/nZz/tgXRZEc\nwHdnX463yxow3bUcZXz5mMTxqTK+MuJknHTUZLz7aT3iqo6nt5hrzysrJWyPxc1kU1NwzNQhEMsF\n/CeyCZKswfBFYBhCh0zbfXt8xRgIgoAbzp0Gnyxi9/52zJo4CDe/9ifnfert3jluWRKwaOL52NL4\neYcLjbISH5paDRiGGfyrS8px+lEjURr0YfaUwSgOKLj27EPx8z9thLprDIQae7g8kXnbRU32nuTj\nhpbhzGNGYV9TBDPGV+FnqzbA3GFcgN40EKK1I5tn2Nx1wTFj0FQkq7CaqMiDvsCGfebPOe+YyZ6i\no1NGmu1qX3rjC+e+AaW9u2/wLRdMx0dfNGDK6AGIxTV8/YQxB1Qo1VPupZR2sLH/litCfs8oix3I\nW9oT+5ink/yhe+uF0xGNe7OpMTVluPTUCc4GGwBw8hFD4fdJmDHeu/NWV10wd5wzrN/XejN4H35I\nFRbOH4+jJg/utdfMhuRs+vuLDkdTa+Iz1/130dn5GTE4hEtPnYBDhqcfteqrCzQG7yxStcSmCppm\nz3mbHxya4A3eEPREYxIkdtzy+yRnq0l7DbOdeTt7RUsCdCEOQwtiZ521UUdxJT4PA/sjDeZuZLrq\nZN5V5QEzeFsXEjXFQ1CsFOG4ad4sa3jpUAwOVmN3+15EP5yFY4+ag1DQh+Om1WD9B4lK7WDQgICw\ntbm9gLOPHY09cRn/ec8cNheUKBD3IVDi/aAt9lQrm+996hgzCE8YYfX/1v3QxXarUKxjRe+RVYfj\nyCGHdzj35SV+8z+rIQCCgepQGfyKhLlHJPp6H35IFYJ+Ge1R1cmU7Yst93CsnXkfOnoAJo9MVH+7\nq5zj28cDhoihlRVQRNmpcTDCJdAjRThhzHTPHL1znEWJC5KdrealwNETRnV4HODNEFJ1EOuJytIA\njp4yBIBZiX3aUSMyPKN3uTNve5jX3rSnqjzgyYrt77dFVFRXdF44l5xVpbsYOW5ajee2KAgd7jsQ\n44ZmnqLIld4M3pIo4sQZQzM/sI8lz0PbIympZJpKyPR30JWitmzo+zGdg5j7Cl/Tra+tYXNnj2Xz\nljlfavfztpc7WTtuOXt0W8HSZ+/HbDdOkTSn4GxHnVn1XVtqZtANkUa0W/PRRdY+t1X2jkuy+bo1\nJemvom8+4jpEP5wJI1zq3bQ+oCSGyJUwRH8EmpUZK7LobK0nyCoEXxRGzO/Zlxfo2s5DJa3mHLPe\n2DED6uxDKRS06wLMoFDiSz386QzJG4bntuyZ844797l55v00H+LbJmGYPsN6Qet+Q0R0w3E4b/xZ\nKX9+kc/nFA8CZuFdukpud/FVLqtac8Gdedvnede+xI5x7mFz9+890/RBbwaufCX1g6H7XMvlUHa6\nTaeyrfB+qzlkL7sCEsPmguBu2WmxN9+wMm87wxZE1QreiblQAAiIAc9rGFYWb2gydlrBe0SlOV+8\nP9Jo7kcNIGgFVLvNYeyzQzEiNAynjpyb9j0E5IDTKtL9HyIYkJ3g/Z+ItZtYuGPwhq/dXI8eD3To\nhdyV4F0ePgSRjXMQ+++UDt/r7EMped/idEt07Kvu5P9/dntUIJF5JweCVEU79lW49+VStwwFzGVP\n0Q+OcgJ4mb/jDlm2rhTP5Bv7nRquM2af50TmXeQ59+7fe6bCqT76XO1XCvECJpdD2U5ilmMcNu9F\numF45lI8mbfmLVhzb7cJwargtoKz0SHztrqLWXPefsneDMMM3ppgXQioMnZY2/UNG1AJn6hgQ/1m\nJxjYa4ZDRebws948ELfMPK/L7y8580bS7kh24xdFFlFkWM1gfFZns5i/Q1WwLMo4YegcDAqmn1eU\nZbFDoZqts0KT5Cvv9MHb/Dd52FwUEsHbnitLfs1Uy4DsD8p0u8h1PE4RRqzIXG5WuRdiJzsu2Mv4\netJoo78RBAGGYSRl3lbw3tcOvyIhFFSSNjFxZ96dz3l39fdwMGPwzi7nsz3HDp5PgT62e387rrjn\nn/j7267+1/HEsqrkgjVB6ph5G9awuW7vNuWLwO9zZ97mtZZTdWy9hu7KvJvaYigt9iHgk+GTzCD9\nft0mAMCospEA9OeBHQAAIABJREFUgPJQ9ypFk5tcOHtuW/RwCLIkmPv/Wpm37rOat8T9Kfv+njv+\nLBw39Oi0P7OzZRzJnbHcyoq9c8IBOXWBlz13XGQdWyITTPS/brcadSRn+ikzbyl1Jp+JfcEW19MX\nOdmvOXxQ560h84m9JMtd4S675rQHlgc6jES4f++ZMu+DbXqhOwrxHOTygqWvLqaZefeStz7aCwBY\n+ddPnPvcm444QyuiO/M2AAiJPautYFgSrwX8m6AM+wQ++WRnztvOvINyEIh3zLxrKspR7h/gVMi2\nurbpBIDRZWYR0qSRlTh99ghMH9e1Stprzz4UX9a3ej4EKkJ+nHroNLypbcWYkrF4+8P9MNpKofgT\nFfHmkjd7NzJft1oHdjZ3lep7t1wwHe9vrcfXjh8DUQT+ZT9WTP2zrz17Cl5Y/wVOn20VaLlesqqi\nCKOGhPD5LnP0IPkDIdV8qzNsbkVaWRJxwdxxad8DAFx51hSsa9iBrZFdnuHjZGcdMwqqpuOsY1IX\ntOWj75w/Df/3n+04+fBEEdSx02rQ0h6Hbhg4ekqiHuPCueMQ8MnY+mWip26m4D24MohTjhyOKaMq\nO33cwcyvSDhzzshO29MebIr8svmeh6R/zxfPH9/lTUk6M2N8FU6cXotjpw3p8WsdCAbvA7S18b+o\nC9fjqCHezQXswCYEWiGW1UPbMwLRlJm3GagF0XA2FnGG0q3g7YsOQoV/KBqKdwBSvMOcd5EcgBAX\nnNakLaq5DehXpo/F7JpEj+sLJ3wNL3z+NzRGm5znAeaQ8NeOTywDy+TwQ6pw+CEdA/3Xjp6Mq6uO\nwsdb67B+rbkft/sqNCgH0BSzh/SVbgZvMem22XMaSD1sPmFEhVOpfv5J4/Avc5dMKGLq4dXqiiAu\nPTWx/lpAYthbFARcd85UfPfnr6c8lmCKLlPJAf74aTU4cXpth8e5nXncGEzYfiZ+uWEfFhxydtrH\nBQMyFn7lkLTfz0dDBhTjklO869/H1pbh21/vuKzOXimwbXeLc1+mYXNBEHDeiWN74Ujz21ePHZ35\nQQeZTO/5pF6qmpclsU/+XzJ4H6AH3vkFAGDW4Bmebln2XHdgqtmcJdJa4Q3emrdgDYCZfetyIhu3\nhs1jcQ2KHgREQBOirszbqjZXJBTFi9BqBe9P2z+CKIiYPND7ITin5kjMqTkSb+95DwNTNFzpLe4P\nUPeSnqASRFMssZtXd7bLSw6YPlmCqplDy501TrAdWzsbr+5ch9HWlEEmiepz89+yksQUQ3Kmn7pg\nzTts3tWGVhWBciyddWPXHlzg3Bdt7o0/iAoJ//K7SdVVZ04ZAJKnWARRSxo2TypYgznsbcQDEKwm\nJPYcciSmQdB9ZvAWI4iq1lIxe523LCIoF6FNaoXgb8OeyJeYWDnes4mF2+GDDuvRe83Ep4hmP2rd\n8GTe7mpyo7uZd9J8kt8nOXPQXWn1eN74s/DVMachIHdvXbS3mYP3WFIOm9tz3vbwd+FNN2ad5ClY\n40cYFSYWrHWTmlRY1CGQGELSsLm9zjuRedubeiSGzc3gFo1rEKyGJHEjirC1TttemuWTRQSVIkCO\nQRpgNvaYOWh6z99UNwmC4HyIuueQPOuVXZttHIjkbPdAd0USBfGAAnfyum+3mKp5bqfaijIx523/\nfEbv3uYtWOvfG2QQZQuDdzfFda3zBwg6/vi3T7Hps30AXJm36FoTaFecJw2bb9vdgi92mvPcUSOM\ntri53tVu0qLIEkqUIATRgFS1A7IgY2rV5J6/qR6wP0QVxTtsbjNUxbOTU1clD5tne79cZ847xfda\n2uOe26kL1rpXbU5dx8ybiMG725Izb7ufucMKyA88/T6AdMPmKgZVFGFghbWHtWvplb0dZtQIO3tx\n25m3IotOYBT9EQwtHppoitJHjpo0CANKAzh8fLVzX1BJtAOVoXSrjaA7eI8cHMKF88b37EAzSZEo\nf+/iGZgwvByzJ3v34lZkEbMmVnsKopKHzZl4976JIyowqDKISSMrUFHau21iifIFL1u7SdW9WVgs\nufuVNY9tf3inK1i7Y9FMPP3uK3izHYAuosgvIRzVnK0129VE8LaboiiyiGI90XQk5Ov7db9nHjMK\nZyYtYXIPm/vl7q0td+/zfPslR2R9swdnnbfr1zRuaDluuXBGx8cKAq4+y+z89vQ/twBwVZsbicdQ\n7xo3tBx3XXVUXx8GUZ9i5t1NquEdNjdbV7rms63MuzJkZsTJvc0Bs1GLTxFRVGT9GgzRqdy2s+zW\nWBva4+3wiT5nWN0niyj3JdYvhtIUqvU1d8FadyrNgUTBmiSaLUaz3Xwh0XGte+Pe9pJBnfVqRJRF\nWc28V6xYgffffx+CIGDp0qWYOjWxdnPlypVYs2YNRFHElClT8P3vfz+bh9LrkofN46ruZNsAnK8H\nWMN6qea8BVmFJIoIBqzgrZutIOubIs6weVu8De1qGAEpALs1hSKLKJMSwbs0zaYbfc09593duWq7\nOMkO2tnecEBI7pd6gOSkJi3MvIkoG7KWeb/55pvYtm0bnnrqKSxfvhzLly93vtfa2opHHnkEK1eu\nxBNPPIGtW7fivffey9ahZEVyG8u4qnn7lVvBu9Rqv6m72qMaqnnNJPniePHzv6MdjQDMOe9ia39i\nSfdBgIDWeBva42FnO0/ALFgr8yeCd1mgf3ZOch9z8qYkXWVn3nZGm+3t9xLD5t2M3slLBhm7iSgL\nsvZJuG7dOsyda+5WNWbMGDQ1NaG11exzrSgKFEVBe3s7VFVFOBxGWVn6/Vb7Ql1jGI+t/djZDjJZ\nqszbvVOY0/LUCgLujUnsrFoYsAN/+XwtXtn5uvVY0Wk6IUkiipUgGqNNiGgRTxaryKI3ePeDOe9U\nZDExsNPtzFtK7K8N5K5Pc7eLxa0n9tU2gURUGLI2bF5fX4/JkxPLlyorK1FXV4eSkhL4/X5ce+21\nmDt3Lvx+P04//XSMGtV5v+aKiiBkuXeXCVVVpZ8rXrHyHWzZ3oiyUABXnNVxO8rikOJ5viCJiXXb\ngJN5y4qEqqoQJFkCYEAQAD3uAwLtHY+nrBhl1hy5IokYXl6DD+o+BQBUliSC9eDqEAJFiYA9fNAg\nVA3su3nvdOfRCNYC7wB6OIjSEn+n5zudygpzSkCWRef5AZ+E8cMruvV6mVx46kT86JE3cN68Q7r1\n+iWhAKqqQrj6nKn45aoNOHXO6C69TjbeS6HhOewdPI89l4tzmLNqc/cwZGtrKx5++GG89NJLKCkp\nwSWXXIKPPvoIEyZMSPv8hoaOwa4nqqpCqKtrSfv9fY1h69/2lI/b19CCOiVxf2tbzDNsPn54CB/s\nBMLhOOrqWtAeiSWK1TQZhi4msnPLVacfin+tM3+uKAoY5B+ED2AGb9lINKNoaQ5DiyZ+dVq72Ol7\nyabOzqMAH04oPh8vvl0HjDO6dYzhtqj1WnCe/7MbjoMgICvveVRVMX57y4kQRaFbr9/cHEZdXQtm\njhuIw7v4Opn+FikznsPewfPYc719DtNdCGRt2Ly6uhr19fXO7b1796KqytzcYuvWrRg2bBgqKyvh\n8/lwxBFHYNOmTdk6lG6xLzbSjdJ2GDbX9KRtPs3MW7NeJ2q0QvBHrBcXALXjdZMsys7wuiyJqA3V\nON9zL7tSJNFTCFWi9M9hcwCo8g0GNB/8Svf+1Ox13u4qc9GqPM+W3hqaL8StGIkoN7IWvOfMmYO1\na9cCADZv3ozq6mqUlJhBpra2Flu3bkUkYgazTZs2YeTIkdk6lG4xUqzTdY8edChYi2uA7B42N7Nq\nOxjvqFqDwNRXrRcSYcQ7NpdQRBmqtaRMEgUMK0kEb/dabrt/+IiQuctSd/t254IdfANK9wZ5ZDm3\nc91ERPkga8PmM2bMwOTJk7FgwQIIgoBly5Zh9erVCIVCmDdvHr7xjW9g0aJFkCQJ06dPxxFHHJH5\nRXPIvdRH1VX88aNVmO3aBlQ1OmbeYlFiqMQQzO87Veae1xZgtJVBLDYff+GEr+HThs9RVTQQmlYH\nwCxYqykZjONqZ0MWZcypmYUnsN45JgD47uGLu70eOVfsgjNfN1qjAole6WKWq8x7C+vUiCgXsjrn\nfdNNN3luu+e0FyxYgAULFmTzx/eIu8nGxvoP8cbut/HG7red76tJvc1jqg6xsinxfGgQBQGaYeCL\nvc3eFzdE6K3lQPUOAImtO4FEm1VZFCAKIs7vZH9nScxun+/eYGfe3a02l6zny3mSeff3iykiOjjk\nRzrTBxKZd+pGG8lz3jEtBiHYAr3NrArXoEIUBei6gR/8YZ33yboIvS310rhZE83+2ccfVpPy+/mm\nImQO6Q8o7V7v9UTm3b+D9xGHmPUcIwf3zzX3RHRwYW/zDARBgF/s2Jc7ntzbXG6EIBjQWiogBJuh\nG6qzx7Wn8xoAASLu/8ZXsOaLCMaVj/Z878hJgzC2tgyVKTZc+NkNxyY6teWJMbVluPvq2RhY1r3g\nbQ+79/fg/c2zJuO8ligGlhVlfjARUQ8xeKfhDJsLqbt6JQ+bq4K5lE2PBiHpkpN5R2Kad/03zD2m\ny0sCWDTp/JQ/e0CaQJevexdXl3c/oLl7m/dnkigycBNRznDYPI3EUjEBmqF3+H6HLUFFa9vOmB/Q\nRWiGBkkUsLehvUPmDZ2nvavsXuH9PfMmIsolRpE03FXDWlKWDXirzQ3DgC5Za7jjPhi6BNWIQxIF\nGAY6ZN727mCUmZ1550vBGhFRLjCKpJEp845riYC8e387oJidwIy4HzBEqIaayBalpODP4N1lSp7M\neRMR5RKjSBruOW/N6Dzz/vmfNkFQYgCAUn8I0BKZNwAIHQrWGIi6yqdIkCUBRT6WZxAR2Ri803A3\nadFTDZu75rwjMRWCEoUiKvjhJbNRO6AUcT0OwT67ycPmev9fn91fyJKI755/GM4/aWxfHwoRUb/B\ndCYDM/NOVbCWCOiabkDyx1DmC6G02I/yYDF2RXRIkpW+JxesaflZNd5XDhle0deHQETUrzDzTkN3\nNWlJOeftWuet6ToMKWoOmQPwS+YabdGa6xaS57w1XjMREVH3MXin47RHFVLPebuGzXUhBggGQtbu\nXgEreAv2RiVi0rC51rHpCxERUVcxeKdhrxRLV7AW01QnOzdEs9K8WDG37fRbu3wJaTNvDpsTEVH3\nMXhnIKYpWPt8dyN+9Zy5B7kmmpXmQTt4S1ZmbQftDpk3h82JiKj7GLy7INWcN0QNb31sbt9pWMG7\nWDaDtzNszsybiIiygME7A90wUg6bu7umGZJZvNZh2FxUARgQ/OGkF2XmTURE3cfgnYFupMm8reBt\nGAYMKXnY3NoRTFQhDdoGsbjZ05iFTVqIiKgnGLwzMAwj5Zy3ORRuQDcMCLKdeZu7StnD5pBUSKX7\nAQCXTlqQk+MlIqKDH4N3BuaweYrMGwAkFf/e+SaU2q0AgGDSnLchqBCKWmDEFVQFB+bkeImI6ODH\n4J2BYaRYKmavAZdUPPnpaufu5DlvTYpADIRhREKQRc5zExFR72DwzkDXOxasybDntL33FyctFYvI\n9eY3wqEO+38TERF1F4N3BobRcT9vUU/MaeuRIud+RTSXgNnD5mFxHwBAiIYwpHgQJMOH+M4xEFiv\nRkREPcDgnYFhGNCT5rwF3cysBUmFEU0Eb8GKyvawuV1ULmh++CQfZukLoe4cl/2DJiKigxqDdwYp\nC9bsJiuSCgjmBPjo8Hzn285SMYtgPV4wmHITEVHPMXin8NH+TyH4zMYqqQrWjLgVjK3gbRgCSvUa\n5/uKKENxFagJujeYExER9QSDd5KWWCt++t5v4J/2CgAz8/7vnibPY1S79kxSIQgGYAiQRG9WHfKF\nnK9FnbuIERFR72HwTtIWbwcAp6hM1XTsaWjzPMauX7MzbxgCxKQzGfKVOF+LOnuZExFR72HwThLT\nY57bcVV35rVtumadNsnsXW4Gb++pLHUFbwHmELoB7+sQERF1B4N3koga8dyOxXVAMAvWoh8dgaJw\nLbR6c37bnXlLSeu/Qkpi2Dz5e0RERD3B4J2kPSl4x1XNybz15gEI7T0aajQAABCUqBO8haQzWepP\nBG8hKXgn3yYiIjoQ7NmZJBz3bt8Zs4bNDQMABLRH44Dqg6EqEAJt1lruFAVrimvOW2SwJiKi3sPM\nO0lY6zhsLgg6YJinqj1ilprr4WIIgTAEUYNhCB0CtGepGDNtIiLqRQzeSbyZt4G4pjtD4wAQjpql\n5kakGIJgQPBFAUPskHlLouR8bX+L5WpERNQbGLyTeDJvwUAsrgGCDlmUMLqmFLo5fg4jXJx4nCFA\nTMquJw+YABgCYtsmdPgeERFRTzB4JwnHXcFbVJ05b1EQ4VcS2bQeDSYel2LYPOQrwYTGi6DtGclh\ncyIi6lUM3knCqmvYXNSdanMR3uAN3fV1ig5rAKwit8SwORERUW9g8E4Sdi0VEyTVWectQoLf5w7Y\n7lPXMfMG4AyxC4zeRETUixi8k3gzbw2abkBwhs1dp8u9Q1iKOW8gEbyd2M2KNSIi6gUM3kncTVoE\n0W5ibgZvn2vY3NATpy7VUjEAmDKyEgBw2NiBnvs5BU5ERD3BJi1JYpqrt7lkB28doiBBkdJn3qnm\nvOfOHIZDhldgWHVJh+8RERF1F4N3kqh7Y5KkzFv2BG9vIE+VeYuCgBGDQx3uJyIi6gkGbxfDMBDX\n4s7txLC5DkmQkoK3O1innvMmIiLKBs55u8R11bttp5TIvCVBhCy5h8q9WXiqYfNkrFcjIqLewODt\nYs93S4JZmCaIGiCqEATAJ/o9mbe7YC3dsHk6zNGJiKgnGLxdolbwDohW9zRRg+Azq89LlNABF6wl\nG1xpvu7omrLeOWAiIipInPN2iVvFakVSEG1aCyCp5sYjAEqVEGQxdcGakWadd7KTZtSiOCBj+riB\nGR9LRESUDoO3i5N5C2aGLEgaBMXMvEt9pZCTsm33110ZNpclEXMOHdJ7B0xERAWJw+YuMavS3O8M\nm6vOsHmZrzT9UrE07VGJiIiyIWPw3rp1ay6Oo1+IWcPmPqMIgJV5W8Pm5f4yyHLP5ryJiIh6Q8bg\n/e1vfxsXXHABVq1ahXA4nOnhec0eNldgBm9zztvMvCuKyrwFa8jc25yIiCgbMs55P//88/jkk0/w\n4osvYuHChZg4cSLOPfdcTJ06NRfHl1N2gxZBV2BoIgRJBXwRGLqIkFKMqBRJ/URD5LA5ERHlTJfm\nvMePH4/rr78eS5YswdatW7F48WJcdNFF+O9//5vlw8stO/OGIQG6DLG4GWKgHdq+wVCUpA5rbhw2\nJyKiHMqYee/cuRN/+tOf8Je//AVjx47F1VdfjWOPPRYbN27EzTffjGeeeSYXx5kT9py3oUkwNAmC\nYt6v7hwHRfL2Nvd0WwOYeRMRUc5kDN4LFy7E17/+dfzhD3/AoEGDnPunTp2aceh8xYoVeP/99yEI\nApYuXep5/K5du/Cd73wH8XgckyZNwp133tmDt9E77A5rhi4CmnlqDEOAEQtAlrztURXZvc5b5Jw3\nERHlTMZh8zVr1mDkyJFO4H7iiSfQ1tYGALj99tvTPu/NN9/Etm3b8NRTT2H58uVYvny55/t33303\nLr/8cjz77LOQJAlffvllT95Hr7CXihmqBEO39u6OKwAESJLgqTZP7rbGYXMiIsqVjMH7e9/7Hurr\n653bkUgEt9xyS8YXXrduHebOnQsAGDNmDJqamtDa2goA0HUdb7/9Nk466SQAwLJly1BTU9OtN9Cb\n7DlvXXNl3roMSTSryd0BO3nZGIfNiYgoVzIG78bGRixatMi5fdlll6G5uTnjC9fX16OiosK5XVlZ\nibq6OgDA/v37UVxcjLvuugsXXHAB7r///u4ce6+z57x1VYSzFEyTnEDtnvNOzrwZvImIKFcyznnH\n43Fs3boVY8aMAQBs2rQJ8Xg8w7M6MgzD8/WePXuwaNEi1NbW4qqrrsLLL7+ME044Ie3zKyqCkGXp\ngH9uZ6qqQp7bwqfmMUqiD7D28jZ0CQFFQlVVCEUlifddFFDgXMIYAgYOKO7weoWiUN93b+I57Dme\nw97B89hzuTiHGYP39773PSxevBgtLS3QNA2VlZW49957M75wdXW1Z7h97969qKqqAgBUVFSgpqYG\nw4cPBwDMnj0bn376aafBu6GhPePPPBBVVSHU1bV47mtpN39GuN2A4Lf28tYlSKKAuroWxOJa4sGu\nixEYApoa2+EvwOQ71XmkA8Nz2HM8h72D57HnevscprsQyDhsPm3aNKxduxbPP/881q5dixdffLFL\nmfecOXOwdu1aAMDmzZtRXV2NkpISAIAsyxg2bJizTnzz5s0YNWpUV99L1tjV5mocTuYNXXIqy93z\n3JJnqRg7rBERUe5kzLxbW1vx5z//GQ0NDQDMYfRVq1bhtdde6/R5M2bMwOTJk7FgwQIIgoBly5Zh\n9erVCIVCmDdvHpYuXYolS5bAMAyMHz/eKV7rS1E9BlmUoaqAPedtqDJ8VtB2B2jJ9bVhCDBARESU\nGxmD9w033ICamhq89tpr+MpXvoLXX38dP/jBD7r04jfddJPn9oQJE5yvR4wYgSeeeOLAjjbL4loc\nPlFBXNUhfjEdyvCPEd5+CJSqjgMU7gI1QTAYvImIKGcyDptHo1HceeedqK2txa233orHHnsML774\nYi6OLeeiWgw+yYeYqkNRy1C291hA9UNJUSgnJbdKNRi+iYgoNzIG73g8jvb2dui6joaGBpSXl2P7\n9u25OLaci2kx+CQz81ZkCbpuBmR3NzWbuymLJAuoCAVydpxERFTYMg6bn3XWWXj66adx7rnn4rTT\nTkNlZSVGjBiRi2PLuZgeQ7lYigZVQzCgQNV0AHDmvN3cwfvsY0alDPBERETZkDF42wVngLmka9++\nfZg4cWLWDyzXDMNATIvDJ/kQ13Qosoj2iAogc+bNGW8iIsqljOmiu7vaoEGDMGnSJCeYH0xUXYUB\nwwzeqg6fLELVzcxbSbEVqOgJ3kRERLmTMfOeOHEifvKTn2D69OlQFMW5f/bs2Vk9sFyLWq1RFVGB\nqhlQZBGaZs15KykK1kT3rmIM30RElDsZg/eHH34IAHjrrbec+wRBOOiCt92gRbY28VZkyZnzTpV5\ne4fN9RwcIRERkSlj8H788cdzcRx9zt4ONBG8RSd4y3IiUFeVB1DXGPEOmzPzJiKiHMoYvC+88MKU\nc9wrV67MygH1leTM2yeLUK1hc9k1RL78yqMQi2t4+p9bnftYsEZERLnUpQ5rtng8jvXr1yMYDGb1\noPqCvZe3CHN+293HXHb1MZcl0dkaVI8UQQyEUSQX5fBIiYio0GUM3rNmzfLcnjNnDq688sqsHVBf\nienmsLmExLC5rUM3Nfs5Hx+B4NCdOPb4g2v+n4iI+reMwTu5m9quXbvw+eefZ+2A+krMybzNU+Ju\nzCKLqZbGGTCixZD3TIFPUlJ8n4iIKDsyBu9LLrnE+VoQBJSUlOC6667L6kH1BSd4GzIA3ZN5y510\nTzv4VrwTEVF/lzF4/+Mf/4Cu6xCtoq14PO5Z732wiOnu4B3zbEYipxk2JyIi6gsZo9LatWuxePFi\n5/ZFF12El156KasH1RfsgjUY5ilxr+2WUgybc3UYERH1lYzB+9FHH8WPf/xj5/bvfvc7PProo1k9\nqL4Qt9Z5Q7fmvBV3wVr6wfGDsVUsERH1bxmDt2EYCIVCzu2SkpKDMmBFtCgAQLCCtzvzdq/ztjHx\nJiKivpJxznvKlCm44YYbMGvWLBiGgVdffRVTpkzJxbHllB287cxbUdzrvDnnTURE/UfG4H3bbbdh\nzZo12LBhAwRBwJlnnolTTjklF8eWU1HVCt6anXm7C9YOvpEGIiLKXxmDdzgchqIouP322wEATzzx\nBMLhMIqLi7N+cLlkZ96GZgZt91KxUNDX4fFVZQEAQG3VwXUeiIio/8s4Hnzrrbeivr7euR2JRHDL\nLbdk9aD6gp1566oZvH2yiOVXHolLTjkEIwaHOjz+lCOH44KTx+HKMybl9DiJiIgyBu/GxkYsWrTI\nuX3ZZZehubk5qwfVFyJaBIqoQNPM24osYsiAYhx/WG3KxyuyhHkzh6XMyomIiLIpY/COx+PYujWx\ng9bGjRsRj8ezelB9IaJFEZD8iKvWHt6ddFUjIiLqSxnnvL/3ve9h8eLFaGlpga7rqKiowL333puL\nY8upqBpFQPYjxuBNRET9XMYINW3aNKxduxarVq3CkiVLUF1djWuuuSYXx5ZTyZm3z9UelYiIqD/J\nmHm/9957WL16NV544QXouo4f/ehHmD9/fi6OLWd0Q0dUi8Ev+xHXmHkTEVH/ljZC/eY3v8Fpp52G\nG2+8EZWVlVi1ahWGDx+O008//aDbmMTuax6Q/IjHzYo1Bm8iIuqv0mbeDz74IMaOHYs77rgDRx11\nFICDt4931FrjHZADaGPmTURE/Vza4P3yyy/jT3/6E5YtWwZd13H22WcflFXmABCx1nj7JbNgTRBS\n7yRGRETUH6RNL6uqqnDVVVdh7dq1WLFiBb744gvs3LkTV199NV555ZVcHmPWOZm3VbDmk6WDdpSB\niIjyX5fGhmfOnIm7774br776Kk444QT8/Oc/z/Zx5VRYjQCAWbCm6hwyJyKifu2AolRJSQkWLFiA\np59+OlvH0ye8mbfG4E1ERP0aoxSA9ngYAFAkB9DcHkdx4OCqpiciooMLgzeAdtUM3qLuRzSmoao8\n0MdHRERElB6DN4D2eDsAIBoxT0dVeVFfHg4REVGnGLwBtFmZd6SNwZuIiPo/Bm8kMu+WVvM2gzcR\nEfVnDN5IzHk3NZnd1TjnTURE/RmDN4C2eDsUUUFLmxm8K0L+Pj4iIiKi9Bi8YQ6bFytBRGPmpiQ+\nhduBEhFR/8XgDXPYPCgXIRLX4FNEiGyNSkRE/VjBB2/d0BFWIwgqRYjFNfiZdRMRUT9X8ME7rEZg\nwECxHEQ6NoGoAAAYmElEQVSUwZuIiPIAg7ddad6so6E5Cr+PwZuIiPq3gg/eMc3co3zL9jYYADNv\nIiLq9wo+eMd1M3gbunkqGLyJiKi/Y/DWVfMLBm8iIsoTDN5W5g3dDNo+peBPCRER9XMFH6ni1pw3\nDPNUBFiwRkRE/RyDtzPnbWfeDN5ERNS/MXhzzpuIiPIMg7cz583gTURE+YHBW/MOmzN4ExFRf5fV\n4L1ixQqcf/75WLBgATZs2JDyMffffz8WLlyYzcPolDNsbhWsiSI3JSEiov4ta8H7zTffxLZt2/DU\nU09h+fLlWL58eYfHbNmyBf/5z3+ydQhdkrxUTNP0PjwaIiKizLIWvNetW4e5c+cCAMaMGYOmpia0\ntrZ6HnP33XfjxhtvzNYhdEksqcOapht9eThEREQZZS1419fXo6KiwrldWVmJuro65/bq1asxa9Ys\n1NbWZusQukR1qs3NzLu4SOnDoyEiIspMztUPMoxERtvY2IjVq1fj0UcfxZ49e7r0/IqKIGS5d4vJ\nqqpCkD63bugi5h85Al89aTwkznsfkKqqUF8fQt7jOew5nsPewfPYc7k4h1kL3tXV1aivr3du7927\nF1VVVQCA9evXY//+/bjooosQi8XwxRdfYMWKFVi6dGna12toaO/V46uqCqGurgXN7ebrGrqEuTNq\nsH9fa4Znkpt9Hqn7eA57juewd/A89lxvn8N0FwJZGzafM2cO1q5dCwDYvHkzqqurUVJSAgA45ZRT\n8MILL+Dpp5/Gz372M0yePLnTwJ1NqqvaXBILfuUcERHlgaxl3jNmzMDkyZOxYMECCIKAZcuWYfXq\n1QiFQpg3b162fuwBi7matMgSh8uJiKj/y+qc90033eS5PWHChA6PGTp0KB5//PFsHkannI1JdAmy\nxMybiIj6v4KPVqquWg1aBBaqERFRXij44B3T4xAMs4qdmTcREeWDgo9WcSt4CwJboxIRUX5g8NZU\nCKw0JyKiPFLwESuuxwFDYqU5ERHlDQZvPW4tEyv4U0FERHmioCOWYRiIaXFAl1lpTkREeaOgg7dq\naDBgwGCDFiIiyiMFHbxjWsz8QpcgcdiciIjyREFHLDt4G5rEYXMiIsobhR28rb7mhsaCNSIiyh8F\nHbHszFtn5k1ERHmkwIM3M28iIso/BR2xYnoi82a1ORER5YvCDt4sWCMiojxU4MHb3stb5FIxIiLK\nGwUdsRLrvGXOeRMRUd4o6IjlLBXTRQ6bExFR3ijs4O3qsMaCNSIiyhcM3gCgSdzPm4iI8kZBR6zE\nsLkERSnoU0FERHmkoCOWe9hcYcEaERHliYKOWFFnqZgEHzNvIiLKEwUdseJWhzWDmTcREeWRgo5Y\nMVfmrchS3x4MERFRFxV08I46c94iFLmgTwUREeWRgo5Yqq5CggRAgI/Bm4iI8kRBRyzVUCEKMgAw\n8yYiorxR0BErrschwpzrZvAmIqJ8UdARK66pruDNgjUiIsoPBR28VUOFYJingJk3ERHli4KOWKqu\nQrAybxasERFRvijoiBXXVQgG57yJiCi/FGzEMgzDzLw5bE5ERHmmYCOWqqvmF8y8iYgozxRsxIpr\nVvDW7cyb1eZERJQfCjd423t5W8PmLFgjIqJ8UbARy868DZ1z3kRElF8KNmLFrMwbmghBACRR6NsD\nIiIi6qKCDd6qlXnrugBFFiEIDN5ERJQfCjZ423t5G5oIRSrY00BERHmoYKOWXbCmaQJ8CivNiYgo\nfxRu8LaHzTWBmTcREeWVgo1acd0O3iIUpWBPAxER5aGCjVpxa85bUwXIYsGeBiIiykMFG7Xcw+ay\nzEpzIiLKH4UbvK2CNV0TITHzJiKiPFKwUcvpbW6IkCVm3kRElD8KN3jbvc11ETKrzYmIKI8UbNSy\nm7TAENkalYiI8krBBm9nP29dhMTMm4iI8kjBRq2Ys6uYBJmZNxER5ZGCDd5x97A5C9aIiCiPyNl8\n8RUrVuD999+HIAhYunQppk6d6nxv/fr1eOCBByCKIkaNGoXly5dDzOGSrYgaNb/QJBasERFRXsla\n1HrzzTexbds2PPXUU1i+fDmWL1/u+f4dd9yBhx56CE8++STa2trw6quvZutQUgqrEQCAocssWCMi\norySteC9bt06zJ07FwAwZswYNDU1obW11fn+6tWrMXjwYABAZWUlGhoasnUoKUXiZvCGJjPzJiKi\nvJK1qFVfX4+KigrndmVlJerq6pzbJSUlAIC9e/fi9ddfx/HHH5+tQ0kprEYhQLCqzZl5ExFR/sjq\nnLebYRgd7tu3bx+uvvpqLFu2zBPoU6moCEKWe2/f7XA8Ap/kRzsElJYEUFUV6rXXLjQ8dz3Hc9hz\nPIe9g+ex53JxDrMWvKurq1FfX+/c3rt3L6qqqpzbra2tuPLKK3HDDTfgmGOOyfh6DQ3tvXp8YTUC\nBQoAIBqNo66upVdfv1BUVYV47nqI57DneA57B89jz/X2OUx3IZC1YfM5c+Zg7dq1AIDNmzejurra\nGSoHgLvvvhuXXHIJjjvuuGwdQqci8QgU0QcALFgjIqK8krXMe8aMGZg8eTIWLFgAQRCwbNkyrF69\nGqFQCMcccwyee+45bNu2Dc8++ywA4IwzzsD555+frcPpIKxGUSGXAgAL1oiIKK9kdc77pptu8tye\nMGGC8/WmTZuy+aM7FddVqLoKQTffPoM3EVHfevnlv+OEE07u0mN/8pP7ce65C1BTU5vlo+q/CjJq\nRa0GLbv2xgBw2JyIqC/t2vUl/va3tV1+/PXXf7egAzeQw2rz/iSimcHb0MzqdS4VIyLqOw88cA8+\n/HAzHn30N9B1HV9+uRO7dn2JBx/8Be66607U1e1FOBzG5ZdfhTlzjsV1112F73znFvzzn39HW1sr\nvvhiG3bu3IFvf/u7mD17jvO6qqpi+fIfdHj+J598hPvvvweiKGDKlGm49trrU95n/5zRo8di1aqn\n0NjYiOnTD8eTT/4v2tvbcd11N+Ldd9/Gyy//HbquY/bsObj11u+ipaUFd955G9ra2lBSUoI77vgf\nXH75Rfj9759AMBjEhg3v4cknV2LFih93+5wVZPCOWsEb9rB5DtuyEhH1Z0//Ywv+89HeXn3NmROq\ncd5JY9N+/4ILFmL16qdx2WVX4pFHHoaqxvGLX/wWDQ37MWvWUTj11DOwc+cO3H77EsyZc6znuXv3\n7sF99z2E9ev/jT//eZUneLe0NKd8/oMP3oebb16KsWPH4Uc/ugO7d+9KeV86W7duwRNPrIbP58O7\n776NX/zitxBFEeeddxauvfabeOKJxzFr1myce+4CPPXUSrzzzls47rgT8dpr/8L8+afgtddewbx5\nX+nROS3I4G33NTc08+0z8yYi6j8mTpwMAAiFSvHhh5uxZs1qCIKI5uamDo+dOvUwAObyZHcXz86e\n/8UX2zB27DgAwO2335n2vnTGjh0Hn89crRQIBHDddVdBkiQ0NjaisbERn3zyEa644hoAwPnnXwQA\nqKmpxW9/+0vMn38K3n33bXzjG1cf+IlxKczgrSU2JQFYsEZEZDvvpLGdZsm5oChmD46//vUlNDc3\n4+c//y2am5txxRULOzxWkhLNu5KbgaV7fqpNsFLdJwiJxE5V1Q7Ht3v3Ljz11Er87ncrEQwGsXDh\nedZrSTAM3fNaY8eOw759+/Dhh5sxatQY+P3+zk9CBgUZtSL2piR25s2CNSKiPiOKIjRN63B/Y2Mj\nhgypgSiKeOWVfyAejx/Q66Z7/siRo7B5s7ni6a677sR///t5yvuKi4uxb5/ZbGzjxvdTvn5FRQWC\nwSA+/vgj7N69G/F4HBMnTsLbb/8HAPDcc6vw4ot/AQCcdNI8PPDAPZg375QDeh+pFGTwthlx88qH\nmTcRUd8ZMWIUPv74Izz00P2e+0844ST8+9+v4vrrr0FRURGqq6vx6KO/6fLrpnv+9dffhJ/97P/D\nNdd8A6FQKUaOHJXyvjPPPAf3338vbr75egwcWNXh9ceNG4+ioiCuueZy/P3v/4ezzjoHP/zhD3Hu\nuRdg06YNuO66q/Dvf7+G448/EQBw8snzsHfvXhx++MyenTAAgpGq6Xg/1Jvt5uJaHNf89hnojdWA\nIeKWC6ZjwojOe6tTamyn2HM8hz3Hc9g7eB57rrNz+Pzza7B79y584xvfPKDXS6Ug57wVSYHeMNi5\nzcybiIiy6Z57/gdffrkTd911X6+8XkEG72SsNiciomy69dbbevX1CjLl1HXvTAEL1oiIKJ8UZPCO\nxr1VjRw2JyKifFKQUSvWIXgz8yYiovxRkME7OfOW2B6ViIjySEFGrWjc2/mGmTcRUd96+eW/H/Bz\n3nvvHTQ07M/C0fR/hRm8Y0mZN+e8iYj6zIFuCWp7/vk1BRu8C3KpWMdhc2beRER9xb0l6PnnX4gV\nK36IlpYWaJqGG264GWPHjsP//u/v8cor/4Qoipgz51hMnDgJr776Mj7//DP8z//ci8GDzd4dfbEN\n6OWXX+VsAxqLReD3F2VlG1A3Bm+w2pyIyLZ6y1/w7t6Nvfqa06sPxTljz0j7ffeWoL///W9x5JFH\n4//9v6/i888/w09+ch8efPAXePLJ/8Vzz70ESZLw3HOrMHPmURg7djy+851bnMAN9M02oOeff6Gz\nDejixVfiZz/7VVa2AXVj8AabtBAR9RcbN25AY2MD1q59AQAQjZobSZ1wwsm44YbFmDfvFMyfn35j\nj77YBrS5uTkn24C6FWTwrgz54ZNF6IYBVTMgCgzeREQAcM7YMzrNkrNNUWTceOPNmDJlquf+m276\nHrZt+y/+8Y+/4lvf+iZ+/es/pHz+wbwNqOfYe+2V8sghwyvw1IrT8fBNJ+DXN5/Q14dDRFTQ3FuC\nTpo0Bf/618sAgM8//wxPPvm/aG1txaOP/gYjRozEZZddiVCoDO3tbSm3Ej2YtwH1nLNefbU8Iksi\nBEHgfDcRUR9zbwn69a+fj507t2Px4itwzz3/g8MOm4GSkhI0NjbgyisX4dvfvhqTJ09BaWkZDjts\nBm677VZ89tlW57X6YhvQ+++/x9kGdOHChVnbBtStILcEBbj1XW/heew5nsOe4znsHTyPPZd8Druz\nDWjy66VSkHPeRERE2dbb24C6MXgTERFlQW9vA+rGCV8iIqI8w+BNRESUZxi8iYiI8gyDNxERUZ5h\n8CYiIsozDN5ERER5hsGbiIgozzB4ExER5Zm8aY9KREREJmbeREREeYbBm4iIKM8weBMREeUZBm8i\nIqI8w+BNRESUZxi8iYiI8kxB7ue9YsUKvP/++xAEAUuXLsXUqVP7+pD6tU8++QSLFy/GpZdeiosv\nvhi7du3CLbfcAk3TUFVVhR//+Mfw+XxYs2YN/vCHP0AURZx33nk499xz+/rQ+417770Xb7/9NlRV\nxTe/+U0ceuihPIcHIBwOY8mSJdi3bx+i0SgWL16MCRMm8Bx2UyQSwRlnnIHFixdj9uzZPI8H4I03\n3sD111+PcePGAQDGjx+PK664Ivfn0Cgwb7zxhnHVVVcZhmEYW7ZsMc4777w+PqL+ra2tzbj44ouN\n2267zXj88ccNwzCMJUuWGC+88IJhGIZx//33GytXrjTa2tqM+fPnG83NzUY4HDZOP/10o6GhoS8P\nvd9Yt26dccUVVxiGYRj79+83jj/+eJ7DA/T8888bv/71rw3DMIwdO3YY8+fP5znsgQceeMA455xz\njFWrVvE8HqD169cb3/rWtzz39cU5LLhh83Xr1mHu3LkAgDFjxqCpqQmtra19fFT9l8/nw29+8xtU\nV1c7973xxhs4+eSTAQAnnngi1q1bh/fffx+HHnooQqEQAoEAZsyYgXfeeaevDrtfmTlzJn7yk58A\nAEpLSxEOh3kOD9Bpp52GK6+8EgCwa9cuDBo0iOewm7Zu3YotW7bghBNOAMD/z72hL85hwQXv+vp6\nVFRUOLcrKytRV1fXh0fUv8myjEAg4LkvHA7D5/MBAAYMGIC6ujrU19ejsrLSeQzPa4IkSQgGgwCA\nZ599FscddxzPYTctWLAAN910E5YuXcpz2E333HMPlixZ4tzmeTxwW7ZswdVXX40LLrgAr7/+ep+c\nw4Kc83Yz2B22R9KdP57Xjv72t7/h2Wefxe9+9zvMnz/fuZ/nsOuefPJJfPjhh7j55ps954fnsGue\ne+45HHbYYRg2bFjK7/M8ZjZy5Ehcd911OPXUU7F9+3YsWrQImqY538/VOSy44F1dXY36+nrn9t69\ne1FVVdWHR5R/gsEgIpEIAoEA9uzZg+rq6pTn9bDDDuvDo+xfXn31VfzqV7/Cb3/7W4RCIZ7DA7Rp\n0yYMGDAAQ4YMwcSJE6FpGoqLi3kOD9DLL7+M7du34+WXX8bu3bvh8/n4t3iABg0ahNNOOw0AMHz4\ncAwcOBAbN27M+TksuGHzOXPmYO3atQCAzZs3o7q6GiUlJX18VPnl6KOPds7h//3f/+HYY4/FtGnT\nsHHjRjQ3N6OtrQ3vvPMOjjjiiD4+0v6hpaUF9957Lx5++GGUl5cD4Dk8UG+99RZ+97vfATCnvtrb\n23kOu+HBBx/EqlWr8PTTT+Pcc8/F4sWLeR4P0Jo1a/DII48AAOrq6rBv3z6cc845OT+HBbmr2H33\n3Ye33noLgiBg2bJlmDBhQl8fUr+1adMm3HPPPdi5cydkWcagQYNw3333YcmSJYhGo6ipqcFdd90F\nRVHw0ksv4ZFHHoEgCLj44otx5pln9vXh9wtPPfUUfvrTn2LUqFHOfXfffTduu+02nsMuikQi+P73\nv49du3YhEonguuuuw5QpU3DrrbfyHHbTT3/6U9TW1uKYY47heTwAra2tuOmmm9Dc3Ix4PI7rrrsO\nEydOzPk5LMjgTURElM8KbticiIgo3zF4ExER5RkGbyIiojzD4E1ERJRnGLyJiIjyTME1aSHKN/fe\ney82btyIaDSKDz74ANOnTwcAfO1rX8NXv/rVLr3Gr3/9a4wfP97pZ53KwoUL8fvf/x6SJPXGYXvs\n2bMHn332GWbPnt3rr01UiLhUjChP7NixAxdeeCH+9a9/9fWhHLA1a9Zg69atuPHGG/v6UIgOCsy8\nifLYT3/6U+zYsQNffvklbr31VkQiEdx3333w+XyIRCJYtmwZJk+ejCVLluDwww/H7Nmzcc011+CY\nY47Bhg0b0NbWhocffhiDBg3CIYccgs2bN+OXv/wlGhsbsXv3bmzbtg1HHnkkbr/9dkSjUdx6663Y\nuXMnBg8eDEmSMGfOHM8exW1tbfjud7+L5uZmqKqKE088EWeccQYefPBBGIaB8vJyXHTRRbjzzjux\nbds2tLW14YwzzsDll1+O1atX469//SsEQcCePXswevRorFixAoqi9OEZJuqfOOdNlOd27NiBxx57\nDFOmTEFjYyN+8IMf4LHHHsOiRYvw8MMPd3j81q1bcc4552DlypWYOHEiXnzxxQ6P+eCDD/DQQw/h\n2WefxerVq9HU1IQ1a9ZAVVU888wzuOOOO/D66693eN6///1vqKqKP/7xj3jyyScRDAZRW1uLs88+\nG2eeeSYuu+wyPPbYY6iursbjjz+OZ555Bs8//zw++ugjAMDGjRv///bu2CW1MIzj+NcONQQRQi3W\nYnBsjDoSBFKNOVaEo0M4REO4HGyrKQin5ob+gDBaoiVyECEipakhWkKkQKFoiERPd5DOzYxLlysX\njvw+4+F5X97tx/PyHh7S6TSHh4eUy2VP3jKI/A/qvEU8bmJiAp/PB8DQ0BC7u7u8vb3x8vLC4OBg\nW73f78c0TQACgQBPT09tNZZlYRgGhmHg9/t5fn7m5uaG6elpAIaHh7Esq23d1NQUe3t7bGxsMDc3\nx8rKCj09rT3CxcUFDw8PXF5eAlCr1bi/v3fXf4xPnZyc5O7uzp2TLCK/KbxFPO7ztbJt22xvbzMz\nM8P5+bk7zOOzrw/Svnv28l2N4zgtQfw1lKE5y/j4+JhiscjZ2RnLy8scHR211PT19bG+vs7CwkLL\n90wmg+M4fzyXiDTp2lyki1QqFUzTpNFocHp6Sq1W69jeY2NjFItFAKrVKldXV201uVyObDaLZVnY\ntk1/fz/VahWfz0e9XgeaXf3HVb3jOOzs7Ljd//X1Na+vr7y/v1MoFBgfH+/Y+UW6iTpvkS6SSCSI\nx+MEAgFWV1exbZuDg4OO7L20tEQ2myUWizE6Oko4HG7r0IPBIKlUiv39fQzDIBKJMDIyQjgcJplM\n0tvby9raGre3t8RiMRqNBvPz8+6o1FAoxObmJqVSCdM0iUQiHTm7SLfRr2Ii8iOPj48UCgWi0SiO\n47C4uMjW1pb73/m/ymQy5PN50ul0R/YT6WbqvEXkRwYGBjg5OXHnE8/OznYsuEXk76jzFhER8Rg9\nWBMREfEYhbeIiIjHKLxFREQ8RuEtIiLiMQpvERERj1F4i4iIeMwvRph4T/csGFUAAAAASUVORK5C\nYII=\n",
+            "text/plain": [
+              "<matplotlib.figure.Figure at 0x7f72f867ef90>"
+            ]
+          },
+          "metadata": {
+            "tags": []
+          }
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "HNqUFL4deCsL",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "# 4. Case study: building an RNN\n"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "YkC1k4HEQ7rw",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "In this exercise we build and train a model similar to the RNNColorbot model that was used in the main Eager notebook. The model is adapted for converting and training in graph mode."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "7nkPDl5CTCNb",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "To get started, we load the colorbot dataset. The code is identical to that used in the other exercise and its details are unimportant."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "A0uREmVXCQEw",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def parse(line):\n",
+        "  \"\"\"Parses a line from the colors dataset.\n",
+        "  \n",
+        "  Args:\n",
+        "    line: A comma-separated string containing four items:\n",
+        "        color_name, red, green, and blue, representing the name and\n",
+        "        respectively the RGB value of the color, as an integer\n",
+        "        between 0 and 255.\n",
+        "\n",
+        "  Returns:\n",
+        "    A tuple of three tensors (rgb, chars, length), of shapes: (batch_size, 3),\n",
+        "    (batch_size, max_sequence_length, 256) and respectively (batch_size).\n",
+        "  \"\"\"\n",
+        "  items = tf.string_split([line], \",\").values\n",
+        "  rgb = tf.string_to_number(items[1:], out_type=tf.float32) / 255.0\n",
+        "  color_name = items[0]\n",
+        "  chars = tf.one_hot(tf.decode_raw(color_name, tf.uint8), depth=256)\n",
+        "  length = tf.cast(tf.shape(chars)[0], dtype=tf.int64)\n",
+        "  return rgb, chars, length\n",
+        "\n",
+        "\n",
+        "def maybe_download(filename, work_directory, source_url):\n",
+        "  \"\"\"Downloads the data from source url.\"\"\"\n",
+        "  if not tf.gfile.Exists(work_directory):\n",
+        "    tf.gfile.MakeDirs(work_directory)\n",
+        "  filepath = os.path.join(work_directory, filename)\n",
+        "  if not tf.gfile.Exists(filepath):\n",
+        "    temp_file_name, _ = six.moves.urllib.request.urlretrieve(source_url)\n",
+        "    tf.gfile.Copy(temp_file_name, filepath)\n",
+        "    with tf.gfile.GFile(filepath) as f:\n",
+        "      size = f.size()\n",
+        "    print('Successfully downloaded', filename, size, 'bytes.')\n",
+        "  return filepath\n",
+        "\n",
+        "\n",
+        "def load_dataset(data_dir, url, batch_size, training=True):\n",
+        "  \"\"\"Loads the colors data at path into a tf.PaddedDataset.\"\"\"\n",
+        "  path = maybe_download(os.path.basename(url), data_dir, url)\n",
+        "  dataset = tf.data.TextLineDataset(path)\n",
+        "  dataset = dataset.skip(1)\n",
+        "  dataset = dataset.map(parse)\n",
+        "  dataset = dataset.cache()\n",
+        "  dataset = dataset.repeat()\n",
+        "  if training:\n",
+        "    dataset = dataset.shuffle(buffer_size=3000)\n",
+        "  dataset = dataset.padded_batch(batch_size, padded_shapes=([None], [None, None], []))\n",
+        "  return dataset\n",
+        "\n",
+        "\n",
+        "train_url = \"https://raw.githubusercontent.com/random-forests/tensorflow-workshop/master/extras/colorbot/data/train.csv\"\n",
+        "test_url = \"https://raw.githubusercontent.com/random-forests/tensorflow-workshop/master/extras/colorbot/data/test.csv\"\n",
+        "data_dir = \"tmp/rnn/data\""
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "waZ89t3DTUla",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Next, we set up the RNNColobot model, which is very similar to the one we used in the main exercise.\n",
+        "\n",
+        "Autograph doesn't fully support classes yet (but it will soon!), so we'll write the model using simple functions."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "9v8AJouiC44V",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def model_components():\n",
+        "  lower_cell = tf.contrib.rnn.LSTMBlockCell(256)\n",
+        "  lower_cell.build(tf.TensorShape((None, 256)))\n",
+        "  upper_cell = tf.contrib.rnn.LSTMBlockCell(128)\n",
+        "  upper_cell.build(tf.TensorShape((None, 256)))\n",
+        "  relu_layer = tf.layers.Dense(3, activation=tf.nn.relu)\n",
+        "  relu_layer.build(tf.TensorShape((None, 128)))\n",
+        "  return lower_cell, upper_cell, relu_layer\n",
+        "\n",
+        "\n",
+        "def rnn_layer(chars, cell, batch_size, training):\n",
+        "  \"\"\"A simple RNN layer.\n",
+        "  \n",
+        "  Args:\n",
+        "    chars: A Tensor of shape (max_sequence_length, batch_size, input_size)\n",
+        "    cell: An object of type tf.contrib.rnn.LSTMBlockCell\n",
+        "    batch_size: Int, the batch size to use\n",
+        "    training: Boolean, whether the layer is used for training\n",
+        "\n",
+        "  Returns:\n",
+        "    A Tensor of shape (max_sequence_length, batch_size, output_size).\n",
+        "  \"\"\"\n",
+        "  hidden_outputs = []\n",
+        "  autograph.utils.set_element_type(hidden_outputs, tf.float32)\n",
+        "  state, output = cell.zero_state(batch_size, tf.float32)\n",
+        "  n = tf.shape(chars)[0]\n",
+        "  i = 0\n",
+        "  while i < n:\n",
+        "    ch = chars[i]\n",
+        "    cell_output, (state, output) = cell.call(ch, (state, output))\n",
+        "    hidden_outputs.append(cell_output)\n",
+        "    i += 1\n",
+        "  hidden_outputs = hidden_outputs.stack()\n",
+        "  if training:\n",
+        "    hidden_outputs = tf.nn.dropout(hidden_outputs, 0.5)\n",
+        "  return hidden_outputs\n",
+        "\n",
+        "\n",
+        "def model(inputs, lower_cell, upper_cell, relu_layer, batch_size, training):\n",
+        "  \"\"\"RNNColorbot model.\n",
+        "  \n",
+        "  The model consists of two RNN layers (made by lower_cell and upper_cell),\n",
+        "  followed by a fully connected layer with ReLU activation.\n",
+        "  \n",
+        "  Args:\n",
+        "    inputs: A tuple (chars, length)\n",
+        "    lower_cell: An object of type tf.contrib.rnn.LSTMBlockCell\n",
+        "    upper_cell: An object of type tf.contrib.rnn.LSTMBlockCell\n",
+        "    relu_layer: An object of type tf.layers.Dense\n",
+        "    batch_size: Int, the batch size to use\n",
+        "    training: Boolean, whether the layer is used for training\n",
+        "    \n",
+        "  Returns:\n",
+        "    A Tensor of shape (batch_size, 3) - the model predictions.\n",
+        "  \"\"\"\n",
+        "  (chars, length) = inputs\n",
+        "  chars_time_major = tf.transpose(chars, [1, 0, 2])\n",
+        "  chars_time_major.set_shape((None, batch_size, 256))\n",
+        "\n",
+        "  hidden_outputs = rnn_layer(chars_time_major, lower_cell, batch_size, training)\n",
+        "  final_outputs = rnn_layer(hidden_outputs, upper_cell, batch_size, training)\n",
+        "\n",
+        "  # Grab just the end-of-sequence from each output.\n",
+        "  indices = tf.stack([length - 1, range(batch_size)], axis=1)\n",
+        "  sequence_ends = tf.gather_nd(final_outputs, indices)\n",
+        "  return relu_layer(sequence_ends)\n",
+        "\n",
+        "def loss_fn(labels, predictions):\n",
+        "  return tf.reduce_mean((predictions - labels) ** 2)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "JjK4gXFvFsf4",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "The train and test functions are also similar to the ones used in the Eager notebook. Since the network requires a fixed batch size, we'll train in a single shot, rather than by epoch."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "ZWQMExk0S6X6",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def train(optimizer, train_data, lower_cell, upper_cell, relu_layer, batch_size, num_steps):\n",
+        "  iterator = train_data.make_one_shot_iterator()\n",
+        "  step = 0\n",
+        "  while step < num_steps:\n",
+        "    labels, chars, sequence_length = iterator.get_next()\n",
+        "    predictions = model((chars, sequence_length), lower_cell, upper_cell, relu_layer, batch_size, training=True)\n",
+        "    loss = loss_fn(labels, predictions)\n",
+        "    optimizer.minimize(loss)\n",
+        "    if step % (num_steps // 10) == 0:\n",
+        "      print('Step', step, 'train loss', loss)\n",
+        "    step += 1\n",
+        "  return step\n",
+        "\n",
+        "\n",
+        "def test(eval_data, lower_cell, upper_cell, relu_layer, batch_size, num_steps):\n",
+        "  total_loss = 0.0\n",
+        "  iterator = eval_data.make_one_shot_iterator()\n",
+        "  step = 0\n",
+        "  while step < num_steps:\n",
+        "    labels, chars, sequence_length = iterator.get_next()\n",
+        "    predictions = model((chars, sequence_length), lower_cell, upper_cell, relu_layer, batch_size, training=False)\n",
+        "    total_loss += loss_fn(labels, predictions)\n",
+        "    step += 1\n",
+        "  print('Test loss', total_loss)\n",
+        "  return total_loss\n",
+        "\n",
+        "\n",
+        "def train_model(train_data, eval_data, batch_size, lower_cell, upper_cell, relu_layer, train_steps):\n",
+        "  optimizer = tf.train.AdamOptimizer(learning_rate=0.01)\n",
+        "\n",
+        "  train(optimizer, train_data, lower_cell, upper_cell, relu_layer, batch_size, num_steps=tf.constant(train_steps))\n",
+        "  test(eval_data, lower_cell, upper_cell, relu_layer, 50, num_steps=tf.constant(2))\n",
+        "\n",
+        "  print('Colorbot is ready to generate colors!\\n\\n')\n",
+        "  \n",
+        "  # In graph mode, every op needs to be a dependent of another op.\n",
+        "  # Here, we create a no_op that will drive the execution of all other code in\n",
+        "  # this function. Autograph will add the necessary control dependencies.\n",
+        "  return tf.no_op()"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "iopcs5hXG2od",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Finally, we add code to run inference on a single input, which we'll read from the input.\n",
+        "\n",
+        "Note the `do_not_convert` annotation that lets us disable conversion for certain functions and run them as a `py_func` instead, so you can still call them from compiled code."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "DyU0wnnAFEYj",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "@autograph.do_not_convert(run_as=autograph.RunMode.PY_FUNC)\n",
+        "def draw_prediction(color_name, pred):\n",
+        "  pred = pred * 255\n",
+        "  pred = pred.astype(np.uint8)\n",
+        "  plt.axis('off')\n",
+        "  plt.imshow(pred)\n",
+        "  plt.title(color_name)\n",
+        "  plt.show()\n",
+        "\n",
+        "\n",
+        "def inference(color_name, lower_cell, upper_cell, relu_layer):\n",
+        "  _, chars, sequence_length = parse(color_name)\n",
+        "  chars = tf.expand_dims(chars, 0)\n",
+        "  sequence_length = tf.expand_dims(sequence_length, 0)\n",
+        "  pred = model((chars, sequence_length), lower_cell, upper_cell, relu_layer, 1, training=False)\n",
+        "  pred = tf.minimum(pred, 1.0)\n",
+        "  pred = tf.expand_dims(pred, 0)\n",
+        "  draw_prediction(color_name, pred)\n",
+        "  # Create an op that will drive the entire function.\n",
+        "  return tf.no_op()"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "Nt0Kv5OCHip0",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Finally, we put everything together.\n",
+        "\n",
+        "Note that the entire training and testing code is all compiled into a single op (`tf_train_model`) that you only execute once! We also still use a `sess.run` loop for the inference part, because that requires keyboard input."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "-GmWa0GtYWdh",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "output_extras": [
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {}
+          ],
+          "base_uri": "https://localhost:8080/",
+          "height": 668
+        },
+        "outputId": "61f4af1d-c81e-44db-9079-1a7b8ed8ce58",
+        "executionInfo": {
+          "status": "ok",
+          "timestamp": 1522345877153,
+          "user_tz": 240,
+          "elapsed": 75500,
+          "user": {
+            "displayName": "Dan Moldovan",
+            "photoUrl": "//lh5.googleusercontent.com/-Rneh8xjecyk/AAAAAAAAAAI/AAAAAAAACB4/c5vwsJpbktY/s50-c-k-no/photo.jpg",
+            "userId": "112023154726779574577"
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def run_input_loop(sess, inference_ops, color_name_placeholder):\n",
+        "  \"\"\"Helper function that reads from input and calls the inference ops in a loop.\"\"\"\n",
+        "\n",
+        "  tb = widgets.TabBar([\"RNN Colorbot\"])\n",
+        "  while True:\n",
+        "    with tb.output_to(0):\n",
+        "      try:\n",
+        "        color_name = six.moves.input(\"Give me a color name (or press 'enter' to exit): \")\n",
+        "      except (EOFError, KeyboardInterrupt):\n",
+        "        break\n",
+        "    if not color_name:\n",
+        "      break\n",
+        "    with tb.output_to(0):\n",
+        "      tb.clear_tab()\n",
+        "      sess.run(inference_ops, {color_name_placeholder: color_name})\n",
+        "      plt.show()\n",
+        "\n",
+        "with tf.Graph().as_default():\n",
+        "  # Read the data.\n",
+        "  batch_size = 64\n",
+        "  train_data = load_dataset(data_dir, train_url, batch_size)\n",
+        "  eval_data = load_dataset(data_dir, test_url, 50, training=False)\n",
+        "  \n",
+        "  # Create the model components.\n",
+        "  lower_cell, upper_cell, relu_layer = model_components()\n",
+        "  # Create the helper placeholder for inference.\n",
+        "  color_name_placeholder = tf.placeholder(tf.string, shape=())\n",
+        "  \n",
+        "  # Compile the train / test code.\n",
+        "  tf_train_model = autograph.to_graph(train_model)\n",
+        "  train_model_ops = tf_train_model(\n",
+        "      train_data, eval_data, batch_size, lower_cell, upper_cell, relu_layer, train_steps=100)\n",
+        "  \n",
+        "  # Compile the inference code.\n",
+        "  tf_inference = autograph.to_graph(inference)\n",
+        "  inference_ops = tf_inference(color_name_placeholder, lower_cell, upper_cell, relu_layer)\n",
+        "  \n",
+        "  with tf.Session() as sess:\n",
+        "    sess.run(tf.global_variables_initializer())\n",
+        "    \n",
+        "    # Run training and testing.\n",
+        "    sess.run(train_model_ops)\n",
+        "     \n",
+        "    # Run the inference loop.\n",
+        "    run_input_loop(sess, inference_ops, color_name_placeholder)"
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "('Successfully downloaded', 'train.csv', 28010L, 'bytes.')\n",
+            "('Successfully downloaded', 'test.csv', 2414L, 'bytes.')\n",
+            "Step 0 train loss 0.37890616\n",
+            "Step 10 train loss 0.18515904\n",
+            "Step 20 train loss 0.0892782\n",
+            "Step 30 train loss 0.07883155\n",
+            "Step 40 train loss 0.08585831\n",
+            "Step 50 train loss 0.09302989\n",
+            "Step 60 train loss 0.089012615\n",
+            "Step 70 train loss 0.07275697\n",
+            "Step 80 train loss 0.06644974\n",
+            "Step 90 train loss 0.0854013\n",
+            "Test loss 0.13216865Colorbot is ready to generate colors!\n",
+            "\n",
+            "\n",
+            "\n"
+          ],
+          "name": "stdout"
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "text/plain": [
+              "<IPython.core.display.HTML object>"
+            ],
+            "text/html": [
+              "<link rel=stylesheet type=text/css href='/nbextensions/google.colab/tabbar.css'></link>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "text/plain": [
+              "<IPython.core.display.HTML object>"
+            ],
+            "text/html": [
+              "<script src='/nbextensions/google.colab/tabbar_main.min.js'></script>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "text/plain": [
+              "<IPython.core.display.HTML object>"
+            ],
+            "text/html": [
+              "<div id=\"id1\"></div>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"b102d936-3379-11e8-ac70-0242ac110002\"] = colab_lib.createTabBar({\"contentBorder\": [\"0px\"], \"borderColor\": [\"#a7a7a7\"], \"tabNames\": [\"RNN Colorbot\"], \"initialSelection\": 0, \"location\": \"top\", \"contentHeight\": [\"initial\"], \"elementId\": \"id1\"});\n",
+              "//# sourceURL=js_e223a56194"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"b103532a-3379-11e8-ac70-0242ac110002\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_b8c6a821fb"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"b105b28c-3379-11e8-ac70-0242ac110002\"] = google.colab.output.getActiveOutputArea();\n",
+              "//# sourceURL=js_44805e254b"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"b106197a-3379-11e8-ac70-0242ac110002\"] = document.querySelector(\"#id1_content_0\");\n",
+              "//# sourceURL=js_a63d3c6c47"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"b1069f44-3379-11e8-ac70-0242ac110002\"] = google.colab.output.setActiveOutputArea(window[\"b106197a-3379-11e8-ac70-0242ac110002\"]);\n",
+              "//# sourceURL=js_7e203b8bce"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"b1070f38-3379-11e8-ac70-0242ac110002\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_d53293d4a7"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c6d90d5c-3379-11e8-ac70-0242ac110002\"] = google.colab.output.setActiveOutputArea(window[\"b105b28c-3379-11e8-ac70-0242ac110002\"]);\n",
+              "//# sourceURL=js_3000dc2c05"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c6da872c-3379-11e8-ac70-0242ac110002\"] = google.colab.output.getActiveOutputArea();\n",
+              "//# sourceURL=js_4136f669a3"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c6dac868-3379-11e8-ac70-0242ac110002\"] = document.querySelector(\"#id1_content_0\");\n",
+              "//# sourceURL=js_2f70dd9aee"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c6db07d8-3379-11e8-ac70-0242ac110002\"] = google.colab.output.setActiveOutputArea(window[\"c6dac868-3379-11e8-ac70-0242ac110002\"]);\n",
+              "//# sourceURL=js_7226726048"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c6dcc6fe-3379-11e8-ac70-0242ac110002\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_72e7709865"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAVQAAAFZCAYAAADHDNdrAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAB9JJREFUeJzt3E1Lle0ax+HTF4jeEAyMBhE0DawI\nwsCH0AIlaGBWNJBo0CDoA0TQhmDXuKAGDioiCA2KlEAlnl05FD9Co8BeaGCQoBDa2jPZsXt4Bvu/\n0+o4Rmvd1zW4rsmP84bFamo0Go0C4H/WvNYHAPhVCCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKDy\nUxgeHq5Dhw7V4OBgPXz4sHp7e+vWrVt15cqVOnnyZN2/f78ajUbdvn27+vr6qqenp65du1YrKytV\nVfXhw4e6cOFC9fX1VV9fX01PT1dV1dzcXHV3d9eDBw/q+PHj9ccff9TExMRaXpWfWOtaHwD+zuvX\nr+vOnTs1MTFRbW1tdf78+dW16enpGh8fr/b29hobG6upqal6/Phxbdy4sS5evFgjIyM1NDRUly5d\nqv3799fw8HC9efOmTp8+XVNTU1VV9enTp2pubq5nz57V5ORk3bhxo44dO7ZW1+UnZkJl3Zudna2D\nBw9WR0dHbdiwoQYHB1fX9u7dW+3t7VVV9fLlyxocHKytW7dWa2trnTp1qp4/f16Li4s1MzNT586d\nq6qqXbt21YEDB1an1OXl5Tpx4kRVVe3Zs6fevXv3Yy/IL8OEyrr3+fPnamtrW/2+ffv21c//+Xxh\nYaHu3r1bjx49qqqqlZWVam9vr4WFhWo0GnXmzJnVvYuLi9XV1VVVVS0tLbVp06aqqmpubq6vX7/+\nX+/Dr0tQWfe2bNlSi4uLq98/fvz43X0dHR3V29tbQ0ND3zxfXl6ulpaWevLkSW3evPmbtbm5ufyB\n+W155Wfd6+zsrJmZmZqfn68vX77U2NjYd/cdOXKkxsfHa2lpqaqqRkdH6+nTp9Xa2lqHDx+u0dHR\nqqpaWlqqy5cv1/v373/YHfg9CCrrXmdnZw0MDNTAwECdPXu2enp6vrvv6NGj1dPTUwMDA9Xf318v\nXryo7u7uqqq6evVqzc7OVn9/fw0MDNTOnTtrx44dP/Ia/Aaa/B8qP4NGo1FNTU1VVfXq1au6efPm\nX06qsFZMqKx78/Pz1dXVVW/fvq1Go1GTk5O1b9++tT4W/BcTKj+FkZGRunfvXjU1NdXu3bvr+vXr\ntW3btrU+FnxDUAFCvPIDhAgqQMi6+WH/kX8eXesjAPytf/3jz79cM6EChAgqQIigAoQIKkCIoAKE\nCCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQI\nKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgq\nQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpA\niKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCI\noAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIig\nAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAC\nhAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKE\nCCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQI\nKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgq\nQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpA\niKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCI\noAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIig\nAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAC\nhAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKE\nCCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQI\nKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgq\nQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpA\niKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkBI\nU6PRaKz1IQB+BSZUgBBBBQgRVIAQQQUIEVSAEEEFCBFUgBBBBQgRVIAQQQUIEVSAEEEFCBFUgBBB\nBQgRVIAQQQUIEVSAEEEFCBFUgBBBBQgRVIAQQQUIEVSAkH8D1Aj8lNhhe7QAAAAASUVORK5CYII=\n",
+            "text/plain": [
+              "<matplotlib.figure.Figure at 0x7f72f402e850>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1",
+              "user_output"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c70592aa-3379-11e8-ac70-0242ac110002\"] = google.colab.output.setActiveOutputArea(window[\"c6da872c-3379-11e8-ac70-0242ac110002\"]);\n",
+              "//# sourceURL=js_25c3aaf79a"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c70842c0-3379-11e8-ac70-0242ac110002\"] = google.colab.output.getActiveOutputArea();\n",
+              "//# sourceURL=js_984c56b816"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c708dec4-3379-11e8-ac70-0242ac110002\"] = document.querySelector(\"#id1_content_0\");\n",
+              "//# sourceURL=js_e0451a1217"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c7092726-3379-11e8-ac70-0242ac110002\"] = google.colab.output.setActiveOutputArea(window[\"c708dec4-3379-11e8-ac70-0242ac110002\"]);\n",
+              "//# sourceURL=js_7aa23d7385"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c7099044-3379-11e8-ac70-0242ac110002\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_5722756ddb"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "stream",
+          "text": [
+            "Give me a color name (or press 'enter' to exit): \n"
+          ],
+          "name": "stdout"
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c7baac12-3379-11e8-ac70-0242ac110002\"] = google.colab.output.setActiveOutputArea(window[\"c70842c0-3379-11e8-ac70-0242ac110002\"]);\n",
+              "//# sourceURL=js_cdd622e58f"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "AHJ2c47U-A5W",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "# Where do we go next?\n",
+        "\n",
+        "Autograph is available in tensorflow.contrib, but it's still in its early stages. We're excited about the possibilities it brings — write your machine learning code in the flexible Eager style, but still enjoy all the benefits that come with running in graph mode. A beta version will be available soon -- stay tuned!"
+      ]
+    }
+  ]
+}
diff --git a/tensorflow/contrib/py2tf/impl/BUILD b/tensorflow/contrib/autograph/impl/BUILD
similarity index 74%
rename from tensorflow/contrib/py2tf/impl/BUILD
rename to tensorflow/contrib/autograph/impl/BUILD
index 90ffabbc9bf4524ec2ebf54b6dd847bd8768a486..54424e26472b8466b8fe68ea848b5463c10224c9 100644
--- a/tensorflow/contrib/py2tf/impl/BUILD
+++ b/tensorflow/contrib/autograph/impl/BUILD
@@ -25,10 +25,11 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:__subpackages__"],
     deps = [
-        "//tensorflow/contrib/py2tf/converters",
-        "//tensorflow/contrib/py2tf/pyct",
-        "//tensorflow/contrib/py2tf/pyct/static_analysis",
-        "//tensorflow/contrib/py2tf/utils",
+        "//tensorflow/contrib/autograph/converters",
+        "//tensorflow/contrib/autograph/operators",
+        "//tensorflow/contrib/autograph/pyct",
+        "//tensorflow/contrib/autograph/pyct/static_analysis",
+        "//tensorflow/contrib/autograph/utils",
         "@gast_archive//:gast",
         "@six_archive//:six",
     ],
@@ -38,10 +39,12 @@ py_test(
     name = "api_test",
     srcs = ["api_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":impl",
-        "//tensorflow/contrib/py2tf/utils",
+        "//tensorflow/contrib/autograph/utils",
         "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -49,6 +52,7 @@ py_test(
     name = "conversion_test",
     srcs = ["conversion_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":impl",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/py2tf/impl/api.py b/tensorflow/contrib/autograph/impl/api.py
similarity index 76%
rename from tensorflow/contrib/py2tf/impl/api.py
rename to tensorflow/contrib/autograph/impl/api.py
index 883b304089024363f41cabde2cb74c49f01ae836..dce994e50df60d8bd419f62207d77035beac9f5a 100644
--- a/tensorflow/contrib/py2tf/impl/api.py
+++ b/tensorflow/contrib/autograph/impl/api.py
@@ -20,15 +20,20 @@ from __future__ import print_function
 
 from functools import wraps
 
+from enum import Enum
+
+# pylint:disable=g-bad-import-order
 import gast
 import six
-
-from tensorflow.contrib.py2tf.impl import config
-from tensorflow.contrib.py2tf.impl import conversion
-from tensorflow.contrib.py2tf.pyct import compiler
-from tensorflow.contrib.py2tf.pyct import inspect_utils
-from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.contrib.py2tf.utils import builtins
+# pylint:enable=g-bad-import-order
+
+from tensorflow.contrib.autograph.impl import config
+from tensorflow.contrib.autograph.impl import conversion
+from tensorflow.contrib.autograph.pyct import compiler
+from tensorflow.contrib.autograph.pyct import inspect_utils
+from tensorflow.contrib.autograph.pyct import parser
+from tensorflow.contrib.autograph.utils import builtins
+from tensorflow.contrib.autograph.utils import py_func
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import tf_inspect
 
@@ -37,55 +42,6 @@ from tensorflow.python.util import tf_inspect
 # (currently we require (module + class name, type))
 
 
-def graph_ready(f):
-  """No-op decorator that explicitly marks a function as graph-ready.
-
-  Graph-ready functions are assumed to not need any conversion.
-
-  Args:
-    f: Any callable.
-  Returns:
-    f itself.
-  """
-  setattr(f, '__pyct_is_compile_decorator', True)
-  return f
-
-
-def convert_inline(f, *args, **kwargs):
-  """Shorthand to convert and call a function.
-
-  For example, the following two statements are equivalent:
-
-      @convert()
-      def foo():
-        ...
-      foo(bar)
-
-      def foo():
-        ...
-      convert_inline(foo, bar)
-
-  Args:
-    f: Function to convert. Only this call will be converted.
-    *args: Passed through to f.
-    **kwargs: Passed through to f, with the following exceptions:
-        * arg_value_hints: A dict mapping parameter names to objects that can
-            hint at the type of those parameters.
-
-  Returns:
-    The result of the converted f applied to args and kwargs.
-  """
-  if 'arg_value_hints' in kwargs:
-    arg_value_hints = kwargs['arg_value_hints']
-    del kwargs['arg_value_hints']
-  else:
-    arg_value_hints = None
-  if tf_inspect.ismethod(f):
-    # When converting methods, the result is still an unbound function.
-    args = (f.__self__,) + args
-  return convert(arg_value_hints)(f)(*args, **kwargs)
-
-
 def convert(recursive=False, verbose=False, arg_types=None):
   """Decorator that compiles a function to graph mode.
 
@@ -122,6 +78,55 @@ def convert(recursive=False, verbose=False, arg_types=None):
   return decorator
 
 
+class RunMode(Enum):
+  GRAPH = 1
+  PY_FUNC = 2
+
+
+def do_not_convert(run_as=RunMode.GRAPH, return_dtypes=None):
+  """Decorator that suppresses compilation of a function.
+
+  Args:
+    run_as: RunMode value. Whether to run the function as-is, or wrap it into
+        a py_func.
+    return_dtypes: See autograph.utils.py_func.wrap_py_func. Setting to None or
+        empty list or tuple will create a dummy return value that can be used
+        to set control dependencies.
+
+  Returns:
+    A decorator that wraps the original function.
+  """
+  def decorator(f):
+    """Decorator implementation."""
+
+    @wraps(f)
+    def graph_wrapper(*args, **kwargs):
+      return f(*args, **kwargs)
+
+    @wraps(f)
+    def py_func_wrapper(*args, **kwargs):
+      if kwargs:
+        raise NotImplementedError(
+            'RunMode.PY_FUNC does not yet support kwargs')
+      # TODO(mdan): Add support for kwargs.
+      return py_func.wrap_py_func(
+          f, return_dtypes, args, kwargs, use_dummy_return=not return_dtypes)
+
+    if run_as == RunMode.GRAPH:
+      wrapper = graph_wrapper
+    elif run_as == RunMode.PY_FUNC:
+      wrapper = py_func_wrapper
+    else:
+      raise ValueError('unknown value for run_as: %s' % run_as)
+
+    # Sometimes the decorator is just desugared, making it impossible to detect.
+    # This attribute makes detection easier.
+    setattr(wrapper, '__pyct_is_compile_decorator', True)
+    return wrapper
+
+  return decorator
+
+
 def converted_call(f, recursive, verbose, arg_types, *args, **kwargs):
   """Compiles a function call inline."""
   # TODO(mdan): This needs cleanup.
@@ -227,7 +232,7 @@ def to_graph(e,
   """
   conversion_map = conversion.ConversionMap(
       recursive=recursive,
-      nocompile_decorators=(convert, graph_ready, convert_inline),
+      nocompile_decorators=(convert, do_not_convert, converted_call),
       partial_types=partial_types,
       api_module=tf_inspect.getmodule(to_graph))
   _, name = conversion.entity_to_graph(e, conversion_map, arg_values, arg_types)
@@ -242,7 +247,10 @@ def to_graph(e,
   # The compiled code should see everything the entry function saw.
   # TODO(mdan): This might not work well if the call tree spans modules?
   if tf_inspect.isfunction(e):
-    compiled_node.__dict__.update(inspect_utils.getnamespace(e))
+    for key, val in inspect_utils.getnamespace(e).items():
+      # Avoid overwriting entities that have been transformed.
+      if key not in compiled_node.__dict__:
+        compiled_node.__dict__[key] = val
   compiled_fn = getattr(compiled_node, name)
 
   if verbose:
@@ -274,7 +282,7 @@ def to_code(e,
   """
   conversion_map = conversion.ConversionMap(
       recursive=recursive,
-      nocompile_decorators=(convert, graph_ready, convert_inline),
+      nocompile_decorators=(convert, do_not_convert, converted_call),
       partial_types=partial_types,
       api_module=tf_inspect.getmodule(to_graph))
   conversion.entity_to_graph(e, conversion_map, arg_values, arg_types)
diff --git a/tensorflow/contrib/py2tf/impl/api_test.py b/tensorflow/contrib/autograph/impl/api_test.py
similarity index 81%
rename from tensorflow/contrib/py2tf/impl/api_test.py
rename to tensorflow/contrib/autograph/impl/api_test.py
index 13f8e66018920a5b13f8bd3f00c67d3bbdd519aa..ee2d301d7562ef5ba6bc7ca6d013b99dec78d4c3 100644
--- a/tensorflow/contrib/py2tf/impl/api_test.py
+++ b/tensorflow/contrib/autograph/impl/api_test.py
@@ -18,10 +18,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf import utils
-from tensorflow.contrib.py2tf.impl import api
-from tensorflow.contrib.py2tf.impl import config
-from tensorflow.contrib.py2tf.pyct import parser
+import numpy as np
+
+from tensorflow.contrib.autograph import utils
+from tensorflow.contrib.autograph.impl import api
+from tensorflow.contrib.autograph.impl import config
+from tensorflow.contrib.autograph.pyct import parser
+from tensorflow.contrib.autograph.utils import py_func
 from tensorflow.python.framework import constant_op
 from tensorflow.python.platform import test
 
@@ -34,10 +37,8 @@ class ApiTest(test.TestCase):
   def setUp(self):
     config.COMPILED_IMPORT_STATEMENTS = (
         'from __future__ import print_function',
-        'from tensorflow.contrib.py2tf import utils as '
-        'py2tf_utils',
-        'tf = py2tf_utils.fake_tf()'
-    )
+        'from tensorflow.contrib.autograph import utils as '
+        'autograph_utils', 'tf = autograph_utils.fake_tf()')
 
   def test_decorator_recurses(self):
 
@@ -81,11 +82,11 @@ class ApiTest(test.TestCase):
           constant_op.constant(-2))
       self.assertListEqual([0, 1], sess.run(x).tolist())
 
-  def test_decorator_calls_converted(self):
+  def test_decorator_calls_unconverted_graph(self):
 
     class TestClass(object):
 
-      @api.graph_ready
+      @api.do_not_convert(api.RunMode.GRAPH)
       def called_member(self, a):
         return tf.negative(a)
 
@@ -102,20 +103,23 @@ class ApiTest(test.TestCase):
           constant_op.constant(-2))
       self.assertListEqual([0, 1], sess.run(x).tolist())
 
-  def test_decorator_calls_decorated(self):
+  def test_decorator_calls_unconverted_py_func(self):
 
     class TestClass(object):
 
-      @api.convert()
+      @api.do_not_convert(
+          api.RunMode.PY_FUNC, return_dtypes=py_func.MatchDType(1))
       def called_member(self, a):
-        if a < 0:
-          a = -a
-        return a
+        return np.negative(a)
 
       @api.convert(recursive=True)
       def test_method(self, x, s, a):
         while tf.reduce_sum(x) > s:
-          x //= self.called_member(a)
+          y = self.called_member(a)
+          # set_shape works around while_loop's limitations.
+          # TODO(mdan): Allow specifying shapes (or ShapeLike) instead.
+          y.set_shape(a.shape)
+          x //= y
         return x
 
     tc = TestClass()
@@ -125,10 +129,11 @@ class ApiTest(test.TestCase):
           constant_op.constant(-2))
       self.assertListEqual([0, 1], sess.run(x).tolist())
 
-  def test_convert_call_site_decorator(self):
+  def test_decorator_calls_decorated(self):
 
     class TestClass(object):
 
+      @api.convert()
       def called_member(self, a):
         if a < 0:
           a = -a
@@ -137,7 +142,7 @@ class ApiTest(test.TestCase):
       @api.convert(recursive=True)
       def test_method(self, x, s, a):
         while tf.reduce_sum(x) > s:
-          x //= api.convert_inline(self.called_member, a)
+          x //= self.called_member(a)
         return x
 
     tc = TestClass()
@@ -147,17 +152,20 @@ class ApiTest(test.TestCase):
           constant_op.constant(-2))
       self.assertListEqual([0, 1], sess.run(x).tolist())
 
-  def test_graph_ready_call_site_decorator(self):
+  def test_convert_call_site_decorator(self):
 
     class TestClass(object):
 
       def called_member(self, a):
-        return tf.negative(a)
+        if a < 0:
+          a = -a
+        return a
 
       @api.convert(recursive=True)
       def test_method(self, x, s, a):
         while tf.reduce_sum(x) > s:
-          x //= api.graph_ready(self.called_member(a))
+          x //= api.converted_call(self.called_member, False, False, {}, self,
+                                   a)
         return x
 
     tc = TestClass()
@@ -168,6 +176,7 @@ class ApiTest(test.TestCase):
       self.assertListEqual([0, 1], sess.run(x).tolist())
 
   def test_to_graph_basic(self):
+
     def test_fn(x, s):
       while tf.reduce_sum(x) > s:
         x //= 2
@@ -180,6 +189,7 @@ class ApiTest(test.TestCase):
       self.assertListEqual([1, 2], sess.run(x).tolist())
 
   def test_to_code_basic(self):
+
     def test_fn(x, s):
       while tf.reduce_sum(x) > s:
         x /= 2
@@ -188,7 +198,7 @@ class ApiTest(test.TestCase):
     compiled_code = api.to_code(test_fn)
 
     # Just check for some key words and that it is parseable Python code.
-    self.assertRegexpMatches(compiled_code, 'py2tf_utils\\.run_while')
+    self.assertRegexpMatches(compiled_code, 'autograph_utils\\.run_while')
     self.assertIsNotNone(parser.parse_str(compiled_code))
 
 
diff --git a/tensorflow/contrib/py2tf/impl/config.py b/tensorflow/contrib/autograph/impl/config.py
similarity index 73%
rename from tensorflow/contrib/py2tf/impl/config.py
rename to tensorflow/contrib/autograph/impl/config.py
index bdbc6663dd65ed66c55ad2d2e52428084bbea219..26326465e265f5b40c3badedc0ea2813248ef60f 100644
--- a/tensorflow/contrib/py2tf/impl/config.py
+++ b/tensorflow/contrib/autograph/impl/config.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf import utils
+from tensorflow.contrib.autograph import utils
 
 
 PYTHON_LITERALS = {
@@ -35,16 +35,21 @@ DEFAULT_UNCOMPILED_MODULES = set((
     # All of tensorflow's subpackages. Unlike the root tf module, they don't
     # have well-known names. Not refering to the module directly to avoid
     # circular imports.
-    (utils.__name__[:-len('.contrib.py2tf.utils')],),
+    (
+        utils.__name__[:-len('.contrib.autograph.utils')],),
 ))
 
 NO_SIDE_EFFECT_CONSTRUCTORS = set(('tensorflow',))
 
-# TODO(mdan): Also allow controlling the generated names (for testability).
+# TODO(mdan): Also allow controlling the generated names.
+# TODO(mdan); Consolidate all internal imports into a single __ag module.
 COMPILED_IMPORT_STATEMENTS = (
     'from __future__ import print_function',
     'import tensorflow as tf',
-    'from tensorflow.contrib.py2tf.impl import api as '
-    'py2tf_api',
-    'from tensorflow.contrib.py2tf import utils as '
-    'py2tf_utils')
+    'from tensorflow.contrib.autograph.impl import api'
+    ' as autograph_api',
+    'from tensorflow.contrib.autograph import utils'
+    ' as autograph_utils',
+    'from tensorflow.contrib.autograph import operators'
+    ' as __ops',
+)
diff --git a/tensorflow/contrib/py2tf/impl/conversion.py b/tensorflow/contrib/autograph/impl/conversion.py
similarity index 84%
rename from tensorflow/contrib/py2tf/impl/conversion.py
rename to tensorflow/contrib/autograph/impl/conversion.py
index 37b24ab55fdd1b03e12e9afe06530e3c26218b61..62a49cd92d835fb942f48354041cb0ab03d02c97 100644
--- a/tensorflow/contrib/py2tf/impl/conversion.py
+++ b/tensorflow/contrib/autograph/impl/conversion.py
@@ -20,31 +20,31 @@ from __future__ import print_function
 
 import gast
 
-from tensorflow.contrib.py2tf import utils
-from tensorflow.contrib.py2tf.converters import asserts
-from tensorflow.contrib.py2tf.converters import break_statements
-from tensorflow.contrib.py2tf.converters import builtin_functions
-from tensorflow.contrib.py2tf.converters import call_trees
-from tensorflow.contrib.py2tf.converters import continue_statements
-from tensorflow.contrib.py2tf.converters import control_flow
-from tensorflow.contrib.py2tf.converters import decorators
-from tensorflow.contrib.py2tf.converters import for_loops
-from tensorflow.contrib.py2tf.converters import ifexp
-from tensorflow.contrib.py2tf.converters import lists
-from tensorflow.contrib.py2tf.converters import logical_expressions
-from tensorflow.contrib.py2tf.converters import name_scopes
-from tensorflow.contrib.py2tf.converters import side_effect_guards
-from tensorflow.contrib.py2tf.converters import single_return
-from tensorflow.contrib.py2tf.impl import config
-from tensorflow.contrib.py2tf.impl import naming
-from tensorflow.contrib.py2tf.pyct import context
-from tensorflow.contrib.py2tf.pyct import inspect_utils
-from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.contrib.py2tf.pyct import qual_names
-from tensorflow.contrib.py2tf.pyct.static_analysis import activity
-from tensorflow.contrib.py2tf.pyct.static_analysis import live_values
-from tensorflow.contrib.py2tf.pyct.static_analysis import type_info
-from tensorflow.contrib.py2tf.utils import type_hints
+from tensorflow.contrib.autograph import utils
+from tensorflow.contrib.autograph.converters import asserts
+from tensorflow.contrib.autograph.converters import break_statements
+from tensorflow.contrib.autograph.converters import builtin_functions
+from tensorflow.contrib.autograph.converters import call_trees
+from tensorflow.contrib.autograph.converters import continue_statements
+from tensorflow.contrib.autograph.converters import control_flow
+from tensorflow.contrib.autograph.converters import decorators
+from tensorflow.contrib.autograph.converters import for_loops
+from tensorflow.contrib.autograph.converters import ifexp
+from tensorflow.contrib.autograph.converters import lists
+from tensorflow.contrib.autograph.converters import logical_expressions
+from tensorflow.contrib.autograph.converters import name_scopes
+from tensorflow.contrib.autograph.converters import side_effect_guards
+from tensorflow.contrib.autograph.converters import single_return
+from tensorflow.contrib.autograph.impl import config
+from tensorflow.contrib.autograph.impl import naming
+from tensorflow.contrib.autograph.pyct import context
+from tensorflow.contrib.autograph.pyct import inspect_utils
+from tensorflow.contrib.autograph.pyct import parser
+from tensorflow.contrib.autograph.pyct import qual_names
+from tensorflow.contrib.autograph.pyct.static_analysis import activity
+from tensorflow.contrib.autograph.pyct.static_analysis import live_values
+from tensorflow.contrib.autograph.pyct.static_analysis import type_info
+from tensorflow.contrib.autograph.utils import type_hints
 from tensorflow.python.util import tf_inspect
 
 
@@ -213,19 +213,19 @@ def class_to_graph(c, conversion_map):
 def _add_self_references(namespace, api_module):
   """Self refs are only required for analysis and are not used directly."""
   # Manually add the utils namespace which may be used from generated code.
-  if 'py2tf_util' not in namespace:
-    namespace['py2tf_utils'] = utils
-  elif namespace['py2tf_utils'] != utils:
+  if 'autograph_util' not in namespace:
+    namespace['autograph_utils'] = utils
+  elif namespace['autograph_utils'] != utils:
     raise ValueError(
-        'The module name "py2tf_utils" is reserved and may not be used.')
+        'The module name "autograph_utils" is reserved and may not be used.')
 
   # We also make reference to the api module for dynamic conversion, but
   # to avoid circular references we don't import it here.
-  if 'py2tf_api' not in namespace:
-    namespace['py2tf_api'] = api_module
-  elif namespace['py2tf_api'] != api_module:
+  if 'autograph_api' not in namespace:
+    namespace['autograph_api'] = api_module
+  elif namespace['autograph_api'] != api_module:
     raise ValueError(
-        'The module name "py2tf_api" is reserved and may not be used.')
+        'The module name "autograph_api" is reserved and may not be used.')
 
 
 def function_to_graph(f, conversion_map, arg_values, arg_types,
diff --git a/tensorflow/contrib/py2tf/impl/conversion_test.py b/tensorflow/contrib/autograph/impl/conversion_test.py
similarity index 96%
rename from tensorflow/contrib/py2tf/impl/conversion_test.py
rename to tensorflow/contrib/autograph/impl/conversion_test.py
index 9ff256aace7a0e7ac5e7ac07e580b8bed7d8df6f..7066739eb87f89ab98e906b10dab62baeaa2de8e 100644
--- a/tensorflow/contrib/py2tf/impl/conversion_test.py
+++ b/tensorflow/contrib/autograph/impl/conversion_test.py
@@ -20,8 +20,8 @@ from __future__ import print_function
 
 import gast
 
-from tensorflow.contrib.py2tf import utils
-from tensorflow.contrib.py2tf.impl import conversion
+from tensorflow.contrib.autograph import utils
+from tensorflow.contrib.autograph.impl import conversion
 from tensorflow.python.framework import constant_op
 from tensorflow.python.platform import test
 
diff --git a/tensorflow/contrib/py2tf/impl/naming.py b/tensorflow/contrib/autograph/impl/naming.py
similarity index 98%
rename from tensorflow/contrib/py2tf/impl/naming.py
rename to tensorflow/contrib/autograph/impl/naming.py
index 51326091de13715c32d0a79279f1d3274e48ad10..1facaa0ca0ebcc6d4281e7c92a462ceeb00b453a 100644
--- a/tensorflow/contrib/py2tf/impl/naming.py
+++ b/tensorflow/contrib/autograph/impl/naming.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.pyct import qual_names
+from tensorflow.contrib.autograph.pyct import qual_names
 
 
 class Namer(object):
diff --git a/tensorflow/contrib/py2tf/impl/naming_test.py b/tensorflow/contrib/autograph/impl/naming_test.py
similarity index 98%
rename from tensorflow/contrib/py2tf/impl/naming_test.py
rename to tensorflow/contrib/autograph/impl/naming_test.py
index beb4e54937bbb91b19157c9b9e3c528353206c62..73fc0894655cb49e4f61bf8ca51995b06feb3072 100644
--- a/tensorflow/contrib/py2tf/impl/naming_test.py
+++ b/tensorflow/contrib/autograph/impl/naming_test.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.impl import naming
+from tensorflow.contrib.autograph.impl import naming
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/contrib/autograph/operators/BUILD b/tensorflow/contrib/autograph/operators/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..7856c253bd0c83b1712267184393a8742576bfcd
--- /dev/null
+++ b/tensorflow/contrib/autograph/operators/BUILD
@@ -0,0 +1,25 @@
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+py_library(
+    name = "operators",
+    srcs = [
+        "__init__.py",
+    ],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:__subpackages__"],
+    deps = [],
+)
diff --git a/tensorflow/contrib/bayesflow/python/ops/hmc.py b/tensorflow/contrib/autograph/operators/__init__.py
similarity index 62%
rename from tensorflow/contrib/bayesflow/python/ops/hmc.py
rename to tensorflow/contrib/autograph/operators/__init__.py
index c8a5a195d3d709ded7afd09287255deab2ac2f3c..c3f4cab69eed416ed5f4987076969de9c353c203 100644
--- a/tensorflow/contrib/bayesflow/python/ops/hmc.py
+++ b/tensorflow/contrib/autograph/operators/__init__.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,19 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Hamiltonian Monte Carlo, a gradient-based MCMC algorithm."""
+"""This module implements operators that we overload.
+
+Note that "operator" is used loosely here, and includes control structures like
+conditionals and loops, implemented in functional form, using for example
+closures for the body.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-
-# go/tf-wildcard-import
-from tensorflow.contrib.bayesflow.python.ops.hmc_impl import *  # pylint: disable=wildcard-import,unused-wildcard-import,g-importing-member
-from tensorflow.python.util import all_util
-
-_allowed_symbols = [
-    "sample_chain",
-    "kernel",
-]
-
-all_util.remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/py2tf/pyct/BUILD b/tensorflow/contrib/autograph/pyct/BUILD
similarity index 98%
rename from tensorflow/contrib/py2tf/pyct/BUILD
rename to tensorflow/contrib/autograph/pyct/BUILD
index edec5f7712d08247437c9e95d743e59dafffcd7b..c483ff68c4b7c6d9a3315f569b62b8f253079f00 100644
--- a/tensorflow/contrib/py2tf/pyct/BUILD
+++ b/tensorflow/contrib/autograph/pyct/BUILD
@@ -66,6 +66,7 @@ py_test(
     name = "compiler_test",
     srcs = ["compiler_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":pyct",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/py2tf/pyct/__init__.py b/tensorflow/contrib/autograph/pyct/__init__.py
similarity index 100%
rename from tensorflow/contrib/py2tf/pyct/__init__.py
rename to tensorflow/contrib/autograph/pyct/__init__.py
diff --git a/tensorflow/contrib/py2tf/pyct/anno.py b/tensorflow/contrib/autograph/pyct/anno.py
similarity index 100%
rename from tensorflow/contrib/py2tf/pyct/anno.py
rename to tensorflow/contrib/autograph/pyct/anno.py
diff --git a/tensorflow/contrib/py2tf/pyct/anno_test.py b/tensorflow/contrib/autograph/pyct/anno_test.py
similarity index 97%
rename from tensorflow/contrib/py2tf/pyct/anno_test.py
rename to tensorflow/contrib/autograph/pyct/anno_test.py
index 6c29918fdfaaa0224f20a2c3cb2ea8088f3eb52b..1d4d9d119e0c45c4bf9dd4e5b8156766489a2e4d 100644
--- a/tensorflow/contrib/py2tf/pyct/anno_test.py
+++ b/tensorflow/contrib/autograph/pyct/anno_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import ast
 
-from tensorflow.contrib.py2tf.pyct import anno
+from tensorflow.contrib.autograph.pyct import anno
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/contrib/py2tf/pyct/ast_util.py b/tensorflow/contrib/autograph/pyct/ast_util.py
similarity index 87%
rename from tensorflow/contrib/py2tf/pyct/ast_util.py
rename to tensorflow/contrib/autograph/pyct/ast_util.py
index f916775b9cf3cec960ec2896c334f1d737862205..4f76a695228f7d84b80b2e4b03801e15e94b8f11 100644
--- a/tensorflow/contrib/py2tf/pyct/ast_util.py
+++ b/tensorflow/contrib/autograph/pyct/ast_util.py
@@ -22,7 +22,7 @@ import ast
 
 import gast
 
-from tensorflow.contrib.py2tf.pyct import anno
+from tensorflow.contrib.autograph.pyct import anno
 
 
 class CleanCopier(gast.NodeVisitor):
@@ -84,7 +84,10 @@ class SymbolRenamer(gast.NodeTransformer):
     return self._process(node)
 
   def visit_Attribute(self, node):
-    return self._process(node)
+    if anno.hasanno(node, anno.Basic.QN):
+      return self._process(node)
+    # Attributes of dynamic objects will not have a QN.
+    return self.generic_visit(node)
 
 
 def rename_symbols(node, name_map):
@@ -94,3 +97,12 @@ def rename_symbols(node, name_map):
   elif isinstance(node, tuple):
     return tuple(renamer.visit(n) for n in node)
   return renamer.visit(node)
+
+
+def keywords_to_dict(keywords):
+  keys = []
+  values = []
+  for kw in keywords:
+    keys.append(gast.Str(kw.arg))
+    values.append(kw.value)
+  return gast.Dict(keys=keys, values=values)
diff --git a/tensorflow/contrib/py2tf/pyct/ast_util_test.py b/tensorflow/contrib/autograph/pyct/ast_util_test.py
similarity index 78%
rename from tensorflow/contrib/py2tf/pyct/ast_util_test.py
rename to tensorflow/contrib/autograph/pyct/ast_util_test.py
index a871ccad6fc7ea1487e41fd6da3ce6120bdcbcbd..8faf92c705d997db298dbb1115981fd9da26372d 100644
--- a/tensorflow/contrib/py2tf/pyct/ast_util_test.py
+++ b/tensorflow/contrib/autograph/pyct/ast_util_test.py
@@ -20,8 +20,10 @@ from __future__ import print_function
 
 import ast
 
-from tensorflow.contrib.py2tf.pyct import ast_util
-from tensorflow.contrib.py2tf.pyct import qual_names
+from tensorflow.contrib.autograph.pyct import ast_util
+from tensorflow.contrib.autograph.pyct import compiler
+from tensorflow.contrib.autograph.pyct import parser
+from tensorflow.contrib.autograph.pyct import qual_names
 from tensorflow.python.platform import test
 
 
@@ -74,6 +76,17 @@ class AstUtilTest(test.TestCase):
     self.assertFalse(ret is new_node.body[0])
     self.assertFalse(hasattr(new_node.body[0], '__foo'))
 
+  def test_keywords_to_dict(self):
+    keywords = parser.parse_expression('f(a=b, c=1, d=\'e\')').keywords
+    d = ast_util.keywords_to_dict(keywords)
+    # Make sure we generate a usable dict node by attaching it to a variable and
+    # compiling everything.
+    output = parser.parse_str('b = 3')
+    output.body += (ast.Assign([ast.Name(id='d', ctx=ast.Store())], d),)
+    result, _ = compiler.ast_to_object(output)
+    self.assertDictEqual(result.d, {'a': 3, 'c': 1, 'd': 'e'})
+    print(d)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/py2tf/pyct/compiler.py b/tensorflow/contrib/autograph/pyct/compiler.py
similarity index 98%
rename from tensorflow/contrib/py2tf/pyct/compiler.py
rename to tensorflow/contrib/autograph/pyct/compiler.py
index 507dbc7ed3de9c0b8874164e97a3d1d149e42423..24c4517afa89147101f80af3ef60237132c1144c 100644
--- a/tensorflow/contrib/py2tf/pyct/compiler.py
+++ b/tensorflow/contrib/autograph/pyct/compiler.py
@@ -31,7 +31,7 @@ import astor
 import gast
 
 
-def ast_to_source(node, indentation):
+def ast_to_source(node, indentation='  '):
   """Return the source code of given AST."""
   if isinstance(node, gast.AST):
     node = gast.gast_to_ast(node)
diff --git a/tensorflow/contrib/py2tf/pyct/compiler_test.py b/tensorflow/contrib/autograph/pyct/compiler_test.py
similarity index 96%
rename from tensorflow/contrib/py2tf/pyct/compiler_test.py
rename to tensorflow/contrib/autograph/pyct/compiler_test.py
index 243f4c81538f5853a01ff444f2ff16ccf7cd5d62..98cdc1506b6aced603df99662f1468687a55f92c 100644
--- a/tensorflow/contrib/py2tf/pyct/compiler_test.py
+++ b/tensorflow/contrib/autograph/pyct/compiler_test.py
@@ -22,8 +22,8 @@ import textwrap
 
 import gast
 
-from tensorflow.contrib.py2tf.pyct import compiler
-from tensorflow.contrib.py2tf.pyct import parser
+from tensorflow.contrib.autograph.pyct import compiler
+from tensorflow.contrib.autograph.pyct import parser
 from tensorflow.python.platform import test
 from tensorflow.python.util import tf_inspect
 
diff --git a/tensorflow/contrib/py2tf/pyct/context.py b/tensorflow/contrib/autograph/pyct/context.py
similarity index 100%
rename from tensorflow/contrib/py2tf/pyct/context.py
rename to tensorflow/contrib/autograph/pyct/context.py
diff --git a/tensorflow/contrib/py2tf/pyct/inspect_utils.py b/tensorflow/contrib/autograph/pyct/inspect_utils.py
similarity index 100%
rename from tensorflow/contrib/py2tf/pyct/inspect_utils.py
rename to tensorflow/contrib/autograph/pyct/inspect_utils.py
diff --git a/tensorflow/contrib/py2tf/pyct/inspect_utils_test.py b/tensorflow/contrib/autograph/pyct/inspect_utils_test.py
similarity index 98%
rename from tensorflow/contrib/py2tf/pyct/inspect_utils_test.py
rename to tensorflow/contrib/autograph/pyct/inspect_utils_test.py
index 5528ac851f74bd7b7dacdbe7b930945afa8c9783..ddca6f963b8abadd621c544a79935c69326bf65e 100644
--- a/tensorflow/contrib/py2tf/pyct/inspect_utils_test.py
+++ b/tensorflow/contrib/autograph/pyct/inspect_utils_test.py
@@ -22,7 +22,7 @@ from functools import wraps
 
 import six
 
-from tensorflow.contrib.py2tf.pyct import inspect_utils
+from tensorflow.contrib.autograph.pyct import inspect_utils
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/contrib/py2tf/pyct/parser.py b/tensorflow/contrib/autograph/pyct/parser.py
similarity index 64%
rename from tensorflow/contrib/py2tf/pyct/parser.py
rename to tensorflow/contrib/autograph/pyct/parser.py
index dc7df883b349becd860bb0dbceab22cb39c750b5..c961efa892df6a21804dae8f52ef64bf99cd409e 100644
--- a/tensorflow/contrib/py2tf/pyct/parser.py
+++ b/tensorflow/contrib/autograph/pyct/parser.py
@@ -29,12 +29,30 @@ from tensorflow.python.util import tf_inspect
 
 
 def parse_entity(entity):
-  """Return the AST of given entity."""
+  """Returns the AST of given entity."""
   source = tf_inspect.getsource(entity)
   source = textwrap.dedent(source)
   return parse_str(source), source
 
 
 def parse_str(src):
-  """Return the AST of given piece of code."""
+  """Returns the AST of given piece of code."""
   return gast.parse(src)
+
+
+def parse_expression(src):
+  """Returns the AST of given identifier.
+
+  Args:
+    src: A piece of code that represents a single Python expression
+  Returns:
+    A gast.AST object.
+  Raises:
+    ValueError: if src does not consist of a single Expression.
+  """
+  node = parse_str(src)
+  assert isinstance(node, gast.Module)
+  if len(node.body) != 1 and not isinstance(node.body[0], gast.Expr):
+    raise ValueError(
+        'Expected a single expression, found instead %s' % node.body)
+  return node.body[0].value
diff --git a/tensorflow/contrib/py2tf/pyct/parser_test.py b/tensorflow/contrib/autograph/pyct/parser_test.py
similarity index 80%
rename from tensorflow/contrib/py2tf/pyct/parser_test.py
rename to tensorflow/contrib/autograph/pyct/parser_test.py
index f35dfa04c70dc191078248c32f9a04d28133129a..007a4c6fb0393b7235808478d55b3ffa469f85d0 100644
--- a/tensorflow/contrib/py2tf/pyct/parser_test.py
+++ b/tensorflow/contrib/autograph/pyct/parser_test.py
@@ -20,28 +20,33 @@ from __future__ import print_function
 
 import textwrap
 
-from tensorflow.contrib.py2tf.pyct import parser
+from tensorflow.contrib.autograph.pyct import parser
 from tensorflow.python.platform import test
 
 
-def f(x):
-  return x + 1
-
-
 class ParserTest(test.TestCase):
 
   def test_parse_entity(self):
+
+    def f(x):
+      return x + 1
+
     mod, _ = parser.parse_entity(f)
     self.assertEqual('f', mod.body[0].name)
 
   def test_parse_str(self):
     mod = parser.parse_str(
         textwrap.dedent("""
-        def f(x):
-          return x + 1
+            def f(x):
+              return x + 1
     """))
     self.assertEqual('f', mod.body[0].name)
 
+  def test_parse_expression(self):
+    node = parser.parse_expression('a.b')
+    self.assertEqual('a', node.value.id)
+    self.assertEqual('b', node.attr)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/py2tf/pyct/pretty_printer.py b/tensorflow/contrib/autograph/pyct/pretty_printer.py
similarity index 100%
rename from tensorflow/contrib/py2tf/pyct/pretty_printer.py
rename to tensorflow/contrib/autograph/pyct/pretty_printer.py
diff --git a/tensorflow/contrib/py2tf/pyct/pretty_printer_test.py b/tensorflow/contrib/autograph/pyct/pretty_printer_test.py
similarity index 96%
rename from tensorflow/contrib/py2tf/pyct/pretty_printer_test.py
rename to tensorflow/contrib/autograph/pyct/pretty_printer_test.py
index 81e3f47b80b6cb3bb7ba9f4a1787d03df4151a99..0cb48f35760b7b2655eb5cf73017b70e28dae219 100644
--- a/tensorflow/contrib/py2tf/pyct/pretty_printer_test.py
+++ b/tensorflow/contrib/autograph/pyct/pretty_printer_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import ast
 
-from tensorflow.contrib.py2tf.pyct import pretty_printer
+from tensorflow.contrib.autograph.pyct import pretty_printer
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/contrib/py2tf/pyct/qual_names.py b/tensorflow/contrib/autograph/pyct/qual_names.py
similarity index 91%
rename from tensorflow/contrib/py2tf/pyct/qual_names.py
rename to tensorflow/contrib/autograph/pyct/qual_names.py
index 6bcbaeb2aeb3043919e84bc6599edf5aee583c6d..4d5764a974aac542ddf4a54a9acd36f1afcb0464 100644
--- a/tensorflow/contrib/py2tf/pyct/qual_names.py
+++ b/tensorflow/contrib/autograph/pyct/qual_names.py
@@ -29,7 +29,7 @@ import collections
 
 import gast
 
-from tensorflow.contrib.py2tf.pyct import anno
+from tensorflow.contrib.autograph.pyct import anno
 
 
 class Symbol(collections.namedtuple('Symbol', ['name'])):
@@ -169,14 +169,6 @@ class QnResolver(gast.NodeTransformer):
   Note: Not using NodeAnnos to avoid circular dependencies.
   """
 
-  def visit_Call(self, node):
-    node = self.generic_visit(node)
-    # This helps treat the following cases uniformly:
-    #   a = b[i]
-    #   a = b()[i]
-    anno.copyanno(node.func, node, anno.Basic.QN)
-    return node
-
   def visit_Name(self, node):
     node = self.generic_visit(node)
     anno.setanno(node, anno.Basic.QN, QN(node.id))
@@ -184,8 +176,9 @@ class QnResolver(gast.NodeTransformer):
 
   def visit_Attribute(self, node):
     node = self.generic_visit(node)
-    anno.setanno(node, anno.Basic.QN,
-                 QN(anno.getanno(node.value, anno.Basic.QN), attr=node.attr))
+    if anno.hasanno(node.value, anno.Basic.QN):
+      anno.setanno(node, anno.Basic.QN,
+                   QN(anno.getanno(node.value, anno.Basic.QN), attr=node.attr))
     return node
 
   def visit_Subscript(self, node):
@@ -201,9 +194,10 @@ class QnResolver(gast.NodeTransformer):
       subscript = QN(StringLiteral(s.value.s))
     else:
       subscript = anno.getanno(node.slice.value, anno.Basic.QN)
-    anno.setanno(node, anno.Basic.QN,
-                 QN(anno.getanno(node.value, anno.Basic.QN),
-                    subscript=subscript))
+    if anno.hasanno(node.value, anno.Basic.QN):
+      anno.setanno(node, anno.Basic.QN,
+                   QN(anno.getanno(node.value, anno.Basic.QN),
+                      subscript=subscript))
     return node
 
 
diff --git a/tensorflow/contrib/py2tf/pyct/qual_names_test.py b/tensorflow/contrib/autograph/pyct/qual_names_test.py
similarity index 89%
rename from tensorflow/contrib/py2tf/pyct/qual_names_test.py
rename to tensorflow/contrib/autograph/pyct/qual_names_test.py
index f2cd8e98f02213c9035fdb5b20e0862f0a8fd3f6..103bd25aa380e9f61ecea9c5298f34df5157d629 100644
--- a/tensorflow/contrib/py2tf/pyct/qual_names_test.py
+++ b/tensorflow/contrib/autograph/pyct/qual_names_test.py
@@ -20,11 +20,11 @@ from __future__ import print_function
 
 import textwrap
 
-from tensorflow.contrib.py2tf.pyct import anno
-from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.contrib.py2tf.pyct import qual_names
-from tensorflow.contrib.py2tf.pyct.qual_names import QN
-from tensorflow.contrib.py2tf.pyct.qual_names import resolve
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import parser
+from tensorflow.contrib.autograph.pyct import qual_names
+from tensorflow.contrib.autograph.pyct.qual_names import QN
+from tensorflow.contrib.autograph.pyct.qual_names import resolve
 from tensorflow.python.platform import test
 
 
@@ -208,6 +208,24 @@ class QNResolverTest(test.TestCase):
     self.assertQNStringIs(nodes[8], 'a.b[c[d]].e.f')
     self.assertQNStringIs(nodes[9], 'a.b[c[d.e.f].g].h')
 
+  def test_function_calls(self):
+    samples = """
+      a.b
+      a.b()
+      a().b
+      z[i]
+      z[i]()
+      z()[i]
+    """
+    nodes = resolve(parser.parse_str(textwrap.dedent(samples)))
+    nodes = tuple(n.value for n in nodes.body)
+    self.assertQNStringIs(nodes[0], 'a.b')
+    self.assertQNStringIs(nodes[1].func, 'a.b')
+    self.assertQNStringIs(nodes[2].value.func, 'a')
+    self.assertQNStringIs(nodes[3], 'z[i]')
+    self.assertQNStringIs(nodes[4].func, 'z[i]')
+    self.assertQNStringIs(nodes[5].value.func, 'z')
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/BUILD b/tensorflow/contrib/autograph/pyct/static_analysis/BUILD
similarity index 80%
rename from tensorflow/contrib/py2tf/pyct/static_analysis/BUILD
rename to tensorflow/contrib/autograph/pyct/static_analysis/BUILD
index 2799b56a0042e99b8f8b38100d07c5afaef9f424..83f3bafc4217649db6499566d548c1657428ad0b 100644
--- a/tensorflow/contrib/py2tf/pyct/static_analysis/BUILD
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/BUILD
@@ -25,7 +25,7 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/contrib/py2tf/pyct",
+        "//tensorflow/contrib/autograph/pyct",
         "@gast_archive//:gast",
     ],
 )
@@ -34,9 +34,10 @@ py_test(
     name = "activity_test",
     srcs = ["activity_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":static_analysis",
-        "//tensorflow/contrib/py2tf/pyct",
+        "//tensorflow/contrib/autograph/pyct",
         "//tensorflow/python:client_testlib",
         "@gast_archive//:gast",
     ],
@@ -46,9 +47,10 @@ py_test(
     name = "live_values_test",
     srcs = ["live_values_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":static_analysis",
-        "//tensorflow/contrib/py2tf/pyct",
+        "//tensorflow/contrib/autograph/pyct",
         "//tensorflow/python:client_testlib",
     ],
 )
@@ -59,8 +61,8 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":static_analysis",
-        "//tensorflow/contrib/py2tf/pyct",
-        "//tensorflow/contrib/py2tf/utils",
+        "//tensorflow/contrib/autograph/pyct",
+        "//tensorflow/contrib/autograph/utils",
         "//tensorflow/python:client_testlib",
     ],
 )
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/__init__.py b/tensorflow/contrib/autograph/pyct/static_analysis/__init__.py
similarity index 100%
rename from tensorflow/contrib/py2tf/pyct/static_analysis/__init__.py
rename to tensorflow/contrib/autograph/pyct/static_analysis/__init__.py
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/activity.py b/tensorflow/contrib/autograph/pyct/static_analysis/activity.py
similarity index 96%
rename from tensorflow/contrib/py2tf/pyct/static_analysis/activity.py
rename to tensorflow/contrib/autograph/pyct/static_analysis/activity.py
index 87fc8c979c4e3310fb3aa82b0f23d909b0170cda..da6a2f6f0500ebba41b85d06dcc912aae9d68f97 100644
--- a/tensorflow/contrib/py2tf/pyct/static_analysis/activity.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/activity.py
@@ -22,10 +22,10 @@ import copy
 
 import gast
 
-from tensorflow.contrib.py2tf.pyct import anno
-from tensorflow.contrib.py2tf.pyct import transformer
-from tensorflow.contrib.py2tf.pyct.qual_names import QN
-from tensorflow.contrib.py2tf.pyct.static_analysis.annos import NodeAnno
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import transformer
+from tensorflow.contrib.autograph.pyct.qual_names import QN
+from tensorflow.contrib.autograph.pyct.static_analysis.annos import NodeAnno
 
 # TODO(mdan): Add support for PY3 (e.g. Param vs arg).
 
@@ -171,6 +171,10 @@ class ActivityAnalizer(transformer.Base):
     self._in_return_statement = False
 
   def _track_symbol(self, node):
+    # This can happen when we have an attribute (or subscript) on a function
+    # call.  Example: a().b
+    if not anno.hasanno(node, anno.Basic.QN):
+      return
     qn = anno.getanno(node, anno.Basic.QN)
 
     if isinstance(node.ctx, gast.Store):
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/activity_test.py b/tensorflow/contrib/autograph/pyct/static_analysis/activity_test.py
similarity index 95%
rename from tensorflow/contrib/py2tf/pyct/static_analysis/activity_test.py
rename to tensorflow/contrib/autograph/pyct/static_analysis/activity_test.py
index b16d15b39d8eb4c444cbc50ae62baa3a8fcc7841..37c28872bb9fc4f0c6f95eec8145101b7a6c83de 100644
--- a/tensorflow/contrib/py2tf/pyct/static_analysis/activity_test.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/activity_test.py
@@ -20,13 +20,13 @@ from __future__ import print_function
 
 import gast
 
-from tensorflow.contrib.py2tf.pyct import anno
-from tensorflow.contrib.py2tf.pyct import context
-from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.contrib.py2tf.pyct import qual_names
-from tensorflow.contrib.py2tf.pyct.qual_names import QN
-from tensorflow.contrib.py2tf.pyct.static_analysis import activity
-from tensorflow.contrib.py2tf.pyct.static_analysis.annos import NodeAnno
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import context
+from tensorflow.contrib.autograph.pyct import parser
+from tensorflow.contrib.autograph.pyct import qual_names
+from tensorflow.contrib.autograph.pyct.qual_names import QN
+from tensorflow.contrib.autograph.pyct.static_analysis import activity
+from tensorflow.contrib.autograph.pyct.static_analysis.annos import NodeAnno
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/annos.py b/tensorflow/contrib/autograph/pyct/static_analysis/annos.py
similarity index 100%
rename from tensorflow/contrib/py2tf/pyct/static_analysis/annos.py
rename to tensorflow/contrib/autograph/pyct/static_analysis/annos.py
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/live_values.py b/tensorflow/contrib/autograph/pyct/static_analysis/live_values.py
similarity index 88%
rename from tensorflow/contrib/py2tf/pyct/static_analysis/live_values.py
rename to tensorflow/contrib/autograph/pyct/static_analysis/live_values.py
index 0388be5d252389f2f3516c8b27828905d6475589..53ae15459097baff918432a493edd7360ebf209d 100644
--- a/tensorflow/contrib/py2tf/pyct/static_analysis/live_values.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/live_values.py
@@ -25,9 +25,9 @@ from __future__ import print_function
 
 import gast
 
-from tensorflow.contrib.py2tf.pyct import anno
-from tensorflow.contrib.py2tf.pyct import transformer
-from tensorflow.contrib.py2tf.pyct.static_analysis.annos import NodeAnno
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import transformer
+from tensorflow.contrib.autograph.pyct.static_analysis.annos import NodeAnno
 
 
 class LiveValueResolver(transformer.Base):
@@ -55,11 +55,19 @@ class LiveValueResolver(transformer.Base):
       if not symbol_is_local and not symbol_is_param:
         if node.id in self.literals:
           anno.setanno(node, 'live_val', self.literals[node.id])
-          # TODO(mdan): Could live values have FQNs? i.e. 'a'.join()
         elif node.id in self.context.namespace:
           obj = self.context.namespace[node.id]
           anno.setanno(node, 'live_val', obj)
-          anno.setanno(node, 'fqn', (obj.__name__,))
+          if hasattr(obj, '__name__'):
+            anno.setanno(node, 'fqn', (obj.__name__,))
+          elif hasattr(obj, '__class__'):
+            obj_class = obj.__class__
+            anno.setanno(node, 'fqn',
+                         (obj_class.__module__, obj_class.__name__))
+          else:
+            # If the symbol value is for example a primitive, then it will not
+            # have a name.
+            pass
         else:
           pass
           # TODO(mdan): Should we raise an error here?
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/live_values_test.py b/tensorflow/contrib/autograph/pyct/static_analysis/live_values_test.py
similarity index 78%
rename from tensorflow/contrib/py2tf/pyct/static_analysis/live_values_test.py
rename to tensorflow/contrib/autograph/pyct/static_analysis/live_values_test.py
index c133a455b3dd328689102634c6076f366212ac25..69e428bde109ed43c3cdda1a94970a832dc47852 100644
--- a/tensorflow/contrib/py2tf/pyct/static_analysis/live_values_test.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/live_values_test.py
@@ -18,13 +18,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.pyct import anno
-from tensorflow.contrib.py2tf.pyct import context
-from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.contrib.py2tf.pyct import qual_names
-from tensorflow.contrib.py2tf.pyct.static_analysis import activity
-from tensorflow.contrib.py2tf.pyct.static_analysis import live_values
-from tensorflow.contrib.py2tf.pyct.static_analysis import type_info
+import six
+
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import context
+from tensorflow.contrib.autograph.pyct import parser
+from tensorflow.contrib.autograph.pyct import qual_names
+from tensorflow.contrib.autograph.pyct.static_analysis import activity
+from tensorflow.contrib.autograph.pyct.static_analysis import live_values
+from tensorflow.contrib.autograph.pyct.static_analysis import type_info
 from tensorflow.python.framework import constant_op
 from tensorflow.python.platform import test
 
@@ -57,13 +59,30 @@ class LiveValuesResolverTest(test.TestCase):
 
   def test_literals(self):
 
+    a = None
+
     def test_fn():
-      return Foo  # pylint: disable=undefined-variable
+      return a
 
-    node = self._parse_and_analyze(test_fn, {}, {'Foo': 'bar'})
+    node = self._parse_and_analyze(test_fn, {}, literals={'a': 'bar'})
     retval_node = node.body[0].body[0].value
     self.assertEquals('bar', anno.getanno(retval_node, 'live_val'))
 
+  def test_primitive_values(self):
+
+    a = None
+
+    def test_fn():
+      return a
+
+    node = self._parse_and_analyze(test_fn, {'a': True})
+    retval_node = node.body[0].body[0].value
+    if six.PY2:
+      self.assertEqual(
+          anno.getanno(retval_node, 'fqn'), ('__builtin__', 'bool'))
+    else:
+      self.assertEqual(anno.getanno(retval_node, 'fqn'), ('builtins', 'bool'))
+
   def test_namespace(self):
 
     def foo():
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/type_info.py b/tensorflow/contrib/autograph/pyct/static_analysis/type_info.py
similarity index 93%
rename from tensorflow/contrib/py2tf/pyct/static_analysis/type_info.py
rename to tensorflow/contrib/autograph/pyct/static_analysis/type_info.py
index 5556a58c025da695bcef10352c597c7c8dd612d9..203aa3c3d18ab15300bbf424adeece6e74d9c994 100644
--- a/tensorflow/contrib/py2tf/pyct/static_analysis/type_info.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/type_info.py
@@ -43,8 +43,8 @@ from __future__ import print_function
 
 import gast
 
-from tensorflow.contrib.py2tf.pyct import anno
-from tensorflow.contrib.py2tf.pyct import transformer
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import transformer
 from tensorflow.python.util import tf_inspect
 
 
@@ -168,6 +168,15 @@ class TypeInfoResolver(transformer.Base):
                      anno.getanno(definition, 'element_type'))
     return node
 
+  def _process_tuple_assignment(self, source, t):
+    for i, e in enumerate(t.elts):
+      if isinstance(e, gast.Tuple):
+        self._process_tuple_assignment(source, e)
+      else:
+        self.scope.setval(
+            anno.getanno(e, anno.Basic.QN),
+            gast.Subscript(source, gast.Index(i), ctx=gast.Store()))
+
   def _process_variable_assignment(self, source, targets):
     if isinstance(source, gast.Call):
       func = source.func
@@ -183,10 +192,9 @@ class TypeInfoResolver(transformer.Base):
 
     for t in targets:
       if isinstance(t, gast.Tuple):
-        for i, e in enumerate(t.elts):
-          self.scope.setval(
-              anno.getanno(e, anno.Basic.QN),
-              gast.Subscript(source, gast.Index(i), ctx=gast.Store()))
+        # need to recurse on the case of assigning nested tuples,
+        # ex. a, (b, c) = f()
+        self._process_tuple_assignment(source, t)
       elif isinstance(t, (gast.Name, gast.Attribute)):
         self.scope.setval(anno.getanno(t, anno.Basic.QN), source)
       else:
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/type_info_test.py b/tensorflow/contrib/autograph/pyct/static_analysis/type_info_test.py
similarity index 86%
rename from tensorflow/contrib/py2tf/pyct/static_analysis/type_info_test.py
rename to tensorflow/contrib/autograph/pyct/static_analysis/type_info_test.py
index 0d9d5a85f055b170ea6e493e8ac185f1298ebf3c..c0de4a604301b6e9f80ee83e4797b9ac7e558a48 100644
--- a/tensorflow/contrib/py2tf/pyct/static_analysis/type_info_test.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/type_info_test.py
@@ -18,14 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf import utils
-from tensorflow.contrib.py2tf.pyct import anno
-from tensorflow.contrib.py2tf.pyct import context
-from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.contrib.py2tf.pyct import qual_names
-from tensorflow.contrib.py2tf.pyct.static_analysis import activity
-from tensorflow.contrib.py2tf.pyct.static_analysis import live_values
-from tensorflow.contrib.py2tf.pyct.static_analysis import type_info
+from tensorflow.contrib.autograph import utils
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import context
+from tensorflow.contrib.autograph.pyct import parser
+from tensorflow.contrib.autograph.pyct import qual_names
+from tensorflow.contrib.autograph.pyct.static_analysis import activity
+from tensorflow.contrib.autograph.pyct.static_analysis import live_values
+from tensorflow.contrib.autograph.pyct.static_analysis import type_info
 from tensorflow.python.client import session
 from tensorflow.python.platform import test
 from tensorflow.python.training import training
@@ -196,6 +196,23 @@ class TypeInfoResolverTest(test.TestCase):
     f_ref = node.body[0].body[1].value
     self.assertEqual(anno.getanno(f_ref, 'element_type'), Foo)
 
+  def test_nested_assignment(self):
+
+    def test_fn(foo):
+      a, (b, c) = foo
+      return a, b, c
+
+    node = self._parse_and_analyze(test_fn, {'foo': (1, 2, 3)})
+    lhs = node.body[0].body[1].value.elts
+    a = lhs[0]
+    b = lhs[1]
+    c = lhs[2]
+    # TODO(mdan): change these once we have the live values propagating
+    # correctly
+    self.assertFalse(anno.hasanno(a, 'live_val'))
+    self.assertFalse(anno.hasanno(b, 'live_val'))
+    self.assertFalse(anno.hasanno(c, 'live_val'))
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/py2tf/pyct/templates.py b/tensorflow/contrib/autograph/pyct/templates.py
similarity index 65%
rename from tensorflow/contrib/py2tf/pyct/templates.py
rename to tensorflow/contrib/autograph/pyct/templates.py
index cdd71dc56de33cde46d6115085350a321093d792..baf7923fff7c786c1abd05e11fa6ffdb8c8f0912 100644
--- a/tensorflow/contrib/py2tf/pyct/templates.py
+++ b/tensorflow/contrib/autograph/pyct/templates.py
@@ -26,9 +26,9 @@ import textwrap
 
 import gast
 
-from tensorflow.contrib.py2tf.pyct import ast_util
-from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.contrib.py2tf.pyct import qual_names
+from tensorflow.contrib.autograph.pyct import ast_util
+from tensorflow.contrib.autograph.pyct import parser
+from tensorflow.contrib.autograph.pyct import qual_names
 
 
 class ReplaceTransformer(gast.NodeTransformer):
@@ -44,8 +44,6 @@ class ReplaceTransformer(gast.NodeTransformer):
     self.replacements = replacements
     self.in_replacements = False
 
-  # TODO(mdan): Make a more detailed pass and clean up if needed.
-
   def visit_Expr(self, node):
     if (isinstance(node.value, gast.Name) and
         node.value.id in self.replacements):
@@ -53,17 +51,66 @@ class ReplaceTransformer(gast.NodeTransformer):
     self.generic_visit(node)
     return node
 
+  def visit_keyword(self, node):
+    if node.arg in self.replacements:
+      repl = self.replacements[node.arg]
+      if isinstance(repl, gast.keyword):
+        return repl
+      elif (isinstance(repl, (list, tuple)) and repl and
+            all(isinstance(r, gast.keyword) for r in repl)):
+        return repl
+      # TODO(mdan): We may allow replacing with a string as well.
+      # For example, if one wanted to replace foo with bar in foo=baz, then
+      # we could allow changing just node arg, so that we end up with bar=baz.
+      raise ValueError(
+          'a keyword argument may only be replaced by another keyword or a '
+          'non-empty list of keywords. Found: %s' % repl)
+    return self.generic_visit(node)
+
   def visit_FunctionDef(self, node):
     node = self.generic_visit(node)
     if node.name in self.replacements:
       repl = self.replacements[node.name]
       if not isinstance(repl, (gast.Name, ast.Name)):
         raise ValueError(
-            'A function name can only be replaced by a Name node. Found: %s' %
+            'a function name can only be replaced by a Name node. Found: %s' %
             repl)
       node.name = repl.id
     return node
 
+  def _check_has_context(self, node):
+    if not node.ctx:
+      raise ValueError('node %s is missing ctx value' % node)
+
+  def _check_inner_children_have_context(self, node):
+    if isinstance(node, gast.Attribute):
+      self._check_inner_children_have_context(node.value)
+      self._check_has_context(node)
+    elif isinstance(node, gast.Tuple):
+      for e in node.elts:
+        self._check_inner_children_have_context(e)
+      self._check_has_context(node)
+    elif isinstance(node, gast.Dict):
+      for e in node.keys:
+        self._check_inner_children_have_context(e)
+      for e in node.values:
+        self._check_inner_children_have_context(e)
+    elif isinstance(node, gast.Subscript):
+      self._check_inner_children_have_context(node.value)
+      self._check_inner_children_have_context(node.slice)
+    elif isinstance(node, gast.Slice):
+      self._check_inner_children_have_context(node.lower)
+      if node.upper:
+        self._check_inner_children_have_context(node.upper)
+      if node.step:
+        self._check_inner_children_have_context(node.step)
+    elif isinstance(node, gast.Name):
+      self._check_has_context(node)
+    elif isinstance(node, (gast.Str, gast.Num)):
+      pass
+    else:
+      raise ValueError('unexpected node type "%s"' % node)
+
   def _set_inner_child_context(self, node, ctx):
     if isinstance(node, gast.Attribute):
       self._set_inner_child_context(node.value, ctx)
@@ -74,6 +121,24 @@ class ReplaceTransformer(gast.NodeTransformer):
       node.ctx = ctx
     elif isinstance(node, gast.Name):
       node.ctx = ctx
+    elif isinstance(node, gast.Call):
+      self._set_inner_child_context(node.func, ctx)
+      # We may be able to override these to Load(), but for now it's simpler
+      # to just assert that they're set.
+      for a in node.args:
+        self._check_inner_children_have_context(a)
+      for k in node.keywords:
+        self._check_inner_children_have_context(k.value)
+    elif isinstance(node, gast.Dict):
+      # We may be able to override these to Load(), but for now it's simpler
+      # to just assert that they're set.
+      for e in node.keys:
+        self._check_inner_children_have_context(e)
+      for e in node.values:
+        self._check_inner_children_have_context(e)
+    elif isinstance(node, gast.Subscript):
+      self._set_inner_child_context(node.value, ctx)
+      self._check_inner_children_have_context(node.slice)
     elif isinstance(node, (gast.Str, gast.Num)):
       pass
     else:
diff --git a/tensorflow/contrib/py2tf/pyct/templates_test.py b/tensorflow/contrib/autograph/pyct/templates_test.py
similarity index 70%
rename from tensorflow/contrib/py2tf/pyct/templates_test.py
rename to tensorflow/contrib/autograph/pyct/templates_test.py
index d7835b80a7f53c3ba012d01cac34b68c57bfe348..a01f8bf04c4faa6ec1779e0fb306155d99f5bd09 100644
--- a/tensorflow/contrib/py2tf/pyct/templates_test.py
+++ b/tensorflow/contrib/autograph/pyct/templates_test.py
@@ -22,8 +22,9 @@ import imp
 
 import gast
 
-from tensorflow.contrib.py2tf.pyct import compiler
-from tensorflow.contrib.py2tf.pyct import templates
+from tensorflow.contrib.autograph.pyct import compiler
+from tensorflow.contrib.autograph.pyct import parser
+from tensorflow.contrib.autograph.pyct import templates
 from tensorflow.python.platform import test
 
 
@@ -96,6 +97,50 @@ class TemplatesTest(test.TestCase):
     with self.assertRaises(ValueError):
       templates.replace(template, foo=1)
 
+  def test_replace_call_keyword(self):
+    template = """
+      def test_fn():
+        def f(a, d, f):
+          return a + d + f
+        return f(1, kws=None)
+    """
+
+    source = parser.parse_expression('f(d=3, f=5)')
+    node = templates.replace(template, kws=source.keywords)[0]
+    result, _ = compiler.ast_to_object(node)
+    self.assertEquals(9, result.test_fn())
+
+    with self.assertRaises(ValueError):
+      templates.replace(template, kws=[])
+      templates.replace(template, kws=1)
+
+  def test_replace_name_with_call(self):
+    template = """
+      def test_fn():
+        b = 5
+        def g(a):
+          return 3 * a
+        def f():
+          return g
+        return foo
+    """
+
+    source = parser.parse_expression('f()(b)')
+    node = templates.replace(template, foo=source)[0]
+    result, _ = compiler.ast_to_object(node)
+    self.assertEquals(15, result.test_fn())
+
+  def test_replace_name_with_dict(self):
+    template = """
+      def test_fn():
+        return foo['bar']
+    """
+
+    source = parser.parse_expression('{\'bar\': 3}')
+    node = templates.replace(template, foo=source)[0]
+    result, _ = compiler.ast_to_object(node)
+    self.assertEquals(3, result.test_fn())
+
   def replace_as_expression(self):
     template = """
       foo(a)
diff --git a/tensorflow/contrib/py2tf/pyct/transformer.py b/tensorflow/contrib/autograph/pyct/transformer.py
similarity index 77%
rename from tensorflow/contrib/py2tf/pyct/transformer.py
rename to tensorflow/contrib/autograph/pyct/transformer.py
index 57016bb4ce84776dfc8dfbe380322a03eb4b37b8..35f114b6e11901a854c1d631061ae42285c0e261 100644
--- a/tensorflow/contrib/py2tf/pyct/transformer.py
+++ b/tensorflow/contrib/autograph/pyct/transformer.py
@@ -23,14 +23,22 @@ import sys
 import gast
 import six
 
-from tensorflow.contrib.py2tf.pyct import anno
-from tensorflow.contrib.py2tf.pyct import pretty_printer
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import compiler
+from tensorflow.contrib.autograph.pyct import pretty_printer
 
 
-class PyFlowParseError(SyntaxError):
+class AutographParseError(SyntaxError):
   pass
 
 
+def try_ast_to_source(node):
+  try:
+    return compiler.ast_to_source(node)
+  except AssertionError:
+    return '<could not convert AST to source>'
+
+
 class Base(gast.NodeTransformer):
   """Base class for specialized transformers."""
 
@@ -62,14 +70,15 @@ class Base(gast.NodeTransformer):
       return super(Base, self).visit(node)
     except (ValueError, AttributeError, KeyError, NotImplementedError,
             AssertionError) as e:
-      msg = '%s: %s\nOccurred at node:\n%s' % (
-          e.__class__.__name__, str(e), pretty_printer.fmt(node, color=False))
+      msg = '%s: %s\nOffending source:\n%s\n\nOccurred at node:\n%s' % (
+          e.__class__.__name__, str(e), try_ast_to_source(node),
+          pretty_printer.fmt(node, color=False))
       if source_code:
         line = source_code.splitlines()[self._lineno - 1]
       else:
         line = '<no source available>'
-      six.reraise(PyFlowParseError,
-                  PyFlowParseError(
+      six.reraise(AutographParseError,
+                  AutographParseError(
                       msg,
                       (source_file, self._lineno, self._col_offset + 1, line)),
                   sys.exc_info()[2])
diff --git a/tensorflow/contrib/py2tf/utils/BUILD b/tensorflow/contrib/autograph/utils/BUILD
similarity index 95%
rename from tensorflow/contrib/py2tf/utils/BUILD
rename to tensorflow/contrib/autograph/utils/BUILD
index 8bc338e801aa283967f4f6e6a659df9683cbc154..d3a1b9468892531cbc51bc13de66ef595f1a95f8 100644
--- a/tensorflow/contrib/py2tf/utils/BUILD
+++ b/tensorflow/contrib/autograph/utils/BUILD
@@ -35,6 +35,7 @@ py_library(
     deps = [
         "//tensorflow/python:list_ops",
         "//tensorflow/python:script_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
         "@six_archive//:six",
     ],
 )
@@ -43,6 +44,7 @@ py_test(
     name = "builtins_test",
     srcs = ["builtins_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":utils",
         "//tensorflow/python:client_testlib",
@@ -83,7 +85,7 @@ py_test(
     name = "py_func_test",
     srcs = ["py_func_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
+    tags = ["no_windows"],
     deps = [
         ":utils",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/autograph/utils/__init__.py b/tensorflow/contrib/autograph/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..22898b17e98bb004b4d2aa529b58cc99fc64dbb2
--- /dev/null
+++ b/tensorflow/contrib/autograph/utils/__init__.py
@@ -0,0 +1,36 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utility module that contains APIs usable in the generated code."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.autograph.utils.builtins import dynamic_builtin
+from tensorflow.contrib.autograph.utils.builtins import dynamic_dataset
+from tensorflow.contrib.autograph.utils.builtins import dynamic_for_cond
+from tensorflow.contrib.autograph.utils.builtins import dynamic_print
+from tensorflow.contrib.autograph.utils.builtins import dynamic_range
+from tensorflow.contrib.autograph.utils.context_managers import control_dependency_on_returns
+from tensorflow.contrib.autograph.utils.misc import alias_tensors
+from tensorflow.contrib.autograph.utils.multiple_dispatch import dynamic_is
+from tensorflow.contrib.autograph.utils.multiple_dispatch import dynamic_is_not
+from tensorflow.contrib.autograph.utils.multiple_dispatch import run_cond
+from tensorflow.contrib.autograph.utils.multiple_dispatch import run_while
+from tensorflow.contrib.autograph.utils.py_func import wrap_py_func
+from tensorflow.contrib.autograph.utils.tensor_list import dynamic_list_append
+from tensorflow.contrib.autograph.utils.testing import fake_tf
+from tensorflow.contrib.autograph.utils.type_check import is_tensor
+from tensorflow.contrib.autograph.utils.type_hints import set_element_type
diff --git a/tensorflow/contrib/py2tf/utils/builtins.py b/tensorflow/contrib/autograph/utils/builtins.py
similarity index 50%
rename from tensorflow/contrib/py2tf/utils/builtins.py
rename to tensorflow/contrib/autograph/utils/builtins.py
index 3cb62b55d4d23545af4d641ecab1663ee7f7b876..c6af0e4d13b8d15bebf857ff7e1129149490ee7a 100644
--- a/tensorflow/contrib/py2tf/utils/builtins.py
+++ b/tensorflow/contrib/autograph/utils/builtins.py
@@ -18,12 +18,16 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import sys
+
 import six
 
-from tensorflow.contrib.py2tf.utils import py_func
-from tensorflow.contrib.py2tf.utils import type_check
+from tensorflow.contrib.autograph.utils import py_func
+from tensorflow.contrib.autograph.utils import type_check
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import logging_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.util import tf_inspect
@@ -54,7 +58,6 @@ def dynamic_len(list_or_tensor):
       raise ValueError(
           'len requires non-zero rank for tensor "%s"' % list_or_tensor)
     return array_ops.shape(list_or_tensor)[0]
-
   return len(list_or_tensor)
 
 
@@ -96,4 +99,76 @@ def dynamic_print(*values):
 
   if all(map(is_tf_print_compatible, values)):
     return logging_ops.Print(1, values)
-  return py_func.wrap_py_func(print, None, values, use_dummy_return=True)
+
+  def flushed_print(*vals):
+    print(*vals)
+    sys.stdout.flush()
+
+  return py_func.wrap_py_func(
+      flushed_print, None, values, use_dummy_return=True)
+
+
+def dynamic_dataset(iterated):
+  """Implementartion of smart tf.data.Dataset epoch wrapping.
+
+  The function checks if the input is a tf.data.Dataset and if so then wraps it
+  so that for each element it returns it also returns the current epoch the
+  dataset iteration is in, for two epochs.  If the input is not a
+  tf.data.Dataset then it just returns the input.
+
+  Args:
+    iterated: The iterable or tf.data.Dataset that is being iterated over.
+  Returns:
+    Either just the untouched input, or in the case of input being a
+    tf.data.Dataset then it returns a wrapped  tf.data.Dataset where for each
+    element it returns it also returns the current epoch the dataset iteration
+    is in.
+  """
+  if not isinstance(iterated, dataset_ops.Dataset):
+    return iterated
+
+  def epoch_dataset_number_helper(i):
+    return dataset_ops.Dataset.zip(
+        (dataset_ops.Dataset.from_tensors(i).repeat(), iterated))
+
+  epoch_numbers = dataset_ops.Dataset.range(2)
+  return epoch_numbers.flat_map(epoch_dataset_number_helper)
+
+
+def dynamic_for_cond(iteration, iterated):
+  """Implementartion of smart while-loop condition using dynamic dispatch.
+
+  The function checks if it is iterating over a tf.data.Dataset or not, and in
+  the case it is not then it simply returns if we are still in range of the
+  iterated and the next element.  If it is iterating over a dataset then it only
+  iterates for a single epoch.
+
+  Args:
+    iteration: The current iteration of the loop.
+    iterated: The iterable or tf.data.Dataset that is being iterated over.
+  Returns:
+    A tuple of a bool that indicates whether the loop should continue, and the
+    next element in iterated.
+  """
+  # TODO(znado): Clean up.
+  # TODO(znado): This won't work for unpacked iterates. Fix.
+  if isinstance(iterated, dataset_ops.Dataset):
+    curr_epoch, next_elem = iterated.make_one_shot_iterator().get_next()
+    return math_ops.less(curr_epoch, 1), next_elem
+  elif tensor_util.is_tensor(iterated):
+    if iterated.shape.ndims > 1:
+      elem_shape = array_ops.shape(iterated)[1:]
+    else:
+      elem_shape = ()
+    if iterated.shape.ndims == 0 or iterated.shape[0] == 0:
+      return False, array_ops.zeros(elem_shape, iterated.dtype)
+    return control_flow_ops.cond(
+        math_ops.less(iteration, dynamic_len(iterated)),
+        lambda: (True, iterated[iteration]),
+        lambda: (False, array_ops.zeros(elem_shape, iterated.dtype)))
+  elif hasattr(iterated, '__len__'):
+    if iteration < len(iterated):
+      return True, iterated[iteration]
+    return False, None
+  else:
+    raise NotImplementedError('Python iterators not yet supported.')
diff --git a/tensorflow/contrib/py2tf/utils/builtins_test.py b/tensorflow/contrib/autograph/utils/builtins_test.py
similarity index 98%
rename from tensorflow/contrib/py2tf/utils/builtins_test.py
rename to tensorflow/contrib/autograph/utils/builtins_test.py
index 59b3573d38c5bd98f416c7b77d1bc772cb8069dd..d9f7913d89a5471c76eb7ae484674bd7a1853ac9 100644
--- a/tensorflow/contrib/py2tf/utils/builtins_test.py
+++ b/tensorflow/contrib/autograph/utils/builtins_test.py
@@ -22,7 +22,7 @@ import sys
 
 import six
 
-from tensorflow.contrib.py2tf.utils import builtins
+from tensorflow.contrib.autograph.utils import builtins
 from tensorflow.python.framework import constant_op
 from tensorflow.python.platform import test
 
diff --git a/tensorflow/contrib/py2tf/utils/context_managers.py b/tensorflow/contrib/autograph/utils/context_managers.py
similarity index 100%
rename from tensorflow/contrib/py2tf/utils/context_managers.py
rename to tensorflow/contrib/autograph/utils/context_managers.py
diff --git a/tensorflow/contrib/py2tf/utils/context_managers_test.py b/tensorflow/contrib/autograph/utils/context_managers_test.py
similarity index 96%
rename from tensorflow/contrib/py2tf/utils/context_managers_test.py
rename to tensorflow/contrib/autograph/utils/context_managers_test.py
index 404f6e44e59d8bd6131367e3234843f03b351910..42e27724b9856f715b524cdd7539897851715638 100644
--- a/tensorflow/contrib/py2tf/utils/context_managers_test.py
+++ b/tensorflow/contrib/autograph/utils/context_managers_test.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.utils import context_managers
+from tensorflow.contrib.autograph.utils import context_managers
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import tensor_array_ops
diff --git a/tensorflow/contrib/py2tf/utils/misc.py b/tensorflow/contrib/autograph/utils/misc.py
similarity index 100%
rename from tensorflow/contrib/py2tf/utils/misc.py
rename to tensorflow/contrib/autograph/utils/misc.py
diff --git a/tensorflow/contrib/py2tf/utils/misc_test.py b/tensorflow/contrib/autograph/utils/misc_test.py
similarity index 96%
rename from tensorflow/contrib/py2tf/utils/misc_test.py
rename to tensorflow/contrib/autograph/utils/misc_test.py
index 8aedd4cd64798660cc07364c45487399986c9be6..71e358c33e1ea9887d267c67bc80362bac26c3a6 100644
--- a/tensorflow/contrib/py2tf/utils/misc_test.py
+++ b/tensorflow/contrib/autograph/utils/misc_test.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.utils.misc import alias_tensors
+from tensorflow.contrib.autograph.utils.misc import alias_tensors
 from tensorflow.python.framework.constant_op import constant
 from tensorflow.python.ops.variables import Variable
 from tensorflow.python.platform import test
diff --git a/tensorflow/contrib/py2tf/utils/multiple_dispatch.py b/tensorflow/contrib/autograph/utils/multiple_dispatch.py
similarity index 82%
rename from tensorflow/contrib/py2tf/utils/multiple_dispatch.py
rename to tensorflow/contrib/autograph/utils/multiple_dispatch.py
index da7a942703d83b55edbd1607cb49ad4137daeb04..47049255f31113a0c7b2f5a1269593afdbbc9b19 100644
--- a/tensorflow/contrib/py2tf/utils/multiple_dispatch.py
+++ b/tensorflow/contrib/autograph/utils/multiple_dispatch.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Utilities for type-dependent behavior used in py2tf-generated code."""
+"""Utilities for type-dependent behavior used in autograph-generated code."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -20,23 +20,18 @@ from __future__ import print_function
 
 import six
 
-from tensorflow.contrib.py2tf.utils.type_check import is_tensor
+from tensorflow.contrib.autograph.utils.type_check import is_tensor
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
 
 
 def dynamic_is(left, right):
-  if is_tensor(left, right):
-    return math_ops.equal(left.name, right.name)
-  else:
-    return left is right
+  # TODO(alexbw) if we're sure we should leave 'is' in place,
+  # then change the semantics in converters/logical_expressions.py
+  return left is right
 
 
 def dynamic_is_not(left, right):
-  if is_tensor(left, right):
-    return math_ops.not_equal(left.name, right.name)
-  else:
-    return left is not right
+  return left is not right
 
 
 def run_cond(condition, true_fn, false_fn):
@@ -60,10 +55,17 @@ def run_cond(condition, true_fn, false_fn):
 
 
 def py_cond(condition, true_fn, false_fn):
+  """Functional version of Python's conditional."""
   if condition:
-    return true_fn()
+    results = true_fn()
   else:
-    return false_fn()
+    results = false_fn()
+
+  # The contract for the branch functions is to return tuples, but they should
+  # be collapsed to a single element when there is only one output.
+  if len(results) == 1:
+    return results[0]
+  return results
 
 
 def run_while(cond_fn, body_fn, init_args):
diff --git a/tensorflow/contrib/py2tf/utils/multiple_dispatch_test.py b/tensorflow/contrib/autograph/utils/multiple_dispatch_test.py
similarity index 86%
rename from tensorflow/contrib/py2tf/utils/multiple_dispatch_test.py
rename to tensorflow/contrib/autograph/utils/multiple_dispatch_test.py
index 8d89b6898a366fe90ee1d43a55d0a7f10690224b..e6a41bb4166e8cfc8c703685f56eb90a1b5f63b4 100644
--- a/tensorflow/contrib/py2tf/utils/multiple_dispatch_test.py
+++ b/tensorflow/contrib/autograph/utils/multiple_dispatch_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.py2tf.utils import multiple_dispatch
+from tensorflow.contrib.autograph.utils import multiple_dispatch
 from tensorflow.python.client.session import Session
 from tensorflow.python.framework.constant_op import constant
 from tensorflow.python.platform import test
@@ -50,26 +50,25 @@ class MultipleDispatchTest(test.TestCase):
       should_be_false1 = multiple_dispatch.dynamic_is_not(a, also_a)
       should_be_true2 = multiple_dispatch.dynamic_is_not(a, not_actually_a)
       should_be_false2 = multiple_dispatch.dynamic_is(a, not_actually_a)
-      self.assertTrue(should_be_true1.eval())
-      self.assertTrue(should_be_true2.eval())
-      self.assertFalse(should_be_false1.eval())
-      self.assertFalse(should_be_false2.eval())
+      self.assertTrue(should_be_true1)
+      self.assertTrue(should_be_true2)
+      self.assertFalse(should_be_false1)
+      self.assertFalse(should_be_false2)
 
   def test_run_cond_python(self):
-    true_fn = lambda: 2.0
-    false_fn = lambda: 3.0
-    self.assertEqual(multiple_dispatch.run_cond(True, true_fn, false_fn), 2.0)
-    self.assertEqual(multiple_dispatch.run_cond(False, true_fn, false_fn), 3.0)
+    true_fn = lambda: (2,)
+    false_fn = lambda: (3,)
+    self.assertEqual(multiple_dispatch.run_cond(True, true_fn, false_fn), 2)
+    self.assertEqual(multiple_dispatch.run_cond(False, true_fn, false_fn), 3)
 
   def test_run_cond_tf(self):
-
-    true_fn = lambda: constant([2.0])
-    false_fn = lambda: constant([3.0])
+    true_fn = lambda: (constant(2),)
+    false_fn = lambda: (constant(3),)
     with Session() as sess:
       out = multiple_dispatch.run_cond(constant(True), true_fn, false_fn)
-      self.assertEqual(sess.run(out), 2.0)
+      self.assertEqual(sess.run(out), 2)
       out = multiple_dispatch.run_cond(constant(False), true_fn, false_fn)
-      self.assertEqual(sess.run(out), 3.0)
+      self.assertEqual(sess.run(out), 3)
 
   def test_run_while_python(self):
     cond_fn = lambda x, t, s: x > t
diff --git a/tensorflow/contrib/autograph/utils/py_func.py b/tensorflow/contrib/autograph/utils/py_func.py
new file mode 100644
index 0000000000000000000000000000000000000000..11ebfb2e49f0e762b56ae2cde2b76d2e24032d72
--- /dev/null
+++ b/tensorflow/contrib/autograph/utils/py_func.py
@@ -0,0 +1,131 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Pyfunc creation utilities."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from collections import namedtuple
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import script_ops
+
+
+class MatchDType(namedtuple('MatchDType', ('arg_number',))):
+  """Allows matching the dtype of an argument.
+
+  Used in conjunction with function calls. For example, MatchDType(0) will
+  match the DType of the first argument.
+  """
+
+  pass
+
+
+def wrap_py_func(f, return_dtypes, args, kwargs=None, use_dummy_return=False):
+  """Helper that wraps a callable to py_func.
+
+  The helper passes tensor arguments through the py_func interface. Non-tensor
+  arguments are allowed, and will be passed to f directly. Note that non-tensor
+  arguments are captured by f will not update every time the wrapper is
+  called (this is consistent with its argument list, which only includes
+  the tensor arguments). In general, it's safest not to reuse this wrapper.
+
+  Args:
+    f: Callable
+    return_dtypes: None, individual of tuple/list of DType or MatchDType, the
+        data type for each of f's return value(s). Set to None if f has no
+        return values or use_dummy_return is True. Use MatchDType to define a
+        dtype identical to that of `i`th argument (argument 0 is the first);
+        an argument must of Tensor type if it is to be used with MatchDType.
+    args: Positional arguments for f, as list or tuple.
+    kwargs: Keyword arguments for f, as dict with string keys. May be None.
+    use_dummy_return: If True, the function will return a dummy value of 1
+        and discard its actual return value.
+  Returns:
+    The return values of f converted to tensor.
+  Raises:
+    ValueError: if any of the arguments are incorrect.
+  """
+
+  if return_dtypes and use_dummy_return:
+    raise ValueError('if use_dummy_return is True, return_dtypes must be empty')
+
+  tensor_args = []
+  tensor_args_idx = {}
+
+  # Of the positional arguments, only grab the tensor ones to be passed through
+  # the py_func.
+  n_args = len(args)
+  arg_is_tensor = tuple(map(tensor_util.is_tensor, args))
+  for i in range(n_args):
+    if arg_is_tensor[i]:
+      tensor_args_idx[i] = len(tensor_args)
+      tensor_args.append(args[i])
+
+  # We essentially take the tensor kwargs, if any, and add them to the list of
+  # positional arguments. The kwargs are then reconstructed inside the py_func.
+  #
+  # For example, if
+  #
+  #     args = [Tensor(1), 'foo']
+  #     kwargs = {'a': Tensor(2), 'b': 'bar'}
+  #
+  # Then
+  #
+  #     tensor_args = (Tensor(1), Tensor(2))
+  #     kwarg_keys = ('a', 'b')
+  if kwargs:
+    kwarg_keys = tuple(kwargs.keys())
+    kwarg_is_tensor = {k: tensor_util.is_tensor(kwargs[k]) for k in kwarg_keys}
+    for k in kwarg_keys:
+      if kwarg_is_tensor[k]:
+        tensor_args_idx[k] = len(tensor_args)
+        tensor_args.append(kwargs[k])
+  else:
+    kwarg_keys = ()
+
+  # Set up return dtypes.
+  def match_arg_dtype(arg_number):
+    arg = args[arg_number]
+    if not arg_is_tensor[arg_number]:
+      raise ValueError(
+          'argument %d was used with MatchDType and must be a tf.Tensor, but '
+          'was %s instead' % (arg_number, type(arg)))
+    return arg.dtype
+
+  if return_dtypes:
+    if isinstance(return_dtypes, MatchDType):
+      return_dtypes = match_arg_dtype(return_dtypes.arg_number)
+    elif isinstance(return_dtypes, (list, tuple)):
+      return_dtypes = tuple(
+          match_arg_dtype(a.arg_number) if isinstance(a, MatchDType) else a
+          for a in return_dtypes)
+    else:
+      assert isinstance(return_dtypes, dtypes.DType)
+
+  def f_wrapper(*tensor_args):
+    f_args = tuple(tensor_args[tensor_args_idx[i]] if arg_is_tensor[i] else a
+                   for i, a in enumerate(args))
+    f_kwargs = {
+        k: tensor_args[tensor_args_idx[k]] if kwarg_is_tensor[k] else kwargs[k]
+        for i, k in enumerate(kwarg_keys)
+    }
+    retval = f(*f_args, **f_kwargs)
+    return 1 if use_dummy_return else retval
+
+  return script_ops.py_func(f_wrapper, tensor_args, dtypes.int64
+                            if use_dummy_return else return_dtypes)
diff --git a/tensorflow/contrib/autograph/utils/py_func_test.py b/tensorflow/contrib/autograph/utils/py_func_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2468263142f14332e86db99d198ba0f5c633dc69
--- /dev/null
+++ b/tensorflow/contrib/autograph/utils/py_func_test.py
@@ -0,0 +1,103 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for wrap_py_func module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.autograph.utils import py_func
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.platform import test
+
+
+class PyFuncTest(test.TestCase):
+
+  def test_wrap_py_func_simple(self):
+
+    def test_fn(a, b, c):
+      return a + b + c
+
+    with self.test_session() as sess:
+      result = py_func.wrap_py_func(test_fn, dtypes.int64,
+                                    (1, constant_op.constant(1), 1))
+      self.assertEqual(3, sess.run(result))
+      result = py_func.wrap_py_func(test_fn, dtypes.int64, (1, 1, 1))
+      self.assertEqual(3, sess.run(result))
+      result = py_func.wrap_py_func(
+          test_fn, dtypes.int64,
+          (constant_op.constant(1), 1, constant_op.constant(1)))
+      self.assertEqual(3, sess.run(result))
+
+  def test_wrap_py_func_complex_args(self):
+
+    class TestClass(object):
+
+      def __init__(self):
+        self.foo = 5
+
+    def test_fn(a, b):
+      return a * b.foo
+
+    with self.test_session() as sess:
+      result = py_func.wrap_py_func(test_fn, dtypes.int64, (7, TestClass()))
+      self.assertEqual(35, sess.run(result))
+      result = py_func.wrap_py_func(test_fn, dtypes.int64,
+                                    (constant_op.constant(7), TestClass()))
+      self.assertEqual(35, sess.run(result))
+
+  def test_wrap_py_func_kwargs(self):
+
+    class TestClass(object):
+
+      def __init__(self, foo):
+        self.foo = foo
+
+    def test_fn(a, b, c, d):
+      return a * b.foo + c * d.foo
+
+    with self.test_session() as sess:
+      result = py_func.wrap_py_func(test_fn, dtypes.int64, (7, TestClass(5)), {
+          'c': 11,
+          'd': TestClass(13)
+      })
+      self.assertEqual(178, sess.run(result))
+      result = py_func.wrap_py_func(test_fn, dtypes.int64,
+                                    (constant_op.constant(7), TestClass(5)), {
+                                        'c': constant_op.constant(11),
+                                        'd': TestClass(13)
+                                    })
+      self.assertEqual(178, sess.run(result))
+
+  def test_wrap_py_func_dummy_return(self):
+
+    side_counter = [0]
+
+    def test_fn(_):
+      side_counter[0] += 1
+
+    with self.test_session() as sess:
+      result = py_func.wrap_py_func(test_fn, None, (5,), use_dummy_return=True)
+      self.assertEqual(1, sess.run(result))
+      self.assertEqual([1], side_counter)
+      result = py_func.wrap_py_func(
+          test_fn, None, (constant_op.constant(5),), use_dummy_return=True)
+      self.assertEqual(1, sess.run(result))
+      self.assertEqual([2], side_counter)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/utils/tensor_list.py b/tensorflow/contrib/autograph/utils/tensor_list.py
similarity index 100%
rename from tensorflow/contrib/py2tf/utils/tensor_list.py
rename to tensorflow/contrib/autograph/utils/tensor_list.py
diff --git a/tensorflow/contrib/py2tf/utils/tensor_list_test.py b/tensorflow/contrib/autograph/utils/tensor_list_test.py
similarity index 97%
rename from tensorflow/contrib/py2tf/utils/tensor_list_test.py
rename to tensorflow/contrib/autograph/utils/tensor_list_test.py
index 110e4d105e934d9d752afc2ccc0c53c99b70d41d..d58489eb68b6b949a4276520605c62b7c2825558 100644
--- a/tensorflow/contrib/py2tf/utils/tensor_list_test.py
+++ b/tensorflow/contrib/autograph/utils/tensor_list_test.py
@@ -12,13 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for PyFlow list."""
+"""Tests for Autograph lists."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.utils import tensor_list as tl
+from tensorflow.contrib.autograph.utils import tensor_list as tl
 from tensorflow.python.client.session import Session
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
diff --git a/tensorflow/contrib/py2tf/utils/testing.py b/tensorflow/contrib/autograph/utils/testing.py
similarity index 100%
rename from tensorflow/contrib/py2tf/utils/testing.py
rename to tensorflow/contrib/autograph/utils/testing.py
diff --git a/tensorflow/contrib/py2tf/utils/type_check.py b/tensorflow/contrib/autograph/utils/type_check.py
similarity index 95%
rename from tensorflow/contrib/py2tf/utils/type_check.py
rename to tensorflow/contrib/autograph/utils/type_check.py
index b9b2b451a4e22684a19f0d10fbf5e4fae5d6152b..8748abc47bcfb55b4d0b11178a46816249732da9 100644
--- a/tensorflow/contrib/py2tf/utils/type_check.py
+++ b/tensorflow/contrib/autograph/utils/type_check.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Utilities used in py2tf-generated code."""
+"""Utilities used in autograph-generated code."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/py2tf/utils/type_check_test.py b/tensorflow/contrib/autograph/utils/type_check_test.py
similarity index 96%
rename from tensorflow/contrib/py2tf/utils/type_check_test.py
rename to tensorflow/contrib/autograph/utils/type_check_test.py
index 7d0428e9cccecdc67511e236bc00655a055aea29..3b67b7194c5656b193d47860f93986a985cb1aef 100644
--- a/tensorflow/contrib/py2tf/utils/type_check_test.py
+++ b/tensorflow/contrib/autograph/utils/type_check_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy
 
-from tensorflow.contrib.py2tf.utils import type_check
+from tensorflow.contrib.autograph.utils import type_check
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
diff --git a/tensorflow/contrib/py2tf/utils/type_hints.py b/tensorflow/contrib/autograph/utils/type_hints.py
similarity index 100%
rename from tensorflow/contrib/py2tf/utils/type_hints.py
rename to tensorflow/contrib/autograph/utils/type_hints.py
diff --git a/tensorflow/contrib/batching/BUILD b/tensorflow/contrib/batching/BUILD
index ee67909133fc26ba98355db05a4b90d3dfa6b97b..d65c990c87cbc316472237d183c03765416501e7 100644
--- a/tensorflow/contrib/batching/BUILD
+++ b/tensorflow/contrib/batching/BUILD
@@ -112,14 +112,3 @@ py_test(
         "//tensorflow/python:script_ops",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/batching/test_util/BUILD b/tensorflow/contrib/batching/test_util/BUILD
index 6db627faad1df4a4b73082e74e7754829ff2b514..7cb2d8079bd18660f72eab92654629434ce4d6a5 100644
--- a/tensorflow/contrib/batching/test_util/BUILD
+++ b/tensorflow/contrib/batching/test_util/BUILD
@@ -8,17 +8,6 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
-
 cc_library(
     name = "fake_clock_env",
     testonly = 1,
diff --git a/tensorflow/contrib/batching/util/BUILD b/tensorflow/contrib/batching/util/BUILD
index 2a84a7712a8fa66e89db41ff4e7ebe4f620029ca..8f81b6702f2807d7da7e72190ce2d86b28e52113 100644
--- a/tensorflow/contrib/batching/util/BUILD
+++ b/tensorflow/contrib/batching/util/BUILD
@@ -8,18 +8,6 @@ licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "**/google_*",
-        ],
-    ),
-)
-
 cc_library(
     name = "periodic_function_dynamic",
     hdrs = ["periodic_function.h"],
diff --git a/tensorflow/contrib/bayesflow/BUILD b/tensorflow/contrib/bayesflow/BUILD
index c6feec68e0104ff33451bbb6fa7de51d13e0a43c..5a2d7f6a3c0ba233299a5790fa80488786712f3c 100644
--- a/tensorflow/contrib/bayesflow/BUILD
+++ b/tensorflow/contrib/bayesflow/BUILD
@@ -37,25 +37,6 @@ py_library(
     ],
 )
 
-cuda_py_test(
-    name = "metropolis_hastings_test",
-    size = "large",
-    srcs = ["python/kernel_tests/metropolis_hastings_test.py"],
-    additional_deps = [
-        ":bayesflow_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:random_ops",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
-    ],
-)
-
 cuda_py_test(
     name = "monte_carlo_test",
     size = "small",
@@ -76,37 +57,3 @@ cuda_py_test(
         "//tensorflow/python:random_seed",
     ],
 )
-
-cuda_py_test(
-    name = "hmc_test",
-    size = "large",
-    srcs = ["python/kernel_tests/hmc_test.py"],
-    additional_deps = [
-        ":bayesflow_py",
-        "//third_party/py/numpy",
-        "//tensorflow/contrib/distributions:distributions_py",
-        "//tensorflow/contrib/layers:layers_py",
-        "//tensorflow/python/ops/distributions",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:random_seed",
-    ],
-    tags = ["nomsan"],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/bayesflow/README.md b/tensorflow/contrib/bayesflow/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..10323dc6d59918a9f8cf1840d06dcd219dfe3568
--- /dev/null
+++ b/tensorflow/contrib/bayesflow/README.md
@@ -0,0 +1,17 @@
+# Notice
+
+`tf.contrib.bayesflow` has moved!
+
+See new code at [github.com/tensorflow/probability](
+https://github.com/tensorflow/probability).
+
+Switch imports with:
+
+```python
+# old
+import tensorflow as tf
+tfp = tf.contrib.bayesflow
+
+# new
+import tensorflow_probability as tfp
+```
diff --git a/tensorflow/contrib/bayesflow/__init__.py b/tensorflow/contrib/bayesflow/__init__.py
index f86820382682f79e85e6a92c7f63fa15bb8be1a3..41a8c920fc4e81af90f4c94a149d8c404c58b747 100644
--- a/tensorflow/contrib/bayesflow/__init__.py
+++ b/tensorflow/contrib/bayesflow/__init__.py
@@ -21,8 +21,6 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=unused-import,line-too-long
-from tensorflow.contrib.bayesflow.python.ops import hmc
-from tensorflow.contrib.bayesflow.python.ops import metropolis_hastings
 from tensorflow.contrib.bayesflow.python.ops import monte_carlo
 # pylint: enable=unused-import,line-too-long
 
@@ -30,13 +28,7 @@ from tensorflow.python.util.all_util import remove_undocumented
 
 
 _allowed_symbols = [
-    'entropy',
-    'hmc',
-    'metropolis_hastings',
     'monte_carlo',
-    'special_math',
-    'stochastic_variables',
-    'variational_inference',
 ]
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py
deleted file mode 100644
index dabadfc7b6a3da8786e88d559fe2d05b44599ca0..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py
+++ /dev/null
@@ -1,737 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for Hamiltonian Monte Carlo."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections
-
-import numpy as np
-from scipy import stats
-
-from tensorflow.contrib.bayesflow.python.ops import hmc
-from tensorflow.contrib.bayesflow.python.ops.hmc_impl import _compute_energy_change
-from tensorflow.contrib.bayesflow.python.ops.hmc_impl import _leapfrog_integrator
-
-from tensorflow.contrib.distributions.python.ops import independent as independent_lib
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import random_seed
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gen_linalg_ops
-from tensorflow.python.ops import gradients_impl as gradients_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import random_ops
-from tensorflow.python.ops.distributions import gamma as gamma_lib
-from tensorflow.python.ops.distributions import normal as normal_lib
-from tensorflow.python.platform import test
-from tensorflow.python.platform import tf_logging as logging_ops
-
-
-def _reduce_variance(x, axis=None, keepdims=False):
-  sample_mean = math_ops.reduce_mean(x, axis, keepdims=True)
-  return math_ops.reduce_mean(
-      math_ops.squared_difference(x, sample_mean), axis, keepdims)
-
-
-class HMCTest(test.TestCase):
-
-  def setUp(self):
-    self._shape_param = 5.
-    self._rate_param = 10.
-
-    random_seed.set_random_seed(10003)
-    np.random.seed(10003)
-
-  def assertAllFinite(self, x):
-    self.assertAllEqual(np.ones_like(x).astype(bool), np.isfinite(x))
-
-  def _log_gamma_log_prob(self, x, event_dims=()):
-    """Computes log-pdf of a log-gamma random variable.
-
-    Args:
-      x: Value of the random variable.
-      event_dims: Dimensions not to treat as independent.
-
-    Returns:
-      log_prob: The log-pdf up to a normalizing constant.
-    """
-    return math_ops.reduce_sum(self._shape_param * x -
-                               self._rate_param * math_ops.exp(x),
-                               event_dims)
-
-  def _integrator_conserves_energy(self, x, independent_chain_ndims, sess,
-                                   feed_dict=None):
-    step_size = array_ops.placeholder(np.float32, [], name="step_size")
-    hmc_lf_steps = array_ops.placeholder(np.int32, [], name="hmc_lf_steps")
-
-    if feed_dict is None:
-      feed_dict = {}
-    feed_dict[hmc_lf_steps] = 1000
-
-    event_dims = math_ops.range(independent_chain_ndims,
-                                array_ops.rank(x))
-
-    m = random_ops.random_normal(array_ops.shape(x))
-    log_prob_0 = self._log_gamma_log_prob(x, event_dims)
-    grad_0 = gradients_ops.gradients(log_prob_0, x)
-    old_energy = -log_prob_0 + 0.5 * math_ops.reduce_sum(m**2., event_dims)
-
-    new_m, _, log_prob_1, _ = _leapfrog_integrator(
-        current_momentums=[m],
-        target_log_prob_fn=lambda x: self._log_gamma_log_prob(x, event_dims),
-        current_state_parts=[x],
-        step_sizes=[step_size],
-        num_leapfrog_steps=hmc_lf_steps,
-        current_target_log_prob=log_prob_0,
-        current_grads_target_log_prob=grad_0)
-    new_m = new_m[0]
-
-    new_energy = -log_prob_1 + 0.5 * math_ops.reduce_sum(new_m * new_m,
-                                                         event_dims)
-
-    x_shape = sess.run(x, feed_dict).shape
-    event_size = np.prod(x_shape[independent_chain_ndims:])
-    feed_dict[step_size] = 0.1 / event_size
-    old_energy_, new_energy_ = sess.run([old_energy, new_energy],
-                                        feed_dict)
-    logging_ops.vlog(1, "average energy relative change: {}".format(
-        (1. - new_energy_ / old_energy_).mean()))
-    self.assertAllClose(old_energy_, new_energy_, atol=0., rtol=0.02)
-
-  def _integrator_conserves_energy_wrapper(self, independent_chain_ndims):
-    """Tests the long-term energy conservation of the leapfrog integrator.
-
-    The leapfrog integrator is symplectic, so for sufficiently small step
-    sizes it should be possible to run it more or less indefinitely without
-    the energy of the system blowing up or collapsing.
-
-    Args:
-      independent_chain_ndims: Python `int` scalar representing the number of
-        dims associated with independent chains.
-    """
-    with self.test_session(graph=ops.Graph()) as sess:
-      x_ph = array_ops.placeholder(np.float32, name="x_ph")
-      feed_dict = {x_ph: np.random.rand(50, 10, 2)}
-      self._integrator_conserves_energy(x_ph, independent_chain_ndims,
-                                        sess, feed_dict)
-
-  def testIntegratorEnergyConservationNullShape(self):
-    self._integrator_conserves_energy_wrapper(0)
-
-  def testIntegratorEnergyConservation1(self):
-    self._integrator_conserves_energy_wrapper(1)
-
-  def testIntegratorEnergyConservation2(self):
-    self._integrator_conserves_energy_wrapper(2)
-
-  def testIntegratorEnergyConservation3(self):
-    self._integrator_conserves_energy_wrapper(3)
-
-  def testSampleChainSeedReproducibleWorksCorrectly(self):
-    with self.test_session(graph=ops.Graph()) as sess:
-      num_results = 10
-      independent_chain_ndims = 1
-
-      def log_gamma_log_prob(x):
-        event_dims = math_ops.range(independent_chain_ndims,
-                                    array_ops.rank(x))
-        return self._log_gamma_log_prob(x, event_dims)
-
-      kwargs = dict(
-          target_log_prob_fn=log_gamma_log_prob,
-          current_state=np.random.rand(4, 3, 2),
-          step_size=0.1,
-          num_leapfrog_steps=2,
-          num_burnin_steps=150,
-          seed=52,
-      )
-
-      samples0, kernel_results0 = hmc.sample_chain(
-          **dict(list(kwargs.items()) + list(dict(
-              num_results=2 * num_results,
-              num_steps_between_results=0).items())))
-
-      samples1, kernel_results1 = hmc.sample_chain(
-          **dict(list(kwargs.items()) + list(dict(
-              num_results=num_results,
-              num_steps_between_results=1).items())))
-
-      [
-          samples0_,
-          samples1_,
-          target_log_prob0_,
-          target_log_prob1_,
-      ] = sess.run([
-          samples0,
-          samples1,
-          kernel_results0.current_target_log_prob,
-          kernel_results1.current_target_log_prob,
-      ])
-      self.assertAllClose(samples0_[::2], samples1_,
-                          atol=1e-5, rtol=1e-5)
-      self.assertAllClose(target_log_prob0_[::2], target_log_prob1_,
-                          atol=1e-5, rtol=1e-5)
-
-  def _chain_gets_correct_expectations(self, x, independent_chain_ndims,
-                                       sess, feed_dict=None):
-    counter = collections.Counter()
-    def log_gamma_log_prob(x):
-      counter["target_calls"] += 1
-      event_dims = math_ops.range(independent_chain_ndims,
-                                  array_ops.rank(x))
-      return self._log_gamma_log_prob(x, event_dims)
-
-    num_results = array_ops.placeholder(
-        np.int32, [], name="num_results")
-    step_size = array_ops.placeholder(
-        np.float32, [], name="step_size")
-    num_leapfrog_steps = array_ops.placeholder(
-        np.int32, [], name="num_leapfrog_steps")
-
-    if feed_dict is None:
-      feed_dict = {}
-    feed_dict.update({num_results: 150,
-                      step_size: 0.05,
-                      num_leapfrog_steps: 2})
-
-    samples, kernel_results = hmc.sample_chain(
-        num_results=num_results,
-        target_log_prob_fn=log_gamma_log_prob,
-        current_state=x,
-        step_size=step_size,
-        num_leapfrog_steps=num_leapfrog_steps,
-        num_burnin_steps=150,
-        seed=42)
-
-    self.assertAllEqual(dict(target_calls=2), counter)
-
-    expected_x = (math_ops.digamma(self._shape_param)
-                  - np.log(self._rate_param))
-
-    expected_exp_x = self._shape_param / self._rate_param
-
-    log_accept_ratio_, samples_, expected_x_ = sess.run(
-        [kernel_results.log_accept_ratio, samples, expected_x],
-        feed_dict)
-
-    actual_x = samples_.mean()
-    actual_exp_x = np.exp(samples_).mean()
-    acceptance_probs = np.exp(np.minimum(log_accept_ratio_, 0.))
-
-    logging_ops.vlog(1, "True      E[x, exp(x)]: {}\t{}".format(
-        expected_x_, expected_exp_x))
-    logging_ops.vlog(1, "Estimated E[x, exp(x)]: {}\t{}".format(
-        actual_x, actual_exp_x))
-    self.assertNear(actual_x, expected_x_, 2e-2)
-    self.assertNear(actual_exp_x, expected_exp_x, 2e-2)
-    self.assertAllEqual(np.ones_like(acceptance_probs, np.bool),
-                        acceptance_probs > 0.5)
-    self.assertAllEqual(np.ones_like(acceptance_probs, np.bool),
-                        acceptance_probs <= 1.)
-
-  def _chain_gets_correct_expectations_wrapper(self, independent_chain_ndims):
-    with self.test_session(graph=ops.Graph()) as sess:
-      x_ph = array_ops.placeholder(np.float32, name="x_ph")
-      feed_dict = {x_ph: np.random.rand(50, 10, 2)}
-      self._chain_gets_correct_expectations(x_ph, independent_chain_ndims,
-                                            sess, feed_dict)
-
-  def testHMCChainExpectationsNullShape(self):
-    self._chain_gets_correct_expectations_wrapper(0)
-
-  def testHMCChainExpectations1(self):
-    self._chain_gets_correct_expectations_wrapper(1)
-
-  def testHMCChainExpectations2(self):
-    self._chain_gets_correct_expectations_wrapper(2)
-
-  def testKernelResultsUsingTruncatedDistribution(self):
-    def log_prob(x):
-      return array_ops.where(
-          x >= 0.,
-          -x - x**2,  # Non-constant gradient.
-          array_ops.fill(x.shape, math_ops.cast(-np.inf, x.dtype)))
-    # This log_prob has the property that it is likely to attract
-    # the flow toward, and below, zero...but for x <=0,
-    # log_prob(x) = -inf, which should result in rejection, as well
-    # as a non-finite log_prob.  Thus, this distribution gives us an opportunity
-    # to test out the kernel results ability to correctly capture rejections due
-    # to finite AND non-finite reasons.
-    # Why use a non-constant gradient?  This ensures the leapfrog integrator
-    # will not be exact.
-
-    num_results = 1000
-    # Large step size, will give rejections due to integration error in addition
-    # to rejection due to going into a region of log_prob = -inf.
-    step_size = 0.1
-    num_leapfrog_steps = 5
-    num_chains = 2
-
-    with self.test_session(graph=ops.Graph()) as sess:
-
-      # Start multiple independent chains.
-      initial_state = ops.convert_to_tensor([0.1] * num_chains)
-
-      states, kernel_results = hmc.sample_chain(
-          num_results=num_results,
-          target_log_prob_fn=log_prob,
-          current_state=initial_state,
-          step_size=step_size,
-          num_leapfrog_steps=num_leapfrog_steps,
-          seed=42)
-
-      states_, kernel_results_ = sess.run([states, kernel_results])
-      pstates_ = kernel_results_.proposed_state
-
-      neg_inf_mask = np.isneginf(kernel_results_.proposed_target_log_prob)
-
-      # First:  Test that the mathematical properties of the above log prob
-      # function in conjunction with HMC show up as expected in kernel_results_.
-
-      # We better have log_prob = -inf some of the time.
-      self.assertLess(0, neg_inf_mask.sum())
-      # We better have some rejections due to something other than -inf.
-      self.assertLess(neg_inf_mask.sum(), (~kernel_results_.is_accepted).sum())
-      # We better have accepted a decent amount, even near end of the chain.
-      self.assertLess(
-          0.1, kernel_results_.is_accepted[int(0.9 * num_results):].mean())
-      # We better not have any NaNs in states or log_prob.
-      # We may have some NaN in grads, which involve multiplication/addition due
-      # to gradient rules.  This is the known "NaN grad issue with tf.where."
-      self.assertAllEqual(np.zeros_like(states_),
-                          np.isnan(kernel_results_.proposed_target_log_prob))
-      self.assertAllEqual(np.zeros_like(states_),
-                          np.isnan(states_))
-      # We better not have any +inf in states, grads, or log_prob.
-      self.assertAllEqual(np.zeros_like(states_),
-                          np.isposinf(kernel_results_.proposed_target_log_prob))
-      self.assertAllEqual(
-          np.zeros_like(states_),
-          np.isposinf(kernel_results_.proposed_grads_target_log_prob[0]))
-      self.assertAllEqual(np.zeros_like(states_),
-                          np.isposinf(states_))
-
-      # Second:  Test that kernel_results is congruent with itself and
-      # acceptance/rejection of states.
-
-      # Proposed state is negative iff proposed target log prob is -inf.
-      np.testing.assert_array_less(pstates_[neg_inf_mask], 0.)
-      np.testing.assert_array_less(0., pstates_[~neg_inf_mask])
-
-      # Acceptance probs are zero whenever proposed state is negative.
-      acceptance_probs = np.exp(np.minimum(
-          kernel_results_.log_accept_ratio, 0.))
-      self.assertAllEqual(
-          np.zeros_like(pstates_[neg_inf_mask]),
-          acceptance_probs[neg_inf_mask])
-
-      # The move is accepted ==> state = proposed state.
-      self.assertAllEqual(
-          states_[kernel_results_.is_accepted],
-          pstates_[kernel_results_.is_accepted],
-      )
-      # The move was rejected <==> state[t] == state[t - 1].
-      for t in range(1, num_results):
-        for i in range(num_chains):
-          if kernel_results_.is_accepted[t, i]:
-            self.assertNotEqual(states_[t, i], states_[t - 1, i])
-          else:
-            self.assertEqual(states_[t, i], states_[t - 1, i])
-
-  def _kernel_leaves_target_invariant(self, initial_draws,
-                                      independent_chain_ndims,
-                                      sess, feed_dict=None):
-    def log_gamma_log_prob(x):
-      event_dims = math_ops.range(independent_chain_ndims, array_ops.rank(x))
-      return self._log_gamma_log_prob(x, event_dims)
-
-    def fake_log_prob(x):
-      """Cooled version of the target distribution."""
-      return 1.1 * log_gamma_log_prob(x)
-
-    step_size = array_ops.placeholder(np.float32, [], name="step_size")
-
-    if feed_dict is None:
-      feed_dict = {}
-
-    feed_dict[step_size] = 0.4
-
-    sample, kernel_results = hmc.kernel(
-        target_log_prob_fn=log_gamma_log_prob,
-        current_state=initial_draws,
-        step_size=step_size,
-        num_leapfrog_steps=5,
-        seed=43)
-
-    bad_sample, bad_kernel_results = hmc.kernel(
-        target_log_prob_fn=fake_log_prob,
-        current_state=initial_draws,
-        step_size=step_size,
-        num_leapfrog_steps=5,
-        seed=44)
-
-    [
-        log_accept_ratio_,
-        bad_log_accept_ratio_,
-        initial_draws_,
-        updated_draws_,
-        fake_draws_,
-    ] = sess.run([
-        kernel_results.log_accept_ratio,
-        bad_kernel_results.log_accept_ratio,
-        initial_draws,
-        sample,
-        bad_sample,
-    ], feed_dict)
-
-    # Confirm step size is small enough that we usually accept.
-    acceptance_probs = np.exp(np.minimum(log_accept_ratio_, 0.))
-    bad_acceptance_probs = np.exp(np.minimum(bad_log_accept_ratio_, 0.))
-    self.assertGreater(acceptance_probs.mean(), 0.5)
-    self.assertGreater(bad_acceptance_probs.mean(), 0.5)
-
-    # Confirm step size is large enough that we sometimes reject.
-    self.assertLess(acceptance_probs.mean(), 0.99)
-    self.assertLess(bad_acceptance_probs.mean(), 0.99)
-
-    _, ks_p_value_true = stats.ks_2samp(initial_draws_.flatten(),
-                                        updated_draws_.flatten())
-    _, ks_p_value_fake = stats.ks_2samp(initial_draws_.flatten(),
-                                        fake_draws_.flatten())
-
-    logging_ops.vlog(1, "acceptance rate for true target: {}".format(
-        acceptance_probs.mean()))
-    logging_ops.vlog(1, "acceptance rate for fake target: {}".format(
-        bad_acceptance_probs.mean()))
-    logging_ops.vlog(1, "K-S p-value for true target: {}".format(
-        ks_p_value_true))
-    logging_ops.vlog(1, "K-S p-value for fake target: {}".format(
-        ks_p_value_fake))
-    # Make sure that the MCMC update hasn't changed the empirical CDF much.
-    self.assertGreater(ks_p_value_true, 1e-3)
-    # Confirm that targeting the wrong distribution does
-    # significantly change the empirical CDF.
-    self.assertLess(ks_p_value_fake, 1e-6)
-
-  def _kernel_leaves_target_invariant_wrapper(self, independent_chain_ndims):
-    """Tests that the kernel leaves the target distribution invariant.
-
-    Draws some independent samples from the target distribution,
-    applies an iteration of the MCMC kernel, then runs a
-    Kolmogorov-Smirnov test to determine if the distribution of the
-    MCMC-updated samples has changed.
-
-    We also confirm that running the kernel with a different log-pdf
-    does change the target distribution. (And that we can detect that.)
-
-    Args:
-      independent_chain_ndims: Python `int` scalar representing the number of
-        dims associated with independent chains.
-    """
-    with self.test_session(graph=ops.Graph()) as sess:
-      initial_draws = np.log(np.random.gamma(self._shape_param,
-                                             size=[50000, 2, 2]))
-      initial_draws -= np.log(self._rate_param)
-      x_ph = array_ops.placeholder(np.float32, name="x_ph")
-
-      feed_dict = {x_ph: initial_draws}
-
-      self._kernel_leaves_target_invariant(x_ph, independent_chain_ndims,
-                                           sess, feed_dict)
-
-  def testKernelLeavesTargetInvariant1(self):
-    self._kernel_leaves_target_invariant_wrapper(1)
-
-  def testKernelLeavesTargetInvariant2(self):
-    self._kernel_leaves_target_invariant_wrapper(2)
-
-  def testKernelLeavesTargetInvariant3(self):
-    self._kernel_leaves_target_invariant_wrapper(3)
-
-  def testNanRejection(self):
-    """Tests that an update that yields NaN potentials gets rejected.
-
-    We run HMC with a target distribution that returns NaN
-    log-likelihoods if any element of x < 0, and unit-scale
-    exponential log-likelihoods otherwise. The exponential potential
-    pushes x towards 0, ensuring that any reasonably large update will
-    push us over the edge into NaN territory.
-    """
-    def _unbounded_exponential_log_prob(x):
-      """An exponential distribution with log-likelihood NaN for x < 0."""
-      per_element_potentials = array_ops.where(
-          x < 0.,
-          array_ops.fill(array_ops.shape(x), x.dtype.as_numpy_dtype(np.nan)),
-          -x)
-      return math_ops.reduce_sum(per_element_potentials)
-
-    with self.test_session(graph=ops.Graph()) as sess:
-      initial_x = math_ops.linspace(0.01, 5, 10)
-      updated_x, kernel_results = hmc.kernel(
-          target_log_prob_fn=_unbounded_exponential_log_prob,
-          current_state=initial_x,
-          step_size=2.,
-          num_leapfrog_steps=5,
-          seed=46)
-      initial_x_, updated_x_, log_accept_ratio_ = sess.run(
-          [initial_x, updated_x, kernel_results.log_accept_ratio])
-      acceptance_probs = np.exp(np.minimum(log_accept_ratio_, 0.))
-
-      logging_ops.vlog(1, "initial_x = {}".format(initial_x_))
-      logging_ops.vlog(1, "updated_x = {}".format(updated_x_))
-      logging_ops.vlog(1, "log_accept_ratio = {}".format(log_accept_ratio_))
-
-      self.assertAllEqual(initial_x_, updated_x_)
-      self.assertEqual(acceptance_probs, 0.)
-
-  def testNanFromGradsDontPropagate(self):
-    """Test that update with NaN gradients does not cause NaN in results."""
-    def _nan_log_prob_with_nan_gradient(x):
-      return np.nan * math_ops.reduce_sum(x)
-
-    with self.test_session(graph=ops.Graph()) as sess:
-      initial_x = math_ops.linspace(0.01, 5, 10)
-      updated_x, kernel_results = hmc.kernel(
-          target_log_prob_fn=_nan_log_prob_with_nan_gradient,
-          current_state=initial_x,
-          step_size=2.,
-          num_leapfrog_steps=5,
-          seed=47)
-      initial_x_, updated_x_, log_accept_ratio_ = sess.run(
-          [initial_x, updated_x, kernel_results.log_accept_ratio])
-      acceptance_probs = np.exp(np.minimum(log_accept_ratio_, 0.))
-
-      logging_ops.vlog(1, "initial_x = {}".format(initial_x_))
-      logging_ops.vlog(1, "updated_x = {}".format(updated_x_))
-      logging_ops.vlog(1, "log_accept_ratio = {}".format(log_accept_ratio_))
-
-      self.assertAllEqual(initial_x_, updated_x_)
-      self.assertEqual(acceptance_probs, 0.)
-
-      self.assertAllFinite(
-          gradients_ops.gradients(updated_x, initial_x)[0].eval())
-      self.assertAllEqual([True], [g is None for g in gradients_ops.gradients(
-          kernel_results.proposed_grads_target_log_prob, initial_x)])
-      self.assertAllEqual([False], [g is None for g in gradients_ops.gradients(
-          kernel_results.proposed_grads_target_log_prob,
-          kernel_results.proposed_state)])
-
-      # Gradients of the acceptance probs and new log prob are not finite.
-      # self.assertAllFinite(
-      #     gradients_ops.gradients(acceptance_probs, initial_x)[0].eval())
-      # self.assertAllFinite(
-      #     gradients_ops.gradients(new_log_prob, initial_x)[0].eval())
-
-  def _testChainWorksDtype(self, dtype):
-    with self.test_session(graph=ops.Graph()) as sess:
-      states, kernel_results = hmc.sample_chain(
-          num_results=10,
-          target_log_prob_fn=lambda x: -math_ops.reduce_sum(x**2., axis=-1),
-          current_state=np.zeros(5).astype(dtype),
-          step_size=0.01,
-          num_leapfrog_steps=10,
-          seed=48)
-      states_, log_accept_ratio_ = sess.run(
-          [states, kernel_results.log_accept_ratio])
-      self.assertEqual(dtype, states_.dtype)
-      self.assertEqual(dtype, log_accept_ratio_.dtype)
-
-  def testChainWorksIn64Bit(self):
-    self._testChainWorksDtype(np.float64)
-
-  def testChainWorksIn16Bit(self):
-    self._testChainWorksDtype(np.float16)
-
-  def testChainWorksCorrelatedMultivariate(self):
-    dtype = np.float32
-    true_mean = dtype([0, 0])
-    true_cov = dtype([[1, 0.5],
-                      [0.5, 1]])
-    num_results = 2000
-    counter = collections.Counter()
-    with self.test_session(graph=ops.Graph()) as sess:
-      def target_log_prob(x, y):
-        counter["target_calls"] += 1
-        # Corresponds to unnormalized MVN.
-        # z = matmul(inv(chol(true_cov)), [x, y] - true_mean)
-        z = array_ops.stack([x, y], axis=-1) - true_mean
-        z = array_ops.squeeze(
-            gen_linalg_ops.matrix_triangular_solve(
-                np.linalg.cholesky(true_cov),
-                z[..., array_ops.newaxis]),
-            axis=-1)
-        return -0.5 * math_ops.reduce_sum(z**2., axis=-1)
-      states, _ = hmc.sample_chain(
-          num_results=num_results,
-          target_log_prob_fn=target_log_prob,
-          current_state=[dtype(-2), dtype(2)],
-          step_size=[0.5, 0.5],
-          num_leapfrog_steps=2,
-          num_burnin_steps=200,
-          num_steps_between_results=1,
-          seed=54)
-      self.assertAllEqual(dict(target_calls=2), counter)
-      states = array_ops.stack(states, axis=-1)
-      self.assertEqual(num_results, states.shape[0].value)
-      sample_mean = math_ops.reduce_mean(states, axis=0)
-      x = states - sample_mean
-      sample_cov = math_ops.matmul(x, x, transpose_a=True) / dtype(num_results)
-      [sample_mean_, sample_cov_] = sess.run([
-          sample_mean, sample_cov])
-      self.assertAllClose(true_mean, sample_mean_,
-                          atol=0.05, rtol=0.)
-      self.assertAllClose(true_cov, sample_cov_,
-                          atol=0., rtol=0.1)
-
-
-class _EnergyComputationTest(object):
-
-  def testHandlesNanFromPotential(self):
-    with self.test_session(graph=ops.Graph()) as sess:
-      x = [1, np.inf, -np.inf, np.nan]
-      target_log_prob, proposed_target_log_prob = [
-          self.dtype(x.flatten()) for x in np.meshgrid(x, x)]
-      num_chains = len(target_log_prob)
-      dummy_momentums = [-1, 1]
-      momentums = [self.dtype([dummy_momentums] * num_chains)]
-      proposed_momentums = [self.dtype([dummy_momentums] * num_chains)]
-
-      target_log_prob = ops.convert_to_tensor(target_log_prob)
-      momentums = [ops.convert_to_tensor(momentums[0])]
-      proposed_target_log_prob = ops.convert_to_tensor(proposed_target_log_prob)
-      proposed_momentums = [ops.convert_to_tensor(proposed_momentums[0])]
-
-      energy = _compute_energy_change(
-          target_log_prob,
-          momentums,
-          proposed_target_log_prob,
-          proposed_momentums,
-          independent_chain_ndims=1)
-      grads = gradients_ops.gradients(energy, momentums)
-
-      [actual_energy, grads_] = sess.run([energy, grads])
-
-      # Ensure energy is `inf` (note: that's positive inf) in weird cases and
-      # finite otherwise.
-      expected_energy = self.dtype([0] + [np.inf]*(num_chains - 1))
-      self.assertAllEqual(expected_energy, actual_energy)
-
-      # Ensure gradient is finite.
-      self.assertAllEqual(np.ones_like(grads_).astype(np.bool),
-                          np.isfinite(grads_))
-
-  def testHandlesNanFromKinetic(self):
-    with self.test_session(graph=ops.Graph()) as sess:
-      x = [1, np.inf, -np.inf, np.nan]
-      momentums, proposed_momentums = [
-          [np.reshape(self.dtype(x), [-1, 1])]
-          for x in np.meshgrid(x, x)]
-      num_chains = len(momentums[0])
-      target_log_prob = np.ones(num_chains, self.dtype)
-      proposed_target_log_prob = np.ones(num_chains, self.dtype)
-
-      target_log_prob = ops.convert_to_tensor(target_log_prob)
-      momentums = [ops.convert_to_tensor(momentums[0])]
-      proposed_target_log_prob = ops.convert_to_tensor(proposed_target_log_prob)
-      proposed_momentums = [ops.convert_to_tensor(proposed_momentums[0])]
-
-      energy = _compute_energy_change(
-          target_log_prob,
-          momentums,
-          proposed_target_log_prob,
-          proposed_momentums,
-          independent_chain_ndims=1)
-      grads = gradients_ops.gradients(energy, momentums)
-
-      [actual_energy, grads_] = sess.run([energy, grads])
-
-      # Ensure energy is `inf` (note: that's positive inf) in weird cases and
-      # finite otherwise.
-      expected_energy = self.dtype([0] + [np.inf]*(num_chains - 1))
-      self.assertAllEqual(expected_energy, actual_energy)
-
-      # Ensure gradient is finite.
-      g = grads_[0].reshape([len(x), len(x)])[:, 0]
-      self.assertAllEqual(np.ones_like(g).astype(np.bool), np.isfinite(g))
-
-      # The remaining gradients are nan because the momentum was itself nan or
-      # inf.
-      g = grads_[0].reshape([len(x), len(x)])[:, 1:]
-      self.assertAllEqual(np.ones_like(g).astype(np.bool), np.isnan(g))
-
-
-class EnergyComputationTest16(test.TestCase, _EnergyComputationTest):
-  dtype = np.float16
-
-
-class EnergyComputationTest32(test.TestCase, _EnergyComputationTest):
-  dtype = np.float32
-
-
-class EnergyComputationTest64(test.TestCase, _EnergyComputationTest):
-  dtype = np.float64
-
-
-class _HMCHandlesLists(object):
-
-  def testStateParts(self):
-    with self.test_session(graph=ops.Graph()) as sess:
-      dist_x = normal_lib.Normal(loc=self.dtype(0), scale=self.dtype(1))
-      dist_y = independent_lib.Independent(
-          gamma_lib.Gamma(concentration=self.dtype([1, 2]),
-                          rate=self.dtype([0.5, 0.75])),
-          reinterpreted_batch_ndims=1)
-      def target_log_prob(x, y):
-        return dist_x.log_prob(x) + dist_y.log_prob(y)
-      x0 = [dist_x.sample(seed=1), dist_y.sample(seed=2)]
-      samples, _ = hmc.sample_chain(
-          num_results=int(2e3),
-          target_log_prob_fn=target_log_prob,
-          current_state=x0,
-          step_size=0.85,
-          num_leapfrog_steps=3,
-          num_burnin_steps=int(250),
-          seed=49)
-      actual_means = [math_ops.reduce_mean(s, axis=0) for s in samples]
-      actual_vars = [_reduce_variance(s, axis=0) for s in samples]
-      expected_means = [dist_x.mean(), dist_y.mean()]
-      expected_vars = [dist_x.variance(), dist_y.variance()]
-      [
-          actual_means_,
-          actual_vars_,
-          expected_means_,
-          expected_vars_,
-      ] = sess.run([
-          actual_means,
-          actual_vars,
-          expected_means,
-          expected_vars,
-      ])
-      self.assertAllClose(expected_means_, actual_means_, atol=0.05, rtol=0.16)
-      self.assertAllClose(expected_vars_, actual_vars_, atol=0., rtol=0.25)
-
-
-class HMCHandlesLists32(_HMCHandlesLists, test.TestCase):
-  dtype = np.float32
-
-
-class HMCHandlesLists64(_HMCHandlesLists, test.TestCase):
-  dtype = np.float64
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/metropolis_hastings_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/metropolis_hastings_test.py
deleted file mode 100644
index f508e5b114a55fc1aeb07212595fda45fc308c7b..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/metropolis_hastings_test.py
+++ /dev/null
@@ -1,340 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for Metropolis-Hastings."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.contrib.bayesflow.python.ops import metropolis_hastings_impl as mh
-from tensorflow.contrib.distributions.python.ops import mvn_tril as mvn_tril_lib
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import random_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables
-from tensorflow.python.ops.distributions import normal as normal_lib
-from tensorflow.python.platform import test
-
-
-class MetropolisHastingsTest(test.TestCase):
-
-  def testKernelStateTensor(self):
-    """Test that transition kernel works with tensor input to `state`."""
-    loc = variable_scope.get_variable("loc", initializer=0.)
-
-    def target_log_prob_fn(loc):
-      return normal_lib.Normal(loc=0.0, scale=0.1).log_prob(loc)
-
-    new_state, _ = mh.kernel(
-        target_log_prob_fn=target_log_prob_fn,
-        proposal_fn=mh.proposal_normal(scale=0.05),
-        current_state=loc,
-        seed=231251)
-    loc_update = loc.assign(new_state)
-
-    init = variables.initialize_all_variables()
-    with self.test_session() as sess:
-      sess.run(init)
-      loc_samples = []
-      for _ in range(2500):
-        loc_sample = sess.run(loc_update)
-        loc_samples.append(loc_sample)
-    loc_samples = loc_samples[500:]  # drop samples for burn-in
-
-    self.assertAllClose(np.mean(loc_samples), 0.0, rtol=1e-5, atol=1e-1)
-    self.assertAllClose(np.std(loc_samples), 0.1, rtol=1e-5, atol=1e-1)
-
-  def testKernelStateList(self):
-    """Test that transition kernel works with list input to `state`."""
-    num_chains = 2
-    loc_one = variable_scope.get_variable(
-        "loc_one", [num_chains],
-        initializer=init_ops.zeros_initializer())
-    loc_two = variable_scope.get_variable(
-        "loc_two", [num_chains], initializer=init_ops.zeros_initializer())
-
-    def target_log_prob_fn(loc_one, loc_two):
-      loc = array_ops.stack([loc_one, loc_two])
-      log_prob = mvn_tril_lib.MultivariateNormalTriL(
-          loc=constant_op.constant([0., 0.]),
-          scale_tril=constant_op.constant([[0.1, 0.1], [0.0, 0.1]])).log_prob(
-              loc)
-      return math_ops.reduce_sum(log_prob, 0)
-
-    def proposal_fn(loc_one, loc_two):
-      loc_one_proposal = mh.proposal_normal(scale=0.05)
-      loc_two_proposal = mh.proposal_normal(scale=0.05)
-      loc_one_sample, _ = loc_one_proposal(loc_one)
-      loc_two_sample, _ = loc_two_proposal(loc_two)
-      return [loc_one_sample, loc_two_sample], None
-
-    new_state, _ = mh.kernel(
-        target_log_prob_fn=target_log_prob_fn,
-        proposal_fn=proposal_fn,
-        current_state=[loc_one, loc_two],
-        seed=12415)
-    loc_one_update = loc_one.assign(new_state[0])
-    loc_two_update = loc_two.assign(new_state[1])
-
-    init = variables.initialize_all_variables()
-    with self.test_session() as sess:
-      sess.run(init)
-      loc_one_samples = []
-      loc_two_samples = []
-      for _ in range(10000):
-        loc_one_sample, loc_two_sample = sess.run(
-            [loc_one_update, loc_two_update])
-        loc_one_samples.append(loc_one_sample)
-        loc_two_samples.append(loc_two_sample)
-
-    loc_one_samples = np.array(loc_one_samples)
-    loc_two_samples = np.array(loc_two_samples)
-    loc_one_samples = loc_one_samples[1000:]  # drop samples for burn-in
-    loc_two_samples = loc_two_samples[1000:]  # drop samples for burn-in
-
-    self.assertAllClose(np.mean(loc_one_samples, 0),
-                        np.array([0.] * num_chains),
-                        rtol=1e-5, atol=1e-1)
-    self.assertAllClose(np.mean(loc_two_samples, 0),
-                        np.array([0.] * num_chains),
-                        rtol=1e-5, atol=1e-1)
-    self.assertAllClose(np.std(loc_one_samples, 0),
-                        np.array([0.1] * num_chains),
-                        rtol=1e-5, atol=1e-1)
-    self.assertAllClose(np.std(loc_two_samples, 0),
-                        np.array([0.1] * num_chains),
-                        rtol=1e-5, atol=1e-1)
-
-  def testKernelResultsUsingTruncatedDistribution(self):
-    def log_prob(x):
-      return array_ops.where(
-          x >= 0.,
-          -x - x**2,
-          array_ops.fill(x.shape, math_ops.cast(-np.inf, x.dtype)))
-    # The truncated distribution has the property that it is likely to attract
-    # the flow toward, and below, zero...but for x <=0,
-    # log_prob(x) = -inf, which should result in rejection, as well
-    # as a non-finite log_prob.  Thus, this distribution gives us an opportunity
-    # to test out the kernel results ability to correctly capture rejections due
-    # to finite AND non-finite reasons.
-
-    num_results = 1000
-    # Large step size, will give rejections due to going into a region of
-    # log_prob = -inf.
-    step_size = 0.3
-    num_chains = 2
-
-    with self.test_session(graph=ops.Graph()) as sess:
-
-      # Start multiple independent chains.
-      initial_state = ops.convert_to_tensor([0.1] * num_chains)
-
-      states = []
-      is_accepted = []
-      proposed_states = []
-      current_state = initial_state
-      for _ in range(num_results):
-        current_state, kernel_results = mh.kernel(
-            target_log_prob_fn=log_prob,
-            proposal_fn=mh.proposal_uniform(step_size=step_size),
-            current_state=current_state,
-            seed=42)
-        states.append(current_state)
-        proposed_states.append(kernel_results.proposed_state)
-        is_accepted.append(kernel_results.is_accepted)
-
-      states = array_ops.stack(states)
-      proposed_states = array_ops.stack(proposed_states)
-      is_accepted = array_ops.stack(is_accepted)
-      states_, pstates_, is_accepted_ = sess.run(
-          [states, proposed_states, is_accepted])
-
-      # We better have accepted a decent amount, even near end of the chain.
-      self.assertLess(
-          0.1, is_accepted_[int(0.9 * num_results):].mean())
-      # We better not have any NaNs in states.
-      self.assertAllEqual(np.zeros_like(states_),
-                          np.isnan(states_))
-      # We better not have any +inf in states.
-      self.assertAllEqual(np.zeros_like(states_),
-                          np.isposinf(states_))
-
-      # The move is accepted ==> state = proposed state.
-      self.assertAllEqual(
-          states_[is_accepted_],
-          pstates_[is_accepted_],
-      )
-
-      # The move was rejected <==> state[t] == state[t - 1].
-      for t in range(1, num_results):
-        for i in range(num_chains):
-          if is_accepted_[t, i]:
-            self.assertNotEqual(states_[t, i], states_[t - 1, i])
-          else:
-            self.assertEqual(states_[t, i], states_[t - 1, i])
-
-  def testDensityIncreasingStepAccepted(self):
-    """Tests that if a transition increases density, it is always accepted."""
-    target_log_density = lambda x: - x * x
-    state = variable_scope.get_variable("state", initializer=10.)
-    state_log_density = variable_scope.get_variable(
-        "state_log_density",
-        initializer=target_log_density(state.initialized_value()))
-    log_accept_ratio = variable_scope.get_variable(
-        "log_accept_ratio", initializer=0.)
-
-    get_next_proposal = lambda x: (x - 1., None)
-    step = mh.evolve(state, state_log_density, log_accept_ratio,
-                     target_log_density, get_next_proposal, seed=1234)
-    init = variables.initialize_all_variables()
-    with self.test_session() as sess:
-      sess.run(init)
-      for j in range(9):
-        sess.run(step)
-        sample = sess.run(state)
-        sample_log_density = sess.run(state_log_density)
-        self.assertAlmostEqual(sample, 9 - j)
-        self.assertAlmostEqual(sample_log_density, - (9 - j) * (9 - j))
-
-  def testSampleProperties(self):
-    """Tests that the samples converge to the target distribution."""
-
-    def target_log_density(x):
-      """Log-density corresponding to a normal distribution with mean = 4."""
-      return - (x - 2.0) * (x - 2.0) * 0.5
-
-    # Use the uniform random walker to generate proposals.
-    proposal_fn = mh.proposal_uniform(
-        step_size=1.0, seed=1234)
-
-    state = variable_scope.get_variable("state", initializer=0.0)
-    state_log_density = variable_scope.get_variable(
-        "state_log_density",
-        initializer=target_log_density(state.initialized_value()))
-    log_accept_ratio = variable_scope.get_variable(
-        "log_accept_ratio", initializer=0.)
-
-    # Random walk MCMC converges slowly so need to put in enough iterations.
-    num_iterations = 5000
-    step = mh.evolve(state, state_log_density, log_accept_ratio,
-                     target_log_density, proposal_fn, seed=4321)
-
-    init = variables.global_variables_initializer()
-
-    sample_sum, sample_sq_sum = 0.0, 0.0
-    with self.test_session() as sess:
-      sess.run(init)
-      for _ in np.arange(num_iterations):
-        # Allow for the mixing of the chain and discard these samples.
-        sess.run(step)
-      for _ in np.arange(num_iterations):
-        sess.run(step)
-        sample = sess.run(state)
-        sample_sum += sample
-        sample_sq_sum += sample * sample
-
-    sample_mean = sample_sum / num_iterations
-    sample_variance = sample_sq_sum / num_iterations - sample_mean * sample_mean
-    # The samples have large autocorrelation which reduces the effective sample
-    # size.
-    self.assertAlmostEqual(sample_mean, 2.0, delta=0.1)
-    self.assertAlmostEqual(sample_variance, 1.0, delta=0.1)
-
-  def testProposalNormal(self):
-    """Tests that the normal proposals are correctly distributed."""
-
-    initial_points = array_ops.ones([10000], dtype=dtypes.float32)
-    proposal_fn = mh.proposal_normal(
-        scale=2.0, seed=1234)
-    proposal_points, _ = proposal_fn(initial_points)
-
-    with self.test_session() as sess:
-      sample = sess.run(proposal_points)
-
-    # It is expected that the elements in proposal_points have the same mean as
-    # initial_points and have the standard deviation that was supplied to the
-    # proposal scheme.
-    self.assertAlmostEqual(np.mean(sample), 1.0, delta=0.1)
-    self.assertAlmostEqual(np.std(sample), 2.0, delta=0.1)
-
-  def testDocstringExample(self):
-    """Tests the simplified docstring example with multiple chains."""
-
-    n = 2  # dimension of the problem
-
-    # Generate 300 initial values randomly. Each of these would be an
-    # independent starting point for a Markov chain.
-    state = variable_scope.get_variable(
-        "state", initializer=random_ops.random_normal(
-            [300, n], mean=3.0, dtype=dtypes.float32, seed=42))
-
-    # Computes the log(p(x)) for the unit normal density and ignores the
-    # normalization constant.
-    def log_density(x):
-      return  - math_ops.reduce_sum(x * x, reduction_indices=-1) / 2.0
-
-    # Initial log-density value
-    state_log_density = variable_scope.get_variable(
-        "state_log_density",
-        initializer=log_density(state.initialized_value()))
-
-    # A variable to store the log_acceptance_ratio:
-    log_acceptance_ratio = variable_scope.get_variable(
-        "log_acceptance_ratio",
-        initializer=array_ops.zeros([300], dtype=dtypes.float32))
-
-    # Generates random proposals by moving each coordinate uniformly and
-    # independently in a box of size 2 centered around the current value.
-    # Returns the new point and also the log of the Hastings ratio (the
-    # ratio of the probability of going from the proposal to origin and the
-    # probability of the reverse transition). When this ratio is 1, the value
-    # may be omitted and replaced by None.
-    def random_proposal(x):
-      return (x + random_ops.random_uniform(
-          array_ops.shape(x), minval=-1, maxval=1,
-          dtype=x.dtype, seed=12)), None
-
-    #  Create the op to propagate the chain for 100 steps.
-    stepper = mh.evolve(
-        state, state_log_density, log_acceptance_ratio,
-        log_density, random_proposal, n_steps=100, seed=123)
-    init = variables.initialize_all_variables()
-    with self.test_session() as sess:
-      sess.run(init)
-      # Run the chains for a total of 1000 steps.
-      for _ in range(10):
-        sess.run(stepper)
-      samples = sess.run(state)
-      covariance = np.eye(n)
-      # Verify that the estimated mean and covariance are close to the true
-      # values.
-      self.assertAlmostEqual(
-          np.max(np.abs(np.mean(samples, 0)
-                        - np.zeros(n))), 0,
-          delta=0.1)
-      self.assertAlmostEqual(
-          np.max(np.abs(np.reshape(np.cov(samples, rowvar=False), [n**2])
-                        - np.reshape(covariance, [n**2]))), 0,
-          delta=0.2)
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/bayesflow/python/ops/hmc_impl.py b/tensorflow/contrib/bayesflow/python/ops/hmc_impl.py
deleted file mode 100644
index 66afcc749746ab5c04114e585c5f93a3f3354d86..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/ops/hmc_impl.py
+++ /dev/null
@@ -1,961 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Hamiltonian Monte Carlo, a gradient-based MCMC algorithm.
-
-@@sample_chain
-@@kernel
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections
-import numpy as np
-
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import functional_ops
-from tensorflow.python.ops import gradients_impl as gradients_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import random_ops
-from tensorflow.python.ops.distributions import util as distributions_util
-
-__all__ = [
-    "sample_chain",
-    "kernel",
-]
-
-
-KernelResults = collections.namedtuple(
-    "KernelResults",
-    [
-        "log_accept_ratio",
-        "current_grads_target_log_prob",  # "Current result" means "accepted".
-        "current_target_log_prob",  # "Current result" means "accepted".
-        "is_accepted",
-        "proposed_grads_target_log_prob",
-        "proposed_state",
-        "proposed_target_log_prob",
-    ])
-
-
-def _make_dummy_kernel_results(
-    dummy_state,
-    dummy_target_log_prob,
-    dummy_grads_target_log_prob):
-  return KernelResults(
-      log_accept_ratio=dummy_target_log_prob,
-      current_grads_target_log_prob=dummy_grads_target_log_prob,
-      current_target_log_prob=dummy_target_log_prob,
-      is_accepted=array_ops.ones_like(dummy_target_log_prob, dtypes.bool),
-      proposed_grads_target_log_prob=dummy_grads_target_log_prob,
-      proposed_state=dummy_state,
-      proposed_target_log_prob=dummy_target_log_prob,
-  )
-
-
-def sample_chain(
-    num_results,
-    target_log_prob_fn,
-    current_state,
-    step_size,
-    num_leapfrog_steps,
-    num_burnin_steps=0,
-    num_steps_between_results=0,
-    seed=None,
-    current_target_log_prob=None,
-    current_grads_target_log_prob=None,
-    name=None):
-  """Runs multiple iterations of one or more Hamiltonian Monte Carlo chains.
-
-  Hamiltonian Monte Carlo (HMC) is a Markov chain Monte Carlo (MCMC) algorithm
-  that takes a series of gradient-informed steps to produce a Metropolis
-  proposal. This function samples from an HMC Markov chain at `current_state`
-  and whose stationary distribution has log-unnormalized-density
-  `target_log_prob_fn()`.
-
-  This function samples from multiple chains in parallel. It assumes that the
-  the leftmost dimensions of (each) `current_state` (part) index an independent
-  chain.  The function `target_log_prob_fn()` sums log-probabilities across
-  event dimensions (i.e., current state (part) rightmost dimensions). Each
-  element of the output of `target_log_prob_fn()` represents the (possibly
-  unnormalized) log-probability of the joint distribution over (all) the current
-  state (parts).
-
-  The `current_state` can be represented as a single `Tensor` or a `list` of
-  `Tensors` which collectively represent the current state. When specifying a
-  `list`, one must also specify a list of `step_size`s.
-
-  Note: `target_log_prob_fn` is called exactly twice.
-
-  Since HMC states are correlated, it is sometimes desirable to produce
-  additional intermediate states, and then discard them, ending up with a set of
-  states with decreased autocorrelation.  See [1].  Such "thinning" is made
-  possible by setting `num_steps_between_results > 0`.  The chain then takes
-  `num_steps_between_results` extra steps between the steps that make it into
-  the results.  The extra steps are never materialized (in calls to `sess.run`),
-  and thus do not increase memory requirements.
-
-  [1]: "Statistically efficient thinning of a Markov chain sampler."
-       Art B. Owen. April 2017.
-       http://statweb.stanford.edu/~owen/reports/bestthinning.pdf
-
-  #### Examples:
-
-  ##### Sample from a diagonal-variance Gaussian.
-
-  ```python
-  tfd = tf.contrib.distributions
-
-  def make_likelihood(true_variances):
-    return tfd.MultivariateNormalDiag(
-        scale_diag=tf.sqrt(true_variances))
-
-  dims = 10
-  dtype = np.float32
-  true_variances = tf.linspace(dtype(1), dtype(3), dims)
-  likelihood = make_likelihood(true_variances)
-
-  states, kernel_results = hmc.sample_chain(
-      num_results=1000,
-      target_log_prob_fn=likelihood.log_prob,
-      current_state=tf.zeros(dims),
-      step_size=0.5,
-      num_leapfrog_steps=2,
-      num_burnin_steps=500)
-
-  # Compute sample stats.
-  sample_mean = tf.reduce_mean(states, axis=0)
-  sample_var = tf.reduce_mean(
-      tf.squared_difference(states, sample_mean),
-      axis=0)
-  ```
-
-  ##### Sampling from factor-analysis posteriors with known factors.
-
-  I.e.,
-
-  ```none
-  for i=1..n:
-    w[i] ~ Normal(0, eye(d))            # prior
-    x[i] ~ Normal(loc=matmul(w[i], F))  # likelihood
-  ```
-
-  where `F` denotes factors.
-
-  ```python
-  tfd = tf.contrib.distributions
-
-  def make_prior(dims, dtype):
-    return tfd.MultivariateNormalDiag(
-        loc=tf.zeros(dims, dtype))
-
-  def make_likelihood(weights, factors):
-    return tfd.MultivariateNormalDiag(
-        loc=tf.tensordot(weights, factors, axes=[[0], [-1]]))
-
-  # Setup data.
-  num_weights = 10
-  num_factors = 4
-  num_chains = 100
-  dtype = np.float32
-
-  prior = make_prior(num_weights, dtype)
-  weights = prior.sample(num_chains)
-  factors = np.random.randn(num_factors, num_weights).astype(dtype)
-  x = make_likelihood(weights, factors).sample(num_chains)
-
-  def target_log_prob(w):
-    # Target joint is: `f(w) = p(w, x | factors)`.
-    return prior.log_prob(w) + make_likelihood(w, factors).log_prob(x)
-
-  # Get `num_results` samples from `num_chains` independent chains.
-  chains_states, kernels_results = hmc.sample_chain(
-      num_results=1000,
-      target_log_prob_fn=target_log_prob,
-      current_state=tf.zeros([num_chains, dims], dtype),
-      step_size=0.1,
-      num_leapfrog_steps=2,
-      num_burnin_steps=500)
-
-  # Compute sample stats.
-  sample_mean = tf.reduce_mean(chains_states, axis=[0, 1])
-  sample_var = tf.reduce_mean(
-      tf.squared_difference(chains_states, sample_mean),
-      axis=[0, 1])
-  ```
-
-  Args:
-    num_results: Integer number of Markov chain draws.
-    target_log_prob_fn: Python callable which takes an argument like
-      `current_state` (or `*current_state` if it's a list) and returns its
-      (possibly unnormalized) log-density under the target distribution.
-    current_state: `Tensor` or Python `list` of `Tensor`s representing the
-      current state(s) of the Markov chain(s). The first `r` dimensions index
-      independent chains, `r = tf.rank(target_log_prob_fn(*current_state))`.
-    step_size: `Tensor` or Python `list` of `Tensor`s representing the step size
-      for the leapfrog integrator. Must broadcast with the shape of
-      `current_state`. Larger step sizes lead to faster progress, but too-large
-      step sizes make rejection exponentially more likely. When possible, it's
-      often helpful to match per-variable step sizes to the standard deviations
-      of the target distribution in each variable.
-    num_leapfrog_steps: Integer number of steps to run the leapfrog integrator
-      for. Total progress per HMC step is roughly proportional to `step_size *
-      num_leapfrog_steps`.
-    num_burnin_steps: Integer number of chain steps to take before starting to
-      collect results.
-      Default value: 0 (i.e., no burn-in).
-    num_steps_between_results: Integer number of chain steps between collecting
-      a result. Only one out of every `num_steps_between_samples + 1` steps is
-      included in the returned results.  The number of returned chain states is
-      still equal to `num_results`.  Default value: 0 (i.e., no thinning).
-    seed: Python integer to seed the random number generator.
-    current_target_log_prob: (Optional) `Tensor` representing the value of
-      `target_log_prob_fn` at the `current_state`. The only reason to specify
-      this argument is to reduce TF graph size.
-      Default value: `None` (i.e., compute as needed).
-    current_grads_target_log_prob: (Optional) Python list of `Tensor`s
-      representing gradient of `target_log_prob` at the `current_state` and wrt
-      the `current_state`. Must have same shape as `current_state`. The only
-      reason to specify this argument is to reduce TF graph size.
-      Default value: `None` (i.e., compute as needed).
-    name: Python `str` name prefixed to Ops created by this function.
-      Default value: `None` (i.e., "hmc_sample_chain").
-
-  Returns:
-    next_states: Tensor or Python list of `Tensor`s representing the
-      state(s) of the Markov chain(s) at each result step. Has same shape as
-      input `current_state` but with a prepended `num_results`-size dimension.
-    kernel_results: `collections.namedtuple` of internal calculations used to
-      advance the chain.
-  """
-  with ops.name_scope(
-      name, "hmc_sample_chain",
-      [num_results, current_state, step_size, num_leapfrog_steps,
-       num_burnin_steps, num_steps_between_results, seed,
-       current_target_log_prob, current_grads_target_log_prob]):
-    with ops.name_scope("initialize"):
-      [
-          current_state,
-          step_size,
-          current_target_log_prob,
-          current_grads_target_log_prob,
-      ] = _prepare_args(
-          target_log_prob_fn,
-          current_state,
-          step_size,
-          current_target_log_prob,
-          current_grads_target_log_prob)
-      num_results = ops.convert_to_tensor(
-          num_results,
-          dtype=dtypes.int32,
-          name="num_results")
-      num_leapfrog_steps = ops.convert_to_tensor(
-          num_leapfrog_steps,
-          dtype=dtypes.int32,
-          name="num_leapfrog_steps")
-      num_burnin_steps = ops.convert_to_tensor(
-          num_burnin_steps,
-          dtype=dtypes.int32,
-          name="num_burnin_steps")
-      num_steps_between_results = ops.convert_to_tensor(
-          num_steps_between_results,
-          dtype=dtypes.int32,
-          name="num_steps_between_results")
-
-    def _run_chain(num_steps, current_state, kernel_results):
-      """Runs the chain(s) for `num_steps`."""
-      def _loop_body(iter_, current_state, kernel_results):
-        return [iter_ + 1] + list(kernel(
-            target_log_prob_fn,
-            current_state,
-            step_size,
-            num_leapfrog_steps,
-            seed,
-            kernel_results.current_target_log_prob,
-            kernel_results.current_grads_target_log_prob))
-      while_loop_kwargs = dict(
-          cond=lambda iter_, *args: iter_ < num_steps,
-          body=_loop_body,
-          loop_vars=[
-              np.int32(0),
-              current_state,
-              kernel_results,
-          ],
-      )
-      if seed is not None:
-        while_loop_kwargs["parallel_iterations"] = 1
-      return control_flow_ops.while_loop(
-          **while_loop_kwargs)[1:]  # Lop-off "iter_".
-
-    def _scan_body(args_list, iter_):
-      """Closure which implements `tf.scan` body."""
-      current_state, kernel_results = args_list
-      return _run_chain(
-          1 + array_ops.where(math_ops.equal(iter_, 0),
-                              num_burnin_steps,
-                              num_steps_between_results),
-          current_state,
-          kernel_results)
-
-    scan_kwargs = dict(
-        fn=_scan_body,
-        elems=math_ops.range(num_results),  # iter_: used to choose burnin.
-        initializer=[
-            current_state,
-            _make_dummy_kernel_results(
-                current_state,
-                current_target_log_prob,
-                current_grads_target_log_prob),
-        ])
-    if seed is not None:
-      scan_kwargs["parallel_iterations"] = 1
-    return functional_ops.scan(**scan_kwargs)
-
-
-def kernel(target_log_prob_fn,
-           current_state,
-           step_size,
-           num_leapfrog_steps,
-           seed=None,
-           current_target_log_prob=None,
-           current_grads_target_log_prob=None,
-           name=None):
-  """Runs one iteration of Hamiltonian Monte Carlo.
-
-  Hamiltonian Monte Carlo (HMC) is a Markov chain Monte Carlo (MCMC)
-  algorithm that takes a series of gradient-informed steps to produce
-  a Metropolis proposal. This function applies one step of HMC to
-  randomly update the variable `x`.
-
-  This function can update multiple chains in parallel. It assumes that all
-  leftmost dimensions of `current_state` index independent chain states (and are
-  therefore updated independently). The output of `target_log_prob_fn()` should
-  sum log-probabilities across all event dimensions. Slices along the rightmost
-  dimensions may have different target distributions; for example,
-  `current_state[0, :]` could have a different target distribution from
-  `current_state[1, :]`. This is up to `target_log_prob_fn()`. (The number of
-  independent chains is `tf.size(target_log_prob_fn(*current_state))`.)
-
-  #### Examples:
-
-  ##### Simple chain with warm-up.
-
-  ```python
-  tfd = tf.contrib.distributions
-
-  # Tuning acceptance rates:
-  dtype = np.float32
-  target_accept_rate = 0.631
-  num_warmup_iter = 500
-  num_chain_iter = 500
-
-  x = tf.get_variable(name="x", initializer=dtype(1))
-  step_size = tf.get_variable(name="step_size", initializer=dtype(1))
-
-  target = tfd.Normal(loc=dtype(0), scale=dtype(1))
-
-  next_x, other_results = hmc.kernel(
-      target_log_prob_fn=target.log_prob,
-      current_state=x,
-      step_size=step_size,
-      num_leapfrog_steps=3)[:4]
-
-  x_update = x.assign(next_x)
-
-  step_size_update = step_size.assign_add(
-      step_size * tf.where(
-          tf.exp(tf.minimum(other_results.log_accept_ratio), 0.) >
-              target_accept_rate,
-          0.01, -0.01))
-
-  warmup = tf.group([x_update, step_size_update])
-
-  tf.global_variables_initializer().run()
-
-  sess.graph.finalize()  # No more graph building.
-
-  # Warm up the sampler and adapt the step size
-  for _ in xrange(num_warmup_iter):
-    sess.run(warmup)
-
-  # Collect samples without adapting step size
-  samples = np.zeros([num_chain_iter])
-  for i in xrange(num_chain_iter):
-    _, x_, target_log_prob_, grad_ = sess.run([
-        x_update,
-        x,
-        other_results.target_log_prob,
-        other_results.grads_target_log_prob])
-    samples[i] = x_
-
-  print(samples.mean(), samples.std())
-  ```
-
-  ##### Sample from more complicated posterior.
-
-  I.e.,
-
-  ```none
-    W ~ MVN(loc=0, scale=sigma * eye(dims))
-    for i=1...num_samples:
-        X[i] ~ MVN(loc=0, scale=eye(dims))
-      eps[i] ~ Normal(loc=0, scale=1)
-        Y[i] = X[i].T * W + eps[i]
-  ```
-
-  ```python
-  tfd = tf.contrib.distributions
-
-  def make_training_data(num_samples, dims, sigma):
-    dt = np.asarray(sigma).dtype
-    zeros = tf.zeros(dims, dtype=dt)
-    x = tfd.MultivariateNormalDiag(
-        loc=zeros).sample(num_samples, seed=1)
-    w = tfd.MultivariateNormalDiag(
-        loc=zeros,
-        scale_identity_multiplier=sigma).sample(seed=2)
-    noise = tfd.Normal(
-        loc=dt(0),
-        scale=dt(1)).sample(num_samples, seed=3)
-    y = tf.tensordot(x, w, axes=[[1], [0]]) + noise
-    return y, x, w
-
-  def make_prior(sigma, dims):
-    # p(w | sigma)
-    return tfd.MultivariateNormalDiag(
-        loc=tf.zeros([dims], dtype=sigma.dtype),
-        scale_identity_multiplier=sigma)
-
-  def make_likelihood(x, w):
-    # p(y | x, w)
-    return tfd.MultivariateNormalDiag(
-        loc=tf.tensordot(x, w, axes=[[1], [0]]))
-
-  # Setup assumptions.
-  dtype = np.float32
-  num_samples = 150
-  dims = 10
-  num_iters = int(5e3)
-
-  true_sigma = dtype(0.5)
-  y, x, true_weights = make_training_data(num_samples, dims, true_sigma)
-
-  # Estimate of `log(true_sigma)`.
-  log_sigma = tf.get_variable(name="log_sigma", initializer=dtype(0))
-  sigma = tf.exp(log_sigma)
-
-  # State of the Markov chain.
-  weights = tf.get_variable(
-      name="weights",
-      initializer=np.random.randn(dims).astype(dtype))
-
-  prior = make_prior(sigma, dims)
-
-  def joint_log_prob_fn(w):
-    # f(w) = log p(w, y | x)
-    return prior.log_prob(w) + make_likelihood(x, w).log_prob(y)
-
-  weights_update = weights.assign(
-      hmc.kernel(target_log_prob_fn=joint_log_prob,
-                 current_state=weights,
-                 step_size=0.1,
-                 num_leapfrog_steps=5)[0])
-
-  with tf.control_dependencies([weights_update]):
-    loss = -prior.log_prob(weights)
-
-  optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
-  log_sigma_update = optimizer.minimize(loss, var_list=[log_sigma])
-
-  sess.graph.finalize()  # No more graph building.
-
-  tf.global_variables_initializer().run()
-
-  sigma_history = np.zeros(num_iters, dtype)
-  weights_history = np.zeros([num_iters, dims], dtype)
-
-  for i in xrange(num_iters):
-    _, sigma_, weights_, _ = sess.run([log_sigma_update, sigma, weights])
-    weights_history[i, :] = weights_
-    sigma_history[i] = sigma_
-
-  true_weights_ = sess.run(true_weights)
-
-  # Should converge to something close to true_sigma.
-  plt.plot(sigma_history);
-  plt.ylabel("sigma");
-  plt.xlabel("iteration");
-  ```
-
-  Args:
-    target_log_prob_fn: Python callable which takes an argument like
-      `current_state` (or `*current_state` if it's a list) and returns its
-      (possibly unnormalized) log-density under the target distribution.
-    current_state: `Tensor` or Python `list` of `Tensor`s representing the
-      current state(s) of the Markov chain(s). The first `r` dimensions index
-      independent chains, `r = tf.rank(target_log_prob_fn(*current_state))`.
-    step_size: `Tensor` or Python `list` of `Tensor`s representing the step size
-      for the leapfrog integrator. Must broadcast with the shape of
-      `current_state`. Larger step sizes lead to faster progress, but too-large
-      step sizes make rejection exponentially more likely. When possible, it's
-      often helpful to match per-variable step sizes to the standard deviations
-      of the target distribution in each variable.
-    num_leapfrog_steps: Integer number of steps to run the leapfrog integrator
-      for. Total progress per HMC step is roughly proportional to `step_size *
-      num_leapfrog_steps`.
-    seed: Python integer to seed the random number generator.
-    current_target_log_prob: (Optional) `Tensor` representing the value of
-      `target_log_prob_fn` at the `current_state`. The only reason to
-      specify this argument is to reduce TF graph size.
-      Default value: `None` (i.e., compute as needed).
-    current_grads_target_log_prob: (Optional) Python list of `Tensor`s
-      representing gradient of `current_target_log_prob` at the `current_state`
-      and wrt the `current_state`. Must have same shape as `current_state`. The
-      only reason to specify this argument is to reduce TF graph size.
-      Default value: `None` (i.e., compute as needed).
-    name: Python `str` name prefixed to Ops created by this function.
-      Default value: `None` (i.e., "hmc_kernel").
-
-  Returns:
-    next_state: Tensor or Python list of `Tensor`s representing the state(s)
-      of the Markov chain(s) at each result step. Has same shape as
-      `current_state`.
-    kernel_results: `collections.namedtuple` of internal calculations used to
-      advance the chain.
-
-  Raises:
-    ValueError: if there isn't one `step_size` or a list with same length as
-      `current_state`.
-  """
-  with ops.name_scope(
-      name, "hmc_kernel",
-      [current_state, step_size, num_leapfrog_steps, seed,
-       current_target_log_prob, current_grads_target_log_prob]):
-    with ops.name_scope("initialize"):
-      [current_state_parts, step_sizes, current_target_log_prob,
-       current_grads_target_log_prob] = _prepare_args(
-           target_log_prob_fn, current_state, step_size,
-           current_target_log_prob, current_grads_target_log_prob,
-           maybe_expand=True)
-      independent_chain_ndims = distributions_util.prefer_static_rank(
-          current_target_log_prob)
-      current_momentums = []
-      for s in current_state_parts:
-        current_momentums.append(random_ops.random_normal(
-            shape=array_ops.shape(s),
-            dtype=s.dtype.base_dtype,
-            seed=seed))
-        seed = distributions_util.gen_new_seed(
-            seed, salt="hmc_kernel_momentums")
-
-      num_leapfrog_steps = ops.convert_to_tensor(
-          num_leapfrog_steps,
-          dtype=dtypes.int32,
-          name="num_leapfrog_steps")
-    [
-        proposed_momentums,
-        proposed_state_parts,
-        proposed_target_log_prob,
-        proposed_grads_target_log_prob,
-    ] = _leapfrog_integrator(current_momentums,
-                             target_log_prob_fn,
-                             current_state_parts,
-                             step_sizes,
-                             num_leapfrog_steps,
-                             current_target_log_prob,
-                             current_grads_target_log_prob)
-
-    energy_change = _compute_energy_change(current_target_log_prob,
-                                           current_momentums,
-                                           proposed_target_log_prob,
-                                           proposed_momentums,
-                                           independent_chain_ndims)
-    log_accept_ratio = -energy_change
-
-    # u < exp(log_accept_ratio),  where u~Uniform[0,1)
-    # ==> log(u) < log_accept_ratio
-    random_value = random_ops.random_uniform(
-        shape=array_ops.shape(energy_change),
-        dtype=energy_change.dtype,
-        seed=seed)
-    random_negative = math_ops.log(random_value)
-    is_accepted = random_negative < log_accept_ratio
-
-    accepted_target_log_prob = array_ops.where(is_accepted,
-                                               proposed_target_log_prob,
-                                               current_target_log_prob)
-
-    next_state_parts = [_choose(is_accepted,
-                                proposed_state_part,
-                                current_state_part,
-                                independent_chain_ndims)
-                        for current_state_part, proposed_state_part
-                        in zip(current_state_parts, proposed_state_parts)]
-
-    accepted_grads_target_log_prob = [
-        _choose(is_accepted,
-                proposed_grad,
-                grad,
-                independent_chain_ndims)
-        for proposed_grad, grad
-        in zip(proposed_grads_target_log_prob, current_grads_target_log_prob)]
-
-    maybe_flatten = lambda x: x if _is_list_like(current_state) else x[0]
-    return [
-        maybe_flatten(next_state_parts),
-        KernelResults(
-            log_accept_ratio=log_accept_ratio,
-            current_grads_target_log_prob=accepted_grads_target_log_prob,
-            current_target_log_prob=accepted_target_log_prob,
-            is_accepted=is_accepted,
-            proposed_grads_target_log_prob=proposed_grads_target_log_prob,
-            proposed_state=maybe_flatten(proposed_state_parts),
-            proposed_target_log_prob=proposed_target_log_prob,
-        ),
-    ]
-
-
-def _leapfrog_integrator(current_momentums,
-                         target_log_prob_fn,
-                         current_state_parts,
-                         step_sizes,
-                         num_leapfrog_steps,
-                         current_target_log_prob=None,
-                         current_grads_target_log_prob=None,
-                         name=None):
-  """Applies `num_leapfrog_steps` of the leapfrog integrator.
-
-  Assumes a simple quadratic kinetic energy function: `0.5 ||momentum||**2`.
-
-  #### Examples:
-
-  ##### Simple quadratic potential.
-
-  ```python
-  tfd = tf.contrib.distributions
-
-  dims = 10
-  num_iter = int(1e3)
-  dtype = np.float32
-
-  position = tf.placeholder(np.float32)
-  momentum = tf.placeholder(np.float32)
-
-  [
-      next_momentums,
-      next_positions,
-  ] = hmc._leapfrog_integrator(
-      current_momentums=[momentum],
-      target_log_prob_fn=tfd.MultivariateNormalDiag(
-          loc=tf.zeros(dims, dtype)).log_prob,
-      current_state_parts=[position],
-      step_sizes=0.1,
-      num_leapfrog_steps=3)[:2]
-
-  sess.graph.finalize()  # No more graph building.
-
-  momentum_ = np.random.randn(dims).astype(dtype)
-  position_ = np.random.randn(dims).astype(dtype)
-
-  positions = np.zeros([num_iter, dims], dtype)
-  for i in xrange(num_iter):
-    position_, momentum_ = sess.run(
-        [next_momentums[0], next_position[0]],
-        feed_dict={position: position_, momentum: momentum_})
-    positions[i] = position_
-
-  plt.plot(positions[:, 0]);  # Sinusoidal.
-  ```
-
-  Args:
-    current_momentums: Tensor containing the value(s) of the momentum
-      variable(s) to update.
-    target_log_prob_fn: Python callable which takes an argument like
-      `*current_state_parts` and returns its (possibly unnormalized) log-density
-      under the target distribution.
-    current_state_parts: Python `list` of `Tensor`s representing the current
-      state(s) of the Markov chain(s). The first `independent_chain_ndims` of
-      the `Tensor`(s) index different chains.
-    step_sizes: Python `list` of `Tensor`s representing the step size for the
-      leapfrog integrator. Must broadcast with the shape of
-      `current_state_parts`.  Larger step sizes lead to faster progress, but
-      too-large step sizes make rejection exponentially more likely. When
-      possible, it's often helpful to match per-variable step sizes to the
-      standard deviations of the target distribution in each variable.
-    num_leapfrog_steps: Integer number of steps to run the leapfrog integrator
-      for. Total progress per HMC step is roughly proportional to `step_size *
-      num_leapfrog_steps`.
-    current_target_log_prob: (Optional) `Tensor` representing the value of
-      `target_log_prob_fn(*current_state_parts)`. The only reason to specify
-      this argument is to reduce TF graph size.
-      Default value: `None` (i.e., compute as needed).
-    current_grads_target_log_prob: (Optional) Python list of `Tensor`s
-      representing gradient of `target_log_prob_fn(*current_state_parts`) wrt
-      `current_state_parts`. Must have same shape as `current_state_parts`. The
-      only reason to specify this argument is to reduce TF graph size.
-      Default value: `None` (i.e., compute as needed).
-    name: Python `str` name prefixed to Ops created by this function.
-      Default value: `None` (i.e., "hmc_leapfrog_integrator").
-
-  Returns:
-    proposed_momentums: Updated value of the momentum.
-    proposed_state_parts: Tensor or Python list of `Tensor`s representing the
-      state(s) of the Markov chain(s) at each result step. Has same shape as
-      input `current_state_parts`.
-    proposed_target_log_prob: `Tensor` representing the value of
-      `target_log_prob_fn` at `next_state`.
-    proposed_grads_target_log_prob: Gradient of `proposed_target_log_prob` wrt
-      `next_state`.
-
-  Raises:
-    ValueError: if `len(momentums) != len(state_parts)`.
-    ValueError: if `len(state_parts) != len(step_sizes)`.
-    ValueError: if `len(state_parts) != len(grads_target_log_prob)`.
-    TypeError: if `not target_log_prob.dtype.is_floating`.
-  """
-  def _loop_body(step,
-                 current_momentums,
-                 current_state_parts,
-                 ignore_current_target_log_prob,  # pylint: disable=unused-argument
-                 current_grads_target_log_prob):
-    return [step + 1] + list(_leapfrog_step(current_momentums,
-                                            target_log_prob_fn,
-                                            current_state_parts,
-                                            step_sizes,
-                                            current_grads_target_log_prob))
-
-  with ops.name_scope(
-      name, "hmc_leapfrog_integrator",
-      [current_momentums, current_state_parts, step_sizes, num_leapfrog_steps,
-       current_target_log_prob, current_grads_target_log_prob]):
-    if len(current_momentums) != len(current_state_parts):
-      raise ValueError("`momentums` must be in one-to-one correspondence "
-                       "with `state_parts`")
-    num_leapfrog_steps = ops.convert_to_tensor(num_leapfrog_steps,
-                                               name="num_leapfrog_steps")
-    current_target_log_prob, current_grads_target_log_prob = (
-        _maybe_call_fn_and_grads(
-            target_log_prob_fn,
-            current_state_parts,
-            current_target_log_prob,
-            current_grads_target_log_prob))
-    return control_flow_ops.while_loop(
-        cond=lambda iter_, *args: iter_ < num_leapfrog_steps,
-        body=_loop_body,
-        loop_vars=[
-            np.int32(0),  # iter_
-            current_momentums,
-            current_state_parts,
-            current_target_log_prob,
-            current_grads_target_log_prob,
-        ],
-        back_prop=False)[1:]  # Lop-off "iter_".
-
-
-def _leapfrog_step(current_momentums,
-                   target_log_prob_fn,
-                   current_state_parts,
-                   step_sizes,
-                   current_grads_target_log_prob,
-                   name=None):
-  """Applies one step of the leapfrog integrator."""
-  with ops.name_scope(
-      name, "_leapfrog_step",
-      [current_momentums, current_state_parts, step_sizes,
-       current_grads_target_log_prob]):
-    proposed_momentums = [m + 0.5 * ss * g for m, ss, g
-                          in zip(current_momentums,
-                                 step_sizes,
-                                 current_grads_target_log_prob)]
-    proposed_state_parts = [x + ss * m for x, ss, m
-                            in zip(current_state_parts,
-                                   step_sizes,
-                                   proposed_momentums)]
-    proposed_target_log_prob = target_log_prob_fn(*proposed_state_parts)
-    if not proposed_target_log_prob.dtype.is_floating:
-      raise TypeError("`target_log_prob_fn` must produce a `Tensor` "
-                      "with `float` `dtype`.")
-    proposed_grads_target_log_prob = gradients_ops.gradients(
-        proposed_target_log_prob, proposed_state_parts)
-    if any(g is None for g in proposed_grads_target_log_prob):
-      raise ValueError(
-          "Encountered `None` gradient. Does your target `target_log_prob_fn` "
-          "access all `tf.Variable`s via `tf.get_variable`?\n"
-          "  current_state_parts: {}\n"
-          "  proposed_state_parts: {}\n"
-          "  proposed_grads_target_log_prob: {}".format(
-              current_state_parts,
-              proposed_state_parts,
-              proposed_grads_target_log_prob))
-    proposed_momentums = [m + 0.5 * ss * g for m, ss, g
-                          in zip(proposed_momentums,
-                                 step_sizes,
-                                 proposed_grads_target_log_prob)]
-    return [
-        proposed_momentums,
-        proposed_state_parts,
-        proposed_target_log_prob,
-        proposed_grads_target_log_prob,
-    ]
-
-
-def _compute_energy_change(current_target_log_prob,
-                           current_momentums,
-                           proposed_target_log_prob,
-                           proposed_momentums,
-                           independent_chain_ndims,
-                           name=None):
-  """Helper to `kernel` which computes the energy change."""
-  with ops.name_scope(
-      name, "compute_energy_change",
-      ([current_target_log_prob, proposed_target_log_prob,
-        independent_chain_ndims] +
-       current_momentums + proposed_momentums)):
-    # Abbreviate lk0=log_kinetic_energy and lk1=proposed_log_kinetic_energy
-    # since they're a mouthful and lets us inline more.
-    lk0, lk1 = [], []
-    for current_momentum, proposed_momentum in zip(current_momentums,
-                                                   proposed_momentums):
-      axis = math_ops.range(independent_chain_ndims,
-                            array_ops.rank(current_momentum))
-      lk0.append(_log_sum_sq(current_momentum, axis))
-      lk1.append(_log_sum_sq(proposed_momentum, axis))
-
-    lk0 = -np.log(2.) + math_ops.reduce_logsumexp(array_ops.stack(lk0, axis=-1),
-                                                  axis=-1)
-    lk1 = -np.log(2.) + math_ops.reduce_logsumexp(array_ops.stack(lk1, axis=-1),
-                                                  axis=-1)
-    lp0 = -current_target_log_prob   # potential
-    lp1 = -proposed_target_log_prob  # proposed_potential
-    x = array_ops.stack([lp1, math_ops.exp(lk1), -lp0, -math_ops.exp(lk0)],
-                        axis=-1)
-
-    # The sum is NaN if any element is NaN or we see both +Inf and -Inf.
-    # Thus we will replace such rows with infinite energy change which implies
-    # rejection. Recall that float-comparisons with NaN are always False.
-    is_sum_determinate = (
-        math_ops.reduce_all(math_ops.is_finite(x) | (x >= 0.), axis=-1) &
-        math_ops.reduce_all(math_ops.is_finite(x) | (x <= 0.), axis=-1))
-    is_sum_determinate = array_ops.tile(
-        is_sum_determinate[..., array_ops.newaxis],
-        multiples=array_ops.concat([
-            array_ops.ones(array_ops.rank(is_sum_determinate),
-                           dtype=dtypes.int32),
-            [4],
-        ], axis=0))
-    x = array_ops.where(is_sum_determinate,
-                        x,
-                        array_ops.fill(array_ops.shape(x),
-                                       value=x.dtype.as_numpy_dtype(np.inf)))
-
-    return math_ops.reduce_sum(x, axis=-1)
-
-
-def _choose(is_accepted,
-            accepted,
-            rejected,
-            independent_chain_ndims,
-            name=None):
-  """Helper to `kernel` which expand_dims `is_accepted` to apply tf.where."""
-  def _expand_is_accepted_like(x):
-    with ops.name_scope("_choose"):
-      expand_shape = array_ops.concat([
-          array_ops.shape(is_accepted),
-          array_ops.ones([array_ops.rank(x) - array_ops.rank(is_accepted)],
-                         dtype=dtypes.int32),
-      ], axis=0)
-      multiples = array_ops.concat([
-          array_ops.ones([array_ops.rank(is_accepted)], dtype=dtypes.int32),
-          array_ops.shape(x)[independent_chain_ndims:],
-      ], axis=0)
-      m = array_ops.tile(array_ops.reshape(is_accepted, expand_shape),
-                         multiples)
-      m.set_shape(x.shape)
-      return m
-  with ops.name_scope(name, "_choose", values=[
-      is_accepted, accepted, rejected, independent_chain_ndims]):
-    return array_ops.where(_expand_is_accepted_like(accepted),
-                           accepted,
-                           rejected)
-
-
-def _maybe_call_fn_and_grads(fn,
-                             fn_arg_list,
-                             fn_result=None,
-                             grads_fn_result=None,
-                             description="target_log_prob"):
-  """Helper which computes `fn_result` and `grads` if needed."""
-  fn_arg_list = (list(fn_arg_list) if _is_list_like(fn_arg_list)
-                 else [fn_arg_list])
-  if fn_result is None:
-    fn_result = fn(*fn_arg_list)
-  if not fn_result.dtype.is_floating:
-    raise TypeError("`{}` must be a `Tensor` with `float` `dtype`.".format(
-        description))
-  if grads_fn_result is None:
-    grads_fn_result = gradients_ops.gradients(
-        fn_result, fn_arg_list)
-  if len(fn_arg_list) != len(grads_fn_result):
-    raise ValueError("`{}` must be in one-to-one correspondence with "
-                     "`grads_{}`".format(*[description]*2))
-  if any(g is None for g in grads_fn_result):
-    raise ValueError("Encountered `None` gradient.")
-  return fn_result, grads_fn_result
-
-
-def _prepare_args(target_log_prob_fn, state, step_size,
-                  target_log_prob=None, grads_target_log_prob=None,
-                  maybe_expand=False, description="target_log_prob"):
-  """Helper which processes input args to meet list-like assumptions."""
-  state_parts = list(state) if _is_list_like(state) else [state]
-  state_parts = [ops.convert_to_tensor(s, name="state")
-                 for s in state_parts]
-  target_log_prob, grads_target_log_prob = _maybe_call_fn_and_grads(
-      target_log_prob_fn,
-      state_parts,
-      target_log_prob,
-      grads_target_log_prob,
-      description)
-  step_sizes = list(step_size) if _is_list_like(step_size) else [step_size]
-  step_sizes = [
-      ops.convert_to_tensor(
-          s, name="step_size", dtype=target_log_prob.dtype)
-      for s in step_sizes]
-  if len(step_sizes) == 1:
-    step_sizes *= len(state_parts)
-  if len(state_parts) != len(step_sizes):
-    raise ValueError("There should be exactly one `step_size` or it should "
-                     "have same length as `current_state`.")
-  maybe_flatten = lambda x: x if maybe_expand or _is_list_like(state) else x[0]
-  return [
-      maybe_flatten(state_parts),
-      maybe_flatten(step_sizes),
-      target_log_prob,
-      grads_target_log_prob,
-  ]
-
-
-def _is_list_like(x):
-  """Helper which returns `True` if input is `list`-like."""
-  return isinstance(x, (tuple, list))
-
-
-def _log_sum_sq(x, axis=None):
-  """Computes log(sum(x**2))."""
-  return math_ops.reduce_logsumexp(2. * math_ops.log(math_ops.abs(x)), axis)
diff --git a/tensorflow/contrib/bayesflow/python/ops/metropolis_hastings_impl.py b/tensorflow/contrib/bayesflow/python/ops/metropolis_hastings_impl.py
deleted file mode 100644
index 05aa134ed5c11092316af5f3e45ba07fdb491e90..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/ops/metropolis_hastings_impl.py
+++ /dev/null
@@ -1,527 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Metropolis-Hastings and proposal distributions.
-
-@@kernel
-@@evolve
-@@proposal_uniform
-@@proposal_normal
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections
-
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import random_ops
-from tensorflow.python.ops import state_ops
-
-__all__ = [
-    "kernel",
-    "evolve",
-    "proposal_uniform",
-    "proposal_normal",
-]
-
-
-KernelResults = collections.namedtuple(
-    "KernelResults",
-    [
-        "log_accept_ratio",
-        "current_target_log_prob",  # "Current result" means "accepted".
-        "is_accepted",
-        "proposed_state",
-    ])
-
-
-def kernel(target_log_prob_fn,
-           proposal_fn,
-           current_state,
-           seed=None,
-           current_target_log_prob=None,
-           name=None):
-  """Runs the Metropolis-Hastings transition kernel.
-
-  This function can update multiple chains in parallel. It assumes that all
-  leftmost dimensions of `current_state` index independent chain states (and are
-  therefore updated independently). The output of `target_log_prob_fn()` should
-  sum log-probabilities across all event dimensions. Slices along the rightmost
-  dimensions may have different target distributions; for example,
-  `current_state[0, :]` could have a different target distribution from
-  `current_state[1, :]`. This is up to `target_log_prob_fn()`. (The number of
-  independent chains is `tf.size(target_log_prob_fn(*current_state))`.)
-
-  Args:
-    target_log_prob_fn: Python callable which takes an argument like
-      `current_state` (or `*current_state` if it's a list) and returns its
-      (possibly unnormalized) log-density under the target distribution.
-    proposal_fn: Python callable which takes an argument like `current_state`
-      (or `*current_state` if it's a list) and returns a tuple of proposed
-      states of same shape as `state`, and a log ratio `Tensor` of same shape
-      as `current_target_log_prob`. The log ratio is the log-probability of
-      `state` given proposed states minus the log-probability of proposed
-      states given `state`. If the proposal is symmetric, set the second value
-      to `None`: this enables more efficient computation than explicitly
-      supplying a tensor of zeros.
-    current_state: `Tensor` or Python `list` of `Tensor`s representing the
-      current state(s) of the Markov chain(s). The first `r` dimensions index
-      independent chains, `r = tf.rank(target_log_prob_fn(*current_state))`.
-    seed: Python integer to seed the random number generator.
-    current_target_log_prob: (Optional) `Tensor` representing the value of
-      `target_log_prob_fn` at the `current_state`. The only reason to
-      specify this argument is to reduce TF graph size.
-      Default value: `None` (i.e., compute as needed).
-    name: A name of the operation (optional).
-
-  Returns:
-    next_state: Tensor or Python list of `Tensor`s representing the state(s)
-      of the Markov chain(s) at each result step. Has same shape as
-      `current_state`.
-    kernel_results: `collections.namedtuple` of internal calculations used to
-      advance the chain.
-
-  #### Examples
-
-  We illustrate Metropolis-Hastings on a Normal likelihood with
-  unknown mean.
-
-  ```python
-  tfd = tf.contrib.distributions
-  tfp = tf.contrib.bayesflow
-
-  loc = tf.get_variable("loc", initializer=1.)
-  x = tf.constant([0.0] * 50)
-
-  def make_target_log_prob_fn(x):
-    def target_log_prob_fn(loc):
-      prior = tfd.Normal(loc=0., scale=1.)
-      likelihood = tfd.Independent(
-        tfd.Normal(loc=loc, scale=0.1),
-        reinterpreted_batch_ndims=1)
-      return prior.log_prob(loc) + likelihood.log_prob(x)
-    return target_log_prob_fn
-
-  next_state, kernel_results = tfp.metropolis_hastings.kernel(
-      target_log_prob_fn=make_target_log_prob_fn(x),
-      proposal_fn=tfp.metropolis_hastings.proposal_normal(),
-      current_state=loc)
-  loc_update = loc.assign(next_state)
-  ```
-
-  We illustrate Metropolis-Hastings on a Normal likelihood with
-  unknown mean and variance. We apply 4 chains.
-
-  ```python
-  tfd = tf.contrib.distributions
-  tfp = tf.contrib.bayesflow
-
-  num_chains = 4
-  loc = tf.get_variable("loc", shape=[num_chains],
-                        initializer=tf.random_normal_initializer())
-  scale = tf.get_variable("scale", shape=[num_chains],
-                          initializer=tf.ones_initializer())
-  x = tf.constant([0.0] * 50)
-
-  def make_target_log_prob_fn(x):
-    data = tf.reshape(x, shape=[-1, 1])
-    def target_log_prob_fn(loc, scale):
-      prior_loc = tfd.Normal(loc=0., scale=1.)
-      prior_scale = tfd.InverseGamma(concentration=1., rate=1.)
-      likelihood = tfd.Independent(
-        tfd.Normal(loc=loc, scale=scale),
-        reinterpreted_batch_ndims=1)
-      return (prior_loc.log_prob(loc) +
-              prior_scale.log_prob(scale) +
-              likelihood.log_prob(data))
-    return target_log_prob_fn
-
-  def proposal_fn(loc, scale):
-    loc_proposal = tfp.metropolis_hastings.proposal_normal()
-    scale_proposal = tfp.metropolis_hastings.proposal_uniform(minval=-1.)
-    proposed_loc, _ = loc_proposal(loc)
-    proposed_scale, _ = scale_proposal(scale)
-    proposed_scale = tf.maximum(proposed_scale, 0.01)
-    return [proposed_loc, proposed_scale], None
-
-  next_state, kernel_results = tfp.metropolis_hastings.kernel(
-      target_log_prob_fn=make_target_log_prob_fn(x),
-      proposal_fn=proposal_fn,
-      current_state=[loc, scale])
-  train_op = tf.group(loc.assign(next_state[0]),
-                      scale.assign(next_state[1]))
-  ```
-
-  """
-  with ops.name_scope(
-      name, "metropolis_hastings_kernel",
-      [current_state, seed, current_target_log_prob]):
-    with ops.name_scope("initialize"):
-      maybe_expand = lambda x: list(x) if _is_list_like(x) else [x]
-      current_state_parts = maybe_expand(current_state)
-      if current_target_log_prob is None:
-        current_target_log_prob = target_log_prob_fn(*current_state_parts)
-
-    proposed_state, log_transit_ratio = proposal_fn(*current_state_parts)
-    proposed_state_parts = maybe_expand(proposed_state)
-
-    proposed_target_log_prob = target_log_prob_fn(*proposed_state_parts)
-
-    with ops.name_scope(
-        "accept_reject",
-        [current_state_parts, proposed_state_parts,
-         current_target_log_prob, proposed_target_log_prob]):
-      log_accept_ratio = proposed_target_log_prob - current_target_log_prob
-      if log_transit_ratio is not None:
-        # If the log_transit_ratio is None, then assume the proposal is
-        # symmetric, i.e.,
-        #   log p(old | new) - log p(new | old) = 0.
-        log_accept_ratio += log_transit_ratio
-
-      # u < exp(log_accept_ratio),  where u~Uniform[0,1)
-      # ==> log(u) < log_accept_ratio
-      random_value = random_ops.random_uniform(
-          array_ops.shape(log_accept_ratio),
-          dtype=log_accept_ratio.dtype,
-          seed=seed)
-      random_negative = math_ops.log(random_value)
-      is_accepted = random_negative < log_accept_ratio
-      next_state_parts = [array_ops.where(is_accepted,
-                                          proposed_state_part,
-                                          current_state_part)
-                          for proposed_state_part, current_state_part in
-                          zip(proposed_state_parts, current_state_parts)]
-      accepted_log_prob = array_ops.where(is_accepted,
-                                          proposed_target_log_prob,
-                                          current_target_log_prob)
-    maybe_flatten = lambda x: x if _is_list_like(current_state) else x[0]
-    return [
-        maybe_flatten(next_state_parts),
-        KernelResults(
-            log_accept_ratio=log_accept_ratio,
-            current_target_log_prob=accepted_log_prob,
-            is_accepted=is_accepted,
-            proposed_state=maybe_flatten(proposed_state_parts),
-        ),
-    ]
-
-
-def evolve(initial_sample,
-           initial_log_density,
-           initial_log_accept_ratio,
-           target_log_prob_fn,
-           proposal_fn,
-           n_steps=1,
-           seed=None,
-           name=None):
-  """Performs `n_steps` of the Metropolis-Hastings update.
-
-  Given a probability density function, `f(x)` and a proposal scheme which
-  generates new points from old, this `Op` returns a tensor
-  which may be used to generate approximate samples from the target distribution
-  using the Metropolis-Hastings algorithm. These samples are from a Markov chain
-  whose equilibrium distribution matches the target distribution.
-
-  The probability distribution may have an unknown normalization constan.
-  We parameterize the probability density as follows:
-
-  ```none
-  f(x) = exp(L(x) + constant)
-  ```
-
-  Here `L(x)` is any continuous function with an (possibly unknown but finite)
-  upper bound, i.e. there exists a number beta such that
-  `L(x)< beta < infinity` for all x. The constant is the normalization needed
-  to make `f(x)` a probability density (as opposed to just a finite measure).
-
-  Although `initial_sample` can be arbitrary, a poor choice may result in a
-  slow-to-mix chain. In many cases the best choice is the one that maximizes
-  the target density, i.e., choose `initial_sample` such that
-  `f(initial_sample) >= f(x)` for all `x`.
-
-
-  If the support of the distribution is a strict subset of R^n (but of non zero
-  measure), then the unnormalized log-density `L(x)` should return `-infinity`
-  outside the support domain. This effectively forces the sampler to only
-  explore points in the regions of finite support.
-
-  Usage:
-  This function is meant to be wrapped up with some of the common proposal
-  schemes (e.g. random walk, Langevin diffusion etc) to produce a more user
-  friendly interface. However, it may also be used to create bespoke samplers.
-
-  The following example, demonstrates the use to generate a 1000 uniform random
-  walk Metropolis samplers run in parallel for the normal target distribution.
-
-  ```python
-  n = 3  # dimension of the problem
-
-  # Generate 1000 initial values randomly. Each of these would be an
-  # independent starting point for a Markov chain.
-  state = tf.get_variable(
-      "state",
-      initializer=tf.random_normal([1000, n],
-                                   mean=3.0,
-                                   dtype=tf.float64,
-                                   seed=42))
-
-  # Computes the log(p(x)) for the unit normal density and ignores the
-  # normalization constant.
-  def log_density(x):
-    return -tf.reduce_sum(x * x, reduction_indices=-1) / 2.0
-
-  # Initial log-density value
-  state_log_density = tf.get_variable(
-      "state_log_density",
-      initializer=log_density(state.initialized_value()))
-
-  # A variable to store the log_acceptance_ratio:
-  log_acceptance_ratio = tf.get_variable(
-      "log_acceptance_ratio",
-      initializer=tf.zeros([1000], dtype=tf.float64))
-
-  # Generates random proposals by moving each coordinate uniformly and
-  # independently in a box of size 2 centered around the current value.
-  # Returns the new point and also the log of the Hastings ratio (the
-  # ratio of the probability of going from the proposal to origin and the
-  # probability of the reverse transition). When this ratio is 1, the value
-  # may be omitted and replaced by None.
-  def random_proposal(x):
-    return (x + tf.random_uniform(tf.shape(x), minval=-1, maxval=1,
-                                  dtype=x.dtype, seed=12)), None
-
-  #  Create the op to propagate the chain for 100 steps.
-  stepper = mh.evolve(
-      state, state_log_density, log_acceptance_ratio,
-      log_density, random_proposal, n_steps=100, seed=123)
-  init = tf.initialize_all_variables()
-  with tf.Session() as sess:
-    sess.run(init)
-    # Run the chains for a total of 1000 steps and print out the mean across
-    # the chains every 100 iterations.
-    for n_iter in range(10):
-      # Executing the stepper advances the chain to the next state.
-      sess.run(stepper)
-      # Print out the current value of the mean(sample) for every dimension.
-      print(np.mean(sess.run(state), 0))
-    # Estimated covariance matrix
-    samples = sess.run(state)
-    print(np.cov(samples, rowvar=False))
-  ```
-
-  Args:
-    initial_sample: A float-like `tf.Variable` of any shape that can
-      be consumed by the `target_log_prob_fn` and `proposal_fn`
-      callables.
-    initial_log_density: Float-like `tf.Variable` with `dtype` and shape
-      equivalent  to `target_log_prob_fn(initial_sample)`, i.e., matching
-        the result of `target_log_prob_fn` invoked at `current_state`.
-    initial_log_accept_ratio: A `tf.Variable` with `dtype` and shape matching
-      `initial_log_density`. Stands for the log of Metropolis-Hastings
-      acceptance ratio after propagating the chain for `n_steps`.
-    target_log_prob_fn: A Python callable evaluated at
-      `current_state` and returning a float-like `Tensor` of log target-density
-      up to a normalizing constant. In other words,
-      `target_log_prob_fn(x) = log(g(x))`, where
-      `target_density = g(x)/Z` for some constant `A`. The shape of the input
-      tensor is the same as the shape of the `current_state`. The shape of the
-      output tensor is either
-        (a). Same as the input shape if the density being sampled is one
-          dimensional, or
-        (b). If the density is defined for `events` of shape
-          `event_shape = [E1, E2, ... Ee]`, then the input tensor should be of
-          shape `batch_shape + event_shape`, here `batch_shape = [B1, ..., Bb]`
-          and the result must be of shape [B1, ..., Bb]. For example, if the
-          distribution that is being sampled is a 10 dimensional normal,
-          then the input tensor may be of shape [100, 10] or [30, 20, 10]. The
-          last dimension will then be 'consumed' by `target_log_prob_fn`
-          and it should return tensors of shape [100] and [30, 20] respectively.
-    proposal_fn: A callable accepting a real valued `Tensor` of current sample
-      points and returning a tuple of two `Tensors`. The first element of the
-      pair should be a `Tensor` containing the proposal state and should have
-      the same shape as the input `Tensor`. The second element of the pair gives
-      the log of the ratio of the probability of transitioning from the
-      proposal points to the input points and the probability of transitioning
-      from the input points to the proposal points. If the proposal is
-      symmetric, i.e.
-      Probability(Proposal -> Current) = Probability(Current -> Proposal)
-      the second value should be set to None instead of explicitly supplying a
-      tensor of zeros. In addition to being convenient, this also leads to a
-      more efficient graph.
-    n_steps: A positive `int` or a scalar `int32` tensor. Sets the number of
-      iterations of the chain.
-    seed: `int` or None. The random seed for this `Op`. If `None`, no seed is
-      applied.
-    name: A string that sets the name for this `Op`.
-
-  Returns:
-    forward_step: an `Op` to step the Markov chain forward for `n_steps`.
-  """
-
-  with ops.name_scope(name, "metropolis_hastings", [initial_sample]):
-    current_state = initial_sample
-    current_target_log_prob = initial_log_density
-    log_accept_ratio = initial_log_accept_ratio
-
-    def step(i, current_state, current_target_log_prob, log_accept_ratio):
-      """Wrap single Markov chain iteration in `while_loop`."""
-      next_state, kernel_results = kernel(
-          target_log_prob_fn=target_log_prob_fn,
-          proposal_fn=proposal_fn,
-          current_state=current_state,
-          current_target_log_prob=current_target_log_prob,
-          seed=seed)
-      accepted_log_prob = kernel_results.current_target_log_prob
-      log_accept_ratio = kernel_results.log_accept_ratio
-      return i + 1, next_state, accepted_log_prob, log_accept_ratio
-
-    (_, accepted_state, accepted_target_log_prob, accepted_log_accept_ratio) = (
-        control_flow_ops.while_loop(
-            cond=lambda i, *ignored_args: i < n_steps,
-            body=step,
-            loop_vars=[
-                0,  # i
-                current_state,
-                current_target_log_prob,
-                log_accept_ratio,
-            ],
-            parallel_iterations=1 if seed is not None else 10,
-            # TODO(b/73775595): Confirm optimal setting of swap_memory.
-            swap_memory=1))
-
-    forward_step = control_flow_ops.group(
-        state_ops.assign(current_target_log_prob, accepted_target_log_prob),
-        state_ops.assign(current_state, accepted_state),
-        state_ops.assign(log_accept_ratio, accepted_log_accept_ratio))
-
-    return forward_step
-
-
-def proposal_uniform(step_size=1.,
-                     seed=None,
-                     name=None):
-  """Returns a callable that adds a random uniform tensor to the input.
-
-  This function returns a callable that accepts one `Tensor` argument of any
-  shape and a real data type (i.e. `tf.float32` or `tf.float64`). It adds a
-  sample from a random uniform distribution drawn from [-stepsize, stepsize]
-  to its input. It also returns the log of the ratio of the probability of
-  moving from the input point to the proposed point, but since this log ratio is
-  identically equal to 0 (because the probability of drawing a value `x` from
-  the symmetric uniform distribution is the same as the probability of drawing
-  `-x`), it simply returns None for the second element of the returned tuple.
-
-  Args:
-    step_size: A positive `float` or a scalar tensor of real dtype
-      controlling the scale of the uniform distribution.
-      If step_size = a, then draws are made uniformly from [-a, a].
-    seed: `int` or None. The random seed for this `Op`. If `None`, no seed is
-      applied.
-    name: A string that sets the name for this `Op`.
-
-  Returns:
-    proposal_fn:  A callable accepting one float-like `Tensor` and returning a
-      2-tuple. The first value in the tuple is a `Tensor` of the same shape and
-      dtype as the input argument and the second element of the tuple is None.
-  """
-
-  with ops.name_scope(name, "proposal_uniform", [step_size]):
-    step_size = ops.convert_to_tensor(step_size, name="step_size")
-
-    def proposal_fn(input_state, name=None):
-      """Adds a uniform perturbation to the input state.
-
-      Args:
-        input_state: A `Tensor` of any shape and real dtype.
-        name: A string that sets the name for this `Op`.
-
-      Returns:
-        proposal_state:  A float-like `Tensor` with `dtype` and shape matching
-          `input_state`.
-        log_transit_ratio: `None`. Proposal is symmetric.
-      """
-      with ops.name_scope(name, "proposer", [input_state]):
-        input_state = ops.convert_to_tensor(input_state, name="input_state")
-        return input_state + random_ops.random_uniform(
-            array_ops.shape(input_state),
-            minval=-step_size,
-            maxval=step_size,
-            seed=seed), None
-    return proposal_fn
-
-
-def proposal_normal(scale=1.,
-                    seed=None,
-                    name=None):
-  """Returns a callable that adds a random normal tensor to the input.
-
-  This function returns a callable that accepts one `Tensor` argument of any
-  shape and a real data type (i.e. `tf.float32` or `tf.float64`). The callable
-  adds a sample from a normal distribution with the supplied standard deviation
-  and zero mean to its input argument (called the proposal point).
-  The callable returns a tuple with the proposal point as the first element.
-  The second element is identically `None`. It is included so the callable is
-  compatible with the expected signature of the proposal scheme argument in the
-  `metropolis_hastings` function. A value of `None` indicates that the
-  probability of going from the input point to the proposal point is equal to
-  the probability of going from the proposal point to the input point.
-
-  Args:
-    scale: A positive `float` or a scalar tensor of any real dtype controlling
-      the scale of the normal distribution.
-    seed: `int` or None. The random seed for this `Op`. If `None`, no seed is
-      applied.
-    name: A string that sets the name for this `Op`.
-
-  Returns:
-    proposal_fn: A callable accepting one float-like `Tensor` and returning a
-      2-tuple. The first value in the tuple is a `Tensor` of the same shape and
-      dtype as the input argument and the second element of the tuple is None.
-  """
-
-  with ops.name_scope(name, "proposal_normal", [scale]):
-    scale = ops.convert_to_tensor(scale, name="scale")
-
-    def proposal_fn(input_state, name=None):
-      """Adds a normal perturbation to the input state.
-
-      Args:
-        input_state: A `Tensor` of any shape and real dtype.
-        name: A string that sets the name for this `Op`.
-
-      Returns:
-        proposal_state:  A float-like `Tensor` with `dtype` and shape matching
-          `input_state`.
-        log_transit_ratio: `None`. Proposal is symmetric.
-      """
-
-      with ops.name_scope(name, "proposer", [input_state]):
-        input_state = ops.convert_to_tensor(input_state, name="input_state")
-        return input_state + random_ops.random_normal(
-            array_ops.shape(input_state),
-            mean=0.,
-            stddev=scale,
-            dtype=scale.dtype,
-            seed=seed), None
-    return proposal_fn
-
-
-def _is_list_like(x):
-  """Helper which returns `True` if input is `list`-like."""
-  return isinstance(x, (tuple, list))
diff --git a/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py b/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
index 985177e897f443989e466d1a498c461a30aeb5cb..d193a8459d00b83580509c8de25d5f7801b195fe 100644
--- a/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
+++ b/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
@@ -44,14 +44,14 @@ def expectation_importance_sampler(f,
                                    n=None,
                                    seed=None,
                                    name='expectation_importance_sampler'):
-  r"""Monte Carlo estimate of `E_p[f(Z)] = E_q[f(Z) p(Z) / q(Z)]`.
+  r"""Monte Carlo estimate of `\\(E_p[f(Z)] = E_q[f(Z) p(Z) / q(Z)]\\)`.
 
-  With `p(z) := exp{log_p(z)}`, this `Op` returns
+  With `\\(p(z) := exp^{log_p(z)}\\)`, this `Op` returns
 
   ```
-  n^{-1} sum_{i=1}^n [ f(z_i) p(z_i) / q(z_i) ],  z_i ~ q,
-  \approx E_q[ f(Z) p(Z) / q(Z) ]
-  =       E_p[f(Z)]
+  \\(n^{-1} sum_{i=1}^n [ f(z_i) p(z_i) / q(z_i) ],  z_i ~ q,\\)
+  \\(\approx E_q[ f(Z) p(Z) / q(Z) ]\\)
+  \\(=       E_p[f(Z)]\\)
   ```
 
   This integral is done in log-space with max-subtraction to better handle the
@@ -95,9 +95,9 @@ def expectation_importance_sampler(f,
       log_values = log_f_z + log_p_z - q_log_prob_z
       return _logspace_mean(log_values)
 
-    # With f_plus(z) = max(0, f(z)), f_minus(z) = max(0, -f(z)),
-    # E_p[f(Z)] = E_p[f_plus(Z)] - E_p[f_minus(Z)]
-    #           = E_p[f_plus(Z) + 1] - E_p[f_minus(Z) + 1]
+    # With \\(f_{plus}(z) = max(0, f(z)), f_{minus}(z) = max(0, -f(z))\\),
+    # \\(E_p[f(Z)] = E_p[f_{plus}(Z)] - E_p[f_{minus}(Z)]\\)
+    # \\(          = E_p[f_{plus}(Z) + 1] - E_p[f_{minus}(Z) + 1]\\)
     # Without incurring bias, 1 is added to each to prevent zeros in logspace.
     # The logarithm is approximately linear around 1 + epsilon, so this is good
     # for small values of 'z' as well.
@@ -121,13 +121,13 @@ def expectation_importance_sampler_logspace(
     name='expectation_importance_sampler_logspace'):
   r"""Importance sampling with a positive function, in log-space.
 
-  With `p(z) := exp{log_p(z)}`, and `f(z) = exp{log_f(z)}`, this `Op`
-  returns
+  With `\\(p(z) := exp^{log_p(z)}\\)`, and `\\(f(z) = exp{log_f(z)}\\)`,
+  this `Op` returns
 
   ```
-  Log[ n^{-1} sum_{i=1}^n [ f(z_i) p(z_i) / q(z_i) ] ],  z_i ~ q,
-  \approx Log[ E_q[ f(Z) p(Z) / q(Z) ] ]
-  =       Log[E_p[f(Z)]]
+  \\(Log[ n^{-1} sum_{i=1}^n [ f(z_i) p(z_i) / q(z_i) ] ],  z_i ~ q,\\)
+  \\(\approx Log[ E_q[ f(Z) p(Z) / q(Z) ] ]\\)
+  \\(=       Log[E_p[f(Z)]]\\)
   ```
 
   This integral is done in log-space with max-subtraction to better handle the
@@ -196,12 +196,12 @@ def _logspace_mean(log_values):
 
 def expectation(f, samples, log_prob=None, use_reparametrization=True,
                 axis=0, keep_dims=False, name=None):
-  """Computes the Monte-Carlo approximation of `E_p[f(X)]`.
+  """Computes the Monte-Carlo approximation of `\\(E_p[f(X)]\\)`.
 
   This function computes the Monte-Carlo approximation of an expectation, i.e.,
 
   ```none
-  E_p[f(X)] approx= m**-1 sum_i^m f(x_j),  x_j ~iid p(X)
+  \\(E_p[f(X)] \approx= m^{-1} sum_i^m f(x_j),  x_j\  ~iid\ p(X)\\)
   ```
 
   where:
@@ -216,8 +216,8 @@ def expectation(f, samples, log_prob=None, use_reparametrization=True,
   parameterless distribution (e.g.,
   `Normal(Y; m, s) <=> Y = sX + m, X ~ Normal(0,1)`), we can swap gradient and
   expectation, i.e.,
-  `grad[ Avg{ s_i : i=1...n } ] = Avg{ grad[s_i] : i=1...n }` where
-  `S_n = Avg{s_i}` and `s_i = f(x_i), x_i ~ p`.
+  `grad[ Avg{ \\(s_i : i=1...n\\) } ] = Avg{ grad[\\(s_i\\)] : i=1...n }` where
+  `S_n = Avg{\\(s_i\\)}` and `\\(s_i = f(x_i), x_i ~ p\\)`.
 
   However, if p is not reparameterized, TensorFlow's gradient will be incorrect
   since the chain-rule stops at samples of non-reparameterized distributions.
@@ -296,7 +296,8 @@ def expectation(f, samples, log_prob=None, use_reparametrization=True,
   Args:
     f: Python callable which can return `f(samples)`.
     samples: `Tensor` of samples used to form the Monte-Carlo approximation of
-      `E_p[f(X)]`.  A batch of samples should be indexed by `axis` dimensions.
+      `\\(E_p[f(X)]\\)`.  A batch of samples should be indexed by `axis`
+      dimensions.
     log_prob: Python callable which can return `log_prob(samples)`. Must
       correspond to the natural-logarithm of the pdf/pmf of each sample. Only
       required/used if `use_reparametrization=False`.
@@ -316,7 +317,7 @@ def expectation(f, samples, log_prob=None, use_reparametrization=True,
 
   Returns:
     approx_expectation: `Tensor` corresponding to the Monte-Carlo approximation
-      of `E_p[f(X)]`.
+      of `\\(E_p[f(X)]\\)`.
 
   Raises:
     ValueError: if `f` is not a Python `callable`.
diff --git a/tensorflow/contrib/boosted_trees/BUILD b/tensorflow/contrib/boosted_trees/BUILD
index 6fdcd0f996ee011842a5add79f06264a28a2145c..8eac1243ef63dd09c5c5dad4bcd9bd7a15f58900 100644
--- a/tensorflow/contrib/boosted_trees/BUILD
+++ b/tensorflow/contrib/boosted_trees/BUILD
@@ -14,15 +14,6 @@ load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
 load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
 load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = ["**/OWNERS"],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 package_group(name = "friends")
 
 cc_library(
@@ -128,7 +119,7 @@ py_library(
 
 py_test(
     name = "gbdt_batch_test",
-    size = "small",
+    size = "medium",
     srcs = ["python/training/functions/gbdt_batch_test.py"],
     srcs_version = "PY2AND3",
     tags = [
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/BUILD b/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
index 289f5bb3140974d8c37f4938ceef27275b099f9a..17e20c4b315bab8852c90788567a2f2f92119f40 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
@@ -10,23 +10,17 @@ package(
 
 load("//tensorflow:tensorflow.bzl", "py_test")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 py_library(
     name = "init_py",
-    srcs = [
-        "__init__.py",
-    ],
+    srcs = ["__init__.py"],
     srcs_version = "PY2AND3",
+    deps = [
+        "custom_export_strategy",
+        ":custom_loss_head",
+        ":estimator",
+        ":model",
+        ":trainer_hooks",
+    ],
 )
 
 py_library(
@@ -149,7 +143,7 @@ py_library(
 
 py_test(
     name = "dnn_tree_combined_estimator_test",
-    size = "small",
+    size = "medium",
     srcs = ["dnn_tree_combined_estimator_test.py"],
     srcs_version = "PY2AND3",
     tags = [
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py b/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py
index 23ba76210b3b68d0d0b2eef9d4040882654bdad9..d9b0d89a03dce40d34f76bb1262d26bb587a2dc7 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py
@@ -54,7 +54,7 @@ def make_custom_export_strategy(name,
     An `ExportStrategy`.
   """
   base_strategy = saved_model_export_utils.make_export_strategy(
-      serving_input_fn=export_input_fn)
+      serving_input_fn=export_input_fn, strip_default_attrs=True)
   input_fn = export_input_fn()
   (sorted_feature_names, dense_floats, sparse_float_indices, _, _,
    sparse_int_indices, _, _) = gbdt_batch.extract_features(
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py
index cec3892b57655dc967b4e7926f7f5a6a30084487..2e7b8cba05b89feaac3f47e13d26e7ae37a7b0ae 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py
@@ -25,15 +25,20 @@ from __future__ import division
 from __future__ import print_function
 
 import six
-
 from tensorflow.contrib import layers
 from tensorflow.contrib.boosted_trees.estimator_batch import trainer_hooks
 from tensorflow.contrib.boosted_trees.python.ops import model_ops
 from tensorflow.contrib.boosted_trees.python.training.functions import gbdt_batch
 from tensorflow.contrib.layers.python.layers import optimizers
+from tensorflow.contrib.learn.python.learn.estimators import constants
 from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
 from tensorflow.contrib.learn.python.learn.estimators import model_fn
+from tensorflow.contrib.learn.python.learn.estimators import model_fn as contrib_model_fn_lib
+from tensorflow.contrib.learn.python.learn.estimators import prediction_key
+from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.estimator.export import export_output
+from tensorflow.python.feature_column import feature_column as feature_column_lib
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import nn
@@ -46,6 +51,52 @@ from tensorflow.python.training import training_util
 
 _DNN_LEARNING_RATE = 0.001
 
+_CORE_MODE_TO_CONTRIB_MODE_ = {
+    model_fn_lib.ModeKeys.TRAIN: contrib_model_fn_lib.ModeKeys.TRAIN,
+    model_fn_lib.ModeKeys.EVAL: contrib_model_fn_lib.ModeKeys.EVAL,
+    model_fn_lib.ModeKeys.PREDICT: contrib_model_fn_lib.ModeKeys.INFER
+}
+
+
+def _core_mode_to_contrib_mode(mode):
+  return _CORE_MODE_TO_CONTRIB_MODE_[mode]
+
+
+def _export_outputs_to_output_alternatives(export_outputs):
+  """Converts EstimatorSpec.export_outputs to output_alternatives.
+
+  Args:
+    export_outputs: export_outputs created by create_estimator_spec.
+  Returns:
+    converted output_alternatives.
+  """
+  output = dict()
+  if export_outputs is not None:
+    for key, value in export_outputs.items():
+      if isinstance(value, export_output.ClassificationOutput):
+        exported_predictions = {
+            prediction_key.PredictionKey.SCORES: value.scores,
+            prediction_key.PredictionKey.CLASSES: value.classes
+        }
+        output[key] = (constants.ProblemType.CLASSIFICATION,
+                       exported_predictions)
+    return output
+  return None
+
+
+def _estimator_spec_to_model_fn_ops(estimator_spec, is_regression):
+  alternatives = []
+  if not is_regression:
+    _export_outputs_to_output_alternatives(estimator_spec.export_outputs)
+
+  return model_fn.ModelFnOps(
+      mode=_core_mode_to_contrib_mode(estimator_spec.mode),
+      predictions=estimator_spec.predictions,
+      loss=estimator_spec.loss,
+      train_op=estimator_spec.train_op,
+      eval_metric_ops=estimator_spec.eval_metric_ops,
+      output_alternatives=alternatives)
+
 
 def _get_optimizer(optimizer):
   if callable(optimizer):
@@ -59,16 +110,26 @@ def _add_hidden_layer_summary(value, tag):
   summary.histogram("%s_activation" % tag, value)
 
 
-def _dnn_tree_combined_model_fn(
-    features, labels, mode, head, dnn_hidden_units,
-    dnn_feature_columns, tree_learner_config, num_trees,
-    tree_examples_per_layer,
-    config=None, dnn_optimizer="Adagrad",
-    dnn_activation_fn=nn.relu, dnn_dropout=None,
-    dnn_input_layer_partitioner=None,
-    dnn_input_layer_to_tree=True, dnn_steps_to_train=10000,
-    tree_feature_columns=None,
-    tree_center_bias=True):
+def _dnn_tree_combined_model_fn(features,
+                                labels,
+                                mode,
+                                head,
+                                dnn_hidden_units,
+                                dnn_feature_columns,
+                                tree_learner_config,
+                                num_trees,
+                                tree_examples_per_layer,
+                                config=None,
+                                dnn_optimizer="Adagrad",
+                                dnn_activation_fn=nn.relu,
+                                dnn_dropout=None,
+                                dnn_input_layer_partitioner=None,
+                                dnn_input_layer_to_tree=True,
+                                dnn_steps_to_train=10000,
+                                tree_feature_columns=None,
+                                tree_center_bias=False,
+                                use_core_versions=False,
+                                is_regression=False):
   """DNN and GBDT combined model_fn.
 
   Args:
@@ -106,6 +167,9 @@ def _dnn_tree_combined_model_fn(
       set to True, these features are in addition to dnn_feature_columns.
     tree_center_bias: Whether a separate tree should be created for
       first fitting the bias.
+    use_core_versions: Whether feature columns and loss are from the core (as
+      opposed to contrib) version of tensorflow.
+    is_regression: Whether the problem is regression or not.
 
   Returns:
     A `ModelFnOps` object.
@@ -135,11 +199,17 @@ def _dnn_tree_combined_model_fn(
         "input_from_feature_columns",
         values=tuple(six.itervalues(features)),
         partitioner=dnn_partitioner) as input_layer_scope:
-      input_layer = layers.input_from_feature_columns(
-          columns_to_tensors=features,
-          feature_columns=dnn_feature_columns,
-          weight_collections=[dnn_parent_scope],
-          scope=input_layer_scope)
+      if use_core_versions:
+        input_layer = feature_column_lib.input_layer(
+            features=features,
+            feature_columns=dnn_feature_columns,
+            weight_collections=[dnn_parent_scope])
+      else:
+        input_layer = layers.input_from_feature_columns(
+            columns_to_tensors=features,
+            feature_columns=dnn_feature_columns,
+            weight_collections=[dnn_parent_scope],
+            scope=input_layer_scope)
     previous_layer = input_layer
     for layer_id, num_hidden_units in enumerate(dnn_hidden_units):
       with variable_scope.variable_scope(
@@ -222,24 +292,51 @@ def _dnn_tree_combined_model_fn(
     del loss
     return control_flow_ops.no_op()
 
-  model_fn_ops = head.create_model_fn_ops(
-      features=features,
-      mode=mode,
-      labels=labels,
-      train_op_fn=_no_train_op_fn,
-      logits=tree_train_logits)
-  dnn_train_op = head.create_model_fn_ops(
-      features=features,
-      mode=mode,
-      labels=labels,
-      train_op_fn=_dnn_train_op_fn,
-      logits=dnn_logits).train_op
-  tree_train_op = head.create_model_fn_ops(
-      features=tree_features,
-      mode=mode,
-      labels=labels,
-      train_op_fn=_tree_train_op_fn,
-      logits=tree_train_logits).train_op
+  if use_core_versions:
+    model_fn_ops = head.create_estimator_spec(
+        features=features,
+        mode=mode,
+        labels=labels,
+        train_op_fn=_no_train_op_fn,
+        logits=tree_train_logits)
+    dnn_train_op = head.create_estimator_spec(
+        features=features,
+        mode=mode,
+        labels=labels,
+        train_op_fn=_dnn_train_op_fn,
+        logits=dnn_logits)
+    dnn_train_op = _estimator_spec_to_model_fn_ops(dnn_train_op,
+                                                   is_regression).train_op
+
+    tree_train_op = head.create_estimator_spec(
+        features=tree_features,
+        mode=mode,
+        labels=labels,
+        train_op_fn=_tree_train_op_fn,
+        logits=tree_train_logits)
+    tree_train_op = _estimator_spec_to_model_fn_ops(tree_train_op,
+                                                    is_regression).train_op
+
+    model_fn_ops = _estimator_spec_to_model_fn_ops(model_fn_ops, is_regression)
+  else:
+    model_fn_ops = head.create_model_fn_ops(
+        features=features,
+        mode=mode,
+        labels=labels,
+        train_op_fn=_no_train_op_fn,
+        logits=tree_train_logits)
+    dnn_train_op = head.create_model_fn_ops(
+        features=features,
+        mode=mode,
+        labels=labels,
+        train_op_fn=_dnn_train_op_fn,
+        logits=dnn_logits).train_op
+    tree_train_op = head.create_model_fn_ops(
+        features=tree_features,
+        mode=mode,
+        labels=labels,
+        train_op_fn=_tree_train_op_fn,
+        logits=tree_train_logits).train_op
 
   if tree_center_bias:
     num_trees += 1
@@ -277,7 +374,8 @@ class DNNBoostedTreeCombinedClassifier(estimator.Estimator):
                dnn_input_layer_to_tree=True,
                dnn_steps_to_train=10000,
                tree_feature_columns=None,
-               tree_center_bias=True):
+               tree_center_bias=False,
+               use_core_versions=False):
     """Initializes a DNNBoostedTreeCombinedClassifier instance.
 
     Args:
@@ -322,6 +420,8 @@ class DNNBoostedTreeCombinedClassifier(estimator.Estimator):
         set to True, these features are in addition to dnn_feature_columns.
       tree_center_bias: Whether a separate tree should be created for
         first fitting the bias.
+      use_core_versions: Whether feature columns and loss are from the core (as
+        opposed to contrib) version of tensorflow.
     """
     head = head_lib.multi_class_head(
         n_classes=n_classes,
@@ -336,8 +436,8 @@ class DNNBoostedTreeCombinedClassifier(estimator.Estimator):
           tree_learner_config, num_trees, tree_examples_per_layer, config,
           dnn_optimizer, dnn_activation_fn, dnn_dropout,
           dnn_input_layer_partitioner, dnn_input_layer_to_tree,
-          dnn_steps_to_train,
-          tree_feature_columns, tree_center_bias)
+          dnn_steps_to_train, tree_feature_columns, tree_center_bias,
+          use_core_versions)
 
     super(DNNBoostedTreeCombinedClassifier, self).__init__(
         model_fn=_model_fn, model_dir=model_dir,
@@ -366,7 +466,8 @@ class DNNBoostedTreeCombinedRegressor(estimator.Estimator):
                dnn_input_layer_to_tree=True,
                dnn_steps_to_train=10000,
                tree_feature_columns=None,
-               tree_center_bias=True):
+               tree_center_bias=False,
+               use_core_versions=False):
     """Initializes a DNNBoostedTreeCombinedRegressor instance.
 
     Args:
@@ -411,6 +512,8 @@ class DNNBoostedTreeCombinedRegressor(estimator.Estimator):
         set to True, these features are in addition to dnn_feature_columns.
       tree_center_bias: Whether a separate tree should be created for
         first fitting the bias.
+      use_core_versions: Whether feature columns and loss are from the core (as
+        opposed to contrib) version of tensorflow.
     """
     head = head_lib.regression_head(
         label_name=label_name,
@@ -426,11 +529,26 @@ class DNNBoostedTreeCombinedRegressor(estimator.Estimator):
 
     def _model_fn(features, labels, mode, config):
       return _dnn_tree_combined_model_fn(
-          features, labels, mode, head, dnn_hidden_units, dnn_feature_columns,
-          tree_learner_config, num_trees, tree_examples_per_layer, config,
-          dnn_optimizer, dnn_activation_fn, dnn_dropout,
-          dnn_input_layer_partitioner, dnn_input_layer_to_tree,
-          dnn_steps_to_train, tree_feature_columns, tree_center_bias)
+          features,
+          labels,
+          mode,
+          head,
+          dnn_hidden_units,
+          dnn_feature_columns,
+          tree_learner_config,
+          num_trees,
+          tree_examples_per_layer,
+          config,
+          dnn_optimizer,
+          dnn_activation_fn,
+          dnn_dropout,
+          dnn_input_layer_partitioner,
+          dnn_input_layer_to_tree,
+          dnn_steps_to_train,
+          tree_feature_columns,
+          tree_center_bias,
+          use_core_versions,
+          is_regression=True)
 
     super(DNNBoostedTreeCombinedRegressor, self).__init__(
         model_fn=_model_fn, model_dir=model_dir,
@@ -460,7 +578,8 @@ class DNNBoostedTreeCombinedEstimator(estimator.Estimator):
                dnn_input_layer_to_tree=True,
                dnn_steps_to_train=10000,
                tree_feature_columns=None,
-               tree_center_bias=True):
+               tree_center_bias=False,
+               use_core_versions=False):
     """Initializes a DNNBoostedTreeCombinedEstimator instance.
 
     Args:
@@ -500,6 +619,8 @@ class DNNBoostedTreeCombinedEstimator(estimator.Estimator):
         set to True, these features are in addition to dnn_feature_columns.
       tree_center_bias: Whether a separate tree should be created for
         first fitting the bias.
+      use_core_versions: Whether feature columns and loss are from the core (as
+        opposed to contrib) version of tensorflow.
     """
     def _model_fn(features, labels, mode, config):
       return _dnn_tree_combined_model_fn(
@@ -507,8 +628,8 @@ class DNNBoostedTreeCombinedEstimator(estimator.Estimator):
           tree_learner_config, num_trees, tree_examples_per_layer, config,
           dnn_optimizer, dnn_activation_fn, dnn_dropout,
           dnn_input_layer_partitioner, dnn_input_layer_to_tree,
-          dnn_steps_to_train,
-          tree_feature_columns, tree_center_bias)
+          dnn_steps_to_train, tree_feature_columns, tree_center_bias,
+          use_core_versions)
 
     super(DNNBoostedTreeCombinedEstimator, self).__init__(
         model_fn=_model_fn, model_dir=model_dir,
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py
index 83d58c561008e8a5a69eb503d1605bb9e940f281..f495edc62f0909880c170ccb4cf5d11e3f20f55c 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py
@@ -19,15 +19,17 @@ from __future__ import division
 from __future__ import print_function
 
 import tempfile
-
 from tensorflow.contrib.boosted_trees.estimator_batch import dnn_tree_combined_estimator as estimator
 from tensorflow.contrib.boosted_trees.proto import learner_pb2
 from tensorflow.contrib.layers.python.layers import feature_column
 from tensorflow.contrib.learn.python.learn.estimators import estimator_test_utils
 from tensorflow.contrib.learn.python.learn.estimators import run_config
+from tensorflow.python.estimator.canned import head as head_lib
+from tensorflow.python.feature_column import feature_column_lib as core_feature_column
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import googletest
 
 
@@ -100,6 +102,35 @@ class DNNBoostedTreeCombinedTest(test_util.TensorFlowTestCase):
     classifier.fit(input_fn=_train_input_fn, steps=15)
     classifier.evaluate(input_fn=_eval_input_fn, steps=1)
 
+  def testFitAndEvaluateDontThrowExceptionWithCore(self):
+    learner_config = learner_pb2.LearnerConfig()
+    learner_config.num_classes = 2
+    learner_config.constraints.max_tree_depth = 1
+    model_dir = tempfile.mkdtemp()
+    config = run_config.RunConfig()
+
+    # Use core head
+    head_fn = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+        loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
+
+    classifier = estimator.DNNBoostedTreeCombinedEstimator(
+        head=head_fn,
+        dnn_hidden_units=[1],
+        # Use core feature columns
+        dnn_feature_columns=[core_feature_column.numeric_column("x")],
+        tree_learner_config=learner_config,
+        num_trees=1,
+        tree_examples_per_layer=3,
+        model_dir=model_dir,
+        config=config,
+        dnn_steps_to_train=10,
+        dnn_input_layer_to_tree=True,
+        tree_feature_columns=[],
+        use_core_versions=True)
+
+    classifier.fit(input_fn=_train_input_fn, steps=15)
+    classifier.evaluate(input_fn=_eval_input_fn, steps=1)
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py
index 01752416b347dd0a5e646283b6b5572592df4690..70454aa6dbdb19297028a3f80822719bef5a0f72 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py
@@ -81,7 +81,8 @@ class GradientBoostedDecisionTreeClassifier(estimator.Estimator):
         n_classes=n_classes,
         weight_column_name=weight_column_name,
         enable_centered_bias=False,
-        loss_fn=loss_fn)
+        loss_fn=loss_fn,
+        label_keys=label_keys)
     if learner_config.num_classes == 0:
       learner_config.num_classes = n_classes
     elif learner_config.num_classes != n_classes:
diff --git a/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc b/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc
index 0f4c2298f56be48bb32f52d5d44cff8afe284f1e..0b28f81e7ca9a1228adc5bde19c429265e0aa9b8 100644
--- a/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc
@@ -253,7 +253,7 @@ class CreateQuantileAccumulatorOp : public OpKernel {
  private:
   float epsilon_;
   int32 num_quantiles_;
-  // An upperbound on the number of enteries that the summaries might have
+  // An upper bound on the number of entries that the summaries might have
   // for a feature.
   int64 max_elements_;
   bool generate_quantiles_;
diff --git a/tensorflow/contrib/boosted_trees/lib/BUILD b/tensorflow/contrib/boosted_trees/lib/BUILD
index 131bd48562a55a08981ac73277e93024db0d85d3..3028c2281705bd7e34b212332160d25386559d4e 100644
--- a/tensorflow/contrib/boosted_trees/lib/BUILD
+++ b/tensorflow/contrib/boosted_trees/lib/BUILD
@@ -15,17 +15,6 @@ load("//tensorflow:tensorflow.bzl", "py_test")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 # Utils
 
 cc_library(
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/batch_features.cc b/tensorflow/contrib/boosted_trees/lib/utils/batch_features.cc
index cf4f9a097a3368465fd4d9afb981bbaa68b4df49..35b059f3496dbc8fb2b3d4fe6ec6b55a9d73dd0c 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/batch_features.cc
+++ b/tensorflow/contrib/boosted_trees/lib/utils/batch_features.cc
@@ -54,7 +54,7 @@ Status BatchFeatures::Initialize(
     TF_CHECK_AND_RETURN_IF_ERROR(
         dense_float_feature.dim_size(1) == 1,
         errors::InvalidArgument(
-            "Dense float features may not be multi-valent: dim_size(1) = ",
+            "Dense float features may not be multivalent: dim_size(1) = ",
             dense_float_feature.dim_size(1)));
     dense_float_feature_columns_.emplace_back(dense_float_feature);
   }
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/batch_features.h b/tensorflow/contrib/boosted_trees/lib/utils/batch_features.h
index 7815fa049aa165a944c45872c762b7a5bf91b316..a3b1b013e3a40116f74d6ed2df78d87ed3a11ac7 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/batch_features.h
+++ b/tensorflow/contrib/boosted_trees/lib/utils/batch_features.h
@@ -48,9 +48,9 @@ class BatchFeatures {
   Status GetFeatureColumnSizes(int64* const num_dense_float_features,
                                int64* const num_sparse_float_features,
                                int64* const num_sparse_int_features) const {
-    QCHECK_NE(num_dense_float_features, (int64*) nullptr);
-    QCHECK_NE(num_sparse_float_features, (int64*) nullptr);
-    QCHECK_NE(num_sparse_int_features, (int64*) nullptr);
+    QCHECK_NE(num_dense_float_features, static_cast<int64*>(nullptr));
+    QCHECK_NE(num_sparse_float_features, static_cast<int64*>(nullptr));
+    QCHECK_NE(num_sparse_int_features, static_cast<int64*>(nullptr));
     *num_dense_float_features = dense_float_feature_columns_.size();
     *num_sparse_float_features = sparse_float_feature_columns_.size();
     *num_sparse_int_features = sparse_int_feature_columns_.size();
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/batch_features_test.cc b/tensorflow/contrib/boosted_trees/lib/utils/batch_features_test.cc
index 609519e8b1153a27d987c5f9ca9bfcc9ee6717d6..cfe9101e7435cd798569f3e52a87fc8ed7b6a239 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/batch_features_test.cc
+++ b/tensorflow/contrib/boosted_trees/lib/utils/batch_features_test.cc
@@ -59,7 +59,7 @@ TEST_F(BatchFeaturesTest, DenseFloatFeatures_Multivalent) {
   BatchFeatures batch_features(1);
   auto dense_vec = AsTensor<float>({3.0f, 7.0f}, {1, 2});
   auto expected_error = InvalidArgument(
-      "Dense float features may not be multi-valent: dim_size(1) = 2");
+      "Dense float features may not be multivalent: dim_size(1) = 2");
   EXPECT_EQ(expected_error,
             batch_features.Initialize({dense_vec}, {}, {}, {}, {}, {}, {}));
 }
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.cc b/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.cc
index db34db998a7442c69f2ab468f4557d991429f4ee..ce67db797ded54f5023eaa89369d4781aad31a7c 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.cc
+++ b/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.cc
@@ -54,7 +54,7 @@ Status DropoutUtils::DropOutTrees(
   if (probability_of_skipping_dropout < 0 ||
       probability_of_skipping_dropout > 1) {
     return errors::InvalidArgument(
-        "Probability of skiping dropout must be in [0,1] range");
+        "Probability of skipping dropout must be in [0,1] range");
   }
   const auto num_trees = weights.size();
 
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.h b/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.h
index 928bfbfe5c9394ab4083aabced4c8e1149bb10aa..77c16da5410fe65b20839c7b6bc677067d7ff297 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.h
+++ b/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.h
@@ -66,7 +66,7 @@ class DropoutUtils {
       // Current weights and num_updates will be updated as a result of this
       // func
       std::vector<float>* current_weights,
-      // How many weight assignements have been done for each tree already.
+      // How many weight assignments have been done for each tree already.
       std::vector<int32>* num_updates);
 };
 
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/sparse_column_iterable_test.cc b/tensorflow/contrib/boosted_trees/lib/utils/sparse_column_iterable_test.cc
index 0138aae3dbd3773241cb6644db625b99f9bf1372..cc7604745e6bb90837eeca1123faa88dc914e4fc 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/sparse_column_iterable_test.cc
+++ b/tensorflow/contrib/boosted_trees/lib/utils/sparse_column_iterable_test.cc
@@ -34,7 +34,7 @@ TEST_F(SparseColumnIterableTest, Empty) {
 }
 
 TEST_F(SparseColumnIterableTest, Iterate) {
-  // 8 examples having 7 sparse features with the 3rd and 7th multi-valent.
+  // 8 examples having 7 sparse features with the 3rd and 7th multivalent.
   // This can be visualized like the following:
   // Instance | Sparse |
   // 0        |  x     |
diff --git a/tensorflow/contrib/boosted_trees/proto/BUILD b/tensorflow/contrib/boosted_trees/proto/BUILD
index 9a61e163eb5ff51dc75de4e40e0f43b090d03c0c..b07f0a4314246eea63764bb6d5e166dd720644fb 100644
--- a/tensorflow/contrib/boosted_trees/proto/BUILD
+++ b/tensorflow/contrib/boosted_trees/proto/BUILD
@@ -4,17 +4,6 @@ exports_files(["LICENSE"])
 
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tf_proto_library(
     name = "learner_proto",
     srcs = [
diff --git a/tensorflow/contrib/boosted_trees/proto/tree_config.proto b/tensorflow/contrib/boosted_trees/proto/tree_config.proto
index 4407c4d981785a279b6296f4726a221cacb4c5b1..81411aa84ae848cfaa1392e82a1e38c3df19cdb6 100644
--- a/tensorflow/contrib/boosted_trees/proto/tree_config.proto
+++ b/tensorflow/contrib/boosted_trees/proto/tree_config.proto
@@ -53,7 +53,7 @@ message DenseFloatBinarySplit {
   // Float feature column and split threshold describing
   // the rule feature <= threshold.
   int32 feature_column = 1;
-  // If feature column is multivalent, this holds the index of the dimensiong
+  // If feature column is multivalent, this holds the index of the dimension
   // for the split. Defaults to 0.
   int32 dimension_id = 5;
   float threshold = 2;
diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/prediction_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/prediction_ops_test.py
index c1acf351603dd80c2d14c7ee0a5b4c89706bc1bf..cf55759aaabfb265466f4bbf8b2806d4347ca0b1 100644
--- a/tensorflow/contrib/boosted_trees/python/kernel_tests/prediction_ops_test.py
+++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/prediction_ops_test.py
@@ -120,8 +120,8 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
     """Sets up the prediction tests.
 
     Create a batch of two examples having one dense float, two sparse float
-    single valued, one sparse float multidimensionl and one sparse int features.
-    The data looks like the following:
+    single valued, one sparse float multidimensional and one sparse int
+    features.  The data looks like the following:
     | Instance | Dense0 | SparseF0 | SparseF1 | SparseI0 | SparseM
     | 0        |  7     |    -3    |          |    9,1   | __, 5.0
     | 1        | -2     |          | 4        |          |  3, ___
@@ -810,7 +810,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
     # building. This tree should never be dropped.
     num_trees = 10
     with self.test_session():
-      # Empty tree ensenble.
+      # Empty tree ensemble.
       tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
       # Add 10 trees with some weights.
       for i in range(0, num_trees):
@@ -951,7 +951,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
 
   def testDropOutZeroProb(self):
     with self.test_session():
-      # Empty tree ensenble.
+      # Empty tree ensemble.
       tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
       # Add 1000 trees with some weights.
       for i in range(0, 999):
@@ -994,7 +994,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
 
   def testAveragingAllTrees(self):
     with self.test_session():
-      # Empty tree ensenble.
+      # Empty tree ensemble.
       tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
       adjusted_tree_ensemble_config = (
           tree_config_pb2.DecisionTreeEnsembleConfig())
diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py
index 81f58de28cbe98bb996c6665114eeb0030ee52f9..074623699d9d82f999c9cbc483ddcd8a959f4bad 100644
--- a/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py
+++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py
@@ -482,7 +482,7 @@ class QuantilesOpTest(test_util.TensorFlowTestCase):
     """Sets up the quantile op tests.
 
     Create a batch of 4 examples having 2 dense and 4 sparse features.
-    Forth sparse feature is multivalent (3 dimensional)
+    Fourth sparse feature is multivalent (3 dimensional)
     The data looks like this
     | Instance | Dense 0 | Dense 1 | Sparse 0 | Sparse 1 |Sparse 2| SparseM
     | 0        |   -0.1  |  -1     |   -2     |   0.1    |        |_ ,1,_
diff --git a/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py b/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py
index 97d57e8b23608d4c3a8719426a75056fc6417d1d..1b184d296b329cee481db67992e77d1e33e18035 100644
--- a/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py
+++ b/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py
@@ -184,7 +184,7 @@ class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject):
     """Finalizes quantile summary stream and resets it for next iteration.
 
     Args:
-      stamp_token: Exepcted current token.
+      stamp_token: Expected current token.
       next_stamp_token: Next value for the token.
     Returns:
       A list of quantiles or approximate boundaries.
diff --git a/tensorflow/contrib/boosted_trees/resources/BUILD b/tensorflow/contrib/boosted_trees/resources/BUILD
index 9fc101612f1e2a6bf6c5d86ea8c7199936dbb069..c0651868453d40d57e842862855f89e6845c507f 100644
--- a/tensorflow/contrib/boosted_trees/resources/BUILD
+++ b/tensorflow/contrib/boosted_trees/resources/BUILD
@@ -9,17 +9,6 @@ package(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 cc_library(
     name = "stamped_resource",
     hdrs = ["stamped_resource.h"],
diff --git a/tensorflow/contrib/cloud/BUILD b/tensorflow/contrib/cloud/BUILD
index fe8bd072afd43a64fa62a65bd8900b5a98dbe761..f3a75e8688ece19a6e6fd53ee9faf7f4144d76cf 100644
--- a/tensorflow/contrib/cloud/BUILD
+++ b/tensorflow/contrib/cloud/BUILD
@@ -14,18 +14,6 @@ load(
     "tf_py_test",
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tf_gen_op_libs(
     op_lib_names = ["bigquery_reader_ops"],
     deps = [
diff --git a/tensorflow/contrib/cloud/kernels/BUILD b/tensorflow/contrib/cloud/kernels/BUILD
index 56f930a9a8d32c5c3a025163ef56c9562f17d864..ff46f0daa80a70badedf73e15bfaf4dca85fdd89 100644
--- a/tensorflow/contrib/cloud/kernels/BUILD
+++ b/tensorflow/contrib/cloud/kernels/BUILD
@@ -20,20 +20,6 @@ load(
     "tf_proto_library",
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        include = [
-            "**/*",
-        ],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tf_kernel_library(
     name = "bigquery_reader_ops",
     srcs = ["bigquery_reader_ops.cc"],
@@ -73,6 +59,7 @@ tf_cc_test(
     ],
     deps = [
         ":bigquery_table_accessor",
+        "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
diff --git a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test.cc b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test.cc
index e9b79a066def566096d6c3f3745974423e3371d1..7416eb19d3324fad84876cde5353bc25bac8f648 100644
--- a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test.cc
+++ b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/example/feature.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/cloud/http_request_fake.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -28,8 +29,8 @@ constexpr char kTestProject[] = "test-project";
 constexpr char kTestDataset[] = "test-dataset";
 constexpr char kTestTable[] = "test-table";
 
-bool HasSubstr(const string& base, const string& substr) {
-  bool ok = StringPiece(base).contains(substr);
+bool HasSubstr(StringPiece base, StringPiece substr) {
+  bool ok = str_util::StrContains(base, substr);
   EXPECT_TRUE(ok) << base << ", expected substring " << substr;
   return ok;
 }
diff --git a/tensorflow/contrib/cluster_resolver/BUILD b/tensorflow/contrib/cluster_resolver/BUILD
index 1a124eca364424b651de86bfaac6f33ad131804b..c239e6f8f960910cee14e1df7c4678c643496f54 100644
--- a/tensorflow/contrib/cluster_resolver/BUILD
+++ b/tensorflow/contrib/cluster_resolver/BUILD
@@ -10,19 +10,6 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        include = [
-            "**/*",
-        ],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
-
 py_library(
     name = "cluster_resolver_pip",
     srcs = [
diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
index 300b19733e2b4d1b912f966e94ae0286ed9c694d..95c5c920aa2ccf92d8aa6aa179102fe379f0236c 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
@@ -73,7 +73,7 @@ class TPUClusterResolver(ClusterResolver):
                zone=None,
                project=None,
                job_name='worker',
-               coordinator_name='coordinator',
+               coordinator_name=None,
                coordinator_address=None,
                credentials='default',
                service=None):
diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
index 48c3f6bb4f2d1643982e03d9ed68db14c10c184a..e1e3e6867a24b917885a9ab7e780df55742ec0f9 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
@@ -117,7 +117,8 @@ class TPUClusterResolverTest(test.TestCase):
         zone=None,
         tpu=['test-tpu-1'],
         credentials=None,
-        service=self.mock_service_client(tpu_map=tpu_map))
+        service=self.mock_service_client(tpu_map=tpu_map),
+        coordinator_name='coordinator')
 
     actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
     expected_proto = """
@@ -170,6 +171,7 @@ class TPUClusterResolverTest(test.TestCase):
         project='test-project',
         zone='us-central1-c',
         tpu=['test-tpu-1'],
+        coordinator_name='coordinator',
         coordinator_address='10.128.1.5:10203',
         credentials=None,
         service=self.mock_service_client(tpu_map=tpu_map))
@@ -196,6 +198,7 @@ class TPUClusterResolverTest(test.TestCase):
         project='test-project',
         zone='us-central1-c',
         tpu='test-tpu-1',
+        coordinator_name='coordinator',
         coordinator_address='10.128.1.5:10203',
         credentials=None,
         service=self.mock_service_client(tpu_map=tpu_map))
@@ -239,7 +242,8 @@ class TPUClusterResolverTest(test.TestCase):
     tpu_cluster_resolver = TPUClusterResolver(
         tpu='test-tpu-1',
         credentials=None,
-        service=self.mock_service_client(tpu_map=tpu_map))
+        service=self.mock_service_client(tpu_map=tpu_map),
+        coordinator_name='coordinator')
 
     actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
     expected_proto = """
diff --git a/tensorflow/contrib/cmake/external/grpc.cmake b/tensorflow/contrib/cmake/external/grpc.cmake
index 17f65999faaf5c0ca39bfbc968a9140dbff49c2e..1fefb731a775d9cd2478cbb654662ec6ba673fed 100644
--- a/tensorflow/contrib/cmake/external/grpc.cmake
+++ b/tensorflow/contrib/cmake/external/grpc.cmake
@@ -17,7 +17,7 @@ include (ExternalProject)
 set(GRPC_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/include)
 set(GRPC_URL https://github.com/grpc/grpc.git)
 set(GRPC_BUILD ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc)
-set(GRPC_TAG 730b778632e79cc3c96ad237f282d687ee325ce7)
+set(GRPC_TAG bd6bdf93279a39a8cd92978fd7c9d14eccd98fc2)
 
 if(WIN32)
   if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
@@ -35,6 +35,7 @@ else()
   set(grpc_STATIC_LIBRARIES
       ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgrpc++_unsecure.a
       ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgrpc_unsecure.a
+      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libaddress_sorting.a
       ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/third_party/cares/cares/lib/libcares.a
       ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgpr.a)
 endif()
diff --git a/tensorflow/contrib/cmake/external/nsync.cmake b/tensorflow/contrib/cmake/external/nsync.cmake
index f3a37ff5088e3f9e54e38c0edb5777c27b26969f..b9d1dd88d4c2d3c9141ba56e14911e06b4d33f7c 100644
--- a/tensorflow/contrib/cmake/external/nsync.cmake
+++ b/tensorflow/contrib/cmake/external/nsync.cmake
@@ -16,7 +16,7 @@ include (ExternalProject)
 
 set(nsync_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/nsync/public)
 set(nsync_URL https://github.com/google/nsync)
-set(nsync_TAG 8502189abfa44c249c01c2cad64e6ed660a9a668)
+set(nsync_TAG 0559ce013feac8db639ee1bf776aca0325d28777)
 set(nsync_BUILD ${CMAKE_CURRENT_BINARY_DIR}/nsync/src/nsync)
 set(nsync_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/nsync/install)
 
diff --git a/tensorflow/contrib/cmake/patches/nsync/CMakeLists.txt b/tensorflow/contrib/cmake/patches/nsync/CMakeLists.txt
index aaae18a313dd082b428654091c9411600c981ec9..6f059c7225dd0938b758e8f9c28ec36fcff6db4c 100644
--- a/tensorflow/contrib/cmake/patches/nsync/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/patches/nsync/CMakeLists.txt
@@ -42,7 +42,6 @@ if ("${NSYNC_LANGUAGE}X" STREQUAL "c++11X")
   include_directories ("${PROJECT_SOURCE_DIR}/platform/c++11")
   add_definitions ("-DNSYNC_USE_CPP11_TIMEPOINT -DNSYNC_ATOMIC_CPP11")
   set (NSYNC_OS_CPP_SRC
-    "platform/c++11/src/nsync_semaphore_mutex.cc"
     "platform/c++11/src/per_thread_waiter.cc"
     "platform/c++11/src/yield.cc"
     "platform/c++11/src/time_rep_timespec.cc"
@@ -52,6 +51,7 @@ if ("${NSYNC_LANGUAGE}X" STREQUAL "c++11X")
     include_directories ("${PROJECT_SOURCE_DIR}/platform/win32")
     add_compile_options ("/TP")
     set (NSYNC_OS_SRC
+      "platform/c++11/src/nsync_semaphore_mutex.cc"
       "platform/win32/src/clock_gettime.c"
       "platform/win32/src/pthread_key_win32.cc"
       ${NSYNC_OS_CPP_SRC}
@@ -68,6 +68,7 @@ if ("${NSYNC_LANGUAGE}X" STREQUAL "c++11X")
     add_compile_options ("-std=c++11")
     set (NSYNC_OS_SRC
       ${NSYNC_OS_CPP_SRC}
+      "platform/c++11/src/nsync_semaphore_mutex.cc"
       "platform/posix/src/clock_gettime.c"
       "platform/posix/src/nsync_semaphore_mutex.c"
     )
@@ -75,9 +76,11 @@ if ("${NSYNC_LANGUAGE}X" STREQUAL "c++11X")
       "platform/posix/src/start_thread.c"
     )
   elseif ("${CMAKE_SYSTEM_NAME}X" STREQUAL "LinuxX")
+    include_directories (BEFORE "${PROJECT_SOURCE_DIR}/platform/c++11.futex")
     include_directories ("${PROJECT_SOURCE_DIR}/platform/posix")
     add_compile_options ("-std=c++11")
     set (NSYNC_OS_SRC
+      "platform/linux/src/nsync_semaphore_futex.c"
       ${NSYNC_OS_CPP_SRC}
     )
     set (NSYNC_TEST_OS_SRC
@@ -87,6 +90,7 @@ if ("${NSYNC_LANGUAGE}X" STREQUAL "c++11X")
     include_directories ("${PROJECT_SOURCE_DIR}/platform/posix")
     add_compile_options ("-std=c++11")
     set (NSYNC_OS_SRC
+      "platform/c++11/src/nsync_semaphore_mutex.cc"
       ${NSYNC_OS_CPP_SRC}
     )
     set (NSYNC_TEST_OS_SRC
@@ -96,6 +100,7 @@ if ("${NSYNC_LANGUAGE}X" STREQUAL "c++11X")
     include_directories ("${PROJECT_SOURCE_DIR}/platform/posix")
     add_compile_options ("-std=c++11")
     set (NSYNC_OS_SRC
+      "platform/c++11/src/nsync_semaphore_mutex.cc"
       ${NSYNC_OS_CPP_SRC}
     )
     set (NSYNC_TEST_OS_SRC
@@ -105,6 +110,7 @@ if ("${NSYNC_LANGUAGE}X" STREQUAL "c++11X")
     include_directories ("${PROJECT_SOURCE_DIR}/platform/posix")
     add_compile_options ("-std=c++11")
     set (NSYNC_OS_SRC
+      "platform/c++11/src/nsync_semaphore_mutex.cc"
       ${NSYNC_OS_CPP_SRC}
     )
     set (NSYNC_TEST_OS_SRC
diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index 0d2a6a23db26af2fb9498849aa93e74379915fe3..f273c7e5508e10407d013acd7adc08c732322841 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -79,9 +79,11 @@ tensorflow/python/keras/_impl/keras/preprocessing
 tensorflow/python/keras/_impl/keras/utils
 tensorflow/python/keras/_impl/keras/wrappers
 tensorflow/python/kernel_tests
+tensorflow/python/kernel_tests/boosted_trees
 tensorflow/python/kernel_tests/distributions
 tensorflow/python/kernel_tests/linalg
 tensorflow/python/kernel_tests/random
+tensorflow/python/kernel_tests/testdata
 tensorflow/python/layers
 tensorflow/python/lib
 tensorflow/python/lib/core
@@ -147,8 +149,6 @@ tensorflow/contrib/crf
 tensorflow/contrib/crf/python
 tensorflow/contrib/crf/python/ops
 tensorflow/contrib/cudnn_rnn
-tensorflow/contrib/cudnn_rnn/kernels
-tensorflow/contrib/cudnn_rnn/ops
 tensorflow/contrib/cudnn_rnn/python
 tensorflow/contrib/cudnn_rnn/python/layers
 tensorflow/contrib/cudnn_rnn/python/ops
@@ -160,6 +160,9 @@ tensorflow/contrib/data/python/ops
 tensorflow/contrib/decision_trees
 tensorflow/contrib/decision_trees/proto
 tensorflow/contrib/deprecated
+tensorflow/contrib/distribute
+tensorflow/contrib/distribute/python
+tensorflow/contrib/distribute/python/examples
 tensorflow/contrib/distributions
 tensorflow/contrib/distributions/python
 tensorflow/contrib/distributions/python/ops
@@ -332,6 +335,7 @@ tensorflow/contrib/nccl/kernels
 tensorflow/contrib/nccl/ops
 tensorflow/contrib/nccl/python
 tensorflow/contrib/nccl/python/ops
+tensorflow/contrib/nearest_neighbor
 tensorflow/contrib/nearest_neighbor/kernels
 tensorflow/contrib/nearest_neighbor/ops
 tensorflow/contrib/nearest_neighbor/python
@@ -342,6 +346,7 @@ tensorflow/contrib/nn/python/ops
 tensorflow/contrib/opt
 tensorflow/contrib/opt/python
 tensorflow/contrib/opt/python/training
+tensorflow/contrib/optimizer_v2
 tensorflow/contrib/pi_examples
 tensorflow/contrib/pi_examples/camera
 tensorflow/contrib/pi_examples/label_image
diff --git a/tensorflow/contrib/cmake/python_protos.txt b/tensorflow/contrib/cmake/python_protos.txt
index c03c0c80fe62a4f95d0fcf240ee25725a19d86f0..0c80d529af5230ed6d36b265e12ee4b749a14ec4 100644
--- a/tensorflow/contrib/cmake/python_protos.txt
+++ b/tensorflow/contrib/cmake/python_protos.txt
@@ -1,4 +1,5 @@
 tensorflow/core
+tensorflow/core/kernels/boosted_trees
 tensorflow/core/profiler
 tensorflow/python
 tensorflow/contrib/boosted_trees/proto
diff --git a/tensorflow/contrib/cmake/tf_core_kernels.cmake b/tensorflow/contrib/cmake/tf_core_kernels.cmake
index 998f99ecc19f88921dce14fde892912fb699ad08..ed018b4fed8e47632f632723f19cc755f2079f86 100644
--- a/tensorflow/contrib/cmake/tf_core_kernels.cmake
+++ b/tensorflow/contrib/cmake/tf_core_kernels.cmake
@@ -67,8 +67,6 @@ if(tensorflow_BUILD_CONTRIB_KERNELS)
       "${tensorflow_source_dir}/tensorflow/contrib/coder/kernels/range_coder_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/coder/kernels/range_coder_ops_util.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/coder/ops/coder_ops.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/cudnn_rnn/kernels/cudnn_rnn_ops.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/ignore_errors_dataset_op.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/prefetching_kernels.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/threadpool_dataset_op.cc"
diff --git a/tensorflow/contrib/cmake/tf_core_ops.cmake b/tensorflow/contrib/cmake/tf_core_ops.cmake
index 59e094812aaf4da2549d96314fc550e5635f9de8..092a48bc6b63503be39343a1f936875082490b3e 100644
--- a/tensorflow/contrib/cmake/tf_core_ops.cmake
+++ b/tensorflow/contrib/cmake/tf_core_ops.cmake
@@ -15,19 +15,21 @@
 set(tf_op_lib_names
     "audio_ops"
     "array_ops"
-		"batch_ops"
+    "batch_ops"
     "bitwise_ops"
+    "boosted_trees_ops"
     "candidate_sampling_ops"
     "checkpoint_ops"
     "control_flow_ops"
     "ctc_ops"
+    "cudnn_rnn_ops"
     "data_flow_ops"
     "dataset_ops"
     "functional_ops"
     "image_ops"
     "io_ops"
     "linalg_ops"
-		"list_ops"
+    "list_ops"
     "lookup_ops"
     "logging_ops"
     "manip_ops"
@@ -47,7 +49,7 @@ set(tf_op_lib_names
     "state_ops"
     "stateless_random_ops"
     "string_ops"
-		"summary_ops"
+    "summary_ops"
     "training_ops"
 )
 
@@ -84,7 +86,6 @@ GENERATE_CONTRIB_OP_LIBRARY(boosted_trees_prediction "${tensorflow_source_dir}/t
 GENERATE_CONTRIB_OP_LIBRARY(boosted_trees_quantiles "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(boosted_trees_stats_accumulator "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(coder "${tensorflow_source_dir}/tensorflow/contrib/coder/ops/coder_ops.cc")
-GENERATE_CONTRIB_OP_LIBRARY(cudnn_rnn "${tensorflow_source_dir}/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(data_dataset "${tensorflow_source_dir}/tensorflow/contrib/data/ops/dataset_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(factorization_clustering "${tensorflow_source_dir}/tensorflow/contrib/factorization/ops/clustering_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(factorization_factorization "${tensorflow_source_dir}/tensorflow/contrib/factorization/ops/factorization_ops.cc")
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index b730ebd3baacafe8ae401e8987104f3062372954..fae45ead5cafcb0f55834af223555f6e65f16015 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -319,6 +319,7 @@ GENERATE_PYTHON_OP_LIB("audio_ops")
 GENERATE_PYTHON_OP_LIB("array_ops")
 GENERATE_PYTHON_OP_LIB("batch_ops")
 GENERATE_PYTHON_OP_LIB("bitwise_ops")
+GENERATE_PYTHON_OP_LIB("boosted_trees_ops")
 GENERATE_PYTHON_OP_LIB("math_ops")
 GENERATE_PYTHON_OP_LIB("functional_ops")
 GENERATE_PYTHON_OP_LIB("candidate_sampling_ops")
@@ -326,6 +327,7 @@ GENERATE_PYTHON_OP_LIB("checkpoint_ops")
 GENERATE_PYTHON_OP_LIB("control_flow_ops"
   ADDITIONAL_LIBRARIES $<TARGET_OBJECTS:tf_no_op>)
 GENERATE_PYTHON_OP_LIB("ctc_ops")
+GENERATE_PYTHON_OP_LIB("cudnn_rnn_ops")
 GENERATE_PYTHON_OP_LIB("data_flow_ops")
 GENERATE_PYTHON_OP_LIB("dataset_ops")
 GENERATE_PYTHON_OP_LIB("image_ops")
@@ -348,6 +350,7 @@ GENERATE_PYTHON_OP_LIB("state_ops")
 GENERATE_PYTHON_OP_LIB("sparse_ops")
 GENERATE_PYTHON_OP_LIB("spectral_ops")
 GENERATE_PYTHON_OP_LIB("string_ops")
+GENERATE_PYTHON_OP_LIB("summary_ops")
 GENERATE_PYTHON_OP_LIB("user_ops")
 GENERATE_PYTHON_OP_LIB("training_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/training/gen_training_ops.py)
@@ -366,8 +369,6 @@ GENERATE_PYTHON_OP_LIB("contrib_boosted_trees_stats_accumulator_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/boosted_trees/python/ops/gen_stats_accumulator_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_coder_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/coder/python/ops/gen_coder_ops.py)
-GENERATE_PYTHON_OP_LIB("contrib_cudnn_rnn_ops"
-  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/cudnn_rnn/ops/gen_cudnn_rnn_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_data_dataset_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/data/python/ops/gen_dataset_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_factorization_clustering_ops"
@@ -419,8 +420,6 @@ GENERATE_PYTHON_OP_LIB("stateless_random_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/stateless/gen_stateless_random_ops.py)
 GENERATE_PYTHON_OP_LIB("debug_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/debug/ops/gen_debug_ops.py)
-GENERATE_PYTHON_OP_LIB("summary_ops"
-  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/summary/gen_summary_ops.py)
 
 add_custom_target(tf_python_ops SOURCES ${tf_python_ops_generated_files} ${PYTHON_PROTO_GENFILES})
 add_dependencies(tf_python_ops tf_python_op_gen_main)
@@ -475,6 +474,8 @@ set (pywrap_tensorflow_internal_src
     "${tensorflow_source_dir}/tensorflow/python/lib/core/ndarray_tensor_bridge.cc"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/py_func.h"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/py_func.cc"
+    "${tensorflow_source_dir}/tensorflow/python/lib/core/py_exception_registry.h"
+    "${tensorflow_source_dir}/tensorflow/python/lib/core/py_exception_registry.cc"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/py_seq_tensor.h"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/py_seq_tensor.cc"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/py_util.h"
diff --git a/tensorflow/contrib/cmake/tf_shared_lib.cmake b/tensorflow/contrib/cmake/tf_shared_lib.cmake
index 6d36d5fc5c2854b2d7d2542a3cb12e033e193b88..9738bbeb9aebaeb67495127528e26634887d392c 100644
--- a/tensorflow/contrib/cmake/tf_shared_lib.cmake
+++ b/tensorflow/contrib/cmake/tf_shared_lib.cmake
@@ -100,8 +100,7 @@ if(WIN32)
 endif(WIN32)
 
 target_include_directories(tensorflow PUBLIC 
-    $<INSTALL_INTERFACE:include/>
-    $<INSTALL_INTERFACE:include/external/nsync/public>)
+    $<INSTALL_INTERFACE:include/>)
 
 install(TARGETS tensorflow EXPORT tensorflow_export
         RUNTIME DESTINATION bin
@@ -133,10 +132,6 @@ install(DIRECTORY ${tensorflow_source_dir}/tensorflow/stream_executor/
 install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/src/google/
         DESTINATION include/google
         FILES_MATCHING PATTERN "*.h")
-# nsync headers
-install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/external/nsync/
-        DESTINATION include/external/nsync
-        FILES_MATCHING PATTERN "*.h")
 # Eigen directory
 install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/eigen/src/eigen/Eigen/
         DESTINATION include/Eigen)
diff --git a/tensorflow/contrib/cmake/tf_tests.cmake b/tensorflow/contrib/cmake/tf_tests.cmake
index b3e5b30826097d6c747245fec975fcbea3785d15..92f2ab6dea8e7da5dd8481639eda24e31c06848f 100644
--- a/tensorflow/contrib/cmake/tf_tests.cmake
+++ b/tensorflow/contrib/cmake/tf_tests.cmake
@@ -195,9 +195,11 @@ if (tensorflow_BUILD_PYTHON_TESTS)
     "${tensorflow_source_dir}/tensorflow/python/profiler/model_analyzer_test.py"
     # Fails because uses data dependencies with bazel
     "${tensorflow_source_dir}/tensorflow/python/saved_model/saved_model_test.py"
+    "${tensorflow_source_dir}/tensorflow/contrib/image/python/kernel_tests/sparse_image_warp_test.py"
     # requires scipy
     "${tensorflow_source_dir}/tensorflow/contrib/keras/python/keras/preprocessing/*_test.py"
     "${tensorflow_source_dir}/tensorflow/contrib/tfprof/python/tools/tfprof/pprof_profiler_test.py"
+    "${tensorflow_source_dir}/tensorflow/contrib/image/python/kernel_tests/interpolate_spline_test.py"
     # Takes very long to run without sharding (defined in bazel build file).
     "${tensorflow_source_dir}/tensorflow/python/kernel_tests/cwise_ops_test.py"
     # Loading resources in contrib doesn't seem to work on Windows
@@ -208,6 +210,9 @@ if (tensorflow_BUILD_PYTHON_TESTS)
     "${tensorflow_source_dir}/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py"
     # Test is flaky on Windows GPU builds (b/38283730).
     "${tensorflow_source_dir}/tensorflow/contrib/factorization/python/ops/gmm_test.py"
+    # Disable following manual tag in BUILD.
+    "${tensorflow_source_dir}/tensorflow/python/keras/_impl/keras/layers/convolutional_test.py"
+
   )
   if (WIN32)
     set(tf_test_src_py_exclude
@@ -279,6 +284,7 @@ if (tensorflow_BUILD_PYTHON_TESTS)
       "${tensorflow_source_dir}/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py"
       "${tensorflow_source_dir}/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py"  # Deadlocks
       "${tensorflow_source_dir}/tensorflow/contrib/data/python/kernel_tests/sloppy_transformation_dataset_op_test.py"  # b/65430561
+      "${tensorflow_source_dir}/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py"  # Segfaults on Windows.
       # tensor_forest tests (also note that we exclude the hybrid tests for now)
       "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/python/kernel_tests/count_extremely_random_stats_op_test.py"  # Results in wrong order.
       "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/python/kernel_tests/sample_inputs_op_test.py"  # Results in wrong order.
diff --git a/tensorflow/contrib/coder/BUILD b/tensorflow/contrib/coder/BUILD
index ec3d550b70d2aaa23b989c44f3d86fa87cffb335..ce12e38248785987e51befa47d04143e235554fe 100644
--- a/tensorflow/contrib/coder/BUILD
+++ b/tensorflow/contrib/coder/BUILD
@@ -154,14 +154,3 @@ tf_py_test(
     ],
     main = "python/ops/coder_ops_test.py",
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/compiler/BUILD b/tensorflow/contrib/compiler/BUILD
index 388d8e6ed6d9cb9400b0bfbe8e3f50b80149ea1a..bcee0b04c8430588c2dcbc199504bede0436f8f1 100644
--- a/tensorflow/contrib/compiler/BUILD
+++ b/tensorflow/contrib/compiler/BUILD
@@ -46,15 +46,3 @@ cuda_py_test(
     ],
     xla_enabled = True,
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/copy_graph/BUILD b/tensorflow/contrib/copy_graph/BUILD
index 8ec706df74e2c91345c4bf7a506fdb424a996773..fa44c4d54e1ee871feb425115525b1cf8b732214 100644
--- a/tensorflow/contrib/copy_graph/BUILD
+++ b/tensorflow/contrib/copy_graph/BUILD
@@ -41,15 +41,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/copy_graph/python/util/copy_elements.py b/tensorflow/contrib/copy_graph/python/util/copy_elements.py
index b806799202bff4f2f6dbf717fbeea74a04b8cd6e..102bc460fdadb0ad5dc9a2960b8655c55357108e 100644
--- a/tensorflow/contrib/copy_graph/python/util/copy_elements.py
+++ b/tensorflow/contrib/copy_graph/python/util/copy_elements.py
@@ -201,7 +201,7 @@ def copy_op_to_graph(org_instance, to_graph, variables, scope=''):
     #An instance of tensorflow.core.framework.node_def_pb2.NodeDef, it
     #stores String-based info such as name, device and type of the op.
     #Unique to every Operation instance.
-    new_node_def = deepcopy(op._node_def)
+    new_node_def = deepcopy(op.node_def)
     #Change the name
     new_node_def.name = new_name
 
@@ -211,7 +211,7 @@ def copy_op_to_graph(org_instance, to_graph, variables, scope=''):
 
     #Make a copy of the op_def too.
     #Its unique to every _type_ of Operation.
-    op_def = deepcopy(op._op_def)
+    op_def = deepcopy(op.op_def)
 
     #Initialize a new Operation instance
     new_op = ops.Operation(new_node_def, to_graph, new_inputs, output_types,
diff --git a/tensorflow/contrib/crf/BUILD b/tensorflow/contrib/crf/BUILD
index 7aad4abdb908d0284b85137bff842bd0f38d09c6..5c1a17df4f95f3c4d05b286de0e3d7b009a76bd7 100644
--- a/tensorflow/contrib/crf/BUILD
+++ b/tensorflow/contrib/crf/BUILD
@@ -40,15 +40,3 @@ cuda_py_tests(
         "//tensorflow/python:platform_test",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/cudnn_rnn/BUILD b/tensorflow/contrib/cudnn_rnn/BUILD
index fec358c4e1067dc8dc8173d1b9d05dc90b90ca05..8b5d13f72555516babc4250fd934c55adc3d1b8b 100644
--- a/tensorflow/contrib/cudnn_rnn/BUILD
+++ b/tensorflow/contrib/cudnn_rnn/BUILD
@@ -9,52 +9,10 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load("//tensorflow:tensorflow.bzl", "tf_custom_op_library")
 load("//tensorflow:tensorflow.bzl", "tf_gen_op_libs")
 load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
-load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
-
-tf_custom_op_library(
-    name = "python/ops/_cudnn_rnn_ops.so",
-    srcs = [
-        "kernels/cudnn_rnn_ops.cc",
-        "ops/cudnn_rnn_ops.cc",
-    ],
-    deps = [
-        "//tensorflow/core/kernels:bounds_check_lib",
-        "@farmhash_archive//:farmhash",
-    ],
-)
-
-tf_kernel_library(
-    name = "cudnn_rnn_kernels",
-    srcs = ["kernels/cudnn_rnn_ops.cc"],
-    visibility = ["//visibility:public"],
-    deps = [
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-        "//tensorflow/core:stream_executor",
-        "//tensorflow/core/kernels:bounds_check_lib",
-        "//third_party/eigen3",
-        "@farmhash_archive//:farmhash",
-    ],
-)
-
-tf_gen_op_libs(
-    op_lib_names = ["cudnn_rnn_ops"],
-    deps = [
-        "//tensorflow/core:lib",
-    ],
-)
-
-tf_gen_op_wrapper_py(
-    name = "cudnn_rnn_ops",
-    deps = [":cudnn_rnn_ops_op_lib"],
-)
 
 tf_custom_op_py_library(
     name = "cudnn_rnn_py",
@@ -64,20 +22,13 @@ tf_custom_op_py_library(
         "python/layers/cudnn_rnn.py",
         "python/ops/cudnn_rnn_ops.py",
     ],
-    dso = [
-        ":python/ops/_cudnn_rnn_ops.so",
-    ],
-    kernels = [
-        ":cudnn_rnn_kernels",
-        ":cudnn_rnn_ops_op_lib",
-    ],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
-        ":cudnn_rnn_ops",
         "//tensorflow/contrib/util:util_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:cudnn_rnn_ops_gen",
         "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:init_ops",
@@ -172,32 +123,3 @@ cuda_py_test(
         "requires_cudnn5",
     ],
 )
-
-tf_cc_test(
-    name = "cudnn_rnn_ops_test_cc",
-    size = "small",
-    srcs = [
-        "ops/cudnn_rnn_ops_test.cc",
-    ],
-    deps = [
-        ":cudnn_rnn_ops_op_lib",
-        "//tensorflow/core",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-        "//tensorflow/core:testlib",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
index e87162f0ee9cc4eed795555171f55a93639e83cf..2ac94424061a07e5727a98642aa855222c0afb81 100644
--- a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
+++ b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
@@ -17,27 +17,22 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.cudnn_rnn.ops import gen_cudnn_rnn_ops
 from tensorflow.contrib.rnn.python.ops import lstm_ops
-from tensorflow.contrib.util import loader
 from tensorflow.python.framework import common_shapes
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.layers import base as base_layer
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_cudnn_rnn_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope as vs
-from tensorflow.python.platform import resource_loader
 from tensorflow.python.training import saver
 
-_cudnn_rnn_ops_so = loader.load_op_library(
-    resource_loader.get_path_to_datafile("_cudnn_rnn_ops.so"))
-
 CUDNN_RNN_UNIDIRECTION = "unidirectional"
 CUDNN_RNN_BIDIRECTION = "bidirectional"
 CUDNN_LSTM = "lstm"
@@ -91,19 +86,23 @@ class CudnnCompatibleGRUCell(rnn_cell_impl.GRUCell):
 
   Cudnn compatible GRU (from Cudnn library user guide):
   ```python
-  r_t = sigma(x_t * W_r + h_t-1 * R_h + b_Wr + b_Rr)  # reset gate
-  u_t = sigma(x_t * W_u + h_t-1 * R_u + b_Wu + b_Ru)  # update gate
-  h'_t = tanh(x_t * W_h + r_t .* (h_t-1 * R_h + b_Rh) + b_Wh)  # new memory gate
-  h_t = (1 - u_t) .* h'_t + u_t .* h_t-1
+  # reset gate
+  $$r_t = \sigma(x_t * W_r + h_t-1 * R_h + b_{Wr} + b_{Rr})$$
+  # update gate
+  $$u_t = \sigma(x_t * W_u + h_t-1 * R_u + b_{Wu} + b_{Ru})$$
+  # new memory gate
+  $$h'_t = tanh(x_t * W_h + r_t .* (h_t-1 * R_h + b_{Rh}) + b_{Wh})$$
+  $$h_t = (1 - u_t) .* h'_t + u_t .* h_t-1$$
   ```
 
   Other GRU (see @{tf.nn.rnn_cell.GRUCell} and @{tf.contrib.rnn.GRUBlockCell}):
   ```python
-  h'_t = tanh(x_t * W_h + (r_t .* h_t-1) * R_h + b_Wh)  # new memory gate
+  # new memory gate
+  \\(h'_t = tanh(x_t * W_h + (r_t .* h_t-1) * R_h + b_{Wh})\\)
   ```
   which is not equivalent to Cudnn GRU: in addition to the extra bias term b_Rh,
   ```python
-  r .* (h * R) != (r .* h) * R
+  \\(r .* (h * R) != (r .* h) * R\\)
   ```
   """
 
diff --git a/tensorflow/contrib/data/BUILD b/tensorflow/contrib/data/BUILD
index 0458199ff771bc45603106411550a39448e515b8..7bb0dc1c0f695f4d1c7739fa11764ded4ff9410a 100644
--- a/tensorflow/contrib/data/BUILD
+++ b/tensorflow/contrib/data/BUILD
@@ -8,6 +8,11 @@ load(
     "//tensorflow:tensorflow.bzl",
     "tf_custom_op_library",
     "tf_gen_op_libs",
+    "if_not_windows",
+)
+load(
+    "//tensorflow/core:platform/default/build_config_root.bzl",
+    "if_static",
 )
 
 py_library(
@@ -17,6 +22,7 @@ py_library(
     deps = [
         "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/contrib/data/python/ops:iterator_ops",
+        "//tensorflow/contrib/data/python/ops:prefetching_ops",
         "//tensorflow/contrib/data/python/ops:readers",
         "//tensorflow/contrib/data/python/ops:shuffle_ops",
         "//tensorflow/contrib/data/python/ops:transformation_ops",
@@ -26,26 +32,21 @@ py_library(
     ],
 )
 
+cc_library(
+    name = "lib_proto_parsing_for_dataset_ops",
+    deps = if_not_windows(["//tensorflow/core:lib_proto_parsing"]),
+)
+
 tf_custom_op_library(
     name = "_dataset_ops.so",
     srcs = ["ops/dataset_ops.cc"],
-    deps = ["//tensorflow/contrib/data/kernels:dataset_kernels"],
+    deps = ["//tensorflow/contrib/data/kernels:dataset_kernels"] +
+           if_static(
+               extra_deps = [":lib_proto_parsing_for_dataset_ops"],
+               otherwise = [],
+           ),
 )
 
 tf_gen_op_libs(
     op_lib_names = ["dataset_ops"],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        include = [
-            "**/*",
-        ],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/data/__init__.py b/tensorflow/contrib/data/__init__.py
index 9212b69700941c190df1d44ed308147105c56fba..125260b4c1f6b63c8f83f28d1829afe2d9d3ea97 100644
--- a/tensorflow/contrib/data/__init__.py
+++ b/tensorflow/contrib/data/__init__.py
@@ -25,6 +25,7 @@ See the @{$datasets$Importing Data} Programmer's Guide for an overview.
 @@Counter
 @@SqlDataset
 
+@@assert_element_shape
 @@batch_and_drop_remainder
 @@bucket_by_sequence_length
 @@dense_to_sparse_batch
@@ -32,10 +33,12 @@ See the @{$datasets$Importing Data} Programmer's Guide for an overview.
 @@group_by_window
 @@ignore_errors
 @@make_batched_features_dataset
+@@make_csv_dataset
 @@make_saveable_from_iterator
 @@map_and_batch
 @@padded_batch_and_drop_remainder
 @@parallel_interleave
+@@prefetch_to_device
 @@read_batch_features
 @@rejection_resample
 @@scan
@@ -53,6 +56,7 @@ from __future__ import print_function
 
 # pylint: disable=unused-import
 
+from tensorflow.contrib.data.python.ops.batching import assert_element_shape
 from tensorflow.contrib.data.python.ops.batching import batch_and_drop_remainder
 from tensorflow.contrib.data.python.ops.batching import dense_to_sparse_batch
 from tensorflow.contrib.data.python.ops.batching import map_and_batch
@@ -67,7 +71,9 @@ from tensorflow.contrib.data.python.ops.grouping import group_by_window
 from tensorflow.contrib.data.python.ops.interleave_ops import parallel_interleave
 from tensorflow.contrib.data.python.ops.interleave_ops import sloppy_interleave
 from tensorflow.contrib.data.python.ops.iterator_ops import make_saveable_from_iterator
+from tensorflow.contrib.data.python.ops.prefetching_ops import prefetch_to_device
 from tensorflow.contrib.data.python.ops.readers import make_batched_features_dataset
+from tensorflow.contrib.data.python.ops.readers import make_csv_dataset
 from tensorflow.contrib.data.python.ops.readers import read_batch_features
 from tensorflow.contrib.data.python.ops.readers import SqlDataset
 from tensorflow.contrib.data.python.ops.resampling import rejection_resample
@@ -80,3 +86,6 @@ from tensorflow.python.ops.parsing_ops import parse_single_example_v2 as parse_s
 
 from tensorflow.python.util.all_util import remove_undocumented
 remove_undocumented(__name__)
+
+# A constant that can be used to enable auto-tuning.
+AUTOTUNE = -1
diff --git a/tensorflow/contrib/data/kernels/BUILD b/tensorflow/contrib/data/kernels/BUILD
index c87da7dfaa5943f7918c370f63362673844c7f0e..83ada6fb67dcbff595a38ce9e8609bdd1219b075 100644
--- a/tensorflow/contrib/data/kernels/BUILD
+++ b/tensorflow/contrib/data/kernels/BUILD
@@ -61,14 +61,3 @@ cc_library(
         "@protobuf_archive//:protobuf_headers",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/data/kernels/prefetching_kernels.cc b/tensorflow/contrib/data/kernels/prefetching_kernels.cc
index 1baac3ea5239659e65881e5b2dea4fe1a8c49d1b..a2bfce03620a1482f5b21cbf23c66833bc5cd480 100644
--- a/tensorflow/contrib/data/kernels/prefetching_kernels.cc
+++ b/tensorflow/contrib/data/kernels/prefetching_kernels.cc
@@ -40,8 +40,7 @@ class FunctionBufferingResource : public ResourceBase {
                             const NameAttrList& func, int64 buffer_size,
                             const string& source_device,
                             const string& target_device,
-                            const std::vector<Tensor>& func_args,
-                            int64 thread_pool_size)
+                            const std::vector<Tensor>& func_args)
       : lib_(lib),
         pflr_(std::move(pflr)),
         func_(func),
@@ -49,27 +48,13 @@ class FunctionBufferingResource : public ResourceBase {
         source_device_(source_device),
         target_device_(target_device),
         func_args_(func_args),
-        thread_pool_(new thread::ThreadPool(Env::Default(), ThreadOptions(),
-                                            "buffer_resource", thread_pool_size,
-                                            false /* low_latency_hint */)),
         handle_(kInvalidHandle),
         is_buffering_(false),
         end_of_sequence_(false),
-        cancelled_(false) {
-    runner_ = [this](std::function<void()> c) {
-      thread_pool_->Schedule(std::move(c));
-    };
-  }
+        cancelled_(false) {}
 
   ~FunctionBufferingResource() override {
     Cancel();
-    {
-      mutex_lock l(mu_);
-      while (is_buffering_) {
-        cond_var_.wait(l);
-      }
-    }
-    delete thread_pool_;
   }
 
   string DebugString() override {
@@ -103,6 +88,20 @@ class FunctionBufferingResource : public ResourceBase {
   void Cancel() LOCKS_EXCLUDED(mu_) {
     mutex_lock l(mu_);
     cancelled_ = true;
+    while (is_buffering_) {
+      cond_var_.wait(l);
+    }
+  }
+
+  // Cancels all pending operations and then clears out the state.
+  void Reset() LOCKS_EXCLUDED(mu_) {
+    Cancel();
+    mutex_lock l(mu_);
+    buffer_.clear();
+    requests_.clear();
+    is_buffering_ = false;
+    end_of_sequence_ = false;
+    cancelled_ = false;
   }
 
   // If the buffer has anything, runs `callback` on the first element in the
@@ -167,15 +166,12 @@ class FunctionBufferingResource : public ResourceBase {
       for (int i = 0; i < cancellation_callbacks.size(); ++i) {
         cancellation_callbacks[i](cancellation_buffer_elements[i]);
       }
-      // We only wait on cond_var_ in the destructor, so there would atmost be
-      // one waiter to notify.
-      cond_var_.notify_one();
+      cond_var_.notify_all();
       return;
     }
     FunctionLibraryRuntime::Options opts;
     // Copied from CapturedFunction::generate_step_id();
     opts.step_id = -std::abs(static_cast<int64>(random::New64()));
-    opts.runner = &runner_;
     opts.source_device = source_device_;
     AllocatorAttributes arg_alloc_attr;
     arg_alloc_attr.set_on_host(true);
@@ -194,13 +190,12 @@ class FunctionBufferingResource : public ResourceBase {
                   mutex_lock l(mu_);
                   BufferElement buffer_element;
                   buffer_element.status = status;
-                  if (!status.ok()) {
+                  if (status.ok()) {
+                    buffer_element.value.swap(*rets);
+                  } else {
                     end_of_sequence_ = true;
                     is_buffering_ = false;
-                    buffer_.push_back(std::move(buffer_element));
-                    return;
                   }
-                  buffer_element.value.swap(*rets);
                   buffer_.push_back(std::move(buffer_element));
                   if (!requests_.empty()) {
                     buffer_front = std::move(buffer_.front());
@@ -208,9 +203,16 @@ class FunctionBufferingResource : public ResourceBase {
                     callback = std::move(requests_.front());
                     requests_.pop_front();
                   }
-                  if (buffer_.size() < buffer_size_) {
+                  if (buffer_.size() < buffer_size_ && !end_of_sequence_) {
                     restart_buffering = true;
                   } else {
+                    // When the buffer is full, we don't want to call
+                    // FillBuffer() unless we're in cancellation phase in which
+                    // case FillBuffer() will do the final cleanup post
+                    // cancellation.
+                    if (cancelled_) {
+                      restart_buffering = true;
+                    }
                     is_buffering_ = false;
                   }
                 }
@@ -231,11 +233,9 @@ class FunctionBufferingResource : public ResourceBase {
   const string source_device_;
   const string target_device_;
   const std::vector<Tensor> func_args_;
-  thread::ThreadPool* thread_pool_;
   FunctionLibraryRuntime::Handle handle_ GUARDED_BY(mu_);
   std::deque<BufferElement> buffer_ GUARDED_BY(mu_);
   std::deque<FunctionBufferCallback> requests_ GUARDED_BY(mu_);
-  std::function<void(std::function<void()>)> runner_ = nullptr;
   bool is_buffering_ GUARDED_BY(mu_);
   bool end_of_sequence_ GUARDED_BY(mu_);
   bool cancelled_ GUARDED_BY(mu_);
@@ -250,7 +250,6 @@ class FunctionBufferResourceHandleOp : public OpKernel {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("buffer_size", &buffer_size_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("container", &container_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("shared_name", &name_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("thread_pool_size", &thread_pool_size_));
   }
 
   ~FunctionBufferResourceHandleOp() override {
@@ -298,9 +297,10 @@ class FunctionBufferResourceHandleOp : public OpKernel {
                this](FunctionBufferingResource** ptr) {
                 *ptr = new FunctionBufferingResource(
                     clone_lib, std::move(pflr), func_, buffer_size_,
-                    source_device, target_device, func_args, thread_pool_size_);
+                    source_device, target_device, func_args);
                 return Status::OK();
               }));
+      core::ScopedUnref s(buffer);
       OP_REQUIRES_OK(ctx, buffer->Instantiate());
       initialized_ = true;
     }
@@ -319,7 +319,6 @@ class FunctionBufferResourceHandleOp : public OpKernel {
   int64 buffer_size_;
   string container_;
   string name_;
-  int64 thread_pool_size_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("FunctionBufferingResource")
@@ -360,25 +359,27 @@ class FunctionBufferingResourceGetNextOp : public AsyncOpKernel {
     OP_REQUIRES_OK_ASYNC(
         ctx, LookupResource<FunctionBufferingResource>(ctx, handle, &buffer),
         done);
-    core::ScopedUnref s(buffer);
 
     if (buffer->Finished()) {
+      buffer->Unref();
       ctx->SetStatus(errors::OutOfRange("end_of_sequence"));
       done();
       return;
     }
 
     FunctionBufferCallback callback =
-        [ctx, done](const BufferElement& buffer_element) {
+        [ctx, buffer, done](const BufferElement& buffer_element) {
           Status s = buffer_element.status;
           if (!s.ok()) {
             ctx->SetStatus(s);
+            buffer->Unref();
             done();
             return;
           }
           for (size_t i = 0; i < buffer_element.value.size(); ++i) {
             ctx->set_output(i, buffer_element.value[i]);
           }
+          buffer->Unref();
           done();
         };
     buffer->MaybeGet(std::move(callback));
@@ -400,4 +401,62 @@ REGISTER_KERNEL_BUILDER(Name("FunctionBufferingResourceGetNext")
                         FunctionBufferingResourceGetNextOp);
 #endif  // TENSORFLOW_USE_SYCL
 
+// Resets the FunctionBufferingResource, cancelling all pending requests and
+// clearing out the buffer.
+class FunctionBufferingResourceResetOp : public OpKernel {
+ public:
+  explicit FunctionBufferingResourceResetOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx) {}
+
+  ~FunctionBufferingResourceResetOp() override {}
+
+  void Compute(OpKernelContext* ctx) override {
+    ResourceHandle handle;
+    OP_REQUIRES_OK(ctx,
+                   HandleFromInput(ctx, "function_buffer_resource", &handle));
+    FunctionBufferingResource* buffer = nullptr;
+    OP_REQUIRES_OK(
+        ctx, LookupResource<FunctionBufferingResource>(ctx, handle, &buffer));
+    core::ScopedUnref s(buffer);
+
+    buffer->Reset();
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("FunctionBufferingResourceReset")
+                            .Device(DEVICE_CPU)
+                            .HostMemory("function_buffer_resource"),
+                        FunctionBufferingResourceResetOp);
+REGISTER_KERNEL_BUILDER(Name("FunctionBufferingResourceReset")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("function_buffer_resource"),
+                        FunctionBufferingResourceResetOp);
+#if TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(Name("FunctionBufferingResourceReset")
+                            .Device(DEVICE_SYCL)
+                            .HostMemory("function_buffer_resource"),
+                        FunctionBufferingResourceResetOp);
+#endif  // TENSORFLOW_USE_SYCL
+
+class IteratorGetDeviceOp : public OpKernel {
+ public:
+  using OpKernel::OpKernel;
+
+  void Compute(OpKernelContext* ctx) override {
+    // NOTE(mrry): We do not currently Validate that the handle
+    // corresponds to a real IteratorResource, because that symbol is
+    // not exposed from the framework library.
+    Tensor* device_name_t;
+    OP_REQUIRES_OK(ctx,
+                   ctx->allocate_output(0, TensorShape({}), &device_name_t));
+    // NOTE(mrry): Since the operation's input is a resource, we must be
+    // colocated with it, and so we can simply return the current device's
+    // name without looking at the input.
+    device_name_t->scalar<string>()() = ctx->device()->name();
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("IteratorGetDevice").Device(DEVICE_CPU),
+                        IteratorGetDeviceOp);
+
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/data/ops/dataset_ops.cc b/tensorflow/contrib/data/ops/dataset_ops.cc
index a4c1212da11a2410461a120ed5f7116e80e4b903..cf0a8bbccb5813c799e7e6db91d73e2ecf4107f8 100644
--- a/tensorflow/contrib/data/ops/dataset_ops.cc
+++ b/tensorflow/contrib/data/ops/dataset_ops.cc
@@ -37,6 +37,14 @@ REGISTER_OP("UniqueDataset")
 Creates a dataset that contains the unique elements of `input_dataset`.
 )doc");
 
+REGISTER_OP("IteratorGetDevice")
+    .Input("resource: resource")
+    .Output("device: string")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Returns the name of the device on which `resource` has been placed.
+)doc");
+
 REGISTER_OP("FunctionBufferingResource")
     .Input("string_arg: string")
     .Input("target_device: string")
@@ -45,7 +53,6 @@ REGISTER_OP("FunctionBufferingResource")
     .Attr("container: string")
     .Attr("f: func")
     .Attr("buffer_size: int")
-    .Attr("thread_pool_size: int")
     .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
 Creates a resource that fills up a buffer by making function calls.
@@ -55,7 +62,6 @@ target_device: Target device to execute the function on.
 resource: Handle to the resource created.
 f: Function to be executed.
 buffer_size: Size of the buffer.
-thread_pool_size: Size of the threadpool doing the prefetching.
 container: If non-empty, this resource is placed in the given container.
   Otherwise, a default container is used.
 shared_name: If non-empty, this resource will be shared under the given name
@@ -75,6 +81,15 @@ output: A list of return values.
 output_types: The type list for the return values.
 )doc");
 
+REGISTER_OP("FunctionBufferingResourceReset")
+    .Input("function_buffer_resource: resource")
+    .SetShapeFn(shape_inference::UnknownShape)
+    .Doc(R"doc(
+Resets the FunctionBufferingResource.
+
+function_buffer_resource: The FunctionBufferingResource handle.
+)doc");
+
 REGISTER_OP("ThreadPoolDataset")
     .Input("input_dataset: variant")
     .Input("thread_pool: resource")
diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index 2c4d4adfdad6d2b3268896cb91cd0357b2b814d9..7270d533c69002ad6b318645f1ef07ebb45a85c3 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -22,6 +22,7 @@ py_test(
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:script_ops",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:string_ops",
         "//tensorflow/python:tensor_shape",
@@ -294,9 +295,7 @@ py_test(
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:lib",
-        "//tensorflow/python:math_ops",
         "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:string_ops",
         "//tensorflow/python:util",
         "//tensorflow/python/data/ops:iterator_ops",
         "//third_party/py/numpy",
@@ -479,10 +478,6 @@ py_test(
     size = "small",
     srcs = ["prefetching_ops_test.py"],
     srcs_version = "PY2AND3",
-    tags = [
-        "manual",
-        "no_oss",  # b/68785503
-    ],
     deps = [
         "//tensorflow/contrib/data/python/ops:prefetching_ops",
         "//tensorflow/core:protos_all_py",
@@ -514,17 +509,3 @@ tf_py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        include = [
-            "**/*",
-        ],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
index 71dc1c1172c9d515d4c85f85257c952135098329..413d8737978b695ac443c92036d6641e5c73f28c 100644
--- a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
@@ -28,8 +28,10 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import script_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
 
@@ -311,10 +313,10 @@ class BatchDatasetTest(test.TestCase):
     self.assertEqual([None], dataset.output_shapes[1][0].as_list())
     self.assertEqual([None, 30], dataset.output_shapes[1][1].as_list())
 
-  def _testBatchAndMapDatasetHelper(self, num_parallel_batches=1):
+  def _testMapAndBatchDatasetHelper(self, num_parallel_batches=1):
     """Test a dataset that maps a TF function across its input elements."""
     # The pipeline is TensorSliceDataset ->
-    # RepeatDataset(count) -> BatchAndMapDataset(square_3, batch_size).
+    # RepeatDataset(count) -> MapAndBatchDataset(square_3, batch_size).
     components = (np.arange(7),
                   np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
                   np.array(37.0) * np.arange(7))
@@ -381,11 +383,51 @@ class BatchDatasetTest(test.TestCase):
       with self.assertRaises(errors.InvalidArgumentError):
         sess.run(init_op, feed_dict={count: 14, batch_size: 0})
 
-  def testBatchAndMapDataset(self):
-    return self._testBatchAndMapDatasetHelper()
+  def testMapAndBatchDataset(self):
+    return self._testMapAndBatchDatasetHelper()
 
-  def testBatchAndMapDatasetWithParallelBatching(self):
-    return self._testBatchAndMapDatasetHelper(num_parallel_batches=10)
+  def testMapAndBatchDatasetWithParallelBatching(self):
+    return self._testMapAndBatchDatasetHelper(num_parallel_batches=10)
+
+  def _testMapAndBatchPartialBatchHelper(self, drop_remainder=False):
+    iterator = (
+        dataset_ops.Dataset.range(10).apply(
+            batching.map_and_batch(
+                lambda x: array_ops.reshape(x * x, [1]),
+                batch_size=4,
+                drop_remainder=drop_remainder)).make_one_shot_iterator())
+    if drop_remainder:
+      self.assertEqual([4, 1], iterator.output_shapes.as_list())
+    else:
+      self.assertEqual([None, 1], iterator.output_shapes.as_list())
+    next_element = iterator.get_next()
+    with self.test_session() as sess:
+      self.assertAllEqual([[0], [1], [4], [9]], sess.run(next_element))
+      self.assertAllEqual([[16], [25], [36], [49]], sess.run(next_element))
+      if not drop_remainder:
+        self.assertAllEqual([[64], [81]], sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testMapAndBatchPartialBatch(self):
+    return self._testMapAndBatchPartialBatchHelper()
+
+  def testMapAndBatchPartialBatchDropRemainder(self):
+    return self._testMapAndBatchPartialBatchHelper(drop_remainder=True)
+
+  def testMapAndBatchYieldsPartialBatch(self):
+    iterator = (dataset_ops.Dataset.range(10)
+                .apply(batching.map_and_batch(
+                    lambda x: array_ops.reshape(x * x, [1]), 4))
+                .make_one_shot_iterator())
+    self.assertEqual([None, 1], iterator.output_shapes.as_list())
+    next_element = iterator.get_next()
+    with self.test_session() as sess:
+      self.assertAllEqual([[0], [1], [4], [9]], sess.run(next_element))
+      self.assertAllEqual([[16], [25], [36], [49]], sess.run(next_element))
+      self.assertAllEqual([[64], [81]], sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
 
   def testMapAndBatchSparse(self):
 
@@ -411,7 +453,7 @@ class BatchDatasetTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
-  def testBatchAndMapDatasetFails(self):
+  def testMapAndBatchDatasetFails(self):
     """Test a dataset that maps a TF function across its input elements."""
     dataset = dataset_ops.Dataset.from_tensors(
         array_ops.check_numerics(
@@ -425,7 +467,7 @@ class BatchDatasetTest(test.TestCase):
       with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"):
         sess.run(init_op, feed_dict={batch_size: 14})
 
-  def testBatchAndMapDatasetShapeMismatch(self):
+  def testMapAndBatchDatasetShapeMismatch(self):
     """Test a dataset that maps a TF function across its input elements."""
 
     def generator():
@@ -539,5 +581,73 @@ class PaddedBatchDatasetSerializationTest(
                         lambda: build_dataset(seq_lens2), 8)
 
 
+class RestructuredDatasetTest(test.TestCase):
+
+  def test_assert_element_shape(self):
+
+    def create_unknown_shape_dataset(x):
+      return script_ops.py_func(lambda _: (np.ones(2, dtype=np.float32),
+                                           np.zeros((3, 4), dtype=np.int32)),
+                                [x],
+                                [dtypes.float32, dtypes.int32])
+
+    dataset = dataset_ops.Dataset.range(5).map(create_unknown_shape_dataset)
+    unknown_shapes = (tensor_shape.TensorShape(None),
+                      tensor_shape.TensorShape(None))
+    self.assertEqual(unknown_shapes, dataset.output_shapes)
+
+    expected_shapes = (tensor_shape.TensorShape(2),
+                       tensor_shape.TensorShape((3, 4)))
+    result = dataset.apply(batching.assert_element_shape(expected_shapes))
+    self.assertEqual(expected_shapes, result.output_shapes)
+
+    iterator = result.make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for _ in range(5):
+        sess.run(get_next)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def test_assert_wrong_element_shape(self):
+
+    def create_dataset(_):
+      return (array_ops.ones(2, dtype=dtypes.float32),
+              array_ops.zeros((3, 4), dtype=dtypes.int32))
+
+    dataset = dataset_ops.Dataset.range(3).map(create_dataset)
+    wrong_shapes = (tensor_shape.TensorShape(2),
+                    tensor_shape.TensorShape((3, 10)))
+    with self.assertRaises(ValueError):
+      dataset.apply(batching.assert_element_shape(wrong_shapes))
+
+  def test_assert_wrong_element_shape_on_unknown_shape_dataset(self):
+
+    def create_unknown_shape_dataset(x):
+      return script_ops.py_func(lambda _: (np.ones(2, dtype=np.float32),
+                                           np.zeros((3, 4), dtype=np.int32)),
+                                [x],
+                                [dtypes.float32, dtypes.int32])
+
+    dataset = dataset_ops.Dataset.range(3).map(create_unknown_shape_dataset)
+    unknown_shapes = (tensor_shape.TensorShape(None),
+                      tensor_shape.TensorShape(None))
+    self.assertEqual(unknown_shapes, dataset.output_shapes)
+
+    wrong_shapes = (tensor_shape.TensorShape(2),
+                    tensor_shape.TensorShape((3, 10)))
+    iterator = (
+        dataset.apply(batching.assert_element_shape(wrong_shapes))
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+    with self.test_session() as sess:
+      sess.run(init_op)
+      with self.assertRaises(errors.InvalidArgumentError):
+        sess.run(get_next)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py b/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py
index 94f800e8a58bc34eef3034cd976b931528c01940..6002cc73c8b41c2f20beaf0158af813807e58c90 100644
--- a/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py
@@ -104,6 +104,21 @@ class GroupByWindowTest(test.TestCase):
       self.assertAllEqual([0, 0, 0], sess.run(get_next))
       self.assertAllEqual([1], sess.run(get_next))
 
+  def testEmpty(self):
+    iterator = (
+        dataset_ops.Dataset.range(4).apply(
+            grouping.group_by_window(lambda _: 0, lambda _, xs: xs, 0))
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          "Window size must be greater than zero, but got 0."):
+        print(sess.run(get_next))
+
   def testReduceFuncError(self):
     components = np.random.randint(100, size=(200,)).astype(np.int64)
 
@@ -468,6 +483,31 @@ class BucketBySequenceLength(test.TestCase):
     self.assertEqual(sorted(batch_sizes), sorted(batch_sizes_val))
     self.assertEqual(sorted(boundaries), sorted(lengths_val))
 
+  def testTupleElements(self):
+
+    def elements_gen():
+      text = [[1, 2, 3], [3, 4, 5, 6, 7], [1, 2], [8, 9, 0, 2, 3]]
+      label = [1, 2, 1, 2]
+      for x, y in zip(text, label):
+        yield (x, y)
+
+    def element_length_fn(x, y):
+      del y
+      return array_ops.shape(x)[0]
+
+    dataset = dataset_ops.Dataset.from_generator(
+        generator=elements_gen,
+        output_shapes=(tensor_shape.TensorShape([None]),
+                       tensor_shape.TensorShape([])),
+        output_types=(dtypes.int32, dtypes.int32))
+    dataset = dataset.apply(grouping.bucket_by_sequence_length(
+        element_length_func=element_length_fn,
+        bucket_batch_sizes=[2, 2, 2],
+        bucket_boundaries=[0, 8]))
+    shapes = dataset.output_shapes
+    self.assertEqual([None, None], shapes[0].as_list())
+    self.assertEqual([None], shapes[1].as_list())
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py
index dc3e38db59301bf1819999f479171af35930e9d2..4b5026067007e7ef0051f1647da1151be3a5631c 100644
--- a/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py
@@ -17,7 +17,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import itertools
 import threading
 
 from tensorflow.contrib.data.python.ops import prefetching_ops
@@ -26,6 +25,7 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
@@ -33,30 +33,34 @@ from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import test
 
 
-class StagingAreaOpsTest(test.TestCase):
+class PrefetchingKernelsOpsTest(test.TestCase):
 
   def setUp(self):
     self._event = threading.Event()
 
-  def _prefetch_fn_helper(self, buffer_name, device0, device1):
-    worker_config = config_pb2.ConfigProto()
-    worker_config.device_count["CPU"] = 2
+  def _create_ds_and_iterator(self, device0, initializable=False):
 
     def gen():
-      for i in itertools.count(start=1, step=1):
-        yield [i + 0.0]
+      for i in range(1, 10):
+        yield [float(i)]
         if i == 6:
           self._event.set()
 
     with ops.device(device0):
-      dataset_3 = dataset_ops.Dataset.from_generator(gen, (dtypes.float32))
-      iterator_3 = dataset_3.make_one_shot_iterator()
-      iterator_3_handle = iterator_3.string_handle()
+      ds = dataset_ops.Dataset.from_generator(gen, (dtypes.float32))
+      if initializable:
+        ds_iterator = ds.make_initializable_iterator()
+      else:
+        ds_iterator = ds.make_one_shot_iterator()
+      return (ds, ds_iterator)
+
+  def _create_ops(self, ds, ds_iterator, buffer_name, device0, device1):
+    ds_iterator_handle = ds_iterator.string_handle()
 
     @function.Defun(dtypes.string)
     def _remote_fn(h):
       remote_iterator = iterator_ops.Iterator.from_string_handle(
-          h, dataset_3.output_types, dataset_3.output_shapes)
+          h, ds.output_types, ds.output_shapes)
       return remote_iterator.get_next()
 
     target = constant_op.constant(device0)
@@ -64,15 +68,28 @@ class StagingAreaOpsTest(test.TestCase):
       buffer_resource_handle = prefetching_ops.function_buffering_resource(
           f=_remote_fn,
           target_device=target,
-          string_arg=iterator_3_handle,
+          string_arg=ds_iterator_handle,
           buffer_size=3,
-          thread_pool_size=2,
           shared_name=buffer_name)
 
     with ops.device(device1):
       prefetch_op = prefetching_ops.function_buffering_resource_get_next(
           function_buffer_resource=buffer_resource_handle,
           output_types=[dtypes.float32])
+      reset_op = prefetching_ops.function_buffering_resource_reset(
+          function_buffer_resource=buffer_resource_handle)
+      destroy_op = resource_variable_ops.destroy_resource_op(
+          buffer_resource_handle, ignore_lookup_error=True)
+
+    return (prefetch_op, reset_op, destroy_op)
+
+  def _prefetch_fn_helper_one_shot(self, buffer_name, device0, device1):
+    worker_config = config_pb2.ConfigProto()
+    worker_config.device_count["CPU"] = 2
+
+    ds, ds_iterator = self._create_ds_and_iterator(device0, initializable=False)
+    prefetch_op, _, destroy_op = self._create_ops(ds, ds_iterator, buffer_name,
+                                                  device0, device1)
 
     with self.test_session(config=worker_config) as sess:
       elem = sess.run(prefetch_op)
@@ -86,26 +103,240 @@ class StagingAreaOpsTest(test.TestCase):
       self._event.wait()
       elem = sess.run(prefetch_op)
       self.assertEqual(elem, [5.0])
-      sess.run(
-          resource_variable_ops.destroy_resource_op(
-              buffer_resource_handle, ignore_lookup_error=True))
+      sess.run(destroy_op)
 
   def testSameDeviceCPU(self):
-    self._prefetch_fn_helper("same_device_cpu",
-                             "/job:localhost/replica:0/task:0/cpu:0",
-                             "/job:localhost/replica:0/task:0/cpu:0")
+    self._prefetch_fn_helper_one_shot("same_device_cpu",
+                                      "/job:localhost/replica:0/task:0/cpu:0",
+                                      "/job:localhost/replica:0/task:0/cpu:0")
 
   def testDifferentDeviceCPU(self):
-    self._prefetch_fn_helper("diff_device_cpu",
-                             "/job:localhost/replica:0/task:0/cpu:0",
-                             "/job:localhost/replica:0/task:0/cpu:1")
+    self._prefetch_fn_helper_one_shot("diff_device_cpu",
+                                      "/job:localhost/replica:0/task:0/cpu:0",
+                                      "/job:localhost/replica:0/task:0/cpu:1")
 
   def testDifferentDeviceCPUGPU(self):
     if not test_util.is_gpu_available():
       self.skipTest("No GPU available")
 
-    self._prefetch_fn_helper("cpu_gpu", "/job:localhost/replica:0/task:0/cpu:0",
-                             "/job:localhost/replica:0/task:0/gpu:0")
+    self._prefetch_fn_helper_one_shot("cpu_gpu",
+                                      "/job:localhost/replica:0/task:0/cpu:0",
+                                      "/job:localhost/replica:0/task:0/gpu:0")
+
+  def testReinitialization(self):
+    worker_config = config_pb2.ConfigProto()
+    worker_config.device_count["CPU"] = 2
+
+    device0 = "/job:localhost/replica:0/task:0/cpu:0"
+    device1 = "/job:localhost/replica:0/task:0/cpu:1"
+    ds, ds_iterator = self._create_ds_and_iterator(device0, initializable=True)
+    prefetch_op, reset_op, destroy_op = self._create_ops(
+        ds, ds_iterator, "reinit", device0, device1)
+
+    with self.test_session(config=worker_config) as sess:
+      sess.run(ds_iterator.initializer)
+      elem = sess.run(prefetch_op)
+      self.assertEqual(elem, [1.0])
+      elem = sess.run(prefetch_op)
+      self.assertEqual(elem, [2.0])
+      elem = sess.run(prefetch_op)
+      self.assertEqual(elem, [3.0])
+      elem = sess.run(prefetch_op)
+      self.assertEqual(elem, [4.0])
+      self._event.wait()
+      elem = sess.run(prefetch_op)
+      self.assertEqual(elem, [5.0])
+      # Lets reset the function buffering resource and reinitialize the
+      # iterator. Should be able to go through this again.
+      self._event.clear()
+      sess.run(reset_op)
+      sess.run(ds_iterator.initializer)
+      elem = sess.run(prefetch_op)
+      self.assertEqual(elem, [1.0])
+      elem = sess.run(prefetch_op)
+      self.assertEqual(elem, [2.0])
+      elem = sess.run(prefetch_op)
+      self.assertEqual(elem, [3.0])
+      elem = sess.run(prefetch_op)
+      self.assertEqual(elem, [4.0])
+      self._event.wait()
+      elem = sess.run(prefetch_op)
+      self.assertEqual(elem, [5.0])
+      sess.run(destroy_op)
+
+  def testReinitializationOutOfRange(self):
+    worker_config = config_pb2.ConfigProto()
+    worker_config.device_count["CPU"] = 2
+
+    device0 = "/job:localhost/replica:0/task:0/cpu:0"
+    device1 = "/job:localhost/replica:0/task:0/cpu:1"
+    ds, ds_iterator = self._create_ds_and_iterator(device0, initializable=True)
+    prefetch_op, reset_op, destroy_op = self._create_ops(
+        ds, ds_iterator, "reinit", device0, device1)
+
+    with self.test_session(config=worker_config) as sess:
+      sess.run(ds_iterator.initializer)
+      for i in range(1, 10):
+        elem = sess.run(prefetch_op)
+        self.assertEqual(elem, [float(i)])
+      # Try fetching after its over twice to test out end of sequence.
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(prefetch_op)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(prefetch_op)
+
+      # Now reset everything and try it out again.
+      self._event.clear()
+      sess.run(reset_op)
+      sess.run(ds_iterator.initializer)
+      for i in range(1, 10):
+        elem = sess.run(prefetch_op)
+        self.assertEqual(elem, [float(i)])
+      # Try fetching after its over twice to test out end of sequence.
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(prefetch_op)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(prefetch_op)
+
+      sess.run(destroy_op)
+
+
+class PrefetchToDeviceTest(test.TestCase):
+
+  def testPrefetchToDevice(self):
+    host_dataset = dataset_ops.Dataset.range(10)
+    device_dataset = host_dataset.apply(
+        prefetching_ops.prefetch_to_device("/cpu:1"))
+
+    # NOTE(mrry): This device block creates the "host" dataset and iterator on
+    # /cpu:0, and ensures that the prefetching is across devices. In typical use
+    # this would not be necessary, because the GPU device would not support any
+    # of the dataset-related ops.
+    with ops.device("/cpu:0"):
+      iterator = device_dataset.make_one_shot_iterator()
+
+    self.assertEqual(host_dataset.output_types, device_dataset.output_types)
+    self.assertEqual(host_dataset.output_types, iterator.output_types)
+    self.assertEqual(host_dataset.output_shapes, device_dataset.output_shapes)
+    self.assertEqual(host_dataset.output_shapes, iterator.output_shapes)
+    self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
+    self.assertEqual(host_dataset.output_classes, iterator.output_classes)
+
+    next_element = iterator.get_next()
+    self.assertEqual(dtypes.int64, next_element.dtype)
+    self.assertEqual([], next_element.shape)
+
+    worker_config = config_pb2.ConfigProto()
+    worker_config.device_count["CPU"] = 2
+    with self.test_session(config=worker_config) as sess:
+      for i in range(10):
+        self.assertEqual(i, sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testPrefetchDictToDevice(self):
+    host_dataset = dataset_ops.Dataset.range(10).map(lambda x: {"a": x})
+    device_dataset = host_dataset.apply(
+        prefetching_ops.prefetch_to_device("/cpu:1"))
+
+    # NOTE(mrry): This device block creates the "host" dataset and iterator on
+    # /cpu:0, and ensures that the prefetching is across devices. In typical use
+    # this would not be necessary, because the GPU device would not support any
+    # of the dataset-related ops.
+    with ops.device("/cpu:0"):
+      iterator = device_dataset.make_one_shot_iterator()
+
+    self.assertEqual(host_dataset.output_types, device_dataset.output_types)
+    self.assertEqual(host_dataset.output_types, iterator.output_types)
+    self.assertEqual(host_dataset.output_shapes, device_dataset.output_shapes)
+    self.assertEqual(host_dataset.output_shapes, iterator.output_shapes)
+    self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
+    self.assertEqual(host_dataset.output_classes, iterator.output_classes)
+
+    next_element = iterator.get_next()
+    self.assertEqual(dtypes.int64, next_element["a"].dtype)
+    self.assertEqual([], next_element["a"].shape)
+
+    worker_config = config_pb2.ConfigProto()
+    worker_config.device_count["CPU"] = 2
+    with self.test_session(config=worker_config) as sess:
+      for i in range(10):
+        self.assertEqual({"a": i}, sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testPrefetchToDeviceGpu(self):
+    if not test_util.is_gpu_available():
+      self.skipTest("No GPU available")
+
+    host_dataset = dataset_ops.Dataset.range(10)
+    device_dataset = host_dataset.apply(
+        prefetching_ops.prefetch_to_device("/gpu:0"))
+
+    iterator = device_dataset.make_one_shot_iterator()
+    next_element = iterator.get_next()
+
+    with self.test_session() as sess:
+      for i in range(10):
+        self.assertEqual(i, sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testPrefetchToDeviceWithReInit(self):
+    host_dataset = dataset_ops.Dataset.range(10)
+    device_dataset = host_dataset.apply(
+        prefetching_ops.prefetch_to_device("/cpu:1"))
+
+    # NOTE(mrry): This device block creates the "host" dataset and iterator on
+    # /cpu:0, and ensures that the prefetching is across devices. In typical use
+    # this would not be necessary, because the GPU device would not support any
+    # of the dataset-related ops.
+    with ops.device("/cpu:0"):
+      iterator = device_dataset.make_initializable_iterator()
+
+    self.assertEqual(host_dataset.output_types, device_dataset.output_types)
+    self.assertEqual(host_dataset.output_types, iterator.output_types)
+    self.assertEqual(host_dataset.output_shapes, device_dataset.output_shapes)
+    self.assertEqual(host_dataset.output_shapes, iterator.output_shapes)
+    self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
+    self.assertEqual(host_dataset.output_classes, iterator.output_classes)
+
+    next_element = iterator.get_next()
+    self.assertEqual(dtypes.int64, next_element.dtype)
+    self.assertEqual([], next_element.shape)
+
+    worker_config = config_pb2.ConfigProto()
+    worker_config.device_count["CPU"] = 2
+    with self.test_session(config=worker_config) as sess:
+      sess.run(iterator.initializer)
+      for i in range(5):
+        self.assertEqual(i, sess.run(next_element))
+      sess.run(iterator.initializer)
+      for i in range(10):
+        self.assertEqual(i, sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testPrefetchToDeviceGpuWithReInit(self):
+    if not test_util.is_gpu_available():
+      self.skipTest("No GPU available")
+
+    host_dataset = dataset_ops.Dataset.range(10)
+    device_dataset = host_dataset.apply(
+        prefetching_ops.prefetch_to_device("/gpu:0"))
+
+    iterator = device_dataset.make_initializable_iterator()
+    next_element = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(iterator.initializer)
+      for i in range(5):
+        self.assertEqual(i, sess.run(next_element))
+      sess.run(iterator.initializer)
+      for i in range(10):
+        self.assertEqual(i, sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
index 699e8e7865502facd05e0c4d6d4f01b80f7c050c..6ee1b572f121a9a40dfd638f7a858d5f1176ea3c 100644
--- a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
@@ -35,9 +35,7 @@ from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.lib.io import python_io
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import parsing_ops
-from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
 from tensorflow.python.util import compat
 
@@ -568,12 +566,20 @@ class MakeCsvDatasetTest(test.TestCase):
       dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float64, dtypes.string
   ]
   COLUMNS = ["col%d" % i for i in range(len(COLUMN_TYPES))]
+  DEFAULT_VALS = [[], [], [], [], ["NULL"]]
+  DEFAULTS = [
+      constant_op.constant([], dtype=dtypes.int32),
+      constant_op.constant([], dtype=dtypes.int64),
+      constant_op.constant([], dtype=dtypes.float32),
+      constant_op.constant([], dtype=dtypes.float64),
+      constant_op.constant(["NULL"], dtype=dtypes.string)
+  ]
   LABEL = COLUMNS[0]
 
   def setUp(self):
     super(MakeCsvDatasetTest, self).setUp()
     self._num_files = 2
-    self._num_records = 7
+    self._num_records = 11
     self._test_filenames = self._create_files()
 
   def _csv_values(self, fileno, recordno):
@@ -588,49 +594,63 @@ class MakeCsvDatasetTest(test.TestCase):
   def _csv_record(self, fileno, recordno):
     return ",".join(str(v) for v in self._csv_values(fileno, recordno))
 
+  def _create_file(self, fileno, header=True, comment=True):
+    fn = os.path.join(self.get_temp_dir(), "csv_file%d.csv" % fileno)
+    f = open(fn, "w")
+    if header:
+      f.write(",".join(self.COLUMNS) + "\n")
+    for recno in range(self._num_records):
+      f.write(self._csv_record(fileno, recno) + "\n")
+      if comment:
+        f.write("# Some comment goes here. Should be ignored!\n")
+    f.close()
+    return fn
+
   def _create_files(self):
     filenames = []
     for i in range(self._num_files):
-      fn = os.path.join(self.get_temp_dir(), "csv_file%d.csv" % i)
-      filenames.append(fn)
-      f = open(fn, "w")
-      f.write(",".join(self.COLUMNS) + "\n")  # header line
-      for j in range(self._num_records):
-        f.write(self._csv_record(i, j) + "\n")
-        f.write("# Some comment goes here. Should be ignored!\n")
-      f.close()
+      filenames.append(self._create_file(i))
     return filenames
 
-  def _make_csv_dataset(self,
-                        filenames,
-                        defaults,
-                        label_key=LABEL,
-                        batch_size=1,
-                        num_epochs=1,
-                        shuffle=False,
-                        shuffle_seed=None):
+  def _make_csv_dataset(
+      self,
+      filenames,
+      defaults,
+      column_names=COLUMNS,
+      label_name=LABEL,
+      batch_size=1,
+      num_epochs=1,
+      shuffle=False,
+      shuffle_seed=None,
+      header=True,
+      comment="#",
+      na_value="",
+      default_float_type=dtypes.float32,
+  ):
     return readers.make_csv_dataset(
         filenames,
-        column_keys=self.COLUMNS,
-        column_defaults=defaults,
-        label_key=label_key,
         batch_size=batch_size,
+        column_names=column_names,
+        column_defaults=defaults,
+        label_name=label_name,
         num_epochs=num_epochs,
         shuffle=shuffle,
         shuffle_seed=shuffle_seed,
-        skip=1,
-        filter_fn=
-        lambda line: math_ops.not_equal(string_ops.substr(line, 0, 1), "#"),
+        header=header,
+        comment=comment,
+        na_value=na_value,
+        default_float_type=default_float_type,
     )
 
-  def _next_actual_batch(self, file_indices, batch_size, num_epochs):
+  def _next_actual_batch(self, file_indices, batch_size, num_epochs, defaults):
     features = {col: list() for col in self.COLUMNS}
     for _ in range(num_epochs):
       for i in file_indices:
         for j in range(self._num_records):
           values = self._csv_values(i, j)
-          if not values[-1]:
-            values[-1] = "NULL"  # null values in csv are interpreted as default
+          for n, v in enumerate(values):
+            if v == "":  # pylint: disable=g-explicit-bool-comparison
+              values[n] = defaults[n][0]
           values[-1] = values[-1].encode("utf-8")
 
           # Regroup lists by column instead of row
@@ -651,7 +671,8 @@ class MakeCsvDatasetTest(test.TestCase):
       sess,
       dataset,
       file_indices,
-      label_key=LABEL,
+      defaults=tuple(DEFAULT_VALS),
+      label_name=LABEL,
       batch_size=1,
       num_epochs=1,
   ):
@@ -659,11 +680,11 @@ class MakeCsvDatasetTest(test.TestCase):
     get_next = iterator.get_next()
 
     for expected_features in self._next_actual_batch(file_indices, batch_size,
-                                                     num_epochs):
+                                                     num_epochs, defaults):
       actual_features = sess.run(get_next)
 
-      if label_key is not None:
-        expected_labels = expected_features.pop(label_key)
+      if label_name is not None:
+        expected_labels = expected_features.pop(label_name)
         # Compare labels
         self.assertAllEqual(expected_labels, actual_features[1])
         actual_features = actual_features[0]  # Extract features dict from tuple
@@ -676,10 +697,7 @@ class MakeCsvDatasetTest(test.TestCase):
       sess.run(get_next)
 
   def test_make_csv_dataset(self):
-    defaults = [
-        constant_op.constant([], dtype=d) for d in self.COLUMN_TYPES[:-1]
-    ]
-    defaults.append(constant_op.constant(["NULL"], dtype=dtypes.string))
+    defaults = self.DEFAULTS
 
     with ops.Graph().as_default() as g:
       with self.test_session(graph=g) as sess:
@@ -705,11 +723,26 @@ class MakeCsvDatasetTest(test.TestCase):
         self._verify_records(
             sess, dataset, range(self._num_files), batch_size=2, num_epochs=10)
 
+  def test_make_csv_dataset_with_bad_columns(self):
+    """Tests that exception is raised when input is malformed.
+    """
+    dupe_columns = self.COLUMNS[:-1] + self.COLUMNS[:1]
+    defaults = self.DEFAULTS
+
+    # Duplicate column names
+    with self.assertRaises(ValueError):
+      self._make_csv_dataset(
+          self._test_filenames, defaults, column_names=dupe_columns)
+
+    # Label key not one of column names
+    with self.assertRaises(ValueError):
+      self._make_csv_dataset(
+          self._test_filenames, defaults, label_name="not_a_real_label")
+
   def test_make_csv_dataset_with_no_label(self):
-    defaults = [
-        constant_op.constant([], dtype=d) for d in self.COLUMN_TYPES[:-1]
-    ]
-    defaults.append(constant_op.constant(["NULL"], dtype=dtypes.string))
+    """Tests that CSV datasets can be created when no label is specified.
+    """
+    defaults = self.DEFAULTS
     with ops.Graph().as_default() as g:
       with self.test_session(graph=g) as sess:
         # Read from both files. Make sure this works with no label key supplied.
@@ -718,16 +751,64 @@ class MakeCsvDatasetTest(test.TestCase):
             defaults,
             batch_size=2,
             num_epochs=10,
-            label_key=None)
+            label_name=None)
         self._verify_records(
             sess,
             dataset,
             range(self._num_files),
             batch_size=2,
             num_epochs=10,
-            label_key=None)
+            label_name=None)
+
+  def test_make_csv_dataset_with_no_comments(self):
+    """Tests that datasets can be created from CSV files with no header line.
+    """
+    defaults = self.DEFAULTS
+    file_without_header = self._create_file(
+        len(self._test_filenames), comment=False)
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        dataset = self._make_csv_dataset(
+            file_without_header,
+            defaults,
+            batch_size=2,
+            num_epochs=10,
+            comment=None,
+        )
+        self._verify_records(
+            sess,
+            dataset,
+            [len(self._test_filenames)],
+            batch_size=2,
+            num_epochs=10,
+        )
+
+  def test_make_csv_dataset_with_no_header(self):
+    """Tests that datasets can be created from CSV files with no header line.
+    """
+    defaults = self.DEFAULTS
+    file_without_header = self._create_file(
+        len(self._test_filenames), header=False)
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        dataset = self._make_csv_dataset(
+            file_without_header,
+            defaults,
+            batch_size=2,
+            num_epochs=10,
+            header=False,
+        )
+        self._verify_records(
+            sess,
+            dataset,
+            [len(self._test_filenames)],
+            batch_size=2,
+            num_epochs=10,
+        )
 
   def test_make_csv_dataset_with_types(self):
+    """Tests that defaults can be a dtype instead of a Tensor for required vals.
+    """
     defaults = [d for d in self.COLUMN_TYPES[:-1]]
     defaults.append(constant_op.constant(["NULL"], dtype=dtypes.string))
     with ops.Graph().as_default() as g:
@@ -735,10 +816,109 @@ class MakeCsvDatasetTest(test.TestCase):
         dataset = self._make_csv_dataset(self._test_filenames, defaults)
         self._verify_records(sess, dataset, range(self._num_files))
 
+  def test_make_csv_dataset_with_no_col_names(self):
+    """Tests that datasets can be created when column names are not specified.
+
+    In that case, we should infer the column names from the header lines.
+    """
+    defaults = self.DEFAULTS
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        # Read from both files. Exercise the `batch` and `num_epochs` parameters
+        # of make_csv_dataset and make sure they work.
+        dataset = self._make_csv_dataset(
+            self._test_filenames,
+            defaults,
+            column_names=None,
+            batch_size=2,
+            num_epochs=10)
+        self._verify_records(
+            sess, dataset, range(self._num_files), batch_size=2, num_epochs=10)
+
+  def test_make_csv_dataset_type_inference(self):
+    """Tests that datasets can be created when no defaults are specified.
+
+    In that case, we should infer the types from the first N records.
+    """
+    # Test that it works with standard test files (with comments, header, etc)
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        dataset = self._make_csv_dataset(
+            self._test_filenames, defaults=None, batch_size=2, num_epochs=10)
+        self._verify_records(
+            sess,
+            dataset,
+            range(self._num_files),
+            batch_size=2,
+            num_epochs=10,
+            defaults=[[], [], [], [], [""]])
+
+    # Test on a deliberately tricky file
+    fn = os.path.join(self.get_temp_dir(), "file.csv")
+    expected_dtypes = [
+        dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float32,
+        dtypes.string, dtypes.string
+    ]
+    rows = [[0, 0, 0, "NAN", "", "a"], [1, 2**31 + 1, 2**64, 123, "NAN", ""],
+            ['"123"', 2, 2**64, 123.4, "NAN", '"cd,efg"']]
+    expected = [[0, 0, 0, 0, "", "a"], [1, 2**31 + 1, 2**64, 123, "", ""],
+                [123, 2, 2**64, 123.4, "", "cd,efg"]]
+    for row in expected:
+      row[-1] = row[-1].encode("utf-8")  # py3 expects byte strings
+      row[-2] = row[-2].encode("utf-8")  # py3 expects byte strings
+    col_names = ["col%d" % i for i in range(len(expected_dtypes))]
+    with open(fn, "w") as f:
+      f.write(",".join(col_names))
+      f.write("\n")
+      for row in rows:
+        f.write(",".join([str(v) if v else "" for v in row]) + "\n")
+
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        dataset = self._make_csv_dataset(
+            fn,
+            defaults=None,
+            column_names=None,
+            batch_size=1,
+            num_epochs=1,
+            label_name=None,
+            na_value="NAN",
+            default_float_type=dtypes.float32,
+        )
+        features = dataset.make_one_shot_iterator().get_next()
+        # Check that types match
+        for i in range(len(expected_dtypes)):
+          assert features["col%d" % i].dtype == expected_dtypes[i]
+        for i in range(len(rows)):
+          assert sess.run(features) == dict(zip(col_names, expected[i]))
+
+    # With float64 as default type for floats
+    expected_dtypes = [
+        dtypes.int32, dtypes.int64, dtypes.float64, dtypes.float64,
+        dtypes.string, dtypes.string
+    ]
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        dataset = self._make_csv_dataset(
+            fn,
+            defaults=None,
+            column_names=None,
+            batch_size=1,
+            num_epochs=1,
+            label_name=None,
+            na_value="NAN",
+            default_float_type=dtypes.float64,
+        )
+        features = dataset.make_one_shot_iterator().get_next()
+        # Check that types match
+        for i in range(len(expected_dtypes)):
+          assert features["col%d" % i].dtype == expected_dtypes[i]
+        for i in range(len(rows)):
+          assert sess.run(features) == dict(zip(col_names, expected[i]))
+
   def test_make_csv_dataset_with_shuffle(self):
     total_records = self._num_files * self._num_records
-    defaults = [d for d in self.COLUMN_TYPES[:-1]]
-    defaults.append(constant_op.constant(["NULL"], dtype=dtypes.string))
+    defaults = self.DEFAULTS
     for batch_size in [1, 2]:
       with ops.Graph().as_default() as g:
         with self.test_session(graph=g) as sess:
diff --git a/tensorflow/contrib/data/python/kernel_tests/resample_test.py b/tensorflow/contrib/data/python/kernel_tests/resample_test.py
index 3c7b46629edb13459766b5ef3f392e8d00ad4db8..5f47dcb33999119a690bd633f0c97a12a1ae1c84 100644
--- a/tensorflow/contrib/data/python/kernel_tests/resample_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/resample_test.py
@@ -21,7 +21,10 @@ import numpy as np
 
 from tensorflow.contrib.data.python.ops import resampling
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
 from tensorflow.python.util import compat
@@ -45,12 +48,10 @@ class ResampleTest(test.TestCase):
                 target_dist=target_dist,
                 initial_dist=initial_dist,
                 class_func=lambda c, _: c,
-                seed=27)).make_initializable_iterator())
-    init_op = iterator.initializer
+                seed=27)).make_one_shot_iterator())
     get_next = iterator.get_next()
 
     with self.test_session() as sess:
-      sess.run(init_op)
       returned = []
       with self.assertRaises(errors.OutOfRangeError):
         while True:
@@ -70,6 +71,43 @@ class ResampleTest(test.TestCase):
     returned_dist = class_counts / total_returned
     self.assertAllClose(target_dist, returned_dist, atol=1e-2)
 
+  def testRandomClasses(self):
+    init_dist = [0.25, 0.25, 0.25, 0.25]
+    target_dist = [0.0, 0.0, 0.0, 1.0]
+    num_classes = len(init_dist)
+    # We don't need many samples to test a dirac-delta target distribution
+    num_samples = 100
+    data_np = np.random.choice(num_classes, num_samples, p=init_dist)
+
+    dataset = dataset_ops.Dataset.from_tensor_slices(data_np)
+
+    # Apply a random mapping that preserves the data distribution.
+    def _remap_fn(_):
+      return math_ops.cast(random_ops.random_uniform([1]) * num_classes,
+                           dtypes.int32)[0]
+    dataset = dataset.map(_remap_fn)
+
+    # Reshape distribution.
+    dataset = dataset.apply(
+        resampling.rejection_resample(
+            class_func=lambda x: x,
+            target_dist=target_dist,
+            initial_dist=init_dist))
+
+    get_next = dataset.make_one_shot_iterator().get_next()
+
+    with self.test_session() as sess:
+      returned = []
+      with self.assertRaises(errors.OutOfRangeError):
+        while True:
+          returned.append(sess.run(get_next))
+
+    classes, _ = zip(*returned)
+    bincount = np.bincount(
+        np.array(classes),
+        minlength=num_classes).astype(np.float32) / len(classes)
+
+    self.assertAllClose(target_dist, bincount, atol=1e-2)
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py
index 36ddf3004237ed042f21d691d83eafbaa20621e6..b13ad9ba4e533e1bcef5161d983c8e6578d549b2 100644
--- a/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py
@@ -47,6 +47,11 @@ class SequenceDatasetSerializationTest(
     # Skip nothing
     self.run_core_tests(lambda: self._build_skip_dataset(0), None, 10)
 
+  def testInvalidSkip(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Shape must be rank 0 but is rank 1'):
+      self.run_core_tests(lambda: self._build_skip_dataset([1, 2]), None, 0)
+
   def _build_take_dataset(self, count):
     components = (np.arange(10),)
     return dataset_ops.Dataset.from_tensor_slices(components).take(count)
@@ -69,6 +74,11 @@ class SequenceDatasetSerializationTest(
     # Take nothing
     self.run_core_tests(lambda: self._build_take_dataset(0), None, 0)
 
+  def testInvalidTake(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Shape must be rank 0 but is rank 1'):
+      self.run_core_tests(lambda: self._build_take_dataset([1, 2]), None, 0)
+
   def _build_repeat_dataset(self, count, take_count=3):
     components = (np.arange(10),)
     return dataset_ops.Dataset.from_tensor_slices(components).take(
@@ -100,6 +110,12 @@ class SequenceDatasetSerializationTest(
     # Test repeat empty dataset
     self.run_core_tests(lambda: self._build_repeat_dataset(-1, 0), None, 0)
 
+  def testInvalidRepeat(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Shape must be rank 0 but is rank 1'):
+      self.run_core_tests(lambda: self._build_repeat_dataset([1, 2], 0),
+                          None, 0)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD
index c3331e963602d60fe27dd44b0cc06dfb20ca2b6a..a1a5c9ed05ff226086885e4e204875d3ca933590 100644
--- a/tensorflow/contrib/data/python/ops/BUILD
+++ b/tensorflow/contrib/data/python/ops/BUILD
@@ -72,14 +72,18 @@ py_library(
         "//tensorflow/python:dataset_ops_gen",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:lib",
+        "//tensorflow/python:math_ops",
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python:platform",
         "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:string_ops",
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python:util",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/ops:readers",
         "//tensorflow/python/data/util:nest",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -115,6 +119,7 @@ py_library(
     deps = [
         ":contrib_op_loader",
         ":gen_dataset_ops",
+        "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dataset_ops_gen",
@@ -173,17 +178,9 @@ py_library(
     srcs = ["prefetching_ops.py"],
     deps = [
         ":contrib_op_loader",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/data/python/ops/batching.py b/tensorflow/contrib/data/python/ops/batching.py
index 6eb512dec67cb7b9c8c4518d03aee0b436205f9a..1eba010b562a60ec9469f808fd657ca330a8f5d9 100644
--- a/tensorflow/contrib/data/python/ops/batching.py
+++ b/tensorflow/contrib/data/python/ops/batching.py
@@ -17,6 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.contrib.framework import with_shape
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import sparse
@@ -345,16 +346,61 @@ class _RestructuredDataset(dataset_ops.Dataset):
     return self._output_shapes
 
 
+def assert_element_shape(expected_shapes):
+  """Assert the shape of this `Dataset`.
+
+  ```python
+  shapes = [tf.TensorShape([16, 256]), tf.TensorShape(None)]
+  result = dataset.apply(tf.contrib.data.assert_element_shape(shapes))
+  print(result.output_shapes)  # ==> "((16, 256), <unknown>)"
+  ```
+
+  If dataset shapes and expected_shape, are fully defined, assert they match.
+  Otherwise, add assert op that will validate the shapes when tensors are
+  evaluated, and set shapes on tensors, respectively.
+
+  Args:
+    expected_shapes: A nested structure of `tf.TensorShape` objects.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    @{tf.data.Dataset.apply}
+  """
+
+  def _check_shape(*elements):
+    flatten_tensors = nest.flatten(elements)
+    flatten_shapes = nest.flatten(expected_shapes)
+    checked_tensors = [with_shape(shape, tensor)
+                       for shape, tensor in zip(flatten_shapes,
+                                                flatten_tensors)]
+    return nest.pack_sequence_as(elements, checked_tensors)
+
+  def _apply_fn(dataset):
+    return _RestructuredDataset(
+        dataset.map(_check_shape),
+        dataset.output_types,
+        output_shapes=expected_shapes,
+        output_classes=dataset.output_classes)
+
+  return _apply_fn
+
+
 class _MapAndBatchDataset(dataset_ops.MapDataset):
   """A `Dataset` that maps a function over a batch of elements."""
 
-  def __init__(self, input_dataset, map_func, batch_size, num_parallel_batches):
+  def __init__(self, input_dataset, map_func, batch_size, num_parallel_batches,
+               drop_remainder):
     """See `Dataset.map()` for details."""
     super(_MapAndBatchDataset, self).__init__(input_dataset, map_func)
-    self._batch_size = ops.convert_to_tensor(
+    self._batch_size_t = ops.convert_to_tensor(
         batch_size, dtype=dtypes.int64, name="batch_size")
-    self._num_parallel_batches = ops.convert_to_tensor(
+    self._num_parallel_batches_t = ops.convert_to_tensor(
         num_parallel_batches, dtype=dtypes.int64, name="num_parallel_batches")
+    self._drop_remainder_t = ops.convert_to_tensor(
+        drop_remainder, dtype=dtypes.bool, name="drop_remainder")
+
+    self._batch_size = batch_size
+    self._drop_remainder = drop_remainder
 
   def _as_variant_tensor(self):
     # pylint: disable=protected-access
@@ -363,8 +409,9 @@ class _MapAndBatchDataset(dataset_ops.MapDataset):
         input_resource,
         self._map_func.captured_inputs,
         f=self._map_func,
-        batch_size=self._batch_size,
-        num_parallel_batches=self._num_parallel_batches,
+        batch_size=self._batch_size_t,
+        num_parallel_batches=self._num_parallel_batches_t,
+        drop_remainder=self._drop_remainder_t,
         output_types=nest.flatten(
             sparse.as_dense_types(self.output_types, self.output_classes)),
         output_shapes=nest.flatten(
@@ -373,9 +420,9 @@ class _MapAndBatchDataset(dataset_ops.MapDataset):
 
   @property
   def output_shapes(self):
+    dim = self._batch_size if self._drop_remainder else None
     return nest.pack_sequence_as(self._output_shapes, [
-        tensor_shape.vector(tensor_util.constant_value(
-            self._batch_size)).concatenate(s)
+        tensor_shape.vector(dim).concatenate(s)
         for s in nest.flatten(self._output_shapes)
     ])
 
@@ -384,7 +431,10 @@ class _MapAndBatchDataset(dataset_ops.MapDataset):
     return self._output_types
 
 
-def map_and_batch(map_func, batch_size, num_parallel_batches=1):
+def map_and_batch(map_func,
+                  batch_size,
+                  num_parallel_batches=1,
+                  drop_remainder=False):
   """Fused implementation of `map` and `batch`.
 
   Maps `map_func` across `batch_size` consecutive elements of this dataset
@@ -404,6 +454,9 @@ def map_and_batch(map_func, batch_size, num_parallel_batches=1):
       number of batches to create in parallel. On one hand, higher values can
       help mitigate the effect of stragglers. On the other hand, higher values
       can increase contention if CPU is scarce.
+    drop_remainder: A `tf.bool` scalar `tf.Tensor`, representing whether the
+      last batch should be dropped in case its size is smaller than desired;
+      the default behavior is not to drop the smaller batch.
 
   Returns:
     A `Dataset` transformation function, which can be passed to
@@ -412,6 +465,6 @@ def map_and_batch(map_func, batch_size, num_parallel_batches=1):
 
   def _apply_fn(dataset):
     return _MapAndBatchDataset(dataset, map_func, batch_size,
-                               num_parallel_batches)
+                               num_parallel_batches, drop_remainder)
 
   return _apply_fn
diff --git a/tensorflow/contrib/data/python/ops/counter.py b/tensorflow/contrib/data/python/ops/counter.py
index 63226fe78163c59025623a362d17c400fbe57c67..6ef65f9624601286691505a795a86dd6226eead1 100644
--- a/tensorflow/contrib/data/python/ops/counter.py
+++ b/tensorflow/contrib/data/python/ops/counter.py
@@ -25,7 +25,7 @@ from tensorflow.python.framework import ops
 
 
 def Counter(start=0, step=1, dtype=dtypes.int64):
-  """Creates a `Dataset` of a `step`-separated count startin from `start`.
+  """Creates a `Dataset` that counts from `start` in steps of size `step`.
 
   For example:
 
@@ -38,12 +38,13 @@ def Counter(start=0, step=1, dtype=dtypes.int64):
   ```
 
   Args:
-    start: starting value for count.
-    step: step size.
-    dtype: counter data type.
+    start: (Optional.) The starting value for the counter. Defaults to 0.
+    step: (Optional.) The step size for the counter. Defaults to 1.
+    dtype: (Optional.) The data type for counter elements. Defaults to
+      `tf.int64`.
 
   Returns:
-    A `Dataset` of scalar elements.
+    A `Dataset` of scalar `dtype` elements.
   """
   with ops.name_scope("counter"):
     start = ops.convert_to_tensor(start, dtype=dtype, name="start")
diff --git a/tensorflow/contrib/data/python/ops/grouping.py b/tensorflow/contrib/data/python/ops/grouping.py
index a19be222545ef0242502ec07badbdae5c7634a0c..36591c055ae8f2c54981525ffcc3df128a990a61 100644
--- a/tensorflow/contrib/data/python/ops/grouping.py
+++ b/tensorflow/contrib/data/python/ops/grouping.py
@@ -42,7 +42,7 @@ def group_by_window(key_func,
   This transformation maps each consecutive element in a dataset to a key
   using `key_func` and groups the elements by key. It then applies
   `reduce_func` to at most `window_size_func(key)` elements matching the same
-  key. All execpt the final window for each key will contain
+  key. All except the final window for each key will contain
   `window_size_func(key)` elements; the final window may be smaller.
 
   You may provide either a constant `window_size` or a window size determined by
@@ -140,9 +140,9 @@ def bucket_by_sequence_length(element_length_func,
 
     batch_sizes = constant_op.constant(bucket_batch_sizes, dtype=dtypes.int64)
 
-    def element_to_bucket_id(element):
+    def element_to_bucket_id(*args):
       """Return int64 id of the length bucket for this element."""
-      seq_length = element_length_func(element)
+      seq_length = element_length_func(*args)
 
       boundaries = list(bucket_boundaries)
       buckets_min = [np.iinfo(np.int32).min] + boundaries
diff --git a/tensorflow/contrib/data/python/ops/prefetching_ops.py b/tensorflow/contrib/data/python/ops/prefetching_ops.py
index 7059b358f349e0ec847e85c37652012d48ed910a..77e23d0319e7f163f208c90bc0d5643520a4b466 100644
--- a/tensorflow/contrib/data/python/ops/prefetching_ops.py
+++ b/tensorflow/contrib/data/python/ops/prefetching_ops.py
@@ -17,8 +17,18 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import warnings
+
 from tensorflow.contrib.data.python.ops import contrib_op_loader  # pylint: disable=unused-import
 from tensorflow.contrib.data.python.ops import gen_dataset_ops
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import sparse
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
 
 
 # TODO(rohanj): Add a python class that constructs resource in the __init__
@@ -27,7 +37,6 @@ def function_buffering_resource(string_arg,
                                 target_device,
                                 f,
                                 buffer_size,
-                                thread_pool_size=1,
                                 container="",
                                 shared_name=None,
                                 name=None):
@@ -39,7 +48,6 @@ def function_buffering_resource(string_arg,
       shared_name=shared_name,
       f=f,
       buffer_size=buffer_size,
-      thread_pool_size=thread_pool_size,
       container=container,
       name=name)
 
@@ -51,3 +59,189 @@ def function_buffering_resource_get_next(function_buffer_resource,
       function_buffer_resource=function_buffer_resource,
       output_types=output_types,
       name=name)
+
+
+def function_buffering_resource_reset(function_buffer_resource, name=None):
+  return gen_dataset_ops.function_buffering_resource_reset(
+      function_buffer_resource=function_buffer_resource, name=name)
+
+
+# pylint: disable=protected-access
+class _PrefetchToDeviceIterator(object):
+  """A replacement for @{tf.data.Iterator} that prefetches to another device.
+
+  Args:
+    input_dataset: The input dataset
+    one_shot: If true, we make a one shot iterator that's already initialized.
+    device: A fully specified device string where we want to prefetch to
+    buffer_size: Size of the prefetching buffer.
+    shared_name: (Optional.) If non-empty, the returned iterator will be
+        shared under the given name across multiple sessions that share the
+        same devices (e.g. when using a remote server).
+
+  Returns:
+    An Iterator type object.
+  """
+
+  def __init__(self,
+               input_dataset,
+               one_shot,
+               device,
+               buffer_size,
+               shared_name=None):
+    self._input_dataset = input_dataset
+    self._get_next_call_count = 0
+    self._one_shot = one_shot
+    if shared_name is None:
+      shared_name = ""
+
+    if self._one_shot:
+      self._input_iterator = input_dataset.make_one_shot_iterator()
+    else:
+      self._input_iterator = iterator_ops.Iterator.from_structure(
+          self._input_dataset.output_types, self._input_dataset.output_shapes,
+          shared_name, self._input_dataset.output_classes)
+    input_iterator_handle = self._input_iterator.string_handle()
+
+    @function.Defun(dtypes.string)
+    def _prefetch_fn(handle):
+      """Prefetches one element from `input_iterator`."""
+      remote_iterator = iterator_ops.Iterator.from_string_handle(
+          handle, self._input_iterator.output_types,
+          self._input_iterator.output_shapes,
+          self._input_iterator.output_classes)
+      ret = remote_iterator.get_next()
+
+      # Convert any `SparseTensorValue`s to `SparseTensor`s.
+      ret = nest.pack_sequence_as(ret, [
+          sparse_tensor_lib.SparseTensor.from_value(t)
+          if sparse_tensor_lib.is_sparse(t) else t for t in nest.flatten(ret)
+      ])
+
+      # Serialize any sparse tensors and convert result to tensors.
+      ret = nest.pack_sequence_as(ret, [
+          ops.convert_to_tensor(t)
+          for t in nest.flatten(sparse.serialize_sparse_tensors(ret))
+      ])
+      return nest.flatten(ret)
+
+    with ops.device(device):
+      self._buffering_resource = function_buffering_resource(
+          f=_prefetch_fn,
+          target_device=gen_dataset_ops.iterator_get_device(
+              self._input_iterator._iterator_resource),
+          string_arg=input_iterator_handle,
+          buffer_size=buffer_size,
+          shared_name=shared_name)
+
+    if not self._one_shot:
+      reset_op = function_buffering_resource_reset(self._buffering_resource)
+      with ops.control_dependencies([reset_op]):
+        self._initializer = self._input_iterator.make_initializer(
+            self._input_dataset)
+
+  def get_next(self, name=None):
+    """See @{tf.data.Iterator.get_next}."""
+    self._get_next_call_count += 1
+    if self._get_next_call_count > iterator_ops.GET_NEXT_CALL_WARNING_THRESHOLD:
+      warnings.warn(iterator_ops.GET_NEXT_CALL_WARNING_MESSAGE)
+
+    flat_ret = gen_dataset_ops.function_buffering_resource_get_next(
+        self._buffering_resource,
+        output_types=nest.flatten(sparse.as_dense_types(
+            self.output_types, self.output_classes)), name=name)
+
+    ret = sparse.deserialize_sparse_tensors(
+        nest.pack_sequence_as(self.output_types, flat_ret),
+        self.output_types, self.output_shapes, self.output_classes)
+
+    for tensor, shape in zip(
+        nest.flatten(ret), nest.flatten(self.output_shapes)):
+      if isinstance(tensor, ops.Tensor):
+        tensor.set_shape(shape)
+
+    return ret
+
+  @property
+  def initializer(self):
+    if self._one_shot:
+      raise NotImplementedError("Can't initialize a one_shot_iterator")
+    return self._initializer
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
+
+  @property
+  def output_shapes(self):
+    return self._input_dataset.output_shapes
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
+# pylint: enable=protected-access
+
+
+class _PrefetchToDeviceDataset(dataset_ops.Dataset):
+  """A `Dataset` whose iterator prefetches elements to another device."""
+
+  def __init__(self, input_dataset, device, buffer_size):
+    self._input_dataset = input_dataset
+    self._device = device
+    self._buffer_size = buffer_size if buffer_size is not None else 1
+
+  def make_one_shot_iterator(self):
+    return _PrefetchToDeviceIterator(
+        self._input_dataset,
+        one_shot=True,
+        device=self._device,
+        buffer_size=self._buffer_size)
+
+  def make_initializable_iterator(self, shared_name=None):
+    return _PrefetchToDeviceIterator(
+        self._input_dataset,
+        one_shot=False,
+        device=self._device,
+        buffer_size=self._buffer_size,
+        shared_name=shared_name)
+
+  def _as_variant_tensor(self):
+    # TODO(mrry): Raise this error earlier (e.g. when one of the Dataset
+    # transformation methods is called.
+    # TODO(mrry): Investigate support for chaining further transformations after
+    # the prefetch, including GPU support.
+    raise NotImplementedError("`prefetch_to_device()` must be the last "
+                              "transformation in a dataset pipeline.")
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
+
+  @property
+  def output_shapes(self):
+    return self._input_dataset.output_shapes
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
+
+
+def prefetch_to_device(device, buffer_size=None):
+  """A transformation that prefetches dataset values to the given `device`.
+
+  NOTE: Although the transformation creates a @{tf.data.Dataset}, the
+  transformation must be the final `Dataset` in the input pipeline.
+
+  Args:
+    device: A string. The name of a device to which elements will be prefetched.
+    buffer_size: (Optional.) The number of elements to buffer on `device`.
+      Defaults to an automatically chosen value.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    @{tf.data.Dataset.apply}.
+  """
+  def _apply_fn(dataset):
+    return _PrefetchToDeviceDataset(dataset, device, buffer_size)
+
+  return _apply_fn
diff --git a/tensorflow/contrib/data/python/ops/readers.py b/tensorflow/contrib/data/python/ops/readers.py
index f70f9c881df168564cbf2431bbc2ebdf7e7f7ded..9a48aa02fba4813fc670364bda7f91c0ce091a45 100644
--- a/tensorflow/contrib/data/python/ops/readers.py
+++ b/tensorflow/contrib/data/python/ops/readers.py
@@ -17,6 +17,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import csv
+from math import ceil
+
+import numpy as np
+
+from tensorflow.contrib.data.python.ops import batching
 from tensorflow.contrib.data.python.ops import interleave_ops
 from tensorflow.contrib.data.python.ops import shuffle_ops
 from tensorflow.python.data.ops import dataset_ops
@@ -26,8 +32,11 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import parsing_ops
+from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import gfile
 from tensorflow.python.util import deprecation
 
@@ -35,21 +44,145 @@ _ACCEPTABLE_CSV_TYPES = (dtypes.float32, dtypes.float64, dtypes.int32,
                          dtypes.int64, dtypes.string)
 
 
+def _is_valid_int32(str_val):
+  try:
+    # Checks equality to prevent int32 overflow
+    return dtypes.int32.as_numpy_dtype(str_val) == dtypes.int64.as_numpy_dtype(
+        str_val)
+  except (ValueError, OverflowError):
+    return False
+
+
+def _is_valid_int64(str_val):
+  try:
+    dtypes.int64.as_numpy_dtype(str_val)
+    return True
+  except (ValueError, OverflowError):
+    return False
+
+
+def _is_valid_float(str_val, float_dtype):
+  try:
+    return float_dtype.as_numpy_dtype(str_val) < np.inf
+  except ValueError:
+    return False
+
+
+def _infer_type(str_val, na_value, prev_type, float_dtype):
+  """Given a string, infers its tensor type.
+
+  Infers the type of a value by picking the least 'permissive' type possible,
+  while still allowing the previous type inference for this column to be valid.
+
+  Args:
+    str_val: String value to infer the type of.
+    na_value: Additional string to recognize as a NA/NaN CSV value.
+    prev_type: Type previously inferred based on values of this column that
+      we've seen up till now.
+    float_dtype: Either `tf.float32` or `tf.float64`. Denotes what float type
+      to parse float strings as.
+  Returns:
+    Inferred dtype.
+  """
+  if str_val in ("", na_value):
+    return prev_type
+
+  if _is_valid_int32(str_val) and prev_type in (None, dtypes.int32):
+    return dtypes.int32
+
+  if _is_valid_int64(str_val) and prev_type in (None, dtypes.int32,
+                                                dtypes.int64):
+    return dtypes.int64
+
+  if _is_valid_float(str_val, float_dtype) and prev_type != dtypes.string:
+    return float_dtype
+
+  return dtypes.string
+
+
+def _next_csv_row(filenames, num_cols, field_delim, use_quote_delim, header,
+                  comment):
+  for fn in filenames:
+    with file_io.FileIO(fn, "r") as f:
+      rdr = csv.reader(
+          f,
+          delimiter=field_delim,
+          quoting=csv.QUOTE_MINIMAL if use_quote_delim else csv.QUOTE_NONE)
+      if header:
+        next(rdr)  # Skip header lines
+
+      for csv_row in rdr:
+        if comment is not None and csv_row[0].startswith(comment):
+          continue  # Skip comment lines
+
+        if len(csv_row) != num_cols:
+          raise ValueError(
+              "Problem inferring types: CSV row has different number of fields "
+              "than expected.")
+        yield csv_row
+
+
+def _infer_column_defaults(filenames, num_cols, field_delim, use_quote_delim,
+                           na_value, header, comment, float_dtype,
+                           rows_for_inference):
+  """Infers column types from the first N valid CSV records of files."""
+  inferred_types = [None] * num_cols
+
+  for rows_read, csv_row in enumerate(
+      _next_csv_row(filenames, num_cols, field_delim, use_quote_delim, header,
+                    comment)):
+    if rows_for_inference is not None and rows_read >= rows_for_inference:
+      break
+    for i, str_val in enumerate(csv_row):
+      inferred_types[i] = _infer_type(str_val, na_value, inferred_types[i],
+                                      float_dtype)
+
+  # Replace None's with a default type
+  inferred_types = [t or dtypes.string for t in inferred_types]
+  # Default to 0 or '' for null values
+  return [
+      constant_op.constant([0 if t is not dtypes.string else ""], dtype=t)
+      for t in inferred_types
+  ]
+
+
+def _infer_column_names(filenames, field_delim, use_quote_delim):
+  """Infers column names from first rows of files."""
+  csv_kwargs = {
+      "delimiter": field_delim,
+      "quoting": csv.QUOTE_MINIMAL if use_quote_delim else csv.QUOTE_NONE
+  }
+  with file_io.FileIO(filenames[0], "r") as f:
+    column_names = next(csv.reader(f, **csv_kwargs))
+
+  for name in filenames[1:]:
+    with file_io.FileIO(name, "r") as f:
+      if next(csv.reader(f, **csv_kwargs)) != column_names:
+        raise ValueError("Files have different column names in the header row.")
+  return column_names
+
+
 def make_csv_dataset(
     file_pattern,
     batch_size,
-    column_keys,
-    column_defaults,
-    label_key=None,
+    column_names=None,
+    column_defaults=None,
+    label_name=None,
     field_delim=",",
     use_quote_delim=True,
-    skip=0,
-    filter_fn=None,
+    na_value="",
+    header=True,
+    comment=None,
     num_epochs=None,
     shuffle=True,
     shuffle_buffer_size=10000,
     shuffle_seed=None,
     prefetch_buffer_size=1,
+    num_parallel_reads=1,
+    num_parallel_parser_calls=2,
+    sloppy=False,
+    default_float_type=dtypes.float32,
+    num_rows_for_inference=100,
 ):
   """Reads CSV files into a dataset.
 
@@ -63,27 +196,36 @@ def make_csv_dataset(
       records. See @{tf.gfile.Glob} for pattern rules.
     batch_size: An int representing the number of consecutive elements of this
       dataset to combine in a single batch.
-    column_keys: A list of strings that corresponds to the CSV columns, in
-      order. One per column of the input record.
-    column_defaults: A list of default values for the CSV fields. One item per
-      column of the input record. Each item in the list is either one of the
-      following dtypes: float32, float64, int32, int64, or string, or a
-      `Tensor` with one of the aforementioned types. One item per column of
-      the input record, with either scalar default value for that column if it
-      is required, or, if the column is required, an empty `Tensor` or a dtype.
-    label_key: A optional string corresponding to the label column. If provided,
-      the data for this column is returned as a separate `Tensor` from the
-      features dictionary, so that the dataset complies with the format expected
-      by a `tf.Estimator.train` or `tf.Estimator.evaluate` input function.
+    column_names: An optional list of strings that corresponds to the CSV
+      columns, in order. One per column of the input record. If this is not
+      provided, infers the column names from the first row of the records.
+      These names will be the keys of the features dict of each dataset element.
+    column_defaults: A optional list of default values for the CSV fields. One
+      item per column of the input record. Each item in the list is either a
+      valid CSV dtype (float32, float64, int32, int64, or string), or a
+      `Tensor` with one of the aforementioned types. The tensor can either be
+      a scalar default value (if the column is optional), or an empty tensor (if
+      the column is required). If a dtype is provided instead of a tensor, the
+      column is also treated as required. If this list is not provided, tries
+      to infer types based on reading the first num_rows_for_inference rows of
+      files specified, and assumes all columns are optional, defaulting to `0`
+      for numeric values and `""` for string values.
+    label_name: A optional string corresponding to the label column. If
+      provided, the data for this column is returned as a separate `Tensor` from
+      the features dictionary, so that the dataset complies with the format
+      expected by a `tf.Estimator.train` or `tf.Estimator.evaluate` input
+      function.
     field_delim: An optional `string`. Defaults to `","`. Char delimiter to
       separate fields in a record.
     use_quote_delim: An optional bool. Defaults to `True`. If false, treats
       double quotation marks as regular characters inside of the string fields.
-    skip: An integer that corresponds to the number of lines to skip at the
-      head of each CSV file. Defaults to 0.
-    filter_fn: A callable function that takes in a CSV string and returns a
-      boolean that corresponds to whether the record should be included. If
-      None, does not filter records.
+    na_value: Additional string to recognize as NA/NaN.
+    header: A bool that indicates whether the first rows of provided CSV files
+      correspond to header lines with column names, and should not be included
+      in the data.
+    comment: An optional character string that marks lines that should not be
+      parsed as csv records. If this is provided, all lines that start with
+      this character will not be parsed.
     num_epochs: An int specifying the number of times this dataset is repeated.
       If None, cycles through the dataset forever.
     shuffle: A bool that indicates whether the input should be shuffled.
@@ -94,63 +236,124 @@ def make_csv_dataset(
     prefetch_buffer_size: An int specifying the number of feature batches to
       prefetch for performance improvement. Recommended value is the number of
       batches consumed per training step.
+    num_parallel_reads: Number of threads used to read CSV records from files.
+      If >1, the results will be interleaved.
+    num_parallel_parser_calls: Number of parallel invocations of the CSV parsing
+      function on CSV records.
+    sloppy: If `True`, reading performance will be improved at
+      the cost of non-deterministic ordering. If `False`, the order of elements
+      produced is deterministic prior to shuffling (elements are still
+      randomized if `shuffle=True`. Note that if the seed is set, then order
+      of elements after shuffling is deterministic). Defaults to `False`.
+    default_float_type: Either `tf.float32` or `tf.float64`. If defaults are
+      not provided, float-like strings are interpreted to be this type.
+    num_rows_for_inference: Number of rows of a file to use for type inference
+      if record_defaults is not provided. If None, reads all the rows of all
+      the files. Defaults to 100.
 
   Returns:
     A dataset, where each element is a (features, labels) tuple that corresponds
     to a batch of `batch_size` CSV rows. The features dictionary maps feature
     column names to `Tensor`s containing the corresponding column data, and
     labels is a `Tensor` containing the column data for the label column
-    specified by `label_key`.
+    specified by `label_name`.
+
+  Raises:
+    ValueError: If any of the arguments is malformed.
   """
+  # Create dataset of all matching filenames
   filenames = _get_file_names(file_pattern, False)
-  column_defaults = [
-      constant_op.constant([], dtype=x) if x in _ACCEPTABLE_CSV_TYPES else x
-      for x in column_defaults
-  ]
-
   dataset = dataset_ops.Dataset.from_tensor_slices(filenames)
-  if label_key is not None:
-    assert label_key in column_keys
+  if shuffle:
+    dataset = dataset.shuffle(len(filenames), shuffle_seed)
+
+  # Clean arguments; figure out column names and defaults
+  if comment is not None and len(comment) != 1:
+    raise ValueError("`comment` arg must be a single-character string or None")
+
+  if column_names is None:
+    if not header:
+      raise ValueError("Cannot infer column names without a header line.")
+    # If column names are not provided, infer from the header lines
+    column_names = _infer_column_names(filenames, field_delim, use_quote_delim)
+  if len(column_names) != len(set(column_names)):
+    raise ValueError("Cannot have duplicate column names.")
+
+  if column_defaults is not None:
+    column_defaults = [
+        constant_op.constant([], dtype=x) if x in _ACCEPTABLE_CSV_TYPES else x
+        for x in column_defaults
+    ]
+  else:
+    # If column defaults are not provided, infer from records at graph
+    # construction time
+    column_defaults = _infer_column_defaults(
+        filenames, len(column_names), field_delim, use_quote_delim, na_value,
+        header, comment, default_float_type, num_rows_for_inference)
+
+  if label_name is not None and label_name not in column_names:
+    raise ValueError("`label_name` provided must be one of the columns.")
+
+  # Define map and filter functions
+  def filter_fn(line):
+    return math_ops.not_equal(string_ops.substr(line, 0, 1), comment)
 
   def filename_to_dataset(filename):
     ds = core_readers.TextLineDataset(filename)
-    if skip > 0:
-      ds = ds.skip(skip)
-    if filter_fn is not None:
+    if header:
+      ds = ds.skip(1)
+    if comment is not None:
       ds = ds.filter(filter_fn)
     return ds
 
   def decode_csv(line):
-    """Decodes csv line into features.
+    """Decodes CSV line into features.
 
     Args:
       line: String tensor corresponding to one csv record.
     Returns:
       A dictionary of feature names to values for that particular record. If
-      label_key is provided, extracts the label feature to be returned as the
+      label_name is provided, extracts the label feature to be returned as the
       second element of the tuple.
     """
     columns = parsing_ops.decode_csv(
         line,
         column_defaults,
         field_delim=field_delim,
-        use_quote_delim=use_quote_delim)
-    features = dict(zip(column_keys, columns))
-    if label_key is not None:
-      label = features.pop(label_key)
+        use_quote_delim=use_quote_delim,
+        na_value=na_value,
+    )
+    features = dict(zip(column_names, columns))
+    if label_name is not None:
+      label = features.pop(label_name)
       return features, label
     return features
 
-  # TODO(rachelim): interleave records from files for better shuffling
-  dataset = dataset.flat_map(filename_to_dataset)
-  # TODO(rachelim): use fused shuffle_and_repeat for perf
-  if shuffle:
+  # Read files sequentially or in parallel
+  dataset = dataset.apply(
+      interleave_ops.parallel_interleave(
+          filename_to_dataset, cycle_length=num_parallel_reads, sloppy=sloppy))
+
+  if num_epochs != 1 and shuffle:
+    # Use shuffle_and_repeat for perf
+    dataset = dataset.apply(
+        shuffle_ops.shuffle_and_repeat(shuffle_buffer_size, num_epochs,
+                                       shuffle_seed))
+  elif shuffle:
     dataset = dataset.shuffle(shuffle_buffer_size, shuffle_seed)
-  if num_epochs != 1:
+  elif num_epochs != 1:
     dataset = dataset.repeat(num_epochs)
 
-  dataset = dataset.batch(batch_size)
-  dataset = dataset.map(decode_csv)
+  # Use map_and_batch for perf
+  # TODO(b/76425672): use num_parallel_calls for better performance tuning when
+  # that is added
+  dataset = dataset.apply(
+      batching.map_and_batch(
+          map_func=decode_csv,
+          batch_size=batch_size,
+          num_parallel_batches=int(
+              ceil(num_parallel_parser_calls / batch_size))))
+
   dataset = dataset.prefetch(prefetch_buffer_size)
   return dataset
 
@@ -246,12 +449,10 @@ def make_batched_features_dataset(file_pattern,
     `Tensor` or `SparseTensor` objects.
   """
   # Create dataset of all matching filenames
+  filenames = _get_file_names(file_pattern, False)
+  dataset = dataset_ops.Dataset.from_tensor_slices(filenames)
   if shuffle:
-    dataset = dataset_ops.Dataset.list_files(file_pattern, shuffle=True)
-  else:
-    # TODO(b/73959787): Use Dataset.list_files() once ordering is deterministic.
-    filenames = _get_file_names(file_pattern, shuffle)
-    dataset = dataset_ops.Dataset.from_tensor_slices(filenames)
+    dataset = dataset.shuffle(len(filenames), shuffle_seed)
 
   # Read `Example` records from files as tensor objects.
   if reader_args is None:
@@ -287,7 +488,7 @@ def make_batched_features_dataset(file_pattern,
       lambda x: parsing_ops.parse_example(x, features),
       num_parallel_calls=parser_num_threads)
 
-  # TODO(rachelim): Add an optional label_key argument for extracting the label
+  # TODO(rachelim): Add an optional label_name argument for extracting the label
   # from the features dictionary, to comply with the type expected by the
   # input_fn to a `tf.Estimator.train` or `tf.Estimator.evaluate` function.
   dataset = dataset.prefetch(prefetch_buffer_size)
diff --git a/tensorflow/contrib/data/python/ops/resampling.py b/tensorflow/contrib/data/python/ops/resampling.py
index 56f526a330bfbea7305b0754bfd114c5e97db506..b465397437adbdfaf865efb8ed2f80e57f48fcab 100644
--- a/tensorflow/contrib/data/python/ops/resampling.py
+++ b/tensorflow/contrib/data/python/ops/resampling.py
@@ -54,7 +54,7 @@ def rejection_resample(class_func, target_dist, initial_dist=None, seed=None):
   def _apply_fn(dataset):
     """Function from `Dataset` to `Dataset` that applies the transformation."""
     dist_estimation_batch_size = 32
-    target_dist_t = ops.convert_to_tensor(target_dist, name="initial_dist")
+    target_dist_t = ops.convert_to_tensor(target_dist, name="target_dist")
     class_values_ds = dataset.map(class_func)
     if initial_dist is not None:
       initial_dist_t = ops.convert_to_tensor(initial_dist, name="initial_dist")
@@ -101,14 +101,16 @@ def rejection_resample(class_func, target_dist, initial_dist=None, seed=None):
                                                    initial_dist_ds))
                           .map(maybe_warn_on_large_rejection))
 
-    current_probabilities_ds = dataset_ops.Dataset.zip(
-        (acceptance_dist_ds, class_values_ds)).map(array_ops.gather)
+    def _gather_and_copy(class_val, acceptance_prob, data):
+      return (class_val, array_ops.gather(acceptance_prob, class_val), data)
+    current_probabilities_and_class_and_data_ds = dataset_ops.Dataset.zip(
+        (class_values_ds, acceptance_dist_ds, dataset)).map(_gather_and_copy)
     filtered_ds = (
-        dataset_ops.Dataset.zip((class_values_ds, current_probabilities_ds,
-                                 dataset))
+        current_probabilities_and_class_and_data_ds
         .filter(lambda _1, p, _2: random_ops.random_uniform([], seed=seed) < p))
     return filtered_ds.map(lambda class_value, _, data: (class_value, data))
 
+
   return _apply_fn
 
 
@@ -151,7 +153,7 @@ def _calculate_acceptance_probs(initial_probs, target_probs):
   ```
 
 
-  A solution for a_i in terms of the other variabes is the following:
+  A solution for a_i in terms of the other variables is the following:
     ```a_i = (t_i / p_i) / max_i[t_i / p_i]```
   """
   # Add tiny to initial_probs to avoid divide by zero.
diff --git a/tensorflow/contrib/decision_trees/proto/BUILD b/tensorflow/contrib/decision_trees/proto/BUILD
index ae3847b8b62452b1afbe472fcb6369181ec60b73..3b50a48336d77ebd9327fa24e5612a95d5d0c372 100644
--- a/tensorflow/contrib/decision_trees/proto/BUILD
+++ b/tensorflow/contrib/decision_trees/proto/BUILD
@@ -13,14 +13,6 @@ load(
     "tf_pyclif_proto_library",
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tf_proto_library(
     name = "generic_tree_model",
     srcs = ["generic_tree_model.proto"],
diff --git a/tensorflow/contrib/deprecated/BUILD b/tensorflow/contrib/deprecated/BUILD
index 3dfbbf55273848afb8ad74ad444f0d85b45610bd..401527f1e74f7725d02a3b92a2c661d8ffc11e21 100644
--- a/tensorflow/contrib/deprecated/BUILD
+++ b/tensorflow/contrib/deprecated/BUILD
@@ -30,15 +30,3 @@ py_test(
         "//tensorflow/python:logging_ops",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/distribute/BUILD b/tensorflow/contrib/distribute/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..74b2cd90a187159fd2da8ce236c14e813cc43c49
--- /dev/null
+++ b/tensorflow/contrib/distribute/BUILD
@@ -0,0 +1,36 @@
+# Implementation of a prototype TF distributed computation library.
+
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+py_library(
+    name = "distribute",
+    srcs = ["__init__.py"],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/contrib/distribute/python:cross_tower_ops",
+        "//tensorflow/contrib/distribute/python:mirrored_strategy",
+        "//tensorflow/contrib/distribute/python:monitor",
+        "//tensorflow/contrib/distribute/python:one_device_strategy",
+        "//tensorflow/contrib/distribute/python:step_fn",
+        "//tensorflow/python:training",
+        "//tensorflow/python:util",
+    ],
+)
diff --git a/tensorflow/contrib/distribute/README.md b/tensorflow/contrib/distribute/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..28483f4c88504b1fb90f2afc927442018648fdca
--- /dev/null
+++ b/tensorflow/contrib/distribute/README.md
@@ -0,0 +1,144 @@
+# Distribution Strategy
+
+> *NOTE*: This is a experimental feature. The API and performance
+> characteristics are subject to change.
+
+## Overview
+
+[`DistributionStrategy`](https://www.tensorflow.org/versions/master/api_docs/python/tf/contrib/distribute/DistributionStrategy)
+API is an easy way to distribute your training
+across multiple devices/machines. Our goal is to allow users to use existing
+models and training code with minimal changes to enable distributed training.
+Moreover, we've design the API in such a way that it works with both eager and
+graph execution.
+
+Currently we support one type of strategy, called
+[`MirroredStrategy`](https://www.tensorflow.org/versions/master/api_docs/python/tf/contrib/distribute/MirroredStrategy).
+It does in-graph replication with synchronous training
+on many GPUs on one machine. Essentially, we create copies of all variables in
+the model's layers on each device. We then use all-reduce to combine gradients
+across the devices before applying them to the variables to keep them in sync.
+In the future, we intend to support other kinds of training configurations such
+as multi-node, synchronous,
+[asynchronous](https://www.tensorflow.org/deploy/distributed#putting_it_all_together_example_trainer_program),
+parameter servers and model parallelism.
+
+## Example
+
+Let's demonstrate how to use this API with a simple example. We will use the
+[`Estimator`](https://www.tensorflow.org/api_docs/python/tf/estimator/Estimator)
+approach, and show you how to scale your model to run on multiple GPUs on one
+machine using `MirroredStrategy`.
+
+Let's consider a very simple model function which tries to learn a simple
+function.
+
+```python
+def model_fn(features, labels, mode):
+  layer = tf.layers.Dense(1)
+  logits = layer(features)
+
+  if mode == tf.estimator.ModeKeys.PREDICT:
+    predictions = {"logits": logits}
+    return tf.estimator.EstimatorSpec(mode, predictions=predictions)
+
+  loss = tf.losses.mean_squared_error(
+      labels=labels, predictions=tf.reshape(logits, []))
+
+  if mode == tf.estimator.ModeKeys.EVAL:
+    return tf.estimator.EstimatorSpec(mode, loss=loss)
+
+  if mode == tf.estimator.ModeKeys.TRAIN:
+    train_op = tf.train.GradientDescentOptimizer(0.2).minimize(loss_fn())
+    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
+```
+
+Let's also define a simple input function to feed data for training this model.
+Note that we require using
+[`tf.data.Dataset`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset)
+with `DistributionStrategy`.
+
+
+```python
+def input_fn():
+  features = tf.data.Dataset.from_tensors([[1.]]).repeat(100)
+  labels = tf.data.Dataset.from_tensors(1.).repeat(100)
+  return dataset_ops.Dataset.zip((features, labels))
+```
+
+Now that we have a model function and input function defined, we can define the
+estimator. To use `MirroredStrategy`, all we need to do is:
+
+* Create an instance of the `MirroredStrategy` class.
+* Pass it to the
+[`RunConfig`](https://www.tensorflow.org/api_docs/python/tf/estimator/RunConfig)
+parameter of `Estimator`.
+
+
+```python
+distribution = tf.contrib.distribute.MirroredStrategy()
+config = tf.estimator.RunConfig(train_distribute=distribution)
+classifier = tf.estimator.Estimator(model_fn=model_fn, config=config)
+classifier.train(input_fn=input_fn)
+```
+
+That's it! This change will now configure estimator to run on all GPUs on your
+machine, with the `MirroredStrategy` approach. It will take care of distributing
+the input dataset, replicating layers and variables on each device, and
+combining and applying gradients.
+
+The model and input functions do not have to change because we have changed the
+underlying components of TensorFlow (such as
+optimizer, batch norm and summaries) to become distribution-aware.
+That means those components know how to
+combine their state across devices. Further, saving and checkpointing works
+seamlessly, so you can save with one or no distribution strategy and resume with
+another.
+
+Above, we showed the easiest way to use [`MirroredStrategy`](https://www.tensorflow.org/versions/master/api_docs/python/tf/contrib/distribute/MirroredStrategy#__init__).
+There are few things you can customize in practice:
+
+* You can specify a list of specific GPUs (using param `devices`) or the number
+of GPUs (using param `num_gpus`), in case you don't want auto detection.
+* You can specify various parameters for all reduce with the `cross_tower_ops`
+param, such as the all reduce algorithm to use, and gradient repacking.
+
+## Performance Tips
+
+We've tried to make it such that you get the best performance for your existing
+model. We also recommend you follow the tips from
+[Input Pipeline Performance Guide](https://www.tensorflow.org/performance/datasets_performance).
+Specifically, we found using [`map_and_batch`](https://www.tensorflow.org/performance/datasets_performance#map_and_batch)
+and [`dataset.prefetch`](https://www.tensorflow.org/performance/datasets_performance#pipelining)
+in the input function gives a solid boost in performance. When using
+`dataset.prefetch`, use `buffer_size=None` to let it detect optimal buffer size.
+
+## Caveats
+This feature is in early stages and there are a lot of improvements forthcoming:
+
+* Metrics are not yet supported during distributed training.
+* Summaries are currently computed in every tower.
+* Evaluation is not yet distributed.
+* Eager support is in the works; performance can be more challenging with eager
+execution.
+* As mentioned earlier, multi-node and other distributed strategies will be
+introduced in the future.
+* If you are [`batching`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#batch)
+your input data, we will place one batch on each GPU in each step. So your
+effective batch size will be `num_gpus * batch_size`. Therefore, consider
+adjusting your learning rate or batch size according to the number of GPUs.
+We are working on addressing this limitation by splitting each batch across GPUs
+instead.
+* Dictionaries inside dataset in the input are not supported when prefetching
+on GPUs is turned on. (If you need to use dictionaries in the dataset, turn off
+prefetching on GPUs by passing param `prefetch_on_device=False` to
+`MirroredStrategy`)
+* PartitionedVariables are not supported yet.
+
+## What's next?
+
+Please give distribution strategies a try. This feature is in early stages and
+is evolving, so we welcome your feedback via
+[issues on GitHub](https://github.com/tensorflow/tensorflow/issues/new).
+
+
diff --git a/tensorflow/contrib/distribute/__init__.py b/tensorflow/contrib/distribute/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..76711baf3a11c8978fbb5770ec173ff74a153158
--- /dev/null
+++ b/tensorflow/contrib/distribute/__init__.py
@@ -0,0 +1,52 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Prototype of a distributed computation library for TF."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=unused-import,wildcard-import
+from tensorflow.contrib.distribute.python.cross_tower_ops import *
+from tensorflow.contrib.distribute.python.mirrored_strategy import MirroredStrategy
+from tensorflow.contrib.distribute.python.monitor import Monitor
+from tensorflow.contrib.distribute.python.one_device_strategy import OneDeviceStrategy
+from tensorflow.contrib.distribute.python.step_fn import *
+from tensorflow.python.training.distribute import *
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+
+_allowed_symbols = [
+    'AllReduceCrossTowerOps',
+    'CrossTowerOps',
+    'DistributionStrategy',
+    'MirroredStrategy',
+    'Monitor',
+    'OneDeviceStrategy',
+    'ReductionToOneDeviceCrossTowerOps',
+    'Step',
+    'StandardInputStep',
+    'StandardSingleLossStep',
+    'TowerContext',
+    'get_cross_tower_context',
+    'get_distribution_strategy',
+    'get_loss_reduction',
+    'get_tower_context',
+    'has_distribution_strategy',
+    'require_tower_context',
+]
+
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..78b2b0054aa95701ad192b4fb9a0727ce287de4b
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/BUILD
@@ -0,0 +1,444 @@
+# Implementation of a prototype TF distributed computation library.
+
+package(
+    default_visibility = [
+        "//tensorflow:internal",
+    ],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+
+# TODO(priyag): Figure out testonly issues that are preventing us from
+# including our tests in pip for now.
+
+py_library(
+    name = "values",
+    srcs = ["values.py"],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":prefetching_ops_v2",
+        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/contrib/eager/python:datasets",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:checkpointable",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:util",
+        "//tensorflow/python/eager:context",
+        "@six_archive//:six",
+    ],
+)
+
+cuda_py_test(
+    name = "values_test",
+    srcs = ["values_test.py"],
+    additional_deps = [
+        ":mirrored_strategy",
+        ":values",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python/estimator:model_fn",
+    ],
+)
+
+py_library(
+    name = "mirrored_strategy",
+    srcs = ["mirrored_strategy.py"],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":cross_tower_ops",
+        ":shared_variable_creator",
+        ":values",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:device",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:pywrap_tensorflow",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:tape",
+        "@six_archive//:six",
+    ],
+)
+
+py_library(
+    name = "one_device_strategy",
+    srcs = ["one_device_strategy.py"],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":values",
+        "//tensorflow/contrib/eager/python:datasets",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python/eager:context",
+        "@six_archive//:six",
+    ],
+)
+
+py_library(
+    name = "strategy_test_lib",
+    testonly = 1,
+    srcs = ["strategy_test_lib.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_pip",
+    ],
+    deps = [
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:layers",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:backprop",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:test",
+    ],
+)
+
+py_library(
+    name = "combinations",
+    testonly = 1,
+    srcs = ["combinations.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_pip",
+    ],
+    deps = [
+        ":mirrored_strategy",
+        ":one_device_strategy",
+        "//tensorflow/contrib/optimizer_v2:training",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:util",
+        "//tensorflow/python/eager:context",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "combinations_test",
+    srcs = ["combinations_test.py"],
+    tags = [
+        "no_pip",
+    ],
+    deps = [
+        ":combinations",
+        "//tensorflow/python/eager:test",
+    ],
+)
+
+py_test(
+    name = "mirrored_strategy_test",
+    srcs = ["mirrored_strategy_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_pip",
+    ],
+    deps = [
+        ":mirrored_strategy",
+        ":strategy_test_lib",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:test",
+    ],
+)
+
+py_test(
+    name = "one_device_strategy_test",
+    srcs = ["one_device_strategy_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_pip",
+    ],
+    deps = [
+        ":one_device_strategy",
+        ":strategy_test_lib",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python/eager:test",
+    ],
+)
+
+cuda_py_test(
+    name = "mirrored_strategy_multigpu_test",
+    srcs = ["mirrored_strategy_multigpu_test.py"],
+    additional_deps = [
+        ":mirrored_strategy",
+        ":values",
+        ":strategy_test_lib",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:layers",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:test",
+    ],
+    tags = [
+        "guitar",
+        "no_pip",
+        "multi_and_single_gpu",
+        # Do not perform the extra analysis on this test, because it is already
+        # performed for the `:mirrored_strategy_test` target.
+        "no_oss",
+        "noasan",
+        "notap",
+        "notsan",
+    ],
+)
+
+py_library(
+    name = "step_fn",
+    srcs = ["step_fn.py"],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/python:training",
+        "//tensorflow/python/eager:backprop",
+    ],
+)
+
+cuda_py_test(
+    name = "minimize_loss_test",
+    srcs = ["minimize_loss_test.py"],
+    additional_deps = [
+        ":combinations",
+        ":single_loss_example",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python/ops/losses",
+    ],
+    tags = [
+        "multi_and_single_gpu",
+        "no_pip",
+    ],
+)
+
+cuda_py_test(
+    name = "optimizer_v2_test",
+    srcs = ["optimizer_v2_test.py"],
+    additional_deps = [
+        ":combinations",
+        ":single_loss_example",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:test",
+    ],
+    tags = [
+        "multi_and_single_gpu",
+        "no_pip",
+    ],
+)
+
+cuda_py_test(
+    name = "estimator_integration_test",
+    srcs = ["estimator_integration_test.py"],
+    additional_deps = [
+        ":combinations",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/contrib/optimizer_v2:training",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python/estimator:dnn_linear_combined",
+        "//tensorflow/python/estimator:export_export",
+        "//tensorflow/python/estimator:numpy_io",
+        "//tensorflow/python/estimator:prediction_keys",
+        "//tensorflow/python/estimator:run_config",
+        "//tensorflow/python/feature_column",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:summary",
+    ],
+    tags = [
+        "multi_and_single_gpu",
+        "no_pip",
+    ],
+)
+
+py_library(
+    name = "single_loss_example",
+    srcs = ["single_loss_example.py"],
+    deps = [
+        ":step_fn",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:layers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+cuda_py_test(
+    name = "step_fn_test",
+    srcs = ["step_fn_test.py"],
+    additional_deps = [
+        ":single_loss_example",
+        ":combinations",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:test",
+    ],
+    tags = [
+        "multi_and_single_gpu",
+        "no_pip",
+    ],
+)
+
+py_library(
+    name = "monitor",
+    srcs = ["monitor.py"],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
+cuda_py_test(
+    name = "monitor_test",
+    srcs = ["monitor_test.py"],
+    additional_deps = [
+        ":combinations",
+        ":monitor",
+        ":one_device_strategy",
+        ":single_loss_example",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:training",
+    ],
+    tags = [
+        "multi_and_single_gpu",
+        "no_pip",
+    ],
+)
+
+py_library(
+    name = "shared_variable_creator",
+    srcs = ["shared_variable_creator.py"],
+    visibility = ["//tensorflow:internal"],
+)
+
+py_test(
+    name = "shared_variable_creator_test",
+    srcs = ["shared_variable_creator_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":shared_variable_creator",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/eager:test",
+    ],
+)
+
+py_library(
+    name = "cross_tower_utils",
+    srcs = ["cross_tower_utils.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/nccl:nccl_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+    ],
+)
+
+py_library(
+    name = "cross_tower_ops",
+    srcs = ["cross_tower_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":cross_tower_utils",
+        ":values",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:device_lib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:training",
+        "//tensorflow/python/eager:context",
+        "@six_archive//:six",
+    ],
+)
+
+py_test(
+    name = "cross_tower_ops_test",
+    srcs = ["cross_tower_ops_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_pip",
+    ],
+    deps = [
+        ":combinations",
+        ":cross_tower_ops",
+        ":values",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:test",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_library(
+    name = "prefetching_ops_v2",
+    srcs = ["prefetching_ops_v2.py"],
+    deps = [
+        "//tensorflow/contrib/data/python/ops:contrib_op_loader",
+        "//tensorflow/contrib/data/python/ops:prefetching_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+    ],
+)
+
+cuda_py_test(
+    name = "prefetching_ops_v2_test",
+    srcs = ["prefetching_ops_v2_test.py"],
+    additional_deps = [
+        ":prefetching_ops_v2",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
+    ],
+)
diff --git a/tensorflow/contrib/distribute/python/combinations.py b/tensorflow/contrib/distribute/python/combinations.py
new file mode 100644
index 0000000000000000000000000000000000000000..02b1e7ef9fcd4767c59898bd343e712e285e67d5
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/combinations.py
@@ -0,0 +1,297 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Facilities for creating multiple test combinations.
+
+Here is an example of testing various optimizers in Eager and Graph mode:
+
+class AdditionExample(test.TestCase, parameterized.TestCase):
+  @combinations.generate(
+     combinations.combine(mode=["graph", "eager"],
+                          optimizer=[AdamOptimizer(),
+                                     GradientDescentOptimizer()]))
+  def testOptimizer(self, optimizer):
+    ... f(optimizer)...
+
+This will run `testOptimizer` 4 times with the specified optimizers: 2 in
+Eager and 2 in Graph mode.
+The test will be provided with arguments that match the arguments of combine
+by name.  It is necessary to request all arguments, except for `mode`, which is
+optional.
+
+`combine()` function is available for creating a cross product of various
+options.  `times()` function exists for creating a product of N `combine()`-ed
+results.  See below.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from collections import OrderedDict
+import sys
+from absl.testing import parameterized
+
+from tensorflow.contrib.distribute.python import mirrored_strategy
+from tensorflow.contrib.distribute.python import one_device_strategy
+from tensorflow.contrib.optimizer_v2 import adam as adam_v2
+from tensorflow.contrib.optimizer_v2 import gradient_descent as gradient_descent_v2
+from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
+from tensorflow.python.training import adam
+from tensorflow.python.training import gradient_descent
+from tensorflow.python.util import tf_inspect
+
+
+GPU_TEST = "test_gpu" in sys.argv[0]
+
+
+def generate(combinations):
+  """A decorator for generating test cases of a test method or a test class.
+
+  Args:
+    combinations: a list of dictionaries created using combine() and times().
+
+  Restrictions:
+   -- there should always be a "mode" argument.  Accepted values are "eager"
+      and "graph".
+   -- arguments of the test method must match by name to get the corresponding
+      value of the combination.  Tests must accept all arguments (except "mode",
+      which is optional).
+   -- distribution argument is special.  It is meant for passing instances of
+      DistributionStrategy.  Each instance is to be passed as `(<int>,
+      <DistributionStrategy>)` tuple, where <int> is the number of required
+      GPUs.  If the required number of GPUs for the DistributionStrategy isn't
+      available then the test case is going to be skipped.
+
+  Returns:
+    a decorator that will cause the test method to be run under the specified
+    conditions.
+
+  Raises:
+    ValueError - if "mode" argument wasn't either "eager" or "graph.
+  """
+
+  def decorator(test_function):
+    """The decorator to be returned."""
+
+    # Generate good test names that can be used with --test_filter.
+    for combination in combinations:
+      # We use OrderedDicts in `combine()` and `times()` to ensure stable
+      # order of keys in each dictionary.
+      assert isinstance(combination, OrderedDict)
+      name = "".join([
+          "_{}_{}".format(
+              "".join(filter(str.isalnum, key)),
+              "".join(filter(str.isalnum, str(value))))
+          for key, value in combination.items()
+      ])
+      combination.update({"testcase_name": "_test{}".format(name)})
+
+    @parameterized.named_parameters(*combinations)
+    def decorated(self, **kwargs):
+      """A wrapped test method that sets up `test_function`."""
+      assert "mode" in kwargs
+      mode = kwargs["mode"]
+
+      if "distribution" in kwargs:
+        distribution = kwargs["distribution"]
+        kwargs["distribution"] = distribution.strategy
+        if not distribution.required_gpus:
+          if GPU_TEST:
+            self.skipTest("Test that doesn't require GPUs.")
+        elif context.num_gpus() < distribution.required_gpus:
+          self.skipTest(
+              "{} GPUs are not available for this test. {} GPUs are available".
+              format(distribution.required_gpus, context.num_gpus()))
+
+      requested_arguments = tf_inspect.getfullargspec(test_function).args
+      missing_arguments = set(list(kwargs.keys()) + ["self"]).difference(
+          set(requested_arguments + ["mode"]))
+      if missing_arguments:
+        raise ValueError("The test is missing arguments {} .".format(
+            missing_arguments))
+
+      kwargs_to_pass = {}
+      for arg in requested_arguments:
+        if arg == "self":
+          kwargs_to_pass[arg] = self
+        else:
+          kwargs_to_pass[arg] = kwargs[arg]
+
+      if mode == "eager":
+        with context.eager_mode(), ops.Graph().as_default():
+          test_function(**kwargs_to_pass)
+      elif mode == "graph":
+        with context.graph_mode(), ops.Graph().as_default():
+          test_function(**kwargs_to_pass)
+      else:
+        raise ValueError(
+            "'mode' has to be either 'eager' or 'graph' and not {}".format(
+                mode))
+
+    return decorated
+  return decorator
+
+
+def combine(**kwargs):
+  """Generate combinations based on its keyword arguments.
+
+  Two sets of returned combinations can be concatenated using +.  Their product
+  can be computed using `times()`.
+
+  Args:
+    **kwargs: keyword arguments of form `option=[possibilities, ...]`.
+
+  Returns:
+    a list of dictionaries for each combination. Keys in the dictionaries are
+    the keyword argument names.  Each key has one value - one of the
+    corresponding keyword argument values.
+  """
+  if not kwargs:
+    return [OrderedDict()]
+
+  sort_by_key = lambda k: k[0][0]
+  kwargs = OrderedDict(sorted(kwargs.items(), key=sort_by_key))
+  first = list(kwargs.items())[0]
+
+  rest = dict(list(kwargs.items())[1:])
+  rest_combined = combine(**rest)
+
+  key = first[0]
+  values = first[1]
+
+  return [
+      OrderedDict(sorted(list(combined.items()) + [(key, v)], key=sort_by_key))
+      for v in values
+      for combined in rest_combined
+  ]
+
+
+def times(*combined):
+  """Generate a product of N sets of combinations.
+
+  times(combine(a=[1,2]), combine(b=[3,4])) == combine(a=[1,2], b=[3,4])
+
+  Args:
+    *combined: N lists of dictionaries that specify combinations.
+
+  Returns:
+    a list of dictionaries for each combination.
+
+  Raises:
+    ValueError: if some of the inputs have overlapping keys.
+  """
+  assert combined
+
+  if len(combined) == 1:
+    return combined[0]
+
+  first = combined[0]
+  rest_combined = times(*combined[1:])
+
+  combined_results = []
+  for a in first:
+    for b in rest_combined:
+      if set(a.keys()).intersection(set(b.keys())):
+        raise ValueError("Keys need to not overlap: {} vs {}".format(
+            a.keys(), b.keys()))
+
+      combined_results.append(OrderedDict(list(a.items()) + list(b.items())))
+  return combined_results
+
+
+class NamedObject(object):
+  """A class that translates an object into a good test name."""
+
+  def __init__(self, name, obj):
+    self._name = name
+    self._obj = obj
+
+  def __getattr__(self, name):
+    return getattr(self._obj, name)
+
+  def __call__(self, *args, **kwargs):
+    return self._obj(*args, **kwargs)
+
+  def __repr__(self):
+    return self._name
+
+
+class NamedDistribution(object):
+  """Translates DistributionStrategy and its data into a good name."""
+
+  def __init__(self, name, distribution, required_gpus):
+    self._distribution = distribution
+    self._name = name
+    self._required_gpus = required_gpus
+
+  def __repr__(self):
+    return self._name
+
+  @property
+  def strategy(self):
+    return self._distribution
+
+  @property
+  def required_gpus(self):
+    return self._required_gpus
+
+
+one_device_strategy = NamedDistribution(
+    "OneDeviceCPU", one_device_strategy.OneDeviceStrategy("/cpu:0"),
+    None)
+mirrored_strategy_with_gpu_and_cpu = NamedDistribution(
+    "MirroredCPUAndGPU",
+    mirrored_strategy.MirroredStrategy(["/gpu:0", "/cpu:0"]), 1)
+mirrored_strategy_without_prefetch = NamedDistribution(
+    "MirroredCPUAndGPUNoPrefetch",
+    mirrored_strategy.MirroredStrategy(
+        ["/gpu:0", "/cpu:0"], prefetch_on_device=False), 1)
+mirrored_strategy_with_two_gpus = NamedDistribution(
+    "Mirrored2GPUs",
+    mirrored_strategy.MirroredStrategy(["/gpu:0", "/gpu:1"]), 2)
+
+adam_optimizer_v1_fn = NamedObject(
+    "AdamV1", lambda: adam.AdamOptimizer(0.2, epsilon=1))
+gradient_descent_optimizer_v1_fn = NamedObject(
+    "GradientDescentV1", lambda: gradient_descent.GradientDescentOptimizer(0.2))
+
+adam_optimizer_v2_fn = NamedObject(
+    "AdamV2", lambda: adam_v2.AdamOptimizer(0.2, epsilon=1))
+gradient_descent_optimizer_v2_fn = NamedObject(
+    "GradientDescentV2",
+    lambda: gradient_descent_v2.GradientDescentOptimizer(0.2))
+
+graph_and_eager_modes = ["graph", "eager"]
+
+
+def distributions_and_v1_optimizers():
+  """A common set of combination with DistributionStrategies and Optimizers."""
+  return combine(
+      distribution=[
+          one_device_strategy, mirrored_strategy_with_gpu_and_cpu,
+          mirrored_strategy_with_two_gpus
+      ],
+      optimizer_fn=[adam_optimizer_v1_fn, gradient_descent_optimizer_v1_fn])
+
+
+def distributions_and_v2_optimizers():
+  """DistributionStrategies and V2 Optimizers."""
+  return combine(
+      distribution=[
+          one_device_strategy, mirrored_strategy_with_gpu_and_cpu,
+          mirrored_strategy_with_two_gpus
+      ],
+      optimizer_fn=[adam_optimizer_v2_fn, gradient_descent_optimizer_v2_fn])
diff --git a/tensorflow/contrib/distribute/python/combinations_test.py b/tensorflow/contrib/distribute/python/combinations_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..219b24160f3902fcfa5363cc39a8fc5b30d00308
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/combinations_test.py
@@ -0,0 +1,115 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for some testing utils from strategy_test_lib."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from collections import OrderedDict
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.python.eager import test
+
+
+class TestingCombinationsTest(test.TestCase):
+
+  def test_combine(self):
+    self.assertEqual([{
+        "a": 1,
+        "b": 2
+    }, {
+        "a": 1,
+        "b": 3
+    }, {
+        "a": 2,
+        "b": 2
+    }, {
+        "a": 2,
+        "b": 3
+    }], combinations.combine(a=[1, 2], b=[2, 3]))
+
+  def test_add(self):
+    self.assertEqual(
+        [{
+            "a": 1
+        }, {
+            "a": 2
+        }, {
+            "b": 2
+        }, {
+            "b": 3
+        }],
+        combinations.combine(a=[1, 2]) +
+        combinations.combine(b=[2, 3]))
+
+  def test_times(self):
+    c1 = combinations.combine(mode=["graph"], loss=["callable", "tensor"])
+    c2 = combinations.combine(mode=["eager"], loss=["callable"])
+    c3 = combinations.combine(distribution=["d1", "d2"])
+    c4 = combinations.times(c3, c1 + c2)
+    self.assertEqual([
+        OrderedDict([("distribution", "d1"), ("loss", "callable"),
+                     ("mode", "graph")]),
+        OrderedDict([("distribution", "d1"), ("loss", "tensor"),
+                     ("mode", "graph")]),
+        OrderedDict([("distribution", "d1"), ("loss", "callable"),
+                     ("mode", "eager")]),
+        OrderedDict([("distribution", "d2"), ("loss", "callable"),
+                     ("mode", "graph")]),
+        OrderedDict([("distribution", "d2"), ("loss", "tensor"),
+                     ("mode", "graph")]),
+        OrderedDict([("distribution", "d2"), ("loss", "callable"),
+                     ("mode", "eager")])
+    ], c4)
+
+  def test_times_variable_arguments(self):
+    c1 = combinations.combine(mode=["graph", "eager"])
+    c2 = combinations.combine(optimizer=["adam", "gd"])
+    c3 = combinations.combine(distribution=["d1", "d2"])
+    c4 = combinations.times(c3, c1, c2)
+    self.assertEqual([
+        OrderedDict([("distribution", "d1"), ("mode", "graph"),
+                     ("optimizer", "adam")]),
+        OrderedDict([("distribution", "d1"), ("mode", "graph"),
+                     ("optimizer", "gd")]),
+        OrderedDict([("distribution", "d1"), ("mode", "eager"),
+                     ("optimizer", "adam")]),
+        OrderedDict([("distribution", "d1"), ("mode", "eager"),
+                     ("optimizer", "gd")]),
+        OrderedDict([("distribution", "d2"), ("mode", "graph"),
+                     ("optimizer", "adam")]),
+        OrderedDict([("distribution", "d2"), ("mode", "graph"),
+                     ("optimizer", "gd")]),
+        OrderedDict([("distribution", "d2"), ("mode", "eager"),
+                     ("optimizer", "adam")]),
+        OrderedDict([("distribution", "d2"), ("mode", "eager"),
+                     ("optimizer", "gd")])
+    ], c4)
+    self.assertEqual(
+        combinations.combine(
+            mode=["graph", "eager"],
+            optimizer=["adam", "gd"],
+            distribution=["d1", "d2"]), c4)
+
+  def test_overlapping_keys(self):
+    c1 = combinations.combine(mode=["graph"], loss=["callable", "tensor"])
+    c2 = combinations.combine(mode=["eager"], loss=["callable"])
+    with self.assertRaisesRegexp(ValueError, ".*Keys.+overlap.+"):
+      _ = combinations.times(c1, c2)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/cross_tower_ops.py b/tensorflow/contrib/distribute/python/cross_tower_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..bbe5e877d59518056db3fea251cdae0ed854d0e4
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/cross_tower_ops.py
@@ -0,0 +1,585 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Classes for different algorithms of reduction and broadcasting."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+
+from tensorflow.contrib.distribute.python import cross_tower_utils
+from tensorflow.contrib.distribute.python import values as value_lib
+from tensorflow.python.client import device_lib
+from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import device_util
+
+
+def _validate_destinations(destinations):
+  if not isinstance(destinations,
+                    (value_lib.DistributedValues, six.string_types, list)):
+    raise ValueError("destinations must be one of a `DistributedValues` object,"
+                     " a device string, a list of device strings or None")
+
+  if not destinations:
+    raise ValueError("destinations can not be empty")
+
+
+def _validate_value_destination_pairs(value_destination_pairs):
+  # pylint: disable=g-missing-docstring
+  if not value_destination_pairs: return False
+  if not isinstance(value_destination_pairs, (list, tuple)): return False
+  if not all([isinstance(pair, tuple) for pair in value_destination_pairs]):
+    return False
+  if not all([isinstance(v[0], value_lib.PerDevice)
+              for v in value_destination_pairs]):
+    return False
+  return True
+
+
+def _get_devices_from(destinations):
+  if isinstance(destinations, value_lib.DistributedValues):
+    return list(destinations.devices)
+  elif isinstance(destinations, six.string_types):
+    return [device_util.canonicalize(destinations)]
+  else:
+    return [
+        device_util.canonicalize(destination) for destination in destinations
+    ]
+
+
+def _devices_match(left, right):
+  return set(_get_devices_from(left)) == set(_get_devices_from(right))
+
+
+def _all_devices_match(value_destination_pairs):
+  if not all([d is None or _devices_match(v, d)
+              for v, d in value_destination_pairs]):
+    return False
+  if not all([_devices_match(v, value_destination_pairs[0][0])
+              for v, _ in value_destination_pairs[1:]]):
+    return False
+  return True
+
+
+def _simple_broadcast(tensor, destinations):
+  index = {}
+  devices = _get_devices_from(destinations)
+  for d in devices:
+    with ops.device(d):
+      index[d] = array_ops.identity(tensor)
+  return value_lib.Mirrored(index)
+
+
+def _simple_reduce(per_device_value, reduce_to_device, accumulation_fn,
+                   method_string):
+  # pylint: disable=g-missing-docstring
+  all_values = []
+  count = 0
+  for v in per_device_value._index.values():  # pylint: disable=protected-access
+    if isinstance(v, value_lib.MapOutput):
+      v_list = v.get()
+      if not v_list:
+        continue
+      count += len(v_list)
+      # Sum within each device before aggregating across devices.
+      v = math_ops.add_n(v_list)
+    else:
+      count += 1
+    all_values.append(v)
+  if not all_values:
+    raise ValueError("`per_device_value` must be non-empty")
+
+  with ops.device(reduce_to_device):
+    with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
+      if method_string == "sum":
+        reduced = accumulation_fn(all_values)
+      elif method_string == "mean":
+        reduced = accumulation_fn(all_values) / count
+      else:
+        raise ValueError("`method_string` must be 'sum' or 'mean'")
+  return reduced
+
+
+class CrossTowerOps(object):
+  """Base class for cross-tower reduction and broadcasting algorithms."""
+
+  def __init__(self):
+    pass
+
+  def reduce(self, method_string, per_device_value, destinations=None):
+    """Reduce `per_device_value` to `destinations`.
+
+    It runs the reduction operation defined by `method_string` and put the
+    result on `destinations`.
+
+    Args:
+      method_string: either 'sum' or 'mean' specifying the reduction method.
+      per_device_value: a PerDevice object.
+      destinations: the reduction destinations.
+
+    Returns:
+      a Mirrored object.
+
+    Raises:
+      ValueError: if per_device_value is not a PerDevice object.
+    """
+    if not isinstance(per_device_value, value_lib.PerDevice):
+      raise ValueError("`per_device_value` must be a `PerDevice` object.")
+    if destinations is not None:
+      _validate_destinations(destinations)
+    return self._reduce(method_string, per_device_value, destinations)
+
+  def batch_reduce(self, method_string, value_destination_pairs):
+    """Reduce PerDevice objects in a batch.
+
+    Reduce each first element in `value_destination_pairs` to each second
+    element which indicates the destinations.
+
+    Args:
+      method_string: either 'sum' or 'mean' specifying the reduction method.
+      value_destination_pairs: a list or a tuple of tuples of PerDevice objects
+        and destinations. If a destination is None, then the destinations
+        are set to match the devices of the input PerDevice object.
+
+    Returns:
+      a list of Mirrored objects.
+
+    Raises:
+      ValueError: if `value_destination_pairs` is not a list or a tuple of
+        tuples of PerDevice objects and destinations
+    """
+    if not _validate_value_destination_pairs(value_destination_pairs):
+      raise ValueError("`value_destination_pairs` must be a list or a tuple of "
+                       "tuples of PerDevice objects and destinations")
+    for _, d in value_destination_pairs:
+      if d is not None:
+        _validate_destinations(d)
+
+    return self._batch_reduce(method_string, value_destination_pairs)
+
+  def broadcast(self, tensor, destinations):
+    """Broadcast the `tensor` to destinations.
+
+    Args:
+      tensor: the tensor to broadcast.
+      destinations: the broadcast destinations.
+
+    Returns:
+      a Mirrored object.
+    """
+    _validate_destinations(destinations)
+    return self._broadcast(tensor, destinations)
+
+  def _reduce(self, method_string, per_device_value, destinations):
+    raise NotImplementedError(
+        "_reduce method must be implemented in descendants.")
+
+  def _batch_reduce(self, method_string, value_destination_pairs):
+    raise NotImplementedError(
+        "_batch_reduce method must be implemented in descendants.")
+
+  def _broadcast(self, tensor, destinations):
+    return _simple_broadcast(tensor, destinations)
+
+
+class ReductionToOneDeviceCrossTowerOps(CrossTowerOps):
+  """Always do reduction to one device first and then do broadcasting.
+
+    Batch reduction is done by reduction on each element one by one.
+  """
+
+  def __init__(self, reduce_to_device=None, accumulation_fn=math_ops.add_n):
+    """Constructor.
+
+    Args:
+      reduce_to_device: the intermediate device to reduce to. If None, reduce
+        to the first device in `destinations` of the reduce() method.
+      accumulation_fn: a function that does accumulation.
+    """
+    self.reduce_to_device = reduce_to_device
+    self.accumulation_fn = accumulation_fn
+    super(ReductionToOneDeviceCrossTowerOps, self).__init__()
+
+  def _reduce(self, method_string, per_device_value, destinations):
+    devices = _get_devices_from(destinations or per_device_value)
+    reduce_to_device = self.reduce_to_device or devices[0]
+    reduced = _simple_reduce(per_device_value, reduce_to_device,
+                             self.accumulation_fn, method_string)
+    return self.broadcast(reduced, devices)
+
+  def _batch_reduce(self, method_string, value_destination_pairs):
+    return [self._reduce(method_string, t, destinations=v)
+            for t, v in value_destination_pairs]
+
+
+def _group_value_by_device(per_device_values):
+  """Group values into sublists by their devices.
+
+  This grouping is needed to call the all-reduce library.
+
+  Args:
+    per_device_values: a list of PerDevice obejcts.
+
+  Returns:
+    a list of lists, each sublist has components for its corresponding device of
+      PerDevice objects, paired with a None.
+  """
+  destinations = per_device_values[0].devices
+  grouped = [[] for _ in range(len(destinations))]
+  for per_device_value in per_device_values:
+    # pylint: disable=protected-access
+    for i, v in enumerate(per_device_value._index.values()):
+      assert per_device_value.devices == destinations
+      grouped[i].append((v, None))
+  return grouped
+
+
+def _ungroup_and_make_mirrored(grouped_reduced, destinations, method_string):
+  """Ungroup results from all-reduce and make Mirrored objects.
+
+  Each all-reduce result will be divided by the number of destinations before
+  Mirrored objects are created if method_string is "mean".
+
+  Args:
+    grouped_reduced: a list of lists, each sublist has components for each
+      device, paired with a None. It is the result from
+      cross_tower_utils.aggregate_gradients_using*.
+    destinations: a list of device strings for returned Mirrored objects.
+    method_string: "mean" or "sum".
+
+  Returns:
+    a list of Mirrored objects.
+  """
+  index = [{} for _ in range(len(grouped_reduced[0]))]
+  for d, per_device_reduced in enumerate(grouped_reduced):
+    for i, (v, _) in enumerate(per_device_reduced):
+      if method_string == "mean":
+        index[i][destinations[d]] = v / len(destinations)
+      else:
+        index[i][destinations[d]] = v
+  return [value_lib.Mirrored(v) for v in index]
+
+
+class ConcatAndSplitPacker(object):
+  """Concatenate and split tensors for reduction."""
+
+  def __init__(self, num_packs=1):
+    """Initialize the ConcatAndSplitPacker object.
+
+    Args:
+      num_packs: specifies the number of split packs that will be
+        formed.
+
+    Raises:
+      ValueError: if num_packs is not greater than 0.
+    """
+    if num_packs <= 0:
+      raise ValueError("num_packs must be greater than zero.")
+    self.num_packs = num_packs
+
+  def pack(self, grouped_grads_and_vars):
+    """Pack tensors."""
+    self.grouped_grads_and_vars = grouped_grads_and_vars
+    self.all_tower_shapes = []
+    self.all_tower_sizes = []
+
+    device_grad_packs = []
+    for tower_grads_and_vars in grouped_grads_and_vars:
+      with ops.colocate_with(tower_grads_and_vars[0][0]):
+        # Flatten all the grads.
+        flat_grads = [
+            array_ops.reshape(g, [-1]) for g, _ in tower_grads_and_vars
+        ]
+        # Remember the original shape of all the grads.
+        tower_shapes = [array_ops.shape(g) for g, _ in tower_grads_and_vars]
+        # Remember the original sizes of all the grads.
+        tower_sizes = [array_ops.size(g) for g, _ in tower_grads_and_vars]
+        # Concat all the flat grads into a big flat tensor.
+        concat_grads = array_ops.concat(flat_grads, 0)
+
+        # Split the big tensor into num_splits packs. In cases where the
+        # total size is not divisible num_splits, the last pack gets
+        # more elements.
+        # TODO(zhengxq): it is also possible to optimize away all the concat
+        # as well.
+        num_splits = self.num_packs
+        total_grad_size = array_ops.size(concat_grads)
+        split_size = total_grad_size // num_splits
+        split_size_last = total_grad_size - split_size * (num_splits - 1)
+        split_sizes = [split_size] * (num_splits - 1) + [split_size_last]
+        grad_packs = array_ops.split(concat_grads, split_sizes)
+
+        # Ready to aggregate the repacked gradients, with fake variables.
+        # TODO(zhengxq): It is hacky to have to use fake variables.
+        # We should remove the need for variables in
+        # aggregate_gradients_using*.
+        device_grad_packs.append(zip(grad_packs, [None] * num_splits))
+        self.all_tower_shapes.append(tower_shapes)
+        self.all_tower_sizes.append(tower_sizes)
+
+    return device_grad_packs
+
+  def unpack(self, summed_device_grad_packs):
+    """Reverse the pack."""
+    aggregated_device_grads = []
+    for (summed_tower_grad_packs,
+         tower_grads_and_vars, tower_shapes, tower_sizes) in zip(
+             summed_device_grad_packs, self.grouped_grads_and_vars,
+             self.all_tower_shapes, self.all_tower_sizes):
+      # pylint: enable=line-too-long
+      # Reverse the packing operations in the previous steps. Form the
+      # summed gradients back into their original shapes.
+      with ops.colocate_with(summed_tower_grad_packs[0][0]):
+        # Form a list of the summed grad packs.
+        device_grad_packs = [g for g, _ in summed_tower_grad_packs]
+
+        # Concat them back into a big flat tensor.
+        device_grads_concat = array_ops.concat(device_grad_packs, 0)
+
+        # Split the tensors back into their original sizes.
+        grads_with_sizes = array_ops.split(device_grads_concat, tower_sizes)
+
+        # Reshape the tensors back into their original shapes.
+        grads_with_shapes = [
+            array_ops.reshape(grad, shape)
+            for shape, grad in zip(tower_shapes, grads_with_sizes)
+        ]
+
+        # Form the list with the original list of variables.
+        summed_tower_grads = [
+            (g, v) for g, (_, v) in zip(grads_with_shapes, tower_grads_and_vars)
+        ]
+        aggregated_device_grads.append(summed_tower_grads)
+    return aggregated_device_grads
+
+
+class AggregateSmallTensorPacker(object):
+  """Concatenate small gradient tensors together for reduction."""
+
+  def __init__(self,
+               agg_small_grads_max_bytes=1048576,
+               agg_small_grads_max_group=16):
+    """Initialize the AggregateSmallTensorPacker object.
+
+    Args:
+      agg_small_grads_max_bytes: largest tensor eligible for aggregation,
+        in number of bytes.
+      agg_small_grads_max_group: largest permitted aggregation of small
+        tensors.
+
+    Raises:
+      ValueError: if `agg_small_grads_max_bytes` or `agg_small_grads_max_group`
+        is not greater than 0.
+    """
+    if agg_small_grads_max_bytes <= 0 or agg_small_grads_max_group <= 0:
+      raise ValueError("agg_small_grads_max_bytes and agg_small_grads_max_group"
+                       " should both be greater than zero.")
+    self.agg_small_grads_max_bytes = agg_small_grads_max_bytes
+    self.agg_small_grads_max_group = agg_small_grads_max_group
+
+  def pack(self, grouped_grads_and_vars):
+    """Aggregate small tensors."""
+    if (self.agg_small_grads_max_bytes > 0 and
+        self.agg_small_grads_max_group > 0):
+      tower_grads, self.packing = cross_tower_utils.pack_small_tensors(
+          grouped_grads_and_vars,
+          max_bytes=self.agg_small_grads_max_bytes,
+          max_group=self.agg_small_grads_max_group)
+    return tower_grads
+
+  def unpack(self, summed_device_grad_packs):
+    """Reverse the aggregation process."""
+    return cross_tower_utils.unpack_small_tensors(summed_device_grad_packs,
+                                                  self.packing)
+
+
+class AllReduceCrossTowerOps(CrossTowerOps):
+  """Reduction using all reduce."""
+
+  def __init__(self,
+               all_reduce_alg="nccl",
+               num_packs=1,
+               agg_small_grads_max_bytes=0,
+               agg_small_grads_max_group=10):
+    """All-reduce implementation of CrossTowerOps.
+
+    Before performing all-reduce, tensors will be repacked or aggregated for
+    more efficient cross-device transportation:
+      1) If `num_packs` is non-zero, pack values into
+        `num_packs` splits.
+      2) Otherwise, if `agg_small_grads_max_bytes` > 0 and
+        `agg_small_grads_max_group` > 0, aggregate values smaller than
+        `agg_small_grads_max_bytes` into groups with at most
+        `agg_small_grads_max_group` values.
+      3) Otherwise, no repacking or grouping will happen.
+
+    Args:
+      all_reduce_alg: the all-reduce algorithm to use, currently only "nccl" or
+        "hierarchical_copy" are supported.
+      num_packs: see above.
+      agg_small_grads_max_bytes: see above.
+      agg_small_grads_max_group: see above.
+        tensors.
+    """
+    self.all_reduce_alg = all_reduce_alg
+    self.num_packs = num_packs
+    self.agg_small_grads_max_bytes = agg_small_grads_max_bytes
+    self.agg_small_grads_max_group = agg_small_grads_max_group
+    super(AllReduceCrossTowerOps, self).__init__()
+
+  def _reduce(self, method_string, per_device_value, destinations):
+    if ((destinations is None or _devices_match(per_device_value, destinations))
+        and not context.executing_eagerly()):
+      return self._batch_all_reduce(method_string, [per_device_value])[0]
+    else:
+      devices = _get_devices_from(destinations or per_device_value)
+      reduce_to_device = devices[0]
+      reduced = _simple_reduce(per_device_value, reduce_to_device,
+                               math_ops.add_n, method_string)
+      return self.broadcast(reduced, devices)
+
+  def _batch_reduce(self, method_string, value_destination_pairs):
+    if (_all_devices_match(value_destination_pairs) and
+        not context.executing_eagerly()):
+      return self._batch_all_reduce(method_string,
+                                    [v[0] for v in value_destination_pairs])
+    else:
+      if not context.executing_eagerly():
+        logging.warning("Efficient batch_reduce is not supported if "
+                        "destinations are different.")
+      return [
+          self._reduce(method_string, t, destinations=v)
+          for t, v in value_destination_pairs
+      ]
+
+  def _batch_all_reduce(self, method_string, per_device_values):
+    """All reduce algorithm in a batch."""
+    destinations = per_device_values[0].devices
+    grouped = _group_value_by_device(per_device_values)
+    if self.num_packs > 0:
+      logging.info(
+          "batch_all_reduce invoked for batches size = %d with "
+          "algorithm = %s and num_packs = %d", len(per_device_values),
+          self.all_reduce_alg, self.num_packs)
+      tensor_packer = ConcatAndSplitPacker(self.num_packs)
+      device_grad_packs = tensor_packer.pack(grouped)
+    elif (self.agg_small_grads_max_bytes > 0 and
+          self.agg_small_grads_max_group > 0):
+      logging.info(
+          "batch_all_reduce invoked for batches size = %d with "
+          "algorithm = %s, agg_small_grads_max_bytes = %d and "
+          "agg_small_grads_max_group = %d", len(per_device_values),
+          self.all_reduce_alg, self.agg_small_grads_max_bytes,
+          self.agg_small_grads_max_group)
+      tensor_packer = AggregateSmallTensorPacker(100, 10)
+      device_grad_packs = tensor_packer.pack(grouped)
+    else:
+      logging.info(
+          "batch_all_reduce invoked for batches size = %d with algorithm = %s",
+          len(per_device_values), self.all_reduce_alg)
+      tensor_packer = None
+      device_grad_packs = grouped
+
+    # The actual aggregation of the repacked gradients. Note that they are
+    # sharded among different aggregation trees. So it is important to strike
+    # the balance on num_splits.
+    if self.all_reduce_alg == "nccl":
+      reduced = cross_tower_utils.aggregate_gradients_using_nccl(
+          device_grad_packs)
+    else:
+      # TODO(yuefengz): check that gpu ids in `destinations` are in ascending
+      # order.
+      reduced = (
+          cross_tower_utils.aggregate_gradients_using_hierarchical_copy(
+              destinations, device_grad_packs))
+
+    if tensor_packer:
+      reduced = tensor_packer.unpack(reduced)
+
+    return _ungroup_and_make_mirrored(reduced, per_device_values[0].devices,
+                                      method_string)
+
+
+_dgx1_links = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7],
+               [0, 5, 6, 7], [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6]]
+
+
+def _has_dgx1_like_links(gpu_links):
+  if not gpu_links:
+    return False
+  # TODO(yuefengz): figure out the right topology for hierarchial copy if
+  # number of gpus are less than 8.
+  if len(gpu_links) < 8:
+    return False
+  for i, (gpu_link, dgx1_link) in enumerate(zip(gpu_links, _dgx1_links)):
+    if (set(gpu_link) != set(dgx1_link) and
+        set(gpu_link) != set(dgx1_link + [i])):
+      return False
+  return True
+
+
+def _choose_all_reduce_algorithm(device_links):
+  if _has_dgx1_like_links(device_links):
+    logging.info("Configured hierarchical_copy with num_packs=%d",
+                 len(device_links))
+    return AllReduceCrossTowerOps(
+        "hierarchical_copy", num_packs=len(device_links))
+  else:
+    logging.info("Configured nccl all-reduce.")
+    return AllReduceCrossTowerOps("nccl", num_packs=1)
+
+
+def choose_the_best(devices, session_config=None):
+  """Find the best subclass of CrossTowerOps given a tensorflow session.
+
+  Args:
+    devices: a list of devices passed for distribute strategy.
+    session_config: a tensorflow session config or None. If None, it will make
+      deciesion based on all local devices.
+
+  Returns:
+    a subclass of CrossTowerOps.
+  """
+  requested_devices = set([device_util.canonicalize(d) for d in devices])
+  machine_devices = device_lib.list_local_devices(session_config=session_config)
+  using_devices = []
+  for d in machine_devices:
+    if device_util.canonicalize(d.name) in requested_devices:
+      using_devices.append(d)
+    else:
+      logging.info(
+          "Device is available but not used by distribute strategy: %s", d.name)
+
+  if len(using_devices) != len(requested_devices):
+    logging.warning("Not all devices in distribute strategy are visible by "
+                    "TensorFlow sessions.")
+    return ReductionToOneDeviceCrossTowerOps()
+
+  if any([d.device_type.lower() != "gpu" for d in using_devices]):
+    logging.warning("Not all devices in DistributionStrategy are visible to "
+                    "TensorFlow session.")
+    return ReductionToOneDeviceCrossTowerOps()
+
+  device_links = [[] for _ in range(len(using_devices))]
+  for i, device in enumerate(using_devices):
+    for link in device.locality.links.link:
+      device_links[i].append(link.device_id)
+
+  return _choose_all_reduce_algorithm(device_links)
diff --git a/tensorflow/contrib/distribute/python/cross_tower_ops_test.py b/tensorflow/contrib/distribute/python/cross_tower_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c7b0870887465ec2fe40007695d099277db38bf
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/cross_tower_ops_test.py
@@ -0,0 +1,221 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for CrossTowerOps."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+
+from absl.testing import parameterized
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib
+from tensorflow.contrib.distribute.python import values as value_lib
+from tensorflow.python.eager import context
+from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+
+
+def _make_per_device(values, devices):
+  devices = cross_tower_ops_lib._get_devices_from(devices)
+  assert len(values) == len(devices)
+  index = {}
+  for d, v in zip(devices, values):
+    with ops.device(d):
+      placed_v = array_ops.identity(v)
+    index[d] = placed_v
+  return value_lib.PerDevice(index)
+
+
+# pylint: disable=g-doc-args,g-doc-return-or-yield
+def _fake_mirrored(value, devices):
+  """Create a faked Mirrored object for testing.
+
+  All components of the returned Mirrored have the same objects, which is not
+  true in reality.
+  """
+  devices = cross_tower_ops_lib._get_devices_from(devices)
+  return value_lib.Mirrored(
+      {d: v for d, v in zip(devices, [value] * len(devices))})
+
+
+_cpu_device = "/device:CPU:0"
+
+
+class CrossTowerOpsTest(test.TestCase, parameterized.TestCase):
+
+  def _assert_value_equal(self, left, right):
+    if isinstance(left, list):
+      for l, r in zip(left, right):
+        self._assert_value_equal(l, r)
+    else:
+      self.assertEqual(type(left), type(right))
+      self.assertEqual(left.devices, right.devices)
+      if context.executing_eagerly():
+        self.assertEqual([v.numpy() for v in left._index.values()],
+                         list(right._index.values()))
+      else:
+        with self.test_session() as sess:
+          self.assertEqual(
+              sess.run(list(left._index.values())), list(right._index.values()))
+
+  # TODO(yuefengz): decouple the num_gpus check from distribution in
+  # combinations module so that we can pass in devices instead of a distribution
+  # strategy.
+  reduction_to_one_combinations = combinations.combine(
+      cross_tower_ops=[
+          combinations.NamedObject(
+              "DefaultReductionToOneDeviceCrossTowerOps",
+              cross_tower_ops_lib.ReductionToOneDeviceCrossTowerOps()),
+          combinations.NamedObject(
+              "ReductionToCPUDeviceCrossTowerOps",
+              cross_tower_ops_lib.ReductionToOneDeviceCrossTowerOps(
+                  reduce_to_device=_cpu_device)),
+          combinations.NamedObject(
+              "AccumulateNCrossTowerOp",
+              cross_tower_ops_lib.ReductionToOneDeviceCrossTowerOps(
+                  accumulation_fn=math_ops.accumulate_n)),
+      ],
+      distribution=[
+          combinations.one_device_strategy,
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.mirrored_strategy_with_two_gpus
+      ],
+      mode=["graph", "eager"])
+  allreduce_combinations = combinations.combine(
+      cross_tower_ops=[
+          combinations.NamedObject(
+              "AllReduce",
+              cross_tower_ops_lib.AllReduceCrossTowerOps("nccl", 1, 0, 0)),
+          combinations.NamedObject(
+              "HierarchicalCopy",
+              cross_tower_ops_lib.AllReduceCrossTowerOps(
+                  "hierarchical_copy", 8, 0, 0)),
+          combinations.NamedObject(
+              "AllReduceNoGradientRepacking",
+              cross_tower_ops_lib.AllReduceCrossTowerOps("nccl", 0, 0, 0)),
+          combinations.NamedObject(
+              "HierarchicalCopyAggregateSmallTensors",
+              cross_tower_ops_lib.AllReduceCrossTowerOps(
+                  "hierarchical_copy", 0, 100, 10))
+      ],
+      distribution=[combinations.mirrored_strategy_with_two_gpus],
+      mode=["graph", "eager"])
+
+  @combinations.generate(reduction_to_one_combinations + allreduce_combinations)
+  def testReductionAndBroadcast(self, cross_tower_ops, distribution):
+    devices = distribution.worker_devices
+
+    values = [constant_op.constant(float(d)) for d in range(len(devices))]
+    per_device = _make_per_device(values, devices)
+    mean = (len(devices) - 1.) / 2.
+
+    values_2 = [constant_op.constant(d + 1.0) for d in range(len(devices))]
+    per_device_2 = _make_per_device(values_2, devices)
+    mean_2 = mean + 1.
+
+    destination_mirrored = _fake_mirrored(1., devices)
+    destination_different = _fake_mirrored(1., _cpu_device)
+    destination_str = _cpu_device
+    destination_list = devices
+
+    all_destinations = [
+        None, destination_mirrored, destination_different, destination_str,
+        destination_list
+    ]
+
+    # test reduce()
+    for destinations in all_destinations:
+      self._assert_value_equal(
+          cross_tower_ops.reduce("mean", per_device, destinations=destinations),
+          _fake_mirrored(mean, destinations or per_device))
+      self._assert_value_equal(
+          cross_tower_ops.reduce(
+              "mean", per_device_2, destinations=destinations),
+          _fake_mirrored(mean_2, destinations or per_device))
+      self._assert_value_equal(
+          cross_tower_ops.reduce("sum", per_device, destinations=destinations),
+          _fake_mirrored(mean * len(devices), destinations or per_device))
+      self._assert_value_equal(
+          cross_tower_ops.reduce(
+              "sum", per_device_2, destinations=destinations),
+          _fake_mirrored(mean_2 * len(devices), destinations or per_device))
+
+    # test batch_reduce()
+    for d1, d2 in itertools.product(all_destinations, all_destinations):
+      self._assert_value_equal(
+          cross_tower_ops.batch_reduce(
+              "mean", [(per_device, d1), (per_device_2, d2)]),
+          [_fake_mirrored(mean, d1 or per_device),
+           _fake_mirrored(mean_2, d2 or per_device_2)])
+      self._assert_value_equal(
+          cross_tower_ops.batch_reduce(
+              "sum", [(per_device, d1), (per_device_2, d2)]),
+          [_fake_mirrored(mean * len(devices), d1 or per_device),
+           _fake_mirrored(mean_2 * len(devices), d2 or per_device_2)])
+
+    # test broadcast()
+    for destinations in all_destinations:
+      if destinations is None:
+        continue
+      else:
+        self._assert_value_equal(
+            cross_tower_ops.broadcast(constant_op.constant(1.), destinations),
+            _fake_mirrored(1., destinations))
+
+  def testChooseAlgorithm(self):
+    device_links = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7],
+                    [0, 5, 6, 7], [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6]]
+    result = cross_tower_ops_lib._choose_all_reduce_algorithm(device_links)
+    self.assertTrue(
+        isinstance(result, cross_tower_ops_lib.AllReduceCrossTowerOps))
+    self.assertEqual(result.all_reduce_alg, "hierarchical_copy")
+    self.assertEqual(result.num_packs, 8)
+
+    # if there are only 4 devices
+    device_links = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7]]
+    result = cross_tower_ops_lib._choose_all_reduce_algorithm(device_links)
+    self.assertTrue(
+        isinstance(result, cross_tower_ops_lib.AllReduceCrossTowerOps))
+    self.assertEqual(result.all_reduce_alg, "nccl")
+    self.assertEqual(result.num_packs, 1)
+
+    # if devices links contain each device itself
+    device_links = [[0, 1, 2, 3, 4], [0, 1, 2, 3, 5], [0, 1, 2, 3, 6],
+                    [0, 1, 2, 3, 7], [0, 4, 5, 6, 7], [1, 4, 5, 6, 7],
+                    [2, 4, 5, 6, 7], [3, 4, 5, 6, 7]]
+    result = cross_tower_ops_lib._choose_all_reduce_algorithm(device_links)
+    self.assertTrue(
+        isinstance(result, cross_tower_ops_lib.AllReduceCrossTowerOps))
+    self.assertEqual(result.all_reduce_alg, "hierarchical_copy")
+    self.assertEqual(result.num_packs, 8)
+
+    # if not dgx1-like links
+    device_links = [[0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7], [0, 5, 6, 7],
+                    [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6], [1, 2, 3, 4]]
+    result = cross_tower_ops_lib._choose_all_reduce_algorithm(device_links)
+    self.assertTrue(
+        isinstance(result, cross_tower_ops_lib.AllReduceCrossTowerOps))
+    self.assertEqual(result.all_reduce_alg, "nccl")
+    self.assertEqual(result.num_packs, 1)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/cross_tower_utils.py b/tensorflow/contrib/distribute/python/cross_tower_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc04e2195f6d305e0f7c642f24c355286f1a8cfa
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/cross_tower_utils.py
@@ -0,0 +1,339 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for cross_tower_ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections as pycoll
+
+from tensorflow.contrib import nccl
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+
+
+def aggregate_gradients_using_nccl(tower_grads):
+  """Aggregate gradients using nccl allreduce."""
+  agg_all_g_and_v = []
+  for single_g_and_v in zip(*tower_grads):
+    single_grads = [g for g, _ in single_g_and_v]
+    agg_grads = nccl.all_sum(single_grads)
+    agg_all_g_and_v.append(
+        [(g, v) for g, (_, v) in zip(agg_grads, single_g_and_v)])
+
+  agg_all_g_and_v = list(zip(*agg_all_g_and_v))
+
+  return agg_all_g_and_v
+
+
+def aggregate_gradients_using_hierarchical_copy(avail_devices, tower_grads):
+  """Aggregate gradients using hierarchical copies.
+
+  Args:
+    avail_devices: available GPU devices.
+    tower_grads: List of lists of (gradient, variable) tuples. The outer list
+      is over towers. The inner list is over individual gradients.
+
+  Returns:
+    The list of (aggregated_gradient, variable), where the gradient has been
+      summed across all towers and the variable is chosen from the first tower.
+  """
+  # This only works for DGX-1 type of machine topology
+  # Device peer to peer matrix
+  # DMA: 0 1 2 3 4 5 6 7
+  # 0:   Y Y Y Y Y N N N
+  # 1:   Y Y Y Y N Y N N
+  # 2:   Y Y Y Y N N Y N
+  # 3:   Y Y Y Y N N N Y
+  # 4:   Y N N N Y Y Y Y
+  # 5:   N Y N N Y Y Y Y
+  # 6:   N N Y N Y Y Y Y
+  # 7:   N N N Y Y Y Y Y
+  agg_grads = []
+  num_devices = len(avail_devices)
+  # In the special case of DGX-1 machine topology, the two groups have equal
+  # size.
+  group_size = num_devices // 2
+  for i, single_grads in enumerate(zip(*tower_grads)):
+    group_0_main_device = i % num_devices
+    group_1_main_device = (group_0_main_device + group_size) % num_devices
+    if group_0_main_device < group_size:
+      group_0_begin = 0
+      group_1_begin = group_size
+    else:
+      group_0_begin = group_size
+      group_1_begin = 0
+
+    # Aggregate the first group.
+    group_0_device_grads = single_grads[group_0_begin:
+                                        group_0_begin + group_size]
+    with ops.device(avail_devices[group_0_main_device]):
+      group_0_agg_grads, _ = aggregate_single_gradient_using_copy(
+          group_0_device_grads, False, False)
+
+    # Aggregate the second group.
+    group_1_device_grads = single_grads[group_1_begin:
+                                        group_1_begin + group_size]
+    with ops.device(avail_devices[group_1_main_device]):
+      group_1_agg_grads, _ = aggregate_single_gradient_using_copy(
+          group_1_device_grads, False, False)
+
+    # Aggregate between the groups.
+    with ops.device(avail_devices[group_0_main_device]):
+      (agg_total_grads, _), _ = aggregate_single_gradient_using_copy(
+          [group_0_agg_grads, group_1_agg_grads], False, False)
+
+    # Broadcast the result back into the root of each group.
+    with ops.device(avail_devices[group_0_main_device]):
+      group_0_agg_grads_bcast = array_ops.identity(agg_total_grads)
+    with ops.device(avail_devices[group_1_main_device]):
+      group_1_agg_grads_bcast = array_ops.identity(agg_total_grads)
+
+    agg_grads_bcast = []
+    for j in range(len(single_grads)):
+      with ops.device(avail_devices[j]):
+        # Broadcast the result back to each member in the group from the root.
+        if (group_0_main_device < group_size) == (j < group_size):
+          src_device_grad = group_0_agg_grads_bcast
+        else:
+          src_device_grad = group_1_agg_grads_bcast
+        agg_grads_bcast.append(array_ops.identity(src_device_grad))
+
+    agg_grads.append(
+        [(g, v) for g, (_, v) in zip(agg_grads_bcast, single_grads)])
+
+  agg_grads = list(zip(*agg_grads))
+
+  return agg_grads
+
+
+def aggregate_single_gradient_using_copy(grad_and_vars, use_mean,
+                                         check_inf_nan):
+  """Calculate the average gradient for a shared variable across all towers.
+
+  Note that this function provides a synchronization point across all towers.
+
+  Args:
+    grad_and_vars: A list or tuple of (gradient, variable) tuples. Each
+      (gradient, variable) pair within the outer list represents the gradient
+      of the variable calculated for a single tower, and the number of pairs
+      equals the number of towers.
+    use_mean: if True, mean is taken, else sum of gradients is taken.
+    check_inf_nan: check grads for nans and infs.
+
+  Returns:
+    The tuple ([(average_gradient, variable),], has_nan_or_inf) where the
+      gradient has been averaged across all towers. The variable is chosen from
+      the first tower. The has_nan_or_inf indicates the grads has nan or inf.
+  """
+  grads = [g for g, _ in grad_and_vars]
+  grad = math_ops.add_n(grads)
+
+  if use_mean and len(grads) > 1:
+    grad = array_ops.multiply(grad, 1.0 / len(grads))
+
+  v = grad_and_vars[0][1]
+  if check_inf_nan:
+    has_nan_or_inf = array_ops.logical_not(
+        array_ops.reduce_all(array_ops.is_finite(grads)))
+    return (grad, v), has_nan_or_inf
+  else:
+    return (grad, v), None
+
+
+def extract_ranges(index_list, range_size_limit=32):
+  """Extract consecutive ranges and singles from index_list.
+
+  Args:
+    index_list: List of monotone increasing non-negative integers.
+    range_size_limit: Largest size range to return.  If a larger
+      consecutive range exists, it will be returned as multiple
+      ranges.
+
+  Returns:
+    (ranges, singles) where ranges is a list of [first, last] pairs of
+      consecutive elements in index_list, and singles is all of the
+      other elements, in original order.
+  """
+  if not index_list:
+    return [], []
+  first = index_list[0]
+  last = first
+  ranges = []
+  singles = []
+  for i in index_list[1:]:
+    if i == last + 1 and (last - first) <= range_size_limit:
+      last = i
+    else:
+      if last > first:
+        ranges.append([first, last])
+      else:
+        singles.append(first)
+      first = i
+      last = i
+  if last > first:
+    ranges.append([first, last])
+  else:
+    singles.append(first)
+  return ranges, singles
+
+
+GradPackTuple = pycoll.namedtuple('GradPackTuple', 'indices vars shapes')
+
+
+def pack_range(key, packing, grad_vars, rng):
+  """Form the concatenation of a specified range of gradient tensors.
+
+  Args:
+    key: Value under which to store meta-data in packing that will be used
+      later to restore the grad_var list structure.
+    packing: Dict holding data describing packed ranges of small tensors.
+    grad_vars: List of (grad, var) pairs for one tower.
+    rng: A pair of integers giving the first, last indices of a consecutive
+      range of tensors to be packed.
+
+  Returns:
+    A tensor that is the concatenation of all the specified small tensors.
+  """
+  to_pack = grad_vars[rng[0]:rng[1] + 1]
+  members = []
+  variables = []
+  restore_shapes = []
+  with ops.name_scope('pack'):
+    for g, v in to_pack:
+      variables.append(v)
+      restore_shapes.append(g.shape)
+      with ops.device(g.device):
+        members.append(array_ops.reshape(g, [-1]))
+    packing[key] = GradPackTuple(
+        indices=range(rng[0], rng[1] + 1),
+        vars=variables,
+        shapes=restore_shapes)
+    with ops.device(members[0].device):
+      return array_ops.concat(members, 0)
+
+
+def unpack_grad_tuple(gv, gpt):
+  """Unpack a previously packed collection of gradient tensors.
+
+  Args:
+    gv: A (grad, var) pair to be unpacked.
+    gpt: A GradPackTuple describing the packing operation that produced gv.
+
+  Returns:
+    A list of (grad, var) pairs corresponding to the values that were
+     originally packed into gv, maybe following subsequent operations like
+     reduction.
+  """
+  elt_widths = [x.num_elements() for x in gpt.shapes]
+  with ops.device(gv[0][0].device):
+    with ops.name_scope('unpack'):
+      splits = array_ops.split(gv[0], elt_widths)
+      unpacked_gv = []
+      for idx, s in enumerate(splits):
+        unpacked_gv.append((array_ops.reshape(s, gpt.shapes[idx]),
+                            gpt.vars[idx]))
+  return unpacked_gv
+
+
+def pack_small_tensors(tower_grads, max_bytes=0, max_group=0):
+  """Concatenate small gradient tensors together for reduction.
+
+  Args:
+    tower_grads: List of lists of (gradient, variable) tuples.
+    max_bytes: Int giving max number of bytes in a tensor that
+      may be considered small.
+    max_group: Int giving max number of small tensors that may be
+      concatenated into one new tensor.
+
+  Returns:
+    new_tower_grads, packing where new_tower_grads is identical to
+      tower_grads except that all feasible small_tensors have been removed
+      from their places and concatenated into larger tensors that are
+      now in the front of the list for each tower, and packing contains
+      the data necessary to restore the tower_grads structure.
+
+  Look through the first tower for gradients of the same type (float),
+  and small size, that are all sequential.  For each such group,
+  replace by a new tensor that is a flattened concatenation.  Note
+  that the corresponding variable will be absent, which doesn't matter
+  because it isn't used during all-reduce.
+
+  Requires:
+    Every gv_list in towers must have isomorphic structure including identical
+      tensor sizes and types.
+  """
+  small_indices = []
+  large_indices = []
+  for idx, (g, _) in enumerate(tower_grads[0]):
+    if g.dtype == dtypes.float32 and (4 * g.shape.num_elements()) <= max_bytes:
+      small_indices.append(idx)
+    else:
+      large_indices.append(idx)
+  small_ranges, small_singles = extract_ranges(
+      small_indices, range_size_limit=max_group)
+  large_indices = sorted(large_indices + small_singles)
+  num_gv = len(tower_grads[0])
+  packing = {}
+  if small_ranges:
+    new_tower_grads = []
+    for dev_idx, gv_list in enumerate(tower_grads):
+      assert len(gv_list) == num_gv
+      new_gv_list = []
+      for r in small_ranges:
+        key = '%d:%d' % (dev_idx, len(new_gv_list))
+        new_gv_list.append((pack_range(key, packing, gv_list, r),
+                            'packing_var_placeholder'))
+      for i in large_indices:
+        new_gv_list.append(gv_list[i])
+      new_tower_grads.append(new_gv_list)
+    return new_tower_grads, packing
+  else:
+    return tower_grads, None
+
+
+def unpack_small_tensors(tower_grads, packing):
+  """Undo the structure alterations to tower_grads done by pack_small_tensors.
+
+  Args:
+    tower_grads: List of List of (grad, var) tuples.
+    packing: A dict generated by pack_small_tensors describing the changes
+      it made to tower_grads.
+
+  Returns:
+    new_tower_grads: identical to tower_grads except that concatenations
+      of small tensors have been split apart and returned to their original
+      positions, paired with their original variables.
+  """
+  if not packing:
+    return tower_grads
+  new_tower_grads = []
+  num_devices = len(tower_grads)
+  num_packed = len(packing.keys()) // num_devices
+  for dev_idx, gv_list in enumerate(tower_grads):
+    gv_list = list(gv_list)
+    new_gv_list = gv_list[num_packed:]
+    for i in xrange(0, num_packed):
+      k = '%d:%d' % (dev_idx, i)
+      gpt = packing[k]
+      gv = unpack_grad_tuple(gv_list[i], gpt)
+      for gi, idx in enumerate(gpt.indices):
+        assert idx == gpt.indices[gi]
+        new_gv_list.insert(idx, gv[gi])
+    new_tower_grads.append(new_gv_list)
+  return new_tower_grads
diff --git a/tensorflow/contrib/distribute/python/estimator_integration_test.py b/tensorflow/contrib/distribute/python/estimator_integration_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b49b8f4ef2937c7cffbdbd36ca50f6b0db8c1b0
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/estimator_integration_test.py
@@ -0,0 +1,127 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests that show that DistributionStrategy works with canned Estimator."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import shutil
+import tempfile
+from absl.testing import parameterized
+import numpy as np
+import six
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.optimizer_v2 import adagrad
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import test
+from tensorflow.python.estimator import run_config
+from tensorflow.python.estimator.canned import dnn_linear_combined
+from tensorflow.python.estimator.canned import prediction_keys
+from tensorflow.python.estimator.export import export
+from tensorflow.python.estimator.inputs import numpy_io
+from tensorflow.python.feature_column import feature_column
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import gfile
+from tensorflow.python.summary.writer import writer_cache
+
+
+class DNNLinearCombinedClassifierIntegrationTest(test.TestCase,
+                                                 parameterized.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def dataset_input_fn(self, x, y, batch_size, shuffle):
+
+    def input_fn():
+      dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
+      if shuffle:
+        dataset = dataset.shuffle(batch_size)
+      dataset = dataset.repeat(10).batch(batch_size)
+      return dataset
+
+    return input_fn
+
+  @combinations.generate(
+      combinations.combine(
+          mode=['graph'],
+          distribution=[
+              combinations.one_device_strategy,
+              combinations.mirrored_strategy_without_prefetch
+          ]))
+  def test_complete_flow_with_mode(self, distribution):
+    label_dimension = 2
+    input_dimension = label_dimension
+    batch_size = 10
+    data = np.linspace(0., 2., batch_size * label_dimension, dtype=np.float32)
+    data = data.reshape(batch_size, label_dimension)
+    train_input_fn = self.dataset_input_fn(
+        x={'x': data},
+        y=data,
+        batch_size=batch_size // len(distribution.worker_devices),
+        shuffle=True)
+    eval_input_fn = numpy_io.numpy_input_fn(
+        x={'x': data}, y=data, batch_size=batch_size, shuffle=False)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x={'x': data}, batch_size=batch_size, shuffle=False)
+
+    linear_feature_columns = [
+        feature_column.numeric_column('x', shape=(input_dimension,))
+    ]
+    dnn_feature_columns = [
+        feature_column.numeric_column('x', shape=(input_dimension,))
+    ]
+    feature_columns = linear_feature_columns + dnn_feature_columns
+    estimator = dnn_linear_combined.DNNLinearCombinedRegressor(
+        linear_feature_columns=linear_feature_columns,
+        dnn_hidden_units=(2, 2),
+        dnn_feature_columns=dnn_feature_columns,
+        label_dimension=label_dimension,
+        model_dir=self._model_dir,
+        # TODO(isaprykin): Work around the colocate_with error.
+        dnn_optimizer=adagrad.AdagradOptimizer(0.001),
+        linear_optimizer=adagrad.AdagradOptimizer(0.001),
+        config=run_config.RunConfig(train_distribute=distribution))
+
+    num_steps = 10
+    estimator.train(train_input_fn, steps=num_steps)
+
+    scores = estimator.evaluate(eval_input_fn)
+    self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
+    self.assertIn('loss', six.iterkeys(scores))
+
+    predictions = np.array([
+        x[prediction_keys.PredictionKeys.PREDICTIONS]
+        for x in estimator.predict(predict_input_fn)
+    ])
+    self.assertAllEqual((batch_size, label_dimension), predictions.shape)
+
+    feature_spec = feature_column.make_parse_example_spec(feature_columns)
+    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
+        feature_spec)
+    export_dir = estimator.export_savedmodel(tempfile.mkdtemp(),
+                                             serving_input_receiver_fn)
+    self.assertTrue(gfile.Exists(export_dir))
+
+  def tearDown(self):
+    if self._model_dir:
+      writer_cache.FileWriterCache.clear()
+      shutil.rmtree(self._model_dir)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/examples/BUILD b/tensorflow/contrib/distribute/python/examples/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..cbfd17850212a1c007e2edb9dd3986b3109f040d
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/examples/BUILD
@@ -0,0 +1,30 @@
+# Example TensorFlow models that use DistributionStrategy for training.
+
+package(
+    default_visibility = [
+        "//tensorflow:internal",
+    ],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+py_binary(
+    name = "simple_estimator_example",
+    srcs = ["simple_estimator_example.py"],
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_binary(
+    name = "simple_tfkeras_example",
+    srcs = [
+        "simple_tfkeras_example.py",
+    ],
+    deps = [
+        "//tensorflow:tensorflow_py",
+        "//third_party/py/numpy",
+    ],
+)
diff --git a/tensorflow/contrib/distribute/python/examples/simple_estimator_example.py b/tensorflow/contrib/distribute/python/examples/simple_estimator_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..00c25c7a2482a559c8b94ff3be86c4961dfb439f
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/examples/simple_estimator_example.py
@@ -0,0 +1,87 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A simple example to test the a DistributionStrategy with Estimators.
+
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+
+def build_model_fn_optimizer():
+  """Simple model_fn with optimizer."""
+  # TODO(anjalisridhar): Move this inside the model_fn once OptimizerV2 is
+  # done?
+  optimizer = tf.train.GradientDescentOptimizer(0.2)
+
+  def model_fn(features, labels, mode):  # pylint: disable=unused-argument
+    """model_fn which uses a single unit Dense layer."""
+    # You can also use the Flatten layer if you want to test a model without any
+    # weights.
+    layer = tf.layers.Dense(1, use_bias=True)
+    logits = layer(features)
+
+    if mode == tf.estimator.ModeKeys.PREDICT:
+      predictions = {"logits": logits}
+      return tf.estimator.EstimatorSpec(mode, predictions=predictions)
+
+    def loss_fn():
+      y = tf.reshape(logits, []) - tf.constant(1.)
+      return y * y
+
+    if mode == tf.estimator.ModeKeys.EVAL:
+      return tf.estimator.EstimatorSpec(mode, loss=loss_fn())
+
+    assert mode == tf.estimator.ModeKeys.TRAIN
+
+    global_step = tf.train.get_global_step()
+    train_op = optimizer.minimize(loss_fn(), global_step=global_step)
+    return tf.estimator.EstimatorSpec(mode, loss=loss_fn(), train_op=train_op)
+
+  return model_fn
+
+
+def main(_):
+  distribution = tf.contrib.distribute.MirroredStrategy(
+      ["/device:GPU:0", "/device:GPU:1"])
+  config = tf.estimator.RunConfig(train_distribute=distribution)
+
+  def input_fn():
+    features = tf.data.Dataset.from_tensors([[1.]]).repeat(10)
+    labels = tf.data.Dataset.from_tensors([1.]).repeat(10)
+    return tf.data.Dataset.zip((features, labels))
+
+  estimator = tf.estimator.Estimator(
+      model_fn=build_model_fn_optimizer(), config=config)
+  estimator.train(input_fn=input_fn, steps=10)
+
+  eval_result = estimator.evaluate(input_fn=input_fn)
+  print("Eval result: {}".format(eval_result))
+
+  def predict_input_fn():
+    predict_features = tf.data.Dataset.from_tensors([[1.]]).repeat(10)
+    return predict_features
+
+  predictions = estimator.predict(input_fn=predict_input_fn)
+  # TODO(anjalsridhar): This returns a generator object, figure out how to get
+  # meaningful results here.
+  print("Prediction results: {}".format(predictions))
+
+
+if __name__ == "__main__":
+  tf.app.run()
diff --git a/tensorflow/contrib/distribute/python/examples/simple_tfkeras_example.py b/tensorflow/contrib/distribute/python/examples/simple_tfkeras_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..b87224251ca3844fc81c6f32a893d2c71664a955
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/examples/simple_tfkeras_example.py
@@ -0,0 +1,62 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""An example tf.keras model that is trained using MirroredStrategy."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from sys import argv
+import numpy as np
+import tensorflow as tf
+
+
+def input_fn():
+  x = np.random.random((1024, 10))
+  y = np.random.randint(2, size=(1024, 1))
+  x = tf.cast(x, tf.float32)
+  dataset = tf.data.Dataset.from_tensor_slices((x, y))
+  dataset = dataset.repeat(10)
+  dataset = dataset.batch(32)
+  return dataset
+
+
+def main(args):
+  if len(args) < 2:
+    print('You must specify  model_dir for checkpoints such as'
+          ' /tmp/tfkeras_example./')
+    return
+
+  print('Using %s to store checkpoints.' % args[1])
+
+  strategy = tf.contrib.distribute.MirroredStrategy(
+      ['/device:GPU:0', '/device:GPU:1'])
+  config = tf.estimator.RunConfig(train_distribute=strategy)
+  optimizer = tf.train.GradientDescentOptimizer(0.2)
+
+  model = tf.keras.Sequential()
+  model.add(tf.keras.layers.Dense(16, activation='relu', input_shape=(10,)))
+  model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
+
+  model.compile(loss='binary_crossentropy', optimizer=optimizer)
+  model.summary()
+  tf.keras.backend.set_learning_phase(True)
+  keras_estimator = tf.keras.estimator.model_to_estimator(
+      keras_model=model, config=config, model_dir=args[1])
+
+  keras_estimator.train(input_fn=input_fn, steps=10)
+  eval_result = keras_estimator.evaluate(input_fn=input_fn)
+  print('Eval result: {}'.format(eval_result))
+
+if __name__ == '__main__':
+  tf.app.run(argv=argv)
diff --git a/tensorflow/contrib/distribute/python/minimize_loss_test.py b/tensorflow/contrib/distribute/python/minimize_loss_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0fa90df79bbcd621fe7b7d0da04256b7a59d5bfe
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/minimize_loss_test.py
@@ -0,0 +1,279 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for running legacy optimizer code with DistributionStrategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python import mirrored_strategy
+from tensorflow.contrib.distribute.python.single_loss_example import batchnorm_example
+from tensorflow.contrib.distribute.python.single_loss_example import minimize_loss_example
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
+from tensorflow.python.eager import test
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables as variables_lib
+from tensorflow.python.ops.losses import losses_impl
+
+
+class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
+
+  @combinations.generate(
+      combinations.times(
+          combinations.distributions_and_v1_optimizers(),
+          combinations.combine(mode=["graph"], use_callable_loss=[True, False])
+          + combinations.combine(mode=["eager"], use_callable_loss=[True])))
+  def testTrainNetwork(self, distribution, optimizer_fn,
+                       use_callable_loss=True):
+    with distribution.scope():
+      model_fn, dataset, layer = minimize_loss_example(
+          optimizer_fn,
+          use_bias=True,
+          use_callable_loss=use_callable_loss)
+
+      iterator = distribution.distribute_dataset(dataset)
+
+      def run_step():
+        return distribution.group(
+            distribution.call_for_each_tower(
+                model_fn, iterator.get_next(), run_concurrently=layer.built))
+
+      if not context.executing_eagerly():
+        with self.test_session() as sess:
+          run_step = sess.make_callable(run_step())
+        self.evaluate(variables_lib.global_variables_initializer())
+
+      weights, biases = [], []
+      for _ in range(10):
+        run_step()
+
+        weights.append(self.evaluate(distribution.fetch(layer.kernel)))
+        biases.append(self.evaluate(distribution.fetch(layer.bias)))
+
+      error = abs(numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1)
+      is_not_increasing = all(y <= x for x, y in zip(error, error[1:]))
+      self.assertTrue(is_not_increasing)
+
+  @combinations.generate(
+      combinations.times(
+          combinations.distributions_and_v1_optimizers() +
+          combinations.distributions_and_v2_optimizers(),
+          combinations.combine(mode=["graph", "eager"])))
+  def testOptimizerInsideModelFn(self, distribution, optimizer_fn):
+    created_variables = []
+    trainable_variables = []
+
+    def appending_creator(next_creator, *args, **kwargs):
+      v = next_creator(*args, **kwargs)
+      created_variables.append(v.name)
+      if "trainable" in kwargs and kwargs["trainable"]:
+        trainable_variables.append(v.name)
+      return v
+
+    # Creator scope needs to be set before it's used inside
+    # `distribution.scope`.
+    with variable_scope.variable_creator_scope(
+        appending_creator), distribution.scope():
+      model_fn, dataset, layer = minimize_loss_example(
+          optimizer_fn,
+          use_bias=True,
+          use_callable_loss=True,
+          create_optimizer_inside_model_fn=True)
+
+      iterator = distribution.distribute_dataset(dataset)
+
+      def run_step():
+        return distribution.group(
+            distribution.call_for_each_tower(
+                model_fn, iterator.get_next(), run_concurrently=layer.built))
+
+      if not context.executing_eagerly():
+        with self.test_session() as sess:
+          run_step = sess.make_callable(run_step())
+        self.evaluate(variables_lib.global_variables_initializer())
+
+      run_step()
+
+      def get_expected_variables(optimizer_fn, num_parameter_devices):
+        variables_map = {
+            "GradientDescent": ["dense/kernel", "dense/bias"],
+            "Adam": [
+                "dense/kernel", "dense/bias", "beta1_power", "beta2_power",
+                "dense/kernel/Adam", "dense/kernel/Adam_1", "dense/bias/Adam",
+                "dense/bias/Adam_1"
+            ]
+        }
+        variables = variables_map[optimizer_fn().get_name()]
+        variables.extend([
+            v + "/replica_{}".format(replica)
+            for v in variables
+            for replica in range(1, num_parameter_devices)
+        ])
+        return set([v + ":0" for v in variables])
+
+      self.assertEqual(
+          get_expected_variables(optimizer_fn,
+                                 len(distribution.parameter_devices)),
+          set(created_variables))
+
+  @combinations.generate(
+      combinations.times(combinations.distributions_and_v1_optimizers(),
+                         combinations.combine(
+                             mode=["graph", "eager"],
+                             momentum=[0.8, 0.9, 0.99],
+                             renorm=[False, True])))
+  def testTrainNetworkWithBatchNorm(self, distribution, optimizer_fn, momentum,
+                                    renorm):
+    """Verifies that moving mean updates are reduced across towers."""
+    with distribution.scope():
+      num_towers = len(distribution.worker_devices)
+      model_fn, dataset, batchnorm = batchnorm_example(
+          optimizer_fn,
+          batch_per_epoch=num_towers,
+          momentum=momentum,
+          renorm=renorm)
+
+      # Disable prefetching since that makes the specific input on each device
+      # to be non deterministic, and this test relies on specific input being
+      # on each device.
+      if isinstance(distribution, mirrored_strategy.MirroredStrategy):
+        distribution._prefetch_on_device = False
+      iterator = distribution.distribute_dataset(dataset)
+
+      def run_step():
+        return control_flow_ops.group(
+            distribution.unwrap(
+                distribution.call_for_each_tower(
+                    model_fn,
+                    iterator.get_next(),
+                    run_concurrently=batchnorm.built)) +
+            ops.get_collection(ops.GraphKeys.UPDATE_OPS))
+
+      if not context.executing_eagerly():
+        with self.test_session() as sess:
+          run_step = sess.make_callable(run_step())
+        self.evaluate(variables_lib.global_variables_initializer())
+
+      expected_moving_means = [0.] * 8
+
+      def averaged_batch_mean(i):
+        # Each batch has shape [16, 8] where the ith element in jth list is
+        # (8 * j + i + tower_id * 100). So the batch mean in each tower is
+        # (60 + i + tower_id * 100). So here comes its batch mean over all
+        # towers:
+        return 60. + i + (num_towers - 1.) / 2. * 100.
+
+      for _ in range(10):
+        run_step()
+        moving_means = self.evaluate(distribution.fetch(batchnorm.moving_mean))
+
+        # We make sure that the moving_mean is updated as if the sample mean is
+        # calculated over all towers.
+        for i, expected_moving_mean in enumerate(expected_moving_means):
+          expected_moving_means[i] -= ((
+              expected_moving_mean - averaged_batch_mean(i)) * (1.0 - momentum))
+          self.assertNear(expected_moving_means[i], moving_means[i], 0.0001)
+
+  @combinations.generate(
+      combinations.times(
+          combinations.combine(
+              distribution=[combinations.one_device_strategy,
+                            combinations.mirrored_strategy_with_gpu_and_cpu,
+                            combinations.mirrored_strategy_with_two_gpus],
+              optimizer_fn=[combinations.gradient_descent_optimizer_v1_fn,
+                            combinations.gradient_descent_optimizer_v2_fn],
+              loss_reduction=[losses_impl.Reduction.SUM,
+                              losses_impl.Reduction.MEAN,
+                              losses_impl.Reduction.SUM_OVER_BATCH_SIZE,
+                              losses_impl.Reduction.SUM_OVER_NONZERO_WEIGHTS]),
+          combinations.combine(mode=["graph"], use_callable_loss=[True, False])
+          + combinations.combine(mode=["eager"], use_callable_loss=[True])))
+  def testMeanVsSum(self, distribution, optimizer_fn, loss_reduction,
+                    use_callable_loss):
+    with distribution.scope():
+      all_vars = []
+
+      def model_fn(x, y):
+
+        def loss_fn():
+          # Use fixed initialization to make the steps deterministic.
+          w = variable_scope.get_variable("w", initializer=[[2.]])
+          all_vars.append(w)
+          predict = math_ops.matmul(x, w)
+          return losses_impl.mean_squared_error(
+              y, predict, reduction=loss_reduction)
+
+        optimizer = optimizer_fn()  # GradientDescent with 0.2 learning rate
+
+        if use_callable_loss:
+          return optimizer.minimize(loss_fn)
+        else:
+          return optimizer.minimize(loss_fn())
+
+      features = dataset_ops.Dataset.from_tensors([[2.], [7.]])
+      labels = dataset_ops.Dataset.from_tensors([[6.], [21.]])
+      dataset = dataset_ops.Dataset.zip((features, labels)).repeat()
+      iterator = distribution.distribute_dataset(dataset)
+
+      def run_step():
+        return distribution.group(
+            distribution.call_for_each_tower(
+                model_fn, *iterator.get_next(), run_concurrently=False))
+
+      if not context.executing_eagerly():
+        with self.test_session() as sess:
+          run_step = sess.make_callable(run_step())
+        self.evaluate(variables_lib.global_variables_initializer())
+
+      run_step()
+
+      self.assertEqual(distribution.num_towers, len(all_vars))
+      v = all_vars[0]
+      self.assertTrue(all([v is vi for vi in all_vars[1:]]))
+      weight = numpy.squeeze(self.evaluate(distribution.fetch(v)))
+      # Our model is:
+      #   predict = x * w
+      #   loss = (predict - y)^2
+      #   dloss/dpredict = 2*(predict - y)
+      #   dloss/dw = 2 * x^T @ (predict - y)
+      # For our batch size of 2, assuming sum loss reduction:
+      #   x = [2, 7]
+      #   y = [6, 21]
+      #   w_initial = 2
+      #   predict = [4, 14]
+      #   predict - y = [-2, -7]
+      #   dloss/dw = 2 <[2, 7], [-2, -7]> = - 2(4 + 49) = -106
+      # So unreplicated the update to w with lr=0.2 is -0.2 * -106 = 21.2
+      # with sum loss reduction, or 10.6 with mean.
+      if loss_reduction == losses_impl.Reduction.SUM:
+        # Note that the "distribution.num_towers" factor will go away once
+        # we split the input across towers, instead of pulling a complete
+        # batch of input per tower.
+        self.assertNear(weight, 2 + 21.2 * distribution.num_towers, 0.0001)
+      else:
+        # One of the mean loss reductions.
+        self.assertNear(weight, 2 + 10.6, 0.0001)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy.py b/tensorflow/contrib/distribute/python/mirrored_strategy.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb0edb3a11df7788991ca14f957494d87593a449
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy.py
@@ -0,0 +1,497 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Class MirroredStrategy implementing DistributionStrategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import threading
+import six
+
+from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib
+from tensorflow.contrib.distribute.python import shared_variable_creator
+from tensorflow.contrib.distribute.python import values
+from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.eager import context
+from tensorflow.python.eager import tape
+from tensorflow.python.framework import device as tf_device
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.training import coordinator
+from tensorflow.python.training import device_util
+from tensorflow.python.training import distribute as distribute_lib
+
+
+# TODO(josh11b): Replace asserts in this file with if ...: raise ...
+
+
+def _cpu_device(device):
+  cpu_device = tf_device.DeviceSpec.from_string(device)
+  cpu_device.merge_from(tf_device.DeviceSpec(device_type="CPU", device_index=0))
+  return cpu_device.to_string()
+
+
+class _RequestedStop(Exception):
+  pass
+
+
+class MirroredStrategy(distribute_lib.DistributionStrategy):
+  """Mirrors vars to distribute across multiple devices on a single machine.
+
+  This strategy uses one tower per device and sync replication.
+  """
+
+  def __init__(self,
+               devices=None,
+               num_gpus=None,
+               cross_tower_ops=None,
+               prefetch_on_device=None):
+    super(MirroredStrategy, self).__init__()
+    # Convert `num_gpus` into `devices`, shouldn't specify both.
+    if devices is None:
+      if num_gpus is None:
+        num_gpus = context.num_gpus()
+      devices = ["/device:GPU:%d" % d for d in range(num_gpus)]
+    elif num_gpus is not None:
+      raise ValueError("Must only specify one of `devices` and `num_gpus`.")
+
+    assert devices, "Must specify at least one device."
+    assert len(set(devices)) == len(devices), (
+        "No duplicates allowed in `devices` argument.")
+    # TODO(josh11b): Require at least 2 devices?
+    self._devices = devices
+    self._canonical_device_set = set(
+        [device_util.canonicalize(d) for d in devices])
+    self._device_index = values.PerDevice(
+        dict((d, i) for i, d in enumerate(devices)))
+    self._cross_tower_ops = cross_tower_ops
+    self._prefetch_on_device = prefetch_on_device
+
+  def _create_variable(self, next_creator, *args, **kwargs):
+    """Create a mirrored variable. See `DistributionStrategy.scope`."""
+    # Figure out what collections this variable should be added to.
+    # We'll add the MirroredVariable to those collections instead.
+    collections = kwargs.pop("collections", None)
+    if collections is None:
+      collections = [ops.GraphKeys.GLOBAL_VARIABLES]
+    kwargs["collections"] = []
+
+    colocate_with = kwargs.pop("colocate_with", None)
+    devices = self._get_devices_from(colocate_with)
+
+    tower_local = kwargs.pop("tower_local_reduce_method", None)
+    if tower_local is not None:
+      kwargs["trainable"] = False
+
+    # TODO(josh11b,apassos): It would be better if variable initialization
+    # was never recorded on the tape instead of having to do this manually
+    # here.
+    with tape.stop_recording():
+      index = {}
+      for i, d in enumerate(devices):
+        with ops.device(d):
+          if i > 0:
+            # Give replicas meaningful distinct names:
+            var0name = index[devices[0]].name.split(":")[0]
+            kwargs["name"] = "%s/replica_%d" % (var0name, i)
+            # Initialize replicas with the same value:
+            if context.executing_eagerly():
+              initial_value = index[devices[0]].value()
+            else:
+              initial_value = index[devices[0]].initial_value
+            kwargs["initial_value"] = array_ops.identity(initial_value)
+          with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
+            v = next_creator(*args, **kwargs)
+          assert not isinstance(v, values.DistributedVariable)
+          index[d] = v
+
+      if tower_local is None:
+        result = values.MirroredVariable(index, index[devices[0]])
+      else:
+        result = values.TowerLocalVariable(
+            index, index[devices[0]], tower_local)
+
+    if not context.executing_eagerly():
+      g = ops.get_default_graph()
+      # If "trainable" is True, next_creator() will add the member variables
+      # to the TRAINABLE_VARIABLES collection, so we manually remove
+      # them and replace with the MirroredVariable. We can't set
+      # "trainable" to False for next_creator() since that causes functions
+      # like implicit_gradients to skip those variables.
+      if kwargs.get("trainable", True):
+        collections.append(ops.GraphKeys.TRAINABLE_VARIABLES)
+        l = g.get_collection_ref(ops.GraphKeys.TRAINABLE_VARIABLES)
+        for v in index.values():
+          l.remove(v)
+      g.add_to_collections(collections, result)
+    return result
+
+  def distribute_dataset(self, dataset):
+    per_device_dataset = values.PerDeviceDataset(
+        dataset, self._devices, self._prefetch_on_device)
+    return per_device_dataset.make_one_shot_iterator()
+
+  def _broadcast(self, tensor, destinations):
+    # TODO(josh11b): In eager mode, use one thread per device, or async mode.
+    return self._get_cross_tower_ops().broadcast(tensor, destinations or
+                                                 self._devices)
+
+  def _call_for_each_tower(self, fn, *args, **kwargs):
+    """Run `fn` in separate threads, once per tower/worker device.
+
+    Args:
+      fn: function to run (will be run once per device, each in its own thread).
+      *args: positional arguments for `fn`
+      **kwargs: keyword arguments for `fn`.
+          `"run_concurrently"`: Boolean indicating whether executions of `fn`
+             can be run concurrently (under eager execution only), defaults to
+             `True`.
+
+    Returns:
+      Merged return value of `fn` across all towers.
+
+    Raises:
+      RuntimeError: If fn() calls get_tower_context().merge_call() a different
+          number of times for when called for different devices.
+    """
+    run_concurrently = kwargs.pop("run_concurrently", True)
+    if not context.executing_eagerly():
+      # Lots of TF library code isn't thread-safe in graph mode, and
+      # there is little to be gained by turning on multithreading when
+      # constructing a graph.
+      run_concurrently = False
+      # Needed for per-thread device, etc. contexts in graph mode.
+      ops.get_default_graph().switch_to_thread_local()
+    elif run_concurrently is None:
+      run_concurrently = True
+
+    coord = coordinator.Coordinator(
+        clean_stop_exception_types=(_RequestedStop,))
+
+    shared_variable_store = {}
+
+    # TODO(isaprykin): Create these threads once instead of during every run()
+    # call.
+    threads = []
+    for index, d in enumerate(self._devices):
+      variable_creator_fn = shared_variable_creator.make_fn(
+          shared_variable_store, index)
+      t = MirroredStrategy._MirroredTowerThread(
+          self, coord, d, variable_creator_fn, fn,
+          *values.select_device(d, args), **values.select_device(d, kwargs))
+      threads.append(t)
+
+    for t in threads:
+      t.start()
+
+    # When `fn` starts `should_run` event is set on _MirroredTowerThread
+    # (`MTT`) threads. The execution waits until
+    # `MTT.has_paused` is set, which indicates that either `fn` is
+    # complete or a `get_tower_context().merge_call()` is called.  If `fn` is
+    # complete, then `MTT.done` is set to True.  Otherwise, arguments
+    # of `get_tower_context().merge_call` from all paused threads are grouped
+    # and the `merge_fn` is performed.  Results of the
+    # `get_tower_context().merge_call` are then set to `MTT.merge_result`.
+    # Each such `get_tower_context().merge_call` call returns the
+    # `MTT.merge_result` for that thread when `MTT.should_run` event
+    # is reset again. Execution of `fn` resumes.
+
+    try:
+      with coord.stop_on_exception():
+        all_done = False
+        while not all_done and not coord.should_stop():
+          done = []
+          if run_concurrently:
+            for t in threads:
+              t.should_run.set()
+            for t in threads:
+              t.has_paused.wait()
+              t.has_paused.clear()
+              if coord.should_stop():
+                return None
+              done.append(t.done)
+          else:
+            for t in threads:
+              t.should_run.set()
+              t.has_paused.wait()
+              t.has_paused.clear()
+              if coord.should_stop():
+                return None
+              done.append(t.done)
+          if coord.should_stop():
+            return None
+          all_done = all(done)
+          if not all_done:
+            if any(done):
+              raise RuntimeError("Some towers made a different number of "
+                                 "tower_context().merge_call() calls.")
+            # get_tower_context().merge_call() case
+            merge_args = values.regroup(
+                {t.device: t.merge_args for t in threads})
+            merge_kwargs = values.regroup(
+                {t.device: t.merge_kwargs for t in threads})
+            merge_result = threads[0].merge_fn(
+                self, *merge_args, **merge_kwargs)
+            for t in threads:
+              t.merge_result = values.select_device(t.device, merge_result)
+    finally:
+      for t in threads:
+        t.should_run.set()
+      coord.join(threads)
+
+    return values.regroup({t.device: t.main_result for t in threads})
+
+  def map(self, map_over, fn, *args, **kwargs):
+    # TODO(josh11b): In eager mode, use one thread per device.
+    index = {}
+    i = 0
+    for m in map_over:
+      d = self._devices[i % len(self._devices)]
+      with ops.device(d):
+        l = index.get(d, [])
+        l.append(fn(m,
+                    *values.select_device_mirrored(d, args),
+                    **values.select_device_mirrored(d, kwargs)))
+        index[d] = l
+    # TODO(josh11b): Need a values.regroup equivalent that handles MapOutput
+    # in addition to PerDevice data.
+    return values.PerDevice({k: values.MapOutput(v) for k, v in index.items()})
+
+  def configure(self, session_config=None):
+    if self._cross_tower_ops is None:
+      self._cross_tower_ops = cross_tower_ops_lib.choose_the_best(
+          self._devices, session_config=session_config)
+
+  def _get_cross_tower_ops(self):
+    if self._cross_tower_ops is None:
+      self._cross_tower_ops = (
+          cross_tower_ops_lib.ReductionToOneDeviceCrossTowerOps())
+    return self._cross_tower_ops
+
+  def _reduce(self, method_string, value, destinations):
+    if len(self._devices) == 1 and not isinstance(value, values.PerDevice):
+      value = values.PerDevice({self._devices[0]: value})
+    assert isinstance(value, values.PerDevice)
+
+    return self._get_cross_tower_ops().reduce(
+        method_string, value, destinations=destinations)
+
+  def _batch_reduce(self, method_string, value_destination_pairs):
+    return self._get_cross_tower_ops().batch_reduce(method_string,
+                                                    value_destination_pairs)
+
+  def _update(self, var, fn, *args, **kwargs):
+    # TODO(josh11b): Also support TowerLocalVariables here? If so, args and
+    # kwargs don't need to be mirrored.
+    assert isinstance(var, values.MirroredVariable)
+    # TODO(josh11b): In eager mode, use one thread per device.
+    updates = {}
+    for d, v in var._index.items():  # pylint: disable=protected-access
+      name = "update_%d" % self._device_index.get(d)
+      with ops.device(d), distribute_lib.UpdateContext(d), ops.name_scope(name):
+        updates[d] = fn(v,
+                        *values.select_device_mirrored(d, args),
+                        **values.select_device_mirrored(d, kwargs))
+    return values.regroup(updates, values.Mirrored)
+
+  def _update_non_slot(self, colocate_with, fn, *args, **kwargs):
+    assert isinstance(colocate_with, list)
+    # TODO(josh11b): In eager mode, use one thread per device.
+    updates = {}
+    for d in colocate_with:
+      name = "update_%d" % self._device_index.get(d)
+      with ops.device(d), distribute_lib.UpdateContext(d), ops.name_scope(name):
+        updates[d] = fn(*values.select_device_mirrored(d, args),
+                        **values.select_device_mirrored(d, kwargs))
+    return values.regroup(updates, values.Mirrored)
+
+  def _fetch(self, val, destination, fn):
+    """Return a copy of `val` or `fn(val)` on `destination`."""
+    assert isinstance(destination, six.string_types)
+    if isinstance(val, values.TowerLocalVariable):
+      val = self.reduce(val.reduce_method, val, destinations=destination)
+      with ops.device(destination):
+        return fn(self.unwrap(val)[0])
+
+    assert isinstance(val, values.Mirrored), (
+        "val = %s (type %s)" % (val, val.__class__.__name__))
+    if val.on_device(destination):
+      with ops.device(destination):
+        # Use an identity here to make sure we are returning a tensor
+        # instead of e.g. a variable object.
+        return array_ops.identity(fn(val.get(destination)))
+    device = None
+    for d in self._devices:
+      if val.on_device(d):
+        device = d
+        break
+    assert device is not None, (
+        "Could not find destination %s in list of devices %s." %
+        (destination, val.devices))
+    with ops.device(device):
+      v = fn(val.get(device))
+    with ops.device(destination):
+      return array_ops.identity(v)
+
+  def _unwrap(self, val):
+    if isinstance(val, values.DistributedValues):
+      # Return in a deterministic order.
+      if set(val.devices) == self._canonical_device_set:
+        return [val.get(device=d) for d in self._devices]
+      return [val.get(device=d) for d in sorted(val.devices)]
+    return [val]
+
+  @property
+  def is_single_tower(self):
+    return len(self._devices) == 1
+
+  @property
+  def num_towers(self):
+    return len(self._devices)
+
+  def _worker_device_index(self):
+    return self._device_index
+
+  @property
+  def worker_devices(self):
+    # Make a copy to prevent users from accidentally mutating our copy.
+    return list(self._devices)
+
+  @property
+  def parameter_devices(self):
+    return list(self._devices)
+
+  def non_slot_devices(self, var_list):
+    del var_list
+    return list(self._devices)
+
+  def _get_devices_from(self, colocate_with=None):
+    if colocate_with is None:
+      return self._devices
+    elif isinstance(colocate_with, values.DistributedValues):
+      # pylint: disable=protected-access
+      return list(colocate_with._index.keys())
+    elif isinstance(colocate_with, six.string_types):
+      return [colocate_with]
+    else:
+      return colocate_with
+
+  class _MirroredTowerThread(threading.Thread):
+    """A thread that runs() a function on a device."""
+
+    def __init__(self, dist, coord, device, variable_creator_fn, fn, *args,
+                 **kwargs):
+      super(MirroredStrategy._MirroredTowerThread, self).__init__()  # pylint: disable=protected-access
+      self.coord = coord
+      self.distribution = dist
+      self.device = device
+      self.tower_id = dist.worker_devices.index(device)
+      self.variable_creator_fn = variable_creator_fn
+      # State needed to run and return the results of `fn`.
+      self.main_fn = fn
+      self.main_args = args
+      self.main_kwargs = kwargs
+      self.main_result = None
+      self.done = False
+      # State needed to run the next merge_call() (if any) requested via
+      # TowerContext.
+      self.merge_fn = None
+      self.merge_args = None
+      self.merge_kwargs = None
+      self.merge_result = None
+      # We use a thread.Event for the main thread to signal when this
+      # thread should start running (`should_run`), and another for
+      # this thread to transfer control back to the main thread
+      # (`has_paused`, either when it gets to a
+      # `get_tower_context().merge_call` or when `fn` returns). In
+      # either case the event starts cleared, is signaled by calling
+      # set(). The receiving thread waits for the signal by calling
+      # wait() and then immediately clearing the event using clear().
+      self.should_run = threading.Event()
+      self.has_paused = threading.Event()
+      # These fields have to do with inheriting various contexts from the
+      # parent thread:
+      # pylint: disable=protected-access
+      self.context_mode = context.context()._eager_context.mode
+      if not context.context()._context_handle:
+        context.context()._initialize_handle_and_devices()
+      self.context_device_policy = (
+          pywrap_tensorflow.TFE_ContextGetDevicePlacementPolicy(
+              context.context()._context_handle))
+      self.graph = ops.get_default_graph()
+      self._variable_creator_stack = self.graph._variable_creator_stack[:]
+      self._captured_var_scope = variable_scope.get_variable_scope()
+      # Adding a "/" at end lets us re-enter this scope later.
+      self._captured_name_scope = self.graph.get_name_scope()
+      if self._captured_name_scope:
+        self._captured_name_scope += "/"
+      if self.tower_id > 0:
+        if not self._captured_name_scope:
+          self._captured_name_scope = ""
+        self._captured_name_scope += "tower_%d/" % self.tower_id
+
+    def run(self):
+      # pylint: disable=protected-access
+      self.graph._variable_creator_stack = self._variable_creator_stack
+      self.should_run.wait()
+      self.should_run.clear()
+      try:
+        if self.coord.should_stop():
+          return
+        with self.coord.stop_on_exception(), \
+            context.context()._mode(self.context_mode), \
+            context.context().device_policy(self.context_device_policy), \
+            self.graph.as_default(), \
+            MirroredTowerContext(self.distribution, self.tower_id), \
+            ops.device(self.device), \
+            ops.name_scope(self._captured_name_scope), \
+            variable_scope.variable_scope(
+                self._captured_var_scope, reuse=self.tower_id > 0), \
+            variable_scope.variable_creator_scope(self.variable_creator_fn):
+          self.main_result = self.main_fn(*self.main_args, **self.main_kwargs)
+          self.done = True
+      finally:
+        self.has_paused.set()
+
+
+class MirroredTowerContext(distribute_lib.TowerContext):
+  """TowerContext used in MirroredStrategy.call_for_each_tower().
+
+  Opened in `_MirroredTowerThread`, to allow the user to invoke
+  `MirroredStrategy`'s specific implementation of `merge_call()`,
+  which works by delegating the function and its arguments to
+  the main thread (the one that invoked
+  `MirroredStrategy.call_for_each_tower()`).
+  """
+
+  def _merge_call(self, fn, *args, **kwargs):
+    """Delegate to the main thread to actually perform merge_call()."""
+    t = threading.current_thread()  # a _MirroredTowerThread
+    t.merge_fn = fn
+    t.merge_args = args
+    t.merge_kwargs = kwargs
+    t.has_paused.set()
+    t.should_run.wait()
+    t.should_run.clear()
+    if t.coord.should_stop():
+      raise _RequestedStop()
+    return t.merge_result
+
+  @property
+  def device(self):
+    distribute_lib.require_tower_context(self)
+    return self._distribution_strategy.worker_devices[self._tower_id]
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e9f06da8e2ed185c2c32f79a5a4f5407165fb1d
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
@@ -0,0 +1,435 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Multi-GPU tests for MirroredStrategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import sys
+
+from tensorflow.contrib.distribute.python import mirrored_strategy
+from tensorflow.contrib.distribute.python import strategy_test_lib
+from tensorflow.contrib.distribute.python import values
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
+from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.layers import core
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.training import distribute as distribute_lib
+
+GPU_TEST = "test_gpu" in sys.argv[0]
+
+
+class MirroredTwoDeviceDistributionTest(strategy_test_lib.DistributionTestBase):
+
+  def _get_distribution_strategy(self):
+    devices = ["/device:CPU:0", "/device:GPU:0"]
+    if GPU_TEST:
+      self.assertGreater(context.num_gpus(), 0)
+      if context.num_gpus() > 1:
+        devices = ["/device:GPU:0", "/device:GPU:1"]
+    print(self.id().split(".")[-1], "devices:", ", ".join(devices))
+    return mirrored_strategy.MirroredStrategy(devices)
+
+  def testMinimizeLossEager(self):
+    if not GPU_TEST:
+      self.skipTest("Not GPU test")
+    self._test_minimize_loss_eager(self._get_distribution_strategy())
+
+  def testMinimizeLossGraph(self):
+    soft_placement = not GPU_TEST
+    print("testMinimizeLossGraph soft_placement:", soft_placement)
+    self._test_minimize_loss_graph(
+        self._get_distribution_strategy(), soft_placement=soft_placement)
+
+  def testMapReduce(self):
+    if not GPU_TEST:
+      self.skipTest("Not GPU test")
+    self._test_map_reduce(self._get_distribution_strategy())
+
+  def testDeviceIndex(self):
+    if not GPU_TEST:
+      self.skipTest("Not GPU test")
+    self._test_device_index(self._get_distribution_strategy())
+
+  def testTowerId(self):
+    if not GPU_TEST:
+      self.skipTest("Not GPU test")
+    self._test_tower_id(self._get_distribution_strategy())
+
+  def testNumTowers(self):
+    if not GPU_TEST:
+      self.skipTest("Not GPU test")
+    self.assertEqual(2, self._get_distribution_strategy().num_towers)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testCallAndMergeExceptions(self):
+    if not GPU_TEST:
+      self.skipTest("Not GPU test")
+    self._test_call_and_merge_exceptions(self._get_distribution_strategy())
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testRunRegroupError(self):
+
+    def run_fn(device_id):
+      # Generates a list with different lengths on different devices.
+      # Will fail in _regroup() (if more than one device).
+      return list(range(device_id))
+
+    dist = self._get_distribution_strategy()
+    with dist.scope(), self.assertRaises(AssertionError):
+      dist.call_for_each_tower(run_fn, dist.worker_device_index)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testReduceToCpu(self):
+    if not GPU_TEST:
+      self.skipTest("Not GPU test")
+
+    def run_fn(device_id):
+      return device_id
+
+    dist = self._get_distribution_strategy()
+    with dist.scope():
+      result = dist.call_for_each_tower(run_fn, dist.worker_device_index)
+      reduced = dist.reduce("sum", result, destinations="/device:CPU:0")
+      unwrapped = dist.unwrap(reduced)
+      self.assertEqual(1, len(unwrapped))
+      expected = sum(range(len(dist.worker_devices)))
+      self.assertEqual(expected, self.evaluate(unwrapped[0]))
+
+
+@test_util.with_c_api
+class MirroredStrategyVariableCreationTest(test.TestCase):
+
+  config = config_pb2.ConfigProto()
+  config.allow_soft_placement = True
+
+  def _skip_eager_if_gpus_less_than(self, num_gpus):
+    if context.num_gpus() < num_gpus and context.executing_eagerly():
+      self.skipTest("Enough GPUs not available for this test in eager mode.")
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testSingleVariable(self):
+    self._skip_eager_if_gpus_less_than(1)
+
+    def model_fn():
+      # This variable should be created only once across the threads because of
+      # special variable_creator functions used by `dist.call_for_each_tower`.
+      v = variable_scope.variable(1.0, name="foo")
+      distribute_lib.get_tower_context().merge_call(lambda _: _)
+      return v
+
+    dist = mirrored_strategy.MirroredStrategy(
+        ["/device:GPU:0", "/device:CPU:0"])
+
+    with dist.scope():
+      result = dist.call_for_each_tower(model_fn, run_concurrently=False)
+      self.assertIsInstance(result, values.MirroredVariable)
+      self.assertEquals("foo:0", result.name)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testUnnamedVariable(self):
+    self._skip_eager_if_gpus_less_than(1)
+
+    def model_fn():
+      v = variable_scope.variable(1.0)
+      distribute_lib.get_tower_context().merge_call(lambda _: _)
+      return v
+
+    dist = mirrored_strategy.MirroredStrategy(
+        ["/device:GPU:0", "/device:CPU:0"])
+
+    with dist.scope():
+      result = dist.call_for_each_tower(model_fn, run_concurrently=False)
+      self.assertIsInstance(result, values.MirroredVariable)
+      # Default name of "Variable" will be used.
+      self.assertEquals("Variable:0", result.name)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testMultipleVariables(self):
+    self._skip_eager_if_gpus_less_than(1)
+
+    def model_fn():
+      vs = []
+      for i in range(5):
+        vs.append(variable_scope.variable(1.0, name="foo" + str(i)))
+      distribute_lib.get_tower_context().merge_call(lambda _: _)
+      return vs
+
+    dist = mirrored_strategy.MirroredStrategy(
+        ["/device:GPU:0", "/device:CPU:0"])
+
+    with dist.scope():
+      result = dist.call_for_each_tower(model_fn, run_concurrently=False)
+      for i, v in enumerate(result):
+        self.assertIsInstance(v, values.MirroredVariable)
+        self.assertEquals("foo" + str(i) + ":0", v.name)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testMultipleVariablesWithSameCanonicalName(self):
+    self._skip_eager_if_gpus_less_than(1)
+
+    def model_fn():
+      vs = []
+      vs.append(variable_scope.variable(1.0, name="foo/bar"))
+      vs.append(variable_scope.variable(1.0, name="foo_1/bar"))
+      vs.append(variable_scope.variable(1.0, name="foo_1/bar_1"))
+      vs.append(variable_scope.variable(1.0, name="foo/bar_1"))
+      distribute_lib.get_tower_context().merge_call(lambda _: _)
+      return vs
+
+    dist = mirrored_strategy.MirroredStrategy(
+        ["/device:GPU:0", "/device:CPU:0"])
+
+    with dist.scope():
+      result = dist.call_for_each_tower(model_fn, run_concurrently=False)
+      for v in result:
+        self.assertIsInstance(v, values.MirroredVariable)
+      self.assertEquals(4, len(result))
+      self.assertEquals("foo/bar:0", result[0].name)
+      self.assertEquals("foo_1/bar:0", result[1].name)
+      self.assertEquals("foo_1/bar_1:0", result[2].name)
+      self.assertEquals("foo/bar_1:0", result[3].name)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testVariableWithSameCanonicalNameAcrossThreads(self):
+    self._skip_eager_if_gpus_less_than(1)
+
+    def model_fn(device_id):
+      v = variable_scope.variable(1.0, name="foo_" + str(device_id))
+      distribute_lib.get_tower_context().merge_call(lambda _: _)
+      return v
+
+    dist = mirrored_strategy.MirroredStrategy(
+        ["/device:GPU:0", "/device:CPU:0"])
+
+    with dist.scope():
+      result = dist.call_for_each_tower(
+          model_fn, dist.worker_device_index, run_concurrently=False)
+      self.assertIsInstance(result, values.MirroredVariable)
+      # The resulting mirrored variable will use the name from the first device.
+      self.assertEquals("foo_0:0", result.name)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testWithLayers(self):
+    self._skip_eager_if_gpus_less_than(1)
+    def model_fn(features):
+      with variable_scope.variable_scope("common"):
+        layer1 = core.Dense(1)
+        layer1(features)
+        layer2 = core.Dense(1)
+        layer2(features)
+        # This will pause the current thread, and execute the other thread.
+        distribute_lib.get_tower_context().merge_call(lambda _: _)
+        layer3 = core.Dense(1)
+        layer3(features)
+        return [(layer1.kernel, layer1.bias),
+                (layer2.kernel, layer2.bias),
+                (layer3.kernel, layer3.bias)]
+
+    dist = mirrored_strategy.MirroredStrategy(
+        ["/device:GPU:0", "/device:CPU:0"])
+    features = dataset_ops.Dataset.from_tensors([[1.]]).repeat(10)
+    features = dist.distribute_dataset(features).get_next()
+
+    with dist.scope():
+      result = dist.call_for_each_tower(
+          model_fn, features, run_concurrently=False)
+      suffixes = ["", "_1", "_2"]
+      for (kernel, bias), suffix in zip(result, suffixes):
+        self.assertIsInstance(kernel, values.MirroredVariable)
+        self.assertEquals("common/dense" + suffix + "/kernel:0", kernel.name)
+        self.assertIsInstance(bias, values.MirroredVariable)
+        self.assertEquals("common/dense" + suffix + "/bias:0", bias.name)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testWithGetVariableAndVariableScope(self):
+    self._skip_eager_if_gpus_less_than(1)
+
+    def model_fn():
+      v0 = variable_scope.get_variable("var-thread0", [1])
+      with variable_scope.variable_scope("common"):
+        v1 = variable_scope.get_variable("var-thread1", [1])
+        # This will pause the current thread, and execute the other thread.
+        distribute_lib.get_tower_context().merge_call(lambda _: _)
+        v2 = variable_scope.get_variable("var-thread2", [1])
+
+      return v0, v1, v2
+
+    devices = ["/device:CPU:0", "/device:GPU:0"]
+    dist = mirrored_strategy.MirroredStrategy(devices)
+    with dist.scope():
+      with variable_scope.variable_scope("main"):
+        v = variable_scope.get_variable("var-main0", [1])
+        self.assertEquals("main/var-main0:0", v.name)
+
+        result = dist.call_for_each_tower(model_fn, run_concurrently=False)
+        self.assertEquals(3, len(result))
+        v0, v1, v2 = result
+        self.assertIsInstance(v0, values.MirroredVariable)
+        self.assertEquals("main/var-thread0:0", v0.name)
+        self.assertIsInstance(v1, values.MirroredVariable)
+        self.assertEquals("main/common/var-thread1:0", v1.name)
+        self.assertIsInstance(v2, values.MirroredVariable)
+        self.assertEquals("main/common/var-thread2:0", v2.name)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testThreeDevices(self):
+    self._skip_eager_if_gpus_less_than(2)
+
+    def model_fn():
+      v = variable_scope.variable(1.0, name="foo")
+      distribute_lib.get_tower_context().merge_call(lambda _: _)
+      return v
+
+    dist = mirrored_strategy.MirroredStrategy(
+        ["/device:GPU:0", "/device:GPU:1", "/device:CPU:0"])
+
+    with dist.scope():
+      result = dist.call_for_each_tower(model_fn, run_concurrently=False)
+      self.assertIsInstance(result, values.MirroredVariable)
+      self.assertEquals("foo:0", result.name)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testNonMatchingVariableCreation(self):
+    self._skip_eager_if_gpus_less_than(1)
+
+    def model_fn(name):
+      v = variable_scope.variable(1.0, name=name)
+      distribute_lib.get_tower_context().merge_call(lambda _: _)
+      return v
+
+    dist = mirrored_strategy.MirroredStrategy(
+        ["/device:GPU:0", "/device:CPU:0"])
+
+    with dist.scope():
+      names = values.DistributedValues({
+          "/device:CPU:0": "foo",
+          "/device:GPU:0": "bar"
+      })
+      with self.assertRaises(RuntimeError):
+        _ = dist.call_for_each_tower(model_fn, names, run_concurrently=False)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testTowerLocalVariable(self):
+    self._skip_eager_if_gpus_less_than(1)
+
+    all_v_sum = {}
+    all_v_mean = {}
+
+    def model_fn(device_id):
+      tower_context = distribute_lib.get_tower_context()
+      with tower_context.tower_local_var_scope("sum"):
+        v_sum = variable_scope.variable(1.0)
+      with tower_context.tower_local_var_scope("mean"):
+        v_mean = variable_scope.variable(4.0)
+      self.assertTrue(isinstance(v_sum, values.TowerLocalVariable))
+      self.assertTrue(isinstance(v_mean, values.TowerLocalVariable))
+      updates = [v_sum.assign_add(2.0 + device_id),
+                 v_mean.assign(6.0 * device_id)]
+      all_v_sum[device_id] = v_sum
+      all_v_mean[device_id] = v_mean
+      return updates, v_sum, v_mean
+
+    dist = mirrored_strategy.MirroredStrategy(
+        ["/device:GPU:0", "/device:CPU:0"])
+
+    with dist.scope():
+      # Create "sum" and "mean" versions of TowerLocalVariables.
+      ret_ops, ret_v_sum, ret_v_mean = dist.call_for_each_tower(
+          model_fn, dist.worker_device_index, run_concurrently=False)
+      # Should see the same wrapping instance in all towers.
+      self.assertIs(all_v_sum[0], ret_v_sum)
+      self.assertIs(all_v_mean[0], ret_v_mean)
+      for i in range(1, dist.num_towers):
+        self.assertIs(all_v_sum[0], all_v_sum[1])
+        self.assertIs(all_v_mean[0], all_v_mean[1])
+
+      # Apply updates
+      self.evaluate(variables.global_variables_initializer())
+      self.evaluate([y for x in ret_ops for y in dist.unwrap(x)])
+      expected_sum = 0.0
+      expected_mean = 0.0
+      for i, d in enumerate(dist.worker_devices):
+        # Test access within a device scope, should see different values.
+        with ops.device(d):
+          v_sum_value = self.evaluate(ret_v_sum.read_value())
+          v_mean_value = self.evaluate(ret_v_mean.read_value())
+          expected = i + 3.0
+          self.assertEqual(expected, v_sum_value)
+          expected_sum += expected
+          expected = i * 6.0
+          self.assertEqual(expected, v_mean_value)
+          expected_mean += expected
+
+      # fetch() should return the value you get by applying the
+      # reduction across all towers.
+      self.assertEqual(expected_sum, self.evaluate(dist.fetch(ret_v_sum)))
+      expected_mean /= len(dist.worker_devices)
+      self.assertEqual(expected_mean, self.evaluate(dist.fetch(ret_v_mean)))
+
+  # NOTE(priyag): Names and name scopes are ignored in eager, hence we are not
+  # testing this in eager mode.
+
+  def testNameScope(self):
+    def model_fn():
+      with ops.name_scope("foo"):
+        a = constant_op.constant(1.0, name="a")
+        distribute_lib.get_tower_context().merge_call(lambda _: _)
+        b = constant_op.constant(1.0, name="b")
+      return a, b
+
+    dist = mirrored_strategy.MirroredStrategy(
+        ["/device:GPU:0", "/device:CPU:0"])
+
+    with context.graph_mode(), dist.scope():
+      with ops.name_scope("main"):
+        result = dist.call_for_each_tower(model_fn, run_concurrently=False)
+        self.assertEquals(2, len(result))
+        for v, name in zip(result, ["a", "b"]):
+          self.assertIsInstance(v, values.DistributedValues)
+          v0, v1 = dist.unwrap(v)
+          self.assertEquals("main/foo/" + name + ":0", v0.name)
+          self.assertEquals("main/tower_1/foo/" + name + ":0", v1.name)
+
+  def testWithDefaultName(self):
+    def model_fn():
+      with ops.name_scope(None, "foo"):
+        a = constant_op.constant(1.0, name="a")
+        distribute_lib.get_tower_context().merge_call(lambda _: _)
+        b = constant_op.constant(2.0, name="b")
+      return a, b
+
+    dist = mirrored_strategy.MirroredStrategy(
+        ["/device:GPU:0", "/device:CPU:0"])
+
+    with context.graph_mode(), dist.scope():
+      result = dist.call_for_each_tower(model_fn, run_concurrently=False)
+      self.assertEquals(2, len(result))
+      for v, name in zip(result, ["a", "b"]):
+        self.assertIsInstance(v, values.DistributedValues)
+        v0, v1 = dist.unwrap(v)
+        self.assertEquals("foo/" + name + ":0", v0.name)
+        self.assertEquals("tower_1/foo/" + name + ":0", v1.name)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1ef0ecc77a8e8432dfa4eb6da7c324b371dab70
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy_test.py
@@ -0,0 +1,91 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for class MirroredStrategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.distribute.python import mirrored_strategy
+from tensorflow.contrib.distribute.python import strategy_test_lib
+from tensorflow.python.eager import context
+from tensorflow.python.eager import test
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.training import distribute as distribute_lib
+
+
+@test_util.with_c_api
+class MirroredOneCPUDistributionTest(strategy_test_lib.DistributionTestBase):
+
+  def _get_distribution_strategy(self):
+    return mirrored_strategy.MirroredStrategy(["/device:CPU:0"])
+
+  def testMinimizeLossEager(self):
+    self._test_minimize_loss_eager(self._get_distribution_strategy())
+
+  def testMinimizeLossGraph(self):
+    self._test_minimize_loss_graph(self._get_distribution_strategy())
+
+  def testMapReduce(self):
+    self._test_map_reduce(self._get_distribution_strategy())
+
+  def testDeviceIndex(self):
+    self._test_device_index(self._get_distribution_strategy())
+
+  def testTowerId(self):
+    self._test_tower_id(self._get_distribution_strategy())
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testCallAndMergeExceptions(self):
+    self._test_call_and_merge_exceptions(self._get_distribution_strategy())
+
+
+@test_util.with_c_api
+class VariableCreatorStackTest(test.TestCase):
+
+  def testCreatorStacksAreThreadLocal(self):
+    devices = ["/device:CPU:0", "/device:GPU:0"]
+    dist = mirrored_strategy.MirroredStrategy(devices)
+
+    def model_fn(device_id):
+      assert isinstance(device_id, int)
+      def thread_creator_fn(next_creator, *args, **kwargs):
+        return next_creator(*args, **kwargs) + ":thread_" + str(device_id)
+
+      with variable_scope.variable_creator_scope(thread_creator_fn):
+        # Create a variable in this scope.
+        v = variable_scope.variable(1.0)
+
+        # This will pause the current thread, and execute the other thread.
+        distribute_lib.get_tower_context().merge_call(lambda _: _)
+      return v
+
+    def main_thread_creator(next_creator, *args, **kwargs):
+      # We are not using the underlying next_creator for test purposes.
+      del next_creator, args, kwargs
+      return "main_thread"
+
+    with context.graph_mode(), \
+        dist.scope(), \
+        variable_scope.variable_creator_scope(main_thread_creator):
+      result = dist.call_for_each_tower(model_fn, dist.worker_device_index)
+      result = dist.unwrap(result)
+      expected = ["main_thread:thread_0", "main_thread:thread_1"]
+      self.assertEquals(expected, result)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/monitor.py b/tensorflow/contrib/distribute/python/monitor.py
new file mode 100644
index 0000000000000000000000000000000000000000..7644acedc99361d7287a91832d76bc68cbc6ac0a
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/monitor.py
@@ -0,0 +1,64 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Monitor is responsible for training, checkpointing and recovery."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import errors
+from tensorflow.python.ops import variables
+
+
+class Monitor(object):
+  """Executes training steps, recovers and checkpoints.
+
+  Note that this class is particularly preliminary, experimental, and
+  expected to change.
+  """
+  # TODO(isaprykin): Support step functions that need multiple session calls.
+  # TODO(isaprykin): Support extra arguments to the step function.
+  # TODO(isaprykin): Support recovery, checkpointing and summaries.
+
+  def __init__(self, step_callable, session=None):
+    """Initialize the Monitor with components for executing training steps.
+
+    Args:
+      step_callable: a training `Step` that's capable of signaling when done.
+      session: a `Session` instance that's needed for graph mode.
+
+    Raises:
+      ValueError: if `session` was provided for eager mode or not provided for
+        graph mode.
+    """
+    if context.executing_eagerly():
+      if session is not None:
+        raise ValueError("Should not provide a `session` in Eager mode.")
+      self._run_step = step_callable
+    else:
+      if session is None:
+        raise ValueError("Should provide a `session` in Graph mode.")
+      self._run_step = session.make_callable(step_callable())
+      session.run(variables.global_variables_initializer())
+
+  def run_steps(self, num_steps=None):
+    step = 0
+    while num_steps is None or step < num_steps:
+      try:
+        self._run_step()
+        step += 1
+      except errors.OutOfRangeError:
+        break
diff --git a/tensorflow/contrib/distribute/python/monitor_test.py b/tensorflow/contrib/distribute/python/monitor_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..8277e1e7919e86ef616b31d0986589dcc9c49bbd
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/monitor_test.py
@@ -0,0 +1,84 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for class Monitor."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python import monitor as monitor_lib
+from tensorflow.contrib.distribute.python import one_device_strategy
+from tensorflow.contrib.distribute.python.single_loss_example import single_loss_example
+from tensorflow.python.eager import context
+from tensorflow.python.eager import test
+from tensorflow.python.framework import ops
+from tensorflow.python.training import gradient_descent
+
+
+class MonitorTest(test.TestCase, parameterized.TestCase):
+
+  @combinations.generate(
+      combinations.times(
+          combinations.distributions_and_v1_optimizers(),
+          combinations.combine(mode=combinations.graph_and_eager_modes)))
+  def testTrainNetwork(self, distribution, optimizer_fn):
+    with distribution.scope():
+      single_loss_step, layer = single_loss_example(optimizer_fn, distribution)
+
+      if context.executing_eagerly():
+        monitor = monitor_lib.Monitor(single_loss_step, None)
+      else:
+        with self.test_session() as sess:
+          monitor = monitor_lib.Monitor(single_loss_step, sess)
+
+      monitor.run_steps(1)
+
+      self.assertEqual(1, len(layer.trainable_variables))
+      mirrored_weight_variable = layer.trainable_variables[0]
+      start_error = self.evaluate(distribution.fetch(mirrored_weight_variable))
+      start_error = abs(numpy.array(start_error) - 1)
+
+      monitor.run_steps(9)
+      end_error = self.evaluate(distribution.fetch(mirrored_weight_variable))
+      end_error = abs(numpy.array(end_error) - 1)
+      self.assertGreaterEqual(start_error, end_error)
+
+  def testPassingASessionInEager(self):
+    distribution = one_device_strategy.OneDeviceStrategy(
+        "/device:CPU:0")
+    step_function, _ = single_loss_example(
+        lambda: gradient_descent.GradientDescentOptimizer(0.2), distribution)
+
+    with self.test_session() as sess:
+      with self.assertRaisesRegexp(ValueError, "Should not provide"):
+        _ = monitor_lib.Monitor(step_function, sess)
+
+  def testNotPassingASessionInGraph(self):
+    distribution = one_device_strategy.OneDeviceStrategy(
+        "/device:CPU:0")
+    step_function, _ = single_loss_example(
+        lambda: gradient_descent.GradientDescentOptimizer(0.2), distribution)
+
+    with context.graph_mode(), ops.Graph().as_default():
+      with self.assertRaisesRegexp(ValueError, "Should provide"):
+        _ = monitor_lib.Monitor(step_function, session=None)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/one_device_strategy.py b/tensorflow/contrib/distribute/python/one_device_strategy.py
new file mode 100644
index 0000000000000000000000000000000000000000..39c49442b9c3245cfd0b67a51be68773a6fd3ff4
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/one_device_strategy.py
@@ -0,0 +1,148 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Class OneDeviceStrategy implementing DistributionStrategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+
+from tensorflow.contrib.distribute.python import values
+from tensorflow.contrib.eager.python import datasets
+from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.training import distribute as distribute_lib
+
+
+# TODO(josh11b): Replace asserts in this file with if ...: raise ...
+
+
+class OneDeviceStrategy(distribute_lib.DistributionStrategy):
+  """A distribution strategy for running on a single device."""
+  # TODO(josh11b): Do we wrap values in types to generate errors if you are
+  # doing something that won't work with other DistributionStrategy
+  # implementations?
+
+  def __init__(self, device):
+    super(OneDeviceStrategy, self).__init__()
+    self._device = device
+
+  def _create_variable(self, next_creator, *args, **kwargs):
+    # No need to distinguish tower-local variables when not mirroring,
+    # we just enforce that they are not trainable.
+    if kwargs.pop("tower_local_reduce_method", None) is not None:
+      kwargs["trainable"] = False
+
+    colocate_with = kwargs.pop("colocate_with", None)
+    if colocate_with is None:
+      with ops.device(self._device):
+        return next_creator(*args, **kwargs)
+    if isinstance(colocate_with, six.string_types):
+      with ops.device(colocate_with):
+        return next_creator(*args, **kwargs)
+    if (isinstance(colocate_with, list) and len(colocate_with) == 1 and
+        isinstance(colocate_with[0], six.string_types)):
+      with ops.device(colocate_with[0]):
+        return next_creator(*args, **kwargs)
+    with ops.colocate_with(colocate_with):
+      return next_creator(*args, **kwargs)
+
+  def distribute_dataset(self, dataset):
+    if context.executing_eagerly():
+      return datasets.Iterator(dataset)
+    else:
+      return dataset.make_one_shot_iterator()
+
+  def _broadcast(self, tensor, destinations):
+    return tensor
+
+  def _call_for_each_tower(self, fn, *args, **kwargs):
+    # We don't run `fn` in multiple threads in OneDeviceStrategy.
+    kwargs.pop("run_concurrently", None)
+    with ops.device(self._device), _OneDeviceTowerContext(self):
+      return fn(*args, **kwargs)
+
+  def map(self, map_over, fn, *args, **kwargs):
+    with ops.device(self._device):
+      return values.MapOutput([fn(m, *args, **kwargs) for m in map_over])
+
+  def _reduce(self, method_string, value, destinations):
+    if not isinstance(value, values.MapOutput):
+      return value
+    l = value.get()
+    assert l
+    with ops.device(self._device):
+      if method_string == "sum":
+        return math_ops.add_n(l)
+      elif method_string == "mean":
+        return math_ops.add_n(l) / len(l)
+      else:
+        assert False
+
+  def _update(self, var, fn, *args, **kwargs):
+    with ops.device(self._device), distribute_lib.UpdateContext(self._device):
+      return fn(var, *args, **kwargs)
+
+  def _update_non_slot(self, colocate_with, fn, *args, **kwargs):
+    del colocate_with
+    with ops.device(self._device), distribute_lib.UpdateContext(self._device):
+      return fn(*args, **kwargs)
+
+  def _fetch(self, val, destination, fn):
+    """Return a copy of `val` or `fn(val)` on `destination`."""
+    with ops.device(self._device):
+      v = fn(val)
+    with ops.device(destination):
+      return array_ops.identity(v)
+
+  def _unwrap(self, value):
+    return [value]
+
+  @property
+  def is_single_tower(self):
+    return True
+
+  @property
+  def num_towers(self):
+    return 1
+
+  @property
+  def worker_devices(self):
+    return [self._device]
+
+  @property
+  def parameter_devices(self):
+    return [self._device]
+
+  def non_slot_devices(self, var_list):
+    del var_list
+    return [self._device]
+
+  def _worker_device_index(self):
+    return 0
+
+
+class _OneDeviceTowerContext(distribute_lib.TowerContext):
+
+  def __init__(self, distribution_strategy):
+    distribute_lib.TowerContext.__init__(
+        self, distribution_strategy, tower_id=0)
+
+  @property
+  def device(self):
+    return self._distribution_strategy.worker_devices[0]
diff --git a/tensorflow/contrib/distribute/python/one_device_strategy_test.py b/tensorflow/contrib/distribute/python/one_device_strategy_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..7101ed0756f44b846f10ddc6d429afe005a2f196
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/one_device_strategy_test.py
@@ -0,0 +1,54 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for class OneDeviceStrategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.distribute.python import one_device_strategy
+from tensorflow.contrib.distribute.python import strategy_test_lib
+from tensorflow.python.eager import test
+from tensorflow.python.framework import test_util
+
+
+@test_util.with_c_api
+class OneDeviceStrategyTest(strategy_test_lib.DistributionTestBase):
+
+  def _get_distribution_strategy(self):
+    return one_device_strategy.OneDeviceStrategy("/device:CPU:0")
+
+  def testMinimizeLossEager(self):
+    self._test_minimize_loss_eager(self._get_distribution_strategy())
+
+  def testMinimizeLossGraph(self):
+    self._test_minimize_loss_graph(self._get_distribution_strategy())
+
+  def testMapReduce(self):
+    self._test_map_reduce(self._get_distribution_strategy())
+
+  def testDeviceIndex(self):
+    self._test_device_index(self._get_distribution_strategy())
+
+  def testTowerId(self):
+    self._test_tower_id(self._get_distribution_strategy())
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testCallAndMergeExceptions(self):
+    self._test_call_and_merge_exceptions(self._get_distribution_strategy())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/optimizer_v2_test.py b/tensorflow/contrib/distribute/python/optimizer_v2_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0912b625f44342d22acc0ce9bb52a6b632c75a0
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/optimizer_v2_test.py
@@ -0,0 +1,70 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for running legacy optimizer code with DistributionStrategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python.single_loss_example import minimize_loss_example
+from tensorflow.python.eager import context
+from tensorflow.python.eager import test
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import variables
+
+
+class MinimizeLossOptimizerV2Test(test.TestCase, parameterized.TestCase):
+
+  @combinations.generate(
+      combinations.times(
+          combinations.distributions_and_v2_optimizers(),
+          combinations.combine(mode=["graph"], use_callable_loss=[True, False])
+          + combinations.combine(mode=["eager"], use_callable_loss=[True])))
+  def testTrainNetwork(self, distribution, optimizer_fn,
+                       use_callable_loss=True):
+    with distribution.scope():
+      model_fn, dataset, layer = minimize_loss_example(
+          optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss)
+
+      iterator = distribution.distribute_dataset(dataset)
+
+      def run_step():
+        return control_flow_ops.group(distribution.unwrap(
+            distribution.call_for_each_tower(
+                model_fn, iterator.get_next(), run_concurrently=layer.built)))
+
+      if not context.executing_eagerly():
+        with self.test_session() as sess:
+          run_step = sess.make_callable(run_step())
+        self.evaluate(variables.global_variables_initializer())
+
+      weights, biases = [], []
+      for _ in range(10):
+        run_step()
+
+        weights.append(self.evaluate(distribution.fetch(layer.kernel)))
+        biases.append(self.evaluate(distribution.fetch(layer.bias)))
+
+      error = abs(numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1)
+      is_not_increasing = all(y <= x for x, y in zip(error, error[1:]))
+      self.assertTrue(is_not_increasing)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/prefetching_ops_v2.py b/tensorflow/contrib/distribute/python/prefetching_ops_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..e1ddf3cece1c3fa549d6d2999a9bff9671fcdd76
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/prefetching_ops_v2.py
@@ -0,0 +1,166 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Extension of prefetching_ops to support more than one device."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import warnings
+
+from tensorflow.contrib.data.python.ops import contrib_op_loader  # pylint: disable=unused-import
+from tensorflow.contrib.data.python.ops import gen_dataset_ops
+from tensorflow.contrib.data.python.ops import prefetching_ops
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.data.util import nest as data_nest
+from tensorflow.python.data.util import sparse
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
+from tensorflow.python.framework import ops
+from tensorflow.python.util import nest
+
+
+# pylint: disable=protected-access
+class _PrefetchToDeviceIterator(object):
+  """A replacement for @{tf.data.Iterator} that prefetches to another device."""
+
+  def __init__(self, input_dataset, devices, buffer_size):
+    self._input_dataset = input_dataset
+    self._get_next_call_count = 0
+    self._devices = devices
+    input_iterator = input_dataset.make_one_shot_iterator()
+    input_iterator_handle = input_iterator.string_handle()
+
+    @function.Defun(dtypes.string)
+    def _prefetch_fn(handle):
+      remote_iterator = iterator_ops.Iterator.from_string_handle(
+          handle, input_iterator.output_types, input_iterator.output_shapes,
+          input_iterator.output_classes)
+      return remote_iterator.get_next()
+
+    target_device = gen_dataset_ops.iterator_get_device(
+        input_iterator._iterator_resource)
+    self._buffering_resources = []
+    for device in nest.flatten(self._devices):
+      with ops.device(device):
+        buffer_resource_handle = prefetching_ops.function_buffering_resource(
+            f=_prefetch_fn,
+            target_device=target_device,
+            string_arg=input_iterator_handle,
+            buffer_size=buffer_size)
+        self._buffering_resources.append(buffer_resource_handle)
+
+  def get_next(self, name=None):
+    """See @{tf.data.Iterator.get_next}."""
+    self._get_next_call_count += 1
+    if self._get_next_call_count > iterator_ops.GET_NEXT_CALL_WARNING_THRESHOLD:
+      warnings.warn(iterator_ops.GET_NEXT_CALL_WARNING_MESSAGE)
+
+    flat_result = []
+    # TODO(priyag): This will fail if the input size (typically number of
+    # batches) is not divisible by number of devices.
+    # How do we handle that more gracefully / let the user know?
+    for buffer_resource in self._buffering_resources:
+      flat_ret = gen_dataset_ops.function_buffering_resource_get_next(
+          buffer_resource,
+          output_types=data_nest.flatten(sparse.as_dense_types(
+              self.output_types, self.output_classes)), name=name)
+
+      ret = sparse.deserialize_sparse_tensors(
+          data_nest.pack_sequence_as(self.output_types, flat_ret),
+          self.output_types, self.output_shapes, self.output_classes)
+
+      for tensor, shape in zip(
+          data_nest.flatten(ret), data_nest.flatten(self.output_shapes)):
+        if isinstance(tensor, ops.Tensor):
+          tensor.set_shape(shape)
+      flat_result.append(ret)
+
+    return nest.pack_sequence_as(self._devices, flat_result)
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
+
+  @property
+  def output_shapes(self):
+    return self._input_dataset.output_shapes
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
+# pylint: enable=protected-access
+
+
+class _PrefetchToDeviceDataset(dataset_ops.Dataset):
+  """A `Dataset` whose iterator prefetches elements to other device(s)."""
+
+  def __init__(self, input_dataset, devices, buffer_size):
+    self._input_dataset = input_dataset
+    self._devices = devices
+    self._buffer_size = buffer_size if buffer_size is not None else 1
+
+  def make_one_shot_iterator(self):
+    return _PrefetchToDeviceIterator(self._input_dataset, self._devices,
+                                     self._buffer_size)
+
+  def make_initializable_iterator(self, shared_name=None):
+    raise NotImplementedError("`prefetch_to_devices()` is not currently "
+                              "compatible with initializable iterators. Use "
+                              "`make_one_shot_iterator()` instead.")
+
+  def _as_variant_tensor(self):
+    # TODO(mrry): Raise this error earlier (e.g. when one of the Dataset
+    # transformation methods is called.
+    # TODO(mrry): Investigate support for chaining further transformations after
+    # the prefetch, including GPU support.
+    raise NotImplementedError("`prefetch_to_devices()` must be the last "
+                              "transformation in a dataset pipeline.")
+
+  # TODO(priyag): Fix the output types, shapes and classes to match the result
+  # of get_next (which has the additional nesting layer of devices now).
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
+
+  @property
+  def output_shapes(self):
+    return self._input_dataset.output_shapes
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
+
+
+def prefetch_to_devices(devices, buffer_size=None):
+  """A transformation that prefetches dataset values to the given `devices`.
+
+  NOTE: Although the transformation creates a @{tf.data.Dataset}, the
+  transformation must be the final `Dataset` in the input pipeline.
+
+  Args:
+    devices: A nested structure of devices on which to prefetch the data. It can
+      be a single device name, or a tuple or list of device names.
+    buffer_size: (Optional.) The number of elements to buffer on each device.
+      Defaults to an automatically chosen value.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    @{tf.data.Dataset.apply}.
+  """
+  def _apply_fn(dataset):
+    return _PrefetchToDeviceDataset(dataset, devices, buffer_size)
+
+  return _apply_fn
diff --git a/tensorflow/contrib/distribute/python/prefetching_ops_v2_test.py b/tensorflow/contrib/distribute/python/prefetching_ops_v2_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ed16f4607881f2864479c04b4c25e95d9fa1850
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/prefetching_ops_v2_test.py
@@ -0,0 +1,68 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for prefetching_ops_v2."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.distribute.python import prefetching_ops_v2
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+class PrefetchingOpsV2Test(test.TestCase):
+
+  def testPrefetchToOneDevice(self):
+    if not test_util.is_gpu_available():
+      self.skipTest("No GPU available")
+
+    host_dataset = dataset_ops.Dataset.range(10)
+    device_dataset = host_dataset.apply(
+        prefetching_ops_v2.prefetch_to_devices("/gpu:0"))
+
+    iterator = device_dataset.make_one_shot_iterator()
+    next_element = iterator.get_next()
+
+    with self.test_session() as sess:
+      for i in range(10):
+        self.assertEqual(i, sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testPrefetchToTwoDevicesInAList(self):
+    if not test_util.is_gpu_available():
+      self.skipTest("No GPU available")
+
+    host_dataset = dataset_ops.Dataset.range(10)
+    device_dataset = host_dataset.apply(
+        prefetching_ops_v2.prefetch_to_devices(["/cpu:0", "/gpu:0"]))
+
+    iterator = device_dataset.make_one_shot_iterator()
+    next_element = iterator.get_next()
+
+    output = []
+    with self.test_session() as sess:
+      for _ in range(5):
+        result = sess.run(next_element)
+        self.assertEqual(2, len(result))
+        output.extend(result)
+      self.assertEquals(set(range(10)), set(output))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/shared_variable_creator.py b/tensorflow/contrib/distribute/python/shared_variable_creator.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7083e279f20803b227dcd52f6420ae832aa2df4
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/shared_variable_creator.py
@@ -0,0 +1,97 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utility to re-use variables created on first device on subsequent devices."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import re
+
+_VARIABLE_UNIQUIFYING_REGEX = re.compile(r"_\d/")
+_VARIABLE_UNIQUIFYING_REGEX_AT_END = re.compile(r"_\d$")
+
+
+def _canonicalize_variable_name(name):
+  # If no name is specified, uses default name "Variable".
+  if name is None:
+    return "Variable"
+  # Replace all instances of "_<num>/" with "/"
+  name = _VARIABLE_UNIQUIFYING_REGEX.sub("/", name)
+  # Replace any instances of "_<num>" at the end of the string with ""
+  name = _VARIABLE_UNIQUIFYING_REGEX_AT_END.sub("", name)
+  return name
+
+
+def make_fn(shared_variable_store, device_id):
+  """Construct the variable creator function for device `device_id`.
+
+  Constructs custom variable creator functions for the given device.
+  On first device (device_id == 0), it creates the variable using the
+  `next_creator`, and stores it in the provided `shared_variable_store`.
+  On all other devices (device_id > 0), it tries to re-use the variable
+  already created with the same name. If no such variable exists, it throws an
+  error.
+  Additionally, we de-uniquify variable names before checking for matches. This
+  helps re-use variables which are intended to be the same but have different
+  names due to variable uniquification happening upstream. Since this might
+  mean we may have multiple variables with the same canonical name, we store
+  them in a list per canonical name and return them in the same order as well.
+
+  Args:
+    shared_variable_store: A dictionary that we will use to store variables
+      created on the first device, and re-used by creators for other devices.
+    device_id: Integer index of the device whose creator should be
+      constructed.
+
+  Returns:
+    An appropriate creator function based on device_id.
+
+  """
+  variable_scope_access_index = {}
+  assert isinstance(device_id, int)
+
+  def create_new_variable(next_creator, *args, **kwargs):
+    """Create the variable using `next_creator` and store it."""
+    canonical_name = _canonicalize_variable_name(kwargs.get("name"))
+    v = next_creator(*args, **kwargs)
+
+    if canonical_name not in shared_variable_store:
+      shared_variable_store[canonical_name] = []
+    shared_variable_store[canonical_name].append(v)
+    return v
+
+  def reuse_variable(next_creator, *args, **kwargs):
+    """Re-use existing variable from store with same name (in order)."""
+    del next_creator, args
+    name = kwargs.get("name")
+    canonical_name = _canonicalize_variable_name(name)
+
+    try:
+      variable_index = variable_scope_access_index.get(canonical_name, 0)
+      v = shared_variable_store[canonical_name][variable_index]
+      # TODO(priyag): Make this variable re-use more robust by adding checks
+      # that the requested shape and dtype match the existing variable.
+      variable_scope_access_index[canonical_name] = variable_index + 1
+      return v
+    except (KeyError, IndexError):
+      raise RuntimeError(
+          "Tried to create variable {} with mismatching name on device {}".
+          format(name, device_id))
+
+  if device_id == 0:
+    return create_new_variable
+  else:
+    return reuse_variable
diff --git a/tensorflow/contrib/distribute/python/shared_variable_creator_test.py b/tensorflow/contrib/distribute/python/shared_variable_creator_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..713494d603b855be2863af9f24ab98d4cf048042
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/shared_variable_creator_test.py
@@ -0,0 +1,75 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for SharedVariableCreator."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.distribute.python import shared_variable_creator
+from tensorflow.python.eager import test
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import variable_scope
+
+
+class CanonicalizeVariableNameTest(test.TestCase):
+
+  def _canonicalize(self, name):
+    return shared_variable_creator._canonicalize_variable_name(name)
+
+  def testNoName(self):
+    self.assertEquals("Variable", self._canonicalize(None))
+
+  def testPatternInMiddle(self):
+    self.assertEquals("foo/bar/baz", self._canonicalize("foo_1/bar_1/baz"))
+
+  def testPatternAtEnd(self):
+    self.assertEquals("foo", self._canonicalize("foo_1"))
+
+  def testWrongPatterns(self):
+    self.assertEquals("foo_1:0", self._canonicalize("foo_1:0"))
+    self.assertEquals("foo1", self._canonicalize("foo1"))
+    self.assertEquals("foo_a", self._canonicalize("foo_a"))
+
+
+@test_util.with_c_api
+class SharedVariableCreatorTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testSharedVariable(self):
+
+    shared_variable_store = {}
+    num_devices = 3
+    creator_fns = []
+    for i in range(num_devices):
+      creator_fn = shared_variable_creator.make_fn(shared_variable_store, i)
+      creator_fns.append(creator_fn)
+
+    with variable_scope.variable_creator_scope(creator_fns[0]):
+      v0 = variable_scope.variable(1.0, name="foo")
+
+    with variable_scope.variable_creator_scope(creator_fns[1]):
+      v1 = variable_scope.variable(1.0, name="foo")
+
+    with variable_scope.variable_creator_scope(creator_fns[2]):
+      v2 = variable_scope.variable(1.0, name="foo")
+
+    # v1 and v2 should be same as v0
+    self.assertIs(v1, v0)
+    self.assertIs(v2, v0)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/single_loss_example.py b/tensorflow/contrib/distribute/python/single_loss_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..cef5fd2f8943d348a0721cd72032bf6cb2199ad9
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/single_loss_example.py
@@ -0,0 +1,102 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A simple network to use in tests and examples."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.distribute.python import step_fn
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.layers import core
+from tensorflow.python.layers import normalization
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+
+
+def single_loss_example(optimizer_fn, distribution, use_bias=False):
+  """Build a very simple network to use in tests and examples."""
+  dataset = dataset_ops.Dataset.from_tensors([[1.]]).repeat()
+  optimizer = optimizer_fn()
+  layer = core.Dense(1, use_bias=use_bias)
+
+  def loss_fn(x):
+    y = array_ops.reshape(layer(x), []) - constant_op.constant(1.)
+    return y * y
+
+  single_loss_step = step_fn.StandardSingleLossStep(dataset, loss_fn, optimizer,
+                                                    distribution)
+
+  # Layer is returned for inspecting the kernels in tests.
+  return single_loss_step, layer
+
+
+def minimize_loss_example(optimizer_fn,
+                          use_bias=False,
+                          use_callable_loss=True,
+                          create_optimizer_inside_model_fn=False):
+  """Example of non-distribution-aware legacy code."""
+  dataset = dataset_ops.Dataset.from_tensors([[1.]]).repeat()
+  # An Optimizer instance is created either outside or inside model_fn.
+  outer_optimizer = None
+  if not create_optimizer_inside_model_fn:
+    outer_optimizer = optimizer_fn()
+
+  layer = core.Dense(1, use_bias=use_bias)
+
+  def model_fn(x):
+    """A very simple model written by the user."""
+
+    def loss_fn():
+      y = array_ops.reshape(layer(x), []) - constant_op.constant(1.)
+      return y * y
+
+    optimizer = outer_optimizer or optimizer_fn()
+
+    if use_callable_loss:
+      return optimizer.minimize(loss_fn)
+    else:
+      return optimizer.minimize(loss_fn())
+
+  return model_fn, dataset, layer
+
+
+def batchnorm_example(optimizer_fn,
+                      batch_per_epoch=1,
+                      momentum=0.9,
+                      renorm=False):
+  """Example of non-distribution-aware legacy code with batch normalization."""
+  # input shape is [16, 8], input values are increasing in both dimensions.
+  dataset = dataset_ops.Dataset.from_tensor_slices(
+      [[[float(x * 8 + y + z * 100)
+         for y in range(8)]
+        for x in range(16)]
+       for z in range(batch_per_epoch)]).repeat()
+  optimizer = optimizer_fn()
+  batchnorm = normalization.BatchNormalization(
+      renorm=renorm, momentum=momentum, fused=False)
+
+  def model_fn(x):
+
+    def loss_fn():
+      y = math_ops.reduce_sum(batchnorm(x, training=True), axis=1)
+      loss = math_ops.reduce_mean(y - constant_op.constant(1.))
+      return loss
+
+    # Callable loss.
+    return optimizer.minimize(loss_fn)
+
+  return model_fn, dataset, batchnorm
diff --git a/tensorflow/contrib/distribute/python/step_fn.py b/tensorflow/contrib/distribute/python/step_fn.py
new file mode 100644
index 0000000000000000000000000000000000000000..82514c64be40b421c4a9887932f2cfb8e1ac4be0
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/step_fn.py
@@ -0,0 +1,103 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""The step function abstraction represents a single training step."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.eager import backprop
+from tensorflow.python.training import optimizer as optimizer_lib
+
+
+class Step(object):
+  """Interface for performing each step of a training algorithm."""
+
+  def __init__(self, distribution):
+    self._distribution = distribution
+
+  @property
+  def distribution(self):
+    return self._distribution
+
+  def __call__(self):
+    """Perform one step of this training algorithm."""
+    return self.step(self.inputs())
+
+  def inputs(self):
+    """For the generating the input to be passed to `step()`."""
+    raise NotImplementedError("must be implemented in descendants")
+
+  def step(self, inputs):
+    """Perform the main computation of this training algorithm."""
+    raise NotImplementedError("must be implemented in descendants")
+
+
+class StandardInputStep(Step):
+  """Step with a standard implementation of input handling.
+
+  Args:
+    input_dataset: a tf.data Dataset that provides input.
+  """
+
+  def __init__(self, input_dataset, distribution):
+    Step.__init__(self, distribution)
+    self._distributed_input = distribution.distribute_dataset(input_dataset)
+
+  def inputs(self):
+    return self._distributed_input.get_next()
+
+
+class StandardSingleLossStep(StandardInputStep):
+  """A step function that implements a training step for a feed forward network.
+
+  An instance of this class is intended to be used as a callable:
+
+  ```python
+  ...
+  step = step_fn.StandardSingleLossStep(dataset, loss_fn, optimizer)
+  step.initialize(distribution)
+
+  # Run a single training step on a given DistributionStrategy:
+  step(distribution)
+  ...
+  ```
+
+  Args:
+    input_dataset: a tf.data Dataset that provides input.
+    loss_fn: a function that returns loss.
+    optimizer: an optimizer that implements an update rule.
+    distribution: a `DistributionStrategy` object.
+  """
+
+  def __init__(self, input_dataset, loss_fn, optimizer, distribution):
+    StandardInputStep.__init__(self, input_dataset, distribution)
+    self._loss_fn = loss_fn
+    self._optimizer = optimizer
+    self._is_run_concurrently = False
+
+  def step(self, inputs):
+    with self._distribution.scope():
+      gradients_fn = backprop.implicit_grad(self._loss_fn)
+      gradients_fn = optimizer_lib.get_filtered_grad_fn(gradients_fn)
+
+      grads_and_vars = self.distribution.call_for_each_tower(
+          gradients_fn, inputs, run_concurrently=self._is_run_concurrently)
+      # If threads use layers, then we need to run the first step sequentially,
+      # so that layers.build() is not executed in parallel.  Otherwise, multiple
+      # sets of mirrored variables are going to be created.
+      self._is_run_concurrently = True
+      return self._optimizer._distributed_apply(  # pylint: disable=protected-access
+          self.distribution, grads_and_vars)
diff --git a/tensorflow/contrib/distribute/python/step_fn_test.py b/tensorflow/contrib/distribute/python/step_fn_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..75c5ec9659d193e77d219ba79977615d58841d64
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/step_fn_test.py
@@ -0,0 +1,62 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for class Step."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python.single_loss_example import single_loss_example
+from tensorflow.python.eager import context
+from tensorflow.python.eager import test
+from tensorflow.python.ops import variables
+
+
+class SingleLossStepTest(test.TestCase, parameterized.TestCase):
+
+  @combinations.generate(
+      combinations.times(
+          combinations.distributions_and_v1_optimizers(),
+          combinations.combine(mode=combinations.graph_and_eager_modes)))
+  def testTrainNetwork(self, distribution, optimizer_fn):
+    with distribution.scope():
+      single_loss_step, layer = single_loss_example(
+          optimizer_fn, distribution, use_bias=True)
+
+      if context.executing_eagerly():
+        run_step = single_loss_step
+      else:
+        with self.test_session() as sess:
+          run_step = sess.make_callable(single_loss_step())
+      self.evaluate(variables.global_variables_initializer())
+
+      weights, biases = [], []
+      for _ in range(10):
+        run_step()
+
+        weights.append(self.evaluate(distribution.fetch(layer.kernel)))
+        biases.append(self.evaluate(distribution.fetch(layer.bias)))
+
+      error = abs(numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1)
+      is_not_increasing = all(y <= x for x, y in zip(error, error[1:]))
+      self.assertTrue(is_not_increasing)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/strategy_test_lib.py b/tensorflow/contrib/distribute/python/strategy_test_lib.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b4ad9f146bc1d6a987fbeecbb05122946137154
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/strategy_test_lib.py
@@ -0,0 +1,225 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Library for testing DistributionStrategy descendants."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.layers import core
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import optimizer
+
+
+class _TestException(Exception):
+  pass
+
+
+# May be the argument to either distribution.call_for_each_tower() or
+# get_tower_context().merge_call()
+def _raise_exception_fn(_=None):
+  raise _TestException()
+
+
+# Must be the argument to a distribution.call_for_each_tower() call, calls a
+# get_tower_context().merge_call() that raises an exception.
+def _merge_raises_fn():
+  distribute_lib.get_tower_context().merge_call(_raise_exception_fn)
+
+
+# Must be the argument to a get_tower_context().merge_call() call, calls
+# dist.call_for_each_tower() with a function that raises an exception.
+def _call_raises_fn(dist):
+  dist.call_for_each_tower(_raise_exception_fn)
+
+
+# Must be the argument to a distribution.call_for_each_tower() call,
+# calls a get_tower_context().merge_call() that calls a
+# call_for_each_tower() that raises an exception.
+def _merge_call_raises_fn():
+  distribute_lib.get_tower_context().merge_call(_call_raises_fn)
+
+
+# Must be the argument to a get_tower_context().merge_call() call, calls
+# dist.call_for_each_tower() with a function that calls a
+# get_tower_context().merge_call() that raises an exception.
+def _call_merge_raises_fn(dist):
+  dist.call_for_each_tower(_merge_raises_fn)
+
+
+# Must be the argument to a distribution.call_for_each_tower() call, calls a
+# get_tower_context().merge_call() that calls a call_for_each_tower() that
+# calls a get_tower_context().merge_call() that raises an exception.
+def _merge_call_merge_raises_fn():
+  distribute_lib.get_tower_context().merge_call(_call_merge_raises_fn)
+
+
+class DistributionTestBase(test.TestCase):
+  """Some tests that should work with any DistributionStrategy."""
+
+  def _test_minimize_loss_eager(self, d):
+    with d.scope():
+      l = core.Dense(1, use_bias=False)
+
+      def loss(x):
+        # TODO(josh11b): What if this constant was instead a captured
+        # value?  Would it need to be a value that has been passed
+        # through d.broadcast()?
+        y = array_ops.reshape(l(x), []) - constant_op.constant(1.)
+        return y * y
+      # TODO(isaprykin): Extract implicit_grad+get_filtered_grad_fn into a
+      # common `implicit_grad` function and put it in DistributionStrategy.
+      grad_fn = backprop.implicit_grad(loss)
+      grad_fn = optimizer.get_filtered_grad_fn(grad_fn)
+
+      def update(v, g):
+        return v.assign_sub(0.2 * g)
+
+      one = d.broadcast(constant_op.constant([[1.]]))
+
+      def step():
+        """Perform one optimization step."""
+        # Run forward & backward to get gradients, variables list.
+        g_v = d.call_for_each_tower(grad_fn, one, run_concurrently=l.built)
+
+        # Update the variables using the gradients and the update() function.
+        before_list = []
+        after_list = []
+        for g, v in g_v:
+          fetched = d.fetch(v)
+          before_list.append(fetched)
+          # control_dependencies irrelevant but harmless in eager execution
+          with ops.control_dependencies([fetched]):
+            g = d.reduce("sum", g, destinations=v)
+            with ops.control_dependencies(d.unwrap(d.update(v, update, g))):
+              after_list.append(d.fetch(v))
+        return before_list, after_list
+
+      for i in range(10):
+        b, a = step()
+        if i == 0:
+          before, = b  # pylint: disable=unbalanced-tuple-unpacking
+        after, = a  # pylint: disable=unbalanced-tuple-unpacking
+
+      error_before = abs(before.numpy() - 1)
+      error_after = abs(after.numpy() - 1)
+      # Error should go down
+      self.assertLess(error_after, error_before)
+
+  def _test_minimize_loss_graph(self, d, soft_placement=False):
+    config = config_pb2.ConfigProto()
+    config.allow_soft_placement = soft_placement
+    config.gpu_options.per_process_gpu_memory_fraction = 0.3
+    with context.graph_mode(), \
+         ops.Graph().as_default(), \
+         self.test_session(config=config) as sess, \
+         d.scope():
+      l = core.Dense(1, use_bias=False)
+
+      def loss(x):
+        # TODO(josh11b): What if this constant was instead a captured
+        # value?  Would it need to be a value that has been passed
+        # through d.broadcast()?
+        y = array_ops.reshape(l(x), []) - constant_op.constant(1.)
+        return y * y
+
+      grad_fn = backprop.implicit_grad(loss)
+
+      def update(v, g):
+        return v.assign_sub(0.2 * g)
+
+      one = d.broadcast(constant_op.constant([[1.]]))
+
+      def step():
+        """Perform one optimization step."""
+        # Run forward & backward to get gradients, variables list.
+        g_v = d.call_for_each_tower(grad_fn, one)
+
+        # Update the variables using the gradients and the update() function.
+        before_list = []
+        after_list = []
+        for g, v in g_v:
+          fetched = d.fetch(v)
+          before_list.append(fetched)
+          with ops.control_dependencies([fetched]):
+            g = d.reduce("sum", g, destinations=v)
+            with ops.control_dependencies(d.unwrap(d.update(v, update, g))):
+              after_list.append(d.fetch(v))
+        return before_list, after_list
+
+      before_out, after_out = step()
+      variables.global_variables_initializer().run()
+      for i in range(10):
+        b, a = sess.run((before_out, after_out))
+        if i == 0:
+          before, = b
+        after, = a
+
+      error_before = abs(before - 1)
+      error_after = abs(after - 1)
+      # Error should go down
+      self.assertLess(error_after, error_before)
+
+  def _test_map_reduce(self, d, in_graph=None):
+    with d.scope():
+      map_in = [constant_op.constant(i) for i in range(10)]
+      map_out = d.map(map_in, lambda x, y: x * y, 2)
+      observed = d.fetch(d.reduce("sum", map_out))
+      expected = 90  # 2 * (0 + 1 + ... + 9)
+      self.assertEqual(expected, observed.numpy())
+
+  def _test_device_index(self, d):
+    with d.scope():
+      expected_devices = [False] * len(d.worker_devices)
+
+      def mark_devices_fn(device_id):
+        self.assertLess(device_id, len(d.worker_devices))
+        self.assertFalse(expected_devices[device_id])
+        expected_devices[device_id] = True
+
+      d.call_for_each_tower(mark_devices_fn, d.worker_device_index)
+      self.assertAllEqual(expected_devices, [True] * len(d.worker_devices))
+
+  def _test_tower_id(self, d):
+    with d.scope():
+      expected_devices = [False] * len(d.worker_devices)
+
+      def mark_devices_fn():
+        tower_id = distribute_lib.get_tower_context().tower_id
+        self.assertLess(tower_id, len(d.worker_devices))
+        self.assertFalse(expected_devices[tower_id])
+        expected_devices[tower_id] = True
+
+      d.call_for_each_tower(mark_devices_fn)
+      self.assertAllEqual(expected_devices, [True] * len(d.worker_devices))
+
+  def _test_call_and_merge_exceptions(self, dist):
+    with dist.scope():
+      with self.assertRaises(_TestException):
+        dist.call_for_each_tower(_raise_exception_fn)
+      with self.assertRaises(_TestException):
+        dist.call_for_each_tower(_merge_raises_fn)
+      with self.assertRaises(_TestException):
+        dist.call_for_each_tower(_merge_call_raises_fn)
+      with self.assertRaises(_TestException):
+        dist.call_for_each_tower(_merge_call_merge_raises_fn)
diff --git a/tensorflow/contrib/distribute/python/values.py b/tensorflow/contrib/distribute/python/values.py
new file mode 100644
index 0000000000000000000000000000000000000000..87bf0590384cc74ca0f0575bcef4e84599a8b666
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/values.py
@@ -0,0 +1,578 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Various classes representing distributed values.
+
+See go/tf-distribution-strategy.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import weakref
+
+import six
+
+from tensorflow.contrib.data.python.ops import batching
+from tensorflow.contrib.distribute.python import prefetching_ops_v2
+from tensorflow.contrib.eager.python import datasets
+from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.training import checkpointable
+from tensorflow.python.training import device_util
+from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import saver
+from tensorflow.python.util import nest
+
+
+# pylint: disable=line-too-long
+# TODO(josh11b): Should device values be strings or DeviceSpec objects
+# Not sure DeviceSpec objects are usable as a dict key.
+class DistributedValues(object):
+  """Holds a map from device to values. Either PerDevice or Mirrored."""
+
+  def __init__(self, index):
+    self._index = {device_util.canonicalize(key): value
+                   for key, value in six.iteritems(index)}
+
+  def get(self, device=None):
+    """Returns the value for the current device or raises a ValueError."""
+    if device is None:
+      tower_context = distribute_lib.get_tower_context()
+      if tower_context:
+        device = tower_context.device
+      else:
+        device = distribute_lib.get_update_device()
+        if device is None:
+          device = device_util.current()
+    device = device_util.canonicalize(device)
+    try:
+      return self._index[device]
+    except KeyError:
+      raise ValueError("Device %s not found in %s (current device %s)" %
+                       (device, self._index.keys(), device_util.current()))
+
+  def on_device(self, device):
+    device = device_util.canonicalize(device)
+    return device in self._index
+
+  @property
+  def devices(self):
+    return list(self._index.keys())
+
+  def __str__(self):
+    return "%s:%s" % (self.__class__.__name__, self._index)
+
+  def __repr__(self):
+    return "%s(%r)" % (self.__class__.__name__, self._index)
+
+  # TODO(josh11b): Possibly make an accessor for _index for use by
+  # DistributionStrategy implementations.
+
+
+class DistributedDelegate(DistributedValues):
+  """A map from device to values; acts as the same type as the values."""
+
+  def __init__(self, index):
+    super(DistributedDelegate, self).__init__(index)
+
+  def __getattr__(self, name):
+    return getattr(self.get(), name)
+
+  # pylint: disable=multiple-statements
+  def __add__(self, o): return self.get() + o
+  def __radd__(self, o): return o + self.get()
+  def __sub__(self, o): return self.get() - o
+  def __rsub__(self, o): return o - self.get()
+  def __mul__(self, o): return self.get() * o
+  def __rmul__(self, o): return o * self.get()
+  def __truediv__(self, o): return self.get() / o
+  def __rtruediv__(self, o): return o / self.get()
+  def __floordiv__(self, o): return self.get() // o
+  def __rfloordiv__(self, o): return o // self.get()
+  def __mod__(self, o): return self.get() % o
+  def __rmod__(self, o): return o % self.get()
+  def __lt__(self, o): return self.get() < o
+  def __le__(self, o): return self.get() <= o
+  def __gt__(self, o): return self.get() > o
+  def __ge__(self, o): return self.get() >= o
+  def __and__(self, o): return self.get() & o
+  def __rand__(self, o): return o & self.get()
+  def __or__(self, o): return self.get() | o
+  def __ror__(self, o): return o | self.get()
+  def __xor__(self, o): return self.get() ^ o
+  def __rxor__(self, o): return o ^ self.get()
+  def __getitem__(self, o): return self.get()[o]
+  def __pow__(self, o, modulo=None): return pow(self.get(), o, modulo)
+  def __rpow__(self, o): return pow(o, self.get())
+  def __invert__(self): return ~self.get()
+  def __neg__(self): return -self.get()
+  def __abs__(self): return abs(self.get())
+
+  def __div__(self, o):
+    try:
+      return self.get().__div__(o)
+    except AttributeError:
+      # See https://docs.python.org/3/library/constants.html#NotImplemented
+      return NotImplemented
+
+  def __rdiv__(self, o):
+    try:
+      return self.get().__rdiv__(o)
+    except AttributeError:
+      # See https://docs.python.org/3/library/constants.html#NotImplemented
+      return NotImplemented
+
+  def __matmul__(self, o):
+    try:
+      return self.get().__matmul__(o)
+    except AttributeError:
+      # See https://docs.python.org/3/library/constants.html#NotImplemented
+      return NotImplemented
+
+  def __rmatmul__(self, o):
+    try:
+      return self.get().__rmatmul__(o)
+    except AttributeError:
+      # See https://docs.python.org/3/library/constants.html#NotImplemented
+      return NotImplemented
+
+  # TODO(josh11b): Even more operator overloads.
+
+
+class PerDevice(DistributedValues):
+  """Holds a map from device to unsynchronized values."""
+  pass
+
+
+class Mirrored(DistributedValues):
+  """Holds a map from device to values which are kept in sync."""
+  pass
+
+
+def _assign_on_device(device, variable, tensor):
+  with ops.device(device):
+    return variable.assign(array_ops.identity(tensor))
+
+
+DistributedVarOp = collections.namedtuple(
+    "DistributedVarOp", ["name", "graph", "type"])
+
+
+class DistributedVariable(DistributedDelegate):
+  """Holds a map from device to variables."""
+  # TODO(josh11b): Support changing the set of variables if e.g. if new
+  # devices are joining or a device is to leave.
+
+  def __init__(self, index):
+    # Child class must set self._primary_var before calling
+    # super(...).__init__(index).
+    self._common_name = self._primary_var.name.split(":")[0]
+    super(DistributedVariable, self).__init__(index)
+
+  @property
+  def initializer(self):
+    return control_flow_ops.group([v.initializer for v in self._index.values()])
+
+  @property
+  def graph(self):
+    return self._primary_var.graph
+
+  @property
+  def _shared_name(self):
+    return self._common_name
+
+  @property
+  def _unique_id(self):
+    return self._primary_var._unique_id   # pylint: disable=protected-access
+
+  @property
+  def name(self):
+    return self._primary_var.name
+
+  @property
+  def dtype(self):
+    return self._primary_var.dtype
+
+  @property
+  def shape(self):
+    return self._primary_var.shape
+
+  def get_shape(self):
+    return self._primary_var.get_shape()
+
+  def to_proto(self, export_scope=None):
+    return self._primary_var.to_proto(export_scope=export_scope)
+
+  @property
+  def op(self):
+    # We want cross-tower code that does some var.op.X calls
+    # to work (even if the current device isn't in self.devices), but
+    # other uses of var.op in a cross-tower context to fail.
+    if distribute_lib.get_cross_tower_context():
+      return DistributedVarOp(self._primary_var.op.name,
+                              self._primary_var.op.graph,
+                              self._primary_var.op.type)
+    return self.get().op
+
+  def _should_act_as_resource_variable(self):
+    """Pass resource_variable_ops.is_resource_variable check."""
+    pass
+
+
+# Register a conversion function which reads the value of the variable,
+# allowing instances of the class to be used as tensors.
+def _tensor_conversion(var, dtype=None, name=None, as_ref=False):
+  # Try to avoid assignments to and other mutations of MirroredVariable
+  # state except through a DistributionStrategy.update() call.
+  assert not as_ref
+  return ops.internal_convert_to_tensor(
+      var.get(), dtype=dtype, name=name, as_ref=as_ref)
+
+
+ops.register_tensor_conversion_function(DistributedVariable, _tensor_conversion)
+ops.register_dense_tensor_like_type(DistributedVariable)
+
+
+class _MirroredSaveable(saver.BaseSaverBuilder.ResourceVariableSaveable):
+  """Class for defining how to restore a MirroredVariable."""
+
+  def __init__(self, mirrored_variable, primary_variable, name):
+    self._mirrored_variable = mirrored_variable
+    super(_MirroredSaveable, self).__init__(primary_variable, "", name)
+
+  def restore(self, restored_tensors, restored_shapes):
+    """Restore the same value into all variables."""
+    tensor, = restored_tensors
+    return control_flow_ops.group([
+        _assign_on_device(d, v, tensor)
+        for d, v in six.iteritems(self._mirrored_variable._index)])  # pylint: disable=protected-access
+
+
+def _get_update_device():
+  """Validate we are in update/update_non_slot() and return current device.
+
+  This is used in MirroredVariable.assign* members, to make sure they
+  are only called via an update method, to make sure all components of the
+  variable are being updated in a consistent way.
+
+  Returns:
+    A string device.
+
+  Raises:
+    RuntimeError: If not in distribution.update()/.update_non_slot().
+  """
+  device = distribute_lib.get_update_device()
+  if device is None:
+    raise RuntimeError(
+        "Use DistributionStrategy.update() to modify a MirroredVariable.")
+  return device
+
+
+class MirroredVariable(DistributedVariable, Mirrored,
+                       checkpointable.CheckpointableBase):
+  """Holds a map from device to variables whose values are kept in sync."""
+
+  def __init__(self, index, primary_var):
+    # Use a weakref to make it easy to map from the contained values
+    # to the container without introducing a reference cycle.
+    for v in six.itervalues(index):
+      v._mirrored_container = weakref.ref(self)  # pylint: disable=protected-access
+    self._primary_var = primary_var
+    super(MirroredVariable, self).__init__(index)
+
+  # We use _get_update_device() for the assign* methods to enforce
+  # that we are in an update() function. The arguments to update() are
+  # automatically unwrapped so the update() function would normally
+  # see regular variables, not MirroredVariables. However, the update
+  # function can still operate on wrapped MirroredVariables through
+  # object members, captured arguments, etc. This is more likely in an
+  # update_non_slot() function (like OptimizerV2._finish), which can
+  # update several non-slot variables in one call.
+  def assign_sub(self, *args, **kwargs):
+    return self.get(device=_get_update_device()).assign_sub(*args, **kwargs)
+
+  def assign_add(self, *args, **kwargs):
+    return self.get(device=_get_update_device()).assign_add(*args, **kwargs)
+
+  def assign(self, *args, **kwargs):
+    return self.get(device=_get_update_device()).assign(*args, **kwargs)
+
+  def _gather_saveables_for_checkpoint(self):
+    """Overrides CheckpointableBase method.
+
+    This allows both name-based and object-based save and restore of
+    MirroredVariables.
+
+    Returns:
+      A dictionary mapping attribute names to `SaveableObject` factories.
+    """
+    def _saveable_factory(name=self._common_name):
+      return _MirroredSaveable(self, self._primary_var, name)
+    return {checkpointable.VARIABLE_VALUE_KEY: _saveable_factory}
+
+
+class _TowerLocalSaveable(saver.BaseSaverBuilder.SaveableObject):
+  """Class for defining how to restore a TowerLocalVariable."""
+
+  def __init__(self, tower_local_variable, name):
+    self._tower_local_variable = tower_local_variable
+    # We use a callable so that we don't have to evaluate this expression
+    # in the case where we are trying to restore instead of save.
+    def tensor():
+      return distribute_lib.get_distribution_strategy().fetch(
+          tower_local_variable)
+    spec = saver.BaseSaverBuilder.SaveSpec(
+        tensor=tensor,
+        slice_spec="",
+        name=name,
+        dtype=tower_local_variable.dtype)
+    super(_TowerLocalSaveable, self).__init__(tensor, [spec], name)
+
+  def restore(self, restored_tensors, restored_shapes):
+    """Restore the same value into all variables."""
+    tensor, = restored_tensors
+    # To preserve the sum across save and restore, we have to divide the
+    # total across all devices when restoring a variable that was summed
+    # when saving.
+    if self._tower_local_variable.reduce_method == "sum":
+      tensor *= 1. / len(self._tower_local_variable.devices)
+    return control_flow_ops.group([
+        _assign_on_device(d, v, tensor)
+        for d, v in six.iteritems(self._tower_local_variable._index)])  # pylint: disable=protected-access
+
+
+class TowerLocalVariable(DistributedVariable, PerDevice,
+                         checkpointable.CheckpointableBase):
+  """Holds a map from device to variables whose values are reduced on save."""
+
+  def __init__(self, index, primary_var, reduce_method):
+    self._primary_var = primary_var
+    self._reduce_method = reduce_method
+    super(TowerLocalVariable, self).__init__(index)
+
+  def assign_sub(self, *args, **kwargs):
+    return self.get().assign_sub(*args, **kwargs)
+
+  def assign_add(self, *args, **kwargs):
+    return self.get().assign_add(*args, **kwargs)
+
+  def assign(self, *args, **kwargs):
+    return self.get().assign(*args, **kwargs)
+
+  @property
+  def reduce_method(self):
+    return self._reduce_method
+
+  def _gather_saveables_for_checkpoint(self):
+    """Overrides CheckpointableBase method.
+
+    This allows both name-based and object-based save and restore of
+    TowerLocalVariables.
+
+    Returns:
+      A dictionary mapping attribute names to `SaveableObject` factories.
+    """
+    def _saveable_factory(name=self._common_name):
+      return _TowerLocalSaveable(self, name)
+    return {checkpointable.VARIABLE_VALUE_KEY: _saveable_factory}
+
+
+def _devices_match(d1, d2):
+  return device_util.canonicalize(d1) == device_util.canonicalize(d2)
+
+
+def regroup(per_device, wrap_class=PerDevice):
+  """Makes device->nest map into a nest of PerDevice/Mirrored values."""
+  items = list(per_device.items())
+  assert items
+  v0 = items[0][1]  # First value
+
+  if isinstance(v0, list):
+    for _, v in items[1:]:
+      assert isinstance(v, list)
+      assert len(v) == len(v0), ("len(v) == %d, len(v0) == %d, v: %s, v0: %s" %
+                                 (len(v), len(v0), v, v0))
+    return [regroup({k: v[i] for k, v in items}, wrap_class)
+            for i in range(len(v0))]
+
+  if isinstance(v0, tuple):
+    for _, v in items[1:]:
+      assert isinstance(v, tuple)
+      assert len(v) == len(v0)
+    regrouped_tuple = tuple(regroup({k: v[i] for k, v in items}, wrap_class)
+                            for i in range(len(v0)))
+    if hasattr(v0, "_fields"):
+      # This tuple is in fact a namedtuple! Create a new namedtuple instance
+      # and initialize it with the regrouped values:
+      assert hasattr(type(v0), "_make")
+      return type(v0)._make(regrouped_tuple)
+    else:
+      return regrouped_tuple
+
+  if isinstance(v0, dict):
+    v0keys = set(v0.keys())
+    for _, v in items[1:]:
+      assert isinstance(v, dict)
+      assert set(v.keys()) == v0keys
+    return {key: regroup({k: v[key] for k, v in items}, wrap_class)
+            for key in v0keys}
+
+  # If exactly the same object across all devices, return it unwrapped.
+  same_id = True
+  for _, v in items[1:]:
+    if v is not v0:
+      same_id = False
+      break
+  # Consider three cases where same_id is true:
+  # * If v0 is a MirroredVariable (and same_id means it is the same
+  #   across all devices), we want to return it. We check
+  #   MirroredVariable specifically since it can look like it
+  #   has a _mirrored_container member since its members do.
+  # * If v0 is a member of a mirrored variable, in which case
+  #   hasattr(v0, "_mirrored_container") is true, we want to
+  #   return the MirroredVariable that contains it using the
+  #   _mirrored_container logic below. This case can trigger
+  #   same_id when there is only one device.
+  # * In any other situation, same_id means we return v0.
+  if same_id and (isinstance(v0, MirroredVariable) or
+                  not hasattr(v0, "_mirrored_container")):
+    return v0
+
+  # Detect the case where each device has a parallel component of the
+  # same MirroredVariable. In this case we want to return the
+  # containing MirroredVariable, after a bunch of sanity checking.
+  # In particular, each component should have the same container,
+  # and the devices of the variables should match the keys of the
+  # per-device dictionary.
+  # TODO(josh11b): Do we need similar logic for TowerLocalVariables?
+  if hasattr(v0, "_mirrored_container"):
+    # pylint: disable=protected-access
+    assert not isinstance(v0, MirroredVariable), (
+        "ids = %s, items = %s" % ([id(v[1]) for v in items], items))
+    assert _devices_match(v0.device, items[0][0]), (
+        "v0.device = %s, items = %s" % (v0.device, items))
+    mirrored_container = v0._mirrored_container()
+    assert mirrored_container is not None
+    for d, v in items[1:]:
+      assert _devices_match(v.device, d), (
+          "v.device = %s, d = %s, items = %s" % (v.device, d, items))
+      assert mirrored_container is v._mirrored_container()
+    return mirrored_container
+  # pylint: enable=protected-access
+
+  return wrap_class(per_device)
+
+
+def select_device(device, structured):
+  """Specialize a nest of regular & per-device values for one device."""
+  def _get(x):
+    return x.get(device) if isinstance(x, DistributedValues) else x
+
+  return nest.map_structure(_get, structured)
+
+
+def select_device_mirrored(device, structured):
+  """Specialize a nest of regular & mirrored values for one device."""
+  def _get_mirrored(x):
+    if isinstance(x, DistributedValues):
+      if not isinstance(x, Mirrored):
+        raise TypeError(
+            "Expected value to be mirrored across towers: %s in %s." %
+            (x, structured))
+      return x.get(device)
+    else:
+      return x
+
+  return nest.map_structure(_get_mirrored, structured)
+
+
+class PerDeviceDataIterator(object):
+  """An iterator (like `tf.data.Iterator`) into a `PerDeviceDataset`."""
+
+  def __init__(self, iterator, devices, prefetch_on_device=None):
+    self._iterator = iterator
+    self._devices = devices
+    self._prefetch_on_device = prefetch_on_device
+
+  def get_next(self, name=None):
+    """Scatter the input across devices."""
+    if self._prefetch_on_device:
+      data_list = self._iterator.get_next(name=name)
+      index = dict(zip(self._devices, data_list))
+    else:
+      batch = self._iterator.get_next(name=name)
+      index = {}
+      def get_ith(i):
+        return lambda x: x[i]
+
+      for i, d in enumerate(self._devices):
+        index[d] = nest.map_structure(get_ith(i), batch)
+        if context.executing_eagerly():
+          with ops.device(d):
+            index[d] = nest.map_structure(array_ops.identity, index[d])
+
+    return regroup(index)
+
+
+class PerDeviceDataset(object):
+  """Like `tf.data.Dataset` split devices, producing `PerDevice` data."""
+
+  def __init__(self, dataset, devices, prefetch_on_device=None):
+    self._devices = devices
+
+    # Default to using prefetching in graph mode, unless specified.
+    # TODO(priyag): Enable prefetching in eager mode.
+    self._prefetch_on_device = prefetch_on_device
+    if self._prefetch_on_device is None:
+      self._prefetch_on_device = not context.executing_eagerly()
+    assert not (self._prefetch_on_device and context.executing_eagerly()), (
+        "Prefetching is only supported in graph mode currently")
+
+    if self._prefetch_on_device:
+      self._dataset = dataset
+    else:
+      # TODO(priyag): If dropping remainder is not appropriate, find another
+      # approach to distributing the dataset when not possible to divide evenly.
+      # Possibly not an issue when we start using PartitionedDataset.
+      self._dataset = dataset.apply(
+          batching.batch_and_drop_remainder(len(devices)))
+
+  def make_one_shot_iterator(self):
+    """Get a one time use iterator for the distributed PerDeviceDataset."""
+    if self._prefetch_on_device:
+      on_device_dataset = self._dataset.apply(
+          prefetching_ops_v2.prefetch_to_devices(self._devices))
+      dataset_iterator = on_device_dataset.make_one_shot_iterator()
+    elif context.executing_eagerly():
+      dataset_iterator = datasets.Iterator(self._dataset)
+    else:
+      dataset_iterator = self._dataset.make_one_shot_iterator()
+
+    return PerDeviceDataIterator(
+        dataset_iterator, self._devices, self._prefetch_on_device)
+
+
+class MapOutput(object):
+  """Map can result in multiple outputs per device."""
+
+  def __init__(self, l):
+    self._l = l
+
+  def get(self):
+    return self._l
diff --git a/tensorflow/contrib/distribute/python/values_test.py b/tensorflow/contrib/distribute/python/values_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c0d4b7d6c78b7cf63c613201d83d4793ecfe76b
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/values_test.py
@@ -0,0 +1,807 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the distributed values library."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.contrib.distribute.python import mirrored_strategy
+from tensorflow.contrib.distribute.python import values
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
+from tensorflow.python.eager import test
+from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.training import device_util
+from tensorflow.python.training import saver as saver_lib
+
+
+@test_util.with_c_api
+class DistributedValuesTest(test.TestCase):
+
+  def testGetEager(self):
+    with ops.device("/device:CPU:0"):
+      one = constant_op.constant(1)
+      two = constant_op.constant(2)
+      v = values.DistributedValues({"/device:CPU:0": one, "/device:GPU:0": two})
+      self.assertEqual(two, v.get("/device:GPU:0"))
+      self.assertEqual(one, v.get())
+      with self.assertRaises(ValueError):
+        self.assertIsNone(v.get("/device:GPU:2"))
+
+  def testGetGraph(self):
+    with context.graph_mode(), \
+        ops.Graph().as_default(), \
+        ops.device("/device:CPU:0"):
+      one = constant_op.constant(1)
+      two = constant_op.constant(2)
+      v = values.DistributedValues({"/device:CPU:0": one, "/device:GPU:0": two})
+      self.assertEqual(two, v.get("/device:GPU:0"))
+      self.assertEqual(one, v.get())
+      with self.assertRaises(ValueError):
+        self.assertIsNone(v.get("/device:GPU:2"))
+
+  def testCanonicalization(self):
+    canonical_cpu = ["/job:localhost/replica:0/task:0/device:CPU:0"]
+    v = values.DistributedValues({"": 42})
+    self.assertEqual(canonical_cpu, list(v._index.keys()))
+    v = values.DistributedValues({"/device:CPU:0": 42})
+    self.assertEqual(canonical_cpu, list(v._index.keys()))
+    v = values.DistributedValues({"/cpu:0": 42})
+    self.assertEqual(canonical_cpu, list(v._index.keys()))
+    v = values.DistributedValues({"/CPU:0": 42})
+    self.assertEqual(canonical_cpu, list(v._index.keys()))
+    with self.assertRaises(AssertionError):
+      v = values.DistributedValues({"/device:cpu:0": 42})
+
+
+@test_util.with_c_api
+class DistributedDelegateTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testGetAttr(self):
+    with ops.device("/device:CPU:0"):
+
+      class Foo(object):
+
+        def __init__(self, x):
+          self.x = x
+
+      v = values.DistributedDelegate(
+          {"/device:CPU:0": Foo(7), "/device:GPU:0": Foo(8)})
+      self.assertEqual(7, v.x)
+      with self.assertRaises(AttributeError):
+        _ = v.y
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testOperatorOverride(self):
+    with ops.device("/device:CPU:0"):
+      v = values.DistributedDelegate({"/device:CPU:0": 7, "/device:GPU:0": 8})
+      # v should act like int(7).
+      self.assertEqual(8, v + 1)
+      self.assertEqual(10, 3 + v)
+      self.assertEqual(14, v + v)
+      self.assertEqual(5, v - 2)
+      self.assertEqual(6, 13 - v)
+      self.assertEqual(0, v - v)
+      self.assertEqual(14, v * 2)
+      self.assertEqual(21, 3 * v)
+      self.assertEqual(49, v * v)
+      self.assertEqual(3.5, v / 2)
+      self.assertEqual(1.5, 10.5 / v)
+      self.assertEqual(3, v // 2)
+      self.assertEqual(2, 15 // v)
+      self.assertEqual(1, v % 2)
+      self.assertEqual(2, 16 % v)
+      self.assertTrue(v < 12)
+      self.assertTrue(v <= 12)
+      self.assertFalse(v > 12)
+      self.assertFalse(v >= 12)
+      self.assertFalse(12 < v)
+      self.assertFalse(12 <= v)
+      self.assertTrue(12 > v)
+      self.assertTrue(12 >= v)
+      self.assertEqual(3, v & 3)
+      self.assertEqual(3, 11 & v)
+      self.assertEqual(15, v | 8)
+      self.assertEqual(23, 16 | v)
+      self.assertEqual(4, v ^ 3)
+      self.assertEqual(12, 11 ^ v)
+      self.assertEqual(343, pow(v, 3))
+      self.assertEqual(3, pow(v, 3, 10))
+      self.assertEqual(128, pow(2, v))
+      self.assertEqual(-7, -v)
+      self.assertEqual(~7, ~v)
+      self.assertEqual(7, abs(v))
+      with self.assertRaises(TypeError):
+        _ = v[2]
+
+
+def _device_str(d):
+  return "/device:GPU:" + str(d)
+
+
+def _nested_value(d):
+  return ("a" + d, ["b" + d, {"c": "d" + d, "e": "f" + d}, "g" + d], "h" + d)
+
+
+def _make_mirrored():
+  v = []
+  index = {}
+  devices = ["/device:GPU:0", "/device:CPU:0"]
+  for d, n, init in zip(devices, ["v", "v/replica"], [1., 2.]):
+    with ops.device(d):
+      v.append(variable_scope.get_variable(
+          name=n, initializer=init, use_resource=True))
+      index[d] = v[-1]
+  mirrored = values.MirroredVariable(index, v[0])
+  return v, devices, mirrored
+
+
+@test_util.with_c_api
+class RegroupAndSelectDeviceTest(test.TestCase):
+
+  def _is_per_device(self, result, expected, klass=values.PerDevice):
+    self.assertIsInstance(result, klass)
+    # We canonicalize the devices to match the device strings returned
+    # by PerDevice, which also does device string canonicalization.
+    devices = [device_util.canonicalize(_device_str(i))
+               for i in range(len(expected))]
+    self.assertEqual(set(devices), set(result.devices))
+    for i, d in enumerate(devices):
+      self.assertEqual(expected[i], result.get(d))
+      self.assertEqual(expected[i], result.get(_device_str(i)))
+
+  def testNested(self):
+    result = values.regroup({_device_str(0): _nested_value("1"),
+                             _device_str(1): _nested_value("2")})
+    self.assertIsInstance(result, tuple)
+    self.assertEqual(3, len(result))
+    self._is_per_device(result[0], ["a1", "a2"])
+    self._is_per_device(result[2], ["h1", "h2"])
+
+    self.assertIsInstance(result[1], list)
+    self.assertEqual(3, len(result[1]))
+    self._is_per_device(result[1][0], ["b1", "b2"])
+    self._is_per_device(result[1][2], ["g1", "g2"])
+
+    self.assertIsInstance(result[1][1], dict)
+    self.assertEqual(set(["c", "e"]), set(result[1][1].keys()))
+    self._is_per_device(result[1][1]["c"], ["d1", "d2"])
+    self._is_per_device(result[1][1]["e"], ["f1", "f2"])
+
+    # Also test that we can undo the merge using select_device()
+    self.assertEqual(_nested_value("1"),
+                     values.select_device(_device_str(0), result))
+    self.assertEqual(_nested_value("2"),
+                     values.select_device(_device_str(1), result))
+    # select_device_mirrored() should fail due to non-mirrored values
+    with self.assertRaises(TypeError):
+      values.select_device_mirrored(_device_str(0), result)
+    with self.assertRaises(TypeError):
+      values.select_device_mirrored(_device_str(1), result)
+
+  def testWrapClass(self):
+    # Normally a mirrored value would be the same across devices, but
+    # for a test it is convenient to be able to tell the values apart.
+    result = values.regroup({_device_str(0): _nested_value("1"),
+                             _device_str(1): _nested_value("2")},
+                            values.Mirrored)
+    self.assertIsInstance(result, tuple)
+    self.assertEqual(3, len(result))
+    self._is_per_device(result[0], ["a1", "a2"], values.Mirrored)
+    self._is_per_device(result[2], ["h1", "h2"], values.Mirrored)
+
+    self.assertIsInstance(result[1], list)
+    self.assertEqual(3, len(result[1]))
+    self._is_per_device(result[1][0], ["b1", "b2"], values.Mirrored)
+    self._is_per_device(result[1][2], ["g1", "g2"], values.Mirrored)
+
+    self.assertIsInstance(result[1][1], dict)
+    self.assertEqual(set(["c", "e"]), set(result[1][1].keys()))
+    self._is_per_device(result[1][1]["c"], ["d1", "d2"], values.Mirrored)
+    self._is_per_device(result[1][1]["e"], ["f1", "f2"], values.Mirrored)
+
+    # Also test that we can undo the merge using select_device()
+    self.assertEqual(_nested_value("1"),
+                     values.select_device(_device_str(0), result))
+    self.assertEqual(_nested_value("2"),
+                     values.select_device(_device_str(1), result))
+    # Values are marked as mirrored, so select_device_mirrored() is allowed.
+    self.assertEqual(_nested_value("1"),
+                     values.select_device_mirrored(_device_str(0), result))
+    self.assertEqual(_nested_value("2"),
+                     values.select_device_mirrored(_device_str(1), result))
+
+  def testMirroredContainer(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+    v, devices, mirrored = _make_mirrored()
+    result = values.regroup(dict(zip(devices, v)))
+    self.assertIs(mirrored, result)
+
+  def testSameId(self):
+    foo = object()
+    result = values.regroup({_device_str(0): ("a", foo),
+                             _device_str(1): ("b", foo)})
+    self.assertIsInstance(result, tuple)
+    self.assertEqual(2, len(result))
+    self._is_per_device(result[0], ["a", "b"])
+    self.assertIs(foo, result[1])
+
+    # Test select_device(), should undo the merge done by regroup().
+    result_0 = values.select_device(_device_str(0), result)
+    self.assertIsInstance(result_0, tuple)
+    self.assertEqual(2, len(result_0))
+    self.assertEqual("a", result_0[0])
+    self.assertIs(foo, result_0[1])
+    result_1 = values.select_device(_device_str(1), result)
+    self.assertIsInstance(result_1, tuple)
+    self.assertEqual(2, len(result_1))
+    self.assertEqual("b", result_1[0])
+    self.assertIs(foo, result_1[1])
+
+  def testOneDevice(self):
+    result = values.regroup({_device_str(0): _nested_value("1")})
+    # On one device regroup() and select_device() are basically identity.
+    self.assertEqual(_nested_value("1"), result)
+    self.assertEqual(_nested_value("1"),
+                     values.select_device(_device_str(0), result))
+
+    # The one exception has to do with MirroredVariables.
+    d = "/device:CPU:0"
+    with ops.device(d):
+      v = variable_scope.get_variable(
+          name="v", initializer=1., use_resource=True)
+      index = {d: v}
+    mirrored = values.MirroredVariable(index, v)
+    result = values.regroup(index)
+    self.assertIs(mirrored, result)
+
+  def testNamedTupleEstimatorSpec(self):
+    with context.graph_mode(), ops.Graph().as_default():
+      created_estimator_specs = {}
+      to_regroup = {}
+
+      for device_id in range(3):
+        spec = model_fn_lib.EstimatorSpec(
+            mode=model_fn_lib.ModeKeys.TRAIN,
+            loss=constant_op.constant(device_id / 2),
+            train_op=array_ops.identity(constant_op.constant(device_id)))
+        created_estimator_specs[device_id] = spec
+        to_regroup[_device_str(device_id)] = spec
+
+      merged_estimator_spec = values.regroup(to_regroup)
+
+      self.assertTrue(
+          isinstance(merged_estimator_spec, model_fn_lib.EstimatorSpec))
+      self.assertEquals(model_fn_lib.ModeKeys.TRAIN, merged_estimator_spec.mode)
+      for device_id in range(3):
+        d = _device_str(device_id)
+        self.assertEquals(created_estimator_specs[device_id].loss,
+                          merged_estimator_spec.loss.get(d))
+        self.assertEquals(created_estimator_specs[device_id].train_op,
+                          merged_estimator_spec.train_op.get(d))
+        # Scaffold is populated by `EstimatorSpec.__new__`.
+        self.assertEquals(created_estimator_specs[device_id].scaffold,
+                          merged_estimator_spec.scaffold.get(d))
+        # Also test that we can undo the merge using select_device()
+        self.assertEquals(created_estimator_specs[device_id],
+                          values.select_device(_device_str(device_id),
+                                               merged_estimator_spec))
+
+
+@test_util.with_c_api
+class PerDeviceDatasetTest(test.TestCase):
+
+  config = config_pb2.ConfigProto()
+  config.allow_soft_placement = True
+
+  def _test_iterator_no_prefetch(self, devices, dataset, expected_values):
+    per_device_dataset = values.PerDeviceDataset(
+        dataset, devices, prefetch_on_device=False)
+    iterator = per_device_dataset.make_one_shot_iterator()
+
+    for expected_value in expected_values:
+      next_element = iterator.get_next()
+      actual = self.evaluate([
+          values.select_device(d, next_element) for d in devices])
+      self.assertEqual(expected_value, actual)
+
+    with self.assertRaises(errors.OutOfRangeError):
+      next_element = iterator.get_next()
+      self.evaluate([
+          values.select_device(d, next_element) for d in devices])
+
+  def _test_iterator_with_prefetch(self, devices, dataset, expected_values):
+    if not context.executing_eagerly():
+      per_device_dataset = values.PerDeviceDataset(
+          dataset, devices, prefetch_on_device=True)
+      iterator = per_device_dataset.make_one_shot_iterator()
+
+      # With prefetching, we cannot guarantee which input ends up on which
+      # device, so we verify that the complete set seen on all devices is
+      # correct, and equal numbers are distributed to each device.
+      combined_actual = []
+      combined_expected = []
+      for expected_value in expected_values:
+        next_element = iterator.get_next()
+        combined_actual.extend(self.evaluate([
+            values.select_device(d, next_element) for d in devices]))
+        combined_expected.extend(expected_value)
+
+      self.assertEqual(set(combined_expected), set(combined_actual))
+
+      with self.assertRaises(errors.OutOfRangeError):
+        next_element = iterator.get_next()
+        self.evaluate([
+            values.select_device(d, next_element) for d in devices])
+
+  def _test_iterator(self, devices, dataset, expected_values):
+    self._test_iterator_no_prefetch(devices, dataset, expected_values)
+    self._test_iterator_with_prefetch(devices, dataset, expected_values)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testOneDevice(self):
+    devices = ["/device:CPU:0"]
+    dataset = dataset_ops.Dataset.range(10)
+
+    expected_values = [[i] for i in range(10)]
+
+    self._test_iterator(devices, dataset, expected_values)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testMultipleDevices(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    devices = ["/device:CPU:0", "/device:GPU:0"]
+    dataset = dataset_ops.Dataset.range(10)
+
+    expected_values = [[i, i+1] for i in range(0, 10, 2)]
+
+    self._test_iterator(devices, dataset, expected_values)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testTupleDataset(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    devices = ["/device:CPU:0", "/device:GPU:0"]
+    dataset1 = dataset_ops.Dataset.range(10)
+    dataset2 = dataset_ops.Dataset.range(10).map(lambda x: x**2)
+    dataset = dataset_ops.Dataset.zip((dataset1, dataset2))
+
+    expected_values = [[(i, i**2), (i+1, (i+1)**2)] for i in range(0, 10, 2)]
+
+    self._test_iterator(devices, dataset, expected_values)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testUnevenDatasetBatches(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    devices = ["/device:CPU:0", "/device:GPU:0"]
+    dataset = dataset_ops.Dataset.range(11)
+
+    expected_values = [[i, i+1] for i in range(0, 10, 2)]
+    self._test_iterator(devices, dataset, expected_values)
+
+
+@test_util.with_c_api
+class MirroredVariableTest(test.TestCase):
+
+  config = config_pb2.ConfigProto()
+  config.allow_soft_placement = True
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testProperties(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    v, _, mirrored = _make_mirrored()
+
+    self.assertEquals(v[0].name, mirrored.name)
+    self.assertEquals(v[0].dtype, mirrored.dtype)
+    self.assertEquals(v[0].shape, mirrored.shape)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testVariableOnAnotherDevice(self):
+    v = variable_scope.get_variable(
+        name="v", initializer=[1.], use_resource=True)
+    index = {"/job:foo/device:CPU:0": v}
+    mirrored = values.MirroredVariable(index, v)
+
+    self.assertEquals(v.name, mirrored.name)
+    self.assertEquals(v.dtype, mirrored.dtype)
+    self.assertEquals(v.shape, mirrored.shape)
+
+  def _assign_mirrored(self, devices, v, new):
+    for d, var, n in zip(devices, v, new):
+      with ops.device(d):
+        self.evaluate(var.assign(n))
+
+  def _save_return_saver(self, sess, var):
+    saver = saver_lib.Saver(var_list=[var])
+    test_dir = self.get_temp_dir()
+    prefix = os.path.join(test_dir, "ckpt")
+    return saver.save(sess, prefix), saver
+
+  def _save(self, sess, var):
+    save_path, _ = self._save_return_saver(sess, var)
+    return save_path
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testSaveAndRestoreMirroredOneGraph(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    with self.test_session() as sess:
+      v, devices, mirrored = _make_mirrored()
+
+      # Overwrite the initial values.
+      self._assign_mirrored(devices, v, [3., 4.])
+
+      # Saves the current value of v[0], 3.
+      save_path, saver = self._save_return_saver(sess, mirrored)
+
+      # Change the values between save and restore.
+      self._assign_mirrored(devices, v, [5., 6.])
+
+      # Restores the saved value of 3. to both variables.
+      saver.restore(sess, save_path)
+      self.assertEqual([3., 3.], self.evaluate([v[0], v[1]]))
+
+  def _save_mirrored(self):
+    """Save variables with mirroring, returns save_path."""
+    with self.test_session(graph=ops.Graph()) as sess:
+      v, devices, mirrored = _make_mirrored()
+
+      # Overwrite the initial values.
+      self._assign_mirrored(devices, v, [3., 4.])
+
+      # Saves the current value of v[0], 3.
+      save_path = self._save(sess, mirrored)
+
+      # Change the values between save and restore.
+      self._assign_mirrored(devices, v, [5., 6.])
+    return save_path
+
+  def _save_normal(self):
+    """Save variables without mirroring, returns save_path."""
+    with self.test_session(graph=ops.Graph()) as sess:
+      var = variable_scope.get_variable(
+          name="v", initializer=1., use_resource=True)
+
+      # Overwrite the initial value.
+      self.evaluate(var.assign(3.))
+
+      # Saves the current value of var, 3.
+      save_path = self._save(sess, var)
+
+      # Change the values between save and restore.
+      self.evaluate(var.assign(5.))
+    return save_path
+
+  def _restore_normal(self, save_path):
+    """Restore to variables without mirroring in a fresh graph."""
+    with self.test_session(graph=ops.Graph()) as sess:
+      var = variable_scope.get_variable(
+          name="v", initializer=7., use_resource=True)
+
+      # Overwrite the initial value.
+      self.evaluate(var.assign(8.))
+
+      # Restores the saved value of 3. to `var`.
+      saver = saver_lib.Saver(var_list=[var])
+      saver.restore(sess, save_path)
+      self.assertEqual(3., self.evaluate(var))
+
+  def _restore_mirrored(self, save_path):
+    """Restore to variables with mirroring in a fresh graph."""
+    with self.test_session(graph=ops.Graph()) as sess:
+      v, devices, mirrored = _make_mirrored()
+
+      # Overwrite the initial values.
+      self._assign_mirrored(devices, v, [7., 8.])
+
+      # Restores the saved value of 3. to both variables.
+      saver = saver_lib.Saver(var_list=[mirrored])
+      saver.restore(sess, save_path)
+      self.assertEqual([3., 3.], self.evaluate([v[0], v[1]]))
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testSaveMirroredRestoreMirrored(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    save_path = self._save_mirrored()
+    self._restore_mirrored(save_path)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testSaveMirroredRestoreNormal(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    save_path = self._save_mirrored()
+    self._restore_normal(save_path)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testSaveNormalRestoreMirrored(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    save_path = self._save_normal()
+    self._restore_mirrored(save_path)
+
+
+_devices = ["/device:GPU:0", "/device:CPU:0"]
+
+
+def _make_tower_local(method):
+  v = []
+  index = {}
+  for d, n, init in zip(_devices, ["v", "v/replica"], [1., 2.]):
+    with ops.device(d):
+      v.append(variable_scope.get_variable(
+          name=n, initializer=init, use_resource=True))
+      index[d] = v[-1]
+  tower_local = values.TowerLocalVariable(index, v[0], method)
+  return v, tower_local
+
+
+@test_util.with_c_api
+class TowerLocalVariableTest(test.TestCase):
+
+  config = config_pb2.ConfigProto()
+  config.allow_soft_placement = True
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testProperties(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    v, tower_local = _make_tower_local("sum")
+
+    self.assertEquals(v[0].name, tower_local.name)
+    self.assertEquals(v[0].dtype, tower_local.dtype)
+    self.assertEquals(v[0].shape, tower_local.shape)
+    self.assertEquals("sum", tower_local.reduce_method)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testVariableOnAnotherDevice(self):
+    v = variable_scope.get_variable(
+        name="v", initializer=[1.], use_resource=True)
+    index = {"/job:foo/device:CPU:0": v}
+    tower_local = values.TowerLocalVariable(index, v, "mean")
+
+    self.assertEquals(v.name, tower_local.name)
+    self.assertEquals(v.dtype, tower_local.dtype)
+    self.assertEquals(v.shape, tower_local.shape)
+    self.assertEquals("mean", tower_local.reduce_method)
+
+  def _assign_tower_local(self, devices, v, new):
+    for d, var, n in zip(devices, v, new):
+      with ops.device(d):
+        self.evaluate(var.assign(n))
+
+  def _save_return_saver(self, sess, var):
+    saver = saver_lib.Saver(var_list=[var])
+    test_dir = self.get_temp_dir()
+    prefix = os.path.join(test_dir, "ckpt")
+    return saver.save(sess, prefix), saver
+
+  def _save(self, sess, var):
+    save_path, _ = self._save_return_saver(sess, var)
+    return save_path
+
+  def _dist_scope(self):
+    return mirrored_strategy.MirroredStrategy(_devices).scope()
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testSaveAndRestoreTowerLocalSumOneGraph(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    with self.test_session() as sess:
+      v, tower_local = _make_tower_local("sum")
+
+      # Overwrite the initial values.
+      self._assign_tower_local(_devices, v, [3., 4.])
+
+      with self._dist_scope():
+        # Saves the current value of v[0] + v[1], 7.
+        save_path, saver = self._save_return_saver(sess, tower_local)
+
+        # Change the values between save and restore.
+        self._assign_tower_local(_devices, v, [5., 6.])
+
+        # Restores the saved value of 7. which gets divided equally
+        # between the variables.
+        saver.restore(sess, save_path)
+        self.assertEqual([3.5, 3.5], self.evaluate([v[0], v[1]]))
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testSaveAndRestoreTowerLocalMeanOneGraph(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    with self.test_session() as sess:
+      v, tower_local = _make_tower_local("mean")
+
+      # Overwrite the initial values.
+      self._assign_tower_local(_devices, v, [3., 4.])
+
+      with self._dist_scope():
+        # Saves the current value of (v[0] + v[1])/2, 3.5.
+        save_path, saver = self._save_return_saver(sess, tower_local)
+
+        # Change the values between save and restore.
+        self._assign_tower_local(_devices, v, [5., 6.])
+
+        # Restores the saved value of 3.5 to both variables.
+        saver.restore(sess, save_path)
+        self.assertEqual([3.5, 3.5], self.evaluate([v[0], v[1]]))
+
+  def _save_tower_local_mean(self):
+    """Save variables with mirroring, returns save_path."""
+    with self.test_session(graph=ops.Graph()) as sess:
+      v, tower_local = _make_tower_local("mean")
+
+      # Overwrite the initial values.
+      self._assign_tower_local(_devices, v, [3., 4.])
+
+      with self._dist_scope():
+        # Saves the current value of (v[0] + v[1])/2, 3.5
+        save_path = self._save(sess, tower_local)
+
+        # Change the values between save and restore.
+        self._assign_tower_local(_devices, v, [5., 6.])
+    return save_path
+
+  def _save_tower_local_sum(self):
+    """Save variables with mirroring, returns save_path."""
+    with self.test_session(graph=ops.Graph()) as sess:
+      v, tower_local = _make_tower_local("sum")
+
+      # Overwrite the initial values.
+      self._assign_tower_local(_devices, v, [1.5, 2.])
+
+      with self._dist_scope():
+        # Saves the current value of v[0] + v[1], 3.5
+        save_path = self._save(sess, tower_local)
+
+        # Change the values between save and restore.
+        self._assign_tower_local(_devices, v, [5., 6.])
+    return save_path
+
+  def _save_normal(self):
+    """Save variables without mirroring, returns save_path."""
+    with self.test_session(graph=ops.Graph()) as sess:
+      var = variable_scope.get_variable(
+          name="v", initializer=1., use_resource=True)
+
+      # Overwrite the initial value.
+      self.evaluate(var.assign(3.5))
+
+      # Saves the current value of var, 3.5.
+      save_path = self._save(sess, var)
+
+      # Change the values between save and restore.
+      self.evaluate(var.assign(5.))
+    return save_path
+
+  def _restore_normal(self, save_path):
+    """Restore to variables without mirroring in a fresh graph."""
+    with self.test_session(graph=ops.Graph()) as sess:
+      var = variable_scope.get_variable(
+          name="v", initializer=7., use_resource=True)
+
+      # Overwrite the initial value.
+      self.evaluate(var.assign(8.))
+
+      # Restores the saved value of 3.5 to `var`.
+      saver = saver_lib.Saver(var_list=[var])
+      saver.restore(sess, save_path)
+      self.assertEqual(3.5, self.evaluate(var))
+
+  def _restore_tower_local_mean(self, save_path):
+    """Restore to variables with mirroring in a fresh graph."""
+    with self.test_session(graph=ops.Graph()) as sess:
+      v, tower_local = _make_tower_local("mean")
+
+      # Overwrite the initial values.
+      self._assign_tower_local(_devices, v, [7., 8.])
+
+      with self._dist_scope():
+        # Restores the saved value of 3.5 to both variables.
+        saver = saver_lib.Saver(var_list=[tower_local])
+        saver.restore(sess, save_path)
+        self.assertEqual([3.5, 3.5], self.evaluate([v[0], v[1]]))
+
+  def _restore_tower_local_sum(self, save_path):
+    """Restore to variables with mirroring in a fresh graph."""
+    with self.test_session(graph=ops.Graph()) as sess:
+      v, tower_local = _make_tower_local("sum")
+
+      # Overwrite the initial values.
+      self._assign_tower_local(_devices, v, [7., 8.])
+
+      with self._dist_scope():
+        # Restores the saved value of 3.5 to both variables.
+        saver = saver_lib.Saver(var_list=[tower_local])
+        saver.restore(sess, save_path)
+        self.assertEqual([1.75, 1.75], self.evaluate([v[0], v[1]]))
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testSaveTowerLocalRestoreTowerLocalMean(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    save_path = self._save_tower_local_mean()
+    self._restore_tower_local_mean(save_path)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testSaveTowerLocalRestoreTowerLocalSum(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    save_path = self._save_tower_local_sum()
+    self._restore_tower_local_sum(save_path)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testSaveTowerLocalMeanRestoreNormal(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    save_path = self._save_tower_local_mean()
+    self._restore_normal(save_path)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testSaveTowerLocalSumRestoreNormal(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    save_path = self._save_tower_local_sum()
+    self._restore_normal(save_path)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testSaveNormalRestoreTowerLocalMean(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    save_path = self._save_normal()
+    self._restore_tower_local_mean(save_path)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testSaveNormalRestoreTowerLocalSum(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    save_path = self._save_normal()
+    self._restore_tower_local_sum(save_path)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD
index 1bd73ee7044de34988144196f53299db2fb80fcf..9799901483f1a8fa192b97b3d0f052e672c26843 100644
--- a/tensorflow/contrib/distributions/BUILD
+++ b/tensorflow/contrib/distributions/BUILD
@@ -457,6 +457,20 @@ cuda_py_test(
     tags = ["no_windows"],  # TODO: needs investigation on Windows
 )
 
+cuda_py_test(
+    name = "batch_reshape_test",
+    size = "small",
+    srcs = ["python/kernel_tests/batch_reshape_test.py"],
+    additional_deps = [
+        ":distributions_py",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 cuda_py_test(
     name = "sample_stats_test",
     size = "medium",
@@ -487,11 +501,7 @@ cuda_py_test(
         "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
     ],
-    tags = [
-        "manual",
-        "noasan",
-        "noguitar",
-    ],
+    shard_count = 4,
 )
 
 cuda_py_test(
@@ -745,18 +755,6 @@ cuda_py_test(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 # === Bijector Tests ==========================================================
 
 cuda_py_test(
@@ -1106,25 +1104,6 @@ cuda_py_test(
     ],
 )
 
-cuda_py_test(
-    name = "sigmoid_centered_test",
-    size = "small",
-    srcs = ["python/kernel_tests/bijectors/sigmoid_centered_test.py"],
-    additional_deps = [
-        ":bijectors_py",
-        ":distributions_py",
-        "//third_party/py/numpy",
-        "@six_archive//:six",
-        "//tensorflow/contrib/linalg:linalg_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
-    ],
-)
-
 # Tests for SinhArcSinh bijector.  The file name has the extra "_bijector" to
 # avoid BUILD rule name conflicts with the distribution by the same name.
 cuda_py_test(
diff --git a/tensorflow/contrib/distributions/__init__.py b/tensorflow/contrib/distributions/__init__.py
index 61c411271d0bb8d7b4cc3b14992b82ec1e5674ed..4d4489468d9dcfbe152c42f5f841f6c25a9f1e6f 100644
--- a/tensorflow/contrib/distributions/__init__.py
+++ b/tensorflow/contrib/distributions/__init__.py
@@ -24,6 +24,7 @@ from __future__ import print_function
 
 from tensorflow.contrib.distributions.python.ops import bijectors
 from tensorflow.contrib.distributions.python.ops.autoregressive import *
+from tensorflow.contrib.distributions.python.ops.batch_reshape import *
 from tensorflow.contrib.distributions.python.ops.binomial import *
 from tensorflow.contrib.distributions.python.ops.cauchy import *
 from tensorflow.contrib.distributions.python.ops.chi2 import *
@@ -96,9 +97,10 @@ _allowed_symbols = [
     'ReparameterizationType',
     'Distribution',
     'Autoregressive',
-    'Binomial',
+    'BatchReshape',
     'Bernoulli',
     'Beta',
+    'Binomial',
     'BetaWithSoftplusConcentration',
     'Categorical',
     'Chi2',
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/batch_reshape_test.py b/tensorflow/contrib/distributions/python/kernel_tests/batch_reshape_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6c8d2cf6e75f049248c6b16f429847889d141fa
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/batch_reshape_test.py
@@ -0,0 +1,568 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for BatchReshape."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.distributions.python.ops import batch_reshape as batch_reshape_lib
+from tensorflow.contrib.distributions.python.ops import mvn_diag as mvn_lib
+from tensorflow.contrib.distributions.python.ops import poisson as poisson_lib
+from tensorflow.contrib.distributions.python.ops import wishart as wishart_lib
+from tensorflow.python.framework import constant_op
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.distributions import normal as normal_lib
+from tensorflow.python.platform import test
+
+
+class _BatchReshapeTest(object):
+
+  def make_wishart(self, dims, new_batch_shape, old_batch_shape):
+    new_batch_shape_ph = (
+        constant_op.constant(np.int32(new_batch_shape)) if self.is_static_shape
+        else array_ops.placeholder_with_default(
+            np.int32(new_batch_shape), shape=None))
+
+    scale = self.dtype([
+        [[1., 0.5],
+         [0.5, 1.]],
+        [[0.5, 0.25],
+         [0.25, 0.75]],
+    ])
+    scale = np.reshape(np.concatenate([scale, scale], axis=0),
+                       old_batch_shape + [dims, dims])
+    scale_ph = array_ops.placeholder_with_default(
+        scale, shape=scale.shape if self.is_static_shape else None)
+    wishart = wishart_lib.WishartFull(df=5, scale=scale_ph)
+    reshape_wishart = batch_reshape_lib.BatchReshape(
+        distribution=wishart,
+        batch_shape=new_batch_shape_ph,
+        validate_args=True)
+
+    return wishart, reshape_wishart
+
+  def test_matrix_variate_sample_and_log_prob(self):
+    dims = 2
+    new_batch_shape = [4]
+    old_batch_shape = [2, 2]
+    wishart, reshape_wishart = self.make_wishart(
+        dims, new_batch_shape, old_batch_shape)
+
+    batch_shape = reshape_wishart.batch_shape_tensor()
+    event_shape = reshape_wishart.event_shape_tensor()
+
+    expected_sample_shape = [3, 1] + new_batch_shape + [dims, dims]
+    x = wishart.sample([3, 1], seed=42)
+    expected_sample = array_ops.reshape(x, expected_sample_shape)
+    actual_sample = reshape_wishart.sample([3, 1], seed=42)
+
+    expected_log_prob_shape = [3, 1] + new_batch_shape
+    expected_log_prob = array_ops.reshape(
+        wishart.log_prob(x), expected_log_prob_shape)
+    actual_log_prob = reshape_wishart.log_prob(expected_sample)
+
+    with self.test_session() as sess:
+      [
+          batch_shape_,
+          event_shape_,
+          expected_sample_, actual_sample_,
+          expected_log_prob_, actual_log_prob_,
+      ] = sess.run([
+          batch_shape,
+          event_shape,
+          expected_sample, actual_sample,
+          expected_log_prob, actual_log_prob,
+      ])
+
+    self.assertAllEqual(new_batch_shape, batch_shape_)
+    self.assertAllEqual([dims, dims], event_shape_)
+    self.assertAllClose(expected_sample_, actual_sample_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_log_prob_, actual_log_prob_,
+                        atol=0., rtol=1e-6)
+    if not self.is_static_shape:
+      return
+    self.assertAllEqual(new_batch_shape, reshape_wishart.batch_shape)
+    self.assertAllEqual([dims, dims], reshape_wishart.event_shape)
+    self.assertAllEqual(expected_sample_shape, actual_sample.shape)
+    self.assertAllEqual(expected_log_prob_shape, actual_log_prob.shape)
+
+  def test_matrix_variate_stats(self):
+    dims = 2
+    new_batch_shape = [4]
+    old_batch_shape = [2, 2]
+    wishart, reshape_wishart = self.make_wishart(
+        dims, new_batch_shape, old_batch_shape)
+
+    expected_scalar_stat_shape = new_batch_shape
+    expected_matrix_stat_shape = new_batch_shape + [dims, dims]
+
+    expected_entropy = array_ops.reshape(
+        wishart.entropy(), expected_scalar_stat_shape)
+    actual_entropy = reshape_wishart.entropy()
+
+    expected_mean = array_ops.reshape(
+        wishart.mean(), expected_matrix_stat_shape)
+    actual_mean = reshape_wishart.mean()
+
+    expected_mode = array_ops.reshape(
+        wishart.mode(), expected_matrix_stat_shape)
+    actual_mode = reshape_wishart.mode()
+
+    expected_stddev = array_ops.reshape(
+        wishart.stddev(), expected_matrix_stat_shape)
+    actual_stddev = reshape_wishart.stddev()
+
+    expected_variance = array_ops.reshape(
+        wishart.variance(), expected_matrix_stat_shape)
+    actual_variance = reshape_wishart.variance()
+
+    with self.test_session() as sess:
+      [
+          expected_entropy_, actual_entropy_,
+          expected_mean_, actual_mean_,
+          expected_mode_, actual_mode_,
+          expected_stddev_, actual_stddev_,
+          expected_variance_, actual_variance_,
+      ] = sess.run([
+          expected_entropy, actual_entropy,
+          expected_mean, actual_mean,
+          expected_mode, actual_mode,
+          expected_stddev, actual_stddev,
+          expected_variance, actual_variance,
+      ])
+
+    self.assertAllClose(expected_entropy_, actual_entropy_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_mean_, actual_mean_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_mode_, actual_mode_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_stddev_, actual_stddev_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_variance_, actual_variance_,
+                        atol=0., rtol=1e-6)
+    if not self.is_static_shape:
+      return
+    self.assertAllEqual(expected_scalar_stat_shape, actual_entropy.shape)
+    self.assertAllEqual(expected_matrix_stat_shape, actual_mean.shape)
+    self.assertAllEqual(expected_matrix_stat_shape, actual_mode.shape)
+    self.assertAllEqual(expected_matrix_stat_shape, actual_stddev.shape)
+    self.assertAllEqual(expected_matrix_stat_shape, actual_variance.shape)
+
+  def make_normal(self, new_batch_shape, old_batch_shape):
+    new_batch_shape_ph = (
+        constant_op.constant(np.int32(new_batch_shape)) if self.is_static_shape
+        else array_ops.placeholder_with_default(
+            np.int32(new_batch_shape), shape=None))
+
+    scale = self.dtype(0.5 + np.arange(
+        np.prod(old_batch_shape)).reshape(old_batch_shape))
+    scale_ph = array_ops.placeholder_with_default(
+        scale, shape=scale.shape if self.is_static_shape else None)
+    normal = normal_lib.Normal(loc=self.dtype(0), scale=scale_ph)
+    reshape_normal = batch_reshape_lib.BatchReshape(
+        distribution=normal,
+        batch_shape=new_batch_shape_ph,
+        validate_args=True)
+    return normal, reshape_normal
+
+  def test_scalar_variate_sample_and_log_prob(self):
+    new_batch_shape = [2, 2]
+    old_batch_shape = [4]
+
+    normal, reshape_normal = self.make_normal(
+        new_batch_shape, old_batch_shape)
+
+    batch_shape = reshape_normal.batch_shape_tensor()
+    event_shape = reshape_normal.event_shape_tensor()
+
+    expected_sample_shape = new_batch_shape
+    x = normal.sample(seed=52)
+    expected_sample = array_ops.reshape(x, expected_sample_shape)
+    actual_sample = reshape_normal.sample(seed=52)
+
+    expected_log_prob_shape = new_batch_shape
+    expected_log_prob = array_ops.reshape(
+        normal.log_prob(x), expected_log_prob_shape)
+    actual_log_prob = reshape_normal.log_prob(expected_sample)
+
+    with self.test_session() as sess:
+      [
+          batch_shape_,
+          event_shape_,
+          expected_sample_, actual_sample_,
+          expected_log_prob_, actual_log_prob_,
+      ] = sess.run([
+          batch_shape,
+          event_shape,
+          expected_sample, actual_sample,
+          expected_log_prob, actual_log_prob,
+      ])
+    self.assertAllEqual(new_batch_shape, batch_shape_)
+    self.assertAllEqual([], event_shape_)
+    self.assertAllClose(expected_sample_, actual_sample_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_log_prob_, actual_log_prob_,
+                        atol=0., rtol=1e-6)
+    if not self.is_static_shape:
+      return
+    self.assertAllEqual(new_batch_shape, reshape_normal.batch_shape)
+    self.assertAllEqual([], reshape_normal.event_shape)
+    self.assertAllEqual(expected_sample_shape, actual_sample.shape)
+    self.assertAllEqual(expected_log_prob_shape, actual_log_prob.shape)
+
+  def test_scalar_variate_stats(self):
+    new_batch_shape = [2, 2]
+    old_batch_shape = [4]
+
+    normal, reshape_normal = self.make_normal(new_batch_shape, old_batch_shape)
+
+    expected_scalar_stat_shape = new_batch_shape
+
+    expected_entropy = array_ops.reshape(
+        normal.entropy(), expected_scalar_stat_shape)
+    actual_entropy = reshape_normal.entropy()
+
+    expected_mean = array_ops.reshape(
+        normal.mean(), expected_scalar_stat_shape)
+    actual_mean = reshape_normal.mean()
+
+    expected_mode = array_ops.reshape(
+        normal.mode(), expected_scalar_stat_shape)
+    actual_mode = reshape_normal.mode()
+
+    expected_stddev = array_ops.reshape(
+        normal.stddev(), expected_scalar_stat_shape)
+    actual_stddev = reshape_normal.stddev()
+
+    expected_variance = array_ops.reshape(
+        normal.variance(), expected_scalar_stat_shape)
+    actual_variance = reshape_normal.variance()
+
+    with self.test_session() as sess:
+      [
+          expected_entropy_, actual_entropy_,
+          expected_mean_, actual_mean_,
+          expected_mode_, actual_mode_,
+          expected_stddev_, actual_stddev_,
+          expected_variance_, actual_variance_,
+      ] = sess.run([
+          expected_entropy, actual_entropy,
+          expected_mean, actual_mean,
+          expected_mode, actual_mode,
+          expected_stddev, actual_stddev,
+          expected_variance, actual_variance,
+      ])
+    self.assertAllClose(expected_entropy_, actual_entropy_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_mean_, actual_mean_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_mode_, actual_mode_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_stddev_, actual_stddev_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_variance_, actual_variance_,
+                        atol=0., rtol=1e-6)
+    if not self.is_static_shape:
+      return
+    self.assertAllEqual(expected_scalar_stat_shape, actual_entropy.shape)
+    self.assertAllEqual(expected_scalar_stat_shape, actual_mean.shape)
+    self.assertAllEqual(expected_scalar_stat_shape, actual_mode.shape)
+    self.assertAllEqual(expected_scalar_stat_shape, actual_stddev.shape)
+    self.assertAllEqual(expected_scalar_stat_shape, actual_variance.shape)
+
+  def make_mvn(self, dims, new_batch_shape, old_batch_shape):
+    new_batch_shape_ph = (
+        constant_op.constant(np.int32(new_batch_shape)) if self.is_static_shape
+        else array_ops.placeholder_with_default(
+            np.int32(new_batch_shape), shape=None))
+
+    scale = np.ones(old_batch_shape + [dims], self.dtype)
+    scale_ph = array_ops.placeholder_with_default(
+        scale, shape=scale.shape if self.is_static_shape else None)
+    mvn = mvn_lib.MultivariateNormalDiag(scale_diag=scale_ph)
+    reshape_mvn = batch_reshape_lib.BatchReshape(
+        distribution=mvn,
+        batch_shape=new_batch_shape_ph,
+        validate_args=True)
+    return mvn, reshape_mvn
+
+  def test_vector_variate_sample_and_log_prob(self):
+    dims = 3
+    new_batch_shape = [2, 1]
+    old_batch_shape = [2]
+    mvn, reshape_mvn = self.make_mvn(
+        dims, new_batch_shape, old_batch_shape)
+
+    batch_shape = reshape_mvn.batch_shape_tensor()
+    event_shape = reshape_mvn.event_shape_tensor()
+
+    expected_sample_shape = [3] + new_batch_shape + [dims]
+    x = mvn.sample(3, seed=62)
+    expected_sample = array_ops.reshape(x, expected_sample_shape)
+    actual_sample = reshape_mvn.sample(3, seed=62)
+
+    expected_log_prob_shape = [3] + new_batch_shape
+    expected_log_prob = array_ops.reshape(
+        mvn.log_prob(x), expected_log_prob_shape)
+    actual_log_prob = reshape_mvn.log_prob(expected_sample)
+
+    with self.test_session() as sess:
+      [
+          batch_shape_,
+          event_shape_,
+          expected_sample_, actual_sample_,
+          expected_log_prob_, actual_log_prob_,
+      ] = sess.run([
+          batch_shape,
+          event_shape,
+          expected_sample, actual_sample,
+          expected_log_prob, actual_log_prob,
+      ])
+    self.assertAllEqual(new_batch_shape, batch_shape_)
+    self.assertAllEqual([dims], event_shape_)
+    self.assertAllClose(expected_sample_, actual_sample_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_log_prob_, actual_log_prob_,
+                        atol=0., rtol=1e-6)
+    if not self.is_static_shape:
+      return
+    self.assertAllEqual(new_batch_shape, reshape_mvn.batch_shape)
+    self.assertAllEqual([dims], reshape_mvn.event_shape)
+    self.assertAllEqual(expected_sample_shape, actual_sample.shape)
+    self.assertAllEqual(expected_log_prob_shape, actual_log_prob.shape)
+
+  def test_vector_variate_stats(self):
+    dims = 3
+    new_batch_shape = [2, 1]
+    old_batch_shape = [2]
+    mvn, reshape_mvn = self.make_mvn(
+        dims, new_batch_shape, old_batch_shape)
+
+    expected_scalar_stat_shape = new_batch_shape
+
+    expected_entropy = array_ops.reshape(
+        mvn.entropy(), expected_scalar_stat_shape)
+    actual_entropy = reshape_mvn.entropy()
+
+    expected_vector_stat_shape = new_batch_shape + [dims]
+
+    expected_mean = array_ops.reshape(
+        mvn.mean(), expected_vector_stat_shape)
+    actual_mean = reshape_mvn.mean()
+
+    expected_mode = array_ops.reshape(
+        mvn.mode(), expected_vector_stat_shape)
+    actual_mode = reshape_mvn.mode()
+
+    expected_stddev = array_ops.reshape(
+        mvn.stddev(), expected_vector_stat_shape)
+    actual_stddev = reshape_mvn.stddev()
+
+    expected_variance = array_ops.reshape(
+        mvn.variance(), expected_vector_stat_shape)
+    actual_variance = reshape_mvn.variance()
+
+    expected_matrix_stat_shape = new_batch_shape + [dims, dims]
+
+    expected_covariance = array_ops.reshape(
+        mvn.covariance(), expected_matrix_stat_shape)
+    actual_covariance = reshape_mvn.covariance()
+
+    with self.test_session() as sess:
+      [
+          expected_entropy_, actual_entropy_,
+          expected_mean_, actual_mean_,
+          expected_mode_, actual_mode_,
+          expected_stddev_, actual_stddev_,
+          expected_variance_, actual_variance_,
+          expected_covariance_, actual_covariance_,
+      ] = sess.run([
+          expected_entropy, actual_entropy,
+          expected_mean, actual_mean,
+          expected_mode, actual_mode,
+          expected_stddev, actual_stddev,
+          expected_variance, actual_variance,
+          expected_covariance, actual_covariance,
+      ])
+    self.assertAllClose(expected_entropy_, actual_entropy_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_mean_, actual_mean_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_mode_, actual_mode_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_stddev_, actual_stddev_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_variance_, actual_variance_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_covariance_, actual_covariance_,
+                        atol=0., rtol=1e-6)
+    if not self.is_static_shape:
+      return
+    self.assertAllEqual(expected_scalar_stat_shape, actual_entropy.shape)
+    self.assertAllEqual(expected_vector_stat_shape, actual_mean.shape)
+    self.assertAllEqual(expected_vector_stat_shape, actual_mode.shape)
+    self.assertAllEqual(expected_vector_stat_shape, actual_stddev.shape)
+    self.assertAllEqual(expected_vector_stat_shape, actual_variance.shape)
+    self.assertAllEqual(expected_matrix_stat_shape, actual_covariance.shape)
+
+  def test_bad_reshape_size(self):
+    dims = 2
+    new_batch_shape = [2, 3]
+    old_batch_shape = [2]   # 2 != 2*3
+
+    new_batch_shape_ph = (
+        constant_op.constant(np.int32(new_batch_shape)) if self.is_static_shape
+        else array_ops.placeholder_with_default(
+            np.int32(new_batch_shape), shape=None))
+
+    scale = np.ones(old_batch_shape + [dims], self.dtype)
+    scale_ph = array_ops.placeholder_with_default(
+        scale, shape=scale.shape if self.is_static_shape else None)
+    mvn = mvn_lib.MultivariateNormalDiag(scale_diag=scale_ph)
+
+    if self.is_static_shape:
+      with self.assertRaisesRegexp(
+          ValueError, (r"`batch_shape` size \(6\) must match "
+                       r"`distribution\.batch_shape` size \(2\)")):
+        batch_reshape_lib.BatchReshape(
+            distribution=mvn,
+            batch_shape=new_batch_shape_ph,
+            validate_args=True)
+
+    else:
+      with self.test_session():
+        with self.assertRaisesOpError(r"`batch_shape` size must match "
+                                      r"`distributions.batch_shape` size"):
+          batch_reshape_lib.BatchReshape(
+              distribution=mvn,
+              batch_shape=new_batch_shape_ph,
+              validate_args=True).sample().eval()
+
+  def test_non_positive_shape(self):
+    dims = 2
+    new_batch_shape = [-1, -2]   # -1*-2=2 so will pass size check.
+    old_batch_shape = [2]
+
+    new_batch_shape_ph = (
+        constant_op.constant(np.int32(new_batch_shape)) if self.is_static_shape
+        else array_ops.placeholder_with_default(
+            np.int32(new_batch_shape), shape=None))
+
+    scale = np.ones(old_batch_shape + [dims], self.dtype)
+    scale_ph = array_ops.placeholder_with_default(
+        scale, shape=scale.shape if self.is_static_shape else None)
+    mvn = mvn_lib.MultivariateNormalDiag(scale_diag=scale_ph)
+
+    if self.is_static_shape:
+      with self.assertRaisesRegexp(ValueError, r".*must be positive.*"):
+        batch_reshape_lib.BatchReshape(
+            distribution=mvn,
+            batch_shape=new_batch_shape_ph,
+            validate_args=True)
+
+    else:
+      with self.test_session():
+        with self.assertRaisesOpError(r".*must be positive.*"):
+          batch_reshape_lib.BatchReshape(
+              distribution=mvn,
+              batch_shape=new_batch_shape_ph,
+              validate_args=True).sample().eval()
+
+  def test_non_vector_shape(self):
+    dims = 2
+    new_batch_shape = 2
+    old_batch_shape = [2]
+
+    new_batch_shape_ph = (
+        constant_op.constant(np.int32(new_batch_shape)) if self.is_static_shape
+        else array_ops.placeholder_with_default(
+            np.int32(new_batch_shape), shape=None))
+
+    scale = np.ones(old_batch_shape + [dims], self.dtype)
+    scale_ph = array_ops.placeholder_with_default(
+        scale, shape=scale.shape if self.is_static_shape else None)
+    mvn = mvn_lib.MultivariateNormalDiag(scale_diag=scale_ph)
+
+    if self.is_static_shape:
+      with self.assertRaisesRegexp(ValueError, r".*must be a vector.*"):
+        batch_reshape_lib.BatchReshape(
+            distribution=mvn,
+            batch_shape=new_batch_shape_ph,
+            validate_args=True)
+
+    else:
+      with self.test_session():
+        with self.assertRaisesOpError(r".*must be a vector.*"):
+          batch_reshape_lib.BatchReshape(
+              distribution=mvn,
+              batch_shape=new_batch_shape_ph,
+              validate_args=True).sample().eval()
+
+  def test_broadcasting_explicitly_unsupported(self):
+    old_batch_shape = [4]
+    new_batch_shape = [1, 4, 1]
+    rate_ = self.dtype([1, 10, 2, 20])
+
+    rate = array_ops.placeholder_with_default(
+        rate_,
+        shape=old_batch_shape if self.is_static_shape else None)
+    poisson_4 = poisson_lib.Poisson(rate)
+    new_batch_shape_ph = (
+        constant_op.constant(np.int32(new_batch_shape)) if self.is_static_shape
+        else array_ops.placeholder_with_default(
+            np.int32(new_batch_shape), shape=None))
+    poisson_141_reshaped = batch_reshape_lib.BatchReshape(
+        poisson_4, new_batch_shape_ph, validate_args=True)
+
+    x_4 = self.dtype([2, 12, 3, 23])
+    x_114 = self.dtype([2, 12, 3, 23]).reshape(1, 1, 4)
+
+    if self.is_static_shape:
+      with self.assertRaisesRegexp(NotImplementedError,
+                                   "too few event dims"):
+        poisson_141_reshaped.log_prob(x_4)
+      with self.assertRaisesRegexp(NotImplementedError,
+                                   "unexpected batch and event shape"):
+        poisson_141_reshaped.log_prob(x_114)
+      return
+
+    with self.assertRaisesOpError("too few event dims"):
+      with self.test_session():
+        poisson_141_reshaped.log_prob(x_4).eval()
+
+    with self.assertRaisesOpError("unexpected batch and event shape"):
+      with self.test_session():
+        poisson_141_reshaped.log_prob(x_114).eval()
+
+
+class BatchReshapeStaticTest(_BatchReshapeTest, test.TestCase):
+
+  dtype = np.float32
+  is_static_shape = True
+
+
+class BatchReshapeDynamicTest(_BatchReshapeTest, test.TestCase):
+
+  dtype = np.float64
+  is_static_shape = False
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/chain_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/chain_test.py
index 20e754308449af3f0399101f4ea1bb47b3356424..a748acd667e58f9b527bab11d8bc4d086996e9f3 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/chain_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/chain_test.py
@@ -66,12 +66,10 @@ class ChainBijectorTest(test.TestCase):
   def testShapeGetters(self):
     with self.test_session():
       bijector = Chain([
-          SoftmaxCentered(
-              event_ndims=1, validate_args=True),
-          SoftmaxCentered(
-              event_ndims=0, validate_args=True)
+          SoftmaxCentered(validate_args=True),
+          SoftmaxCentered(validate_args=True),
       ])
-      x = tensor_shape.TensorShape([])
+      x = tensor_shape.TensorShape([1])
       y = tensor_shape.TensorShape([2 + 1])
       self.assertAllEqual(y, bijector.forward_event_shape(x))
       self.assertAllEqual(
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/invert_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/invert_test.py
index 28e3e3135455348debb002b7d457e785799e1564..58ba9cedb1437df4e000ce32fe39664afa76c3b5 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/invert_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/invert_test.py
@@ -37,8 +37,7 @@ class InvertBijectorTest(test.TestCase):
           bijectors.Exp(event_ndims=1),
           bijectors.Affine(shift=[0., 1.], scale_diag=[2., 3.]),
           bijectors.Softplus(event_ndims=1),
-          bijectors.SoftmaxCentered(event_ndims=1),
-          bijectors.SigmoidCentered(),
+          bijectors.SoftmaxCentered(),
       ]:
         rev = bijectors.Invert(fwd)
         self.assertEqual("_".join(["invert", fwd.name]), rev.name)
@@ -61,9 +60,9 @@ class InvertBijectorTest(test.TestCase):
 
   def testShapeGetters(self):
     with self.test_session():
-      bijector = bijectors.Invert(bijectors.SigmoidCentered(validate_args=True))
+      bijector = bijectors.Invert(bijectors.SoftmaxCentered(validate_args=True))
       x = tensor_shape.TensorShape([2])
-      y = tensor_shape.TensorShape([])
+      y = tensor_shape.TensorShape([1])
       self.assertAllEqual(y, bijector.forward_event_shape(x))
       self.assertAllEqual(
           y.as_list(),
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/kumaraswamy_bijector_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/kumaraswamy_bijector_test.py
index ad11d9f2484c4b08c67c5f82aec1320475d1d983..074b5f275d107fa49de42df262476bd4aa48ffae 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/kumaraswamy_bijector_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/kumaraswamy_bijector_test.py
@@ -69,7 +69,7 @@ class KumaraswamyBijectorTest(test.TestCase):
       bijector = Kumaraswamy(
           concentration1=concentration1,
           concentration0=concentration0, validate_args=True)
-      # Omitting the endpoints 0 and 1, since idlj will be inifinity at these
+      # Omitting the endpoints 0 and 1, since idlj will be infinity at these
       # endpoints.
       y = np.linspace(.01, 0.99, num=10).astype(np.float32)
       x = 1 - (1 - y ** concentration1) ** concentration0
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sigmoid_centered_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sigmoid_centered_test.py
deleted file mode 100644
index 4ff3f334ccb59f1c117b3d35032d9e799cfd79bb..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sigmoid_centered_test.py
+++ /dev/null
@@ -1,57 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for Bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.contrib.distributions.python.ops.bijectors.sigmoid_centered import SigmoidCentered
-from tensorflow.python.platform import test
-
-
-class SigmoidCenteredBijectorTest(test.TestCase):
-  """Tests correctness of the Y = g(X) = (1 + exp(-X))^-1 transformation."""
-
-  def testBijector(self):
-    with self.test_session():
-      sigmoid = SigmoidCentered()
-      self.assertEqual("sigmoid_centered", sigmoid.name)
-      x = np.log([[2., 3, 4],
-                  [4., 8, 12]])
-      y = [[[2. / 3, 1. / 3],
-            [3. / 4, 1. / 4],
-            [4. / 5, 1. / 5]],
-           [[4. / 5, 1. / 5],
-            [8. / 9, 1. / 9],
-            [12. / 13, 1. / 13]]]
-      self.assertAllClose(y, sigmoid.forward(x).eval())
-      self.assertAllClose(x, sigmoid.inverse(y).eval())
-      self.assertAllClose(
-          -np.sum(np.log(y), axis=2),
-          sigmoid.inverse_log_det_jacobian(y).eval(),
-          atol=0.,
-          rtol=1e-7)
-      self.assertAllClose(
-          -sigmoid.inverse_log_det_jacobian(y).eval(),
-          sigmoid.forward_log_det_jacobian(x).eval(),
-          atol=0.,
-          rtol=1e-7)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softmax_centered_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softmax_centered_test.py
index 4a7679daad6f6acc632eb9133078499dda89e43d..cad4dd1ac8de0da6405aacb9047714b37eec73e3 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softmax_centered_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softmax_centered_test.py
@@ -34,34 +34,9 @@ rng = np.random.RandomState(42)
 class SoftmaxCenteredBijectorTest(test.TestCase):
   """Tests correctness of the Y = g(X) = exp(X) / sum(exp(X)) transformation."""
 
-  def testBijectorScalar(self):
-    with self.test_session():
-      softmax = SoftmaxCentered()  # scalar by default
-      self.assertEqual("softmax_centered", softmax.name)
-      x = np.log([[2., 3, 4],
-                  [4., 8, 12]])
-      y = [[[2. / 3, 1. / 3],
-            [3. / 4, 1. / 4],
-            [4. / 5, 1. / 5]],
-           [[4. / 5, 1. / 5],
-            [8. / 9, 1. / 9],
-            [12. / 13, 1. / 13]]]
-      self.assertAllClose(y, softmax.forward(x).eval())
-      self.assertAllClose(x, softmax.inverse(y).eval())
-      self.assertAllClose(
-          -np.sum(np.log(y), axis=2),
-          softmax.inverse_log_det_jacobian(y).eval(),
-          atol=0.,
-          rtol=1e-7)
-      self.assertAllClose(
-          -softmax.inverse_log_det_jacobian(y).eval(),
-          softmax.forward_log_det_jacobian(x).eval(),
-          atol=0.,
-          rtol=1e-7)
-
   def testBijectorVector(self):
     with self.test_session():
-      softmax = SoftmaxCentered(event_ndims=1)
+      softmax = SoftmaxCentered()
       self.assertEqual("softmax_centered", softmax.name)
       x = np.log([[2., 3, 4], [4., 8, 12]])
       y = [[0.2, 0.3, 0.4, 0.1], [0.16, 0.32, 0.48, 0.04]]
@@ -80,7 +55,7 @@ class SoftmaxCenteredBijectorTest(test.TestCase):
 
   def testBijectorUnknownShape(self):
     with self.test_session():
-      softmax = SoftmaxCentered(event_ndims=1)
+      softmax = SoftmaxCentered()
       self.assertEqual("softmax_centered", softmax.name)
       x = array_ops.placeholder(shape=[2, None], dtype=dtypes.float32)
       real_x = np.log([[2., 3, 4], [4., 8, 12]])
@@ -106,24 +81,21 @@ class SoftmaxCenteredBijectorTest(test.TestCase):
 
   def testShapeGetters(self):
     with self.test_session():
-      for x, y, b in ((tensor_shape.TensorShape([]),
-                       tensor_shape.TensorShape([2]),
-                       SoftmaxCentered(
-                           event_ndims=0, validate_args=True)),
-                      (tensor_shape.TensorShape([4]),
-                       tensor_shape.TensorShape([5]),
-                       SoftmaxCentered(
-                           event_ndims=1, validate_args=True))):
-        self.assertAllEqual(y, b.forward_event_shape(x))
-        self.assertAllEqual(y.as_list(),
-                            b.forward_event_shape_tensor(x.as_list()).eval())
-        self.assertAllEqual(x, b.inverse_event_shape(y))
-        self.assertAllEqual(x.as_list(),
-                            b.inverse_event_shape_tensor(y.as_list()).eval())
+      x = tensor_shape.TensorShape([4])
+      y = tensor_shape.TensorShape([5])
+      bijector = SoftmaxCentered(validate_args=True)
+      self.assertAllEqual(y, bijector.forward_event_shape(x))
+      self.assertAllEqual(y.as_list(),
+                          bijector.forward_event_shape_tensor(
+                              x.as_list()).eval())
+      self.assertAllEqual(x, bijector.inverse_event_shape(y))
+      self.assertAllEqual(x.as_list(),
+                          bijector.inverse_event_shape_tensor(
+                              y.as_list()).eval())
 
   def testBijectiveAndFinite(self):
     with self.test_session():
-      softmax = SoftmaxCentered(event_ndims=1)
+      softmax = SoftmaxCentered()
       x = np.linspace(-50, 50, num=10).reshape(5, 2).astype(np.float32)
       # Make y values on the simplex with a wide range.
       y_0 = np.ones(5).astype(np.float32)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/distribution_test.py b/tensorflow/contrib/distributions/python/kernel_tests/distribution_test.py
index 507ceb35853ebe0a996d789b3bdf8a5f2284549c..68e0d9cb8277f3953039963fec0da499db7a16d1 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/distribution_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/distribution_test.py
@@ -16,6 +16,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.contrib import distributions
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -25,23 +27,23 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test
 
-ds = distributions
+tfd = distributions
 
 
 class DistributionTest(test.TestCase):
 
   def testParamShapesAndFromParams(self):
     classes = [
-        ds.Normal,
-        ds.Bernoulli,
-        ds.Beta,
-        ds.Chi2,
-        ds.Exponential,
-        ds.Gamma,
-        ds.InverseGamma,
-        ds.Laplace,
-        ds.StudentT,
-        ds.Uniform,
+        tfd.Normal,
+        tfd.Bernoulli,
+        tfd.Beta,
+        tfd.Chi2,
+        tfd.Exponential,
+        tfd.Gamma,
+        tfd.InverseGamma,
+        tfd.Laplace,
+        tfd.StudentT,
+        tfd.Uniform,
     ]
 
     sample_shapes = [(), (10,), (10, 20, 30)]
@@ -63,15 +65,15 @@ class DistributionTest(test.TestCase):
     with self.test_session():
       # Note: we cannot easily test all distributions since each requires
       # different initialization arguments. We therefore spot test a few.
-      normal = ds.Normal(loc=1., scale=2., validate_args=True)
+      normal = tfd.Normal(loc=1., scale=2., validate_args=True)
       self.assertEqual(normal.parameters, normal.copy().parameters)
-      wishart = ds.WishartFull(df=2, scale=[[1., 2], [2, 5]],
-                               validate_args=True)
+      wishart = tfd.WishartFull(df=2, scale=[[1., 2], [2, 5]],
+                                validate_args=True)
       self.assertEqual(wishart.parameters, wishart.copy().parameters)
 
   def testCopyOverride(self):
     with self.test_session():
-      normal = ds.Normal(loc=1., scale=2., validate_args=True)
+      normal = tfd.Normal(loc=1., scale=2., validate_args=True)
       unused_normal_copy = normal.copy(validate_args=False)
       base_params = normal.parameters.copy()
       copy_params = normal.copy(validate_args=False).parameters.copy()
@@ -84,19 +86,19 @@ class DistributionTest(test.TestCase):
       mu = 1.
       sigma = 2.
 
-      normal = ds.Normal(mu, sigma, validate_args=True)
+      normal = tfd.Normal(mu, sigma, validate_args=True)
       self.assertTrue(tensor_util.constant_value(normal.is_scalar_event()))
       self.assertTrue(tensor_util.constant_value(normal.is_scalar_batch()))
 
-      normal = ds.Normal([mu], [sigma], validate_args=True)
+      normal = tfd.Normal([mu], [sigma], validate_args=True)
       self.assertTrue(tensor_util.constant_value(normal.is_scalar_event()))
       self.assertFalse(tensor_util.constant_value(normal.is_scalar_batch()))
 
-      mvn = ds.MultivariateNormalDiag([mu], [sigma], validate_args=True)
+      mvn = tfd.MultivariateNormalDiag([mu], [sigma], validate_args=True)
       self.assertFalse(tensor_util.constant_value(mvn.is_scalar_event()))
       self.assertTrue(tensor_util.constant_value(mvn.is_scalar_batch()))
 
-      mvn = ds.MultivariateNormalDiag([[mu]], [[sigma]], validate_args=True)
+      mvn = tfd.MultivariateNormalDiag([[mu]], [[sigma]], validate_args=True)
       self.assertFalse(tensor_util.constant_value(mvn.is_scalar_event()))
       self.assertFalse(tensor_util.constant_value(mvn.is_scalar_batch()))
 
@@ -126,7 +128,7 @@ class DistributionTest(test.TestCase):
       self.assertFalse(is_scalar.eval(feed_dict={x: [1]}))
 
   def _GetFakeDistribution(self):
-    class FakeDistribution(ds.Distribution):
+    class FakeDistribution(tfd.Distribution):
       """Fake Distribution for testing _set_sample_static_shape."""
 
       def __init__(self, batch_shape=None, event_shape=None):
@@ -188,6 +190,105 @@ class DistributionTest(test.TestCase):
       y = dist._set_sample_static_shape(x, sample_shape)
       self.assertTrue(y.get_shape().ndims is None)
 
+  def testStrWorksCorrectlyScalar(self):
+    normal = tfd.Normal(loc=np.float16(0), scale=np.float16(1))
+    self.assertEqual(
+        ("tf.distributions.Normal("
+         "\"Normal\", "
+         "batch_shape=(), "
+         "event_shape=(), "
+         "dtype=float16)"),  # Got the dtype right.
+        str(normal))
+
+    chi2 = tfd.Chi2(df=np.float32([1., 2.]), name="silly")
+    self.assertEqual(
+        ("tf.distributions.Chi2("
+         "\"silly\", "  # What a silly name that is!
+         "batch_shape=(2,), "
+         "event_shape=(), "
+         "dtype=float32)"),
+        str(chi2))
+
+    exp = tfd.Exponential(rate=array_ops.placeholder(dtype=dtypes.float32))
+    self.assertEqual(
+        ("tf.distributions.Exponential(\"Exponential\", "
+         # No batch shape.
+         "event_shape=(), "
+         "dtype=float32)"),
+        str(exp))
+
+  def testStrWorksCorrectlyMultivariate(self):
+    mvn_static = tfd.MultivariateNormalDiag(
+        loc=np.zeros([2, 2]), name="MVN")
+    self.assertEqual(
+        ("tf.distributions.MultivariateNormalDiag("
+         "\"MVN\", "
+         "batch_shape=(2,), "
+         "event_shape=(2,), "
+         "dtype=float64)"),
+        str(mvn_static))
+
+    mvn_dynamic = tfd.MultivariateNormalDiag(
+        loc=array_ops.placeholder(shape=[None, 3], dtype=dtypes.float32),
+        name="MVN2")
+    self.assertEqual(
+        ("tf.distributions.MultivariateNormalDiag("
+         "\"MVN2\", "
+         "batch_shape=(?,), "  # Partially known.
+         "event_shape=(3,), "
+         "dtype=float32)"),
+        str(mvn_dynamic))
+
+  def testReprWorksCorrectlyScalar(self):
+    normal = tfd.Normal(loc=np.float16(0), scale=np.float16(1))
+    self.assertEqual(
+        ("<tf.distributions.Normal"
+         " 'Normal'"
+         " batch_shape=()"
+         " event_shape=()"
+         " dtype=float16>"),  # Got the dtype right.
+        repr(normal))
+
+    chi2 = tfd.Chi2(df=np.float32([1., 2.]), name="silly")
+    self.assertEqual(
+        ("<tf.distributions.Chi2"
+         " 'silly'"  # What a silly name that is!
+         " batch_shape=(2,)"
+         " event_shape=()"
+         " dtype=float32>"),
+        repr(chi2))
+
+    exp = tfd.Exponential(rate=array_ops.placeholder(dtype=dtypes.float32))
+    self.assertEqual(
+        ("<tf.distributions.Exponential"
+         " 'Exponential'"
+         " batch_shape=<unknown>"
+         " event_shape=()"
+         " dtype=float32>"),
+        repr(exp))
+
+  def testReprWorksCorrectlyMultivariate(self):
+    mvn_static = tfd.MultivariateNormalDiag(
+        loc=np.zeros([2, 2]), name="MVN")
+    self.assertEqual(
+        ("<tf.distributions.MultivariateNormalDiag"
+         " 'MVN'"
+         " batch_shape=(2,)"
+         " event_shape=(2,)"
+         " dtype=float64>"),
+        repr(mvn_static))
+
+    mvn_dynamic = tfd.MultivariateNormalDiag(
+        loc=array_ops.placeholder(shape=[None, 3], dtype=dtypes.float32),
+        name="MVN2")
+    self.assertEqual(
+        ("<tf.distributions.MultivariateNormalDiag"
+         " 'MVN2'"
+         " batch_shape=(?,)"  # Partially known.
+         " event_shape=(3,)"
+         " dtype=float32>"),
+        repr(mvn_dynamic))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/sample_stats_test.py b/tensorflow/contrib/distributions/python/kernel_tests/sample_stats_test.py
index 4186cf129dbf31724c84133734da3f226817c71a..ea04e8c29a2c94d4939bad277afa380401067ff2 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/sample_stats_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/sample_stats_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.contrib.distributions.python.ops import sample_stats
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import spectral_ops_test_util
 from tensorflow.python.platform import test
 
@@ -455,6 +456,16 @@ class PercentileTestWithNearestInterpolation(test.TestCase):
       with self.assertRaisesOpError("rank"):
         pct.eval(feed_dict={q_ph: [0.5]})
 
+  def test_finds_max_of_long_array(self):
+    # d - 1 == d in float32 and d = 3e7.
+    # So this test only passes if we use double for the percentile indices.
+    # If float is used, it fails with InvalidArgumentError about an index out of
+    # bounds.
+    x = math_ops.linspace(0., 3e7, num=int(3e7))
+    with self.test_session():
+      minval = sample_stats.percentile(x, q=0, validate_args=True)
+      self.assertAllEqual(0, minval.eval())
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/statistical_testing_test.py b/tensorflow/contrib/distributions/python/kernel_tests/statistical_testing_test.py
index 3548ac18078a0b40f117c2bf9e2b34d20cee163b..0400c80c29cf0c36090168b7a1a6358ad49fde49 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/statistical_testing_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/statistical_testing_test.py
@@ -22,39 +22,75 @@ import numpy as np
 
 from tensorflow.contrib.distributions.python.ops import statistical_testing as st
 from tensorflow.python.framework import errors
-from tensorflow.python.ops import check_ops
 from tensorflow.python.platform import test
 
 
 class StatisticalTestingTest(test.TestCase):
 
   def test_dkwm_design_mean_one_sample_soundness(self):
-    numbers = [1e-5, 1e-2, 1.1e-1, 0.9, 1., 1.02, 2., 10., 1e2, 1e5, 1e10]
+    thresholds = [1e-5, 1e-2, 1.1e-1, 0.9, 1., 1.02, 2., 10., 1e2, 1e5, 1e10]
     rates = [1e-6, 1e-3, 1e-2, 1.1e-1, 0.2, 0.5, 0.7, 1.]
-    with self.test_session() as sess:
-      for ff in rates:
-        for fp in rates:
-          sufficient_n = st.min_num_samples_for_dkwm_mean_test(
-              numbers, 0., 1., false_fail_rate=ff, false_pass_rate=fp)
-          detectable_d = st.min_discrepancy_of_true_means_detectable_by_dkwm(
-              sufficient_n, 0., 1., false_fail_rate=ff, false_pass_rate=fp)
-          sess.run(check_ops.assert_less_equal(detectable_d, numbers))
+    false_fail_rates, false_pass_rates = np.meshgrid(rates, rates)
+    false_fail_rates = false_fail_rates.flatten().astype(np.float32)
+    false_pass_rates = false_pass_rates.flatten().astype(np.float32)
+
+    detectable_discrepancies = []
+    for false_pass_rate, false_fail_rate in zip(
+        false_pass_rates, false_fail_rates):
+      sufficient_n = st.min_num_samples_for_dkwm_mean_test(
+          thresholds, low=0., high=1., false_fail_rate=false_fail_rate,
+          false_pass_rate=false_pass_rate)
+      detectable_discrepancies.append(
+          st.min_discrepancy_of_true_means_detectable_by_dkwm(
+              sufficient_n, low=0., high=1., false_fail_rate=false_fail_rate,
+              false_pass_rate=false_pass_rate))
+
+    detectable_discrepancies_ = self.evaluate(detectable_discrepancies)
+    for discrepancies, false_pass_rate, false_fail_rate in zip(
+        detectable_discrepancies_, false_pass_rates, false_fail_rates):
+      below_threshold = discrepancies <= thresholds
+      self.assertAllEqual(
+          np.ones_like(below_threshold, np.bool), below_threshold,
+          msg='false_pass_rate({}), false_fail_rate({})'.format(
+              false_pass_rate, false_fail_rate))
 
   def test_dkwm_design_mean_two_sample_soundness(self):
-    numbers = [1e-5, 1e-2, 1.1e-1, 0.9, 1., 1.02, 2., 10., 1e2, 1e5, 1e10]
+    thresholds = [1e-5, 1e-2, 1.1e-1, 0.9, 1., 1.02, 2., 10., 1e2, 1e5, 1e10]
     rates = [1e-6, 1e-3, 1e-2, 1.1e-1, 0.2, 0.5, 0.7, 1.]
-    with self.test_session() as sess:
-      for ff in rates:
-        for fp in rates:
-          (sufficient_n1,
-           sufficient_n2) = st.min_num_samples_for_dkwm_mean_two_sample_test(
-               numbers, 0., 1., 0., 1.,
-               false_fail_rate=ff, false_pass_rate=fp)
-          d_fn = st.min_discrepancy_of_true_means_detectable_by_dkwm_two_sample
-          detectable_d = d_fn(
-              sufficient_n1, 0., 1., sufficient_n2, 0., 1.,
-              false_fail_rate=ff, false_pass_rate=fp)
-          sess.run(check_ops.assert_less_equal(detectable_d, numbers))
+    false_fail_rates, false_pass_rates = np.meshgrid(rates, rates)
+    false_fail_rates = false_fail_rates.flatten().astype(np.float32)
+    false_pass_rates = false_pass_rates.flatten().astype(np.float32)
+
+    detectable_discrepancies = []
+    for false_pass_rate, false_fail_rate in zip(
+        false_pass_rates, false_fail_rates):
+      [
+          sufficient_n1,
+          sufficient_n2
+      ] = st.min_num_samples_for_dkwm_mean_two_sample_test(
+          thresholds, low1=0., high1=1., low2=0., high2=1.,
+          false_fail_rate=false_fail_rate,
+          false_pass_rate=false_pass_rate)
+
+      detectable_discrepancies.append(
+          st.min_discrepancy_of_true_means_detectable_by_dkwm_two_sample(
+              n1=sufficient_n1,
+              low1=0.,
+              high1=1.,
+              n2=sufficient_n2,
+              low2=0.,
+              high2=1.,
+              false_fail_rate=false_fail_rate,
+              false_pass_rate=false_pass_rate))
+
+    detectable_discrepancies_ = self.evaluate(detectable_discrepancies)
+    for discrepancies, false_pass_rate, false_fail_rate in zip(
+        detectable_discrepancies_, false_pass_rates, false_fail_rates):
+      below_threshold = discrepancies <= thresholds
+      self.assertAllEqual(
+          np.ones_like(below_threshold, np.bool), below_threshold,
+          msg='false_pass_rate({}), false_fail_rate({})'.format(
+              false_pass_rate, false_fail_rate))
 
   def test_true_mean_confidence_interval_by_dkwm_one_sample(self):
     rng = np.random.RandomState(seed=0)
@@ -105,16 +141,16 @@ class StatisticalTestingTest(test.TestCase):
 
   def test_dkwm_mean_two_sample_assertion(self):
     rng = np.random.RandomState(seed=0)
-    num_samples = 15000
+    num_samples = 4000
 
-    # 15000 samples is chosen to be enough to find discrepancies of
-    # size 0.1 or more with assurance 1e-6, as confirmed here:
+    # 4000 samples is chosen to be enough to find discrepancies of
+    # size 0.2 or more with assurance 1e-6, as confirmed here:
     with self.test_session() as sess:
       d = st.min_discrepancy_of_true_means_detectable_by_dkwm_two_sample(
           num_samples, 0., 1., num_samples, 0., 1.,
           false_fail_rate=1e-6, false_pass_rate=1e-6)
       d = sess.run(d)
-      self.assertLess(d, 0.1)
+      self.assertLess(d, 0.2)
 
     # Test that the test assertion agrees that the standard
     # uniform distribution has the same mean as itself.
@@ -124,6 +160,15 @@ class StatisticalTestingTest(test.TestCase):
       sess.run(st.assert_true_mean_equal_by_dkwm_two_sample(
           samples1, 0., 1., samples2, 0., 1., false_fail_rate=1e-6))
 
+  def test_dkwm_mean_two_sample_assertion_beta_2_1_false(self):
+    rng = np.random.RandomState(seed=0)
+    num_samples = 4000
+    samples1 = rng.uniform(size=num_samples).astype(np.float32)
+
+    # As established above, 4000 samples is enough to find discrepancies
+    # of size 0.2 or more with assurance 1e-6.
+
+    with self.test_session() as sess:
       # Test that the test assertion confirms that the mean of the
       # standard uniform distribution is different from the mean of beta(2, 1).
       beta_high_samples = rng.beta(2, 1, size=num_samples).astype(np.float32)
@@ -133,6 +178,15 @@ class StatisticalTestingTest(test.TestCase):
             beta_high_samples, 0., 1.,
             false_fail_rate=1e-6))
 
+  def test_dkwm_mean_two_sample_assertion_beta_1_2_false(self):
+    rng = np.random.RandomState(seed=0)
+    num_samples = 4000
+    samples1 = rng.uniform(size=num_samples).astype(np.float32)
+
+    # As established above, 4000 samples is enough to find discrepancies
+    # of size 0.2 or more with assurance 1e-6.
+
+    with self.test_session() as sess:
       # Test that the test assertion confirms that the mean of the
       # standard uniform distribution is different from the mean of beta(1, 2).
       beta_low_samples = rng.beta(1, 2, size=num_samples).astype(np.float32)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/transformed_distribution_test.py b/tensorflow/contrib/distributions/python/kernel_tests/transformed_distribution_test.py
index af13553c32bdb6ef4038daa5e4bbef3251cff2f3..f0ba1ec3eb57c67c1a0edb15639e91916a4509b7 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/transformed_distribution_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/transformed_distribution_test.py
@@ -186,12 +186,14 @@ class TransformedDistributionTest(test.TestCase):
       standard_normal = ds.Normal(loc=0., scale=1.)
       multi_logit_normal = self._cls()(
           distribution=standard_normal,
-          bijector=softmax)
-      x = [[-np.log(3.), 0.],
-           [np.log(3), np.log(5)]]
+          bijector=softmax,
+          event_shape=[1])
+      x = [[[-np.log(3.)], [0.]],
+           [[np.log(3)], [np.log(5)]]]
       y = softmax.forward(x).eval()
-      expected_log_pdf = (stats.norm(loc=0., scale=1.).logpdf(x) -
-                          np.sum(np.log(y), axis=-1))
+      expected_log_pdf = (
+          np.squeeze(stats.norm(loc=0., scale=1.).logpdf(x)) -
+          np.sum(np.log(y), axis=-1))
       self.assertAllClose(expected_log_pdf,
                           multi_logit_normal.log_prob(y).eval())
       self.assertAllClose(
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py b/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py
index 9044aa2850ae35f29cd48b0c5f54aa948bea0408..dcecce981f16a2d9e772d4e40062ff250725c3ac 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py
@@ -390,6 +390,26 @@ class WishartCholeskyTest(test.TestCase):
                 chol_scale, dtype=np.int32),
             validate_args=False)
 
+  def testSampleBroadcasts(self):
+    dims = 2
+    batch_shape = [2, 3]
+    sample_shape = [2, 1]
+    scale = np.float32([
+        [[1., 0.5],
+         [0.5, 1.]],
+        [[0.5, 0.25],
+         [0.25, 0.75]],
+    ])
+    scale = np.reshape(np.concatenate([scale, scale, scale], axis=0),
+                       batch_shape + [dims, dims])
+    wishart = distributions.WishartFull(df=5, scale=scale)
+    x = wishart.sample(sample_shape, seed=42)
+    with self.test_session() as sess:
+      x_ = sess.run(x)
+    expected_shape = sample_shape + batch_shape + [dims, dims]
+    self.assertAllEqual(expected_shape, x.shape)
+    self.assertAllEqual(expected_shape, x_.shape)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distributions/python/ops/autoregressive.py b/tensorflow/contrib/distributions/python/ops/autoregressive.py
index 852298bf334666db003353d5fc8e172ffb738668..69f3d57ff000d6c9acc8aa9e3d0ad8d9cbb6bb3c 100644
--- a/tensorflow/contrib/distributions/python/ops/autoregressive.py
+++ b/tensorflow/contrib/distributions/python/ops/autoregressive.py
@@ -36,7 +36,8 @@ class Autoregressive(distribution_lib.Distribution):
     "Autoregressive models decompose the joint density as a product of
     conditionals, and model each conditional in turn. Normalizing flows
     transform a base density (e.g. a standard Gaussian) into the target density
-    by an invertible transformation with tractable Jacobian." [1]
+    by an invertible transformation with tractable Jacobian." [(Papamakarios et
+    al., 2016)][1]
 
   In other words, the "autoregressive property" is equivalent to the
   decomposition, `p(x) = prod{ p(x[i] | x[0:i]) : i=0, ..., d }`. The provided
@@ -45,17 +46,18 @@ class Autoregressive(distribution_lib.Distribution):
 
   Practically speaking the autoregressive property means that there exists a
   permutation of the event coordinates such that each coordinate is a
-  diffeomorphic function of only preceding coordinates. [2]
+  diffeomorphic function of only preceding coordinates
+  [(van den Oord et al., 2016)][2].
 
   #### Mathematical Details
 
-  The probability function is,
+  The probability function is
 
   ```none
   prob(x; fn, n) = fn(x).prob(x)
   ```
 
-  And a sample is generated by,
+  And a sample is generated by
 
   ```none
   x = fn(...fn(fn(x0).sample()).sample()).sample()
@@ -93,13 +95,15 @@ class Autoregressive(distribution_lib.Distribution):
 
   ```
 
-  [1]: "Masked Autoregressive Flow for Density Estimation."
-       George Papamakarios, Theo Pavlakou, Iain Murray. Arxiv. 2017.
-       https://arxiv.org/abs/1705.07057
+  #### References
 
-  [2]: "Conditional Image Generation with PixelCNN Decoders."
-       Aaron van den Oord, Nal Kalchbrenner, Oriol Vinyals, Lasse Espeholt, Alex
-       Graves, Koray Kavukcuoglu. Arxiv, 2016.
+  [1]: George Papamakarios, Theo Pavlakou, and Iain Murray. Masked
+       Autoregressive Flow for Density Estimation. In _Neural Information
+       Processing Systems_, 2017. https://arxiv.org/abs/1705.07057
+
+  [2]: Aaron van den Oord, Nal Kalchbrenner, Oriol Vinyals, Lasse Espeholt,
+       Alex Graves, and Koray Kavukcuoglu. Conditional Image Generation with
+       PixelCNN Decoders. In _Neural Information Processing Systems_, 2016.
        https://arxiv.org/abs/1606.05328
   """
 
diff --git a/tensorflow/contrib/distributions/python/ops/batch_reshape.py b/tensorflow/contrib/distributions/python/ops/batch_reshape.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e6c35e0d6076113839481678abd3c20f8fb5db9
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/batch_reshape.py
@@ -0,0 +1,415 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""The BatchReshape distribution."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import distribution as distribution_lib
+
+
+__all__ = [
+    "BatchReshape",
+]
+
+
+class BatchReshape(distribution_lib.Distribution):
+  """The Batch-Reshaping distribution.
+
+  This "meta-distribution" reshapes the batch dimensions of another
+  distribution.
+
+  Note: Unlike `tf.reshape`, the `BatchReshape` distribution does not support
+  `-1` for flattening.
+
+  #### Examples
+
+  ```python
+  tfd = tf.contrib.distributions
+
+  dtype = np.float32
+  dims = 2
+  new_batch_shape = [1, 2, 3]
+  old_batch_shape = [6]
+
+  scale = np.ones(old_batch_shape + [dims], dtype)
+  mvn = tfd.MultivariateNormalDiag(scale_diag=scale)
+  reshape_mvn = tfd.BatchReshape(
+      distribution=mvn,
+      batch_shape=new_batch_shape,
+      validate_args=True)
+
+  reshape_mvn.batch_shape
+  # ==> [1, 2, 3]
+
+  x = reshape_mvn.sample(sample_shape=[4, 5])
+  x.shape
+  # ==> [4, 5, 1, 2, 3, 2] == sample_shape + new_batch_shape + [dims]
+
+  reshape_mvn.log_prob(x).shape
+  # ==> [4, 5, 1, 2, 3] == sample_shape + new_batch_shape
+  ```
+
+  """
+
+  def __init__(self,
+               distribution,
+               batch_shape,
+               validate_args=False,
+               allow_nan_stats=True,
+               name=None):
+    """Construct BatchReshape distribution.
+
+    Args:
+      distribution: The base distribution instance to reshape. Typically an
+        instance of `Distribution`.
+      batch_shape: Positive `int`-like vector-shaped `Tensor` representing the
+        new shape of the batch dimensions.
+      validate_args: Python `bool`, default `False`. When `True` distribution
+        parameters are checked for validity despite possibly degrading runtime
+        performance. When `False` invalid inputs may silently render incorrect
+        outputs.
+      allow_nan_stats: Python `bool`, default `True`. When `True`, statistics
+        (e.g., mean, mode, variance) use the value "`NaN`" to indicate the
+        result is undefined. When `False`, an exception is raised if one or
+        more of the statistic's batch members are undefined.
+      name: The name to give Ops created by the initializer.
+        Default value: `"BatchReshape" + distribution.name`.
+
+    Raises:
+      ValueError: if `batch_shape` is not a vector.
+      ValueError: if `batch_shape` has non-positive elements.
+      ValueError: if `batch_shape` size is not the same as a
+        `distribution.batch_shape` size.
+    """
+    parameters = locals()
+    name = name or "BatchReshape" + distribution.name
+    self._distribution = distribution
+    with ops.name_scope(name, values=[batch_shape]) as name:
+      self._batch_shape_ = ops.convert_to_tensor(
+          batch_shape,
+          dtype=dtypes.int32,
+          name="batch_shape")
+      self._batch_shape_static = tensor_util.constant_value(self._batch_shape_)
+      if self._batch_shape_static is not None:
+        self._batch_shape_static = np.int32(self._batch_shape_static)
+      self._runtime_assertions = validate_init_args(
+          self._distribution,
+          self._batch_shape_,
+          validate_args,
+          self._batch_shape_static)
+      super(BatchReshape, self).__init__(
+          dtype=self._distribution.dtype,
+          reparameterization_type=self._distribution.reparameterization_type,
+          validate_args=validate_args,
+          allow_nan_stats=allow_nan_stats,
+          parameters=parameters,
+          graph_parents=(
+              [self._batch_shape_] +
+              self._distribution._graph_parents),  # pylint: disable=protected-access
+          name=name)
+
+  @property
+  def distribution(self):
+    return self._distribution
+
+  def _batch_shape_tensor(self):
+    with ops.control_dependencies(self._runtime_assertions):
+      return array_ops.identity(self._batch_shape_)
+
+  def _batch_shape(self):
+    return tensor_shape.TensorShape(self._batch_shape_static)
+
+  def _event_shape_tensor(self):
+    with ops.control_dependencies(self._runtime_assertions):
+      return array_ops.identity(self.distribution.event_shape_tensor())
+
+  def _event_shape(self):
+    return self.distribution.event_shape
+
+  def _sample_n(self, n, seed=None):
+    with ops.control_dependencies(self._runtime_assertions):
+      x = self.distribution.sample(sample_shape=n, seed=seed)
+      new_shape = array_ops.concat([
+          [n],
+          self.batch_shape_tensor(),
+          self.event_shape_tensor(),
+      ], axis=0)
+      return array_ops.reshape(x, new_shape)
+
+  def _log_prob(self, x):
+    return self._call_reshape_input_output(
+        self.distribution.log_prob, x)
+
+  def _prob(self, x):
+    return self._call_reshape_input_output(
+        self.distribution.prob, x)
+
+  def _log_cdf(self, x):
+    return self._call_reshape_input_output(
+        self.distribution.log_cdf, x)
+
+  def _cdf(self, x):
+    return self._call_reshape_input_output(
+        self.distribution.cdf, x)
+
+  def _log_survival_function(self, x):
+    return self._call_reshape_input_output(
+        self.distribution.log_survival_function, x)
+
+  def _survival_function(self, x):
+    return self._call_reshape_input_output(
+        self.distribution.survival_function, x)
+
+  def _entropy(self):
+    return self._call_and_reshape_output(
+        self.distribution.entropy,
+        [],
+        [tensor_shape.scalar()])
+
+  def _mean(self):
+    return self._call_and_reshape_output(self.distribution.mean)
+
+  def _mode(self):
+    return self._call_and_reshape_output(self.distribution.mode)
+
+  def _stddev(self):
+    return self._call_and_reshape_output(self.distribution.stddev)
+
+  def _variance(self):
+    return self._call_and_reshape_output(self.distribution.variance)
+
+  def _covariance(self):
+    return self._call_and_reshape_output(
+        self.distribution.covariance,
+        [self.event_shape_tensor()]*2,
+        [self.event_shape]*2)
+
+  def _sample_shape(self, x):
+    """Computes graph and static `sample_shape`."""
+    x_ndims = (array_ops.rank(x) if x.shape.ndims is None else x.shape.ndims)
+    event_ndims = (array_ops.size(self.event_shape_tensor())
+                   if self.event_shape.ndims is None
+                   else self.event_shape.ndims)
+    batch_ndims = (array_ops.size(self.batch_shape_tensor())
+                   if self.batch_shape.ndims is None
+                   else self.batch_shape.ndims)
+    sample_ndims = x_ndims - batch_ndims - event_ndims
+    if isinstance(sample_ndims, int):
+      static_sample_shape = x.shape[:sample_ndims]
+    else:
+      static_sample_shape = tensor_shape.TensorShape(None)
+    if static_sample_shape.is_fully_defined():
+      sample_shape = np.int32(static_sample_shape.as_list())
+    else:
+      sample_shape = array_ops.shape(x)[:sample_ndims]
+    return sample_shape, static_sample_shape
+
+  def _call_reshape_input_output(self, fn, x):
+    """Calls `fn`, appropriately reshaping its input `x` and output."""
+    with ops.control_dependencies(
+        self._runtime_assertions + self._validate_sample_arg(x)):
+      sample_shape, static_sample_shape = self._sample_shape(x)
+      old_shape = array_ops.concat([
+          sample_shape,
+          self.distribution.batch_shape_tensor(),
+          self.event_shape_tensor(),
+      ], axis=0)
+      result = fn(array_ops.reshape(x, old_shape))
+      new_shape = array_ops.concat([
+          sample_shape,
+          self.batch_shape_tensor(),
+      ], axis=0)
+      result = array_ops.reshape(result, new_shape)
+      if (static_sample_shape.ndims is not None and
+          self.batch_shape.ndims is not None):
+        new_shape = static_sample_shape.concatenate(self.batch_shape)
+        result.set_shape(result.shape.merge_with(new_shape))
+      return result
+
+  def _call_and_reshape_output(
+      self,
+      fn,
+      event_shape_list=None,
+      static_event_shape_list=None):
+    """Calls `fn` and appropriately reshapes its output."""
+    with ops.control_dependencies(self._runtime_assertions):
+      if event_shape_list is None:
+        event_shape_list = [self._event_shape_tensor()]
+      if static_event_shape_list is None:
+        static_event_shape_list = [self.event_shape]
+      new_shape = array_ops.concat(
+          [self.batch_shape_tensor()] + event_shape_list,
+          axis=0)
+      result = array_ops.reshape(fn(), new_shape)
+      if (self.batch_shape.ndims is not None and
+          self.event_shape.ndims is not None):
+        event_shape = tensor_shape.TensorShape([])
+        for rss in static_event_shape_list:
+          event_shape = event_shape.concatenate(rss)
+        static_shape = result.shape.merge_with(
+            self.batch_shape.concatenate(event_shape))
+        result.set_shape(static_shape)
+      return result
+
+  def _validate_sample_arg(self, x):
+    """Helper which validates sample arg, e.g., input to `log_prob`."""
+    with ops.name_scope(name="validate_sample_arg", values=[x]):
+      x_ndims = (array_ops.rank(x) if x.shape.ndims is None else x.shape.ndims)
+      event_ndims = (array_ops.size(self.event_shape_tensor())
+                     if self.event_shape.ndims is None
+                     else self.event_shape.ndims)
+      batch_ndims = (array_ops.size(self.batch_shape_tensor())
+                     if self.batch_shape.ndims is None
+                     else self.batch_shape.ndims)
+      expected_batch_event_ndims = batch_ndims + event_ndims
+
+      if (isinstance(x_ndims, int) and
+          isinstance(expected_batch_event_ndims, int)):
+        if x_ndims < expected_batch_event_ndims:
+          raise NotImplementedError(
+              "Broadcasting is not supported; too few event dims "
+              "(expected at least {}, saw {}).".format(
+                  expected_batch_event_ndims, x_ndims))
+        ndims_assertion = []
+      elif self.validate_args:
+        ndims_assertion = [
+            check_ops.assert_greater_equal(
+                x_ndims,
+                expected_batch_event_ndims,
+                message="Broadcasting is not supported; too few event dims.",
+                name="assert_batch_and_event_ndims_large_enough"),
+        ]
+
+      if (self.batch_shape.is_fully_defined() and
+          self.event_shape.is_fully_defined()):
+        expected_batch_event_shape = np.int32(self.batch_shape.concatenate(
+            self.event_shape).as_list())
+      else:
+        expected_batch_event_shape = array_ops.concat([
+            self.batch_shape_tensor(),
+            self.event_shape_tensor(),
+        ], axis=0)
+
+      sample_ndims = x_ndims - expected_batch_event_ndims
+      if isinstance(sample_ndims, int):
+        sample_ndims = max(sample_ndims, 0)
+      if (isinstance(sample_ndims, int) and
+          x.shape[sample_ndims:].is_fully_defined()):
+        actual_batch_event_shape = np.int32(x.shape[sample_ndims:].as_list())
+      else:
+        sample_ndims = math_ops.maximum(sample_ndims, 0)
+        actual_batch_event_shape = array_ops.shape(x)[sample_ndims:]
+
+      if (isinstance(expected_batch_event_shape, np.ndarray) and
+          isinstance(actual_batch_event_shape, np.ndarray)):
+        if any(expected_batch_event_shape != actual_batch_event_shape):
+          raise NotImplementedError("Broadcasting is not supported; "
+                                    "unexpected batch and event shape "
+                                    "(expected {}, saw {}).".format(
+                                        expected_batch_event_shape,
+                                        actual_batch_event_shape))
+        # We need to set the final runtime-assertions to `ndims_assertion` since
+        # its possible this assertion was created. We could add a condition to
+        # only do so if `self.validate_args == True`, however this is redundant
+        # as `ndims_assertion` already encodes this information.
+        runtime_assertions = ndims_assertion
+      elif self.validate_args:
+        # We need to make the `ndims_assertion` a control dep because otherwise
+        # TF itself might raise an exception owing to this assertion being
+        # ill-defined, ie, one cannot even compare different rank Tensors.
+        with ops.control_dependencies(ndims_assertion):
+          shape_assertion = check_ops.assert_equal(
+              expected_batch_event_shape,
+              actual_batch_event_shape,
+              message=("Broadcasting is not supported; "
+                       "unexpected batch and event shape."),
+              name="assert_batch_and_event_shape_same")
+        runtime_assertions = [shape_assertion]
+      else:
+        runtime_assertions = []
+
+      return runtime_assertions
+
+
+def validate_init_args(
+    distribution,
+    batch_shape,
+    validate_args,
+    batch_shape_static):
+  """Helper to __init__ which makes or raises assertions."""
+  with ops.name_scope(name="validate_init_args",
+                      values=[batch_shape] + distribution._graph_parents):  # pylint: disable=protected-access
+    runtime_assertions = []
+
+    if batch_shape.shape.ndims is not None:
+      if batch_shape.shape.ndims != 1:
+        raise ValueError("`batch_shape` must be a vector "
+                         "(saw rank: {}).".format(
+                             batch_shape.shape.ndims))
+    elif validate_args:
+      runtime_assertions += [
+          check_ops.assert_rank(
+              batch_shape,
+              1,
+              message="`batch_shape` must be a vector.",
+              name="assert_batch_shape_is_vector"),
+      ]
+
+    batch_size_static = np.prod(batch_shape_static)
+    dist_batch_size_static = (
+        None if not distribution.batch_shape.is_fully_defined()
+        else np.prod(distribution.batch_shape).value)
+
+    if batch_size_static is not None and dist_batch_size_static is not None:
+      if batch_size_static != dist_batch_size_static:
+        raise ValueError("`batch_shape` size ({}) must match "
+                         "`distribution.batch_shape` size ({}).".format(
+                             batch_size_static,
+                             dist_batch_size_static))
+    elif validate_args:
+      runtime_assertions += [
+          check_ops.assert_equal(
+              math_ops.reduce_prod(batch_shape),
+              math_ops.reduce_prod(distribution.batch_shape_tensor()),
+              message=("`batch_shape` size must match "
+                       "`distributions.batch_shape` size."),
+              name="assert_batch_size"),
+      ]
+
+    if batch_shape_static is not None:
+      if np.any(batch_shape_static < 1):
+        raise ValueError("`batch_shape` elements must be positive "
+                         "(i.e., larger than zero).")
+    elif validate_args:
+      runtime_assertions += [
+          check_ops.assert_positive(
+              batch_shape,
+              message=("`batch_shape` elements must be positive "
+                       "(i.e., larger than zero)."),
+              name="assert_batch_shape_positive")
+      ]
+
+    return runtime_assertions
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py b/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
index 452f1caa30fdbf5442274cbcc7f3549081b80ae9..bc6b02542ebf3b83d58f888509dafb86351de8a7 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
@@ -35,7 +35,6 @@
 @@RealNVP
 @@Reshape
 @@Sigmoid
-@@SigmoidCentered
 @@SinhArcsinh
 @@SoftmaxCentered
 @@Softplus
@@ -72,7 +71,6 @@ from tensorflow.contrib.distributions.python.ops.bijectors.power_transform impor
 from tensorflow.contrib.distributions.python.ops.bijectors.real_nvp import *
 from tensorflow.contrib.distributions.python.ops.bijectors.reshape import *
 from tensorflow.contrib.distributions.python.ops.bijectors.sigmoid import *
-from tensorflow.contrib.distributions.python.ops.bijectors.sigmoid_centered import *
 from tensorflow.contrib.distributions.python.ops.bijectors.sinh_arcsinh import *
 from tensorflow.contrib.distributions.python.ops.bijectors.softmax_centered import *
 from tensorflow.contrib.distributions.python.ops.bijectors.softplus import *
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/affine.py b/tensorflow/contrib/distributions/python/ops/bijectors/affine.py
index 7fe73ada4466d38a7d352f23a55d6b90ed38c84a..bef7bbb49b715497695f7513e19ecab4fa56c47e 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/affine.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/affine.py
@@ -62,7 +62,7 @@ class Affine(bijector.Bijector):
   matrices, i.e., the matmul is [matrix-free](
   https://en.wikipedia.org/wiki/Matrix-free_methods) when possible.
 
-  Examples:
+  #### Examples
 
   ```python
   # Y = X
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/batch_normalization.py b/tensorflow/contrib/distributions/python/ops/bijectors/batch_normalization.py
index be72ff3081225b9f9fdb6541322b7fc3d4aaa41e..33fdd32d7a0a01685690e598c69adca2c95972e9 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/batch_normalization.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/batch_normalization.py
@@ -76,15 +76,16 @@ def _undo_batch_normalization(x,
 class BatchNormalization(bijector.Bijector):
   """Compute `Y = g(X) s.t. X = g^-1(Y) = (Y - mean(Y)) / std(Y)`.
 
-  Applies Batch Normalization [1] to samples from a data distribution. This can
-  be used to stabilize training of normalizing flows [2, 3].
+  Applies Batch Normalization [(Ioffe and Szegedy, 2015)][1] to samples from a
+  data distribution. This can be used to stabilize training of normalizing
+  flows ([Papamakarios et al., 2016][3]; [Dinh et al., 2017][2])
 
   When training Deep Neural Networks (DNNs), it is common practice to
   normalize or whiten features by shifting them to have zero mean and
   scaling them to have unit variance.
 
-  The `inverse()` method of the BatchNorm bijector, which is used in the
-  log-likelihood computation of data samples, implements the normalization
+  The `inverse()` method of the `BatchNormalization` bijector, which is used in
+  the log-likelihood computation of data samples, implements the normalization
   procedure (shift-and-scale) using the mean and standard deviation of the
   current minibatch.
 
@@ -92,7 +93,6 @@ class BatchNormalization(bijector.Bijector):
   `X*std(Y) + mean(Y)` with the running-average mean and standard deviation
   computed at training-time. De-normalization is useful for sampling.
 
-
   ```python
 
   dist = tfd.TransformedDistribution(
@@ -112,19 +112,20 @@ class BatchNormalization(bijector.Bijector):
   `BatchNorm.forward(BatchNorm.inverse(...))` will be identical when
   `training=False` but may be different when `training=True`.
 
-  [1]: "Batch Normalization: Accelerating Deep Network Training by Reducing
-       Internal Covariate Shift."
-       Sergey Ioffe, Christian Szegedy. Arxiv. 2015.
-       https://arxiv.org/abs/1502.03167
+  #### References
 
-  [2]: "Density Estimation using Real NVP."
-     Laurent Dinh, Jascha Sohl-Dickstein, Samy Bengio. ICLR. 2017.
-     https://arxiv.org/abs/1605.08803
+  [1]: Sergey Ioffe and Christian Szegedy. Batch Normalization: Accelerating
+       Deep Network Training by Reducing Internal Covariate Shift. In
+       _International Conference on Machine Learning_, 2015.
+       https://arxiv.org/abs/1502.03167
 
-  [3]: "Masked Autoregressive Flow for Density Estimation."
-       George Papamakarios, Theo Pavlakou, Iain Murray. Arxiv. 2017.
-       https://arxiv.org/abs/1705.07057
+  [2]: Laurent Dinh, Jascha Sohl-Dickstein, and Samy Bengio. Density Estimation
+       using Real NVP. In _International Conference on Learning
+       Representations_, 2017. https://arxiv.org/abs/1605.08803
 
+  [3]: George Papamakarios, Theo Pavlakou, and Iain Murray. Masked
+       Autoregressive Flow for Density Estimation. In _Neural Information
+       Processing Systems_, 2017. https://arxiv.org/abs/1705.07057
   """
 
   def __init__(self,
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py b/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py
index 43208ff088b469b70ebc08757daac277d4432b37..8f09e16058b766c788ab3acced6940fd0026b521 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py
@@ -57,7 +57,7 @@ class CholeskyOuterProduct(bijector.Bijector):
   that, if `I = L_3 @ L_3.T`, with L_3 being lower-triangular with positive-
   diagonal, then `L_3 = I`. Thus, `L_1 = L_2`, proving injectivity of g.
 
-  Examples:
+  #### Examples
 
   ```python
   bijector.CholeskyOuterProduct().forward(x=[[1., 0], [2, 1]])
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py b/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py
index 5251dbcb5748f75688aa43ce6e4e9dbd76be78bb..84b2340c75514c3d2c12bf4d775ba74450a0dc26 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py
@@ -45,14 +45,15 @@ __all__ = [
 class MaskedAutoregressiveFlow(bijector_lib.Bijector):
   """Affine MaskedAutoregressiveFlow bijector for vector-valued events.
 
-  The affine autoregressive flow [1] provides a relatively simple framework for
-  user-specified (deep) architectures to learn a distribution over vector-valued
-  events. Regarding terminology,
+  The affine autoregressive flow [(Papamakarios et al., 2016)][3] provides a
+  relatively simple framework for user-specified (deep) architectures to learn
+  a distribution over vector-valued events. Regarding terminology,
 
     "Autoregressive models decompose the joint density as a product of
     conditionals, and model each conditional in turn. Normalizing flows
     transform a base density (e.g. a standard Gaussian) into the target density
-    by an invertible transformation with tractable Jacobian." [1]
+    by an invertible transformation with tractable Jacobian."
+    [(Papamakarios et al., 2016)][3]
 
   In other words, the "autoregressive property" is equivalent to the
   decomposition, `p(x) = prod{ p(x[i] | x[0:i]) : i=0, ..., d }`. The provided
@@ -75,26 +76,26 @@ class MaskedAutoregressiveFlow(bijector_lib.Bijector):
 
   Given a `shift_and_log_scale_fn`, the forward and inverse transformations are
   (a sequence of) affine transformations. A "valid" `shift_and_log_scale_fn`
-  must compute each `shift` (aka `loc` or "mu" [2]) and `log(scale)` (aka
-  "alpha" [2]) such that each are broadcastable with the arguments to `forward`
-  and `inverse`, i.e., such that the calculations in `forward`, `inverse`
-  [below] are possible.
+  must compute each `shift` (aka `loc` or "mu" in [Germain et al. (2015)][1])
+  and `log(scale)` (aka "alpha" in [Germain et al. (2015)][1]) such that each
+  are broadcastable with the arguments to `forward` and `inverse`, i.e., such
+  that the calculations in `forward`, `inverse` [below] are possible.
 
   For convenience, `masked_autoregressive_default_template` is offered as a
   possible `shift_and_log_scale_fn` function. It implements the MADE
-  architecture [2]. MADE is a feed-forward network that computes a `shift` and
-  `log(scale)` using `masked_dense` layers in a deep neural network. Weights are
-  masked to ensure the autoregressive property. It is possible that this
-  architecture is suboptimal for your task. To build alternative networks,
-  either change the arguments to `masked_autoregressive_default_template`, use
-  the `masked_dense` function to roll-out your own, or use some other
-  architecture, e.g., using `tf.layers`.
+  architecture [(Germain et al., 2015)][1]. MADE is a feed-forward network that
+  computes a `shift` and `log(scale)` using `masked_dense` layers in a deep
+  neural network. Weights are masked to ensure the autoregressive property. It
+  is possible that this architecture is suboptimal for your task. To build
+  alternative networks, either change the arguments to
+  `masked_autoregressive_default_template`, use the `masked_dense` function to
+  roll-out your own, or use some other architecture, e.g., using `tf.layers`.
 
   Warning: no attempt is made to validate that the `shift_and_log_scale_fn`
   enforces the "autoregressive property".
 
   Assuming `shift_and_log_scale_fn` has valid shape and autoregressive
-  semantics, the forward transformation is,
+  semantics, the forward transformation is
 
   ```python
   def forward(x):
@@ -106,7 +107,7 @@ class MaskedAutoregressiveFlow(bijector_lib.Bijector):
     return y
   ```
 
-  and the inverse transformation is,
+  and the inverse transformation is
 
   ```python
   def inverse(y):
@@ -121,7 +122,7 @@ class MaskedAutoregressiveFlow(bijector_lib.Bijector):
   the "last" `y` used to compute `shift`, `log_scale`. (Roughly speaking, this
   also proves the transform is bijective.)
 
-  #### Example Use
+  #### Examples
 
   ```python
   tfd = tf.contrib.distributions
@@ -142,7 +143,8 @@ class MaskedAutoregressiveFlow(bijector_lib.Bijector):
   maf.log_prob(x)   # Almost free; uses Bijector caching.
   maf.log_prob(0.)  # Cheap; no `tf.while_loop` despite no Bijector caching.
 
-  # [1] also describes an "Inverse Autoregressive Flow", e.g.,
+  # [Papamakarios et al. (2016)][3] also describe an Inverse Autoregressive
+  # Flow [(Kingma et al., 2016)][2]:
   iaf = tfd.TransformedDistribution(
       distribution=tfd.Normal(loc=0., scale=1.),
       bijector=tfb.Invert(tfb.MaskedAutoregressiveFlow(
@@ -168,14 +170,20 @@ class MaskedAutoregressiveFlow(bijector_lib.Bijector):
       event_shape=[dims])
   ```
 
-  [1]: "Masked Autoregressive Flow for Density Estimation."
-       George Papamakarios, Theo Pavlakou, Iain Murray. Arxiv. 2017.
-       https://arxiv.org/abs/1705.07057
+  #### References
 
-  [2]: "MADE: Masked Autoencoder for Distribution Estimation."
-       Mathieu Germain, Karol Gregor, Iain Murray, Hugo Larochelle. ICML. 2015.
-       https://arxiv.org/abs/1502.03509
+  [1]: Mathieu Germain, Karol Gregor, Iain Murray, and Hugo Larochelle. MADE:
+       Masked Autoencoder for Distribution Estimation. In _International
+       Conference on Machine Learning_, 2015. https://arxiv.org/abs/1502.03509
 
+  [2]: Diederik P. Kingma, Tim Salimans, Rafal Jozefowicz, Xi Chen, Ilya
+       Sutskever, and Max Welling. Improving Variational Inference with Inverse
+       Autoregressive Flow. In _Neural Information Processing Systems_, 2016.
+       https://arxiv.org/abs/1606.04934
+
+  [3]: George Papamakarios, Theo Pavlakou, and Iain Murray. Masked
+       Autoregressive Flow for Density Estimation. In _Neural Information
+       Processing Systems_, 2017. https://arxiv.org/abs/1705.07057
   """
 
   def __init__(self,
@@ -329,11 +337,7 @@ def masked_dense(inputs,
                  **kwargs):
   """A autoregressively masked dense layer. Analogous to `tf.layers.dense`.
 
-  See [1] for detailed explanation.
-
-  [1]: "MADE: Masked Autoencoder for Distribution Estimation."
-       Mathieu Germain, Karol Gregor, Iain Murray, Hugo Larochelle. ICML. 2015.
-       https://arxiv.org/abs/1502.03509
+  See [Germain et al. (2015)][1] for detailed explanation.
 
   Arguments:
     inputs: Tensor input.
@@ -358,6 +362,12 @@ def masked_dense(inputs,
   Raises:
     NotImplementedError: if rightmost dimension of `inputs` is unknown prior to
       graph execution.
+
+  #### References
+
+  [1]: Mathieu Germain, Karol Gregor, Iain Murray, and Hugo Larochelle. MADE:
+       Masked Autoencoder for Distribution Estimation. In _International
+       Conference on Machine Learning_, 2015. https://arxiv.org/abs/1502.03509
   """
   # TODO(b/67594795): Better support of dynamic shape.
   input_depth = inputs.shape.with_rank_at_least(1)[-1].value
@@ -398,23 +408,24 @@ def masked_autoregressive_default_template(
     name=None,
     *args,
     **kwargs):
-  """Build the MADE Model [1].
+  """Build the Masked Autoregressive Density Estimator (Germain et al., 2015).
 
   This will be wrapped in a make_template to ensure the variables are only
-  created once. It takes the input and returns the `loc` ("mu" [1]) and
-  `log_scale` ("alpha" [1]) from the MADE network.
+  created once. It takes the input and returns the `loc` ("mu" in [Germain et
+  al. (2015)][1]) and `log_scale` ("alpha" in [Germain et al. (2015)][1]) from
+  the MADE network.
 
   Warning: This function uses `masked_dense` to create randomly initialized
   `tf.Variables`. It is presumed that these will be fit, just as you would any
   other neural architecture which uses `tf.layers.dense`.
 
-  #### About Hidden Layers:
+  #### About Hidden Layers
 
   Each element of `hidden_layers` should be greater than the `input_depth`
   (i.e., `input_depth = tf.shape(input)[-1]` where `input` is the input to the
   neural network). This is necessary to ensure the autoregressivity property.
 
-  #### About Clipping:
+  #### About Clipping
 
   This function also optionally clips the `log_scale` (but possibly not its
   gradient). This is useful because if `log_scale` is too small/large it might
@@ -427,11 +438,7 @@ def masked_autoregressive_default_template(
   `grad[exp(clip(x))] = grad[x] exp(clip(x))` rather than the usual
   `grad[clip(x)] exp(clip(x))`.
 
-  [1]: "MADE: Masked Autoencoder for Distribution Estimation."
-       Mathieu Germain, Karol Gregor, Iain Murray, Hugo Larochelle. ICML. 2015.
-       https://arxiv.org/abs/1502.03509
-
-  Arguments:
+  Args:
     hidden_layers: Python `list`-like of non-negative integer, scalars
       indicating the number of units in each hidden layer. Default: `[512, 512].
     shift_only: Python `bool` indicating if only the `shift` term shall be
@@ -450,12 +457,20 @@ def masked_autoregressive_default_template(
     **kwargs: `tf.layers.dense` keyword arguments.
 
   Returns:
-    shift: `Float`-like `Tensor` of shift terms (the "mu" in [2]).
-    log_scale: `Float`-like `Tensor` of log(scale) terms (the "alpha" in [2]).
+    shift: `Float`-like `Tensor` of shift terms (the "mu" in
+      [Germain et al.  (2015)][1]).
+    log_scale: `Float`-like `Tensor` of log(scale) terms (the "alpha" in
+      [Germain et al. (2015)][1]).
 
   Raises:
     NotImplementedError: if rightmost dimension of `inputs` is unknown prior to
       graph execution.
+
+  #### References
+
+  [1]: Mathieu Germain, Karol Gregor, Iain Murray, and Hugo Larochelle. MADE:
+       Masked Autoencoder for Distribution Estimation. In _International
+       Conference on Machine Learning_, 2015. https://arxiv.org/abs/1502.03509
   """
 
   with ops.name_scope(name, "masked_autoregressive_default_template",
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/real_nvp.py b/tensorflow/contrib/distributions/python/ops/bijectors/real_nvp.py
index 2840f52e742eac5e9e37a576bf7f6d6f05a07a35..71ab369d01aafc33854a2c2437f96bbb493cc6fb 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/real_nvp.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/real_nvp.py
@@ -38,7 +38,7 @@ class RealNVP(bijector_lib.Bijector):
   """RealNVP "affine coupling layer" for vector-valued events.
 
   Real NVP models a normalizing flow on a `D`-dimensional distribution via a
-  single `D-d`-dimensional conditional distribution [1]:
+  single `D-d`-dimensional conditional distribution [(Dinh et al., 2017)][1]:
 
   `y[d:D] = y[d:D] * math_ops.exp(log_scale_fn(y[d:D])) + shift_fn(y[d:D])`
   `y[0:d] = x[0:d]`
@@ -51,31 +51,34 @@ class RealNVP(bijector_lib.Bijector):
 
   Masking is currently only supported for base distributions with
   `event_ndims=1`. For more sophisticated masking schemes like checkerboard or
-  channel-wise masking [2], use the `tfb.Permute` bijector to re-order desired
-  masked units into the first `d` units. For base distributions with
-  `event_ndims > 1`, use the `tfb.Reshape` bijector to flatten the event shape.
-
-  Recall that the MAF bijector [2] implements a normalizing flow via an
-  autoregressive transformation. MAF and IAF have opposite computational
-  tradeoffs - MAF can train all units in parallel but must sample units
-  sequentially, while IAF must train units sequentially but can sample in
-  parallel. In contrast, Real NVP can compute both forward and inverse
-  computations in parallel. However, the lack of an autoregressive
+  channel-wise masking [(Papamakarios et al., 2016)[4], use the `tfb.Permute`
+  bijector to re-order desired masked units into the first `d` units. For base
+  distributions with `event_ndims > 1`, use the `tfb.Reshape` bijector to
+  flatten the event shape.
+
+  Recall that the MAF bijector [(Papamakarios et al., 2016)][4] implements a
+  normalizing flow via an autoregressive transformation. MAF and IAF have
+  opposite computational tradeoffs - MAF can train all units in parallel but
+  must sample units sequentially, while IAF must train units sequentially but
+  can sample in parallel. In contrast, Real NVP can compute both forward and
+  inverse computations in parallel. However, the lack of an autoregressive
   transformations makes it less expressive on a per-bijector basis.
 
   A "valid" `shift_and_log_scale_fn` must compute each `shift` (aka `loc` or
-  "mu" [2]) and `log(scale)` (aka "alpha" [2]) such that each are broadcastable
-  with the arguments to `forward` and `inverse`, i.e., such that the
-  calculations in `forward`, `inverse` [below] are possible. For convenience,
+  "mu" in [Papamakarios et al. (2016)][4]) and `log(scale)` (aka "alpha" in
+  [Papamakarios et al. (2016)][4]) such that each are broadcastable with the
+  arguments to `forward` and `inverse`, i.e., such that the calculations in
+  `forward`, `inverse` [below] are possible. For convenience,
   `real_nvp_default_nvp` is offered as a possible `shift_and_log_scale_fn`
   function.
 
-  NICE [3] is a special case of the Real NVP bijector which discards the scale
-  transformation, resulting in a constant-time inverse-log-determinant-Jacobian.
-  To use a NICE bijector instead of Real NVP, `shift_and_log_scale_fn` should
-  return `(shift, None)`, and `is_constant_jacobian` should be set to `True` in
-  the `RealNVP` constructor. Calling `real_nvp_default_template` with
-  `shift_only=True` returns one such NICE-compatible `shift_and_log_scale_fn`.
+  NICE [(Dinh et al., 2014)][2] is a special case of the Real NVP bijector
+  which discards the scale transformation, resulting in a constant-time
+  inverse-log-determinant-Jacobian. To use a NICE bijector instead of Real
+  NVP, `shift_and_log_scale_fn` should return `(shift, None)`, and
+  `is_constant_jacobian` should be set to `True` in the `RealNVP` constructor.
+  Calling `real_nvp_default_template` with `shift_only=True` returns one such
+  NICE-compatible `shift_and_log_scale_fn`.
 
   Caching: the scalar input depth `D` of the base distribution is not known at
   construction time. The first call to any of `forward(x)`, `inverse(x)`,
@@ -103,23 +106,24 @@ class RealNVP(bijector_lib.Bijector):
   nvp.log_prob(0.)
   ```
 
-  For more examples, see [4].
+  For more examples, see [Jang (2018)][3].
 
-  [1]: "Density Estimation using Real NVP."
-       Laurent Dinh, Jascha Sohl-Dickstein, Samy Bengio. ICLR. 2017.
-       https://arxiv.org/abs/1605.08803
+  #### References
 
-  [2]: "Masked Autoregressive Flow for Density Estimation."
-       George Papamakarios, Theo Pavlakou, Iain Murray. Arxiv. 2017.
-       https://arxiv.org/abs/1705.07057
+  [1]: Laurent Dinh, Jascha Sohl-Dickstein, and Samy Bengio. Density Estimation
+       using Real NVP. In _International Conference on Learning
+       Representations_, 2017. https://arxiv.org/abs/1605.08803
 
-  [3]: "NICE: Non-linear Independent Components Estimation."
-       Laurent Dinh, David Krueger, Yoshua Bengio. ICLR. 2015.
-       https://arxiv.org/abs/1410.8516
+  [2]: Laurent Dinh, David Krueger, and Yoshua Bengio. NICE: Non-linear
+       Independent Components Estimation. _arXiv preprint arXiv:1410.8516_,
+       2014. https://arxiv.org/abs/1410.8516
 
-  [4]: "Normalizing Flows Tutorial, Part 2: Modern Normalizing Flows."
-       Eric Jang. Blog post. January 2018.
-       http://blog.evjang.com/2018/01/nf2.html
+  [3]: Eric Jang. Normalizing Flows Tutorial, Part 2: Modern Normalizing Flows.
+       _Technical Report_, 2018. http://blog.evjang.com/2018/01/nf2.html
+
+  [4]: George Papamakarios, Theo Pavlakou, and Iain Murray. Masked
+       Autoregressive Flow for Density Estimation. In _Neural Information
+       Processing Systems_, 2017. https://arxiv.org/abs/1705.07057
   """
 
   def __init__(self,
@@ -250,12 +254,20 @@ def real_nvp_default_template(
     **kwargs: `tf.layers.dense` keyword arguments.
 
   Returns:
-    shift: `Float`-like `Tensor` of shift terms (the "mu" in [2]).
-    log_scale: `Float`-like `Tensor` of log(scale) terms (the "alpha" in [2]).
+    shift: `Float`-like `Tensor` of shift terms ("mu" in
+      [Papamakarios et al.  (2016)][1]).
+    log_scale: `Float`-like `Tensor` of log(scale) terms ("alpha" in
+      [Papamakarios et al. (2016)][1]).
 
   Raises:
     NotImplementedError: if rightmost dimension of `inputs` is unknown prior to
       graph execution.
+
+  #### References
+
+  [1]: George Papamakarios, Theo Pavlakou, and Iain Murray. Masked
+       Autoregressive Flow for Density Estimation. In _Neural Information
+       Processing Systems_, 2017. https://arxiv.org/abs/1705.07057
   """
 
   with ops.name_scope(name, "real_nvp_default_template"):
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_centered.py b/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_centered.py
deleted file mode 100644
index 223bc9d042c69be05b0e578835a31ed6e83c0c97..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_centered.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""SigmoidCentered bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.distributions.python.ops.bijectors import softmax_centered
-
-
-__all__ = [
-    "SigmoidCentered",
-]
-
-
-class SigmoidCentered(softmax_centered.SoftmaxCentered):
-  """Bijector which computes Y = g(X) = exp([X 0]) / (1 + exp(-X)).
-
-  Equivalent to: `bijector.SoftmaxCentered(event_ndims=0)`.
-
-  See `bijector.SoftmaxCentered` for more details.
-  """
-
-  def __init__(self, validate_args=False, name="sigmoid_centered"):
-    super(SigmoidCentered, self).__init__(
-        event_ndims=0, validate_args=validate_args, name=name)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py b/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py
index 24add40445c60db533aac6d0c8eb537774895c65..dc94fd0a38de29f5a7ee6ca826aab0ecf8712966 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py
@@ -19,10 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.distributions.python.ops import distribution_util
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
@@ -45,17 +42,14 @@ class SoftmaxCentered(bijector.Bijector):
   e.g., `softmax(x) = exp(x-c) / sum(exp(x-c))` where `c` is the implicit last
   coordinate.
 
-  Because we append a coordinate, this bijector only supports `event_ndim in [0,
-  1]`, i.e., scalars and vectors.
-
   Example Use:
 
   ```python
-  bijector.SoftmaxCentered(event_ndims=1).forward(tf.log([2, 3, 4]))
+  bijector.SoftmaxCentered().forward(tf.log([2, 3, 4]))
   # Result: [0.2, 0.3, 0.4, 0.1]
   # Extra result: 0.1
 
-  bijector.SoftmaxCentered(event_ndims=1).inverse([0.2, 0.3, 0.4, 0.1])
+  bijector.SoftmaxCentered().inverse([0.2, 0.3, 0.4, 0.1])
   # Result: tf.log([2, 3, 4])
   # Extra coordinate removed.
   ```
@@ -67,82 +61,47 @@ class SoftmaxCentered(bijector.Bijector):
   """
 
   def __init__(self,
-               event_ndims=0,
                validate_args=False,
                name="softmax_centered"):
     self._graph_parents = []
     self._name = name
-    with self._name_scope("init", values=[event_ndims]):
-      event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
-      event_ndims = tensor_util.constant_value(event_ndims)
-      if event_ndims is None or event_ndims not in [0, 1]:
-        raise ValueError("`event_ndims` must be a TF constant which is 0 or 1")
-    self._static_event_ndims = event_ndims
     super(SoftmaxCentered, self).__init__(
-        event_ndims=event_ndims,
+        event_ndims=1,
         validate_args=validate_args,
         name=name)
 
   def _forward_event_shape(self, input_shape):
-    if input_shape.ndims is None:
+    if input_shape.ndims is None or input_shape[-1] is None:
       return input_shape
-    if input_shape.ndims != self._static_event_ndims:
-      raise ValueError("input_shape.dims = %d != %d" %
-                       (input_shape.ndims, self._static_event_ndims))
-    if input_shape.ndims == 0:
-      return tensor_shape.TensorShape([2])
-    if input_shape.ndims == 1:
-      return tensor_shape.TensorShape(input_shape[0] + 1)
-    # Unreachable code:
-    raise ValueError("event_ndims = %d must be 0 or 1" % input_shape.ndims)
+    return tensor_shape.TensorShape([input_shape[-1] + 1])
 
   def _forward_event_shape_tensor(self, input_shape):
-    ndims = array_ops.shape(input_shape)
-    if self.validate_args:
-      # It is not possible for a negative shape so we need only check <= 1.
-      is_zero_or_one = check_ops.assert_equal(
-          ndims, 0 if self._static_event_ndims == 0 else 1,
-          message="event_ndims must be 0 or 1")
-      ndims = control_flow_ops.with_dependencies([is_zero_or_one], ndims)
-    if self._static_event_ndims == 0:
-      return ops.convert_to_tensor(
-          [2], dtype=dtypes.int32, name="output_shape")
-    return input_shape + 1
+    return (input_shape[-1] + 1)[..., array_ops.newaxis]
 
   def _inverse_event_shape(self, output_shape):
-    if output_shape.ndims is None:
+    if output_shape.ndims is None or output_shape[-1] is None:
       return output_shape
-    if output_shape.ndims != 1:
-      raise ValueError("output_shape.ndims = %d != 1" % output_shape.ndims)
-    if self._static_event_ndims == 0:
-      return tensor_shape.TensorShape([])
-    return tensor_shape.TensorShape(output_shape[0] - 1)
+    if output_shape[-1] <= 1:
+      raise ValueError("output_shape[-1] = %d <= 1" % output_shape[-1])
+    return tensor_shape.TensorShape([output_shape[-1] - 1])
 
   def _inverse_event_shape_tensor(self, output_shape):
-    ndims = array_ops.shape(output_shape)[0]
     if self.validate_args:
       # It is not possible for a negative shape so we need only check <= 1.
-      is_one = check_ops.assert_equal(
-          ndims, 1, message="event_ndims must be 1")
-      ndims = control_flow_ops.with_dependencies([is_one], ndims)
-    if self._static_event_ndims == 0:
-      return ops.convert_to_tensor([], dtype=dtypes.int32, name="output_shape")
-    return array_ops.expand_dims(output_shape[0] - 1, dim=0)
+      is_greater_one = check_ops.assert_greater(
+          output_shape[-1], 1, message="Need last dimension greater than 1.")
+      output_shape = control_flow_ops.with_dependencies(
+          [is_greater_one], output_shape)
+    return (output_shape[-1] - 1)[..., array_ops.newaxis]
 
   def _forward(self, x):
     # Pad the last dim with a zeros vector. We need this because it lets us
     # infer the scale in the inverse function.
-    y = array_ops.expand_dims(x, dim=-1) if self._static_event_ndims == 0 else x
-    y = distribution_util.pad(y, axis=-1, back=True)
+    y = distribution_util.pad(x, axis=-1, back=True)
 
     # Set shape hints.
     if x.shape.ndims is not None:
-      shape = x.shape.as_list()
-      if self._static_event_ndims == 0:
-        shape += [2]
-      elif shape[-1] is not None:
-        shape[-1] += 1
-      shape = tensor_shape.TensorShape(shape)
+      shape = x.shape[:-1].concatenate(x.shape[-1] + 1)
       y.shape.assert_is_compatible_with(shape)
       y.set_shape(shape)
 
@@ -167,17 +126,9 @@ class SoftmaxCentered(bijector.Bijector):
     log_normalization = (-x[..., -1])[..., array_ops.newaxis]
     x = x[..., :-1] + log_normalization
 
-    if self._static_event_ndims == 0:
-      x = array_ops.squeeze(x, squeeze_dims=-1)
-
     # Set shape hints.
     if y.shape.ndims is not None:
-      shape = y.shape.as_list()
-      if self._static_event_ndims == 0:
-        shape = shape[:-1]
-      elif shape[-1] is not None:
-        shape[-1] -= 1
-      shape = tensor_shape.TensorShape(shape)
+      shape = y.shape[:-1].concatenate(y.shape[-1] - 1)
       x.shape.assert_is_compatible_with(shape)
       x.set_shape(shape)
 
@@ -203,19 +154,16 @@ class SoftmaxCentered(bijector.Bijector):
     return -math_ops.reduce_sum(math_ops.log(y), axis=-1)
 
   def _forward_log_det_jacobian(self, x):
-    if self._static_event_ndims == 0:
-      return x - 2. * nn_ops.softplus(x)
-    else:
-      # This code is similar to nn_ops.log_softmax but different because we have
-      # an implicit zero column to handle. I.e., instead of:
-      #   reduce_sum(logits - reduce_sum(exp(logits), dim))
-      # we must do:
-      #   log_normalization = 1 + reduce_sum(exp(logits))
-      #   -log_normalization + reduce_sum(logits - log_normalization)
-      log_normalization = nn_ops.softplus(
-          math_ops.reduce_logsumexp(x, axis=-1, keep_dims=True))
-      fldj = (-log_normalization +
-              math_ops.reduce_sum(x - log_normalization,
-                                  axis=-1,
-                                  keep_dims=True))
-      return array_ops.squeeze(fldj, squeeze_dims=-1)
+    # This code is similar to nn_ops.log_softmax but different because we have
+    # an implicit zero column to handle. I.e., instead of:
+    #   reduce_sum(logits - reduce_sum(exp(logits), dim))
+    # we must do:
+    #   log_normalization = 1 + reduce_sum(exp(logits))
+    #   -log_normalization + reduce_sum(logits - log_normalization)
+    log_normalization = nn_ops.softplus(
+        math_ops.reduce_logsumexp(x, axis=-1, keep_dims=True))
+    fldj = (-log_normalization +
+            math_ops.reduce_sum(x - log_normalization,
+                                axis=-1,
+                                keep_dims=True))
+    return array_ops.squeeze(fldj, squeeze_dims=-1)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/square.py b/tensorflow/contrib/distributions/python/ops/bijectors/square.py
index 2831a92df8e0ad2bf681f13533cdb6f5d2089a57..1e9dbf35091fe51f2478dc085c394a77295ca4ee 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/square.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/square.py
@@ -37,7 +37,7 @@ class Square(bijector.Bijector):
   g is a bijection between the non-negative real numbers (R_+) and the
   non-negative real numbers.
 
-  Examples:
+  #### Examples
 
   ```python
   bijector.Square().forward(x=[[1., 0], [2, 1]])
diff --git a/tensorflow/contrib/distributions/python/ops/estimator.py b/tensorflow/contrib/distributions/python/ops/estimator.py
index 6b53338c4542c75d3977c075b7750c780080ac48..98edd337fe02ffbf53c6ecd9ebda9424231ea2fe 100644
--- a/tensorflow/contrib/distributions/python/ops/estimator.py
+++ b/tensorflow/contrib/distributions/python/ops/estimator.py
@@ -75,7 +75,7 @@ def estimator_head_distribution_regression(make_distribution_fn,
 
 
 class _DistributionRegressionHead(_RegressionHead):
-  """Creates a _RegressionHead instance from an arbitray `Distribution`."""
+  """Creates a _RegressionHead instance from an arbitrary `Distribution`."""
 
   def __init__(self,
                make_distribution_fn,
diff --git a/tensorflow/contrib/distributions/python/ops/independent.py b/tensorflow/contrib/distributions/python/ops/independent.py
index 7dcb3e3ac4db1855adacb7ec0fa8554c45d9c859..b1bacb91b03093fa93a7e5f7eb855dc944dafb44 100644
--- a/tensorflow/contrib/distributions/python/ops/independent.py
+++ b/tensorflow/contrib/distributions/python/ops/independent.py
@@ -36,7 +36,7 @@ class Independent(distribution_lib.Distribution):
 
   This distribution is useful for regarding a collection of independent,
   non-identical distributions as a single random variable. For example, the
-  `Indpendent` distribution composed of a collection of `Bernoulli`
+  `Independent` distribution composed of a collection of `Bernoulli`
   distributions might define a distribution over an image (where each
   `Bernoulli` is a distribution over each pixel).
 
diff --git a/tensorflow/contrib/distributions/python/ops/kumaraswamy.py b/tensorflow/contrib/distributions/python/ops/kumaraswamy.py
index 120b38db3cf72e8fce56a7e9293cdf25e75784e2..192dede6ff1d4de8d4be9965c414e7453d7b5d4b 100644
--- a/tensorflow/contrib/distributions/python/ops/kumaraswamy.py
+++ b/tensorflow/contrib/distributions/python/ops/kumaraswamy.py
@@ -44,18 +44,16 @@ _kumaraswamy_sample_note = """Note: `x` must have dtype `self.dtype` and be in
 def _harmonic_number(x):
   """Compute the harmonic number from its analytic continuation.
 
-  Derivation from [1] and Euler's constant [2].
-  [1] -
-  https://en.wikipedia.org/wiki/Digamma_function#Relation_to_harmonic_numbers
-  [2] - https://en.wikipedia.org/wiki/Euler%E2%80%93Mascheroni_constant
-
+  Derivation from [here](
+  https://en.wikipedia.org/wiki/Digamma_function#Relation_to_harmonic_numbers)
+  and [Euler's constant](
+  https://en.wikipedia.org/wiki/Euler%E2%80%93Mascheroni_constant).
 
   Args:
     x: input float.
 
   Returns:
     z: The analytic continuation of the harmonic number for the input.
-
   """
   one = array_ops.ones([], dtype=x.dtype)
   return math_ops.digamma(x + one) - math_ops.digamma(one)
diff --git a/tensorflow/contrib/distributions/python/ops/moving_stats.py b/tensorflow/contrib/distributions/python/ops/moving_stats.py
index 20f85643b9e7db61b4786dffe4115c7d3c00b046..87d40805a3c7a9c2871305af7f7182b7e2923530 100644
--- a/tensorflow/contrib/distributions/python/ops/moving_stats.py
+++ b/tensorflow/contrib/distributions/python/ops/moving_stats.py
@@ -47,9 +47,7 @@ def assign_moving_mean_variance(
   Note: `mean_var` is updated *after* `variance_var`, i.e., `variance_var` uses
   the lag-1 mean.
 
-  For derivation justification, see equation 143 of:
-    T. Finch, Feb 2009. "Incremental calculation of weighted mean and variance".
-    http://people.ds.cam.ac.uk/fanf2/hermes/doc/antiforgery/stats.pdf
+  For derivation justification, see [Finch (2009; Eq. 143)][1].
 
   Args:
     mean_var: `float`-like `Variable` representing the exponentially weighted
@@ -72,6 +70,12 @@ def assign_moving_mean_variance(
     TypeError: if `mean_var` does not have float type `dtype`.
     TypeError: if `mean_var`, `variance_var`, `value`, `decay` have different
       `base_dtype`.
+
+  #### References
+
+  [1]: Tony Finch. Incremental calculation of weighted mean and variance.
+       _Technical Report_, 2009.
+       http://people.ds.cam.ac.uk/fanf2/hermes/doc/antiforgery/stats.pdf
   """
   with ops.name_scope(name, "assign_moving_mean_variance",
                       [variance_var, mean_var, value, decay]):
@@ -183,9 +187,7 @@ def moving_mean_variance(value, decay, collections=None, name=None):
   Note: `mean_var` is updated *after* `variance_var`, i.e., `variance_var` uses
   the lag-`1` mean.
 
-  For derivation justification, see equation 143 of:
-    T. Finch, Feb 2009. "Incremental calculation of weighted mean and variance".
-    http://people.ds.cam.ac.uk/fanf2/hermes/doc/antiforgery/stats.pdf
+  For derivation justification, see [Finch (2009; Eq. 143)][1].
 
   Unlike `assign_moving_mean_variance`, this function handles
   variable creation.
@@ -208,6 +210,12 @@ def moving_mean_variance(value, decay, collections=None, name=None):
   Raises:
     TypeError: if `value_var` does not have float type `dtype`.
     TypeError: if `value`, `decay` have different `base_dtype`.
+
+  #### References
+
+  [1]: Tony Finch. Incremental calculation of weighted mean and variance.
+       _Technical Report_, 2009.
+       http://people.ds.cam.ac.uk/fanf2/hermes/doc/antiforgery/stats.pdf
   """
   if collections is None:
     collections = [ops.GraphKeys.GLOBAL_VARIABLES]
diff --git a/tensorflow/contrib/distributions/python/ops/onehot_categorical.py b/tensorflow/contrib/distributions/python/ops/onehot_categorical.py
index 46c2cc8b7a8c536a90176fbb2b2d52fed61e4705..e3e40b2e9ca232b9970768f21fb95887fdf0df2d 100644
--- a/tensorflow/contrib/distributions/python/ops/onehot_categorical.py
+++ b/tensorflow/contrib/distributions/python/ops/onehot_categorical.py
@@ -52,7 +52,7 @@ class OneHotCategorical(distribution.Distribution):
 
   #### Examples
 
-  Creates a 3-class distiribution, with the 2nd class, the most likely to be
+  Creates a 3-class distribution, with the 2nd class, the most likely to be
   drawn from.
 
   ```python
@@ -60,7 +60,7 @@ class OneHotCategorical(distribution.Distribution):
   dist = OneHotCategorical(probs=p)
   ```
 
-  Creates a 3-class distiribution, with the 2nd class the most likely to be
+  Creates a 3-class distribution, with the 2nd class the most likely to be
   drawn from, using logits.
 
   ```python
diff --git a/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py b/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py
index b525809015537ac8c7ee701c100fba6541fe2e92..e454a53c6275e0c60edd8c87b1c3be670f2b22de 100644
--- a/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py
+++ b/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py
@@ -35,10 +35,10 @@ class RelaxedBernoulli(transformed_distribution.TransformedDistribution):
 
   The RelaxedBernoulli is a distribution over the unit interval (0,1), which
   continuously approximates a Bernoulli. The degree of approximation is
-  controlled by a temperature: as the temperaturegoes to 0 the RelaxedBernoulli
-  becomes discrete with a distribution described by the `logits` or `probs`
-  parameters, as the temperature goes to infinity the RelaxedBernoulli
-  becomes the constant distribution that is identically 0.5.
+  controlled by a temperature: as the temperature goes to 0 the
+  RelaxedBernoulli becomes discrete with a distribution described by the
+  `logits` or `probs` parameters, as the temperature goes to infinity the
+  RelaxedBernoulli becomes the constant distribution that is identically 0.5.
 
   The RelaxedBernoulli distribution is a reparameterized continuous
   distribution that is the binary special case of the RelaxedOneHotCategorical
diff --git a/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py b/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
index ff33f327c7a77597e516208cacad8c4aed65d1c9..f56ba0781604cb5a4fb3070b79aa86e09ceb6766 100644
--- a/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
+++ b/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
@@ -303,7 +303,7 @@ class RelaxedOneHotCategorical(
   The RelaxedOneHotCategorical is a distribution over random probability
   vectors, vectors of positive real values that sum to one, which continuously
   approximates a OneHotCategorical. The degree of approximation is controlled by
-  a temperature: as the temperaturegoes to 0 the RelaxedOneHotCategorical
+  a temperature: as the temperature goes to 0 the RelaxedOneHotCategorical
   becomes discrete with a distribution described by the `logits` or `probs`
   parameters, as the temperature goes to infinity the RelaxedOneHotCategorical
   becomes the constant distribution that is identically the constant vector of
diff --git a/tensorflow/contrib/distributions/python/ops/sample_stats.py b/tensorflow/contrib/distributions/python/ops/sample_stats.py
index dfc813361977c159d8d48f9d5b9ff03db5b4acdc..f5aaa5cf34abde3ea4d25de1ecf3adaef3f2a770 100644
--- a/tensorflow/contrib/distributions/python/ops/sample_stats.py
+++ b/tensorflow/contrib/distributions/python/ops/sample_stats.py
@@ -25,6 +25,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
@@ -301,13 +302,16 @@ def percentile(x,
 
   with ops.name_scope(name, [x, q]):
     x = ops.convert_to_tensor(x, name="x")
-    q = math_ops.to_float(q, name="q")
+    # Double is needed here and below, else we get the wrong index if the array
+    # is huge along axis.
+    q = math_ops.to_double(q, name="q")
     _get_static_ndims(q, expect_ndims=0)
 
     if validate_args:
       q = control_flow_ops.with_dependencies([
-          check_ops.assert_rank(q, 0), check_ops.assert_greater_equal(q, 0.),
-          check_ops.assert_less_equal(q, 100.)
+          check_ops.assert_rank(q, 0),
+          check_ops.assert_greater_equal(q, math_ops.to_double(0.)),
+          check_ops.assert_less_equal(q, math_ops.to_double(100.))
       ], q)
 
     if axis is None:
@@ -332,7 +336,7 @@ def percentile(x,
       y = _move_dims_to_flat_end(x, axis, x_ndims)
 
     frac_at_q_or_above = 1. - q / 100.
-    d = math_ops.to_float(array_ops.shape(y)[-1])
+    d = math_ops.to_double(array_ops.shape(y)[-1])
 
     if interpolation == "lower":
       index = math_ops.ceil((d - 1) * frac_at_q_or_above)
@@ -341,12 +345,18 @@ def percentile(x,
     elif interpolation == "nearest":
       index = math_ops.round((d - 1) * frac_at_q_or_above)
 
+    # If d is gigantic, then we would have d == d - 1, even in double... So
+    # let's use max/min to avoid out of bounds errors.
+    d = array_ops.shape(y)[-1]
+    # d - 1 will be distinct from d in int32.
+    index = clip_ops.clip_by_value(math_ops.to_int32(index), 0, d - 1)
+
     # Sort everything, not just the top 'k' entries, which allows multiple calls
     # to sort only once (under the hood) and use CSE.
     sorted_y = _sort_tensor(y)
 
     # result.shape = B
-    result = sorted_y[..., math_ops.to_int32(index)]
+    result = sorted_y[..., index]
     result.set_shape(y.get_shape()[:-1])
 
     if keep_dims:
diff --git a/tensorflow/contrib/distributions/python/ops/shape.py b/tensorflow/contrib/distributions/python/ops/shape.py
index 5fb6f0c7eaa8c4734ea4c161b0eee6f24d4c9850..bac0b79d5908712f4e64259768fb6f3b4558f620 100644
--- a/tensorflow/contrib/distributions/python/ops/shape.py
+++ b/tensorflow/contrib/distributions/python/ops/shape.py
@@ -32,45 +32,50 @@ from tensorflow.python.ops.distributions import util as distribution_util
 class _DistributionShape(object):
   """Manage and manipulate `Distribution` shape.
 
-  Terminology:
-    Recall that a `Tensor` has:
-      - `shape`: size of `Tensor` dimensions,
-      - `ndims`: size of `shape`; number of `Tensor` dimensions,
-      - `dims`: indexes into `shape`; useful for transpose, reduce.
-
-    `Tensor`s sampled from a `Distribution` can be partitioned by `sample_dims`,
-    `batch_dims`, and `event_dims`. To understand the semantics of these
-    dimensions, consider when two of the three are fixed and the remaining
-    is varied:
-      - `sample_dims`: indexes independent draws from identical
-                       parameterizations of the `Distribution`.
-      - `batch_dims`:  indexes independent draws from non-identical
-                       parameterizations of the `Distribution`.
-      - `event_dims`:  indexes event coordinates from one sample.
-
-    The `sample`, `batch`, and `event` dimensions constitute the entirety of a
-    `Distribution` `Tensor`'s shape.
-
-    The dimensions are always in `sample`, `batch`, `event` order.
-
-  Purpose:
-    This class partitions `Tensor` notions of `shape`, `ndims`, and `dims` into
-    `Distribution` notions of `sample,` `batch,` and `event` dimensions. That
-    is, it computes any of:
+  #### Terminology
 
-    ```
-    sample_shape     batch_shape     event_shape
-    sample_dims      batch_dims      event_dims
-    sample_ndims     batch_ndims     event_ndims
-    ```
+  Recall that a `Tensor` has:
+    - `shape`: size of `Tensor` dimensions,
+    - `ndims`: size of `shape`; number of `Tensor` dimensions,
+    - `dims`: indexes into `shape`; useful for transpose, reduce.
+
+  `Tensor`s sampled from a `Distribution` can be partitioned by `sample_dims`,
+  `batch_dims`, and `event_dims`. To understand the semantics of these
+  dimensions, consider when two of the three are fixed and the remaining
+  is varied:
+    - `sample_dims`: indexes independent draws from identical
+                     parameterizations of the `Distribution`.
+    - `batch_dims`:  indexes independent draws from non-identical
+                     parameterizations of the `Distribution`.
+    - `event_dims`:  indexes event coordinates from one sample.
+
+  The `sample`, `batch`, and `event` dimensions constitute the entirety of a
+  `Distribution` `Tensor`'s shape.
+
+  The dimensions are always in `sample`, `batch`, `event` order.
+
+  #### Purpose
+
+  This class partitions `Tensor` notions of `shape`, `ndims`, and `dims` into
+  `Distribution` notions of `sample,` `batch,` and `event` dimensions. That
+  is, it computes any of:
+
+  ```
+  sample_shape     batch_shape     event_shape
+  sample_dims      batch_dims      event_dims
+  sample_ndims     batch_ndims     event_ndims
+  ```
 
-    for a given `Tensor`, e.g., the result of
-    `Distribution.sample(sample_shape=...)`.
+  for a given `Tensor`, e.g., the result of
+  `Distribution.sample(sample_shape=...)`.
 
-    For a given `Tensor`, this class computes the above table using minimal
-    information: `batch_ndims` and `event_ndims`.
+  For a given `Tensor`, this class computes the above table using minimal
+  information: `batch_ndims` and `event_ndims`.
+
+  #### Examples
+
+  We show examples of distribution shape semantics.
 
-  Examples of `Distribution` `shape` semantics:
     - Sample dimensions:
       Computing summary statistics, i.e., the average is a reduction over sample
       dimensions.
@@ -111,52 +116,54 @@ class _DistributionShape(object):
       tf.div(1., tf.reduce_prod(x, event_dims))
       ```
 
-  Examples using this class:
-    Write `S, B, E` for `sample_shape`, `batch_shape`, and `event_shape`.
-
-    ```python
-    # 150 iid samples from one multivariate Normal with two degrees of freedom.
-    mu = [0., 0]
-    sigma = [[1., 0],
-             [0,  1]]
-    mvn = MultivariateNormal(mu, sigma)
-    rand_mvn = mvn.sample(sample_shape=[3, 50])
-    shaper = DistributionShape(batch_ndims=0, event_ndims=1)
-    S, B, E = shaper.get_shape(rand_mvn)
-    # S = [3, 50]
-    # B = []
-    # E = [2]
-
-    # 12 iid samples from one Wishart with 2x2 events.
-    sigma = [[1., 0],
-             [2,  1]]
-    wishart = Wishart(df=5, scale=sigma)
-    rand_wishart = wishart.sample(sample_shape=[3, 4])
-    shaper = DistributionShape(batch_ndims=0, event_ndims=2)
-    S, B, E = shaper.get_shape(rand_wishart)
-    # S = [3, 4]
-    # B = []
-    # E = [2, 2]
-
-    # 100 iid samples from two, non-identical trivariate Normal distributions.
-    mu    = ...  # shape(2, 3)
-    sigma = ...  # shape(2, 3, 3)
-    X = MultivariateNormal(mu, sigma).sample(shape=[4, 25])
-    # S = [4, 25]
-    # B = [2]
-    # E = [3]
-    ```
-
-  Argument Validation:
-    When `validate_args=False`, checks that cannot be done during
-    graph construction are performed at graph execution. This may result in a
-    performance degradation because data must be switched from GPU to CPU.
-
-    For example, when `validate_args=False` and `event_ndims` is a
-    non-constant `Tensor`, it is checked to be a non-negative integer at graph
-    execution. (Same for `batch_ndims`). Constant `Tensor`s and non-`Tensor`
-    arguments are always checked for correctness since this can be done for
-    "free," i.e., during graph construction.
+  We show examples using this class.
+
+  Write `S, B, E` for `sample_shape`, `batch_shape`, and `event_shape`.
+
+  ```python
+  # 150 iid samples from one multivariate Normal with two degrees of freedom.
+  mu = [0., 0]
+  sigma = [[1., 0],
+           [0,  1]]
+  mvn = MultivariateNormal(mu, sigma)
+  rand_mvn = mvn.sample(sample_shape=[3, 50])
+  shaper = DistributionShape(batch_ndims=0, event_ndims=1)
+  S, B, E = shaper.get_shape(rand_mvn)
+  # S = [3, 50]
+  # B = []
+  # E = [2]
+
+  # 12 iid samples from one Wishart with 2x2 events.
+  sigma = [[1., 0],
+           [2,  1]]
+  wishart = Wishart(df=5, scale=sigma)
+  rand_wishart = wishart.sample(sample_shape=[3, 4])
+  shaper = DistributionShape(batch_ndims=0, event_ndims=2)
+  S, B, E = shaper.get_shape(rand_wishart)
+  # S = [3, 4]
+  # B = []
+  # E = [2, 2]
+
+  # 100 iid samples from two, non-identical trivariate Normal distributions.
+  mu    = ...  # shape(2, 3)
+  sigma = ...  # shape(2, 3, 3)
+  X = MultivariateNormal(mu, sigma).sample(shape=[4, 25])
+  # S = [4, 25]
+  # B = [2]
+  # E = [3]
+  ```
+
+  #### Argument Validation
+
+  When `validate_args=False`, checks that cannot be done during
+  graph construction are performed at graph execution. This may result in a
+  performance degradation because data must be switched from GPU to CPU.
+
+  For example, when `validate_args=False` and `event_ndims` is a
+  non-constant `Tensor`, it is checked to be a non-negative integer at graph
+  execution. (Same for `batch_ndims`). Constant `Tensor`s and non-`Tensor`
+  arguments are always checked for correctness since this can be done for
+  "free," i.e., during graph construction.
   """
 
   def __init__(self,
diff --git a/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py b/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py
index 0c747f8e68529484ae6f695b8500cde74857bb11..971d65c4a69140161461fdac93bb588014dd3e88 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py
@@ -181,7 +181,7 @@ def quadrature_scheme_softmaxnormal_quantiles(
       edges = array_ops.reshape(edges, shape=array_ops.concat([
           [-1], array_ops.ones([batch_ndims], dtype=dtypes.int32)], axis=0))
       quantiles = dist.quantile(edges)
-      quantiles = SoftmaxCentered(event_ndims=1).forward(quantiles)
+      quantiles = SoftmaxCentered().forward(quantiles)
       # Cyclically permute left by one.
       perm = array_ops.concat([
           math_ops.range(1, 1 + batch_ndims), [0]], axis=0)
@@ -248,11 +248,7 @@ class VectorDiffeomixture(distribution_lib.Distribution):
   The default quadrature scheme chooses `z_{N, n}` as `N` midpoints of
   the quantiles of `p(z)` (generalized quantiles if `K > 2`).
 
-  See [1] for more details.
-
-  [1]. "Quadrature Compound: An approximating family of distributions"
-       Joshua Dillon, Ian Langmore, arXiv preprints
-       https://arxiv.org/abs/1801.03080
+  See [Dillon and Langmore (2018)][1] for more details.
 
   #### About `Vector` distributions in TensorFlow.
 
@@ -313,6 +309,13 @@ class VectorDiffeomixture(distribution_lib.Distribution):
             is_positive_definite=True),
       ],
       validate_args=True)
+  ```
+
+  #### References
+
+  [1]: Joshua Dillon and Ian Langmore. Quadrature Compound: An approximating
+       family of distributions. _arXiv preprint arXiv:1801.03080_, 2018.
+       https://arxiv.org/abs/1801.03080
   """
 
   def __init__(self,
diff --git a/tensorflow/contrib/distributions/python/ops/vector_student_t.py b/tensorflow/contrib/distributions/python/ops/vector_student_t.py
index 8c67647a618d22a58428d78865c4ebf7d98bdf9e..887981d64ef077e2636f8031581c390f177edac8 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_student_t.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_student_t.py
@@ -66,7 +66,7 @@ class _VectorStudentT(transformed_distribution.TransformedDistribution):
   This distribution is an Affine transformation of iid
   [Student's t-distributions](
   https://en.wikipedia.org/wiki/Student%27s_t-distribution)
-  and should not be confused with the [Multivate Student's t-distribution](
+  and should not be confused with the [Multivariate Student's t-distribution](
   https://en.wikipedia.org/wiki/Multivariate_t-distribution). The
   traditional Multivariate Student's t-distribution is type of
   [elliptical distribution](
diff --git a/tensorflow/contrib/distributions/python/ops/wishart.py b/tensorflow/contrib/distributions/python/ops/wishart.py
index e4ac65012b9c7e3ed5ada3ed75020f3905740156..5a8c94dabf4c3c430bee544a48ee7acfe7dd7ed0 100644
--- a/tensorflow/contrib/distributions/python/ops/wishart.py
+++ b/tensorflow/contrib/distributions/python/ops/wishart.py
@@ -228,9 +228,12 @@ class _WishartLinearOperator(distribution.Distribution):
     # Complexity: O(nbk)
     # This parametrization is equivalent to Chi2, i.e.,
     # ChiSquared(k) == Gamma(alpha=k/2, beta=1/2)
+    expanded_df = self.df * array_ops.ones(
+        self.scale_operator.batch_shape_tensor(),
+        dtype=self.df.dtype.base_dtype)
     g = random_ops.random_gamma(shape=[n],
                                 alpha=self._multi_gamma_sequence(
-                                    0.5 * self.df, self.dimension),
+                                    0.5 * expanded_df, self.dimension),
                                 beta=0.5,
                                 dtype=self.dtype,
                                 seed=distribution_util.gen_new_seed(
diff --git a/tensorflow/contrib/eager/README.md b/tensorflow/contrib/eager/README.md
index 9d2ca07c3a25fa7acb9b0f5806b763d9a57b51fa..9a3b780af888a597d2440b243ffb8dc98d764f18 100644
--- a/tensorflow/contrib/eager/README.md
+++ b/tensorflow/contrib/eager/README.md
@@ -1,12 +1,8 @@
 # Eager Execution
 
-> *WARNING*: This is a preview/pre-alpha version. The API and performance
-> characteristics are subject to change.
-
-Eager execution is an experimental interface to TensorFlow that provides an
-imperative programming style (à la [NumPy](http://www.numpy.org)). When you
-enable eager execution, TensorFlow operations execute immediately; you do not
-execute a pre-constructed graph with
+Eager execution provides an imperative interface to TensorFlow (similiar to
+[NumPy](http://www.numpy.org)). When you enable eager execution, TensorFlow
+operations execute immediately; you do not execute a pre-constructed graph with
 [`Session.run()`](https://www.tensorflow.org/api_docs/python/tf/Session).
 
 For example, consider a simple computation in TensorFlow:
@@ -33,7 +29,7 @@ print(m)
 ## Caveats
 
 This feature is in early stages and work remains to be done in terms of smooth
-support for distributed and multi-GPU training and CPU performance.
+support for distributed and multi-GPU training and performance.
 
 - [Known issues](https://github.com/tensorflow/tensorflow/issues?q=is%3Aissue%20is%3Aopen%20label%3Acomp%3Aeager)
 - Feedback is welcome, please consider
@@ -41,21 +37,23 @@ support for distributed and multi-GPU training and CPU performance.
 
 ## Installation
 
-Eager execution is included in TensorFlow versions 1.5 and above.
+Eager execution is included in TensorFlow versions 1.7 and above.
 Installation instructions at https://www.tensorflow.org/install/
 
 ## Documentation
 
 For an introduction to eager execution in TensorFlow, see:
 
-- [User Guide](python/g3doc/guide.md)
+- [User Guide](https://www.tensorflow.org/programmers_guide/eager) ([source](../../docs_src/programmers_guide/eager.md))
 - Notebook: [Basic Usage](python/examples/notebooks/1_basics.ipynb)
 - Notebook: [Gradients](python/examples/notebooks/2_gradients.ipynb)
 - Notebook: [Importing Data](python/examples/notebooks/3_datasets.ipynb)
 
 ## Changelog
 
-- 2017/10/31: Initial preview release.
+- 2017/10/31: Initial preview release (in TensorFlow 1.5)
 - 2017/12/01: Example of dynamic neural network:
   [SPINN: Stack-augmented Parser-Interpreter Neural Network](https://arxiv.org/abs/1603.06021).
   See [README.md](python/examples/spinn/README.md) for details.
+- 2017/03: Core functionality moved out of the experimental tf.contrib namespace
+  in TensorFlow 1.7.
diff --git a/tensorflow/contrib/eager/proto/BUILD b/tensorflow/contrib/eager/proto/BUILD
index aedfec8924e7314addd22349c0576a84a58d9aa3..b016d2dcb504044372c895e1eedf3511751bc13e 100644
--- a/tensorflow/contrib/eager/proto/BUILD
+++ b/tensorflow/contrib/eager/proto/BUILD
@@ -4,17 +4,6 @@ exports_files(["LICENSE"])
 
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tf_proto_library(
     name = "checkpointable_object_graph_proto",
     srcs = [
diff --git a/tensorflow/contrib/eager/python/BUILD b/tensorflow/contrib/eager/python/BUILD
index 384ef7f9630647714b77825b54b3b8a3abdfa6f3..edb9130266e4ea93d2ec6ee373a90df504da18cf 100644
--- a/tensorflow/contrib/eager/python/BUILD
+++ b/tensorflow/contrib/eager/python/BUILD
@@ -70,6 +70,7 @@ cuda_py_test(
     srcs = ["datasets_test.py"],
     additional_deps = [
         ":datasets",
+        ":checkpointable_utils",
         "//tensorflow/contrib/data/python/ops:transformation_ops",
         "//tensorflow/contrib/lookup:lookup_py",
         "//tensorflow/python:dtypes",
@@ -79,6 +80,7 @@ cuda_py_test(
         "//tensorflow/python/data",
         "//tensorflow/python/eager:test",
     ],
+    tags = ["noguitar"],
 )
 
 py_library(
@@ -232,12 +234,15 @@ py_library(
         "//tensorflow/python:constant_op",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:init_ops",
-        "//tensorflow/python:io_ops",
+        "//tensorflow/python:pywrap_tensorflow",
         "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:session",
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python:training",
+        "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/eager:context",
     ],
@@ -267,20 +272,8 @@ cuda_py_test(
         "//tensorflow/python/keras",
     ],
     tags = [
+        "no_oss",  # b/74395663
         "no_windows",  # TODO: needs investigation on Windows
         "notsan",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "g3doc/sitemap.md",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/eager/python/checkpointable_utils.py b/tensorflow/contrib/eager/python/checkpointable_utils.py
index d07121df635cc95402a4811f810007807dfa0c37..34cb8d0e0887bd5e440873bae117bf27597de11b 100644
--- a/tensorflow/contrib/eager/python/checkpointable_utils.py
+++ b/tensorflow/contrib/eager/python/checkpointable_utils.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 import abc
 import collections
+import functools
 import weakref
 
 from tensorflow.contrib.eager.proto import checkpointable_object_graph_pb2
@@ -32,7 +33,6 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import io_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.training import checkpointable as core_checkpointable
@@ -220,12 +220,16 @@ def _serialize_checkpointables(
     object_proto = object_graph_proto.nodes.add()
     object_proto.slot_variables.extend(slot_variables.get(checkpointable, ()))
     object_name = object_names[checkpointable]
-    for name, saveable in (
+    for name, saveable_factory in (
         checkpointable._gather_saveables_for_checkpoint().items()):  # pylint: disable=protected-access
       attribute = object_proto.attributes.add()
       attribute.name = name
       attribute.checkpoint_key = "%s/%s/%s" % (
           object_name, _OBJECT_ATTRIBUTES_NAME, _escape_local_name(name))
+      if callable(saveable_factory):
+        saveable = saveable_factory(name=attribute.checkpoint_key)
+      else:
+        saveable = saveable_factory
       # Figure out the name-based Saver's name for this variable.
       saver_dict = saver_lib.BaseSaverBuilder.OpListToDict(
           [saveable], convert_variable_to_tensor=False)
@@ -519,6 +523,18 @@ class _SessionWithFeedDictAdditions(session_lib.SessionInterface):
         fetches=fetches, feed_dict=feed_dict, **kwargs)
 
 
+def _copy_saver_with_new_var_list(old_saver, new_var_list):
+  """Copy a `tf.train.Saver`'s state to a new Saver with different variables."""
+  new_saver = saver_lib.Saver(var_list=new_var_list)
+  # TODO(allenl): Move to copying functionality to Saver?
+  # pylint: disable=protected-access
+  new_saver._last_checkpoints = old_saver._last_checkpoints
+  new_saver._checkpoints_to_be_deleted = old_saver._checkpoints_to_be_deleted
+  new_saver._next_checkpoint_time = old_saver._next_checkpoint_time
+  # pylint: enable=protected-access
+  return new_saver
+
+
 class CheckpointableSaver(object):
   """Saves and restores a `Checkpointable` object and its dependencies.
 
@@ -561,7 +577,6 @@ class CheckpointableSaver(object):
     self._last_save_saver = None
 
     # Op caching for restore
-    self._object_graph_restore_tensor = None
     self._last_restore_object_graph = None
     self._last_restore_checkpoint = None
 
@@ -598,8 +613,7 @@ class CheckpointableSaver(object):
     """
     named_variables, graph_proto = _serialize_object_graph(
         self._root_checkpointable)
-    in_graph_mode = not context.executing_eagerly()
-    if in_graph_mode:
+    if not context.executing_eagerly():
       if session is None:
         session = ops.get_default_session()
       if self._object_graph_feed_tensor is None:
@@ -618,21 +632,20 @@ class CheckpointableSaver(object):
     named_variables[_OBJECT_GRAPH_PROTO_KEY] = _NoRestoreSaveable(
         tensor=object_graph_tensor,
         name=_OBJECT_GRAPH_PROTO_KEY)
-    if not in_graph_mode or self._last_save_object_graph != graph_proto:
-      if self._last_save_object_graph is not None and in_graph_mode:
-        raise NotImplementedError(
-            "Using a single Saver to save a mutated object graph is not "
-            "currently supported when graph building. Use a different Saver "
-            "when the object graph changes (save ops will be duplicated), or "
-            "file a feature request if this limitation bothers you.")
-      saver = saver_lib.Saver(var_list=named_variables)
-      if in_graph_mode:
-        self._last_save_saver = saver
-        self._last_save_object_graph = graph_proto
-    else:
-      saver = self._last_save_saver
+    if (self._last_save_object_graph != graph_proto
+        # When executing eagerly, we need to re-create SaveableObjects each time
+        # save() is called so they pick up new Tensors passed to their
+        # constructors. That means the Saver needs to be copied with a new
+        # var_list.
+        or context.executing_eagerly()):
+      if self._last_save_object_graph is not None:
+        self._last_save_saver = _copy_saver_with_new_var_list(
+            old_saver=self._last_save_saver, new_var_list=named_variables)
+      else:
+        self._last_save_saver = saver_lib.Saver(var_list=named_variables)
+      self._last_save_object_graph = graph_proto
     with ops.device("/cpu:0"):
-      save_path = saver.save(
+      save_path = self._last_save_saver.save(
           sess=_SessionWithFeedDictAdditions(
               session=session, feed_additions=feed_additions),
           save_path=file_prefix,
@@ -651,7 +664,7 @@ class CheckpointableSaver(object):
             attribute_proto.checkpoint_key]
     return saver_names
 
-  def restore(self, save_path, session=None):
+  def restore(self, save_path):
     """Restore a training checkpoint.
 
     Restores `root_checkpointable` and any objects that it tracks
@@ -661,8 +674,7 @@ class CheckpointableSaver(object):
     constructor after this call will be matched if they have a corresponding
     object in the checkpoint.
 
-    When building a graph, restorations are added to the graph but not run. A
-    session is required to retrieve checkpoint metadata.
+    When building a graph, restorations are added to the graph but not run.
 
     To disallow deferred loading, assert immediately that all checkpointed
     variables have been matched to variable objects:
@@ -700,9 +712,6 @@ class CheckpointableSaver(object):
         object which may run initializers for objects in the dependency
         graph. If the checkpoint was written by the name-based `tf.train.Saver`,
         names are used to match variables.
-      session: The session to retrieve metadata with. Ignored when executing
-        eagerly. If not provided when graph building, the default session is
-        used.
 
     Returns:
       A load status object, which can be used to make assertions about the
@@ -717,32 +726,15 @@ class CheckpointableSaver(object):
       return InitializationOnlyStatus(self._root_checkpointable)
     in_graph_mode = not context.executing_eagerly()
     if in_graph_mode:
-      if session is None:
-        session = ops.get_default_session()
       file_prefix_tensor = self._file_prefix_placeholder
       file_prefix_feed_dict = {self._file_prefix_placeholder: save_path}
     else:
-      session = None
       with ops.device("/cpu:0"):
         file_prefix_tensor = constant_op.constant(save_path)
       file_prefix_feed_dict = None
+    reader = pywrap_tensorflow.NewCheckpointReader(save_path)
     try:
-      if not in_graph_mode or self._object_graph_restore_tensor is None:
-        with ops.device("/cpu:0"):
-          object_graph_string, = io_ops.restore_v2(
-              prefix=file_prefix_tensor,
-              tensor_names=[_OBJECT_GRAPH_PROTO_KEY],
-              shape_and_slices=[""],
-              dtypes=[dtypes.string],
-              name="object_graph_proto_read")
-        if in_graph_mode:
-          self._object_graph_restore_tensor = object_graph_string
-      if in_graph_mode:
-        object_graph_string = session.run(
-            self._object_graph_restore_tensor,
-            feed_dict=file_prefix_feed_dict)
-      else:
-        object_graph_string = object_graph_string.numpy()
+      object_graph_string = reader.get_tensor(_OBJECT_GRAPH_PROTO_KEY)
     except errors_impl.NotFoundError:
       # The object graph proto does not exist in this checkpoint. Try again with
       # name-based saving.
@@ -757,7 +749,6 @@ class CheckpointableSaver(object):
       if in_graph_mode:
         dtype_map = None
       else:
-        reader = pywrap_tensorflow.NewCheckpointReader(save_path)
         dtype_map = reader.get_variable_to_dtype_map()
       checkpoint = core_checkpointable_utils._Checkpoint(  # pylint: disable=protected-access
           object_graph_proto=object_graph_proto,
@@ -877,3 +868,115 @@ class Checkpoint(core_checkpointable.Checkpointable):
     # initialization when executing eagerly.
     self._maybe_create_save_counter()
     return status
+
+
+class _CallbackSaveable(saver_lib.BaseSaverBuilder.SaveableObject):
+  """Wraps save and restore callbacks as a `SaveableObject`."""
+
+  def __init__(self, name, dtype, save_callback, restore_callback):
+    self._restore_callback = restore_callback
+    spec = saver_lib.BaseSaverBuilder.SaveSpec(
+        tensor=save_callback,
+        slice_spec="",
+        name=name,
+        dtype=dtype)
+    super(_CallbackSaveable, self).__init__(
+        save_callback, [spec], name)
+
+  def restore(self, restored_tensors, restored_shapes):
+    """Restore the same value into both variables."""
+    tensor, = restored_tensors
+    return self._restore_callback(tensor)
+
+
+class _SplitDependency(core_checkpointable.CheckpointableBase):
+  """Looks like a regular variable while synchronizing save/restores."""
+
+  def __init__(self, save_buffer, restore_buffer, name, dtype, num_components,
+               fill_save_buffer_fn, consume_restore_buffer_fn):
+    self._save_buffer = save_buffer
+    self._restore_buffer = restore_buffer
+    self._name = name
+    self._dtype = dtype
+    self._num_components = num_components
+    self._fill_save_buffer_fn = fill_save_buffer_fn
+    self._consume_restore_buffer_fn = consume_restore_buffer_fn
+
+  def _save(self):
+    """Pull from the shared buffer, populating it if necessary."""
+    if self._name not in self._save_buffer:
+      if self._save_buffer:
+        raise AssertionError(
+            ("Split dependency %s (%s) unsynchronized. Split dependencies must "
+             "be saved together.") % (self._name, self))
+      self._fill_save_buffer_fn(self._save_buffer)
+    return self._save_buffer.pop(self._name)
+
+  def _restore(self, tensor):
+    """Push into the shared buffer, flushing it if necessary."""
+    if self._name in self._restore_buffer:
+      raise AssertionError(
+          ("Split dependency %s (%s) unsynchronized. Split dependencies must "
+           "be restored together.") % (self._name, self))
+    self._restore_buffer[self._name] = tensor
+    if len(self._restore_buffer) == self._num_components:
+      op = self._consume_restore_buffer_fn(self._restore_buffer)
+      self._restore_buffer.clear()
+      return op
+    else:
+      return control_flow_ops.no_op()
+
+  def _gather_saveables_for_checkpoint(self):
+    """Looks to Checkpointable like a regular variable."""
+    return {
+        core_checkpointable.VARIABLE_VALUE_KEY:
+        functools.partial(_CallbackSaveable,
+                          dtype=self._dtype,
+                          save_callback=self._save,
+                          restore_callback=self._restore)
+    }
+
+
+def split_dependency(component_names, component_dtypes,
+                     fill_save_buffer_fn, consume_restore_buffer_fn):
+  """Creates multiple dependencies with a synchronized save/restore.
+
+  Useful when a single op produces `Tensor`s which should each be saved under
+  different objects, or when `Tensor`s saved with many different objects need to
+  be restored together as inputs to a single op (i.e. an object which uses a
+  single fused op may be swapped out for a subgraph of objects, and these two
+  programs are checkpoint compatible).
+
+  Args:
+    component_names: A sequence of names for the split
+      dependencies. `fill_save_buffer_fn` must add these keys to the dictionary
+      it is passed, and `consume_restore_buffer_fn` will receive a dictionary
+      with these keys.
+    component_dtypes: Data types for the `Tensor`s being saved and restored, a
+      sequence corresponding to `component_names`.
+    fill_save_buffer_fn: A function which takes an empty dictionary as an
+      argument and adds `Tensor`s with `component_names` as keys. These
+      `Tensor`s will be saved as if they were individual variables.
+    consume_restore_buffer_fn: A function which takes a dictionary with
+      `component_names` as keys mapping to restored individual `Tensor`s and
+      returns a restore op (or if executing eagerly, runs the restoration and
+      may return `None`).
+
+  Returns:
+    A dictionary mapping from names to Checkpointable objects. If one is
+    reachable from an object as a dependency, the others should be too; adding
+    dependencies on some but not all of the objects will result in errors.
+  """
+  save_buffer = {}
+  restore_buffer = {}
+  split_dependencies = {}
+  for name, dtype in zip(component_names, component_dtypes):
+    split_dependencies[name] = _SplitDependency(
+        save_buffer=save_buffer,
+        restore_buffer=restore_buffer,
+        name=name,
+        dtype=dtype,
+        num_components=len(component_names),
+        fill_save_buffer_fn=fill_save_buffer_fn,
+        consume_restore_buffer_fn=consume_restore_buffer_fn)
+  return split_dependencies
diff --git a/tensorflow/contrib/eager/python/checkpointable_utils_test.py b/tensorflow/contrib/eager/python/checkpointable_utils_test.py
index 2054878bf861553bb6cfa8d3730fa2070cf6b8bb..891c093a0f667deca6c26c453a83eca7305166a0 100644
--- a/tensorflow/contrib/eager/python/checkpointable_utils_test.py
+++ b/tensorflow/contrib/eager/python/checkpointable_utils_test.py
@@ -23,14 +23,18 @@ import six
 
 from tensorflow.contrib.eager.python import checkpointable_utils
 from tensorflow.python.client import session as session_lib
+from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
+from tensorflow.python.keras._impl.keras.engine import sequential
 from tensorflow.python.keras._impl.keras.engine import training
 from tensorflow.python.layers import core
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
@@ -66,6 +70,87 @@ class MyModel(training.Model):
     return ret
 
 
+def _split_variable_closure(variable):
+  def _fill_save_buffer_fn(save_buffer):
+    save_buffer["first_half"] = variable[:2]
+    save_buffer["second_half"] = variable[2:]
+  return _fill_save_buffer_fn
+
+
+def _combine_variable_closure(variable):
+  def _consume_restore_buffer_fn(restore_buffer):
+    return variable.assign(
+        array_ops.concat([restore_buffer["first_half"],
+                          restore_buffer["second_half"]],
+                         axis=0))
+  return _consume_restore_buffer_fn
+
+
+class SaveTensorSlicesAsDeps(checkpointable.CheckpointableBase):
+
+  def __init__(self):
+    self.combined = resource_variable_ops.ResourceVariable([0., 0., 0., 0.])
+    split_dependencies = checkpointable_utils.split_dependency(
+        component_names=("first_half", "second_half"),
+        component_dtypes=(self.combined.dtype,) * 2,
+        fill_save_buffer_fn=_split_variable_closure(
+            self.combined),
+        consume_restore_buffer_fn=_combine_variable_closure(
+            self.combined))
+    for name, dep in split_dependencies.items():
+      self._track_checkpointable(dep, name=name)
+
+
+class HasRegularDeps(checkpointable.Checkpointable):
+
+  def __init__(self):
+    self.first_half = resource_variable_ops.ResourceVariable([0., 0.])
+    self.second_half = resource_variable_ops.ResourceVariable([0., 0.])
+
+
+class OnlyOneDep(checkpointable.Checkpointable):
+
+  def __init__(self):
+    self.first_half = resource_variable_ops.ResourceVariable([0., 0.])
+
+
+class SplitTests(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  def testSaveRestoreSplitDep(self):
+    save_checkpoint = checkpointable_utils.Checkpoint(
+        dep=SaveTensorSlicesAsDeps())
+    self.evaluate(save_checkpoint.dep.combined.assign([1., 2., 3., 4.]))
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    save_path = save_checkpoint.save(checkpoint_prefix)
+
+    regular_deps = HasRegularDeps()
+    regular_restore_checkpoint = checkpointable_utils.Checkpoint(
+        dep=regular_deps)
+    regular_restore_checkpoint.restore(
+        save_path).assert_consumed().run_restore_ops()
+    self.assertAllEqual([1., 2.], self.evaluate(regular_deps.first_half))
+    self.assertAllEqual([3., 4.], self.evaluate(regular_deps.second_half))
+
+    one_dep = OnlyOneDep()
+    one_dep_restore_checkpoint = checkpointable_utils.Checkpoint(dep=one_dep)
+    status = one_dep_restore_checkpoint.restore(save_path)
+    with self.assertRaises(AssertionError):
+      # Missing the second dependency.
+      status.assert_consumed()
+    status.run_restore_ops()
+    self.assertAllEqual([1., 2.], self.evaluate(one_dep.first_half))
+
+    restore_checkpoint = checkpointable_utils.Checkpoint()
+    status = restore_checkpoint.restore(save_path)
+    restore_checkpoint.dep = SaveTensorSlicesAsDeps()
+    status.assert_consumed().run_restore_ops()
+    self.assertAllEqual(
+        [1., 2., 3., 4.],
+        self.evaluate(restore_checkpoint.dep.combined))
+
+
 class InterfaceTests(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
@@ -152,6 +237,50 @@ class InterfaceTests(test.TestCase):
     self.assertAllEqual([1., 1., 1.], self.evaluate(v2))
 
 
+class _MirroringSaveable(core_saver.BaseSaverBuilder.SaveableObject):
+
+  def __init__(self, primary_variable, mirrored_variable, name):
+    self._primary_variable = primary_variable
+    self._mirrored_variable = mirrored_variable
+    tensor = self._primary_variable.read_value()
+    spec = core_saver.BaseSaverBuilder.SaveSpec(
+        tensor=tensor,
+        slice_spec="",
+        name=name)
+    super(_MirroringSaveable, self).__init__(
+        tensor, [spec], name)
+
+  def restore(self, restored_tensors, restored_shapes):
+    """Restore the same value into both variables."""
+    tensor, = restored_tensors
+    return control_flow_ops.group(
+        self._primary_variable.assign(tensor),
+        self._mirrored_variable.assign(tensor))
+
+
+class _OwnsMirroredVariables(checkpointable.CheckpointableBase):
+  """A Checkpointable object which returns a more complex SaveableObject."""
+
+  def __init__(self):
+    self.non_dep_variable = variable_scope.get_variable(
+        name="non_dep_variable", initializer=6., use_resource=True)
+    self.mirrored = variable_scope.get_variable(
+        name="mirrored", initializer=15., use_resource=True)
+
+  def _gather_saveables_for_checkpoint(self):
+    def _saveable_factory(name=self.non_dep_variable.name):
+      return _MirroringSaveable(
+          primary_variable=self.non_dep_variable,
+          mirrored_variable=self.mirrored,
+          name=name)
+    return {checkpointable.VARIABLE_VALUE_KEY: _saveable_factory}
+
+  # The Saver sorts by name before parsing, so we need a name property.
+  @property
+  def name(self):
+    return self.non_dep_variable.name
+
+
 class CheckpointingTests(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
@@ -261,6 +390,42 @@ class CheckpointingTests(test.TestCase):
             optimizer_node.slot_variables[0]
             .slot_variable_node_id].attributes[0].checkpoint_key)
 
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  def testMoreComplexSaveableReturned(self):
+    v = _OwnsMirroredVariables()
+    checkpoint = checkpointable_utils.Checkpoint(v=v)
+    test_dir = self.get_temp_dir()
+    prefix = os.path.join(test_dir, "ckpt")
+    self.evaluate(v.non_dep_variable.assign(42.))
+    save_path = checkpoint.save(prefix)
+    self.evaluate(v.non_dep_variable.assign(43.))
+    self.evaluate(v.mirrored.assign(44.))
+    checkpoint.restore(save_path).assert_consumed().initialize_or_restore()
+    self.assertEqual(42., self.evaluate(v.non_dep_variable))
+    self.assertEqual(42., self.evaluate(v.mirrored))
+    self.evaluate(v.non_dep_variable.assign(44.))
+    save_path = checkpoint.save(prefix)
+    self.evaluate(v.non_dep_variable.assign(45.))
+    checkpoint.restore(save_path).assert_consumed().initialize_or_restore()
+    self.assertEqual(44., self.evaluate(v.non_dep_variable))
+    self.assertEqual(44., self.evaluate(v.mirrored))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testMoreComplexSaveableReturnedWithGlobalName(self):
+    # The same object can also be saved using the name-based saver.
+    v = _OwnsMirroredVariables()
+    saver = core_saver.Saver(var_list=[v])
+    test_dir = self.get_temp_dir()
+    prefix = os.path.join(test_dir, "ckpt")
+    self.evaluate(v.non_dep_variable.assign(42.))
+    with self.test_session() as sess:
+      save_path = saver.save(sess, prefix)
+      self.evaluate(v.non_dep_variable.assign(43.))
+      self.evaluate(v.mirrored.assign(44.))
+      saver.restore(sess, save_path)
+      self.assertEqual(42., self.evaluate(v.non_dep_variable))
+      self.assertEqual(42., self.evaluate(v.mirrored))
+
   @test_util.run_in_graph_and_eager_modes()
   def testSaveRestore(self):
     model = MyModel()
@@ -296,7 +461,11 @@ class CheckpointingTests(test.TestCase):
     if not context.executing_eagerly():
       return  # Restore-on-create is only supported when executing eagerly
     on_create_model = MyModel()
-    on_create_optimizer = adam.AdamOptimizer(0.001)
+    on_create_optimizer = adam.AdamOptimizer(
+        0.001,
+        # Preserve beta1_power and beta2_power when appying gradients so we can
+        # test that they've been restored correctly.
+        beta1=1.0, beta2=1.0)
     on_create_root = checkpointable_utils.Checkpoint(
         optimizer=on_create_optimizer, model=on_create_model)
     # Deferred restoration
@@ -313,8 +482,8 @@ class CheckpointingTests(test.TestCase):
     self.assertAllEqual([1.5], self.evaluate(on_create_m_bias_slot))
     self.assertAllEqual(optimizer_variables[2:],
                         self.evaluate(on_create_optimizer.variables()))
-    on_create_optimizer._create_slots(
-        [resource_variable_ops.ResourceVariable([1.])])
+    dummy_var = resource_variable_ops.ResourceVariable([1.])
+    on_create_optimizer.minimize(loss=dummy_var.read_value)
     status.assert_consumed()
     beta1_power, beta2_power = on_create_optimizer._get_beta_accumulators()
     self.assertAllEqual(optimizer_variables[0], self.evaluate(beta1_power))
@@ -452,6 +621,35 @@ class CheckpointingTests(test.TestCase):
     name, = named_variables.keys()
     self.assertEqual(name, "..ATTRIBUTES/a/.ATTRIBUTES/VARIABLE_VALUE")
 
+  def testAnonymousVarsInInit(self):
+
+    class Model(training.Model):
+
+      def __init__(self):
+        super(Model, self).__init__()
+        self.w = resource_variable_ops.ResourceVariable(0.0)
+        self.b = resource_variable_ops.ResourceVariable(0.0)
+        self.vars = [self.w, self.b]
+
+      def call(self, x):
+        return x * self.w + self.b
+
+    with context.eager_mode():
+      model = Model()
+      optimizer = adam.AdamOptimizer(learning_rate=0.05)
+      checkpoint_directory = self.get_temp_dir()
+      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+      checkpoint = checkpointable_utils.Checkpoint(
+          model=model, optimizer=optimizer)
+      for _ in range(2):
+        checkpoint.save(checkpoint_prefix)
+        with backprop.GradientTape() as tape:
+          loss = (constant_op.constant(1.)
+                  - model(constant_op.constant(1.))) ** 2
+        grad = tape.gradient(loss, model.vars)
+        optimizer.apply_gradients(
+            [(g, v) for g, v in zip(grad, model.vars)])
+
   @test_util.run_in_graph_and_eager_modes()
   def testLateDependencyTracking(self):
 
@@ -778,6 +976,72 @@ class CheckpointingTests(test.TestCase):
         saver.save(checkpoint_prefix)
         self.assertEqual(before_ops, graph.get_operations())
 
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  def testCheckpointCleanup(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    obj = checkpointable.Checkpointable()
+    obj.var = variable_scope.get_variable(name="v", initializer=0.)
+    self.evaluate(checkpointable_utils.gather_initializers(obj))
+    saver = checkpointable_utils.Checkpoint(obj=obj)
+    for _ in range(10):
+      saver.save(checkpoint_prefix)
+    expected_filenames = ["checkpoint"]
+    for checkpoint_number in range(6, 11):
+      expected_filenames.append("ckpt-%d.index" % (checkpoint_number,))
+      expected_filenames.append(
+          "ckpt-%d.data-00000-of-00001" % (checkpoint_number,))
+    six.assertCountEqual(
+        self,
+        expected_filenames,
+        os.listdir(checkpoint_directory))
+
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  def testCheckpointCleanupChangingVarList(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    obj = checkpointable.Checkpointable()
+    obj.var = variable_scope.get_variable(name="v", initializer=0.)
+    self.evaluate(checkpointable_utils.gather_initializers(obj))
+    checkpoint = checkpointable_utils.Checkpoint(obj=obj)
+    looped_variables = []
+    for iteration in range(10):
+      new_variable = resource_variable_ops.ResourceVariable(iteration)
+      self.evaluate(new_variable.initializer)
+      setattr(checkpoint, "var_%d" % iteration, new_variable)
+      checkpoint.save(checkpoint_prefix)
+      looped_variables.append(new_variable)
+    expected_filenames = ["checkpoint"]
+    # We've copied the saver each time, but checkpoint management should still
+    # be consistent.
+    for checkpoint_number in range(6, 11):
+      expected_filenames.append("ckpt-%d.index" % (checkpoint_number,))
+      expected_filenames.append(
+          "ckpt-%d.data-00000-of-00001" % (checkpoint_number,))
+    six.assertCountEqual(
+        self,
+        expected_filenames,
+        os.listdir(checkpoint_directory))
+    for v in looped_variables:
+      self.evaluate(v.assign(314))
+    checkpoint.restore(checkpoint_prefix + "-6").run_restore_ops()
+    self.assertEqual(314, self.evaluate(checkpoint.var_9))
+    self.assertEqual(314, self.evaluate(checkpoint.var_8))
+    self.assertEqual(314, self.evaluate(checkpoint.var_6))
+    self.assertEqual(5, self.evaluate(checkpoint.var_5))
+    self.assertEqual(1, self.evaluate(checkpoint.var_1))
+    self.assertEqual(0, self.evaluate(checkpoint.var_0))
+    if context.executing_eagerly():
+      checkpoint.restore(checkpoint_prefix + "-10").run_restore_ops()
+      self.assertEqual(9, self.evaluate(checkpoint.var_9))
+      self.assertEqual(8, self.evaluate(checkpoint.var_8))
+      self.assertEqual(1, self.evaluate(checkpoint.var_1))
+      self.assertEqual(0, self.evaluate(checkpoint.var_0))
+    else:
+      # Restoring into modified graphs is an error while graph building.
+      with self.assertRaises(NotImplementedError):
+        checkpoint.restore(checkpoint_prefix + "-10").run_restore_ops()
+
   def testManyRestoresGraph(self):
     """Restores after the first should not modify the graph."""
     with context.graph_mode():
@@ -855,6 +1119,38 @@ class CheckpointingTests(test.TestCase):
         beta1_power, _ = optimizer._get_beta_accumulators()
         self.assertAllEqual(3., self.evaluate(beta1_power))
 
+  @test_util.run_in_graph_and_eager_modes()
+  def test_sequential(self):
+    model = sequential.Sequential()
+    checkpoint = checkpointable_utils.Checkpoint(model=model)
+    model.add(core.Dense(4))
+    second_dense = core.Dense(5)
+    model.add(second_dense)
+    model(constant_op.constant([[1.]]))
+    checkpoint.restore(None).initialize_or_restore()
+    self.evaluate(second_dense.bias.assign(
+        constant_op.constant([1., 2., 3., 4., 5.])))
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    save_path = checkpoint.save(checkpoint_prefix)
+    self.evaluate(second_dense.bias.assign(
+        constant_op.constant([5., 6., 7., 8., 9.])))
+    checkpoint.restore(save_path).assert_consumed().run_restore_ops()
+    self.assertAllEqual([1., 2., 3., 4., 5.], self.evaluate(second_dense.bias))
+
+    deferred_sequential = sequential.Sequential()
+    deferred_sequential_checkpoint = checkpointable_utils.Checkpoint(
+        model=deferred_sequential)
+    status = deferred_sequential_checkpoint.restore(save_path)
+    deferred_sequential.add(core.Dense(4))
+    deferred_sequential(constant_op.constant([[1.]]))
+    deferred_second_dense = core.Dense(5)
+    deferred_sequential.add(deferred_second_dense)
+    deferred_sequential(constant_op.constant([[1.]]))
+    status.run_restore_ops()
+    self.assertAllEqual([1., 2., 3., 4., 5.],
+                        self.evaluate(deferred_second_dense.bias))
+
 
 class TemplateTests(test.TestCase):
 
diff --git a/tensorflow/contrib/eager/python/datasets.py b/tensorflow/contrib/eager/python/datasets.py
index 332bada57b42fe53fe6be0de1b39c905c0b32579..99b1e098d57ffcf028e54e7a14c36f7ba178fa45 100644
--- a/tensorflow/contrib/eager/python/datasets.py
+++ b/tensorflow/contrib/eager/python/datasets.py
@@ -31,6 +31,8 @@ from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import gen_dataset_ops
 from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.training import checkpointable
+from tensorflow.python.training.saver import BaseSaverBuilder
 
 _uid_counter = 0
 _uid_lock = threading.Lock()
@@ -44,7 +46,7 @@ def _generate_shared_name(prefix):
   return "{}{}".format(prefix, uid)
 
 
-class Iterator(iterator_ops.EagerIterator):
+class Iterator(iterator_ops.EagerIterator, checkpointable.CheckpointableBase):
   """An iterator producing tf.Tensor objects from a tf.data.Dataset.
 
   NOTE: Unlike the iterator created by the
@@ -96,7 +98,6 @@ class Iterator(iterator_ops.EagerIterator):
             f=remote_fn,
             target_device=target,
             buffer_size=10,
-            thread_pool_size=1,
             container="",
             shared_name=_generate_shared_name("function_buffer_resource"))
         self._buffer_resource_deleter = resource_variable_ops.EagerResourceDeleter(  # pylint: disable=line-too-long
@@ -106,13 +107,44 @@ class Iterator(iterator_ops.EagerIterator):
   def _next_internal(self):
     """Returns a nested structure of `tf.Tensor`s containing the next element.
     """
-    if self._buffer_resource_handle is not None:
-      with ops.device(self._device):
-        ret = prefetching_ops.function_buffering_resource_get_next(
-            function_buffer_resource=self._buffer_resource_handle,
-            output_types=self._flat_output_types)
-      return sparse.deserialize_sparse_tensors(
-          nest.pack_sequence_as(self._output_types, ret), self._output_types,
-          self._output_shapes, self._output_classes)
-    else:
-      return super(Iterator, self)._next_internal()
+    # This runs in sync mode as iterators use an error status to communicate
+    # that there is no more data to iterate over.
+    # TODO(b/77291417): Fix
+    with context.execution_mode(context.SYNC):
+      if self._buffer_resource_handle is not None:
+        with ops.device(self._device):
+          ret = prefetching_ops.function_buffering_resource_get_next(
+              function_buffer_resource=self._buffer_resource_handle,
+              output_types=self._flat_output_types)
+        return sparse.deserialize_sparse_tensors(
+            nest.pack_sequence_as(self._output_types, ret), self._output_types,
+            self._output_shapes, self._output_classes)
+      else:
+        return super(Iterator, self)._next_internal()
+
+  # TODO(shivaniagrawal): Expose checkpointable stateful objects from dataset
+  # attributes(potential).
+
+  class _Saveable(BaseSaverBuilder.SaveableObject):
+    """SaveableObject for saving/restoring iterator state."""
+
+    def __init__(self, iterator_resource, name):
+      serialized_iterator = gen_dataset_ops.serialize_iterator(
+          iterator_resource)
+      specs = [
+          BaseSaverBuilder.SaveSpec(serialized_iterator, "", name + "_STATE")
+      ]
+      # pylint: disable=protected-access
+      super(Iterator._Saveable, self).__init__(iterator_resource, specs, name)
+
+    def restore(self, restored_tensors, restored_shapes):
+      with ops.colocate_with(self.op):
+        return gen_dataset_ops.deserialize_iterator(self.op,
+                                                    restored_tensors[0])
+
+  def _gather_saveables_for_checkpoint(self):
+
+    def _saveable_factory(name):
+      return self._Saveable(self._resource, name)
+
+    return {"ITERATOR": _saveable_factory}
diff --git a/tensorflow/contrib/eager/python/datasets_test.py b/tensorflow/contrib/eager/python/datasets_test.py
index 4afadd88f59a79dde4f3af5175adbbbb18557ced..c658505de41bb6a0007440f4850fef720c3e97f1 100644
--- a/tensorflow/contrib/eager/python/datasets_test.py
+++ b/tensorflow/contrib/eager/python/datasets_test.py
@@ -16,6 +16,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+
 import threading
 import time
 
@@ -24,6 +26,7 @@ import numpy as np
 from tensorflow.contrib import lookup
 from tensorflow.contrib.data.python.ops import threadpool
 from tensorflow.contrib.data.python.ops import unique
+from tensorflow.contrib.eager.python import checkpointable_utils
 from tensorflow.contrib.eager.python import datasets
 from tensorflow.python.data import Dataset
 from tensorflow.python.eager import test
@@ -221,6 +224,61 @@ class IteratorTest(test.TestCase):
       # perform work.
       self.assertLessEqual(len(thread_ids), num_threads)
 
+  def testSaveRestore(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, 'ckpt')
+    dataset = Dataset.from_tensor_slices([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
+    dataset = dataset.map(math_ops.square).batch(2)
+    iterator = datasets.Iterator(dataset)
+    checkpoint = checkpointable_utils.Checkpoint(iterator=iterator)
+    self.assertAllEqual([1, 4], iterator.get_next().numpy())
+    save_path = checkpoint.save(checkpoint_prefix)
+    self.assertAllEqual([9, 16], iterator.get_next().numpy())
+    self.assertAllEqual([25, 36], iterator.get_next().numpy())
+    checkpoint.restore(save_path)
+    self.assertAllEqual([9, 16], iterator.get_next().numpy())
+    self.assertAllEqual([25, 36], iterator.get_next().numpy())
+
+  def testSaveRestoreMultipleIterator(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, 'ckpt')
+    dataset = Dataset.from_tensor_slices([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
+    dataset = dataset.map(math_ops.square).batch(2)
+    iterator_1 = datasets.Iterator(dataset)
+    iterator_2 = datasets.Iterator(dataset)
+    dataset_2 = Dataset.range(10)
+    iterator_3 = datasets.Iterator(dataset_2)
+
+    checkpoint = checkpointable_utils.Checkpoint(
+        iterator_1=iterator_1, iterator_2=iterator_2, iterator_3=iterator_3)
+    self.assertAllEqual([1, 4], iterator_1.get_next().numpy())
+    self.assertEqual(0, iterator_3.get_next().numpy())
+    self.assertEqual(1, iterator_3.get_next().numpy())
+    self.assertEqual(2, iterator_3.get_next().numpy())
+
+    save_path = checkpoint.save(checkpoint_prefix)
+    self.assertAllEqual([1, 4], iterator_2.get_next().numpy())
+    self.assertAllEqual([9, 16], iterator_2.get_next().numpy())
+    self.assertEqual(3, iterator_3.get_next().numpy())
+    checkpoint.restore(save_path)
+    self.assertAllEqual([9, 16], iterator_1.get_next().numpy())
+    self.assertAllEqual([1, 4], iterator_2.get_next().numpy())
+    self.assertEqual(3, iterator_3.get_next().numpy())
+
+  def testRestoreExhaustedIterator(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, 'ckpt')
+    dataset = Dataset.range(3)
+    iterator = datasets.Iterator(dataset)
+
+    checkpoint = checkpointable_utils.Checkpoint(iterator=iterator)
+    self.assertEqual(0, iterator.get_next().numpy())
+    self.assertEqual(1, iterator.get_next().numpy())
+    save_path = checkpoint.save(checkpoint_prefix)
+    self.assertEqual(2, iterator.get_next().numpy())
+    checkpoint.restore(save_path)
+    self.assertEqual(2, iterator.get_next().numpy())
+
 
 class DatasetConstructorBenchmark(test.Benchmark):
 
diff --git a/tensorflow/contrib/eager/python/examples/gan/mnist.py b/tensorflow/contrib/eager/python/examples/gan/mnist.py
index 2b7e199fad08c9a5e320b51b3a4de92c2d7dbb1a..b80c90902353709b7f739585291ec3b5890c27c7 100644
--- a/tensorflow/contrib/eager/python/examples/gan/mnist.py
+++ b/tensorflow/contrib/eager/python/examples/gan/mnist.py
@@ -32,6 +32,7 @@ import tensorflow as tf
 import tensorflow.contrib.eager as tfe
 from tensorflow.examples.tutorials.mnist import input_data
 
+layers = tf.keras.layers
 FLAGS = None
 
 
@@ -56,15 +57,15 @@ class Discriminator(tf.keras.Model):
     else:
       assert data_format == 'channels_last'
       self._input_shape = [-1, 28, 28, 1]
-    self.conv1 = tf.layers.Conv2D(
+    self.conv1 = layers.Conv2D(
         64, 5, padding='SAME', data_format=data_format, activation=tf.tanh)
-    self.pool1 = tf.layers.AveragePooling2D(2, 2, data_format=data_format)
-    self.conv2 = tf.layers.Conv2D(
+    self.pool1 = layers.AveragePooling2D(2, 2, data_format=data_format)
+    self.conv2 = layers.Conv2D(
         128, 5, data_format=data_format, activation=tf.tanh)
-    self.pool2 = tf.layers.AveragePooling2D(2, 2, data_format=data_format)
-    self.flatten = tf.layers.Flatten()
-    self.fc1 = tf.layers.Dense(1024, activation=tf.tanh)
-    self.fc2 = tf.layers.Dense(1, activation=None)
+    self.pool2 = layers.AveragePooling2D(2, 2, data_format=data_format)
+    self.flatten = layers.Flatten()
+    self.fc1 = layers.Dense(1024, activation=tf.tanh)
+    self.fc2 = layers.Dense(1, activation=None)
 
   def call(self, inputs):
     """Return two logits per image estimating input authenticity.
@@ -112,16 +113,16 @@ class Generator(tf.keras.Model):
     else:
       assert data_format == 'channels_last'
       self._pre_conv_shape = [-1, 6, 6, 128]
-    self.fc1 = tf.layers.Dense(6 * 6 * 128, activation=tf.tanh)
+    self.fc1 = layers.Dense(6 * 6 * 128, activation=tf.tanh)
 
     # In call(), we reshape the output of fc1 to _pre_conv_shape
 
     # Deconvolution layer. Resulting image shape: (batch, 14, 14, 64)
-    self.conv1 = tf.layers.Conv2DTranspose(
+    self.conv1 = layers.Conv2DTranspose(
         64, 4, strides=2, activation=None, data_format=data_format)
 
     # Deconvolution layer. Resulting image shape: (batch, 28, 28, 1)
-    self.conv2 = tf.layers.Conv2DTranspose(
+    self.conv2 = layers.Conv2DTranspose(
         1, 2, strides=2, activation=tf.nn.sigmoid, data_format=data_format)
 
   def call(self, inputs):
diff --git a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py
index 6ab847cb78a09ab0a38beefff56f87d8314c0713..4e1380afb2e6e722de65c691d4fbf44621072e87 100644
--- a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py
+++ b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py
@@ -32,6 +32,8 @@ import tensorflow as tf
 
 import tensorflow.contrib.eager as tfe
 
+layers = tf.keras.layers
+
 
 class LinearModel(tf.keras.Model):
   """A TensorFlow linear regression model."""
@@ -39,7 +41,7 @@ class LinearModel(tf.keras.Model):
   def __init__(self):
     """Constructs a LinearModel object."""
     super(LinearModel, self).__init__()
-    self._hidden_layer = tf.layers.Dense(1)
+    self._hidden_layer = layers.Dense(1)
 
   def call(self, xs):
     """Invoke the linear model.
diff --git a/tensorflow/contrib/eager/python/examples/resnet50/resnet50.py b/tensorflow/contrib/eager/python/examples/resnet50/resnet50.py
index 6b59413141f78fc85474850e109454ecdeb68cd3..a28bc8a43d7c90737c9baf9a634d736e9de52948 100644
--- a/tensorflow/contrib/eager/python/examples/resnet50/resnet50.py
+++ b/tensorflow/contrib/eager/python/examples/resnet50/resnet50.py
@@ -28,6 +28,8 @@ import functools
 
 import tensorflow as tf
 
+layers = tf.keras.layers
+
 
 class _IdentityBlock(tf.keras.Model):
   """_IdentityBlock is the block that has no conv layer at shortcut.
@@ -49,23 +51,23 @@ class _IdentityBlock(tf.keras.Model):
     bn_name_base = 'bn' + str(stage) + block + '_branch'
     bn_axis = 1 if data_format == 'channels_first' else 3
 
-    self.conv2a = tf.layers.Conv2D(
+    self.conv2a = layers.Conv2D(
         filters1, (1, 1), name=conv_name_base + '2a', data_format=data_format)
-    self.bn2a = tf.layers.BatchNormalization(
+    self.bn2a = layers.BatchNormalization(
         axis=bn_axis, name=bn_name_base + '2a')
 
-    self.conv2b = tf.layers.Conv2D(
+    self.conv2b = layers.Conv2D(
         filters2,
         kernel_size,
         padding='same',
         data_format=data_format,
         name=conv_name_base + '2b')
-    self.bn2b = tf.layers.BatchNormalization(
+    self.bn2b = layers.BatchNormalization(
         axis=bn_axis, name=bn_name_base + '2b')
 
-    self.conv2c = tf.layers.Conv2D(
+    self.conv2c = layers.Conv2D(
         filters3, (1, 1), name=conv_name_base + '2c', data_format=data_format)
-    self.bn2c = tf.layers.BatchNormalization(
+    self.bn2c = layers.BatchNormalization(
         axis=bn_axis, name=bn_name_base + '2c')
 
   def call(self, input_tensor, training=False):
@@ -113,34 +115,34 @@ class _ConvBlock(tf.keras.Model):
     bn_name_base = 'bn' + str(stage) + block + '_branch'
     bn_axis = 1 if data_format == 'channels_first' else 3
 
-    self.conv2a = tf.layers.Conv2D(
+    self.conv2a = layers.Conv2D(
         filters1, (1, 1),
         strides=strides,
         name=conv_name_base + '2a',
         data_format=data_format)
-    self.bn2a = tf.layers.BatchNormalization(
+    self.bn2a = layers.BatchNormalization(
         axis=bn_axis, name=bn_name_base + '2a')
 
-    self.conv2b = tf.layers.Conv2D(
+    self.conv2b = layers.Conv2D(
         filters2,
         kernel_size,
         padding='same',
         name=conv_name_base + '2b',
         data_format=data_format)
-    self.bn2b = tf.layers.BatchNormalization(
+    self.bn2b = layers.BatchNormalization(
         axis=bn_axis, name=bn_name_base + '2b')
 
-    self.conv2c = tf.layers.Conv2D(
+    self.conv2c = layers.Conv2D(
         filters3, (1, 1), name=conv_name_base + '2c', data_format=data_format)
-    self.bn2c = tf.layers.BatchNormalization(
+    self.bn2c = layers.BatchNormalization(
         axis=bn_axis, name=bn_name_base + '2c')
 
-    self.conv_shortcut = tf.layers.Conv2D(
+    self.conv_shortcut = layers.Conv2D(
         filters3, (1, 1),
         strides=strides,
         name=conv_name_base + '1',
         data_format=data_format)
-    self.bn_shortcut = tf.layers.BatchNormalization(
+    self.bn_shortcut = layers.BatchNormalization(
         axis=bn_axis, name=bn_name_base + '1')
 
   def call(self, input_tensor, training=False):
@@ -219,15 +221,15 @@ class ResNet50(tf.keras.Model):
       return _IdentityBlock(
           3, filters, stage=stage, block=block, data_format=data_format)
 
-    self.conv1 = tf.layers.Conv2D(
+    self.conv1 = layers.Conv2D(
         64, (7, 7),
         strides=(2, 2),
         data_format=data_format,
         padding='same',
         name='conv1')
     bn_axis = 1 if data_format == 'channels_first' else 3
-    self.bn_conv1 = tf.layers.BatchNormalization(axis=bn_axis, name='bn_conv1')
-    self.max_pool = tf.layers.MaxPooling2D(
+    self.bn_conv1 = layers.BatchNormalization(axis=bn_axis, name='bn_conv1')
+    self.max_pool = layers.MaxPooling2D(
         (3, 3), strides=(2, 2), data_format=data_format)
 
     self.l2a = conv_block([64, 64, 256], stage=2, block='a', strides=(1, 1))
@@ -250,11 +252,12 @@ class ResNet50(tf.keras.Model):
     self.l5b = id_block([512, 512, 2048], stage=5, block='b')
     self.l5c = id_block([512, 512, 2048], stage=5, block='c')
 
-    self.avg_pool = tf.layers.AveragePooling2D(
+    self.avg_pool = layers.AveragePooling2D(
         (7, 7), strides=(7, 7), data_format=data_format)
 
     if self.include_top:
-      self.fc1000 = tf.layers.Dense(classes, name='fc1000')
+      self.flatten = layers.Flatten()
+      self.fc1000 = layers.Dense(classes, name='fc1000')
     else:
       reduction_indices = [1, 2] if data_format == 'channels_last' else [2, 3]
       reduction_indices = tf.constant(reduction_indices)
@@ -298,7 +301,7 @@ class ResNet50(tf.keras.Model):
     x = self.avg_pool(x)
 
     if self.include_top:
-      return self.fc1000(tf.layers.flatten(x))
+      return self.fc1000(self.flatten(x))
     elif self.global_pooling:
       return self.global_pooling(x)
     else:
diff --git a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py
index 65dcc53aab39670cae10846b6996c17d7b4c5ba8..d6923293a374f29ab77be70fa9fea44efd1ea40b 100644
--- a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py
+++ b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py
@@ -64,22 +64,29 @@ def train_one_step(model, images, labels, optimizer):
 
 class ResNet50Test(tf.test.TestCase):
 
-  def _apply(self, defun=False):
+  def _apply(self, defun=False, execution_mode=None):
     device, data_format = device_and_data_format()
     model = resnet50.ResNet50(data_format)
     if defun:
       model.call = tfe.defun(model.call)
-    with tf.device(device):
+    with tf.device(device), tfe.execution_mode(execution_mode):
       images, _ = random_batch(2)
       output = model(images, training=False)
+      tfe.async_wait()
     self.assertEqual((2, 1000), output.shape)
 
   def test_apply(self):
     self._apply(defun=False)
 
+  def test_apply_async(self):
+    self._apply(defun=False, execution_mode=tfe.ASYNC)
+
   def test_apply_with_defun(self):
     self._apply(defun=True)
 
+  def test_apply_with_defun_async(self):
+    self._apply(defun=True, execution_mode=tfe.ASYNC)
+
   def test_apply_no_top(self):
     device, data_format = device_and_data_format()
     model = resnet50.ResNet50(data_format, include_top=False)
@@ -98,7 +105,7 @@ class ResNet50Test(tf.test.TestCase):
       output = model(images, training=False)
     self.assertEqual((2, 2048), output.shape)
 
-  def test_train(self):
+  def _test_train(self, execution_mode=None):
     device, data_format = device_and_data_format()
     model = resnet50.ResNet50(data_format)
     tf.train.get_or_create_global_step()
@@ -106,15 +113,22 @@ class ResNet50Test(tf.test.TestCase):
     with tf.contrib.summary.create_file_writer(
         logdir, max_queue=0,
         name='t0').as_default(), tf.contrib.summary.always_record_summaries():
-      with tf.device(device):
+      with tf.device(device), tfe.execution_mode(execution_mode):
         optimizer = tf.train.GradientDescentOptimizer(0.1)
         images, labels = random_batch(2)
         train_one_step(model, images, labels, optimizer)
         self.assertEqual(320, len(model.variables))
+        tfe.async_wait()
     events = summary_test_util.events_from_logdir(logdir)
     self.assertEqual(len(events), 2)
     self.assertEqual(events[1].summary.value[0].tag, 'loss')
 
+  def test_train(self):
+    self._test_train()
+
+  def test_train_async(self):
+    self._test_train(execution_mode=tfe.ASYNC)
+
   def test_no_garbage(self):
     device, data_format = device_and_data_format()
     model = resnet50.ResNet50(data_format)
@@ -183,59 +197,84 @@ class ResNet50Benchmarks(tf.test.Benchmark):
     # a sync. This is a roundabout way, yes.
     tf.constant(1.).cpu()
 
-  def _benchmark_eager_apply(self, label, defun=False):
-    device, data_format = device_and_data_format()
-    model = resnet50.ResNet50(data_format)
-    if defun:
-      model.call = tfe.defun(model.call)
-    batch_size = 64
-    num_burn = 5
-    num_iters = 30
-    with tf.device(device):
-      images, _ = random_batch(batch_size)
-      for _ in xrange(num_burn):
-        model(images, training=False).cpu()
-      gc.collect()
-      start = time.time()
-      for _ in xrange(num_iters):
-        model(images, training=False).cpu()
-      self._report(label, start, num_iters, device, batch_size, data_format)
-
-  def benchmark_eager_apply(self):
-    self._benchmark_eager_apply('eager_apply', defun=False)
-
-  def benchmark_eager_apply_with_defun(self):
-    self._benchmark_eager_apply('eager_apply_with_defun', defun=True)
-
-  def _benchmark_eager_train(self, label, make_iterator, defun=False):
-    device, data_format = device_and_data_format()
-    for batch_size in self._train_batch_sizes():
-      (images, labels) = random_batch(batch_size)
-      num_burn = 3
-      num_iters = 10
+  def _benchmark_eager_apply(self, label, defun=False, execution_mode=None):
+    with tfe.execution_mode(execution_mode):
+      device, data_format = device_and_data_format()
       model = resnet50.ResNet50(data_format)
       if defun:
         model.call = tfe.defun(model.call)
-      optimizer = tf.train.GradientDescentOptimizer(0.1)
-
+      batch_size = 64
+      num_burn = 5
+      num_iters = 30
       with tf.device(device):
-        iterator = make_iterator((images, labels))
+        images, _ = random_batch(batch_size)
         for _ in xrange(num_burn):
-          (images, labels) = iterator.next()
-          train_one_step(model, images, labels, optimizer)
-        self._force_gpu_sync()
+          model(images, training=False).cpu()
+        if execution_mode:
+          tfe.async_wait()
         gc.collect()
-
         start = time.time()
         for _ in xrange(num_iters):
-          (images, labels) = iterator.next()
-          train_one_step(model, images, labels, optimizer)
-        self._force_gpu_sync()
+          model(images, training=False).cpu()
+        if execution_mode:
+          tfe.async_wait()
         self._report(label, start, num_iters, device, batch_size, data_format)
 
+  def benchmark_eager_apply(self):
+    self._benchmark_eager_apply('eager_apply', defun=False)
+
+  def benchmark_eager_apply_async(self):
+    self._benchmark_eager_apply(
+        'eager_apply_async', defun=False, execution_mode=tfe.ASYNC)
+
+  def benchmark_eager_apply_with_defun(self):
+    self._benchmark_eager_apply('eager_apply_with_defun', defun=True)
+
+  def _benchmark_eager_train(self,
+                             label,
+                             make_iterator,
+                             defun=False,
+                             execution_mode=None):
+    with tfe.execution_mode(execution_mode):
+      device, data_format = device_and_data_format()
+      for batch_size in self._train_batch_sizes():
+        (images, labels) = random_batch(batch_size)
+        num_burn = 3
+        num_iters = 10
+        model = resnet50.ResNet50(data_format)
+        if defun:
+          model.call = tfe.defun(model.call)
+        optimizer = tf.train.GradientDescentOptimizer(0.1)
+
+        with tf.device(device):
+          iterator = make_iterator((images, labels))
+          for _ in xrange(num_burn):
+            (images, labels) = iterator.next()
+            train_one_step(model, images, labels, optimizer)
+          if execution_mode:
+            tfe.async_wait()
+          self._force_gpu_sync()
+          gc.collect()
+
+          start = time.time()
+          for _ in xrange(num_iters):
+            (images, labels) = iterator.next()
+            train_one_step(model, images, labels, optimizer)
+          if execution_mode:
+            tfe.async_wait()
+          self._force_gpu_sync()
+          self._report(label, start, num_iters, device, batch_size, data_format)
+
   def benchmark_eager_train(self):
     self._benchmark_eager_train('eager_train', MockIterator, defun=False)
 
+  def benchmark_eager_train_async(self):
+    self._benchmark_eager_train(
+        'eager_train_async',
+        MockIterator,
+        defun=False,
+        execution_mode=tfe.ASYNC)
+
   def benchmark_eager_train_with_defun(self):
     self._benchmark_eager_train(
         'eager_train_with_defun', MockIterator, defun=True)
diff --git a/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py b/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py
index 29f02324544ede172500f799cd84068984d7d87b..492adbe1d80941f9df96d6636e4933d11239408e 100644
--- a/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py
+++ b/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py
@@ -60,6 +60,7 @@ import functools
 import os
 import sys
 import time
+import urllib
 
 import six
 import tensorflow as tf
@@ -72,6 +73,8 @@ try:
 except ImportError:
   HAS_MATPLOTLIB = False
 
+layers = tf.keras.layers
+
 
 def parse(line):
   """Parse a line from the colors dataset."""
@@ -89,13 +92,35 @@ def parse(line):
   return rgb, chars, length
 
 
+def maybe_download(filename, work_directory, source_url):
+  """Download the data from source url, unless it's already here.
+
+  Args:
+    filename: string, name of the file in the directory.
+    work_directory: string, path to working directory.
+    source_url: url to download from if file doesn't exist.
+
+  Returns:
+    Path to resulting file.
+  """
+  if not tf.gfile.Exists(work_directory):
+    tf.gfile.MakeDirs(work_directory)
+  filepath = os.path.join(work_directory, filename)
+  if not tf.gfile.Exists(filepath):
+    temp_file_name, _ = urllib.request.urlretrieve(source_url)
+    tf.gfile.Copy(temp_file_name, filepath)
+    with tf.gfile.GFile(filepath) as f:
+      size = f.size()
+      print("Successfully downloaded", filename, size, "bytes.")
+  return filepath
+
+
 def load_dataset(data_dir, url, batch_size):
   """Loads the colors data at path into a PaddedDataset."""
 
   # Downloads data at url into data_dir/basename(url). The dataset has a header
   # row (color_name, r, g, b) followed by comma-separated lines.
-  path = tf.contrib.learn.datasets.base.maybe_download(
-      os.path.basename(url), data_dir, url)
+  path = maybe_download(os.path.basename(url), data_dir, url)
 
   # This chain of commands loads our data by:
   #   1. skipping the header; (.skip(1))
@@ -129,7 +154,7 @@ class RNNColorbot(tf.keras.Model):
 
     self.cells = self._add_cells(
         [tf.nn.rnn_cell.BasicLSTMCell(size) for size in rnn_cell_sizes])
-    self.relu = tf.layers.Dense(
+    self.relu = layers.Dense(
         label_dimension, activation=tf.nn.relu, name="relu")
 
   def call(self, inputs, training=False):
@@ -181,7 +206,7 @@ class RNNColorbot(tf.keras.Model):
 
   def _add_cells(self, cells):
     # "Magic" required for keras.Model classes to track all the variables in
-    # a list of tf.layers.Layer objects.
+    # a list of layers.Layer objects.
     # TODO(ashankar): Figure out API so user code doesn't have to do this.
     for i, c in enumerate(cells):
       setattr(self, "cell-%d" % i, c)
diff --git a/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py b/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py
index 69cd16d12c32c8c7c4744d8f0b4b1feedf946aa1..a90048d813bf345e8be32e9674a452175471b268 100644
--- a/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py
+++ b/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py
@@ -38,6 +38,8 @@ import tensorflow as tf
 from tensorflow.contrib.cudnn_rnn.python.layers import cudnn_rnn
 from tensorflow.contrib.eager.python import tfe
 
+layers = tf.keras.layers
+
 
 class RNN(tf.keras.Model):
   """A static RNN.
@@ -74,14 +76,14 @@ class RNN(tf.keras.Model):
 
   def _add_cells(self, cells):
     # "Magic" required for keras.Model classes to track all the variables in
-    # a list of tf.layers.Layer objects.
+    # a list of Layer objects.
     # TODO(ashankar): Figure out API so user code doesn't have to do this.
     for i, c in enumerate(cells):
       setattr(self, "cell-%d" % i, c)
     return cells
 
 
-class Embedding(tf.layers.Layer):
+class Embedding(layers.Layer):
   """An Embedding layer."""
 
   def __init__(self, vocab_size, embedding_dim, **kwargs):
@@ -132,7 +134,7 @@ class PTBModel(tf.keras.Model):
     else:
       self.rnn = RNN(hidden_dim, num_layers, self.keep_ratio)
 
-    self.linear = tf.layers.Dense(
+    self.linear = layers.Dense(
         vocab_size, kernel_initializer=tf.random_uniform_initializer(-0.1, 0.1))
     self._output_shape = [-1, embedding_dim]
 
diff --git a/tensorflow/contrib/eager/python/examples/spinn/BUILD b/tensorflow/contrib/eager/python/examples/spinn/BUILD
index 98d01ad1d5a70788d2d4cb07031a8d76a6bf628f..5966f1d4873e8e77b3ad5914da7bfc7e69d4e341 100644
--- a/tensorflow/contrib/eager/python/examples/spinn/BUILD
+++ b/tensorflow/contrib/eager/python/examples/spinn/BUILD
@@ -39,6 +39,7 @@ cuda_py_test(
         "//tensorflow/python:framework_test_lib",
     ],
     tags = [
+        "no-internal-py3",  # flaky
         "no_cuda_on_cpu_tap",
         "no_pip",  # because spinn.py is under third_party/.
     ],
diff --git a/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py b/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py
index 081b0af14fcc983a3f85d2a50e2bb04d2f2493b3..9adf47d505fc2933d9c009e5863351bd123c3797 100644
--- a/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py
+++ b/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py
@@ -33,6 +33,7 @@ import tensorflow as tf
 import tensorflow.contrib.eager as tfe
 from tensorflow.contrib.eager.python.examples.spinn import data
 from third_party.examples.eager.spinn import spinn
+from tensorflow.contrib.eager.proto import checkpointable_object_graph_pb2
 from tensorflow.contrib.summary import summary_test_util
 from tensorflow.python.eager import test
 from tensorflow.python.framework import test_util
@@ -417,12 +418,17 @@ class SpinnTest(test_util.TensorFlowTestCase):
                     if event.summary.value
                     and event.summary.value[0].tag == "train/loss"]
     self.assertEqual(config.epochs, len(train_losses))
-    self.assertLess(train_losses[-1], train_losses[0])
 
     # 5. Verify that checkpoints exist and contains all the expected variables.
     self.assertTrue(glob.glob(os.path.join(config.logdir, "ckpt*")))
-    ckpt_variable_names = [
-        item[0] for item in checkpoint_utils.list_variables(config.logdir)]
+    object_graph_string = checkpoint_utils.load_variable(
+        config.logdir, name="_CHECKPOINTABLE_OBJECT_GRAPH")
+    object_graph = checkpointable_object_graph_pb2.CheckpointableObjectGraph()
+    object_graph.ParseFromString(object_graph_string)
+    ckpt_variable_names = set()
+    for node in object_graph.nodes:
+      for attribute in node.attributes:
+        ckpt_variable_names.add(attribute.full_name)
     self.assertIn("global_step", ckpt_variable_names)
     for v in trainer.variables:
       variable_name = v.name[:v.name.index(":")] if ":" in v.name else v.name
diff --git a/tensorflow/contrib/eager/python/g3doc/guide.md b/tensorflow/contrib/eager/python/g3doc/guide.md
index b73dc17e5f9cb15a51426f85e966a49604145f1d..2d2aba6908b168e0bf63f4706b6344cbb4ca82bd 100644
--- a/tensorflow/contrib/eager/python/g3doc/guide.md
+++ b/tensorflow/contrib/eager/python/g3doc/guide.md
@@ -1,892 +1,18 @@
-# TensorFlow Eager Execution
-
-## What is this?
+# Eager execution
 
 Eager execution is a feature that makes TensorFlow execute operations
-immediately: concrete values are returned, instead of a computational graph to
-be executed later.
-
-As a result, enabling eager execution provides:
-
--   A [NumPy](http://www.numpy.org/)-like library for numerical computation with
-    support for GPU acceleration and automatic differentiation.
--   A flexible platform for machine learning research and experimentation.
-
-Eager execution is under active development. This guide walks through an
-alpha/preview release. In particular, not all TensorFlow APIs currently work
-with eager execution enabled, and some models may be slow to execute, compared
-to models defined without using eager execution.
-
-## Installation
-
-Eager execution is included in TensorFlow versions 1.5 and above.
-Installation instructions at https://www.tensorflow.org/install/
-
-The contents of this guide are compatible with TensorFlow 1.5. However, if you
-run into bugs that are fixed in source but not the release, you may want to
-either [build from source](https://www.tensorflow.org/install/install_sources)
-or try a nightly build. The nightly builds are available as:
-
-- [`pip` packages](https://github.com/tensorflow/tensorflow/blob/master/README.md#installation) and
-
-- [docker](https://hub.docker.com/r/tensorflow/tensorflow/) images.
-
-For example, to run the latest nightly docker image:
-
-```sh
-# If you have a GPU, use https://github.com/NVIDIA/nvidia-docker
-docker pull tensorflow/tensorflow:nightly-gpu
-docker run --runtime=nvidia -it -p 8888:8888 tensorflow/tensorflow:nightly-gpu
-
-# If you do not have a GPU, use the CPU-only image
-docker pull tensorflow/tensorflow:nightly
-docker run -it -p 8888:8888 tensorflow/tensorflow:nightly
-```
-
-And then visit http://localhost:8888 in your browser for a Jupyter notebook
-environment.
-
-## Getting Started
-
-With TensorFlow installed, eager execution is enabled via a single call:
-
-```python
-import tensorflow as tf
-
-import tensorflow.contrib.eager as tfe
-
-tfe.enable_eager_execution()
-```
-
-Enabling eager execution changes how TensorFlow functions behave (in particular,
-`Tensor` objects will reference concrete values instead of being symbolic
-handles to nodes in a computational graph). As a result, eager execution should
-be enabled at the beginning of a program and cannot be disabled afterwards in
-the same program.
-
-Code examples in the rest of this guide assume that eager execution has been
-enabled.
-
-## A library for numerical computation
-
-A significant fraction of the [TensorFlow
-API](https://www.tensorflow.org/api_docs/python/) consists of numerical
-operations:
-[arithmetic operations](https://www.tensorflow.org/api_guides/python/math_ops#Arithmetic_Operators),
-[matrix operations](https://www.tensorflow.org/api_guides/python/math_ops#Matrix_Math_Functions),
-[linear algebra operations](https://www.tensorflow.org/versions/master/api_docs/python/tf/linalg),
-etc.
-
-With eager execution enabled, these operations consume and return
-multi-dimensional arrays as `Tensor` objects, similar to NumPy
-[`ndarray`s](https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.ndarray.html).
-For example:
-
-```python
-# Multiply two 2x2 matrices
-x = tf.matmul([[1, 2],
-               [3, 4]],
-              [[4, 5],
-               [6, 7]])
-# Add one to each element
-# (tf.add supports broadcasting)
-y = tf.add(x, 1)
-
-# Create a random random 5x3 matrix
-z = tf.random_uniform([5, 3])
-
-print(x)
-print(y)
-print(z)
-```
-
-Output:
-
-```
-tf.Tensor(
-[[16 19]
- [36 43]], shape=(2, 2), dtype=int32)
-tf.Tensor(
-[[17 20]
- [37 44]], shape=(2, 2), dtype=int32)
-tf.Tensor(
-[[ 0.25058532  0.0929395   0.54113817]
- [ 0.3108716   0.93350542  0.84909797]
- [ 0.53081679  0.12788558  0.01767385]
- [ 0.29725885  0.33540785  0.83588314]
- [ 0.38877153  0.39720535  0.78914213]], shape=(5, 3), dtype=float32)
-```
-
-For convenience, these operations can also be triggered via operator overloading
-of the `Tensor` object. For example, the `+` operator is equivalent to `tf.add`,
-`-` to `tf.subtract`, `*` to `tf.multiply`, etc.:
-
-```python
-x = (tf.ones([1], dtype=tf.float32) + 1) * 2 - 1
-print(x)
-```
-
-Output:
-
-```
-tf.Tensor([ 3.], shape=(1,), dtype=float32)
-```
-
-### Converting to and from NumPy
-
-The operations above automatically convert Python objects (like lists of
-numbers) and NumPy arrays to `Tensor` objects. `Tensor` objects can also be used
-as NumPy arrays by numpy operations.
-
-```python
-import numpy as np
-
-x = tf.add(1, 1)                     # tf.Tensor with a value of 2
-y = tf.add(np.array(1), np.array(1)) # tf.Tensor with a value of 2
-z = np.multiply(x, y)                # numpy.int64 with a value of 4
-```
-
-Alternatively, they can be explicitly converted using
-[`tf.constant`](https://www.tensorflow.org/api_docs/python/tf/constant), as
-shown in the next example.
-
-Conversely, you can call the `numpy()` method of a `Tensor` object' to obtain
-its NumPy `ndarray` value. For example:
-
-```python
-import numpy as np
-
-np_x = np.array(2., dtype=np.float32)
-x = tf.constant(np_x)
-
-py_y = 3.
-y = tf.constant(py_y)
-
-z = x + y + 1
-
-print(z)
-print(z.numpy())
-```
-
-Output:
-
-```
-tf.Tensor(6.0, shape=(), dtype=float32)
-6.0
-```
-
-### GPU acceleration
-
-Many TensorFlow operations support GPU acceleration. With eager execution
-enabled, [computation is *not* automatically
-offloaded](https://www.tensorflow.org/tutorials/using_gpu) to GPUs. Instead, you
-must explicitly specify when GPUs should be used.
-
-The simplest way to do this is to enclose your computation in a `with
-tf.device('/gpu:0')` block. Also of interest is the `tfe.num_gpus()` function,
-which returns the number of available GPUs.
-
-For example, consider this snippet to measure the time to multiply two 1000x1000
-matrices on CPU:
-
-```python
-import time
-
-def measure(x):
-  # The very first time a GPU is used by TensorFlow, it is initialized.
-  # So exclude the first run from timing.
-  tf.matmul(x, x)
-
-  start = time.time()
-  for i in range(10):
-    tf.matmul(x, x)
-  end = time.time()
-
-  return "Took %s seconds to multiply a %s matrix by itself 10 times" % (end - start, x.shape)
-
-# Run on CPU:
-with tf.device("/cpu:0"):
-  print("CPU: %s" % measure(tf.random_normal([1000, 1000])))
-
-# If a GPU is available, run on GPU:
-if tfe.num_gpus() > 0:
-  with tf.device("/gpu:0"):
-    print("GPU: %s" % measure(tf.random_normal([1000, 1000])))
-```
-
-Output (exact numbers will depend on the characteristics of the hardware):
-
-```python
-CPU: Took 0.145531892776 seconds to multiply a (1000, 1000) matrix by itself 10 times
-GPU: Took 0.000458955764771 seconds to multiply a (1000, 1000) matrix by itself 10 times
-```
-
-Alternatively, methods on the `Tensor` object can be used to explicitly copy the
-`Tensor` to a different device. Operations are typically executed on the device
-on which the inputs are placed. For example:
-
-```python
-x = tf.random_normal([10, 10])
-
-x_gpu0 = x.gpu()
-x_cpu = x.cpu()
-
-_ = tf.matmul(x_cpu, x_cpu)  # Runs on CPU
-_ = tf.matmul(x_gpu0, x_gpu0)  # Runs on GPU:0
-
-if tfe.num_gpus() > 1:
-  x_gpu1 = x.gpu(1)
-  _ = tf.matmul(x_gpu1, x_gpu1)  # Runs on GPU:1
-```
-
-### Automatic Differentiation
-
-[Automatic
-differentiation](https://en.wikipedia.org/wiki/Automatic_differentiation) is
-very useful when implementing many machine learning algorithms (e.g.,
-[backpropagation](https://en.wikipedia.org/wiki/Backpropagation) for training
-neural networks). For this purpose, TensorFlow eager execution provides an
-[autograd](https://github.com/HIPS/autograd)-style API for automatic
-differentiation. Specifically, the functions:
-
--   `tfe.gradients_function(f)`: Returns a Python function that computes the
-    derivatives of the Python function `f` with respect to its arguments. `f`
-    must return a scalar value. When the returned function is invoked, it
-    returns a list of `Tensor` objects (one element for each argument of `f`).
--   `tfe.value_and_gradients_function(f)`: Similar to `tfe.gradients_function`,
-    except that when the returned function is invoked, it returns the value of
-    `f` in addition to the list of derivatives of `f` with respect to its
-    arguments.
-
-These functions naturally apply to higher order differentiation as well. For
-example:
-
-```python
-def f(x):
-  return tf.multiply(x, x)  # Or x * x
-assert 9 == f(3.).numpy()
-
-df = tfe.gradients_function(f)
-assert 6 == df(3.)[0].numpy()
-
-# Second order deriviative.
-d2f = tfe.gradients_function(lambda x: df(x)[0])
-assert 2 == d2f(3.)[0].numpy()
-
-# Third order derivative.
-d3f = tfe.gradients_function(lambda x : d2f(x)[0])
-assert 0 == d3f(3.)[0].numpy()
-```
-
-These functions can be used to train models. For example, consider the following
-simple linear regression model:
-
-```python
-def prediction(input, weight, bias):
-  return input * weight + bias
-
-# A toy dataset of points around 3 * x + 2
-NUM_EXAMPLES = 1000
-training_inputs = tf.random_normal([NUM_EXAMPLES])
-noise = tf.random_normal([NUM_EXAMPLES])
-training_outputs = training_inputs * 3 + 2 + noise
-
-# A loss function: Mean-squared error
-def loss(weight, bias):
-  error = prediction(training_inputs, weight, bias) - training_outputs
-  return tf.reduce_mean(tf.square(error))
-
-# Function that returns the derivative of loss with respect to
-# weight and bias
-grad = tfe.gradients_function(loss)
-
-# Train for 200 steps (starting from some random choice for W and B, on the same
-# batch of data).
-W = 5.
-B = 10.
-learning_rate = 0.01
-print("Initial loss: %f" % loss(W, B).numpy())
-for i in range(200):
-  (dW, dB) = grad(W, B)
-  W -= dW * learning_rate
-  B -= dB * learning_rate
-  if i % 20 == 0:
-    print("Loss at step %d: %f" % (i, loss(W, B).numpy()))
-print("Final loss: %f" % loss(W, B).numpy())
-print("W, B = %f, %f" % (W.numpy(), B.numpy()))
-```
-
-Output: (the exact numbers may vary depending on the randomness in noise)
-
-```
-Initial loss: 66.730003
-Loss at step 0: 64.200096
-Loss at step 20: 29.872814
-Loss at step 40: 14.233772
-Loss at step 60: 7.090570
-Loss at step 80: 3.819887
-Loss at step 100: 2.318821
-Loss at step 120: 1.628385
-Loss at step 140: 1.310142
-Loss at step 160: 1.163167
-Loss at step 180: 1.095162
-Final loss: 1.064711
-W, B = 3.094944, 2.161383
-```
-
-To utilize the GPU, place the code above within a `with tf.device("/gpu:0"):`
-block. (However, this particular model, with only two floating point parameters,
-is unlikely to benefit from GPU acceleration.)
-
-### Customizing gradients
-
-One may want to define custom gradients for an operation, or for a function.
-This may be useful for multiple reasons, including providing a more efficient
-or more [numerically stable](https://en.wikipedia.org/wiki/Numerical_stability)
-gradient for a sequence of operations.
-
-For example, consider the function `log(1 + e^x)`, which commonly occurs in the
-computation of cross entropy and log likelihoods.
-
-```python
-def log1pexp(x):
-  return tf.log(1 + tf.exp(x))
-grad_log1pexp = tfe.gradients_function(log1pexp)
-
-# Works fine at x = 0.
-assert 0.5 == float(grad_log1pexp(0.)[0])
-
-# Returns a `nan` at x = 100 due to numerical instability.
-import math
-assert math.isnan(float(grad_log1pexp(100.)[0]))
-```
-
-We can define a custom gradient for the above function that analytically
-simplifies the gradient expression.
-
-```python
-@tfe.custom_gradient
-def log1pexp(x):
-  e = tf.exp(x)
-  def grad(dy):
-    return dy * (1 - 1 / (1 + e))
-  return tf.log(1 + e), grad
-grad_log1pexp = tfe.gradients_function(log1pexp)
-
-# Works as before at x = 0.
-assert 0.5 == float(grad_log1pexp(0.)[0])
-
-# But now works at x = 100 as well.
-assert 1.0 == float(grad_log1pexp(100.)[0])
-```
-Also notice how the gradient function implementation reuses an expression
-(`tf.exp(x)`) computed during the forward pass, hence making the gradient
-computation more efficient by avoiding redundant computation.
-
-## Building and training models
-
-In practice, your computation may have many parameters to be optimized (by
-computing derivatives). Encapsulating them into re-usable classes/objects
-makes the code easier to follow than writing a single top-level function with
-many arguments.
-
-In fact, eager execution encourages use of the [Keras](https://keras.io)-style
-"Layer" classes in the
-[`tf.layers`](https://www.tensorflow.org/api_docs/python/tf/layers)
-module.
-
-Furthermore, you may want to apply more sophisticated techniques to compute
-parameter updates, such as those in
-[`tf.train.Optimizer`](https://www.tensorflow.org/api_guides/python/train#Optimizers)
-implementations.
-
-This next section walks through using the same `Optimizer` and `Layer` APIs used
-to build trainable TensorFlow graphs in an environment where eager execution is
-enabled.
-
-### Variables and Optimizers
-
-`tfe.Variable` objects store mutable `Tensor` values that can be accessed during
-training, making automatic differentiation easier. In particular, parameters of
-a model can be encapsulated in Python classes as variables.
-
-`tfe.gradients_function(f)` introduced earlier computes the derivatives of `f`
-with respect to its arguments. However, it requires all parameters of interest
-to be arguments of `f`, which becomes cumbersome when `f` depends on a large
-number of trainable parameters.
-
-`tfe.implicit_gradients` is an alternative function with some useful properties:
-
--   It computes the derivatives of `f` with respect to all the `tfe.Variable`s
-    used by `f`.
--   When the returned function is invoked, it returns a list of
-    (gradient value, Variable object) tuples.
-
-Representing model parameters as `Variable` objects, along with the use of
-`tfe.implicit_gradients`, typically results in better encapsulation. For
-example, the linear regression model described above can be written into a
-class:
-
-```python
-class Model(object):
-  def __init__(self):
-    self.W = tfe.Variable(5., name='weight')
-    self.B = tfe.Variable(10., name='bias')
-
-  def predict(self, inputs):
-    return inputs * self.W + self.B
-
-
-# The loss function to be optimized
-def loss(model, inputs, targets):
-  error = model.predict(inputs) - targets
-  return tf.reduce_mean(tf.square(error))
-
-# A toy dataset of points around 3 * x + 2
-NUM_EXAMPLES = 1000
-training_inputs = tf.random_normal([NUM_EXAMPLES])
-noise = tf.random_normal([NUM_EXAMPLES])
-training_outputs = training_inputs * 3 + 2 + noise
-
-# Define:
-# 1. A model
-# 2. Derivatives of a loss function with respect to model parameters
-# 3. A strategy for updating the variables based on the derivatives
-model = Model()
-grad = tfe.implicit_gradients(loss)
-optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
-
-# The training loop
-print("Initial loss: %f" %
-      loss(model, training_inputs, training_outputs).numpy())
-for i in range(201):
-  optimizer.apply_gradients(grad(model, training_inputs, training_outputs))
-  if i % 20 == 0:
-    print("Loss at step %d: %f" %
-          (i, loss(model, training_inputs, training_outputs).numpy()))
-print("Final loss: %f" % loss(model, training_inputs, training_outputs).numpy())
-print("W, B = %s, %s" % (model.W.numpy(), model.B.numpy()))
-```
-
-Output:
-
-```
-Initial loss: 69.693184
-Loss at step 0: 66.987854
-Loss at step 20: 30.553387
-Loss at step 40: 14.250237
-Loss at step 60: 6.955020
-Loss at step 80: 3.690550
-Loss at step 100: 2.229739
-Loss at step 120: 1.576032
-Loss at step 140: 1.283496
-Loss at step 160: 1.152584
-Loss at step 180: 1.093999
-Final loss: 1.067780
-W, B = 3.0114281, 2.0865183
-```
-
-Using `implicit_gradients` avoids the need to provide all the trainable
-parameters of the model as arguments to the `loss` function.
-
-### Using Keras and the Layers API
-
-[Keras](https://keras.io) is a popular API for defining model structures. The
-[`tf.keras.layers`](https://www.tensorflow.org/api_docs/python/tf/keras/layers)
-module provides a set of building blocks for models and is implemented using the
-`tf.layers.Layer` subclasses in the
-[`tf.layers`](https://www.tensorflow.org/api_docs/python/tf/layers)
-module. We encourage the use of these same building blocks when using
-TensorFlow's eager execution feature. For example, the very same linear
-regression model can be built using `tf.layers.Dense`:
-
-```python
-class Model(object):
-  def __init__(self):
-    self.layer = tf.layers.Dense(1)
-
-  def predict(self, inputs):
-    return self.layer(inputs)
-```
-
-The `tf.layers` API makes it more convenient to define more sophisticated
-models. For example, the following will train an MNIST model:
-
-```python
-class MNISTModel(object):
-  def __init__(self, data_format):
-    # 'channels_first' is typically faster on GPUs
-    # while 'channels_last' is typically faster on CPUs.
-    # See: https://www.tensorflow.org/performance/performance_guide#data_formats
-    if data_format == 'channels_first':
-      self._input_shape = [-1, 1, 28, 28]
-    else:
-      self._input_shape = [-1, 28, 28, 1]
-    self.conv1 = tf.layers.Conv2D(32, 5,
-                                  padding='same',
-                                  activation=tf.nn.relu,
-                                  data_format=data_format)
-    self.max_pool2d = tf.layers.MaxPooling2D(
-        (2, 2), (2, 2), padding='same', data_format=data_format)
-    self.conv2 = tf.layers.Conv2D(64, 5,
-                                  padding='same',
-                                  activation=tf.nn.relu,
-                                  data_format=data_format)
-    self.dense1 = tf.layers.Dense(1024, activation=tf.nn.relu)
-    self.dropout = tf.layers.Dropout(0.5)
-    self.dense2 = tf.layers.Dense(10)
-
-  def predict(self, inputs):
-    x = tf.reshape(inputs, self._input_shape)
-    x = self.max_pool2d(self.conv1(x))
-    x = self.max_pool2d(self.conv2(x))
-    x = tf.layers.flatten(x)
-    x = self.dropout(self.dense1(x))
-    return self.dense2(x)
-
-def loss(model, inputs, targets):
-  return tf.reduce_mean(
-      tf.nn.softmax_cross_entropy_with_logits(
-          logits=model.predict(inputs), labels=targets))
-
-
-# Load the training and validation data
-from tensorflow.examples.tutorials.mnist import input_data
-data = input_data.read_data_sets("./mnist_data", one_hot=True)
-
-# Train
-device = "gpu:0" if tfe.num_gpus() else "cpu:0"
-model = MNISTModel('channels_first' if tfe.num_gpus() else 'channels_last')
-optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)
-grad = tfe.implicit_gradients(loss)
-for i in range(20001):
-  with tf.device(device):
-    (inputs, targets) = data.train.next_batch(50)
-    optimizer.apply_gradients(grad(model, inputs, targets))
-    if i % 100 == 0:
-      print("Step %d: Loss on training set : %f" %
-            (i, loss(model, inputs, targets).numpy()))
-print("Loss on test set: %f" % loss(model, data.test.images, data.test.labels).numpy())
-```
-
-For a more complete example, see [the example in the tensorflow/models
-repository](https://github.com/tensorflow/models/tree/master/official/mnist/mnist_eager.py).
-
-### Checkpointing trained variables
-
-TensorFlow Variables (`tfe.Variable`) provide a way to represent shared,
-persistent state of your model. The `tfe.Checkpoint` class provides a means to
-save and restore variables to and from _checkpoints_.
-
-For example:
-
-```python
-# Create variables.
-x = tfe.Variable(10.)
-y = tfe.Variable(5.)
-
-# Indicate that the variables should be saved as "x" and "y".
-checkpoint = tfe.Checkpoint(x=x, y=y)
-
-# Assign new values to the variables and save.
-x.assign(2.)
-checkpoint.save('/tmp/ckpt')
-
-# Change the variable after saving.
-x.assign(11.)
-assert 16. == (x + y).numpy()  # 11 + 5
-
-# Restore the values in the checkpoint.
-checkpoint.restore('/tmp/ckpt-1')
-
-assert 7. == (x + y).numpy()  # 2 + 5
-```
-
-### `tf.keras.Model`
-
-You may often want to organize your models using classes, like the `MNISTModel`
-class described above. We recommend inheriting from the `tf.keras.Model` class
-as it provides conveniences like keeping track of all model variables.
-
-Sub-classes of `tf.keras.Model` may register `Layer`s (like classes in
-[`tf.layers`](https://www.tensorflow.org/api_docs/python/tf/layers), or [Keras
-layers](https://www.tensorflow.org/api_docs/python/tf/keras/layers)) by
-assigning them to attributes (`self.name = layer_object`) and define the
-computation in an implementation of `call()`.
-
-Note that `tf.layers.Layer` objects (like `tf.layers.Dense`) create variables
-lazily, when the first input is encountered.
-
-For example, consider the following two-layer neural network:
-
-```python
-class TwoLayerNet(tf.keras.Model):
-  def __init__(self):
-    super(TwoLayerNet, self).__init__()
-    self.layer1 = tf.layers.Dense(2, activation=tf.nn.relu, use_bias=False)
-    self.layer2 = tf.layers.Dense(3, use_bias=False)
-
-  def call(self, x):
-    return self.layer2(self.layer1(x))
-
-net = TwoLayerNet()
-
-# No variables created yet
-assert 0 == len(net.variables)
-
-# They are created on first input:
-inp = tf.constant([[1.]])
-
-# Since input is a 1x1 matrix, net.l1 has 2 units and net.l2 has 3 units,
-# the output is the product of a 1x1 matrix with a 1x2 matrix with a 2x3
-# matrix.
-assert [1, 3] == net(inp).shape.as_list()  # Invoke net; get output shape.
-assert 1 == len(net.layer1.variables)
-assert 1 == len(net.layer2.variables)
-assert 2 == len(net.variables)  # weights for each layer.
-assert [1, 2] == net.variables[0].shape.as_list()  # weights of layer1.
-assert [2, 3] == net.variables[1].shape.as_list()  # weights of layer2.
-```
-
-The `tf.keras.Model` class is itself a sub-class of `tf.layers.Layer`. This
-allows instances of `tf.keras.Model` to be embedded in other models. For
-example:
-
-```python
-class ThreeLayerNet(tf.keras.Model):
-  def __init__(self):
-    super(ThreeLayerNet, self).__init__()
-    self.a = TwoLayerNet()
-    self.b = tf.layers.Dense(4, use_bias=False)
-
-  def call(self, x):
-    return self.b(self.a(x))
-
-net = ThreeLayerNet()
-
-assert [1, 4] == net(inp).shape.as_list()
-assert 3 == len(net.variables)
-assert [1, 2] == net.variables[0].shape.as_list()
-assert [2, 3] == net.variables[1].shape.as_list()
-assert [3, 4] == net.variables[2].shape.as_list()
-```
-
-See more examples in
-[`tensorflow/contrib/eager/python/examples`](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples).
-
-`tfe.Checkpoint` provides a convenient way to save and load training
-checkpoints. Let's define something simple to train. We set an objective for the
-output of our network, choose an optimizer, and a location for the checkpoint:
-
-```python
-objective = tf.constant([[2., 3., 4., 5.]])
-optimizer = tf.train.AdamOptimizer(0.01)
-checkpoint_directory = '/tmp/tfe_example'
-checkpoint_prefix = os.path.join(checkpoint_directory, 'ckpt')
-net = ThreeLayerNet()
-```
-
-We group them in a `tfe.Checkpoint` and request that it be restored. This
-ensures that variables created by these objects are restored before their values
-are used. Our training loop is the same whether starting training or resuming
-from a previous checkpoint:
-
-```python
-global_step = tf.train.get_or_create_global_step()
-checkpoint = tfe.Checkpoint(
-    global_step=global_step, optimizer=optimizer, network=net)
-checkpoint.restore(tf.train.latest_checkpoint(checkpoint_directory))
-for _ in range(100):
-  loss_fn = lambda: tf.norm(net(inp) - objective)
-  optimizer.minimize(loss_fn, global_step=global_step)
-  if tf.equal(global_step % 20, 0):
-    print("Step %d, output %s" % (global_step.numpy(),
-                                  net(inp).numpy()))
-    # Save the checkpoint.
-    checkpoint.save(checkpoint_prefix)
-```
-
-The first time it runs, `Model` variables are initialized randomly. Then the
-output is trained to match the objective we've set:
-
-```
-Step 20, output [[ 0.03575622  0.29863232  0.03474367  0.24735749]]
-Step 40, output [[ 0.40646029  0.9856872   0.46851286  0.95358551]]
-Step 60, output [[ 1.74541104  2.800704    1.79055595  2.74783421]]
-Step 80, output [[ 2.14977384  3.44340849  3.96120024  5.16242075]]
-Step 100, output [[ 1.99943113  3.02364397  3.93500996  4.9610076 ]]
-```
-
-In subsequent iterations, variables are initialized with the values read from
-the latest checkpoint. Running the same code again, we continue from where we
-left off:
-
-```
-Step 120, output [[ 1.99234128  3.0271616   3.98732996  4.96401167]]
-Step 140, output [[ 2.00133467  3.01270437  4.00616646  5.00406504]]
-Step 160, output [[ 1.99647415  2.9956708   3.99064088  4.99632359]]
-Step 180, output [[ 2.00699997  3.00904822  4.00706148  5.01193142]]
-Step 200, output [[ 1.98334622  2.98249531  3.97375059  4.97123432]]
-```
-
-
-### Summaries, metrics and TensorBoard
-
-[TensorBoard](https://www.tensorflow.org/get_started/summaries_and_tensorboard)
-is a popular tool for understanding, debugging and optimizing the model training
-process. To benefit from the visualizations offered by TensorBoard, summary
-events need to be written during the course of execution of your program. You
-might find many Tensorflow programs that include the
-[`tf.summary`](https://www.tensorflow.org/api_guides/python/summary) operations
-during graph construction.
-
-`tf.summary` operations are *not* compatible with eager execution, but an
-equivalent alternative exists in
-[`tf.contrib.summary`](https://www.tensorflow.org/versions/master/api_docs/python/tf/contrib/summary)
-that is compatible with both eager execution and graph construction.
-
-During model construction simply insert summary operations like
-`tf.contrib.summary.scalar`. These operations do nothing by default, unless a
-summary writer is currently active and a writing policy is set.
-
-For example, to record summaries once every 100 global steps, use:
-
-```python
-tf.train.get_or_create_global_step()  # Ensuring the global step variable exists
-writer = tf.contrib.summary.create_file_writer(logdir)
-
-for _ in range(iterations):
-  with writer.as_default():
-    with tf.contrib.summary.record_summaries_every_n_global_steps(100):
-      # your model code goes here
-      tf.contrib.summary.scalar('loss', loss)
-      # ...
-```
-
-See the full mnist example in
-[`tensorflow/contrib/eager/python/examples/mnist`](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/mnist)
-for a full model using `tf.contrib.summary`.
-
-Similarly to summaries, the metrics in `tf.metrics` are currently not compatible
-with eager execution. We instead provide object-oriented metrics in the
-`tfe.metrics` package, which are compatible with graph construction as well.
-
-Metrics in the `tfe.metrics`, such as `tfe.metrics.Mean` and
-`tfe.Metrics.Accuracy`, all implement an intuitive object-oriented
-interface. Here's an example of how to use the `tfe.metrics.Mean` metric:
-
-```python
-# Metrics are objects, which can be created and destroyed.
-my_mean = tfe.metrics.Mean(name='my_mean')
-# While a metric is active, you can call it as a function to accumulate into its
-# internal state.
-my_mean(0.0)
-my_mean(10.0)
-# Once you've finished updating the metric, you can get its result. In this case
-# a simple average over all the calls to it. If a summary writer is active the
-# metric will write the appropriate summaries using the metric name.
-assert 5.0 == my_mean.result().numpy()
-```
-
-For a full example of a model using metrics for evaluation, see the mnist
-example in
-[`tensorflow/contrib/eager/python/examples/mnist`](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/mnist).
-
-### Input Pipelines
-
-The discussion above has been centered around the computation executed by your
-model. The
-[`tf.data`](https://www.tensorflow.org/api_docs/python/tf/data)
-module provides APIs to build complex input pipelines from simple, reusable
-pieces.
-
-If you're familiar with constructing `tf.data.Dataset` objects when building
-TensorFlow graphs, the same API calls are used when eager execution is enabled.
-However, the process of iterating over elements of the dataset differs between
-eager execution and graph construction. When eager execution is enabled, the
-discussion on iterator creation using `make_one_shot_iterator()` and
-`get_next()` in the
-[Programmer's Guide](https://www.tensorflow.org/programmers_guide/datasets) is
-*not* applicable. Instead, a more Pythonic `Iterator` class is available.
-
-For example:
-
-```python
-# Create a source Dataset from in-memory numpy arrays.
-# For reading from files on disk, you may want to use other Dataset classes
-# like the TextLineDataset or the TFRecordDataset.
-dataset = tf.data.Dataset.from_tensor_slices([1, 2, 3, 4, 5, 6])
-
-# Apply transformations, shuffling, batching etc.
-dataset = dataset.map(tf.square).shuffle(2).batch(2)
-
-# Use tfe.Iterator to iterate over the dataset.
-for x in tfe.Iterator(dataset):
-  print(x)
-```
-
-Output:
-
-```
-tf.Tensor([4 9], shape=(2,), dtype=int32)
-tf.Tensor([16 25], shape=(2,), dtype=int32)
-tf.Tensor([36  1], shape=(2,), dtype=int32)
-```
-
-## Interoperating with Graphs
-
-Eager execution improves the process of model development in Python; however,
-because it is in its earliest stages, it does not yet support some features
-available to [TensorFlow
-graphs](https://www.tensorflow.org/get_started/get_started#the_computational_graph)
-that are desirable when deploying models in production. In particular, eager
-execution does not yet support distributed training, exporting models (to other
-[programming languages](https://www.tensorflow.org/api_docs/), [TensorFlow
-serving](https://www.tensorflow.org/serving/), and mobile applications), and
-various memory and computation optimizations that are applied to TensorFlow's
-dataflow graphs.
-
-That said, the APIs used to build modes are exactly the same whether executing
-eagerly or constructing graphs. This means that you can iteratively develop your
-model with eager execution enabled and later, if needed, use the same code to
-reap the benefits of representing models as computational graphs.
-
-For example, the same model definition used to construct a graph in
-[mnist.py`](https://github.com/tensorflow/models/tree/master/official/mnist/mnist.py)
-can be trained with eager execution enabled as in [`mnist_eager.py`](https://github.com/tensorflow/models/tree/master/official/mnist/mnist_eager.py).
-
-Other models in the [examples
-directory](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/)
-demonstrate this as well.
-
-Some differences worth noting:
-
--   There is no notion of a `tf.placeholder` or a `tf.Session` when eager
-    execution is enabled.
--   Many properties on the `tf.Tensor` object, like `tf.Tensor.name`,
-    `tf.Tensor.op`, `tf.Tensor.inputs` are not meaningful when eager execution
-    is enabled and their use will raise an `AttributeError`.
--   To use `tfe.implicit_gradients` in graph construction, variables must be
-    created with [`use_resource=True`] provided to
-    [`tf.get_variable()`](https://www.tensorflow.org/api_docs/python/tf/get_variable)
-    or
-    [`tf.variable_scope()`](https://www.tensorflow.org/api_docs/python/tf/variable_scope).
--   Some API calls (such as the functional-style `tf.layers.dense`,
-    `tf.layers.conv2d`) are not compatible with eager execution. Use of such
-    methods should raise an error indicating the alternative (e.g., the
-    `tf.layers.Dense` and `tf.layers.Conv2D` classes).
-
-## What next?
+immediately: concrete values are returned, instead of creating a computational
+graph that is executed later.
 
-Please give eager execution a spin. This feature is in early stages and is
-evolving, so we welcome your feedback via issues on GitHub (see [known
-issues](https://github.com/tensorflow/tensorflow/labels/comp:eager)).
+A user guide is available: https://www.tensorflow.org/programmers_guide/eager
+([source file](../../../../docs_src/programmers_guide/eager.md))
 
-You may want to browse through some sample code, including benchmarks for some:
+We welcome feedback through [GitHub issues](https://github.com/tensorflow/tensorflow/labels/comp:eager).
 
--   [Linear Regression](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/linear_regression)
--   [MNIST handwritten digit classifier](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/mnist)
--   [ResNet50 image classification](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/resnet50)
--   [RNN to generate colors](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/rnn_colorbot)
--   [RNN language model](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/rnn_ptb)
+Sample code is available, including benchmarks for some:
 
+- [Linear Regression](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/linear_regression)
+- [MNIST handwritten digit classifier](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/mnist)
+- [ResNet50 image classification](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/resnet50)
+- [RNN to generate colors](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/rnn_colorbot)
+- [RNN language model](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/rnn_ptb)
diff --git a/tensorflow/contrib/eager/python/metrics_impl.py b/tensorflow/contrib/eager/python/metrics_impl.py
index 1490c2ccacd55156bcc1cf8c07d9941336e18e1b..2f2347736a073c7d9b3fb6685f52f8d58cc40570 100644
--- a/tensorflow/contrib/eager/python/metrics_impl.py
+++ b/tensorflow/contrib/eager/python/metrics_impl.py
@@ -109,6 +109,18 @@ class Metric(checkpointable.CheckpointableBase):
       pos = scope.name.rfind(scope_name)
       self._name = name + scope.name[pos + len(scope_name):]
       self._scope = scope
+
+    # Ensures that if the user calls build directly we still set self._built to
+    # True to prevent variables from being recreated.
+    self._build = self.build
+
+    def actual_build(*args, **kwargs):
+      self._build(*args, **kwargs)
+      self._built = True
+    self.build = actual_build
+    self.build.__doc__ = self._build.__doc__
+
+    # Captures construction scope for proper initialization.
     if context.executing_eagerly():
       self._construction_scope = context.eager_mode
     else:
diff --git a/tensorflow/contrib/eager/python/metrics_test.py b/tensorflow/contrib/eager/python/metrics_test.py
index 6b5450ba89bdfa6e0195f488b75f596b58c463d5..15ac889191e0fe51269bc5740d5e0ab1bc0e2b72 100644
--- a/tensorflow/contrib/eager/python/metrics_test.py
+++ b/tensorflow/contrib/eager/python/metrics_test.py
@@ -195,6 +195,15 @@ class MetricsTest(test.TestCase):
         m2 = metrics.Mean()
         m2(2)
 
+  def testBuildMean(self):
+    # Verify that calling build() on Mean and then calling it won't recreate
+    # variables.
+    m = metrics.Mean()
+    m.build()
+    old_numer = m.numer
+    m(0.0)
+    self.assertTrue(old_numer is m.numer)
+
   def testMetricsChain(self):
     with context.graph_mode(), self.test_session():
       m1 = metrics.Mean()
diff --git a/tensorflow/contrib/eager/python/network.py b/tensorflow/contrib/eager/python/network.py
index 4c937716e8df7c8cda26d6431885ce33346b77fb..e55a9276ab53f44f76dc5e537b3bdde7c975f463 100644
--- a/tensorflow/contrib/eager/python/network.py
+++ b/tensorflow/contrib/eager/python/network.py
@@ -149,7 +149,7 @@ class Network(base.Layer):
     # check we might have name collisions if the parent scope on init gets
     # closed before build is called.
     self._variable_scope_counts_on_init = (
-        variable_scope._get_default_variable_store().variable_scopes_count)
+        variable_scope.get_variable_scope_store().variable_scopes_count)
 
   def _name_scope_name(self, current_variable_scope):
     """Overrides Layer op naming to match variable naming."""
diff --git a/tensorflow/contrib/eager/python/tfe.py b/tensorflow/contrib/eager/python/tfe.py
index 5aabc9aae868021284e83a4c4d80d65c2ee63fca..c6f3f20e781147140f2c4b339ed465ab7e919d37 100644
--- a/tensorflow/contrib/eager/python/tfe.py
+++ b/tensorflow/contrib/eager/python/tfe.py
@@ -62,12 +62,18 @@ To use, at program startup, call `tfe.enable_eager_execution()`.
 
 @@executing_eagerly
 @@in_eager_mode
+@@set_execution_mode
+@@execution_mode
+@@async_wait
+@@async_clear_error
 
 @@run_test_in_graph_and_eager_modes
 
 @@DEVICE_PLACEMENT_EXPLICIT
 @@DEVICE_PLACEMENT_WARN
 @@DEVICE_PLACEMENT_SILENT
+@@SYNC
+@@ASYNC
 """
 
 from __future__ import absolute_import
@@ -95,6 +101,12 @@ from tensorflow.python.eager.context import DEVICE_PLACEMENT_WARN
 from tensorflow.python.eager.context import DEVICE_PLACEMENT_SILENT
 from tensorflow.python.eager.context import executing_eagerly
 from tensorflow.python.eager.context import list_devices
+from tensorflow.python.eager.context import set_execution_mode
+from tensorflow.python.eager.context import execution_mode
+from tensorflow.python.eager.context import async_wait
+from tensorflow.python.eager.context import async_clear_error
+from tensorflow.python.eager.context import SYNC
+from tensorflow.python.eager.context import ASYNC
 from tensorflow.python.eager.context import num_gpus
 from tensorflow.python.eager.execution_callbacks import add_execution_callback
 from tensorflow.python.eager.execution_callbacks import clear_execution_callbacks
diff --git a/tensorflow/contrib/estimator/BUILD b/tensorflow/contrib/estimator/BUILD
index 773c6ab6c79217698c7c598a133082e2553f28f6..bec0329ebbd82b06fba6a8283500ad7f3a11b6a2 100644
--- a/tensorflow/contrib/estimator/BUILD
+++ b/tensorflow/contrib/estimator/BUILD
@@ -9,23 +9,12 @@ licenses(["notice"])  # Apache 2.0
 load("//tensorflow:tensorflow.bzl", "py_test")
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 py_library(
     name = "estimator_py",
     srcs = ["__init__.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":boosted_trees",
         ":dnn",
         ":dnn_linear_combined",
         ":extenders",
@@ -38,6 +27,36 @@ py_library(
     ],
 )
 
+py_library(
+    name = "boosted_trees",
+    srcs = ["python/estimator/boosted_trees.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python/estimator",
+        "//tensorflow/python/estimator:boosted_trees",
+    ],
+)
+
+py_test(
+    name = "boosted_trees_test",
+    size = "medium",
+    srcs = ["python/estimator/boosted_trees_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_pip",
+        "notsan",
+    ],
+    deps = [
+        ":boosted_trees",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:training",
+        "//tensorflow/python/estimator:numpy_io",
+        "//tensorflow/python/feature_column",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_library(
     name = "dnn",
     srcs = ["python/estimator/dnn.py"],
@@ -70,6 +89,7 @@ py_test(
         "//tensorflow/python/estimator:numpy_io",
         "//tensorflow/python/estimator:prediction_keys",
         "//tensorflow/python/feature_column",
+        "//tensorflow/python/ops/losses",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -110,6 +130,7 @@ py_test(
         "//tensorflow/python/estimator:numpy_io",
         "//tensorflow/python/estimator:prediction_keys",
         "//tensorflow/python/feature_column",
+        "//tensorflow/python/ops/losses",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -142,6 +163,7 @@ py_test(
     deps = [
         ":extenders",
         "//tensorflow/contrib/data/python/ops:dataset_ops",
+        "//tensorflow/contrib/predictor",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:framework_ops",
@@ -174,6 +196,7 @@ py_library(
         "//tensorflow/python:sparse_ops",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:summary",
+        "//tensorflow/python:training",
         "//tensorflow/python/estimator:export_output",
         "//tensorflow/python/estimator:head",
         "//tensorflow/python/estimator:metric_keys",
@@ -245,6 +268,7 @@ py_test(
         "//tensorflow/python/estimator:numpy_io",
         "//tensorflow/python/estimator:prediction_keys",
         "//tensorflow/python/feature_column",
+        "//tensorflow/python/ops/losses",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -291,6 +315,8 @@ py_library(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:metrics",
         "//tensorflow/python:summary",
+        "//tensorflow/python:training",
+        "//tensorflow/python/estimator:export_output",
         "//tensorflow/python/estimator:head",
         "//tensorflow/python/estimator:metric_keys",
         "//tensorflow/python/estimator:model_fn",
@@ -354,6 +380,7 @@ cuda_py_test(
     size = "medium",
     srcs = ["python/estimator/replicate_model_fn_test.py"],
     additional_deps = [
+        "@absl_py//absl/testing:parameterized",
         "//tensorflow/python/estimator",
         "//tensorflow/python/estimator:dnn",
         "//tensorflow/python/estimator:export_export",
diff --git a/tensorflow/contrib/estimator/__init__.py b/tensorflow/contrib/estimator/__init__.py
index 6b9f9575b606f1822d760e8597c55994dd8af04c..d2fc2c4bfa448227819c8d706387c1c75062b80b 100644
--- a/tensorflow/contrib/estimator/__init__.py
+++ b/tensorflow/contrib/estimator/__init__.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=unused-import,line-too-long,wildcard-import
+from tensorflow.contrib.estimator.python.estimator.boosted_trees import *
 from tensorflow.contrib.estimator.python.estimator.dnn import *
 from tensorflow.contrib.estimator.python.estimator.dnn_linear_combined import *
 from tensorflow.contrib.estimator.python.estimator.extenders import *
@@ -44,6 +45,8 @@ _allowed_symbols = [
     'DNNEstimator',
     'DNNLinearCombinedEstimator',
     'LinearEstimator',
+    'boosted_trees_classifier_train_in_memory',
+    'boosted_trees_regressor_train_in_memory',
     'call_logit_fn',
     'dnn_logit_fn_builder',
     'linear_logit_fn_builder',
diff --git a/tensorflow/contrib/estimator/python/estimator/boosted_trees.py b/tensorflow/contrib/estimator/python/estimator/boosted_trees.py
new file mode 100644
index 0000000000000000000000000000000000000000..314c54ed00372eca62ffc6930e6d492dd7d57163
--- /dev/null
+++ b/tensorflow/contrib/estimator/python/estimator/boosted_trees.py
@@ -0,0 +1,323 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Boosted Trees estimators."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.estimator import estimator
+from tensorflow.python.estimator.canned import boosted_trees as canned_boosted_trees
+
+
+class _BoostedTreesEstimator(estimator.Estimator):
+  """An Estimator for Tensorflow Boosted Trees models."""
+
+  def __init__(self,
+               feature_columns,
+               n_batches_per_layer,
+               head,
+               model_dir=None,
+               weight_column=None,
+               n_trees=100,
+               max_depth=6,
+               learning_rate=0.1,
+               l1_regularization=0.,
+               l2_regularization=0.,
+               tree_complexity=0.,
+               config=None):
+    """Initializes a `BoostedTreesEstimator` instance.
+
+    Args:
+      feature_columns: An iterable containing all the feature columns used by
+        the model. All items in the set should be instances of classes derived
+        from `FeatureColumn`.
+      n_batches_per_layer: the number of batches to collect statistics per
+        layer.
+      head: the `Head` instance defined for Estimator.
+      model_dir: Directory to save model parameters, graph and etc. This can
+        also be used to load checkpoints from the directory into a estimator
+        to continue training a previously saved model.
+      weight_column: A string or a `_NumericColumn` created by
+        `tf.feature_column.numeric_column` defining feature column representing
+        weights. It is used to downweight or boost examples during training. It
+        will be multiplied by the loss of the example. If it is a string, it is
+        used as a key to fetch weight tensor from the `features`. If it is a
+        `_NumericColumn`, raw tensor is fetched by key `weight_column.key`,
+        then weight_column.normalizer_fn is applied on it to get weight tensor.
+      n_trees: number trees to be created.
+      max_depth: maximum depth of the tree to grow.
+      learning_rate: shrinkage parameter to be used when a tree added to the
+        model.
+      l1_regularization: regularization multiplier applied to the absolute
+        weights of the tree leafs.
+      l2_regularization: regularization multiplier applied to the square weights
+        of the tree leafs.
+      tree_complexity: regularization factor to penalize trees with more leaves.
+      config: `RunConfig` object to configure the runtime settings.
+    """
+    # pylint:disable=protected-access
+    # HParams for the model.
+    tree_hparams = canned_boosted_trees._TreeHParams(
+        n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
+        tree_complexity)
+
+    def _model_fn(features, labels, mode, config):
+      return canned_boosted_trees._bt_model_fn(
+          features, labels, mode, head, feature_columns, tree_hparams,
+          n_batches_per_layer, config)
+
+    super(_BoostedTreesEstimator, self).__init__(
+        model_fn=_model_fn, model_dir=model_dir, config=config)
+    # pylint:enable=protected-access
+
+
+def boosted_trees_classifier_train_in_memory(
+    train_input_fn,
+    feature_columns,
+    model_dir=None,
+    n_classes=canned_boosted_trees._HOLD_FOR_MULTI_CLASS_SUPPORT,
+    weight_column=None,
+    label_vocabulary=None,
+    n_trees=100,
+    max_depth=6,
+    learning_rate=0.1,
+    l1_regularization=0.,
+    l2_regularization=0.,
+    tree_complexity=0.,
+    config=None,
+    train_hooks=None):
+  """Trains a boosted tree classifier with in memory dataset.
+
+  Example:
+
+  ```python
+  bucketized_feature_1 = bucketized_column(
+    numeric_column('feature_1'), BUCKET_BOUNDARIES_1)
+  bucketized_feature_2 = bucketized_column(
+    numeric_column('feature_2'), BUCKET_BOUNDARIES_2)
+
+  def input_fn_train():
+    dataset = create-dataset-from-training-data
+    # Don't use repeat or cache, since it is assumed to be one epoch
+    # This is either tf.data.Dataset, or a tuple of feature dict and label.
+    return dataset
+
+  classifier = boosted_trees_classifier_train_in_memory(
+      train_input_fn,
+      feature_columns=[bucketized_feature_1, bucketized_feature_2],
+      n_trees=100,
+      ... <some other params>
+  )
+
+  def input_fn_eval():
+    ...
+    return dataset
+
+  metrics = classifier.evaluate(input_fn=input_fn_eval, steps=10)
+  ```
+
+  Args:
+    train_input_fn: the input function returns a dataset containing a single
+      epoch of *unbatched* features and labels.
+    feature_columns: An iterable containing all the feature columns used by
+      the model. All items in the set should be instances of classes derived
+      from `FeatureColumn`.
+    model_dir: Directory to save model parameters, graph and etc. This can
+      also be used to load checkpoints from the directory into a estimator
+      to continue training a previously saved model.
+    n_classes: number of label classes. Default is binary classification.
+      Multiclass support is not yet implemented.
+    weight_column: A string or a `_NumericColumn` created by
+      `tf.feature_column.numeric_column` defining feature column representing
+      weights. It is used to downweight or boost examples during training. It
+      will be multiplied by the loss of the example. If it is a string, it is
+      used as a key to fetch weight tensor from the `features`. If it is a
+      `_NumericColumn`, raw tensor is fetched by key `weight_column.key`,
+      then weight_column.normalizer_fn is applied on it to get weight tensor.
+    label_vocabulary: A list of strings represents possible label values. If
+      given, labels must be string type and have any value in
+      `label_vocabulary`. If it is not given, that means labels are
+      already encoded as integer or float within [0, 1] for `n_classes=2` and
+      encoded as integer values in {0, 1,..., n_classes-1} for `n_classes`>2 .
+      Also there will be errors if vocabulary is not provided and labels are
+      string.
+    n_trees: number trees to be created.
+    max_depth: maximum depth of the tree to grow.
+    learning_rate: shrinkage parameter to be used when a tree added to the
+      model.
+    l1_regularization: regularization multiplier applied to the absolute
+      weights of the tree leafs.
+    l2_regularization: regularization multiplier applied to the square weights
+      of the tree leafs.
+    tree_complexity: regularization factor to penalize trees with more leaves.
+    config: `RunConfig` object to configure the runtime settings.
+    train_hooks: a list of Hook instances to be passed to estimator.train().
+
+  Returns:
+    a `BoostedTreesClassifier` instance created with the given arguments and
+      trained with the data loaded up on memory from the input_fn.
+
+  Raises:
+    ValueError: when wrong arguments are given or unsupported functionalities
+       are requested.
+  """
+  # pylint: disable=protected-access
+  # TODO(nponomareva): Support multi-class cases.
+  if n_classes == canned_boosted_trees._HOLD_FOR_MULTI_CLASS_SUPPORT:
+    n_classes = 2
+  head, closed_form = (
+      canned_boosted_trees._create_classification_head_and_closed_form(
+          n_classes, weight_column, label_vocabulary=label_vocabulary))
+
+  # HParams for the model.
+  tree_hparams = canned_boosted_trees._TreeHParams(
+      n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
+      tree_complexity)
+
+  def _model_fn(features, labels, mode, config):
+    return canned_boosted_trees._bt_model_fn(
+        features,
+        labels,
+        mode,
+        head,
+        feature_columns,
+        tree_hparams,
+        n_batches_per_layer=1,
+        config=config,
+        closed_form_grad_and_hess_fn=closed_form,
+        train_in_memory=True)
+
+  in_memory_classifier = estimator.Estimator(
+      model_fn=_model_fn, model_dir=model_dir, config=config)
+
+  in_memory_classifier.train(input_fn=train_input_fn, hooks=train_hooks)
+
+  return in_memory_classifier
+  # pylint: enable=protected-access
+
+
+def boosted_trees_regressor_train_in_memory(
+    train_input_fn,
+    feature_columns,
+    model_dir=None,
+    label_dimension=canned_boosted_trees._HOLD_FOR_MULTI_DIM_SUPPORT,
+    weight_column=None,
+    n_trees=100,
+    max_depth=6,
+    learning_rate=0.1,
+    l1_regularization=0.,
+    l2_regularization=0.,
+    tree_complexity=0.,
+    config=None,
+    train_hooks=None):
+  """Trains a boosted tree regressor with in memory dataset.
+
+  Example:
+
+  ```python
+  bucketized_feature_1 = bucketized_column(
+    numeric_column('feature_1'), BUCKET_BOUNDARIES_1)
+  bucketized_feature_2 = bucketized_column(
+    numeric_column('feature_2'), BUCKET_BOUNDARIES_2)
+
+  def input_fn_train():
+    dataset = create-dataset-from-training-data
+    # Don't use repeat or cache, since it is assumed to be one epoch
+    # This is either tf.data.Dataset, or a tuple of feature dict and label.
+    return dataset
+
+  regressor = boosted_trees_regressor_train_in_memory(
+      train_input_fn,
+      feature_columns=[bucketized_feature_1, bucketized_feature_2],
+      n_trees=100,
+      ... <some other params>
+  )
+
+  def input_fn_eval():
+    ...
+    return dataset
+
+  metrics = regressor.evaluate(input_fn=input_fn_eval, steps=10)
+  ```
+
+  Args:
+    train_input_fn: the input function returns a dataset containing a single
+      epoch of *unbatched* features and labels.
+    feature_columns: An iterable containing all the feature columns used by
+      the model. All items in the set should be instances of classes derived
+      from `FeatureColumn`.
+    model_dir: Directory to save model parameters, graph and etc. This can
+      also be used to load checkpoints from the directory into a estimator
+      to continue training a previously saved model.
+    label_dimension: Number of regression targets per example.
+      Multi-dimensional support is not yet implemented.
+    weight_column: A string or a `_NumericColumn` created by
+      `tf.feature_column.numeric_column` defining feature column representing
+      weights. It is used to downweight or boost examples during training. It
+      will be multiplied by the loss of the example. If it is a string, it is
+      used as a key to fetch weight tensor from the `features`. If it is a
+      `_NumericColumn`, raw tensor is fetched by key `weight_column.key`,
+      then weight_column.normalizer_fn is applied on it to get weight tensor.
+    n_trees: number trees to be created.
+    max_depth: maximum depth of the tree to grow.
+    learning_rate: shrinkage parameter to be used when a tree added to the
+      model.
+    l1_regularization: regularization multiplier applied to the absolute
+      weights of the tree leafs.
+    l2_regularization: regularization multiplier applied to the square weights
+      of the tree leafs.
+    tree_complexity: regularization factor to penalize trees with more leaves.
+    config: `RunConfig` object to configure the runtime settings.
+    train_hooks: a list of Hook instances to be passed to estimator.train().
+
+  Returns:
+    a `BoostedTreesClassifier` instance created with the given arguments and
+      trained with the data loaded up on memory from the input_fn.
+
+  Raises:
+    ValueError: when wrong arguments are given or unsupported functionalities
+       are requested.
+  """
+  # pylint: disable=protected-access
+  # TODO(nponomareva): Extend it to multi-dimension cases.
+  if label_dimension == canned_boosted_trees._HOLD_FOR_MULTI_DIM_SUPPORT:
+    label_dimension = 1
+  head = canned_boosted_trees._create_regression_head(label_dimension,
+                                                      weight_column)
+
+  # HParams for the model.
+  tree_hparams = canned_boosted_trees._TreeHParams(
+      n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
+      tree_complexity)
+
+  def _model_fn(features, labels, mode, config):
+    return canned_boosted_trees._bt_model_fn(
+        features,
+        labels,
+        mode,
+        head,
+        feature_columns,
+        tree_hparams,
+        n_batches_per_layer=1,
+        config=config,
+        train_in_memory=True)
+
+  in_memory_regressor = estimator.Estimator(
+      model_fn=_model_fn, model_dir=model_dir, config=config)
+
+  in_memory_regressor.train(input_fn=train_input_fn, hooks=train_hooks)
+
+  return in_memory_regressor
+  # pylint: enable=protected-access
diff --git a/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py b/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e99a87f3b3c0e7c5840fa250506e600645bf6a29
--- /dev/null
+++ b/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py
@@ -0,0 +1,207 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests boosted_trees estimators."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.estimator.python.estimator import boosted_trees
+from tensorflow.python.estimator.canned import boosted_trees as canned_boosted_trees
+from tensorflow.python.estimator.inputs import numpy_io
+from tensorflow.python.feature_column import feature_column
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import googletest
+from tensorflow.python.training import checkpoint_utils
+
+NUM_FEATURES = 3
+
+BUCKET_BOUNDARIES = [-2., .5, 12.]  # Boundaries for all the features.
+INPUT_FEATURES = np.array(
+    [
+        [12.5, 1.0, -2.001, -2.0001, -1.999],  # feature_0 quantized:[3,2,0,0,1]
+        [2.0, -3.0, 0.5, 0.0, 0.4995],         # feature_1 quantized:[2,0,2,1,1]
+        [3.0, 20.0, 50.0, -100.0, 102.75],     # feature_2 quantized:[2,3,3,0,3]
+    ],
+    dtype=np.float32)
+CLASSIFICATION_LABELS = [[0.], [1.], [1.], [0.], [0.]]
+REGRESSION_LABELS = [[1.5], [0.3], [0.2], [2.], [5.]]
+FEATURES_DICT = {'f_%d' % i: INPUT_FEATURES[i] for i in range(NUM_FEATURES)}
+
+
+def _make_train_input_fn(is_classification):
+  """Makes train input_fn for classification/regression."""
+
+  def _input_fn():
+    features = dict(FEATURES_DICT)
+    if is_classification:
+      labels = CLASSIFICATION_LABELS
+    else:
+      labels = REGRESSION_LABELS
+    return features, labels
+
+  return _input_fn
+
+
+class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
+
+  def setUp(self):
+    self._head = canned_boosted_trees._create_regression_head(label_dimension=1)
+    self._feature_columns = {
+        feature_column.bucketized_column(
+            feature_column.numeric_column('f_%d' % i, dtype=dtypes.float32),
+            BUCKET_BOUNDARIES)
+        for i in range(NUM_FEATURES)
+    }
+
+  def _assert_checkpoint(self, model_dir, expected_global_step):
+    self.assertEqual(expected_global_step,
+                     checkpoint_utils.load_variable(model_dir,
+                                                    ops.GraphKeys.GLOBAL_STEP))
+
+  def testTrainAndEvaluateEstimator(self):
+    input_fn = _make_train_input_fn(is_classification=False)
+
+    est = boosted_trees._BoostedTreesEstimator(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=2,
+        head=self._head,
+        max_depth=5)
+
+    # It will stop after 10 steps because of the max depth and num trees.
+    num_steps = 100
+    # Train for a few steps, and validate final checkpoint.
+    est.train(input_fn, steps=num_steps)
+    self._assert_checkpoint(est.model_dir, 11)
+    eval_res = est.evaluate(input_fn=input_fn, steps=1)
+    self.assertAllClose(eval_res['average_loss'], 0.913176)
+
+  def testInferEstimator(self):
+    train_input_fn = _make_train_input_fn(is_classification=False)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees._BoostedTreesEstimator(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=1,
+        max_depth=5,
+        head=self._head)
+
+    # It will stop after 5 steps because of the max depth and num trees.
+    num_steps = 100
+    # Train for a few steps, and validate final checkpoint.
+    est.train(train_input_fn, steps=num_steps)
+    self._assert_checkpoint(est.model_dir, 6)
+
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertEquals(5, len(predictions))
+    self.assertAllClose([0.703549], predictions[0]['predictions'])
+    self.assertAllClose([0.266539], predictions[1]['predictions'])
+    self.assertAllClose([0.256479], predictions[2]['predictions'])
+    self.assertAllClose([1.088732], predictions[3]['predictions'])
+    self.assertAllClose([1.901732], predictions[4]['predictions'])
+
+
+class BoostedTreesClassifierTrainInMemoryTest(test_util.TensorFlowTestCase):
+
+  def setUp(self):
+    self._feature_columns = {
+        feature_column.bucketized_column(
+            feature_column.numeric_column('f_%d' % i, dtype=dtypes.float32),
+            BUCKET_BOUNDARIES)
+        for i in range(NUM_FEATURES)
+    }
+
+  def _assert_checkpoint(self, model_dir, expected_global_step):
+    self.assertEqual(expected_global_step,
+                     checkpoint_utils.load_variable(model_dir,
+                                                    ops.GraphKeys.GLOBAL_STEP))
+
+  def testBinaryClassifierTrainInMemoryAndEvalAndInfer(self):
+    train_input_fn = _make_train_input_fn(is_classification=True)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.boosted_trees_classifier_train_in_memory(
+        train_input_fn=train_input_fn,
+        feature_columns=self._feature_columns,
+        n_trees=1,
+        max_depth=5)
+    # It will stop after 5 steps because of the max depth and num trees.
+    self._assert_checkpoint(est.model_dir, 6)
+
+    # Check eval.
+    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
+    self.assertAllClose(eval_res['accuracy'], 1.0)
+
+    # Check predict that all labels are correct.
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertEquals(5, len(predictions))
+    self.assertAllClose([0], predictions[0]['class_ids'])
+    self.assertAllClose([1], predictions[1]['class_ids'])
+    self.assertAllClose([1], predictions[2]['class_ids'])
+    self.assertAllClose([0], predictions[3]['class_ids'])
+    self.assertAllClose([0], predictions[4]['class_ids'])
+
+
+class BoostedTreesRegressorTrainInMemoryTest(test_util.TensorFlowTestCase):
+
+  def setUp(self):
+    self._feature_columns = {
+        feature_column.bucketized_column(
+            feature_column.numeric_column('f_%d' % i, dtype=dtypes.float32),
+            BUCKET_BOUNDARIES)
+        for i in range(NUM_FEATURES)
+    }
+
+  def _assert_checkpoint(self, model_dir, expected_global_step):
+    self.assertEqual(expected_global_step,
+                     checkpoint_utils.load_variable(model_dir,
+                                                    ops.GraphKeys.GLOBAL_STEP))
+
+  def testRegressorTrainInMemoryAndEvalAndInfer(self):
+    train_input_fn = _make_train_input_fn(is_classification=False)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.boosted_trees_regressor_train_in_memory(
+        train_input_fn=train_input_fn,
+        feature_columns=self._feature_columns,
+        n_trees=1,
+        max_depth=5)
+    # It will stop after 5 steps because of the max depth and num trees.
+    self._assert_checkpoint(est.model_dir, 6)
+
+    # Check eval.
+    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
+    self.assertAllClose(eval_res['average_loss'], 2.2136638)
+
+    # Validate predictions.
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertEquals(5, len(predictions))
+    self.assertAllClose([0.703549], predictions[0]['predictions'])
+    self.assertAllClose([0.266539], predictions[1]['predictions'])
+    self.assertAllClose([0.256479], predictions[2]['predictions'])
+    self.assertAllClose([1.088732], predictions[3]['predictions'])
+    self.assertAllClose([1.901732], predictions[4]['predictions'])
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined_test.py b/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined_test.py
index b5e4d34dc70ccaa4806ae8b8ed5001bd971ee7b4..dd009a6753f3231638f93e50fc8f19eae8820139 100644
--- a/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined_test.py
@@ -34,6 +34,7 @@ from tensorflow.python.estimator.inputs import numpy_io
 from tensorflow.python.feature_column import feature_column
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import nn
+from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.summary.writer import writer_cache
@@ -52,7 +53,9 @@ def _dnn_only_estimator_fn(
     config=None):
   return dnn_linear_combined.DNNLinearCombinedEstimator(
       head=head_lib.regression_head(
-          weight_column=weight_column, label_dimension=label_dimension),
+          weight_column=weight_column, label_dimension=label_dimension,
+          # Tests in core (from which this test inherits) test the sum loss.
+          loss_reduction=losses.Reduction.SUM),
       model_dir=model_dir,
       dnn_feature_columns=feature_columns,
       dnn_optimizer=optimizer,
@@ -100,7 +103,9 @@ def _linear_only_estimator_fn(
     partitioner=None):
   return dnn_linear_combined.DNNLinearCombinedEstimator(
       head=head_lib.regression_head(
-          weight_column=weight_column, label_dimension=label_dimension),
+          weight_column=weight_column, label_dimension=label_dimension,
+          # Tests in core (from which this test inherits) test the sum loss.
+          loss_reduction=losses.Reduction.SUM),
       model_dir=model_dir,
       linear_feature_columns=feature_columns,
       linear_optimizer=optimizer,
diff --git a/tensorflow/contrib/estimator/python/estimator/dnn_test.py b/tensorflow/contrib/estimator/python/estimator/dnn_test.py
index 71f810acec856d42d389260e7b9fea32123348b4..75e3107670d658e55ce23d983e47311f1c180104 100644
--- a/tensorflow/contrib/estimator/python/estimator/dnn_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/dnn_test.py
@@ -32,6 +32,7 @@ from tensorflow.python.estimator.export import export
 from tensorflow.python.estimator.inputs import numpy_io
 from tensorflow.python.feature_column import feature_column
 from tensorflow.python.framework import ops
+from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.summary.writer import writer_cache
@@ -41,7 +42,9 @@ def _dnn_estimator_fn(weight_column=None, label_dimension=1, *args, **kwargs):
   """Returns a DNNEstimator that uses regression_head."""
   return dnn.DNNEstimator(
       head=head_lib.regression_head(
-          weight_column=weight_column, label_dimension=label_dimension),
+          weight_column=weight_column, label_dimension=label_dimension,
+          # Tests in core (from which this test inherits) test the sum loss.
+          loss_reduction=losses.Reduction.SUM),
       *args, **kwargs)
 
 
diff --git a/tensorflow/contrib/estimator/python/estimator/extenders.py b/tensorflow/contrib/estimator/python/estimator/extenders.py
index 2b6881b81487dfdb682d5d6261a0318c59d461f6..266ae933052b11b9ab3edb662e95c90aae207dae 100644
--- a/tensorflow/contrib/estimator/python/estimator/extenders.py
+++ b/tensorflow/contrib/estimator/python/estimator/extenders.py
@@ -23,6 +23,7 @@ import six
 from tensorflow.python.estimator import estimator as estimator_lib
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.estimator import util as estimator_util
+from tensorflow.python.estimator.export.export_output import PredictOutput
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
 from tensorflow.python.ops import clip_ops
@@ -233,7 +234,17 @@ def forward_features(estimator, keys=None):
             'argument of forward_features to filter unwanted features. Type of '
             'features[{}] is {}.'.format(key, key, type(feature)))
       predictions[key] = feature
-    return spec._replace(predictions=predictions)
+    spec = spec._replace(predictions=predictions)
+    if spec.export_outputs:
+      for ekey in ['predict', 'serving_default']:
+        if (ekey in spec.export_outputs and
+            isinstance(spec.export_outputs[ekey],
+                       PredictOutput)):
+          export_outputs = spec.export_outputs[ekey].outputs
+          for key in get_keys(features):
+            export_outputs[key] = predictions[key]
+
+    return spec
 
   return estimator_lib.Estimator(
       model_fn=new_model_fn,
diff --git a/tensorflow/contrib/estimator/python/estimator/extenders_test.py b/tensorflow/contrib/estimator/python/estimator/extenders_test.py
index ad1a8ef152b07ecbab33d9eb3184a2ae89def27d..407af2deaf0928361a4f0b0e44e842b7750118cb 100644
--- a/tensorflow/contrib/estimator/python/estimator/extenders_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/extenders_test.py
@@ -18,20 +18,27 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+import tempfile
 import numpy as np
 
 from tensorflow.contrib.estimator.python.estimator import extenders
+from tensorflow.contrib.predictor import from_saved_model
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.estimator import estimator_lib
 from tensorflow.python.estimator.canned import linear
 from tensorflow.python.feature_column import feature_column as fc
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import metrics as metrics_lib
 from tensorflow.python.ops import variables
+from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.training import training
+from tensorflow.python.util import compat
 
 
 def get_input_fn(x, y):
@@ -177,6 +184,44 @@ class ForwardFeaturesTest(test.TestCase):
     self.assertIn('id', predictions)
     self.assertEqual(101, predictions['id'])
 
+  def test_forward_in_exported(self):
+
+    def serving_input_fn():
+      features_ph = {
+          'x': array_ops.placeholder(dtypes.float32, [None]),
+          'id': array_ops.placeholder(dtypes.int32, [None])
+      }
+      features = {
+          key: array_ops.expand_dims(tensor, -1)
+          for key, tensor in features_ph.items()
+      }
+      return estimator_lib.export.ServingInputReceiver(features, features_ph)
+    def input_fn():
+      return {'x': [[3.], [5.]], 'id': [[101], [102]]}, [[1.], [2.]]
+    # create estimator
+    feature_columns = [fc.numeric_column('x')]
+    estimator = linear.LinearRegressor(feature_columns)
+    estimator.train(input_fn=input_fn, steps=1)
+    estimator = extenders.forward_features(estimator, 'id')
+
+    # export saved model
+    tmpdir = tempfile.mkdtemp()
+    export_dir_base = os.path.join(
+        compat.as_bytes(tmpdir), compat.as_bytes('export'))
+    export_dir = estimator.export_savedmodel(export_dir_base, serving_input_fn)
+    self.assertTrue(gfile.Exists(export_dir))
+
+    # restore model
+    predict_fn = from_saved_model(export_dir, signature_def_key='predict')
+    predictions = predict_fn({'x': [3], 'id': [101]})
+
+    # verify that 'id' exists in predictions
+    self.assertIn('id', predictions)
+    self.assertEqual(101, predictions['id'])
+
+    # Clean up.
+    gfile.DeleteRecursively(tmpdir)
+
   def test_forward_list(self):
 
     def input_fn():
diff --git a/tensorflow/contrib/estimator/python/estimator/head.py b/tensorflow/contrib/estimator/python/estimator/head.py
index f95fcc8039cb54c26543781b31013a7676168b0b..85ef3291bae44d3c3126d778eba718ebe15993b5 100644
--- a/tensorflow/contrib/estimator/python/estimator/head.py
+++ b/tensorflow/contrib/estimator/python/estimator/head.py
@@ -36,10 +36,12 @@ from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops.losses import losses
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.summary import summary
+from tensorflow.python.training import training_util
 
 _DEFAULT_SERVING_KEY = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
 
 
+# TODO(b/65403806): Switch loss_reduction default to SUM_OVER_BATCH_SIZE.
 def multi_class_head(n_classes,
                      weight_column=None,
                      label_vocabulary=None,
@@ -176,7 +178,7 @@ def binary_classification_head(
 
 def regression_head(weight_column=None,
                     label_dimension=1,
-                    loss_reduction=losses.Reduction.SUM,
+                    loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE,
                     loss_fn=None,
                     inverse_link_fn=None,
                     name=None):
@@ -216,7 +218,9 @@ def regression_head(weight_column=None,
       of the last dimension of the labels `Tensor` (typically, this has shape
       `[batch_size, label_dimension]`).
     loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
-      reduce training loss over batch. Defaults to `SUM`.
+      reduce training loss over batch and label dimension. Defaults to
+      `SUM_OVER_BATCH_SIZE`, namely weighted sum of losses divided by
+      `batch size * label_dimension`. See `tf.losses.Reduction`.
     loss_fn: Optional loss function. Defaults to `mean_squared_error`.
     inverse_link_fn: Optional inverse link function, also known as 'mean
       function'. Defaults to identity.
@@ -241,7 +245,7 @@ def regression_head(weight_column=None,
 def poisson_regression_head(
     weight_column=None,
     label_dimension=1,
-    loss_reduction=losses.Reduction.SUM,
+    loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE,
     compute_full_loss=True,
     name=None):
   """Creates a `_Head` for poisson regression using `tf.nn.log_poisson_loss`.
@@ -273,7 +277,9 @@ def poisson_regression_head(
       of the last dimension of the labels `Tensor` (typically, this has shape
       `[batch_size, label_dimension]`).
     loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
-      reduce training loss over batch. Defaults to `SUM`.
+      reduce training loss over batch and label dimension. Defaults to
+      `SUM_OVER_BATCH_SIZE`, namely weighted sum of losses divided by
+      `batch size * label_dimension`. See `tf.losses.Reduction`.
     compute_full_loss: Whether to include the constant `log(z!)` term in
       computing the poisson loss. See `tf.nn.log_poisson_loss` for the full
       documentation.
@@ -302,7 +308,7 @@ def multi_label_head(n_classes,
                      weight_column=None,
                      thresholds=None,
                      label_vocabulary=None,
-                     loss_reduction=losses.Reduction.SUM,
+                     loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE,
                      loss_fn=None,
                      name=None):
   """Creates a `_Head` for multi-label classification.
@@ -353,7 +359,8 @@ def multi_label_head(n_classes,
       string type and have any value in `label_vocabulary`. Also there will be
       errors if vocabulary is not provided and labels are string.
     loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
-      reduce training loss over batch. Defaults to `SUM`.
+      reduce training loss over batch. Defaults to `SUM_OVER_BATCH_SIZE`, namely
+      weighted sum of losses divided by batch size. See `tf.losses.Reduction`.
     loss_fn: Optional loss function.
     name: name of the head. If provided, summary and metrics keys will be
       suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
@@ -402,7 +409,7 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
                weight_column=None,
                thresholds=None,
                label_vocabulary=None,
-               loss_reduction=losses.Reduction.SUM,
+               loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE,
                loss_fn=None,
                name=None):
     self._n_classes = n_classes
@@ -489,8 +496,8 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
         processed_labels=processed_labels)
 
   def create_estimator_spec(
-      self, features, mode, logits, labels=None, train_op_fn=None,
-      regularization_losses=None):
+      self, features, mode, logits, labels=None, optimizer=None,
+      train_op_fn=None, regularization_losses=None):
     """Returns an `EstimatorSpec`.
 
     Args:
@@ -502,8 +509,11 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
         with shape `[D0, D1, ... DN, n_classes]` or `SparseTensor` with
         `dense_shape` `[D0, D1, ... DN, ?]`. `labels` is required argument when
         `mode` equals `TRAIN` or `EVAL`.
+      optimizer: `Optimizer` instance to optimize the loss in TRAIN mode.
+        Namely, sets `train_op = optimizer.minimize(loss, global_step)`, which
+        updates variables and increments `global_step`.
       train_op_fn: Function that takes a scalar loss `Tensor` and returns
-        `train_op`. Required in TRAIN mode.
+        `train_op`. Used if `optimizer` is `None`.
       regularization_losses: A list of additional scalar losses to be added to
         the training loss, such as regularization losses. These losses are
         usually expressed as a batch average, so for best results users need to
@@ -513,7 +523,8 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
     Returns:
       `EstimatorSpec`.
     Raises:
-      ValueError: If `train_op_fn` is `None` in TRAIN mode.
+      ValueError: If both `train_op_fn` and `optimizer` are `None` in TRAIN
+        mode, or if both are set.
     """
     with ops.name_scope(self._name, 'head'):
       logits = head_lib._check_logits_final_dim(logits, self.logits_dimension)  # pylint:disable=protected-access
@@ -565,8 +576,16 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
                 regularization_loss=regularization_loss))
 
       # Train.
-      if train_op_fn is None:
-        raise ValueError('train_op_fn can not be None.')
+      if optimizer is not None:
+        if train_op_fn is not None:
+          raise ValueError('train_op_fn and optimizer cannot both be set.')
+        train_op = optimizer.minimize(
+            regularized_training_loss,
+            global_step=training_util.get_global_step())
+      elif train_op_fn is not None:
+        train_op = train_op_fn(regularized_training_loss)
+      else:
+        raise ValueError('train_op_fn and optimizer cannot both be None.')
       # Only summarize mean_loss for SUM reduction to preserve backwards
       # compatibility. Otherwise skip it to avoid unnecessary computation.
       if self._loss_reduction == losses.Reduction.SUM:
@@ -592,7 +611,7 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
         mode=model_fn.ModeKeys.TRAIN,
         predictions=predictions,
         loss=regularized_training_loss,
-        train_op=train_op_fn(regularized_training_loss))
+        train_op=train_op)
 
   def _eval_metric_ops(
       self, labels, probabilities, weights, unreduced_loss,
diff --git a/tensorflow/contrib/estimator/python/estimator/head_test.py b/tensorflow/contrib/estimator/python/estimator/head_test.py
index dc30dde877ab5f912e3f6a724d481b151a3ed044..98962ca4277a3e8fbbdb3fb2d26df9acc45168b5 100644
--- a/tensorflow/contrib/estimator/python/estimator/head_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/head_test.py
@@ -272,9 +272,9 @@ class MultiLabelHead(test.TestCase):
 
     logits = np.array([[-1., 1.], [-1.5, 1.]], dtype=np.float32)
     labels = np.array([[1, 0], [1, 1]], dtype=np.int64)
-    # loss = labels * -log(sigmoid(logits)) +
-    #        (1 - labels) * -log(1 - sigmoid(logits))
-    expected_training_loss = np.sum(
+    # loss = (labels * -log(sigmoid(logits)) +
+    #         (1 - labels) * -log(1 - sigmoid(logits))) / 2
+    expected_training_loss = 0.5 * np.sum(
         _sigmoid_cross_entropy(labels=labels, logits=logits))
     actual_training_loss = head.create_loss(
         features={'x': np.array(((42,),), dtype=np.int32)},
@@ -298,7 +298,7 @@ class MultiLabelHead(test.TestCase):
     # For large logits, this is approximated as:
     # loss = labels * (logits < 0) * (-logits) +
     #        (1 - labels) * (logits > 0) * logits
-    expected_training_loss = np.sum(
+    expected_training_loss = 0.5 * np.sum(
         np.array([[(10. + 10.) / 2.], [(15. + 0.) / 2.]], dtype=np.float32))
     actual_training_loss = head.create_loss(
         features={'x': np.array(((42,),), dtype=np.int32)},
@@ -361,7 +361,7 @@ class MultiLabelHead(test.TestCase):
         labels=labels_input)[0]
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(np.sum(loss), actual_training_loss.eval())
+      self.assertAllClose(np.sum(loss) / 2., actual_training_loss.eval())
 
   def test_eval_create_loss_loss_fn_wrong_shape(self):
     """Tests custom loss_fn that returns Tensor of unexpected shape."""
@@ -438,12 +438,13 @@ class MultiLabelHead(test.TestCase):
     labels = np.array([[1, 0], [1, 1]], dtype=np.int64)
     # loss = labels * -log(sigmoid(logits)) +
     #        (1 - labels) * -log(1 - sigmoid(logits))
-    # Sum over examples.
-    expected_loss = np.sum(_sigmoid_cross_entropy(labels=labels, logits=logits))
+    # Sum over examples, divide by batch_size.
+    expected_loss = 0.5 * np.sum(
+        _sigmoid_cross_entropy(labels=labels, logits=logits))
     keys = metric_keys.MetricKeys
     expected_metrics = {
         # Average loss over examples.
-        keys.LOSS_MEAN: expected_loss / 2,
+        keys.LOSS_MEAN: expected_loss,
         # auc and auc_pr cannot be reliably calculated for only 4 samples, but
         # this assert tests that the algorithm remains consistent.
         keys.AUC: 0.3333,
@@ -468,14 +469,13 @@ class MultiLabelHead(test.TestCase):
     labels_multi_hot = np.array([[1, 0], [1, 1]], dtype=np.int64)
     # loss = labels * -log(sigmoid(logits)) +
     #        (1 - labels) * -log(1 - sigmoid(logits))
-    # Sum over examples.
-    expected_loss = (
-        np.sum(_sigmoid_cross_entropy(labels=labels_multi_hot, logits=logits))
-    )
+    # Sum over examples, divide by batch_size.
+    expected_loss = 0.5 * np.sum(
+        _sigmoid_cross_entropy(labels=labels_multi_hot, logits=logits))
     keys = metric_keys.MetricKeys
     expected_metrics = {
         # Average loss over examples.
-        keys.LOSS_MEAN: expected_loss / 2,
+        keys.LOSS_MEAN: expected_loss,
         # auc and auc_pr cannot be reliably calculated for only 4 samples, but
         # this assert tests that the algorithm remains consistent.
         keys.AUC: 0.3333,
@@ -533,14 +533,13 @@ class MultiLabelHead(test.TestCase):
     labels_multi_hot = np.array([[1, 0], [1, 1]], dtype=np.int64)
     # loss = labels * -log(sigmoid(logits)) +
     #        (1 - labels) * -log(1 - sigmoid(logits))
-    # Sum over examples.
-    expected_loss = (
-        np.sum(_sigmoid_cross_entropy(labels=labels_multi_hot, logits=logits))
-    )
+    # Sum over examples, divide by batch_size.
+    expected_loss = 0.5 * np.sum(
+        _sigmoid_cross_entropy(labels=labels_multi_hot, logits=logits))
     keys = metric_keys.MetricKeys
     expected_metrics = {
         # Average loss over examples.
-        keys.LOSS_MEAN: expected_loss / 2,
+        keys.LOSS_MEAN: expected_loss,
         # auc and auc_pr cannot be reliably calculated for only 4 samples, but
         # this assert tests that the algorithm remains consistent.
         keys.AUC: 0.3333,
@@ -562,15 +561,14 @@ class MultiLabelHead(test.TestCase):
     labels = np.array([[1, 0], [1, 1]], dtype=np.int64)
     # loss = labels * -log(sigmoid(logits)) +
     #        (1 - labels) * -log(1 - sigmoid(logits))
-    # Sum over examples.
-    expected_loss = (
-        np.sum(_sigmoid_cross_entropy(labels=labels, logits=logits))
-    )
+    # Sum over examples, divide by batch_size.
+    expected_loss = 0.5 * np.sum(
+        _sigmoid_cross_entropy(labels=labels, logits=logits))
 
     keys = metric_keys.MetricKeys
     expected_metrics = {
         # Average loss over examples.
-        keys.LOSS_MEAN: expected_loss / 2,
+        keys.LOSS_MEAN: expected_loss,
         # auc and auc_pr cannot be reliably calculated for only 4 samples, but
         # this assert tests that the algorithm remains consistent.
         keys.AUC: 0.3333,
@@ -603,8 +601,9 @@ class MultiLabelHead(test.TestCase):
     # loss = labels * (logits < 0) * (-logits) +
     #        (1 - labels) * (logits > 0) * logits =>
     # expected_unweighted_loss = [[10., 10.], [15., 0.]]
-    # Average over classes, weighted sum over examples.
-    expected_loss = 25.
+    # Average over classes, weighted sum over examples, divide by batch_size.
+    # loss = ( 1 * (10 + 10) / 2 + 2 * (15 + 0) / 2) / 2
+    expected_loss = 12.5
 
     spec = head.create_estimator_spec(
         features={
@@ -617,8 +616,8 @@ class MultiLabelHead(test.TestCase):
 
     keys = metric_keys.MetricKeys
     expected_metrics = {
-        # Average loss over weighted examples.
-        keys.LOSS_MEAN: expected_loss / 3,
+        # Average loss over weighted examples (denominator is sum(weights)).
+        keys.LOSS_MEAN: expected_loss * (2. / 3.),
         # auc and auc_pr cannot be reliably calculated for only 4 samples, but
         # this assert tests that the algorithm remains consistent.
         keys.AUC: 0.2000,
@@ -663,7 +662,7 @@ class MultiLabelHead(test.TestCase):
     #        (1 - labels) * (logits > 0) * logits
     expected_unreduced_loss = [[(10. + 10.) / 2.], [(15. + 0.) / 2.]]
     expected_weights = [[1.], [2.]]
-    expected_training_loss = 1. * (10. + 10.) / 2. + 2. * (15. + 0.) / 2.
+    expected_training_loss = (1. * (10. + 10.) / 2. + 2. * (15. + 0.) / 2.) / 2.
     training_loss, unreduced_loss, actual_weights, _ = head.create_loss(
         features={
             'x': np.array(((42,),), dtype=np.int32),
@@ -809,11 +808,8 @@ class MultiLabelHead(test.TestCase):
       self.assertEqual(
           six.b('{0:s}{1:.3f}'.format(expected_train_result, expected_loss)),
           train_result)
-      _assert_simple_summaries(self, {
-          metric_keys.MetricKeys.LOSS: expected_loss,
-          # Average loss over examples.
-          metric_keys.MetricKeys.LOSS_MEAN: expected_loss / 2,
-      }, summary_str, tol)
+      _assert_simple_summaries(
+          self, {metric_keys.MetricKeys.LOSS: expected_loss}, summary_str, tol)
 
   def test_train(self):
     head = head_lib.multi_label_head(n_classes=2)
@@ -823,8 +819,9 @@ class MultiLabelHead(test.TestCase):
     # loss = labels * (logits < 0) * (-logits) +
     #        (1 - labels) * (logits > 0) * logits =>
     # expected_unweighted_loss = [[10., 10.], [15., 0.]]
-    # Average over classes, sum over weights.
-    expected_loss = 17.5
+    # Average over classes, sum over examples, divide by batch_size.
+    # loss = ( (10 + 10) / 2 + (15 + 0) / 2 ) / 2
+    expected_loss = 8.75
     self._test_train(
         head=head, logits=logits, labels=labels, expected_loss=expected_loss)
 
@@ -840,8 +837,9 @@ class MultiLabelHead(test.TestCase):
     # loss = labels * (logits < 0) * (-logits) +
     #        (1 - labels) * (logits > 0) * logits =>
     # expected_unweighted_loss = [[10., 10.], [15., 0.]]
-    # Average over classes, sum over weights.
-    expected_loss = 17.5
+    # Average over classes, sum over examples, divide by batch_size.
+    # loss = ( (10 + 10) / 2 + (15 + 0) / 2 ) / 2
+    expected_loss = 8.75
     self._test_train(
         head=head, logits=logits, labels=labels, expected_loss=expected_loss)
 
@@ -858,11 +856,49 @@ class MultiLabelHead(test.TestCase):
     # loss = labels * (logits < 0) * (-logits) +
     #        (1 - labels) * (logits > 0) * logits =>
     # expected_unweighted_loss = [[10., 10.], [15., 0.]]
-    # Average over classes, sum over weights.
-    expected_loss = 17.5
+    # Average over classes, sum over examples, divide by batch_size.
+    # loss = ( (10 + 10) / 2 + (15 + 0) / 2 ) / 2
+    expected_loss = 8.75
     self._test_train(
         head=head, logits=logits, labels=labels, expected_loss=expected_loss)
 
+  def test_train_with_optimizer(self):
+    head = head_lib.multi_label_head(n_classes=2)
+    logits = np.array([[-10., 10.], [-15., 10.]], dtype=np.float32)
+    labels = np.array([[1, 0], [1, 1]], dtype=np.int64)
+    # For large logits, sigmoid cross entropy loss is approximated as:
+    # loss = labels * (logits < 0) * (-logits) +
+    #        (1 - labels) * (logits > 0) * logits =>
+    # expected_unweighted_loss = [[10., 10.], [15., 0.]]
+    # Average over classes, sum over examples, divide by batch_size.
+    # loss = ( (10 + 10) / 2 + (15 + 0) / 2 ) / 2
+    expected_loss = 8.75
+    expected_train_result = 'my_train_op'
+
+    class _Optimizer(object):
+
+      def minimize(self, loss, global_step):
+        del global_step
+        return string_ops.string_join(
+            [constant_op.constant(expected_train_result),
+             string_ops.as_string(loss, precision=3)])
+
+    spec = head.create_estimator_spec(
+        features={'x': np.array(((42,),), dtype=np.int32)},
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels,
+        optimizer=_Optimizer())
+
+    tol = 1e-3
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      loss, train_result = sess.run((spec.loss, spec.train_op))
+      self.assertAllClose(expected_loss, loss, rtol=tol, atol=tol)
+      self.assertEqual(
+          six.b('{0:s}{1:.3f}'.format(expected_train_result, expected_loss)),
+          train_result)
+
   def test_train_with_regularization_losses(self):
     head = head_lib.multi_label_head(
         n_classes=2, loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
@@ -916,8 +952,9 @@ class MultiLabelHead(test.TestCase):
     # loss = labels * (logits < 0) * (-logits) +
     #        (1 - labels) * (logits > 0) * logits =>
     # expected_unweighted_loss = [[10., 10.], [15., 0.]]
-    # Average over classes, weighted sum over examples.
-    expected_loss = 25.
+    # Average over classes, weighted sum over examples, divide by batch_size.
+    # loss = ( 1 * (10 + 10) / 2 + 2 * (15 + 0) / 2 ) / 2
+    expected_loss = 12.5
     expected_train_result = 'my_train_op'
     def _train_op_fn(loss):
       return string_ops.string_join(
@@ -951,11 +988,8 @@ class MultiLabelHead(test.TestCase):
       self.assertEqual(
           six.b('{0:s}{1:.3f}'.format(expected_train_result, expected_loss)),
           train_result)
-      _assert_simple_summaries(self, {
-          metric_keys.MetricKeys.LOSS: expected_loss,
-          # Average loss over weighted examples.
-          metric_keys.MetricKeys.LOSS_MEAN: expected_loss / 3,
-      }, summary_str, tol)
+      _assert_simple_summaries(
+          self, {metric_keys.MetricKeys.LOSS: expected_loss,}, summary_str, tol)
 
   def test_multi_dim_weighted_train_create_loss(self):
     """Logits and labels of shape [2, 2, 3], weights [2, 2]."""
@@ -972,8 +1006,8 @@ class MultiLabelHead(test.TestCase):
     expected_unreduced_loss = [[[20./3.], [10./3.]], [[4.], [8.]]]
     # weights are reshaped to [2, 2, 1] to match logits.
     expected_weights = [[[1.], [1.5]], [[2.], [2.5]]]
-    # weighted_sum_loss = 1*20/3 + 1.5*10/3 + 2*4 + 2.5*8 = 39.6667
-    expected_training_loss = 39.6667
+    # loss = (1*20/3 + 1.5*10/3 + 2*4 + 2.5*8) / 4 = 9.9167
+    expected_training_loss = 9.9167
     training_loss, unreduced_loss, actual_weights, _ = head.create_loss(
         features={'weights': weights},
         mode=model_fn.ModeKeys.TRAIN,
@@ -999,8 +1033,8 @@ class MultiLabelHead(test.TestCase):
     weights = np.array([[1., 1.5], [2., 2.5]], dtype=np.float32)
     # loss = [[10 + 10 + 0, 0 + 0 + 10], [0 + 0 + 12, 12 + 12 + 0]] / 3
     #      = [[20/3, 10/3], [4, 8]]
-    # weighted_sum_loss = 1*20/3 + 1.5*10/3 + 2*4 + 2.5*8 = 39.6667
-    expected_loss = 39.6667
+    # loss = (1*20/3 + 1.5*10/3 + 2*4 + 2.5*8) / 4 = 9.9167
+    expected_loss = 9.9167
     expected_train_result = 'my_train_op'
     def _train_op_fn(loss):
       return string_ops.string_join(
@@ -1088,11 +1122,11 @@ class MultiLabelHead(test.TestCase):
     weights = np.array([[1., 1.5], [2., 2.5]], dtype=np.float32)
     # loss = [[10 + 10 + 0, 0 + 0 + 10], [0 + 0 + 12, 12 + 12 + 0]] / 3
     #      = [[20/3, 10/3], [4, 8]]
-    # weighted_sum_loss = 1*20/3 + 1.5*10/3 + 2*4 + 2.5*8 = 39.6667
-    expected_loss = 39.6667
+    # loss = (1*20/3 + 1.5*10/3 + 2*4 + 2.5*8) / 4 = 9.9167
+    expected_loss = 9.9167
     keys = metric_keys.MetricKeys
     expected_metrics = {
-        keys.LOSS_MEAN: expected_loss / np.sum(weights),
+        keys.LOSS_MEAN: expected_loss * (4. / np.sum(weights)),
         # auc and auc_pr cannot be reliably calculated for only 4 samples, but
         # this assert tests that the algorithm remains consistent.
         keys.AUC: 0.4977,
@@ -1128,8 +1162,8 @@ class PoissonRegressionHead(test.TestCase):
     #         exp(-1) - 2 * (-1) + 2*ln(2) - 2 + 0.5*ln(2*pi*2),
     #         exp(1) - 3 * 1 + 3*ln(3) - 3 + 0.5*ln(2*pi*3)]
     #      = [1.0, 3.020, 1.482]
-    # sum_loss = 5.502
-    expected_loss = 5.502
+    # training_loss = (1.0 + 3.020 + 1.482) / 3
+    expected_loss = 1.834
     atol = 0.001
     expected_train_result = b'my_train_op'
     def _train_op_fn(loss):
diff --git a/tensorflow/contrib/estimator/python/estimator/linear_test.py b/tensorflow/contrib/estimator/python/estimator/linear_test.py
index c63514eb688af48577f0a3b7ce9e7478309f2c30..c41996b9c6871d294f157411662f2eb9d4c09e5c 100644
--- a/tensorflow/contrib/estimator/python/estimator/linear_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/linear_test.py
@@ -32,6 +32,7 @@ from tensorflow.python.estimator.export import export
 from tensorflow.python.estimator.inputs import numpy_io
 from tensorflow.python.feature_column import feature_column
 from tensorflow.python.framework import ops
+from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.summary.writer import writer_cache
@@ -42,7 +43,9 @@ def _linear_estimator_fn(
   """Returns a LinearEstimator that uses regression_head."""
   return linear.LinearEstimator(
       head=head_lib.regression_head(
-          weight_column=weight_column, label_dimension=label_dimension),
+          weight_column=weight_column, label_dimension=label_dimension,
+          # Tests in core (from which this test inherits) test the sum loss.
+          loss_reduction=losses.Reduction.SUM),
       *args, **kwargs)
 
 
diff --git a/tensorflow/contrib/estimator/python/estimator/multi_head.py b/tensorflow/contrib/estimator/python/estimator/multi_head.py
index 0346ddc24bffd61068177f4622bd03be4acd53d9..bbbc19cc4dfb4b23f9b707023fbfdd124f1f48de 100644
--- a/tensorflow/contrib/estimator/python/estimator/multi_head.py
+++ b/tensorflow/contrib/estimator/python/estimator/multi_head.py
@@ -23,6 +23,7 @@ import six
 from tensorflow.python.estimator import model_fn
 from tensorflow.python.estimator.canned import head as head_lib
 from tensorflow.python.estimator.canned import metric_keys
+from tensorflow.python.estimator.export import export_output as export_output_lib
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -30,6 +31,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import metrics as metrics_lib
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.summary import summary
+from tensorflow.python.training import training_util
 
 
 _DEFAULT_SERVING_KEY = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
@@ -226,8 +228,10 @@ class _MultiHead(head_lib._Head):  # pylint:disable=protected-access
         weights=example_weights_by_head,
         processed_labels=labels_by_head)
 
+  # TODO(b/65403806): Support regularization_losses arg.
   def create_estimator_spec(
-      self, features, mode, logits, labels=None, train_op_fn=None):
+      self, features, mode, logits, labels=None, optimizer=None,
+      train_op_fn=None):
     """See `_Head`."""
     if isinstance(logits, dict):
       logits_dict = logits
@@ -248,9 +252,10 @@ class _MultiHead(head_lib._Head):  # pylint:disable=protected-access
               train_op_fn=_no_op_train_fn))
 
     if mode == model_fn.ModeKeys.TRAIN:
-      if train_op_fn is None:
-        raise ValueError('train_op_fn can not be None in TRAIN mode.')
-      spec = self._merge_train(all_estimator_spec, train_op_fn)
+      spec = self._merge_train(
+          all_estimator_spec=all_estimator_spec,
+          optimizer=optimizer,
+          train_op_fn=train_op_fn)
       with ops.name_scope(''):
         summary.scalar(metric_keys.MetricKeys.LOSS, spec.loss)
       return spec
@@ -279,16 +284,21 @@ class _MultiHead(head_lib._Head):  # pylint:disable=protected-access
         begin_idx += head.logits_dimension
     return logits_dict
 
-  def _merge_train(self, all_estimator_spec, train_op_fn):
+  def _merge_train(self, all_estimator_spec, optimizer, train_op_fn):
     """Merges list of `EstimatorSpec` for training.
 
     Args:
       all_estimator_spec: list of `EstimatorSpec` for the individual heads.
-      train_op_fn: Function to create train op. See `create_estimator_spec`
-        documentation for more details.
+      optimizer: `Optimizer` instance to create train op. See
+        `create_estimator_spec` documentation for more details.
+      train_op_fn: Function to create train op. Used if `optimizer` is `None`.
 
     Returns:
       `EstimatorSpec` that merges all heads for TRAIN.
+
+    Raises:
+      ValueError: If both `train_op_fn` and `optimizer` are `None` in TRAIN
+        mode.
     """
     losses = []
     metrics = {}
@@ -297,11 +307,20 @@ class _MultiHead(head_lib._Head):  # pylint:disable=protected-access
       # Metric keys already contain head.name.
       metrics.update(spec.eval_metric_ops or {})
     loss = _merge_losses(losses, self._head_weights)
+    if optimizer is not None:
+      if train_op_fn is not None:
+        raise ValueError('train_op_fn and optimizer cannot both be set.')
+      train_op = optimizer.minimize(
+          loss, global_step=training_util.get_global_step())
+    elif train_op_fn is not None:
+      train_op = train_op_fn(loss)
+    else:
+      raise ValueError('train_op_fn and optimizer cannot both be None.')
 
     return model_fn.EstimatorSpec(
         mode=model_fn.ModeKeys.TRAIN,
         loss=loss,
-        train_op=train_op_fn(loss),
+        train_op=train_op,
         eval_metric_ops=metrics)
 
   def _merge_predict(self, all_estimator_spec):
@@ -319,6 +338,7 @@ class _MultiHead(head_lib._Head):  # pylint:disable=protected-access
             all_estimator_spec[0].export_outputs,
             self._heads[0].name),
     }
+    merged_predict_outputs = {}
     for head, spec in zip(self._heads, all_estimator_spec):
       head_name = head.name
       for k, v in six.iteritems(spec.export_outputs):
@@ -327,8 +347,15 @@ class _MultiHead(head_lib._Head):  # pylint:disable=protected-access
         else:
           key = '%s/%s' % (k, head_name)
         export_outputs[key] = v
+        if (k == head_lib._PREDICT_SERVING_KEY and  # pylint:disable=protected-access
+            isinstance(v, export_output_lib.PredictOutput)):
+          for kp, vp in six.iteritems(v.outputs):
+            key = '%s/%s' % (head_name, kp)
+            merged_predict_outputs[key] = vp
       for k, v in six.iteritems(spec.predictions):
         predictions[(head_name, k)] = v
+    export_outputs[head_lib._PREDICT_SERVING_KEY] = (  # pylint:disable=protected-access
+        export_output_lib.PredictOutput(merged_predict_outputs))
 
     return model_fn.EstimatorSpec(
         mode=model_fn.ModeKeys.PREDICT,
diff --git a/tensorflow/contrib/estimator/python/estimator/multi_head_test.py b/tensorflow/contrib/estimator/python/estimator/multi_head_test.py
index 65ea89ba1b9236d0bf4d2de430fab168ef50bf97..d9e5aca2952d25a7d917f9d76f95ab89733115a0 100644
--- a/tensorflow/contrib/estimator/python/estimator/multi_head_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/multi_head_test.py
@@ -127,8 +127,8 @@ class MultiHeadTest(test.TestCase):
         logits=logits)
 
     self.assertItemsEqual(
-        (_DEFAULT_SERVING_KEY, 'head1', 'classification/head1', 'predict/head1',
-         'head2', 'classification/head2', 'predict/head2'),
+        (_DEFAULT_SERVING_KEY, 'predict', 'head1', 'classification/head1',
+         'predict/head1', 'head2', 'classification/head2', 'predict/head2'),
         spec.export_outputs.keys())
 
     # Assert predictions and export_outputs.
@@ -158,6 +158,22 @@ class MultiHeadTest(test.TestCase):
       self.assertAllClose(
           expected_probabilities['head2'],
           sess.run(spec.export_outputs['head2'].scores))
+      self.assertAllClose(
+          expected_probabilities['head1'],
+          sess.run(
+              spec.export_outputs['predict'].outputs['head1/probabilities']))
+      self.assertAllClose(
+          expected_probabilities['head2'],
+          sess.run(
+              spec.export_outputs['predict'].outputs['head2/probabilities']))
+      self.assertAllClose(
+          expected_probabilities['head1'],
+          sess.run(
+              spec.export_outputs['predict/head1'].outputs['probabilities']))
+      self.assertAllClose(
+          expected_probabilities['head2'],
+          sess.run(
+              spec.export_outputs['predict/head2'].outputs['probabilities']))
 
   def test_predict_two_heads_logits_tensor(self):
     """Tests predict with logits as Tensor."""
@@ -181,8 +197,8 @@ class MultiHeadTest(test.TestCase):
         logits=logits)
 
     self.assertItemsEqual(
-        (_DEFAULT_SERVING_KEY, 'head1', 'classification/head1', 'predict/head1',
-         'head2', 'classification/head2', 'predict/head2'),
+        (_DEFAULT_SERVING_KEY, 'predict', 'head1', 'classification/head1',
+         'predict/head1', 'head2', 'classification/head2', 'predict/head2'),
         spec.export_outputs.keys())
 
     # Assert predictions and export_outputs.
@@ -238,8 +254,8 @@ class MultiHeadTest(test.TestCase):
         logits=logits)
 
     self.assertItemsEqual(
-        (_DEFAULT_SERVING_KEY, 'head1', 'regression/head1', 'predict/head1',
-         'head2', 'regression/head2', 'predict/head2'),
+        (_DEFAULT_SERVING_KEY, 'predict', 'head1', 'regression/head1',
+         'predict/head1', 'head2', 'regression/head2', 'predict/head2'),
         spec.export_outputs.keys())
 
     # Assert predictions and export_outputs.
@@ -283,10 +299,11 @@ class MultiHeadTest(test.TestCase):
     # loss = labels * (logits < 0) * (-logits) +
     #        (1 - labels) * (logits > 0) * logits =>
     # head1: expected_unweighted_loss = [[10., 10.], [15., 0.]]
+    # loss = ( (10 + 10) / 2 + (15 + 0) / 2 ) / 2 = 8.75
     # head2: expected_unweighted_loss = [[20., 20., 20.], [30., 0., 0]]
-    # Average over classes, weighted sum over batch and heads.
-    expected_loss_head1 = 17.5
-    expected_loss_head2 = 30.0
+    # loss = ( (20 + 20 + 20) / 3 + (30 + 0 + 0) / 3 ) / 2 = 15
+    expected_loss_head1 = 8.75
+    expected_loss_head2 = 15.
     expected_loss = 1. * expected_loss_head1 + 2. * expected_loss_head2
 
     spec = multi_head.create_estimator_spec(
@@ -300,8 +317,8 @@ class MultiHeadTest(test.TestCase):
         keys.LOSS + '/head1': expected_loss_head1,
         keys.LOSS + '/head2': expected_loss_head2,
         # Average loss over examples.
-        keys.LOSS_MEAN + '/head1': expected_loss_head1 / 2,
-        keys.LOSS_MEAN + '/head2': expected_loss_head2 / 2,
+        keys.LOSS_MEAN + '/head1': expected_loss_head1,
+        keys.LOSS_MEAN + '/head2': expected_loss_head2,
         # auc and auc_pr cannot be reliably calculated for only 4-6 samples, but
         # this assert tests that the algorithm remains consistent.
         keys.AUC + '/head1': 0.1667,
@@ -347,8 +364,8 @@ class MultiHeadTest(test.TestCase):
     tol = 1e-3
     with self.test_session():
       # Unreduced loss of the head is [[(10 + 10) / 2], (15 + 0) / 2]
-      # (averaged over classes, sum-reduced over examples).
-      self.assertAllClose(17.5, loss.eval(), rtol=tol, atol=tol)
+      # (averaged over classes, averaged over examples).
+      self.assertAllClose(8.75, loss.eval(), rtol=tol, atol=tol)
 
   def test_train_create_loss_two_heads_with_weights(self):
     # Use different example weighting for each head weighting.
@@ -383,18 +400,18 @@ class MultiHeadTest(test.TestCase):
     with self.test_session():
       # loss of the first head is [[(10 + 10) / 2], [(15 + 0) / 2]]
       # = [10, 7.5]
-      # training_loss = 1 * 10 + 2 * 7.5 = 25
+      # training_loss = (1 * 10 + 2 * 7.5) / 2 = 12.5
       # head-weighted unreduced_loss = 1 * [10, 7.5]
       self.assertAllClose(
           [[10.], [7.5]], unreduced_losses['head1'].eval(), rtol=tol, atol=tol)
       # loss of the second head is [[(20 + 20 + 20) / 3], [(30 + 0 + 0) / 3]]
       # = [20, 10]
-      # training_loss = 2 * 20 + 3 * 10 = 70
+      # training_loss = (2 * 20 + 3 * 10) / 2 = 35
       # head-weighted unreduced_loss = 2 * [20, 10]
       self.assertAllClose(
           [[40.], [20.]], unreduced_losses['head2'].eval(), rtol=tol, atol=tol)
-      # head-weighted training_loss = 1 * 25 + 2 * 70 = 165
-      self.assertAllClose(165, training_loss.eval(), rtol=tol, atol=tol)
+      # head-weighted training_loss = 1 * 12.5 + 2 * 35 = 82.5
+      self.assertAllClose(82.5, training_loss.eval(), rtol=tol, atol=tol)
       # head-weighted example weights
       self.assertAllClose(
           [[1.], [2.]], weights['head1'].eval(), rtol=tol, atol=tol)
@@ -431,18 +448,18 @@ class MultiHeadTest(test.TestCase):
     with self.test_session():
       # loss of the first head is [[(10 + 10) / 2], [(15 + 0) / 2]]
       # = [10, 7.5]
-      # training_loss = 1 * 10 + 2 * 7.5 = 25
+      # training_loss = (1 * 10 + 2 * 7.5) / 2 = 12.5
       # head-weighted unreduced_loss = 1 * [10, 7.5]
       self.assertAllClose(
           [[10.], [7.5]], unreduced_losses['head1'].eval(), rtol=tol, atol=tol)
       # loss of the second head is [[(20 + 20 + 20) / 3], [(30 + 0 + 0) / 3]]
       # = [20, 10]
-      # training_loss = 2 * 20 + 3 * 10 = 70
+      # training_loss = (2 * 20 + 3 * 10) / 2 = 35
       # head-weighted unreduced_loss = 2 * [20, 10]
       self.assertAllClose(
           [[40.], [20.]], unreduced_losses['head2'].eval(), rtol=tol, atol=tol)
-      # head-weighted training_loss = 1 * 25 + 2 * 70 = 165
-      self.assertAllClose(165, training_loss.eval(), rtol=tol, atol=tol)
+      # head-weighted training_loss = 1 * 12.5 + 2 * 35 = 82.5
+      self.assertAllClose(82.5, training_loss.eval(), rtol=tol, atol=tol)
       # head-weighted example weights
       self.assertAllClose(
           [[1.], [2.]], weights['head1'].eval(), rtol=tol, atol=tol)
@@ -466,14 +483,14 @@ class MultiHeadTest(test.TestCase):
                            [[2., 2., 0.], [2., 2., 0.]]], dtype=np.float32),
     }
     # Loss for the first head:
-    # loss1 = (1+1)^2 + (0-1)^2 + (1+1)^2 + (0-1)^2 +
-    #         (1.5+1.5)^2 + (1.5-1.5)^2 + (1.5+1.5)^2 + (1.5-1.5)^2
-    #       = 28
+    # loss1 = ((1+1)^2 + (0-1)^2 + (1+1)^2 + (0-1)^2 +
+    #          (1.5+1.5)^2 + (1.5-1.5)^2 + (1.5+1.5)^2 + (1.5-1.5)^2) / 8
+    #       = 3.5
     # Loss for the second head:
-    # loss2 = (0-2)^2 + (1+2)^2 + (0-2)^2 + (0-2)^2 + (1+2)^2 + (0-2)^2 +
-    #         (2+2)^2 + (2-2)^2 + (0+2)^2 + (2+2)^2 + (2-2)^2 + (0+2)^2
-    #       = 74
-    expected_training_loss = 28. + 74.
+    # loss2 = ((0-2)^2 + (1+2)^2 + (0-2)^2 + (0-2)^2 + (1+2)^2 + (0-2)^2 +
+    #          (2+2)^2 + (2-2)^2 + (0+2)^2 + (2+2)^2 + (2-2)^2 + (0+2)^2) / 12
+    #       = 6.167
+    expected_training_loss = 3.5 + 6.167
 
     training_loss = multi_head.create_loss(
         features={},
@@ -495,8 +512,8 @@ class MultiHeadTest(test.TestCase):
     # loss = labels * (logits < 0) * (-logits) +
     #        (1 - labels) * (logits > 0) * logits =>
     # expected_unweighted_loss = [[10., 10.], [15., 0.]]
-    # Average over classes, sum over weights.
-    expected_loss = 17.5
+    # loss = ( (10 + 10) / 2 + (15 + 0) / 2 ) / 2 = 8.75
+    expected_loss = 8.75
     expected_train_result = 'my_train_op'
     def _train_op_fn(loss):
       return string_ops.string_join(
@@ -530,10 +547,46 @@ class MultiHeadTest(test.TestCase):
       _assert_simple_summaries(self, {
           metric_keys.MetricKeys.LOSS: expected_loss,
           metric_keys.MetricKeys.LOSS + '/head1': expected_loss,
-          # Average loss over examples.
-          metric_keys.MetricKeys.LOSS_MEAN + '/head1': expected_loss / 2,
       }, summary_str, tol)
 
+  def test_train_one_head_with_optimizer(self):
+    head1 = head_lib.multi_label_head(n_classes=2, name='head1')
+    multi_head = multi_head_lib.multi_head([head1])
+
+    logits = {'head1': np.array([[-10., 10.], [-15., 10.]], dtype=np.float32)}
+    labels = {'head1': np.array([[1, 0], [1, 1]], dtype=np.int64)}
+    # For large logits, sigmoid cross entropy loss is approximated as:
+    # loss = labels * (logits < 0) * (-logits) +
+    #        (1 - labels) * (logits > 0) * logits =>
+    # expected_unweighted_loss = [[10., 10.], [15., 0.]]
+    # loss = ( (10 + 10) / 2 + (15 + 0) / 2 ) / 2 = 8.75
+    expected_loss = 8.75
+    expected_train_result = 'my_train_op'
+
+    class _Optimizer(object):
+
+      def minimize(self, loss, global_step):
+        del global_step
+        return string_ops.string_join(
+            [constant_op.constant(expected_train_result),
+             string_ops.as_string(loss, precision=3)])
+
+    spec = multi_head.create_estimator_spec(
+        features={'x': np.array(((42,),), dtype=np.int32)},
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels,
+        optimizer=_Optimizer())
+
+    tol = 1e-3
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      loss, train_result = sess.run((spec.loss, spec.train_op))
+      self.assertAllClose(expected_loss, loss, rtol=tol, atol=tol)
+      self.assertEqual(
+          six.b('{0:s}{1:.3f}'.format(expected_train_result, expected_loss)),
+          train_result)
+
   def test_train_two_heads_with_weights(self):
     head1 = head_lib.multi_label_head(n_classes=2, name='head1')
     head2 = head_lib.multi_label_head(n_classes=3, name='head2')
@@ -553,10 +606,12 @@ class MultiHeadTest(test.TestCase):
     # loss = labels * (logits < 0) * (-logits) +
     #        (1 - labels) * (logits > 0) * logits =>
     # head1: expected_unweighted_loss = [[10., 10.], [15., 0.]]
+    # loss = ( (10 + 10) / 2 + (15 + 0) / 2 ) / 2 = 8.75
     # head2: expected_unweighted_loss = [[20., 20., 20.], [30., 0., 0]]
+    # loss = ( (20 + 20 + 20) / 3 + (30 + 0 + 0) / 3 ) / 2 = 15
     # Average over classes, weighted sum over batch and heads.
-    expected_loss_head1 = 17.5
-    expected_loss_head2 = 30.0
+    expected_loss_head1 = 8.75
+    expected_loss_head2 = 15.0
     expected_loss = 1. * expected_loss_head1 + 2. * expected_loss_head2
     expected_train_result = 'my_train_op'
     def _train_op_fn(loss):
@@ -592,9 +647,6 @@ class MultiHeadTest(test.TestCase):
           metric_keys.MetricKeys.LOSS: expected_loss,
           metric_keys.MetricKeys.LOSS + '/head1': expected_loss_head1,
           metric_keys.MetricKeys.LOSS + '/head2': expected_loss_head2,
-          # Average loss over examples.
-          metric_keys.MetricKeys.LOSS_MEAN + '/head1': expected_loss_head1 / 2,
-          metric_keys.MetricKeys.LOSS_MEAN + '/head2': expected_loss_head2 / 2,
       }, summary_str, tol)
 
 
diff --git a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
index e0fae2c99292385c6dd32cc6002cee2076a2bb20..fa2697800ec1a44f215f3d5fc9be2197a9e58219 100644
--- a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
+++ b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
@@ -136,7 +136,7 @@ def replicate_model_fn(model_fn,
       the train_op argument of `EstimatorSpec`.
     loss_reduction: controls whether losses are summed or averaged.
     devices: Optional list of devices to replicate the model across.  This
-      argument can be used to replice only on the subset of available GPUs.
+      argument can be used to replicate only on the subset of available GPUs.
       If `None`, then all available GPUs are going to be used for replication.
       If no GPUs are available, then the model is going to be placed on the CPU.
 
diff --git a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py
index d46a18aacfcd911c56a9f22dc9581060c7b458a6..144b45982c8aec2e2b115c812b24e8843d60ce1e 100644
--- a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import re
 import shutil
 import tempfile
+from absl.testing import parameterized
 import numpy as np
 import six
 
@@ -57,26 +58,19 @@ from tensorflow.python.training import gradient_descent
 from tensorflow.python.training import training
 
 
-# TODO(isaprykin):  Parametrize all the tests on
-#   replicate_model_fn._VariableDistributionMode when it's supported.
-class DNNClassifierIntegrationTest(test_util.TensorFlowTestCase):
+class DNNClassifierIntegrationTest(test_util.TensorFlowTestCase,
+                                   parameterized.TestCase):
 
   def setUp(self):
     self._model_dir = tempfile.mkdtemp()
 
-  def test_complete_flow_with_public_version(self):
-    return self._complete_flow_with_mode(mode=None)
-
-  def test_complete_flow_with_mode_local_ps_server(self):
-    return self._complete_flow_with_mode(
-        replicate_model_fn._VariableDistributionMode.
-        SHARED_LOCAL_PARAMETER_SERVER)
-
-  def test_complete_flow_with_mode_round_robin(self):
-    return self._complete_flow_with_mode(
-        replicate_model_fn._VariableDistributionMode.SHARED_ROUND_ROBIN)
-
-  def _complete_flow_with_mode(self, mode):
+  @parameterized.named_parameters(
+      ('PublicInterface', None),
+      ('ParameterServerMode', replicate_model_fn._VariableDistributionMode.
+       SHARED_LOCAL_PARAMETER_SERVER),
+      ('RoundRobinMode',
+       replicate_model_fn._VariableDistributionMode.SHARED_ROUND_ROBIN))
+  def test_complete_flow_with_mode(self, mode):
     n_classes = 3
     input_dimension = 2
     batch_size = 12
diff --git a/tensorflow/contrib/factorization/BUILD b/tensorflow/contrib/factorization/BUILD
index c56c92a0a4a01218d1da5a6b366df3272d14b861..0a648d5d40e431bedb42017b15cabe078ac22fa7 100644
--- a/tensorflow/contrib/factorization/BUILD
+++ b/tensorflow/contrib/factorization/BUILD
@@ -66,6 +66,7 @@ tf_custom_op_py_library(
         "//tensorflow/python:variables",
         "//tensorflow/python/estimator",
         "//tensorflow/python/estimator:model_fn",
+        "//tensorflow/python/feature_column:feature_column_py",
         "//third_party/py/numpy",
     ],
 )
@@ -241,6 +242,7 @@ py_test(
         "//tensorflow/python:random_ops",
         "//tensorflow/python:training",
         "//tensorflow/python/estimator:run_config",
+        "//tensorflow/python/feature_column:feature_column_py",
         "//third_party/py/numpy",
     ],
 )
@@ -345,16 +347,3 @@ cuda_py_test(
     ],
     main = "python/kernel_tests/masked_matmul_benchmark.py",
 )
-
-# All files
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/factorization/examples/BUILD b/tensorflow/contrib/factorization/examples/BUILD
index bbe842bd5ccc7357805adda1df42ba8799fcd8f2..363baa121ab3854a802ca3606e35597d31b35a57 100644
--- a/tensorflow/contrib/factorization/examples/BUILD
+++ b/tensorflow/contrib/factorization/examples/BUILD
@@ -21,14 +21,3 @@ tf_py_test(
     ],
     tags = ["notsan"],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/factorization/kernels/BUILD b/tensorflow/contrib/factorization/kernels/BUILD
index 44eab56011dad2f6fbe843b3569b4acc5c5e542a..ea8b9a17a27093cb57564861815edd6ecb18a014 100644
--- a/tensorflow/contrib/factorization/kernels/BUILD
+++ b/tensorflow/contrib/factorization/kernels/BUILD
@@ -67,14 +67,3 @@ tf_cc_test(
         "//tensorflow/core:testlib",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/factorization/kernels/clustering_ops.cc b/tensorflow/contrib/factorization/kernels/clustering_ops.cc
index dd61f59585aee2e0245cfd6797b313b972c19bc5..2a6c97e8b9526894eba057505a2bf823ad778f56 100644
--- a/tensorflow/contrib/factorization/kernels/clustering_ops.cc
+++ b/tensorflow/contrib/factorization/kernels/clustering_ops.cc
@@ -353,7 +353,7 @@ class NearestNeighborsOp : public OpKernel {
     auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
     const int64 num_threads = worker_threads.num_threads;
     // This kernel might be configured to use fewer than the total number of
-    // available CPUs on the host machine. To avoid descructive interference
+    // available CPUs on the host machine. To avoid destructive interference
     // with other jobs running on the host machine, we must only use a fraction
     // of total available L3 cache. Unfortunately, we cannot query the host
     // machine to get the number of physical CPUs. So, we use a fixed per-CPU
diff --git a/tensorflow/contrib/factorization/python/ops/clustering_ops.py b/tensorflow/contrib/factorization/python/ops/clustering_ops.py
index 23137e0a973c0bdd2cdbd97159f7fd310178bf54..84e80791f4991ad2b67d0a00ee1e00cf0d0daadc 100644
--- a/tensorflow/contrib/factorization/python/ops/clustering_ops.py
+++ b/tensorflow/contrib/factorization/python/ops/clustering_ops.py
@@ -41,11 +41,12 @@ from tensorflow.python.platform import resource_loader
 _clustering_ops = loader.load_op_library(
     resource_loader.get_path_to_datafile('_clustering_ops.so'))
 
-# Euclidean distance between vectors U and V is defined as ||U - V||_F which is
-# the square root of the sum of the absolute squares of the elements difference.
+# Euclidean distance between vectors U and V is defined as \\(||U - V||_F\\)
+# which is the square root of the sum of the absolute squares of the elements
+# difference.
 SQUARED_EUCLIDEAN_DISTANCE = 'squared_euclidean'
 # Cosine distance between vectors U and V is defined as
-# 1 - (U \dot V) / (||U||_F ||V||_F)
+# \\(1 - (U \dot V) / (||U||_F ||V||_F)\\)
 COSINE_DISTANCE = 'cosine'
 
 RANDOM_INIT = 'random'
@@ -472,8 +473,8 @@ class KMeans(object):
         # Locally compute the sum of inputs mapped to each id.
         # For a cluster with old cluster value x, old count n, and with data
         # d_1,...d_k newly assigned to it, we recompute the new value as
-        # x += (sum_i(d_i) - k * x) / (n + k).
-        # Compute sum_i(d_i), see comment above.
+        # \\(x += (sum_i(d_i) - k * x) / (n + k)\\).
+        # Compute \\(sum_i(d_i)\\), see comment above.
         cluster_center_updates = math_ops.unsorted_segment_sum(
             inp, unique_idx, num_unique_cluster_idx)
         # Shape to enable broadcasting count_updates and learning_rate to inp.
diff --git a/tensorflow/contrib/factorization/python/ops/factorization_ops.py b/tensorflow/contrib/factorization/python/ops/factorization_ops.py
index 054888e734086c153f7af59f4548d4d20abab813..811fa89bc38c61b16710a441b99d9e5dfac67668 100644
--- a/tensorflow/contrib/factorization/python/ops/factorization_ops.py
+++ b/tensorflow/contrib/factorization/python/ops/factorization_ops.py
@@ -51,9 +51,9 @@ class WALSModel(object):
   r"""A model for Weighted Alternating Least Squares matrix factorization.
 
   It minimizes the following loss function over U, V:
-   \\(
-   \|\sqrt W \odot (A - U V^T) \|_F^2 + \lambda (\|U\|_F^2 + \|V\|_F^2)
-   )\\
+  $$
+   \|\sqrt W \odot (A - U V^T)\|_F^2 + \lambda (\|U\|_F^2 + \|V\|_F^2)
+  $$
     where,
     A: input matrix,
     W: weight matrix. Note that the (element-wise) square root of the weights
@@ -61,12 +61,12 @@ class WALSModel(object):
     U, V: row_factors and column_factors matrices,
     \\(\lambda)\\: regularization.
   Also we assume that W is of the following special form:
-  \\( W_{ij} = W_0 + R_i * C_j )\\  if \\(A_{ij} \ne 0)\\,
-  \\(W_{ij} = W_0)\\ otherwise.
+  \\( W_{ij} = W_0 + R_i * C_j \\)  if \\(A_{ij} \ne 0\\),
+  \\(W_{ij} = W_0\\) otherwise.
   where,
-  \\(W_0)\\: unobserved_weight,
-  \\(R_i)\\: row_weights,
-  \\(C_j)\\: col_weights.
+  \\(W_0\\): unobserved_weight,
+  \\(R_i\\): row_weights,
+  \\(C_j\\): col_weights.
 
   Note that the current implementation supports two operation modes: The default
   mode is for the condition where row_factors and col_factors can individually
@@ -82,14 +82,15 @@ class WALSModel(object):
   normalized as follows:
     _, _, unregularized_loss, regularization, sum_weights =
         update_row_factors(sp_input)
-  if sp_input contains the rows {A_i, i \in I}, and the input matrix A has n
-  total rows, then the minibatch loss = unregularized_loss + regularization is
-   \\(
+  if sp_input contains the rows \\({A_i, i \in I}\\), and the input matrix A
+  has n total rows, then the minibatch loss = unregularized_loss +
+  regularization is
+   $$
    (\|\sqrt W_I \odot (A_I - U_I V^T)\|_F^2 + \lambda \|U_I\|_F^2) * n / |I| +
    \lambda \|V\|_F^2
-   )\\
+   $$
   The sum_weights tensor contains the normalized sum of weights
-  sum(W_I) * n / |I|.
+  \\(sum(W_I) * n / |I|\\).
 
   A typical usage example (pseudocode):
 
@@ -106,7 +107,7 @@ class WALSModel(object):
       # the prep_gramian_op for row(column) can be run.
       worker_init_op = model.worker_init
 
-      # To be run once per interation sweep before the row(column) update
+      # To be run once per integration sweep before the row(column) update
       # initialize ops can be run. Note that in the distributed training
       # situations, this should only be run by the chief trainer. All other
       # trainers need to block until this is done.
@@ -118,9 +119,9 @@ class WALSModel(object):
       init_row_update_op = model.initialize_row_update_op
       init_col_update_op = model.initialize_col_update_op
 
-      # Ops to upate row(column). This can either take the entire sparse tensor
-      # or slices of sparse tensor. For distributed trainer, each trainer
-      # handles just part of the matrix.
+      # Ops to update row(column). This can either take the entire sparse
+      # tensor or slices of sparse tensor. For distributed trainer, each
+      # trainer handles just part of the matrix.
       _, row_update_op, unreg_row_loss, row_reg, _ = model.update_row_factors(
            sp_input=matrix_slices_from_queue_for_worker_shard)
       row_loss = unreg_row_loss + row_reg
@@ -220,10 +221,10 @@ class WALSModel(object):
         in the form of [[w_0, w_1, ...], [w_k, ... ], [...]], with the number of
         inner lists matching the number of row factor shards and the elements in
         each inner list are the weights for the rows of the corresponding row
-        factor shard. In this case,  w_ij = unonbserved_weight +
+        factor shard. In this case,  w_ij = unobserved_weight +
                                             row_weights[i] * col_weights[j].
         - If this is a single non-negative real number, this value is used for
-        all row weights and w_ij = unobserved_weight + row_weights *
+        all row weights and \\(w_ij\\) = unobserved_weight + row_weights *
                                    col_weights[j].
         Note that it is allowed to have row_weights as a list while col_weights
         a single number or vice versa.
@@ -435,7 +436,7 @@ class WALSModel(object):
       gramian: Variable storing the gramian calculated from the factors.
 
     Returns:
-      A op that updates the gramian with the calcuated value from the factors.
+      A op that updates the gramian with the calculated value from the factors.
     """
     partial_gramians = []
     for f in factors:
@@ -564,7 +565,7 @@ class WALSModel(object):
 
     Note that specifically this initializes the cache of the row and column
     weights on workers when `use_factors_weights_cache` is True. In this case,
-    if these weights are being calcualted and reset after the object is created,
+    if these weights are being calculated and reset after the object is created,
     it is important to ensure this ops is run afterwards so the cache reflects
     the correct values.
     """
@@ -665,18 +666,18 @@ class WALSModel(object):
         factors.
       unregularized_loss: A tensor (scalar) that contains the normalized
         minibatch loss corresponding to sp_input, without the regularization
-        term. If sp_input contains the rows {A_{i, :}, i \in I}, and the input
-        matrix A has n total rows, then the unregularized loss is:
-        (\|\sqrt W_I \odot (A_I - U_I V^T)\|_F^2 * n / |I|
+        term. If sp_input contains the rows \\({A_{i, :}, i \in I}\\), and the
+        input matrix A has n total rows, then the unregularized loss is:
+        \\(\|\sqrt W_I \odot (A_I - U_I V^T)\|_F^2 * n / |I|\\)
         The total loss is unregularized_loss + regularization.
       regularization: A tensor (scalar) that contains the normalized
         regularization term for the minibatch loss corresponding to sp_input.
-        If sp_input contains the rows {A_{i, :}, i \in I}, and the input matrix
-        A has n total rows, then the regularization term is:
-        \lambda \|U_I\|_F^2) * n / |I| + \lambda \|V\|_F^2.
+        If sp_input contains the rows \\({A_{i, :}, i \in I}\\), and the input
+        matrix A has n total rows, then the regularization term is:
+        \\(\lambda \|U_I\|_F^2) * n / |I| + \lambda \|V\|_F^2\\).
       sum_weights: The sum of the weights W_I corresponding to sp_input,
-        normalized by a factor of n / |I|. The root weighted squared error is:
-        \sqrt(unregularized_loss / sum_weights).
+        normalized by a factor of \\(n / |I|\\). The root weighted squared
+        error is: \sqrt(unregularized_loss / sum_weights).
     """
     return self._process_input_helper(
         True, sp_input=sp_input, transpose_input=transpose_input)
@@ -698,18 +699,18 @@ class WALSModel(object):
         factors.
       unregularized_loss: A tensor (scalar) that contains the normalized
         minibatch loss corresponding to sp_input, without the regularization
-        term. If sp_input contains the columns {A_{:, j}, j \in J}, and the
-        input matrix A has m total columns, then the unregularized loss is:
-        (\|\sqrt W_J \odot (A_J - U V_J^T)\|_F^2 * m / |I|
+        term. If sp_input contains the columns \\({A_{:, j}, j \in J}\\), and
+        the input matrix A has m total columns, then the unregularized loss is:
+        \\(\|\sqrt W_J \odot (A_J - U V_J^T)\|_F^2 * m / |I|\\)
         The total loss is unregularized_loss + regularization.
       regularization: A tensor (scalar) that contains the normalized
         regularization term for the minibatch loss corresponding to sp_input.
-        If sp_input contains the columns {A_{:, j}, j \in J}, and the input
-        matrix A has m total columns, then the regularization term is:
-        \lambda \|V_J\|_F^2) * m / |J| + \lambda \|U\|_F^2.
+        If sp_input contains the columns \\({A_{:, j}, j \in J}\\), and the
+        input matrix A has m total columns, then the regularization term is:
+        \\(\lambda \|V_J\|_F^2) * m / |J| + \lambda \|U\|_F^2\\).
       sum_weights: The sum of the weights W_J corresponding to sp_input,
-        normalized by a factor of m / |J|. The root weighted squared error is:
-        \sqrt(unregularized_loss / sum_weights).
+        normalized by a factor of \\(m / |J|\\). The root weighted squared
+        error is: \sqrt(unregularized_loss / sum_weights).
     """
     return self._process_input_helper(
         False, sp_input=sp_input, transpose_input=transpose_input)
@@ -720,8 +721,8 @@ class WALSModel(object):
                           projection_weights=None):
     """Projects the row factors.
 
-    This computes the row embedding u_i for an observed row a_i by solving
-    one iteration of the update equations.
+    This computes the row embedding \\(u_i\\) for an observed row \\(a_i\\) by
+    solving one iteration of the update equations.
 
     Args:
       sp_input: A SparseTensor representing a set of rows. Please note that the
@@ -753,8 +754,8 @@ class WALSModel(object):
                           projection_weights=None):
     """Projects the column factors.
 
-    This computes the column embedding v_j for an observed column a_j by solving
-    one iteration of the update equations.
+    This computes the column embedding \\(v_j\\) for an observed column
+    \\(a_j\\) by solving one iteration of the update equations.
 
     Args:
       sp_input: A SparseTensor representing a set of columns. Please note that
@@ -938,7 +939,7 @@ class WALSModel(object):
     loss_sp_input = (sparse_ops.sparse_transpose(new_sp_input)
                      if transpose_input else new_sp_input)
     # sp_approx is the low rank estimate of the input matrix, formed by
-    # computing the product <u_i, v_j> for (i, j) in loss_sp_input.indices.
+    # computing the product <\\(u_i, v_j\\)> for (i, j) in loss_sp_input.indices.
     sp_approx_vals = gen_factorization_ops.masked_matmul(
         new_left_values,
         right,
diff --git a/tensorflow/contrib/factorization/python/ops/factorization_ops_test.py b/tensorflow/contrib/factorization/python/ops/factorization_ops_test.py
index c8137339155ef1da8ee53967eea84a550f12ecbc..bb5140aeb3bf0238ca7cb52067ea6328dd1736d5 100644
--- a/tensorflow/contrib/factorization/python/ops/factorization_ops_test.py
+++ b/tensorflow/contrib/factorization/python/ops/factorization_ops_test.py
@@ -210,7 +210,7 @@ class WalsModelTest(test.TestCase):
 
       # Test row projection.
       # Using the specified projection weights for the 2 row feature vectors.
-      # This is expected to reprodue the same row factors in the model as the
+      # This is expected to reproduce the same row factors in the model as the
       # weights and feature vectors are identical to that used in model
       # training.
       projected_rows = wals_model.project_row_factors(
@@ -283,8 +283,8 @@ class WalsModelTest(test.TestCase):
 
       # Test column projection.
       # Using the specified projection weights for the 3 column feature vectors.
-      # This is expected to reprodue the same column factors in the model as the
-      # weights and feature vectors are identical to that used in model
+      # This is expected to reproduce the same column factors in the model as
+      # the weights and feature vectors are identical to that used in model
       # training.
       projected_cols = wals_model.project_col_factors(
           sp_input=sp_feeder,
@@ -385,7 +385,7 @@ class WalsModelTest(test.TestCase):
 
       # Test row projection.
       # Using the specified projection weights for the 2 row feature vectors.
-      # This is expected to reprodue the same row factors in the model as the
+      # This is expected to reproduce the same row factors in the model as the
       # weights and feature vectors are identical to that used in model
       # training.
       projected_rows = wals_model.project_row_factors(
@@ -462,8 +462,8 @@ class WalsModelTest(test.TestCase):
 
       # Test column projection.
       # Using the specified projection weights for the 2 column feature vectors.
-      # This is expected to reprodue the same column factors in the model as the
-      # weights and feature vectors are identical to that used in model
+      # This is expected to reproduce the same column factors in the model as
+      # the weights and feature vectors are identical to that used in model
       # training.
       projected_cols = wals_model.project_col_factors(
           sp_input=sp_feeder,
diff --git a/tensorflow/contrib/factorization/python/ops/gmm_ops.py b/tensorflow/contrib/factorization/python/ops/gmm_ops.py
index 98d6434f4752b224201e38bed05ccd14428a758b..5d77bc77e124378e13667673e4e841c0a1135b31 100644
--- a/tensorflow/contrib/factorization/python/ops/gmm_ops.py
+++ b/tensorflow/contrib/factorization/python/ops/gmm_ops.py
@@ -280,7 +280,7 @@ class GmmAlgorithm(object):
     self._define_score_samples()
 
   def _define_full_covariance_probs(self, shard_id, shard):
-    """Defines the full covariance probabilties per example in a class.
+    """Defines the full covariance probabilities per example in a class.
 
     Updates a matrix with dimension num_examples X num_classes.
 
@@ -344,7 +344,7 @@ class GmmAlgorithm(object):
   def _define_prior_log_prob_operation(self, shard_id):
     """Computes the prior probability of all samples.
 
-    Updates a vector where each item is the prior probabibility of an
+    Updates a vector where each item is the prior probability of an
     input example.
 
     Args:
@@ -357,8 +357,8 @@ class GmmAlgorithm(object):
     # Shape broadcasting.
     probs = array_ops.expand_dims(self._probs[shard_id], 0)
     # Membership weights are computed as:
-    # w_{ik} = \frac{\alpha_k f(\mathbf{y_i}|\mathbf{\theta}_k)}
-    #               {\sum_{m=1}^{K}\alpha_mf(\mathbf{y_i}|\mathbf{\theta}_m)}
+    # $$w_{ik} = \frac{\alpha_k f(\mathbf{y_i}|\mathbf{\theta}_k)}$$
+    # $$            {\sum_{m=1}^{K}\alpha_mf(\mathbf{y_i}|\mathbf{\theta}_m)}$$
     # where "i" is the i-th example, "k" is the k-th mixture, theta are
     # the model parameters and y_i the observations.
     # These are defined for each shard.
diff --git a/tensorflow/contrib/factorization/python/ops/gmm_test.py b/tensorflow/contrib/factorization/python/ops/gmm_test.py
index 00a4734eb6d89cd02484f1c5161366377cc71208..4fc9c96e9d0a317ef757d5e1bb6563ed7c8832af 100644
--- a/tensorflow/contrib/factorization/python/ops/gmm_test.py
+++ b/tensorflow/contrib/factorization/python/ops/gmm_test.py
@@ -210,7 +210,7 @@ class GMMTestQueues(test.TestCase):
     return _fn
 
   # This test makes sure that there are no deadlocks when using a QueueRunner.
-  # Note that since cluster initialization is dependendent on inputs, if input
+  # Note that since cluster initialization is dependent on inputs, if input
   # is generated using a QueueRunner, one has to make sure that these runners
   # are started before the initialization.
   def test_queues(self):
diff --git a/tensorflow/contrib/factorization/python/ops/kmeans.py b/tensorflow/contrib/factorization/python/ops/kmeans.py
index 7319eaa7de8db8e4677bdf64af3b0a72c1007a90..bfe338c9f9a7b761cfcd627b92f1682af97630c9 100644
--- a/tensorflow/contrib/factorization/python/ops/kmeans.py
+++ b/tensorflow/contrib/factorization/python/ops/kmeans.py
@@ -26,6 +26,7 @@ from tensorflow.contrib.factorization.python.ops import clustering_ops
 from tensorflow.python.estimator import estimator
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.estimator.export import export_output
+from tensorflow.python.feature_column import feature_column as fc
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -105,24 +106,32 @@ class _InitializeClustersHook(session_run_hook.SessionRunHook):
         logging.info(e)
 
 
-def _parse_tensor_or_dict(features):
+def _parse_features_if_necessary(features, feature_columns):
   """Helper function to convert the input points into a usable format.
 
   Args:
-    features: The input points.
+    features: The input features.
+    feature_columns: An optionable iterable containing all the feature columns
+      used by the model. All items in the set should be feature column instances
+      that can be passed to `tf.feature_column.input_layer`. If this is None,
+      all features will be used.
 
   Returns:
-    If `features` is a dict of `k` features, each of which is a vector of `n`
-    scalars, the return value is a Tensor of shape `(n, k)` representing `n`
-    input points, where the items in the `k` dimension are sorted
-    lexicographically by `features` key. If `features` is not a dict, it is
-    returned unmodified.
+    If `features` is a dict of `k` features (optionally filtered by
+    `feature_columns`), each of which is a vector of `n` scalars, the return
+    value is a Tensor of shape `(n, k)` representing `n` input points, where the
+    items in the `k` dimension are sorted lexicographically by `features` key.
+    If `features` is not a dict, it is returned unmodified.
   """
-  if isinstance(features, dict):
-    keys = sorted(features.keys())
-    with ops.colocate_with(features[keys[0]]):
-      features = array_ops.concat([features[k] for k in keys], axis=1)
-  return features
+  if not isinstance(features, dict):
+    return features
+
+  if feature_columns:
+    return fc.input_layer(features, feature_columns)
+
+  keys = sorted(features.keys())
+  with ops.colocate_with(features[keys[0]]):
+    return array_ops.concat([features[k] for k in keys], axis=1)
 
 
 class _ModelFn(object):
@@ -130,7 +139,8 @@ class _ModelFn(object):
 
   def __init__(self, num_clusters, initial_clusters, distance_metric,
                random_seed, use_mini_batch, mini_batch_steps_per_iteration,
-               kmeans_plus_plus_num_retries, relative_tolerance):
+               kmeans_plus_plus_num_retries, relative_tolerance,
+               feature_columns):
     self._num_clusters = num_clusters
     self._initial_clusters = initial_clusters
     self._distance_metric = distance_metric
@@ -139,6 +149,7 @@ class _ModelFn(object):
     self._mini_batch_steps_per_iteration = mini_batch_steps_per_iteration
     self._kmeans_plus_plus_num_retries = kmeans_plus_plus_num_retries
     self._relative_tolerance = relative_tolerance
+    self._feature_columns = feature_columns
 
   def model_fn(self, features, mode, config):
     """Model function for the estimator.
@@ -166,7 +177,7 @@ class _ModelFn(object):
     # input_points is a single Tensor. Therefore, the sharding functionality
     # in clustering_ops is unused, and some of the values below are lists of a
     # single item.
-    input_points = _parse_tensor_or_dict(features)
+    input_points = _parse_features_if_necessary(features, self._feature_columns)
 
     # Let N = the number of input_points.
     # all_distances: A list of one matrix of shape (N, num_clusters). Each value
@@ -316,7 +327,8 @@ class KMeansClustering(estimator.Estimator):
                mini_batch_steps_per_iteration=1,
                kmeans_plus_plus_num_retries=2,
                relative_tolerance=None,
-               config=None):
+               config=None,
+               feature_columns=None):
     """Creates an Estimator for running KMeans training and inference.
 
     This Estimator implements the following variants of the K-means algorithm:
@@ -362,11 +374,11 @@ class KMeansClustering(estimator.Estimator):
               than `num_clusters`, a TensorFlow runtime error occurs.
       distance_metric: The distance metric used for clustering. One of:
         * `KMeansClustering.SQUARED_EUCLIDEAN_DISTANCE`: Euclidean distance
-             between vectors `u` and `v` is defined as `||u - v||_2` which is
-             the square root of the sum of the absolute squares of the elements'
-             difference.
+             between vectors `u` and `v` is defined as `\\(||u - v||_2\\)`
+             which is the square root of the sum of the absolute squares of
+             the elements' difference.
         * `KMeansClustering.COSINE_DISTANCE`: Cosine distance between vectors
-             `u` and `v` is defined as `1 - (u . v) / (||u||_2 ||v||_2)`.
+             `u` and `v` is defined as `\\(1 - (u . v) / (||u||_2 ||v||_2)\\)`.
       random_seed: Python integer. Seed for PRNG used to initialize centers.
       use_mini_batch: A boolean specifying whether to use the mini-batch k-means
         algorithm. See explanation above.
@@ -383,6 +395,10 @@ class KMeansClustering(estimator.Estimator):
         iterations. Stops learning if the loss changes less than this amount.
         This may not work correctly if `use_mini_batch=True`.
       config: See @{tf.estimator.Estimator}.
+      feature_columns: An optionable iterable containing all the feature columns
+        used by the model. All items in the set should be feature column
+        instances that can be passed to `tf.feature_column.input_layer`. If this
+        is None, all features will be used.
 
     Raises:
       ValueError: An invalid argument was passed to `initial_clusters` or
@@ -402,7 +418,8 @@ class KMeansClustering(estimator.Estimator):
         model_fn=_ModelFn(
             num_clusters, initial_clusters, distance_metric, random_seed,
             use_mini_batch, mini_batch_steps_per_iteration,
-            kmeans_plus_plus_num_retries, relative_tolerance).model_fn,
+            kmeans_plus_plus_num_retries, relative_tolerance,
+            feature_columns).model_fn,
         model_dir=model_dir,
         config=config)
 
diff --git a/tensorflow/contrib/factorization/python/ops/kmeans_test.py b/tensorflow/contrib/factorization/python/ops/kmeans_test.py
index f9598bfc08c05ea3bba88b3135da0cf2e6bb0c95..88eb9cf692992fe2e1fc4f060ac98dd721c22307 100644
--- a/tensorflow/contrib/factorization/python/ops/kmeans_test.py
+++ b/tensorflow/contrib/factorization/python/ops/kmeans_test.py
@@ -27,6 +27,7 @@ from sklearn.cluster import KMeans as SklearnKMeans
 # pylint: disable=g-import-not-at-top
 from tensorflow.contrib.factorization.python.ops import kmeans as kmeans_lib
 from tensorflow.python.estimator import run_config
+from tensorflow.python.feature_column import feature_column as fc
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -226,6 +227,44 @@ class KMeansTest(KMeansTestBase):
     self._infer_helper(kmeans, clusters, 10)
     self._infer_helper(kmeans, clusters, 1)
 
+  def _parse_feature_dict_helper(self, features, parsed_feature_dict):
+    # Perform a sanity check.
+    self.assertEqual(features.shape, parsed_feature_dict.shape)
+    self.assertEqual(features.dtype, parsed_feature_dict.dtype)
+    # Then check that running the tensor yields the original list of points.
+    with self.test_session() as sess:
+      parsed_points = sess.run(parsed_feature_dict)
+      self.assertAllEqual(self.points, parsed_points)
+
+  def test_parse_features(self):
+    """Tests the various behaviours of kmeans._parse_features_if_necessary."""
+
+    # No-op if a tensor is passed in.
+    features = constant_op.constant(self.points)
+    parsed_features = kmeans_lib._parse_features_if_necessary(features, None)
+    self.assertAllEqual(features, parsed_features)
+
+    # All values from a feature dict are transformed into a tensor.
+    feature_dict = {
+        'x': [[point[0]] for point in self.points],
+        'y': [[point[1]] for point in self.points]
+    }
+    parsed_feature_dict = kmeans_lib._parse_features_if_necessary(
+        feature_dict, None)
+    self._parse_feature_dict_helper(features, parsed_feature_dict)
+
+    # Only the feature_columns of a feature dict are transformed into a tensor.
+    feature_dict_with_extras = {
+        'foo': 'bar',
+        'x': [[point[0]] for point in self.points],
+        'baz': {'fizz': 'buzz'},
+        'y': [[point[1]] for point in self.points]
+    }
+    feature_columns = [fc.numeric_column(key='x'), fc.numeric_column(key='y')]
+    parsed_feature_dict = kmeans_lib._parse_features_if_necessary(
+        feature_dict_with_extras, feature_columns)
+    self._parse_feature_dict_helper(features, parsed_feature_dict)
+
 
 class KMeansTestMultiStageInit(KMeansTestBase):
 
@@ -374,7 +413,7 @@ class KMeansCosineDistanceTest(KMeansTestBase):
     self.assertAllClose(score, self.true_score, atol=1e-2)
 
   def test_predict_kmeans_plus_plus(self):
-    # Most points are concetrated near one center. KMeans++ is likely to find
+    # Most points are concentrated near one center. KMeans++ is likely to find
     # the less populated centers.
     points = np.array(
         [[2.5, 3.5], [2.5, 3.5], [-2, 3], [-2, 3], [-3, -3], [-3.1, -3.2],
@@ -394,7 +433,6 @@ class KMeansCosineDistanceTest(KMeansTestBase):
     true_assignments = [0] * 2 + [1] * 2 + [2] * 8
     true_score = len(points) - np.tensordot(
         normalize(points), true_centers[true_assignments])
-
     kmeans = kmeans_lib.KMeansClustering(
         3,
         initial_clusters=self.initial_clusters,
@@ -566,7 +604,7 @@ class KMeansTestQueues(test.TestCase):
     return _fn
 
   # This test makes sure that there are no deadlocks when using a QueueRunner.
-  # Note that since cluster initialization is dependendent on inputs, if input
+  # Note that since cluster initialization is dependent on inputs, if input
   # is generated using a QueueRunner, one has to make sure that these runners
   # are started before the initialization.
   def test_queues(self):
diff --git a/tensorflow/contrib/factorization/python/ops/wals.py b/tensorflow/contrib/factorization/python/ops/wals.py
index 4fe22ea26ec5f5a43f1c99d1fee518b1d326c5c9..ca46c39baa16a7fddb96121e0402fc35d24ce1c2 100644
--- a/tensorflow/contrib/factorization/python/ops/wals.py
+++ b/tensorflow/contrib/factorization/python/ops/wals.py
@@ -216,7 +216,7 @@ def _wals_factorization_model_function(features, labels, mode, params):
         name=WALSMatrixFactorization.LOSS,
         collections=[ops.GraphKeys.GLOBAL_VARIABLES])
     # The root weighted squared error =
-    #   \sqrt( \sum_{i,j} w_ij * (a_ij - r_ij)^2 / \sum_{i,j} w_ij )
+    #   \\(\sqrt( \sum_{i,j} w_ij * (a_ij - r_ij)^2 / \sum_{i,j} w_ij )\\)
     rwse_var = variable_scope.variable(
         0.,
         trainable=False,
@@ -235,7 +235,7 @@ def _wals_factorization_model_function(features, labels, mode, params):
         num_items: An integer, the total number of items of this axis.
         update_fn: A function that takes one argument (`sp_input`), and that
         returns a tuple of
-          * new_factors: A flot Tensor of the factor values after update.
+          * new_factors: A float Tensor of the factor values after update.
           * update_op: a TensorFlow op which updates the factors.
           * loss: A float Tensor, the unregularized loss.
           * reg_loss: A float Tensor, the regularization loss.
@@ -490,11 +490,11 @@ class WALSMatrixFactorization(estimator.Estimator):
           and the problem simplifies to ALS. Note that, in this case,
           col_weights must also be set to "None".
         - List of lists of non-negative scalars, of the form
-          [[w_0, w_1, ...], [w_k, ... ], [...]],
+          \\([[w_0, w_1, ...], [w_k, ... ], [...]]\\),
           where the number of inner lists equal to the number of row factor
           shards and the elements in each inner list are the weights for the
           rows of that shard. In this case,
-          w_ij = unonbserved_weight + row_weights[i] * col_weights[j].
+          \\(w_ij = unonbserved_weight + row_weights[i] * col_weights[j]\\).
         - A non-negative scalar: This value is used for all row weights.
           Note that it is allowed to have row_weights as a list and col_weights
           as a scalar, or vice-versa.
diff --git a/tensorflow/contrib/feature_column/BUILD b/tensorflow/contrib/feature_column/BUILD
index 3614b2b15a6cbdd73f9f24c7e4e4534228d31499..aab7d0c9e8874269bfa5f33193b0dc0ba4bbc9cd 100644
--- a/tensorflow/contrib/feature_column/BUILD
+++ b/tensorflow/contrib/feature_column/BUILD
@@ -8,18 +8,6 @@ licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "py_test")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 py_library(
     name = "feature_column_py",
     srcs = ["__init__.py"],
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py
index e60116966fc8d8bb0745f50a0238f10f02af4167..555beddeaab419bcb23d06f960d370b706d744c8 100644
--- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py
+++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py
@@ -166,6 +166,10 @@ def sequence_categorical_column_with_identity(
 
   Returns:
     A `_SequenceCategoricalColumn`.
+
+  Raises:
+    ValueError: if `num_buckets` is less than one.
+    ValueError: if `default_value` is not in range `[0, num_buckets)`.
   """
   return fc._SequenceCategoricalColumn(
       fc.categorical_column_with_identity(
@@ -205,6 +209,10 @@ def sequence_categorical_column_with_hash_bucket(
 
   Returns:
     A `_SequenceCategoricalColumn`.
+
+  Raises:
+    ValueError: `hash_bucket_size` is not greater than 1.
+    ValueError: `dtype` is neither string nor integer.
   """
   return fc._SequenceCategoricalColumn(
       fc.categorical_column_with_hash_bucket(
@@ -257,6 +265,13 @@ def sequence_categorical_column_with_vocabulary_file(
 
   Returns:
     A `_SequenceCategoricalColumn`.
+
+  Raises:
+    ValueError: `vocabulary_file` is missing or cannot be opened.
+    ValueError: `vocabulary_size` is missing or < 1.
+    ValueError: `num_oov_buckets` is a negative integer.
+    ValueError: `num_oov_buckets` and `default_value` are both specified.
+    ValueError: `dtype` is neither string nor integer.
   """
   return fc._SequenceCategoricalColumn(
       fc.categorical_column_with_vocabulary_file(
@@ -311,6 +326,12 @@ def sequence_categorical_column_with_vocabulary_list(
 
   Returns:
     A `_SequenceCategoricalColumn`.
+
+  Raises:
+    ValueError: if `vocabulary_list` is empty, or contains duplicate keys.
+    ValueError: `num_oov_buckets` is a negative integer.
+    ValueError: `num_oov_buckets` and `default_value` are both specified.
+    ValueError: if `dtype` is not integer or string.
   """
   return fc._SequenceCategoricalColumn(
       fc.categorical_column_with_vocabulary_list(
@@ -352,8 +373,17 @@ def sequence_numeric_column(
 
   Returns:
     A `_SequenceNumericColumn`.
+
+  Raises:
+    TypeError: if any dimension in shape is not an int.
+    ValueError: if any dimension in shape is not a positive integer.
+    ValueError: if `dtype` is not convertible to `tf.float32`.
   """
-  # TODO(b/73160931): Add validations.
+  shape = fc._check_shape(shape=shape, key=key)
+  if not (dtype.is_integer or dtype.is_floating):
+    raise ValueError('dtype must be convertible to float. '
+                     'dtype: {}, key: {}'.format(dtype, key))
+
   return _SequenceNumericColumn(
       key,
       shape=shape,
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py
index b64f086376dad65c1f32bee4bfce9334a60fd24a..88f5d535162939e063eb1e7f43d495137c5adef4 100644
--- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py
+++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py
@@ -662,6 +662,32 @@ class SequenceIndicatorColumnTest(test.TestCase):
 
 class SequenceNumericColumnTest(test.TestCase):
 
+  def test_defaults(self):
+    a = sfc.sequence_numeric_column('aaa')
+    self.assertEqual('aaa', a.key)
+    self.assertEqual('aaa', a.name)
+    self.assertEqual('aaa', a._var_scope_name)
+    self.assertEqual((1,), a.shape)
+    self.assertEqual(0., a.default_value)
+    self.assertEqual(dtypes.float32, a.dtype)
+
+  def test_shape_saved_as_tuple(self):
+    a = sfc.sequence_numeric_column('aaa', shape=[1, 2])
+    self.assertEqual((1, 2), a.shape)
+
+  def test_shape_must_be_positive_integer(self):
+    with self.assertRaisesRegexp(TypeError, 'shape dimensions must be integer'):
+      sfc.sequence_numeric_column('aaa', shape=[1.0])
+
+    with self.assertRaisesRegexp(
+        ValueError, 'shape dimensions must be greater than 0'):
+      sfc.sequence_numeric_column('aaa', shape=[0])
+
+  def test_dtype_is_convertible_to_float(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'dtype must be convertible to float'):
+      sfc.sequence_numeric_column('aaa', dtype=dtypes.string)
+
   def test_get_sequence_dense_tensor(self):
     sparse_input = sparse_tensor.SparseTensorValue(
         # example 0, values [[0.], [1]]
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequential_feature_column.py b/tensorflow/contrib/feature_column/python/feature_column/sequential_feature_column.py
deleted file mode 100644
index 4ed7268e7a921284eed7767d870e56ecac39a3b1..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/feature_column/python/feature_column/sequential_feature_column.py
+++ /dev/null
@@ -1,325 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Experimental methods for tf.feature_column sequence input."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-
-import abc
-import collections
-
-
-from tensorflow.python.feature_column import feature_column as fc
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import parsing_ops
-from tensorflow.python.ops import sparse_ops
-from tensorflow.python.ops import variable_scope
-
-# TODO(b/73160931): Fix pydoc.
-# pylint: disable=g-doc-args,missing-docstring,protected-access
-# TODO(b/73827486): Support SequenceExample.
-
-
-def sequence_input_layer(
-    features,
-    feature_columns,
-    weight_collections=None,
-    trainable=True,
-    scope=None):
-  """"Builds input layer for sequence input.
-
-  All `feature_columns` must be sequence dense columns with the same
-  `sequence_length`. The output of this method can be fed into sequence
-  networks, such as RNN.
-
-  The output of this method is a 3D `Tensor` of shape `[batch_size, T, D]`.
-  `T` is the maximum sequence length for this batch, which could differ from
-  batch to batch.
-
-  If multiple `feature_columns` are given with `Di` `num_elements` each, their
-  outputs are concatenated. So, the final `Tensor` has shape
-  `[batch_size, T, D0 + D1 + ... + Dn]`.
-
-  Example:
-
-  ```python
-  rating = sequence_numeric_column('rating')
-  watches = sequence_categorical_column_with_identity(
-      'watches', num_buckets=1000)
-  watches_embedding = embedding_column(watches, dimension=10)
-  columns = [rating, watches]
-
-  features = tf.parse_example(..., features=make_parse_example_spec(columns))
-  input_layer, sequence_length = sequence_input_layer(features, columns)
-
-  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
-  outputs, state = tf.nn.dynamic_rnn(
-      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
-  ```
-
-  Returns:
-    An `(input_layer, sequence_length)` tuple where:
-    - input_layer: A float `Tensor` of shape `[batch_size, T, D]`.
-        `T` is the maximum sequence length for this batch, which could differ
-        from batch to batch. `D` is the sum of `num_elements` for all
-        `feature_columns`.
-    - sequence_length: An int `Tensor` of shape `[batch_size]`. The sequence
-        length for each example.
-  Raises:
-    ValueError: If any of the `feature_columns` is the wrong type.
-  """
-  feature_columns = fc._clean_feature_columns(feature_columns)
-  for c in feature_columns:
-    if not isinstance(c, _SequenceDenseColumn):
-      raise ValueError(
-          'All feature_columns must be of type _SequenceDenseColumn. '
-          'Given (type {}): {}'.format(type(c), c))
-
-  with variable_scope.variable_scope(
-      scope, default_name='sequence_input_layer', values=features.values()):
-    builder = fc._LazyBuilder(features)
-    output_tensors = []
-    sequence_lengths = []
-    ordered_columns = []
-    for column in sorted(feature_columns, key=lambda x: x.name):
-      ordered_columns.append(column)
-      with variable_scope.variable_scope(
-          None, default_name=column._var_scope_name):
-        dense_tensor, sequence_length = column._get_sequence_dense_tensor(
-            builder,
-            weight_collections=weight_collections,
-            trainable=trainable)
-        # Flattens the final dimension to produce a 3D Tensor.
-        num_elements = column._variable_shape.num_elements()
-        shape = array_ops.shape(dense_tensor)
-        output_tensors.append(
-            array_ops.reshape(
-                dense_tensor,
-                shape=array_ops.concat([shape[:2], [num_elements]], axis=0)))
-        sequence_lengths.append(sequence_length)
-    fc._verify_static_batch_size_equality(output_tensors, ordered_columns)
-    # TODO(b/73160931): Verify sequence_length equality.
-    return array_ops.concat(output_tensors, -1), sequence_lengths[0]
-
-
-# TODO(b/73160931): Add remaining categorical columns.
-def sequence_categorical_column_with_identity(
-    key, num_buckets, default_value=None):
-  return _SequenceCategoricalColumn(
-      fc.categorical_column_with_identity(
-          key=key,
-          num_buckets=num_buckets,
-          default_value=default_value))
-
-
-# TODO(b/73160931): Merge with embedding_column
-def _sequence_embedding_column(
-    categorical_column, dimension, initializer=None, ckpt_to_load_from=None,
-    tensor_name_in_ckpt=None, max_norm=None, trainable=True):
-  if not isinstance(categorical_column, _SequenceCategoricalColumn):
-    raise ValueError(
-        'categorical_column must be of type _SequenceCategoricalColumn. '
-        'Given (type {}): {}'.format(
-            type(categorical_column), categorical_column))
-  return _SequenceEmbeddingColumn(
-      fc.embedding_column(
-          categorical_column,
-          dimension=dimension,
-          initializer=initializer,
-          ckpt_to_load_from=ckpt_to_load_from,
-          tensor_name_in_ckpt=tensor_name_in_ckpt,
-          max_norm=max_norm,
-          trainable=trainable))
-
-
-def sequence_numeric_column(
-    key,
-    shape=(1,),
-    default_value=0.,
-    dtype=dtypes.float32):
-  # TODO(b/73160931): Add validations.
-  return _SequenceNumericColumn(
-      key,
-      shape=shape,
-      default_value=default_value,
-      dtype=dtype)
-
-
-class _SequenceDenseColumn(fc._FeatureColumn):
-  """Represents dense sequence data."""
-
-  __metaclass__ = abc.ABCMeta
-
-  TensorSequenceLengthPair = collections.namedtuple(  # pylint: disable=invalid-name
-      'TensorSequenceLengthPair', ['dense_tensor', 'sequence_length'])
-
-  @abc.abstractproperty
-  def _variable_shape(self):
-    """`TensorShape` without batch and sequence dimensions."""
-    pass
-
-  @abc.abstractmethod
-  def _get_sequence_dense_tensor(
-      self, inputs, weight_collections=None, trainable=None):
-    """Returns a `TensorSequenceLengthPair`."""
-    pass
-
-
-def _sequence_length_from_sparse_tensor(sp_tensor, num_elements=1):
-  with ops.name_scope(None, 'sequence_length') as name_scope:
-    row_ids = sp_tensor.indices[:, 0]
-    column_ids = sp_tensor.indices[:, 1]
-    column_ids += array_ops.ones_like(column_ids)
-    seq_length = (
-        math_ops.segment_max(column_ids, segment_ids=row_ids) / num_elements)
-    # If the last n rows do not have ids, seq_length will have shape
-    # [batch_size - n]. Pad the remaining values with zeros.
-    n_pad = array_ops.shape(sp_tensor)[:1] - array_ops.shape(seq_length)[:1]
-    padding = array_ops.zeros(n_pad, dtype=seq_length.dtype)
-    return array_ops.concat([seq_length, padding], axis=0, name=name_scope)
-
-
-class _SequenceCategoricalColumn(
-    fc._CategoricalColumn,
-    collections.namedtuple(
-        '_SequenceCategoricalColumn', ['categorical_column'])):
-
-  @property
-  def name(self):
-    return self.categorical_column.name
-
-  @property
-  def _parse_example_spec(self):
-    return self.categorical_column._parse_example_spec
-
-  def _transform_feature(self, inputs):
-    return self.categorical_column._transform_feature(inputs)
-
-  @property
-  def _num_buckets(self):
-    return self.categorical_column._num_buckets
-
-  def _get_sparse_tensors(self, inputs, weight_collections=None,
-                          trainable=None):
-    sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)
-    id_tensor = sparse_tensors.id_tensor
-    weight_tensor = sparse_tensors.weight_tensor
-    # Expands final dimension, so that embeddings are not combined during
-    # embedding lookup.
-    check_id_rank = check_ops.assert_equal(
-        array_ops.rank(id_tensor), 2,
-        data=[
-            'Column {} expected ID tensor of rank 2. '.format(self.name),
-            'id_tensor shape: ', array_ops.shape(id_tensor)])
-    with ops.control_dependencies([check_id_rank]):
-      id_tensor = sparse_ops.sparse_reshape(
-          id_tensor,
-          shape=array_ops.concat([id_tensor.dense_shape, [1]], axis=0))
-    if weight_tensor is not None:
-      check_weight_rank = check_ops.assert_equal(
-          array_ops.rank(weight_tensor), 2,
-          data=[
-              'Column {} expected weight tensor of rank 2.'.format(self.name),
-              'weight_tensor shape:', array_ops.shape(weight_tensor)])
-      with ops.control_dependencies([check_weight_rank]):
-        weight_tensor = sparse_ops.sparse_reshape(
-            weight_tensor,
-            shape=array_ops.concat([weight_tensor.dense_shape, [1]], axis=0))
-    return fc._CategoricalColumn.IdWeightPair(id_tensor, weight_tensor)
-
-  def _sequence_length(self, inputs):
-    sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)
-    return _sequence_length_from_sparse_tensor(sparse_tensors.id_tensor)
-
-
-class _SequenceEmbeddingColumn(
-    _SequenceDenseColumn,
-    collections.namedtuple('_SequenceEmbeddingColumn', ['embedding_column'])):
-
-  @property
-  def name(self):
-    return self.embedding_column.name
-
-  @property
-  def _parse_example_spec(self):
-    return self.embedding_column._parse_example_spec
-
-  def _transform_feature(self, inputs):
-    return self.embedding_column._transform_feature(inputs)
-
-  @property
-  def _variable_shape(self):
-    return self.embedding_column._variable_shape
-
-  def _get_sequence_dense_tensor(
-      self, inputs, weight_collections=None, trainable=None):
-    dense_tensor = self.embedding_column._get_dense_tensor(
-        inputs=inputs,
-        weight_collections=weight_collections,
-        trainable=trainable)
-    sequence_length = self.embedding_column.categorical_column._sequence_length(
-        inputs)
-    return _SequenceDenseColumn.TensorSequenceLengthPair(
-        dense_tensor=dense_tensor, sequence_length=sequence_length)
-
-
-class _SequenceNumericColumn(
-    _SequenceDenseColumn,
-    collections.namedtuple(
-        '_SequenceNumericColumn',
-        ['key', 'shape', 'default_value', 'dtype'])):
-
-  @property
-  def name(self):
-    return self.key
-
-  @property
-  def _parse_example_spec(self):
-    return {self.key: parsing_ops.VarLenFeature(self.dtype)}
-
-  def _transform_feature(self, inputs):
-    return inputs.get(self.key)
-
-  @property
-  def _variable_shape(self):
-    return tensor_shape.TensorShape(self.shape)
-
-  def _get_sequence_dense_tensor(
-      self, inputs, weight_collections=None, trainable=None):
-    # Do nothing with weight_collections and trainable since no variables are
-    # created in this function.
-    del weight_collections
-    del trainable
-    sp_tensor = inputs.get(self)
-    dense_tensor = sparse_ops.sparse_tensor_to_dense(
-        sp_tensor, default_value=self.default_value)
-    # Reshape into [batch_size, T, variable_shape].
-    dense_shape = array_ops.concat(
-        [array_ops.shape(dense_tensor)[:1], [-1], self._variable_shape],
-        axis=0)
-    dense_tensor = array_ops.reshape(dense_tensor, shape=dense_shape)
-    sequence_length = _sequence_length_from_sparse_tensor(
-        sp_tensor, num_elements=self._variable_shape.num_elements())
-    return _SequenceDenseColumn.TensorSequenceLengthPair(
-        dense_tensor=dense_tensor, sequence_length=sequence_length)
-
-# pylint: enable=g-doc-args,missing-docstring,protected-access
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequential_feature_column_test.py b/tensorflow/contrib/feature_column/python/feature_column/sequential_feature_column_test.py
deleted file mode 100644
index 59674869a27c3a40ab9cb3dcede384d1cda7ce27..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/feature_column/python/feature_column/sequential_feature_column_test.py
+++ /dev/null
@@ -1,471 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for sequential_feature_column."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.contrib.feature_column.python.feature_column import sequential_feature_column as sfc
-from tensorflow.python.feature_column.feature_column import _LazyBuilder
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.platform import test
-from tensorflow.python.training import monitored_session
-
-
-class SequenceInputLayerTest(test.TestCase):
-
-  def test_embedding_column(self):
-    vocabulary_size = 3
-    sparse_input_a = sparse_tensor.SparseTensorValue(
-        # example 0, ids [2]
-        # example 1, ids [0, 1]
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=(2, 0, 1),
-        dense_shape=(2, 2))
-    sparse_input_b = sparse_tensor.SparseTensorValue(
-        # example 0, ids [1]
-        # example 1, ids [2, 0]
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=(1, 2, 0),
-        dense_shape=(2, 2))
-
-    embedding_dimension_a = 2
-    embedding_values_a = (
-        (1., 2.),  # id 0
-        (3., 4.),  # id 1
-        (5., 6.)  # id 2
-    )
-    embedding_dimension_b = 3
-    embedding_values_b = (
-        (11., 12., 13.),  # id 0
-        (14., 15., 16.),  # id 1
-        (17., 18., 19.)  # id 2
-    )
-    def _get_initializer(embedding_dimension, embedding_values):
-      def _initializer(shape, dtype, partition_info):
-        self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
-        self.assertEqual(dtypes.float32, dtype)
-        self.assertIsNone(partition_info)
-        return embedding_values
-      return _initializer
-
-    expected_input_layer = [
-        # example 0, ids_a [2], ids_b [1]
-        [[5., 6., 14., 15., 16.], [0., 0., 0., 0., 0.]],
-        # example 1, ids_a [0, 1], ids_b [2, 0]
-        [[1., 2., 17., 18., 19.], [3., 4., 11., 12., 13.]],
-    ]
-    expected_sequence_length = [1, 2]
-
-    categorical_column_a = sfc.sequence_categorical_column_with_identity(
-        key='aaa', num_buckets=vocabulary_size)
-    embedding_column_a = sfc._sequence_embedding_column(
-        categorical_column_a, dimension=embedding_dimension_a,
-        initializer=_get_initializer(embedding_dimension_a, embedding_values_a))
-    categorical_column_b = sfc.sequence_categorical_column_with_identity(
-        key='bbb', num_buckets=vocabulary_size)
-    embedding_column_b = sfc._sequence_embedding_column(
-        categorical_column_b, dimension=embedding_dimension_b,
-        initializer=_get_initializer(embedding_dimension_b, embedding_values_b))
-
-    input_layer, sequence_length = sfc.sequence_input_layer(
-        features={
-            'aaa': sparse_input_a,
-            'bbb': sparse_input_b,
-        },
-        # Test that columns are reordered alphabetically.
-        feature_columns=[embedding_column_b, embedding_column_a])
-
-    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertItemsEqual(
-        ('sequence_input_layer/aaa_embedding/embedding_weights:0',
-         'sequence_input_layer/bbb_embedding/embedding_weights:0'),
-        tuple([v.name for v in global_vars]))
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(embedding_values_a, global_vars[0].eval(session=sess))
-      self.assertAllEqual(embedding_values_b, global_vars[1].eval(session=sess))
-      self.assertAllEqual(expected_input_layer, input_layer.eval(session=sess))
-      self.assertAllEqual(
-          expected_sequence_length, sequence_length.eval(session=sess))
-
-  def test_numeric_column(self):
-    sparse_input = sparse_tensor.SparseTensorValue(
-        # example 0, values [[0.], [1]]
-        # example 1, [[10.]]
-        indices=((0, 0), (0, 1), (1, 0)),
-        values=(0., 1., 10.),
-        dense_shape=(2, 2))
-    expected_input_layer = [
-        [[0.], [1.]],
-        [[10.], [0.]],
-    ]
-    expected_sequence_length = [2, 1]
-    numeric_column = sfc.sequence_numeric_column('aaa')
-
-    input_layer, sequence_length = sfc.sequence_input_layer(
-        features={'aaa': sparse_input},
-        feature_columns=[numeric_column])
-
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(expected_input_layer, input_layer.eval(session=sess))
-      self.assertAllEqual(
-          expected_sequence_length, sequence_length.eval(session=sess))
-
-  def test_numeric_column_multi_dim(self):
-    """Tests sequence_input_layer for multi-dimensional numeric_column."""
-    sparse_input = sparse_tensor.SparseTensorValue(
-        # example 0, values [[[0., 1.],  [2., 3.]], [[4., 5.],  [6., 7.]]]
-        # example 1, [[[10., 11.],  [12., 13.]]]
-        indices=((0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6), (0, 7),
-                 (1, 0), (1, 1), (1, 2), (1, 3)),
-        values=(0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.),
-        dense_shape=(2, 8))
-    # The output of numeric_column._get_dense_tensor should be flattened.
-    expected_input_layer = [
-        [[0., 1., 2., 3.], [4., 5., 6., 7.]],
-        [[10., 11., 12., 13.], [0., 0., 0., 0.]],
-    ]
-    expected_sequence_length = [2, 1]
-    numeric_column = sfc.sequence_numeric_column('aaa', shape=(2, 2))
-
-    input_layer, sequence_length = sfc.sequence_input_layer(
-        features={'aaa': sparse_input},
-        feature_columns=[numeric_column])
-
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(expected_input_layer, input_layer.eval(session=sess))
-      self.assertAllEqual(
-          expected_sequence_length, sequence_length.eval(session=sess))
-
-
-def _assert_sparse_tensor_value(test_case, expected, actual):
-  test_case.assertEqual(np.int64, np.array(actual.indices).dtype)
-  test_case.assertAllEqual(expected.indices, actual.indices)
-
-  test_case.assertEqual(
-      np.array(expected.values).dtype, np.array(actual.values).dtype)
-  test_case.assertAllEqual(expected.values, actual.values)
-
-  test_case.assertEqual(np.int64, np.array(actual.dense_shape).dtype)
-  test_case.assertAllEqual(expected.dense_shape, actual.dense_shape)
-
-
-class SequenceCategoricalColumnWithIdentityTest(test.TestCase):
-
-  def test_get_sparse_tensors(self):
-    column = sfc.sequence_categorical_column_with_identity(
-        'aaa', num_buckets=3)
-    inputs = sparse_tensor.SparseTensorValue(
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=(1, 2, 0),
-        dense_shape=(2, 2))
-    expected_sparse_ids = sparse_tensor.SparseTensorValue(
-        indices=((0, 0, 0), (1, 0, 0), (1, 1, 0)),
-        values=np.array((1, 2, 0), dtype=np.int64),
-        dense_shape=(2, 2, 1))
-
-    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
-
-    self.assertIsNone(id_weight_pair.weight_tensor)
-    with monitored_session.MonitoredSession() as sess:
-      _assert_sparse_tensor_value(
-          self,
-          expected_sparse_ids,
-          id_weight_pair.id_tensor.eval(session=sess))
-
-  def test_get_sparse_tensors_inputs3d(self):
-    """Tests _get_sparse_tensors when the input is already 3D Tensor."""
-    column = sfc.sequence_categorical_column_with_identity(
-        'aaa', num_buckets=3)
-    inputs = sparse_tensor.SparseTensorValue(
-        indices=((0, 0, 0), (1, 0, 0), (1, 1, 0)),
-        values=(1, 2, 0),
-        dense_shape=(2, 2, 1))
-
-    with self.assertRaisesRegexp(
-        errors.InvalidArgumentError,
-        r'Column aaa expected ID tensor of rank 2\.\s*'
-        r'id_tensor shape:\s*\[2 2 1\]'):
-      id_weight_pair = column._get_sparse_tensors(
-          _LazyBuilder({'aaa': inputs}))
-      with monitored_session.MonitoredSession() as sess:
-        id_weight_pair.id_tensor.eval(session=sess)
-
-  def test_sequence_length(self):
-    column = sfc.sequence_categorical_column_with_identity(
-        'aaa', num_buckets=3)
-    inputs = sparse_tensor.SparseTensorValue(
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=(1, 2, 0),
-        dense_shape=(2, 2))
-    expected_sequence_length = [1, 2]
-
-    sequence_length = column._sequence_length(_LazyBuilder({'aaa': inputs}))
-
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(
-          expected_sequence_length, sequence_length.eval(session=sess))
-
-  def test_sequence_length_with_zeros(self):
-    column = sfc.sequence_categorical_column_with_identity(
-        'aaa', num_buckets=3)
-    inputs = sparse_tensor.SparseTensorValue(
-        indices=((1, 0), (3, 0), (3, 1)),
-        values=(1, 2, 0),
-        dense_shape=(5, 2))
-    expected_sequence_length = [0, 1, 0, 2, 0]
-
-    sequence_length = column._sequence_length(_LazyBuilder({'aaa': inputs}))
-
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(
-          expected_sequence_length, sequence_length.eval(session=sess))
-
-
-class SequenceEmbeddingColumnTest(test.TestCase):
-
-  def test_get_sequence_dense_tensor(self):
-    vocabulary_size = 3
-    sparse_input = sparse_tensor.SparseTensorValue(
-        # example 0, ids [2]
-        # example 1, ids [0, 1]
-        # example 2, ids []
-        # example 3, ids [1]
-        indices=((0, 0), (1, 0), (1, 1), (3, 0)),
-        values=(2, 0, 1, 1),
-        dense_shape=(4, 2))
-
-    embedding_dimension = 2
-    embedding_values = (
-        (1., 2.),  # id 0
-        (3., 5.),  # id 1
-        (7., 11.)  # id 2
-    )
-    def _initializer(shape, dtype, partition_info):
-      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
-      self.assertEqual(dtypes.float32, dtype)
-      self.assertIsNone(partition_info)
-      return embedding_values
-
-    expected_lookups = [
-        # example 0, ids [2]
-        [[7., 11.], [0., 0.]],
-        # example 1, ids [0, 1]
-        [[1., 2.], [3., 5.]],
-        # example 2, ids []
-        [[0., 0.], [0., 0.]],
-        # example 3, ids [1]
-        [[3., 5.], [0., 0.]],
-    ]
-
-    categorical_column = sfc.sequence_categorical_column_with_identity(
-        key='aaa', num_buckets=vocabulary_size)
-    embedding_column = sfc._sequence_embedding_column(
-        categorical_column, dimension=embedding_dimension,
-        initializer=_initializer)
-
-    embedding_lookup, _ = embedding_column._get_sequence_dense_tensor(
-        _LazyBuilder({'aaa': sparse_input}))
-
-    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertItemsEqual(
-        ('embedding_weights:0',), tuple([v.name for v in global_vars]))
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(embedding_values, global_vars[0].eval(session=sess))
-      self.assertAllEqual(expected_lookups, embedding_lookup.eval(session=sess))
-
-  def test_sequence_length(self):
-    vocabulary_size = 3
-    sparse_input = sparse_tensor.SparseTensorValue(
-        # example 0, ids [2]
-        # example 1, ids [0, 1]
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=(2, 0, 1),
-        dense_shape=(2, 2))
-    expected_sequence_length = [1, 2]
-
-    categorical_column = sfc.sequence_categorical_column_with_identity(
-        key='aaa', num_buckets=vocabulary_size)
-    embedding_column = sfc._sequence_embedding_column(
-        categorical_column, dimension=2)
-
-    _, sequence_length = embedding_column._get_sequence_dense_tensor(
-        _LazyBuilder({'aaa': sparse_input}))
-
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(
-          expected_sequence_length, sequence_length.eval(session=sess))
-
-  def test_sequence_length_with_empty_rows(self):
-    """Tests _sequence_length when some examples do not have ids."""
-    vocabulary_size = 3
-    sparse_input = sparse_tensor.SparseTensorValue(
-        # example 0, ids []
-        # example 1, ids [2]
-        # example 2, ids [0, 1]
-        # example 3, ids []
-        # example 4, ids [1]
-        # example 5, ids []
-        indices=((1, 0), (2, 0), (2, 1), (4, 0)),
-        values=(2, 0, 1, 1),
-        dense_shape=(6, 2))
-    expected_sequence_length = [0, 1, 2, 0, 1, 0]
-
-    categorical_column = sfc.sequence_categorical_column_with_identity(
-        key='aaa', num_buckets=vocabulary_size)
-    embedding_column = sfc._sequence_embedding_column(
-        categorical_column, dimension=2)
-
-    _, sequence_length = embedding_column._get_sequence_dense_tensor(
-        _LazyBuilder({'aaa': sparse_input}))
-
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(
-          expected_sequence_length, sequence_length.eval(session=sess))
-
-
-class SequenceNumericColumnTest(test.TestCase):
-
-  def test_get_sequence_dense_tensor(self):
-    sparse_input = sparse_tensor.SparseTensorValue(
-        # example 0, values [[0.], [1]]
-        # example 1, [[10.]]
-        indices=((0, 0), (0, 1), (1, 0)),
-        values=(0., 1., 10.),
-        dense_shape=(2, 2))
-    expected_dense_tensor = [
-        [[0.], [1.]],
-        [[10.], [0.]],
-    ]
-    numeric_column = sfc.sequence_numeric_column('aaa')
-
-    dense_tensor, _ = numeric_column._get_sequence_dense_tensor(
-        _LazyBuilder({'aaa': sparse_input}))
-
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(
-          expected_dense_tensor, dense_tensor.eval(session=sess))
-
-  def test_get_sequence_dense_tensor_with_shape(self):
-    """Tests get_sequence_dense_tensor with shape !=(1,)."""
-    sparse_input = sparse_tensor.SparseTensorValue(
-        # example 0, values [[0., 1., 2.], [3., 4., 5.]]
-        # example 1, [[10., 11., 12.]]
-        indices=((0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5),
-                 (1, 0), (1, 1), (1, 2)),
-        values=(0., 1., 2., 3., 4., 5., 10., 11., 12.),
-        dense_shape=(2, 6))
-    expected_dense_tensor = [
-        [[0., 1., 2.], [3., 4., 5.]],
-        [[10., 11., 12.], [0., 0., 0.]],
-    ]
-    numeric_column = sfc.sequence_numeric_column('aaa', shape=(3,))
-
-    dense_tensor, _ = numeric_column._get_sequence_dense_tensor(
-        _LazyBuilder({'aaa': sparse_input}))
-
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(
-          expected_dense_tensor, dense_tensor.eval(session=sess))
-
-  def test_get_dense_tensor_multi_dim(self):
-    """Tests get_sequence_dense_tensor for multi-dim numeric_column."""
-    sparse_input = sparse_tensor.SparseTensorValue(
-        # example 0, values [[[0., 1.],  [2., 3.]], [[4., 5.],  [6., 7.]]]
-        # example 1, [[[10., 11.],  [12., 13.]]]
-        indices=((0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6), (0, 7),
-                 (1, 0), (1, 1), (1, 2), (1, 3)),
-        values=(0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.),
-        dense_shape=(2, 8))
-    expected_dense_tensor = [
-        [[[0., 1.], [2., 3.]], [[4., 5.], [6., 7.]]],
-        [[[10., 11.], [12., 13.]], [[0., 0.], [0., 0.]]],
-    ]
-    numeric_column = sfc.sequence_numeric_column('aaa', shape=(2, 2))
-
-    dense_tensor, _ = numeric_column._get_sequence_dense_tensor(
-        _LazyBuilder({'aaa': sparse_input}))
-
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(
-          expected_dense_tensor, dense_tensor.eval(session=sess))
-
-  def test_sequence_length(self):
-    sparse_input = sparse_tensor.SparseTensorValue(
-        # example 0, values [[0., 1., 2.], [3., 4., 5.]]
-        # example 1, [[10., 11., 12.]]
-        indices=((0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5),
-                 (1, 0), (1, 1), (1, 2)),
-        values=(0., 1., 2., 3., 4., 5., 10., 11., 12.),
-        dense_shape=(2, 6))
-    expected_sequence_length = [2, 1]
-    numeric_column = sfc.sequence_numeric_column('aaa', shape=(3,))
-
-    _, sequence_length = numeric_column._get_sequence_dense_tensor(
-        _LazyBuilder({'aaa': sparse_input}))
-
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(
-          expected_sequence_length, sequence_length.eval(session=sess))
-
-  def test_sequence_length_with_shape(self):
-    """Tests _sequence_length with shape !=(1,)."""
-    sparse_input = sparse_tensor.SparseTensorValue(
-        # example 0, values [[0.], [1]]
-        # example 1, [[10.]]
-        indices=((0, 0), (0, 1), (1, 0)),
-        values=(0., 1., 10.),
-        dense_shape=(2, 2))
-    expected_sequence_length = [2, 1]
-    numeric_column = sfc.sequence_numeric_column('aaa')
-
-    _, sequence_length = numeric_column._get_sequence_dense_tensor(
-        _LazyBuilder({'aaa': sparse_input}))
-
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(
-          expected_sequence_length, sequence_length.eval(session=sess))
-
-  def test_sequence_length_with_empty_rows(self):
-    """Tests _sequence_length when some examples do not have ids."""
-    sparse_input = sparse_tensor.SparseTensorValue(
-        # example 0, values []
-        # example 1, values [[0.], [1.]]
-        # example 2, [[2.]]
-        # example 3, values []
-        # example 4, [[3.]]
-        # example 5, values []
-        indices=((1, 0), (1, 1), (2, 0), (4, 0)),
-        values=(0., 1., 2., 3.),
-        dense_shape=(6, 2))
-    expected_sequence_length = [0, 2, 1, 0, 1, 0]
-    numeric_column = sfc.sequence_numeric_column('aaa')
-
-    _, sequence_length = numeric_column._get_sequence_dense_tensor(
-        _LazyBuilder({'aaa': sparse_input}))
-
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(
-          expected_sequence_length, sequence_length.eval(session=sess))
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/ffmpeg/BUILD b/tensorflow/contrib/ffmpeg/BUILD
index eccce99071dc1477cf4f3bb152f3304b3b0fc35a..f7b3273a4d35eadb9fad49399b7bf18d4bd33503 100644
--- a/tensorflow/contrib/ffmpeg/BUILD
+++ b/tensorflow/contrib/ffmpeg/BUILD
@@ -180,15 +180,3 @@ py_library(
         "//tensorflow/python:util",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/ffmpeg/default/BUILD b/tensorflow/contrib/ffmpeg/default/BUILD
index 6b455567d766dbe6d380a498bd7f521db27e077b..59bad8982dd163f89f37e1a0a9d5017d0c495de3 100644
--- a/tensorflow/contrib/ffmpeg/default/BUILD
+++ b/tensorflow/contrib/ffmpeg/default/BUILD
@@ -74,15 +74,3 @@ tf_cc_test(
         "//tensorflow/core:test",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/framework/BUILD b/tensorflow/contrib/framework/BUILD
index ac043fda0638e61f422e769ab3047a53a1b377bd..b1c8ad49eaf8d2400e431fcf4820fca6e0314557 100644
--- a/tensorflow/contrib/framework/BUILD
+++ b/tensorflow/contrib/framework/BUILD
@@ -321,15 +321,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/framework/__init__.py b/tensorflow/contrib/framework/__init__.py
index 3398b3fd1c1036091bfadf548f7d44dbf9eb1046..cbb68bd3eb257f9472515e5c29ce4f02057be321 100644
--- a/tensorflow/contrib/framework/__init__.py
+++ b/tensorflow/contrib/framework/__init__.py
@@ -83,6 +83,7 @@ See the @{$python/contrib.framework} guide.
 @@load_linear_multiclass_bias_initializer
 @@load_variable_slot_initializer
 
+@@argsort
 @@py_func
 @@sort
 
diff --git a/tensorflow/contrib/framework/python/ops/arg_scope.py b/tensorflow/contrib/framework/python/ops/arg_scope.py
index 3cad1fee1984042e3a9ab91a0af70cbaca25cece..5b150339953f961c756c0909dd1795341159b9cd 100644
--- a/tensorflow/contrib/framework/python/ops/arg_scope.py
+++ b/tensorflow/contrib/framework/python/ops/arg_scope.py
@@ -68,7 +68,7 @@ from tensorflow.python.util import tf_decorator
 
 __all__ = [
     'arg_scope', 'add_arg_scope', 'current_arg_scope', 'has_arg_scope',
-    'arg_scoped_arguments'
+    'arg_scoped_arguments', 'arg_scope_func_key'
 ]
 
 _ARGSTACK = [{}]
@@ -89,7 +89,7 @@ def current_arg_scope():
   return stack[-1]
 
 
-def _key_op(op):
+def arg_scope_func_key(op):
   return getattr(op, '_key_op', str(op))
 
 
@@ -103,9 +103,9 @@ def _kwarg_names(func):
 
 
 def _add_op(op):
-  key_op = _key_op(op)
-  if key_op not in _DECORATED_OPS:
-    _DECORATED_OPS[key_op] = _kwarg_names(op)
+  key = arg_scope_func_key(op)
+  if key not in _DECORATED_OPS:
+    _DECORATED_OPS[key] = _kwarg_names(op)
 
 
 @tf_contextlib.contextmanager
@@ -147,16 +147,16 @@ def arg_scope(list_ops_or_scope, **kwargs):
     try:
       current_scope = current_arg_scope().copy()
       for op in list_ops_or_scope:
-        key_op = _key_op(op)
+        key = arg_scope_func_key(op)
         if not has_arg_scope(op):
           raise ValueError('%s is not decorated with @add_arg_scope',
                            _name_op(op))
-        if key_op in current_scope:
-          current_kwargs = current_scope[key_op].copy()
+        if key in current_scope:
+          current_kwargs = current_scope[key].copy()
           current_kwargs.update(kwargs)
-          current_scope[key_op] = current_kwargs
+          current_scope[key] = current_kwargs
         else:
-          current_scope[key_op] = kwargs.copy()
+          current_scope[key] = kwargs.copy()
       _get_arg_stack().append(current_scope)
       yield current_scope
     finally:
@@ -176,14 +176,14 @@ def add_arg_scope(func):
   def func_with_args(*args, **kwargs):
     current_scope = current_arg_scope()
     current_args = kwargs
-    key_func = _key_op(func)
+    key_func = arg_scope_func_key(func)
     if key_func in current_scope:
       current_args = current_scope[key_func].copy()
       current_args.update(kwargs)
     return func(*args, **current_args)
 
   _add_op(func)
-  setattr(func_with_args, '_key_op', _key_op(func))
+  setattr(func_with_args, '_key_op', arg_scope_func_key(func))
   return tf_decorator.make_decorator(func, func_with_args)
 
 
@@ -196,7 +196,7 @@ def has_arg_scope(func):
   Returns:
     a boolean.
   """
-  return _key_op(func) in _DECORATED_OPS
+  return arg_scope_func_key(func) in _DECORATED_OPS
 
 
 def arg_scoped_arguments(func):
@@ -209,4 +209,4 @@ def arg_scoped_arguments(func):
     a list of kwargs names.
   """
   assert has_arg_scope(func)
-  return _DECORATED_OPS[_key_op(func)]
+  return _DECORATED_OPS[arg_scope_func_key(func)]
diff --git a/tensorflow/contrib/framework/python/ops/arg_scope_test.py b/tensorflow/contrib/framework/python/ops/arg_scope_test.py
index 7ba9d4ffa90f6860629b15a2ea91e0c573bf6368..4c3879d4fc08b53ea8be5f1256a830a64fb39af6 100644
--- a/tensorflow/contrib/framework/python/ops/arg_scope_test.py
+++ b/tensorflow/contrib/framework/python/ops/arg_scope_test.py
@@ -170,6 +170,30 @@ class ArgScopeTest(test.TestCase):
         self.assertTupleEqual(args, func1_args)
         self.assertDictEqual(kwargs, func1_kwargs)
 
+  def testNestedArgScopeObjectCreatedOutsideScopeOverridesArgScope(self):
+
+    def get_scope_object():
+      with arg_scope([func1], a=1, b=None, c=[1]) as sc:
+        return sc
+
+    scope_object = get_scope_object()
+    with arg_scope([func1], b=2, d=10):
+      with arg_scope(scope_object):
+        args, kwargs = func1(0)
+        self.assertTupleEqual(args, (0,))
+        self.assertDictEqual(kwargs, {'a': 1, 'b': None, 'c': [1]})
+
+  def testArgScopeObjectCreatedWithinScopeInheritsArgScope(self):
+    def get_scope_object():
+      with arg_scope([func1], a=1, b=None, c=[1]) as sc:
+        return sc
+
+    with arg_scope([func1], b=2, d=10):
+      with arg_scope(get_scope_object()):
+        args, kwargs = func1(0)
+        self.assertTupleEqual(args, (0,))
+        self.assertDictEqual(kwargs, {'a': 1, 'b': None, 'c': [1], 'd': 10})
+
   def testSharedArgScope(self):
     func1_args = (0,)
     func1_kwargs = {'a': 1, 'b': None, 'c': [1]}
diff --git a/tensorflow/contrib/framework/python/ops/critical_section_ops.py b/tensorflow/contrib/framework/python/ops/critical_section_ops.py
index cc19372acf956371c2d029c7b8eb5534c3789413..bd764ed57a6da0a4d356235108e998a80ac34362 100644
--- a/tensorflow/contrib/framework/python/ops/critical_section_ops.py
+++ b/tensorflow/contrib/framework/python/ops/critical_section_ops.py
@@ -24,10 +24,8 @@ import collections
 # from tensorflow.core.protobuf import critical_section_pb2
 
 from tensorflow.python.eager import context
-from tensorflow.python.eager import function
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_resource_variable_ops
@@ -48,6 +46,26 @@ class _ExecutionSignature(
   pass
 
 
+def _identity(x):
+  """Identity op that recognizes `TensorArray`, `Operation`, and `Tensor`."""
+  if isinstance(x, tensor_array_ops.TensorArray):
+    return x.identity()
+  elif isinstance(x, ops.Operation):
+    return control_flow_ops.group(x)
+  elif context.executing_eagerly() and x is None:
+    return None
+  else:
+    return array_ops.identity(x)
+
+
+def _get_colocation(op):
+  """Get colocation symbol from op, if any."""
+  try:
+    return op.get_attr("_class")
+  except ValueError:
+    return None
+
+
 class CriticalSection(object):
   """Critical section.
 
@@ -180,8 +198,8 @@ class CriticalSection(object):
       The tensors returned from `fn(*args, **kwargs)`.
 
     Raises:
-      ValueError: If `fn` attempts to use this `CriticalSection` in any nested
-        way.
+      ValueError: If `fn` attempts to lock this `CriticalSection` in any nested
+        or lazy way that may cause a deadlock.
       ValueError: If `exclusive_resource_access` is not provided (is `True`) and
         another `CriticalSection` has an execution requesting the same
         resources as in `*args`, `**kwargs`, and any additionaly captured
@@ -193,69 +211,52 @@ class CriticalSection(object):
     exclusive_resource_access = kwargs.pop("exclusive_resource_access", True)
 
     with ops.name_scope(name, "critical_section_execute", []):
-      lock = gen_resource_variable_ops.mutex_lock(self._handle)
-
-      with ops.control_dependencies([lock]):
-        c_known_ops = set()
-        c_captured_tensors = set()
 
-        def add_op_internal(op):
-          c_known_ops.add(op)
-          for i in op.inputs:
-            if i.op not in c_known_ops:
-              c_captured_tensors.add(i)
+      # Ensure that mutex locking only happens *after* all args and
+      # kwargs have been executed.  This avoids certain types of deadlocks.
+      lock = gen_resource_variable_ops.mutex_lock(self._handle)
 
-        c = function.HelperContext(add_op_internal)
-        with c:
+      if not context.executing_eagerly():
+        # NOTE(ebrevdo): This is to ensure we don't pick up spurious
+        # Operations created by other threads.
+        with ops.get_default_graph()._lock:  # pylint: disable=protected-access
+          existing_ops = ops.get_default_graph().get_operations()
+          with ops.control_dependencies([lock]):
+            r = fn(*args, **kwargs)
+          # TODO(ebrevdo): If creating critical sections in a python loop, this
+          # makes graph creation time quadratic.  Revisit if this
+          # becomes a problem.
+          created_ops = (set(ops.get_default_graph().get_operations())
+                         .difference(existing_ops))
+      else:
+        with ops.control_dependencies([lock]):
           r = fn(*args, **kwargs)
 
-        resource_inputs = set([
-            x for x in
-            list(nest.flatten(args)) + nest.flatten(kwargs.values()) +
-            list(c_captured_tensors)
-            if tensor_util.is_tensor(x) and x.dtype == dtypes.resource])
-
-      if self._handle in resource_inputs:
-        raise ValueError("The function fn attempts to access the "
-                         "CriticalSection in which it would be running.  "
-                         "This is illegal and would cause deadlocks.  "
-                         "CriticalSection: %s." % self._handle)
-
       if not context.executing_eagerly():
-        # Collections and op introspection does not work in eager
-        # mode.  This is generally ok; since eager mode (as of
-        # writing) executes sequentially anyway.
-        for sg in ops.get_collection(CRITICAL_SECTION_EXECUTIONS):
-          sg_handle_name = ops.convert_to_tensor(sg.handle).name
-          self_handle_name = ops.convert_to_tensor(self._handle).name
-          if sg_handle_name == self_handle_name:
-            # Other executions in the same critical section are allowed.
-            continue
-          if not (exclusive_resource_access or sg.exclusive_resource_access):
-            # Neither execution requested exclusive access.
-            continue
-          resource_intersection = resource_inputs.intersection(sg.resources)
-          if resource_intersection:
-            raise ValueError(
-                "This execution would access resources: %s.  Either this "
-                "lock (CriticalSection: %s) or lock '%s' "
-                "(CriticalSection: %s) requested exclusive resource access "
-                "of this resource.  Did you mean to call execute with keyword "
-                "argument exclusive_resource_access=False?" %
-                (list(resource_intersection), self._handle.name,
-                 sg.op.name, sg.handle.name))
-
-      def identity(x):  # pylint: disable=invalid-name
-        if isinstance(x, tensor_array_ops.TensorArray):
-          return x.identity()
-        elif isinstance(x, ops.Operation):
-          return control_flow_ops.group(x)
-        elif context.executing_eagerly() and x is None:
-          return None
-        else:
-          return array_ops.identity(x)
-
-      r_flat = [identity(x) for x in nest.flatten(r)]
+        self._add_control_dependencies_to_lock(created_ops, lock.op)
+
+        # captured_resources is a list of resources that are directly
+        # accessed only by ops created during fn(), not by any
+        # ancestors of those ops in the graph.
+        captured_resources = set([
+            input_ for op in created_ops
+            for input_ in op.inputs
+            if input_.dtype == dtypes.resource
+        ])
+
+        # NOTE(ebrevdo): The only time self._is_self_handle() is True
+        # in this call is if one of the recently created ops, within
+        # the execute(), themselves attempt to access the
+        # CriticalSection.  This will cause a deadlock.
+        if any(self._is_self_handle(x) for x in captured_resources):
+          raise ValueError("The function fn attempts to directly access the "
+                           "CriticalSection in which it would be running.  "
+                           "This is illegal and would cause deadlocks.")
+
+        self._check_multiple_access_to_resources(
+            captured_resources, exclusive_resource_access)
+
+      r_flat = [_identity(x) for x in nest.flatten(r)]
 
       with ops.control_dependencies(r_flat):
         # The identity must run on the same machine as self._handle
@@ -268,23 +269,105 @@ class CriticalSection(object):
 
         # Make sure that if any element of r is accessed, all of
         # them are executed together.
-        r = nest.pack_sequence_as(
-            r, control_flow_ops.tuple(nest.flatten(r)))
+        r = nest.pack_sequence_as(r, control_flow_ops.tuple(nest.flatten(r)))
 
       with ops.control_dependencies([ensure_lock_exists]):
-        outputs = nest.map_structure(identity, r)
+        outputs = nest.map_structure(_identity, r)
 
       if not context.executing_eagerly():
         signature = _ExecutionSignature(
             op=lock.op,
             handle=self._handle,
-            resources=list(resource_inputs),
+            resources=list(captured_resources),
             exclusive_resource_access=exclusive_resource_access)
         ops.add_to_collections(
             CRITICAL_SECTION_EXECUTIONS, signature)
 
       return outputs
 
+  def _add_control_dependencies_to_lock(self, created_ops, lock_op):
+    """To avoid deadlocks, all args must be executed before lock_op."""
+    # Get all arguments (explicit and captured) of all ops created by fn().
+    all_args = set([input_.op for op in created_ops for input_ in op.inputs])
+    all_args.update(
+        input_op for op in created_ops for input_op in op.control_inputs)
+    # Unfortunately, we can't use sets throughout because TF seems to
+    # create new Operation objects for the same op sometimes; and we
+    # can't rely on id(op).
+
+    # pylint: disable=protected-access
+    all_args_dict = dict((op._id, op) for op in all_args)
+
+    # Remove ops created within fn, or that lock_op already has a
+    # control dependency on.  Also remove a possible self-loop.
+    for op in created_ops:
+      all_args_dict.pop(op._id, None)
+    for op in lock_op.control_inputs:
+      all_args_dict.pop(op._id, None)
+    for input_ in lock_op.inputs:
+      all_args_dict.pop(input_.op._id, None)
+    all_args_dict.pop(lock_op._id, None)
+
+    all_args = all_args_dict.values()
+
+    if not all_args:
+      # No control dependencies to add; return early.
+      return
+
+    # This group is important: it ensures that any ops in all_args
+    # outside the control context of the lock_op (and this fn, which
+    # runs in the same context) are added to this context before
+    # being added to the control dependencies of lock_op.
+    all_args = control_flow_ops.group(*all_args)
+
+    lock_op._add_control_input(all_args)
+    # pylint: enable=protected-access
+
+  def _is_self_handle(self, x):
+    """Check if the tensor `x` is the same Mutex as `self._handle`."""
+    return (x.op.type == "MutexV2"
+            # blank shared_name means the op will create a unique one.
+            and x.op.get_attr("shared_name")
+            and (x.op.get_attr("shared_name") ==
+                 self._handle.op.get_attr("shared_name"))
+            and (x.op.device == self._handle.op.device
+                 or _get_colocation(x.op) == _get_colocation(self._handle.op)))
+
+  def _check_multiple_access_to_resources(
+      self, captured_resources, exclusive_resource_access):
+    """Raise if captured_resources are accessed by another CriticalSection.
+
+    Args:
+      captured_resources: Set of tensors of type resource.
+      exclusive_resource_access: Whether this execution requires exclusive
+        resource access.
+
+    Raises:
+      ValueError: If any tensors in `captured_resources` are also accessed
+        by another `CriticalSection`, and at least one of them requires
+        exclusive resource access.
+    """
+    # Collections and op introspection does not work in eager
+    # mode.  This is generally ok; since eager mode (as of
+    # writing) executes sequentially anyway.
+    for sg in ops.get_collection(CRITICAL_SECTION_EXECUTIONS):
+      if self._is_self_handle(sg.handle):
+        # Other executions in the same critical section are allowed.
+        continue
+      if not (exclusive_resource_access or sg.exclusive_resource_access):
+        # Neither execution requested exclusive access.
+        continue
+      resource_intersection = captured_resources.intersection(sg.resources)
+      if resource_intersection:
+        raise ValueError(
+            "This execution would access resources: %s.  Either this "
+            "lock (CriticalSection: %s) or lock '%s' "
+            "(CriticalSection: %s) requested exclusive resource access "
+            "of this resource.  Did you mean to call execute with keyword "
+            "argument exclusive_resource_access=False?" %
+            (list(resource_intersection), self._handle.name,
+             sg.op.name, sg.handle.name))
+
   # TODO(ebrevdo): Re-enable once CriticalSection is in core.
 
   # def to_proto(self, export_scope=None):
diff --git a/tensorflow/contrib/framework/python/ops/critical_section_test.py b/tensorflow/contrib/framework/python/ops/critical_section_test.py
index c916592ce1979fe3a79cf28ad4bdac44284cce97..ba660295cb3c97d26da7bf892c78bceee53cf2d4 100644
--- a/tensorflow/contrib/framework/python/ops/critical_section_test.py
+++ b/tensorflow/contrib/framework/python/ops/critical_section_test.py
@@ -25,6 +25,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging as logging
 # TODO(ebrevdo): Re-enable once CriticalSection is in core.
 # from tensorflow.python.training import saver as saver_lib
 
@@ -37,7 +38,7 @@ class CriticalSectionTest(test.TestCase):
     v = resource_variable_ops.ResourceVariable(0.0, name="v")
 
     def fn(a, b):
-      c = v.read_value()
+      c = v.value()
       with ops.control_dependencies([c]):
         nv = v.assign_add(a * b)
         with ops.control_dependencies([nv]):
@@ -140,15 +141,151 @@ class CriticalSectionTest(test.TestCase):
          ops.get_collection(critical_section_ops.CRITICAL_SECTION_EXECUTIONS)])
 
   def testRecursiveCriticalSectionAccessIsIllegal(self):
+    # This does not work properly in eager mode.  Eager users will
+    # just hit a deadlock if they do this.  But at least it'll be easier
+    # to debug.
+    cs = critical_section_ops.CriticalSection()
+    def fn(x):
+      return cs.execute(lambda y: y + 1, x)
+    with self.assertRaisesRegexp(
+        ValueError,
+        r"attempts to directly access the CriticalSection in which it "
+        r"would be running"):
+      cs.execute(fn, 1.0)
+
+  def testRecursiveCriticalSectionAccessViaCapturedTensorIsProtected(self):
+    # This one is subtle; and we're being overly cautious here.  The
+    # deadlock we are ensuring we catch is:
+    #
+    # to_capture = CS[lambda x: x + 1](1.0)
+    # deadlocked = CS[lambda x: x + to_capture](1.0)
+    #
+    # This would have caused a deadlock because executing `deadlocked` will
+    # lock the mutex on CS; but then due to dependencies, will attempt
+    # to compute `to_capture`.  This computation requires locking CS,
+    # but that is not possible now because CS is already locked by
+    # `deadlocked`.
+    #
+    # We check that CriticalSection.execute properly inserts new
+    # control dependencies to its lock to ensure all captured
+    # operations are finished before anything runs within the critical section.
+    cs = critical_section_ops.CriticalSection(shared_name="cs")
+    fn = array_ops.identity
+    to_capture = cs.execute(fn, 1.0)
+    fn_captures = lambda x: x + to_capture
+    to_capture_too = array_ops.identity(to_capture)
+
+    ex_0 = cs.execute(fn_captures, 1.0)
+
+    with ops.control_dependencies([to_capture]):
+      # This is OK because to_capture will execute before this next call
+      ex_1 = cs.execute(fn_captures, 1.0)
+
+    dependency = array_ops.identity(to_capture)
+
+    fn_captures_dependency = lambda x: x + dependency
+
+    ex_2 = cs.execute(fn_captures_dependency, 1.0)
+
+    with ops.control_dependencies([to_capture_too]):
+      ex_3 = cs.execute(fn_captures_dependency, 1.0)
+
+    # Ensure there's no actual deadlock on to_execute.
+    self.assertEquals(2.0, self.evaluate(ex_0))
+    self.assertEquals(2.0, self.evaluate(ex_1))
+    self.assertEquals(2.0, self.evaluate(ex_2))
+    self.assertEquals(2.0, self.evaluate(ex_3))
+
+  def testRecursiveCriticalSectionAccessWithinLoopIsProtected(self):
+    cs = critical_section_ops.CriticalSection(shared_name="cs")
+
+    def body_implicit_capture(i, j):
+      # This would have caused a deadlock if not for logic in execute
+      # that inserts additional control dependencies onto the lock op:
+      #   * Loop body argument j is captured by fn()
+      #   * i is running in parallel to move forward the execution
+      #   * j is not being checked by the predicate function
+      #   * output of cs.execute() is returned as next j.
+      fn = lambda: j + 1
+      return (i + 1, cs.execute(fn))
+
+    (i_n, j_n) = control_flow_ops.while_loop(
+        lambda i, _: i < 1000,
+        body_implicit_capture,
+        [0, 0],
+        parallel_iterations=25)
+    logging.warn(
+        "\n==============\nRunning "
+        "'testRecursiveCriticalSectionAccessWithinLoopDoesNotDeadlock "
+        "body_implicit_capture'\n"
+        "==============\n")
+    self.assertEquals((1000, 1000), self.evaluate((i_n, j_n)))
+    logging.warn(
+        "\n==============\nSuccessfully finished running "
+        "'testRecursiveCriticalSectionAccessWithinLoopDoesNotDeadlock "
+        "body_implicit_capture'\n"
+        "==============\n")
+
+    def body_implicit_capture_protected(i, j):
+      # This version is ok because we manually add a control
+      # dependency on j, which is an argument to the while_loop body
+      # and captured by fn.
+      fn = lambda: j + 1
+      with ops.control_dependencies([j]):
+        return (i + 1, cs.execute(fn))
+
+    (i_n, j_n) = control_flow_ops.while_loop(
+        lambda i, _: i < 1000,
+        body_implicit_capture_protected,
+        [0, 0],
+        parallel_iterations=25)
+    logging.warn(
+        "\n==============\nRunning "
+        "'testRecursiveCriticalSectionAccessWithinLoopDoesNotDeadlock "
+        "body_implicit_capture_protected'\n"
+        "==============\n")
+    self.assertEquals((1000, 1000), self.evaluate((i_n, j_n)))
+    logging.warn(
+        "\n==============\nSuccessfully finished running "
+        "'testRecursiveCriticalSectionAccessWithinLoopDoesNotDeadlock "
+        "body_implicit_capture_protected'\n"
+        "==============\n")
+
+    def body_args_capture(i, j):
+      # This version is ok because j is an argument to fn and we can
+      # ensure there's a control dependency on j.
+      fn = lambda x: x + 1
+      return (i + 1, cs.execute(fn, j))
+
+    (i_n, j_n) = control_flow_ops.while_loop(
+        lambda i, _: i < 1000,
+        body_args_capture,
+        [0, 0],
+        parallel_iterations=25)
+    logging.warn(
+        "\n==============\nRunning "
+        "'testRecursiveCriticalSectionAccessWithinLoopDoesNotDeadlock "
+        "body_args_capture'\n"
+        "==============\n")
+    self.assertEquals((1000, 1000), self.evaluate((i_n, j_n)))
+    logging.warn(
+        "\n==============\nSuccessfully finished running "
+        "'testRecursiveCriticalSectionAccessWithinLoopDoesNotDeadlock "
+        "body_args_capture'\n"
+        "==============\n")
+
+  def testRecursiveCriticalSectionAccessIsIllegalSameSharedName(self):
     # This does not work properly in eager mode.  Eager users will
     # just hit a deadlock if they do this.  But at least it'll be easier
     # to debug.
     cs = critical_section_ops.CriticalSection(shared_name="cs")
+    cs_same = critical_section_ops.CriticalSection(shared_name="cs")
     def fn(x):
-      return cs.execute(lambda x: x+1, x)
+      return cs_same.execute(lambda x: x+1, x)
     with self.assertRaisesRegexp(
         ValueError,
-        r"attempts to access the CriticalSection in which it would be running"):
+        r"attempts to directly access the CriticalSection in which it "
+        r"would be running"):
       cs.execute(fn, 1.0)
 
   def testMultipleCSExecutionsRequestSameResource(self):
@@ -179,6 +316,20 @@ class CriticalSectionTest(test.TestCase):
         ValueError, "requested exclusive resource access"):
       cs1.execute(lambda: v2 + 1)
 
+  def testControlDependencyFromOutsideWhileLoopMixedWithInsideLoop(self):
+    cs = critical_section_ops.CriticalSection()
+    v = resource_variable_ops.ResourceVariable(0, name="v")
+    # Make sure that the control dependencies on v do not cause issues
+    # in the lock_op's automatic control dependency adder.
+    #
+    # Note, here v must be a resource variable (or something similar),
+    # otherwise it gets hoisted into the while_loop by the time we add
+    # control dependencies to the lock_op.
+    out = control_flow_ops.while_loop(
+        lambda i: i < 10, lambda i: cs.execute(lambda j: v + j + 1, i), [0])
+    self.evaluate(v.initializer)
+    self.assertEqual(10, self.evaluate(out))
+
   # TODO(ebrevdo): Re-enable once CriticalSection is in core.
   #
   # def testCriticalSectionAndExecuteOpSaverRoundTrip(self):
diff --git a/tensorflow/contrib/framework/python/ops/sort_ops.py b/tensorflow/contrib/framework/python/ops/sort_ops.py
index 8f62f0ea7b9b561f235b9496ffda97a9f378d530..1921a77c1e96ee3531d1ed0f98e41c27c9d427ac 100644
--- a/tensorflow/contrib/framework/python/ops/sort_ops.py
+++ b/tensorflow/contrib/framework/python/ops/sort_ops.py
@@ -14,6 +14,7 @@
 # ==============================================================================
 """Support for sorting tensors.
 
+@@argsort
 @@sort
 """
 
@@ -21,6 +22,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops as framework_ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
@@ -47,64 +51,141 @@ def sort(values, axis=-1, direction='ASCENDING', name=None):
     ValueError: If axis is not a constant scalar, or the direction is invalid.
   """
   with framework_ops.name_scope(name, 'sort'):
-    if direction not in _SORT_IMPL:
-      raise ValueError('%s should be one of %s' %
-                       (direction, ', '.join(sorted(_SORT_IMPL.keys()))))
-    # Axis must be an integer, not a Tensor.
-    axis = framework_ops.convert_to_tensor(axis, name='axis')
-    axis_static = tensor_util.constant_value(axis)
-    if axis.shape.ndims != 0 or axis_static is None:
-      raise ValueError('axis must be a constant scalar')
-    axis_static = int(axis_static)  # Avoids NumPy casting error
+    return _sort_or_argsort(values, axis, direction, return_argsort=False)
+
+
+def argsort(values, axis=-1, direction='ASCENDING', stable=False, name=None):
+  """Returns the indices of a tensor that give its sorted order along an axis.
+
+  For a 1D tensor, `tf.gather(values, tf.argsort(values))` is equivalent to
+  `tf.sort(values)`. For higher dimensions, the output has the same shape as
+  `values`, but along the given axis, values represent the index of the sorted
+  element in that slice of the tensor at the given position.
+
+  Args:
+    values: 1-D or higher numeric `Tensor`.
+    axis: The axis along which to sort. The default is -1, which sorts the last
+        axis.
+    direction: The direction in which to sort the values (`'ASCENDING'` or
+        `'DESCENDING'`).
+    stable: If True, equal elements in the original tensor will not be
+        re-ordered in the returned order. Unstable sort is not yet implemented,
+        but will eventually be the default for performance reasons. If you
+        require a stable order, pass `stable=True` for forwards compatibility.
+    name: Optional name for the operation.
+
+  Returns:
+    An int32 `Tensor` with the same shape as `values`. The indices that would
+        sort each slice of the given `values` along the given `axis`.
+
+  Raises:
+    ValueError: If axis is not a constant scalar, or the direction is invalid.
+  """
+  del stable  # Unused.
+  with framework_ops.name_scope(name, 'argsort'):
+    return _sort_or_argsort(values, axis, direction, return_argsort=True)
+
+
+def _sort_or_argsort(values, axis, direction, return_argsort):
+  """Internal sort/argsort implementation.
+
+  Args:
+    values: The input values.
+    axis: The axis along which to sort.
+    direction: 'ASCENDING' or 'DESCENDING'.
+    return_argsort: Whether to return the argsort result.
+
+  Returns:
+    Either the sorted values, or the indices of the sorted values in the
+        original tensor. See the `sort` and `argsort` docstrings.
+
+  Raises:
+    ValueError: If axis is not a constant scalar, or the direction is invalid.
+  """
+  if direction not in _SORT_IMPL:
+    raise ValueError('%s should be one of %s' %
+                     (direction, ', '.join(sorted(_SORT_IMPL.keys()))))
+  # Axis must be an integer, not a Tensor.
+  axis = framework_ops.convert_to_tensor(axis, name='axis')
+  axis_static = tensor_util.constant_value(axis)
+  if axis.shape.ndims != 0 or axis_static is None:
+    raise ValueError('axis must be a constant scalar')
+  axis_static = int(axis_static)  # Avoids NumPy casting error
 
-    values = framework_ops.convert_to_tensor(values, name='values')
+  values = framework_ops.convert_to_tensor(values, name='values')
 
-    return _SORT_IMPL[direction](values, axis_static)
+  return _SORT_IMPL[direction](values, axis_static, return_argsort)
 
 
-def _descending_sort(values, axis):
+def _descending_sort(values, axis, return_argsort=False):
   """Sorts values in reverse using `top_k`.
 
   Args:
     values: Tensor of numeric values.
     axis: Index of the axis which values should be sorted along.
+    return_argsort: If False, return the sorted values. If True, return the
+        indices that would sort the values.
 
   Returns:
     The sorted values.
   """
   k = array_ops.shape(values)[axis]
   rank = array_ops.rank(values)
+  static_rank = values.shape.ndims
   # Fast path: sorting the last axis.
   if axis == -1 or axis + 1 == values.get_shape().ndims:
-    return nn_ops.top_k(values, k)[0]
-
-  # Otherwise, transpose the array. Swap axes `axis` and `rank - 1`.
-  if axis < 0:
-    # Make axis a Tensor with the real axis index if needed.
-    axis += rank
-  transposition = array_ops.concat(
-      [
-          # Axes up to axis are unchanged.
-          math_ops.range(axis),
-          # Swap axis and rank - 1.
-          [rank - 1],
-          # Axes in [axis + 1, rank - 1) are unchanged.
-          math_ops.range(axis + 1, rank - 1),
-          # Swap axis and rank - 1.
-          [axis]
-      ],
-      axis=0)
-  top_k_input = array_ops.transpose(values, transposition)
-  values, unused_indices = nn_ops.top_k(top_k_input, k)
-  # transposition contains a single cycle of length 2 (swapping 2 elements),
-  # so it is an involution (it is its own inverse).
-  return array_ops.transpose(values, transposition)
-
-
-def _ascending_sort(values, axis):
+    top_k_input = values
+    transposition = None
+  else:
+    # Otherwise, transpose the array. Swap axes `axis` and `rank - 1`.
+    if axis < 0:
+      # Calculate the actual axis index if counting from the end. Use the static
+      # rank if available, or else make the axis back into a tensor.
+      axis += static_rank or rank
+    if static_rank is not None:
+      # Prefer to calculate the transposition array in NumPy and make it a
+      # constant.
+      transposition = constant_op.constant(
+          np.r_[
+              # Axes up to axis are unchanged.
+              np.arange(axis),
+              # Swap axis and rank - 1.
+              [static_rank - 1],
+              # Axes in [axis + 1, rank - 1) are unchanged.
+              np.arange(axis + 1, static_rank - 1),
+              # Swap axis and rank - 1.
+              [axis]],
+          name='transposition')
+    else:
+      # Generate the transposition array from the tensors.
+      transposition = array_ops.concat(
+          [
+              # Axes up to axis are unchanged.
+              math_ops.range(axis),
+              # Swap axis and rank - 1.
+              [rank - 1],
+              # Axes in [axis + 1, rank - 1) are unchanged.
+              math_ops.range(axis + 1, rank - 1),
+              # Swap axis and rank - 1.
+              [axis]
+          ],
+          axis=0)
+    top_k_input = array_ops.transpose(values, transposition)
+
+  values, indices = nn_ops.top_k(top_k_input, k)
+  return_value = indices if return_argsort else values
+  if transposition is not None:
+    # transposition contains a single cycle of length 2 (swapping 2 elements),
+    # so it is an involution (it is its own inverse).
+    return_value = array_ops.transpose(return_value, transposition)
+  return return_value
+
+
+def _ascending_sort(values, axis, return_argsort=False):
   # Negate the values to get the ascending order from descending sort.
-  values_or_indices = _descending_sort(-values, axis)
-  return -values_or_indices
+  values_or_indices = _descending_sort(-values, axis, return_argsort)
+  # If not argsort, negate the values again.
+  return values_or_indices if return_argsort else -values_or_indices
 
 
 _SORT_IMPL = {
diff --git a/tensorflow/contrib/framework/python/ops/sort_ops_test.py b/tensorflow/contrib/framework/python/ops/sort_ops_test.py
index d08ae502f10d98ee14d8bea2f76b18bedb935cea..a8fb94b245dccc8c7cf0e94cef9b436f881fe408 100644
--- a/tensorflow/contrib/framework/python/ops/sort_ops_test.py
+++ b/tensorflow/contrib/framework/python/ops/sort_ops_test.py
@@ -24,6 +24,8 @@ from tensorflow.contrib.framework.python.ops import sort_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test
@@ -90,6 +92,38 @@ class SortTest(test.TestCase):
               axis=0,
               direction='DESCENDING').eval())
 
+  def testSort_staticallyKnownRank_constantTransposition(self):
+    # The transposition array should be a constant if the rank of "values" is
+    # statically known.
+    tensor = random_ops.random_uniform(
+        # Rank is statically known to be 5, but the dimension lengths are not
+        # known.
+        random_ops.random_uniform(
+            shape=(5,), minval=0, maxval=10, dtype=dtypes.int32))
+    sort_ops.sort(tensor, axis=1)
+    transposition = (
+        ops.get_default_graph().get_tensor_by_name('sort/transposition:0'))
+    self.assertFalse(tensor_util.constant_value(transposition) is None)
+    self.assertAllEqual(
+        # Swaps "1" and "4" to put "1" at the end.
+        tensor_util.constant_value(transposition),
+        [0, 4, 2, 3, 1])
+
+  def testArgsort_1d(self):
+    arr = np.random.random(42)
+    with self.test_session():
+      self.assertAllEqual(
+          np.sort(arr),
+          array_ops.gather(arr, sort_ops.argsort(arr)).eval())
+
+  def testArgsort(self):
+    arr = np.random.random((5, 6, 7, 8))
+    for axis in range(4):
+      with self.test_session():
+        self.assertAllEqual(
+            np.argsort(arr, axis=axis),
+            sort_ops.argsort(arr, axis=axis).eval())
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/fused_conv/BUILD b/tensorflow/contrib/fused_conv/BUILD
index ce37672895b37275770d2f5410f662e9acf1de9d..0eb6889db1fae1c74aeb4392441b308392b091a5 100644
--- a/tensorflow/contrib/fused_conv/BUILD
+++ b/tensorflow/contrib/fused_conv/BUILD
@@ -157,15 +157,3 @@ cuda_py_test(
         "requires_cudnn6",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/gan/BUILD b/tensorflow/contrib/gan/BUILD
index ff6f3b744190c9a7c74fb88878e5f13412251e79..461066bbb493932b342cee8f8842e899a2d84fff 100644
--- a/tensorflow/contrib/gan/BUILD
+++ b/tensorflow/contrib/gan/BUILD
@@ -545,15 +545,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
index 082c42eba180917e732bb7890129dfa94bf00fec..e3fc6bf0f034051fc33ff5966e2f4ea85aa538db 100644
--- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
+++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
@@ -88,8 +88,8 @@ class GANEstimator(estimator.Estimator):
           discriminator_fn=discriminator_fn,
           generator_loss_fn=tfgan.losses.wasserstein_generator_loss,
           discriminator_loss_fn=tfgan.losses.wasserstein_discriminator_loss,
-          generator_optimizer=tf.train.AdamOptimizier(0.1, 0.5),
-          discriminator_optimizer=tf.train.AdamOptimizier(0.1, 0.5))
+          generator_optimizer=tf.train.AdamOptimizer(0.1, 0.5),
+          discriminator_optimizer=tf.train.AdamOptimizer(0.1, 0.5))
 
       # Train estimator.
       gan_estimator.train(train_input_fn, steps)
diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
index 4811edcbcfa63e99210b3c2f416b71bb83915869..47e51415fd9e7daa360ca06a11078f6edcf63b5b 100644
--- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
+++ b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
@@ -44,11 +44,11 @@ from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import image_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_impl
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import resource_loader
 
-
 __all__ = [
     'get_graph_def_from_disk',
     'get_graph_def_from_resource',
@@ -62,10 +62,11 @@ __all__ = [
     'frechet_inception_distance',
     'frechet_classifier_distance',
     'frechet_classifier_distance_from_activations',
+    'mean_only_frechet_classifier_distance_from_activations',
+    'diagonal_only_frechet_classifier_distance_from_activations',
     'INCEPTION_DEFAULT_IMAGE_SIZE',
 ]
 
-
 INCEPTION_URL = 'http://download.tensorflow.org/models/frozen_inception_v1_2015_12_05.tar.gz'
 INCEPTION_FROZEN_GRAPH = 'inceptionv1_for_inception_score.pb'
 INCEPTION_INPUT = 'Mul:0'
@@ -77,8 +78,7 @@ INCEPTION_DEFAULT_IMAGE_SIZE = 299
 def _validate_images(images, image_size):
   images = ops.convert_to_tensor(images)
   images.shape.with_rank(4)
-  images.shape.assert_is_compatible_with(
-      [None, image_size, image_size, None])
+  images.shape.assert_is_compatible_with([None, image_size, image_size, None])
   return images
 
 
@@ -109,9 +109,10 @@ def _symmetric_matrix_square_root(mat, eps=1e-10):
       math_ops.matmul(u, array_ops.diag(si)), v, transpose_b=True)
 
 
-def preprocess_image(
-    images, height=INCEPTION_DEFAULT_IMAGE_SIZE,
-    width=INCEPTION_DEFAULT_IMAGE_SIZE, scope=None):
+def preprocess_image(images,
+                     height=INCEPTION_DEFAULT_IMAGE_SIZE,
+                     width=INCEPTION_DEFAULT_IMAGE_SIZE,
+                     scope=None):
   """Prepare a batch of images for evaluation.
 
   This is the preprocessing portion of the graph from
@@ -272,8 +273,11 @@ def run_inception(images,
   return activations
 
 
-def run_image_classifier(tensor, graph_def, input_tensor,
-                         output_tensor, scope='RunClassifier'):
+def run_image_classifier(tensor,
+                         graph_def,
+                         input_tensor,
+                         output_tensor,
+                         scope='RunClassifier'):
   """Runs a network from a frozen graph.
 
   Args:
@@ -433,8 +437,8 @@ def trace_sqrt_product(sigma, sigma_v):
   sqrt_sigma = _symmetric_matrix_square_root(sigma)
 
   # This is sqrt(A sigma_v A) above
-  sqrt_a_sigmav_a = math_ops.matmul(
-      sqrt_sigma, math_ops.matmul(sigma_v, sqrt_sigma))
+  sqrt_a_sigmav_a = math_ops.matmul(sqrt_sigma,
+                                    math_ops.matmul(sigma_v, sqrt_sigma))
 
   return math_ops.trace(_symmetric_matrix_square_root(sqrt_a_sigmav_a))
 
@@ -452,7 +456,7 @@ def frechet_classifier_distance(real_images,
   Given two Gaussian distribution with means m and m_w and covariance matrices
   C and C_w, this function calculates
 
-  |m - m_w|^2 + Tr(C + C_w - 2(C * C_w)^(1/2))
+              |m - m_w|^2 + Tr(C + C_w - 2(C * C_w)^(1/2))
 
   which captures how different the distributions of real images and generated
   images (or more accurately, their visual features) are. Note that unlike the
@@ -511,10 +515,142 @@ def frechet_classifier_distance(real_images,
   return frechet_classifier_distance_from_activations(real_a, gen_a)
 
 
-def frechet_classifier_distance_from_activations(
+def mean_only_frechet_classifier_distance_from_activations(
     real_activations, generated_activations):
   """Classifier distance for evaluating a generative model from activations.
 
+  Given two Gaussian distribution with means m and m_w and covariance matrices
+  C and C_w, this function calcuates
+
+                                |m - m_w|^2
+
+  which captures how different the distributions of real images and generated
+  images (or more accurately, their visual features) are. Note that unlike the
+  Inception score, this is a true distance and utilizes information about real
+  world images.
+
+  Note that when computed using sample means and sample covariance matrices,
+  Frechet distance is biased. It is more biased for small sample sizes. (e.g.
+  even if the two distributions are the same, for a small sample size, the
+  expected Frechet distance is large). It is important to use the same
+  sample size to compute frechet classifier distance when comparing two
+  generative models.
+
+  In this variant, we only compute the difference between the means of the
+  fitted Gaussians. The computation leads to O(n) vs. O(n^2) memory usage, yet
+  still retains much of the same information as FID.
+
+  Args:
+    real_activations: 2D array of activations of real images of size
+      [num_images, num_dims] to use to compute Frechet Inception distance.
+    generated_activations: 2D array of activations of generated images of size
+      [num_images, num_dims] to use to compute Frechet Inception distance.
+
+  Returns:
+    The mean-only Frechet Inception distance. A floating-point scalar of the
+    same type as the output of the activations.
+  """
+  real_activations.shape.assert_has_rank(2)
+  generated_activations.shape.assert_has_rank(2)
+
+  activations_dtype = real_activations.dtype
+  if activations_dtype != dtypes.float64:
+    real_activations = math_ops.to_double(real_activations)
+    generated_activations = math_ops.to_double(generated_activations)
+
+  # Compute means of activations.
+  m = math_ops.reduce_mean(real_activations, 0)
+  m_w = math_ops.reduce_mean(generated_activations, 0)
+
+  # Next the distance between means.
+  mean = math_ops.reduce_sum(
+      math_ops.squared_difference(m, m_w))  # Equivalent to L2 but more stable.
+  mofid = mean
+  if activations_dtype != dtypes.float64:
+    mofid = math_ops.cast(mofid, activations_dtype)
+
+  return mofid
+
+
+def diagonal_only_frechet_classifier_distance_from_activations(
+    real_activations, generated_activations):
+  """Classifier distance for evaluating a generative model.
+
+  This is based on the Frechet Inception distance, but for an arbitrary
+  classifier.
+
+  This technique is described in detail in https://arxiv.org/abs/1706.08500.
+  Given two Gaussian distribution with means m and m_w and covariance matrices
+  C and C_w, this function calcuates
+
+          |m - m_w|^2 + (sigma + sigma_w - 2(sigma x sigma_w)^(1/2))
+
+  which captures how different the distributions of real images and generated
+  images (or more accurately, their visual features) are. Note that unlike the
+  Inception score, this is a true distance and utilizes information about real
+  world images. In this variant, we compute diagonal-only covariance matrices.
+  As a result, instead of computing an expensive matrix square root, we can do
+  something much simpler, and has O(n) vs O(n^2) space complexity.
+
+  Note that when computed using sample means and sample covariance matrices,
+  Frechet distance is biased. It is more biased for small sample sizes. (e.g.
+  even if the two distributions are the same, for a small sample size, the
+  expected Frechet distance is large). It is important to use the same
+  sample size to compute frechet classifier distance when comparing two
+  generative models.
+
+  Args:
+    real_activations: Real images to use to compute Frechet Inception distance.
+    generated_activations: Generated images to use to compute Frechet Inception
+      distance.
+
+  Returns:
+    The diagonal-only Frechet Inception distance. A floating-point scalar of
+    the same type as the output of the activations.
+
+  Raises:
+    ValueError: If the shape of the variance and mean vectors are not equal.
+  """
+  real_activations.shape.assert_has_rank(2)
+  generated_activations.shape.assert_has_rank(2)
+
+  activations_dtype = real_activations.dtype
+  if activations_dtype != dtypes.float64:
+    real_activations = math_ops.to_double(real_activations)
+    generated_activations = math_ops.to_double(generated_activations)
+
+  # Compute mean and covariance matrices of activations.
+  m, var = nn_impl.moments(real_activations, axes=[0])
+  m_w, var_w = nn_impl.moments(generated_activations, axes=[0])
+
+  actual_shape = var.get_shape()
+  expected_shape = m.get_shape()
+
+  if actual_shape != expected_shape:
+    raise ValueError('shape: {} must match expected shape: {}'.format(
+        actual_shape, expected_shape))
+
+  # Compute the two components of FID.
+
+  # First the covariance component.
+  # Here, note that trace(A + B) = trace(A) + trace(B)
+  trace = math_ops.reduce_sum(
+      (var + var_w) - 2.0 * math_ops.sqrt(math_ops.multiply(var, var_w)))
+
+  # Next the distance between means.
+  mean = math_ops.reduce_sum(
+      math_ops.squared_difference(m, m_w))  # Equivalent to L2 but more stable.
+  dofid = trace + mean
+  if activations_dtype != dtypes.float64:
+    dofid = math_ops.cast(dofid, activations_dtype)
+
+  return dofid
+
+
+def frechet_classifier_distance_from_activations(real_activations,
+                                                 generated_activations):
+  """Classifier distance for evaluating a generative model.
+
   This methods computes the Frechet classifier distance from activations of
   real images and generated images. This can be used independently of the
   frechet_classifier_distance() method, especially in the case of using large
@@ -525,13 +661,20 @@ def frechet_classifier_distance_from_activations(
   Given two Gaussian distribution with means m and m_w and covariance matrices
   C and C_w, this function calculates
 
-  |m - m_w|^2 + Tr(C + C_w - 2(C * C_w)^(1/2))
+                |m - m_w|^2 + Tr(C + C_w - 2(C * C_w)^(1/2))
 
   which captures how different the distributions of real images and generated
   images (or more accurately, their visual features) are. Note that unlike the
   Inception score, this is a true distance and utilizes information about real
   world images.
 
+  Note that when computed using sample means and sample covariance matrices,
+  Frechet distance is biased. It is more biased for small sample sizes. (e.g.
+  even if the two distributions are the same, for a small sample size, the
+  expected Frechet distance is large). It is important to use the same
+  sample size to compute frechet classifier distance when comparing two
+  generative models.
+
   Args:
     real_activations: 2D Tensor containing activations of real data. Shape is
       [batch_size, activation_size].
@@ -553,36 +696,38 @@ def frechet_classifier_distance_from_activations(
 
   # Compute mean and covariance matrices of activations.
   m = math_ops.reduce_mean(real_activations, 0)
-  m_v = math_ops.reduce_mean(generated_activations, 0)
+  m_w = math_ops.reduce_mean(generated_activations, 0)
   num_examples = math_ops.to_double(array_ops.shape(real_activations)[0])
 
   # sigma = (1 / (n - 1)) * (X - mu) (X - mu)^T
   real_centered = real_activations - m
   sigma = math_ops.matmul(
-      real_centered, real_centered, transpose_a=True) / (num_examples - 1)
+      real_centered, real_centered, transpose_a=True) / (
+          num_examples - 1)
 
-  gen_centered = generated_activations - m_v
-  sigma_v = math_ops.matmul(
-      gen_centered, gen_centered, transpose_a=True) / (num_examples - 1)
+  gen_centered = generated_activations - m_w
+  sigma_w = math_ops.matmul(
+      gen_centered, gen_centered, transpose_a=True) / (
+          num_examples - 1)
 
-  # Find the Tr(sqrt(sigma sigma_v)) component of FID
-  sqrt_trace_component = trace_sqrt_product(sigma, sigma_v)
+  # Find the Tr(sqrt(sigma sigma_w)) component of FID
+  sqrt_trace_component = trace_sqrt_product(sigma, sigma_w)
 
   # Compute the two components of FID.
 
   # First the covariance component.
   # Here, note that trace(A + B) = trace(A) + trace(B)
-  trace = math_ops.trace(sigma + sigma_v) - 2.0 * sqrt_trace_component
+  trace = math_ops.trace(sigma + sigma_w) - 2.0 * sqrt_trace_component
 
   # Next the distance between means.
-  mean = math_ops.square(linalg_ops.norm(m - m_v))  # This uses the L2 norm.
+  mean = math_ops.reduce_sum(
+      math_ops.squared_difference(m, m_w))  # Equivalent to L2 but more stable.
   fid = trace + mean
   if activations_dtype != dtypes.float64:
     fid = math_ops.cast(fid, activations_dtype)
 
   return fid
 
-
 frechet_inception_distance = functools.partial(
     frechet_classifier_distance,
     classifier_fn=functools.partial(
diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
index 61dc8646ddc10605561ae6b19e90f4739c346608..663e49bdca3cb2dd9257da326488c877fcc4256d 100644
--- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
+++ b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
@@ -50,6 +50,26 @@ def _expected_inception_score(logits):
   return np.exp(np.mean(per_example_logincscore))
 
 
+def _expected_mean_only_fid(real_imgs, gen_imgs):
+  m = np.mean(real_imgs, axis=0)
+  m_v = np.mean(gen_imgs, axis=0)
+  mean = np.square(m - m_v).sum()
+  mofid = mean
+  return mofid
+
+
+def _expected_diagonal_only_fid(real_imgs, gen_imgs):
+  m = np.mean(real_imgs, axis=0)
+  m_v = np.mean(gen_imgs, axis=0)
+  var = np.var(real_imgs, axis=0)
+  var_v = np.var(gen_imgs, axis=0)
+  sqcc = np.sqrt(var * var_v)
+  mean = (np.square(m - m_v)).sum()
+  trace = (var + var_v - 2 * sqcc).sum()
+  dofid = mean + trace
+  return dofid
+
+
 def _expected_fid(real_imgs, gen_imgs):
   m = np.mean(real_imgs, axis=0)
   m_v = np.mean(gen_imgs, axis=0)
@@ -285,6 +305,46 @@ class ClassifierMetricsTest(test.TestCase):
 
     self.assertAllClose(_expected_inception_score(logits), incscore_np)
 
+  def test_mean_only_frechet_classifier_distance_value(self):
+    """Test that `frechet_classifier_distance` gives the correct value."""
+    np.random.seed(0)
+
+    pool_real_a = np.float32(np.random.randn(256, 2048))
+    pool_gen_a = np.float32(np.random.randn(256, 2048))
+
+    tf_pool_real_a = array_ops.constant(pool_real_a)
+    tf_pool_gen_a = array_ops.constant(pool_gen_a)
+
+    mofid_op = classifier_metrics.mean_only_frechet_classifier_distance_from_activations(  # pylint: disable=line-too-long
+        tf_pool_real_a, tf_pool_gen_a)
+
+    with self.test_session() as sess:
+      actual_mofid = sess.run(mofid_op)
+
+    expected_mofid = _expected_mean_only_fid(pool_real_a, pool_gen_a)
+
+    self.assertAllClose(expected_mofid, actual_mofid, 0.0001)
+
+  def test_diagonal_only_frechet_classifier_distance_value(self):
+    """Test that `frechet_classifier_distance` gives the correct value."""
+    np.random.seed(0)
+
+    pool_real_a = np.float32(np.random.randn(256, 2048))
+    pool_gen_a = np.float32(np.random.randn(256, 2048))
+
+    tf_pool_real_a = array_ops.constant(pool_real_a)
+    tf_pool_gen_a = array_ops.constant(pool_gen_a)
+
+    dofid_op = classifier_metrics.diagonal_only_frechet_classifier_distance_from_activations(  # pylint: disable=line-too-long
+        tf_pool_real_a, tf_pool_gen_a)
+
+    with self.test_session() as sess:
+      actual_dofid = sess.run(dofid_op)
+
+    expected_dofid = _expected_diagonal_only_fid(pool_real_a, pool_gen_a)
+
+    self.assertAllClose(expected_dofid, actual_dofid, 0.0001)
+
   def test_frechet_classifier_distance_value(self):
     """Test that `frechet_classifier_distance` gives the correct value."""
     np.random.seed(0)
diff --git a/tensorflow/contrib/gan/python/eval/python/summaries_impl.py b/tensorflow/contrib/gan/python/eval/python/summaries_impl.py
index 0d1afad72da8a8e087239868e25ddebe23490d1e..508f487722fba89cc8391a340f73673a526e86c4 100644
--- a/tensorflow/contrib/gan/python/eval/python/summaries_impl.py
+++ b/tensorflow/contrib/gan/python/eval/python/summaries_impl.py
@@ -31,6 +31,7 @@ __all__ = [
     'add_image_comparison_summaries',
     'add_gan_model_summaries',
     'add_regularization_loss_summaries',
+    'add_cyclegan_image_summaries',
 ]
 
 
@@ -51,14 +52,9 @@ def add_gan_model_image_summaries(gan_model, grid_size=4, model_summaries=True):
     ValueError: If real and generated data aren't images.
   """
   if isinstance(gan_model, namedtuples.CycleGANModel):
-    saved_params = locals()
-    saved_params.pop('gan_model', None)
-    with ops.name_scope('cyclegan_x2y_image_summaries'):
-      add_gan_model_image_summaries(gan_model.model_x2y, **saved_params)
-    with ops.name_scope('cyclegan_y2x_image_summaries'):
-      add_gan_model_image_summaries(gan_model.model_y2x, **saved_params)
-    return
-
+    raise ValueError(
+        '`add_gan_model_image_summaries` does not take CycleGANModels. Please '
+        'use `add_cyclegan_image_summaries` instead.')
   _assert_is_image(gan_model.real_data)
   _assert_is_image(gan_model.generated_data)
 
@@ -89,6 +85,49 @@ def add_gan_model_image_summaries(gan_model, grid_size=4, model_summaries=True):
     add_gan_model_summaries(gan_model)
 
 
+def add_cyclegan_image_summaries(cyclegan_model):
+  """Adds image summaries for CycleGAN.
+
+  There are two summaries, one for each generator. The first image is the
+  generator input, the second is the generator output, and the third is G(F(x)).
+
+  Args:
+    cyclegan_model: A CycleGANModel tuple.
+
+  Raises:
+    ValueError: If `cyclegan_model` isn't a CycleGANModel.
+    ValueError: If generated data, generator inputs, and reconstructions aren't
+      images.
+    ValueError: If the generator input, generated data, and reconstructions
+      aren't all the same size.
+  """
+  if not isinstance(cyclegan_model, namedtuples.CycleGANModel):
+    raise ValueError('`cyclegan_model` was not a CycleGANModel. Instead, was '
+                     '%s' % type(cyclegan_model))
+
+  _assert_is_image(cyclegan_model.model_x2y.generator_inputs)
+  _assert_is_image(cyclegan_model.model_x2y.generated_data)
+  _assert_is_image(cyclegan_model.reconstructed_x)
+  _assert_is_image(cyclegan_model.model_y2x.generator_inputs)
+  _assert_is_image(cyclegan_model.model_y2x.generated_data)
+  _assert_is_image(cyclegan_model.reconstructed_y)
+
+  def _add_comparison_summary(gan_model, reconstructions):
+    image_list = (array_ops.unstack(gan_model.generator_inputs[:1]) +
+                  array_ops.unstack(gan_model.generated_data[:1]) +
+                  array_ops.unstack(reconstructions[:1]))
+    summary.image(
+        'image_comparison', eval_utils.image_reshaper(
+            image_list, num_cols=len(image_list)), max_outputs=1)
+
+  with ops.name_scope('x2y_image_comparison_summaries'):
+    _add_comparison_summary(
+        cyclegan_model.model_x2y, cyclegan_model.reconstructed_x)
+  with ops.name_scope('y2x_image_comparison_summaries'):
+    _add_comparison_summary(
+        cyclegan_model.model_y2x, cyclegan_model.reconstructed_y)
+
+
 def add_image_comparison_summaries(gan_model, num_comparisons=2,
                                    display_diffs=False):
   """Adds image summaries to compare triplets of images.
@@ -109,15 +148,6 @@ def add_image_comparison_summaries(gan_model, num_comparisons=2,
     ValueError: If the generator input, real, and generated data aren't all the
       same size.
   """
-  if isinstance(gan_model, namedtuples.CycleGANModel):
-    saved_params = locals()
-    saved_params.pop('gan_model', None)
-    with ops.name_scope('cyclegan_x2y_image_comparison_summaries'):
-      add_image_comparison_summaries(gan_model.model_x2y, **saved_params)
-    with ops.name_scope('cyclegan_y2x_image_comparison_summaries'):
-      add_image_comparison_summaries(gan_model.model_y2x, **saved_params)
-    return
-
   _assert_is_image(gan_model.generator_inputs)
   _assert_is_image(gan_model.generated_data)
   _assert_is_image(gan_model.real_data)
diff --git a/tensorflow/contrib/gan/python/eval/python/summaries_test.py b/tensorflow/contrib/gan/python/eval/python/summaries_test.py
index 45eb108586bed07434ac29595164745eac6054c1..33d51bfc218ab93fb52439b1eefed98a4568c4a1 100644
--- a/tensorflow/contrib/gan/python/eval/python/summaries_test.py
+++ b/tensorflow/contrib/gan/python/eval/python/summaries_test.py
@@ -65,15 +65,14 @@ def get_cyclegan_model():
   return namedtuples.CycleGANModel(
       model_x2y=model_x2y,
       model_y2x=model_y2x,
-      reconstructed_x=array_ops.zeros([3, 30, 35, 6]),
-      reconstructed_y=array_ops.zeros([3, 30, 35, 6]))
+      reconstructed_x=array_ops.zeros([4, 32, 32, 3]),
+      reconstructed_y=array_ops.zeros([4, 32, 32, 3]))
 
 
 class SummariesTest(test.TestCase):
 
-  def _test_add_gan_model_image_summaries_impl(self, get_model_fn,
-                                               expected_num_summary_ops,
-                                               model_summaries):
+  def _test_add_gan_model_image_summaries_impl(
+      self, get_model_fn, expected_num_summary_ops, model_summaries):
     summaries.add_gan_model_image_summaries(get_model_fn(), grid_size=2,
                                             model_summaries=model_summaries)
 
@@ -89,8 +88,9 @@ class SummariesTest(test.TestCase):
   def test_add_gan_model_image_summaries_no_model(self):
     self._test_add_gan_model_image_summaries_impl(get_gan_model, 2, False)
 
-  def test_add_gan_model_image_summaries_for_cyclegan(self):
-    self._test_add_gan_model_image_summaries_impl(get_cyclegan_model, 10, True)
+  def test_cyclegan_image_summaries_dont_work(self):
+    with self.assertRaises(ValueError):
+      summaries.add_gan_model_image_summaries(get_cyclegan_model())
 
   def _test_add_gan_model_summaries_impl(self, get_model_fn,
                                          expected_num_summary_ops):
@@ -137,7 +137,11 @@ class SummariesTest(test.TestCase):
     self._test_add_image_comparison_summaries_impl(get_gan_model, 1)
 
   def test_add_image_comparison_summaries_for_cyclegan(self):
-    self._test_add_image_comparison_summaries_impl(get_cyclegan_model, 2)
+    summaries.add_cyclegan_image_summaries(get_cyclegan_model())
+
+    self.assertEquals(2, len(ops.get_collection(ops.GraphKeys.SUMMARIES)))
+    with self.test_session(use_gpu=True):
+      summary.merge_all().eval()
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/gan/python/losses/python/losses_impl.py b/tensorflow/contrib/gan/python/losses/python/losses_impl.py
index 39588b7219ebac1cc4855532be3fcc38e6381134..1ba3a641671c7f2a411a0c5f99228ca16eee1080 100644
--- a/tensorflow/contrib/gan/python/losses/python/losses_impl.py
+++ b/tensorflow/contrib/gan/python/losses/python/losses_impl.py
@@ -306,6 +306,7 @@ def wasserstein_gradient_penalty(
     discriminator_scope,
     epsilon=1e-10,
     target=1.0,
+    one_sided=False,
     weights=1.0,
     scope=None,
     loss_collection=ops.GraphKeys.LOSSES,
@@ -327,6 +328,8 @@ def wasserstein_gradient_penalty(
       computing the gradient norm.
     target: Optional Python number or `Tensor` indicating the target value of
       gradient norm. Defaults to 1.0.
+    one_sided: If `True`, penalty proposed in https://arxiv.org/abs/1709.08894
+      is used. Defaults to `False`.
     weights: Optional `Tensor` whose rank is either 0, or the same rank as
       `real_data` and `generated_data`, and must be broadcastable to
       them (i.e., all dimensions must be either `1`, or the same as the
@@ -377,10 +380,13 @@ def wasserstein_gradient_penalty(
     # For numerical stability, add epsilon to the sum before taking the square
     # root. Note tf.norm does not add epsilon.
     slopes = math_ops.sqrt(gradient_squares + epsilon)
-    penalties = math_ops.square(slopes / target - 1.0)
+    penalties = slopes / target - 1.0
+    if one_sided:
+      penalties = math_ops.maximum(0., penalties)
+    penalties_squared = math_ops.square(penalties)
     penalty = losses.compute_weighted_loss(
-        penalties, weights, scope=scope, loss_collection=loss_collection,
-        reduction=reduction)
+        penalties_squared, weights, scope=scope,
+        loss_collection=loss_collection, reduction=reduction)
 
     if add_summaries:
       summary.scalar('gradient_penalty_loss', penalty)
@@ -665,7 +671,7 @@ def least_squares_discriminator_loss(
     loss_collection=ops.GraphKeys.LOSSES,
     reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS,
     add_summaries=False):
-  """Least squares generator loss.
+  """Least squares discriminator loss.
 
   This loss comes from `Least Squares Generative Adversarial Networks`
   (https://arxiv.org/abs/1611.04076).
diff --git a/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py b/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py
index dbaa624ae9d6a5a5949db692e52c0c1deb18b8df..2889e937436d2faa66b5693c19046e122cbaf652 100644
--- a/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py
+++ b/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py
@@ -481,6 +481,28 @@ class GradientPenaltyTest(test.TestCase, _PenaltyTest):
                       })
       self.assertAlmostEqual(self._expected_loss, loss, 5)
 
+  def test_loss_using_one_sided_mode(self):
+    generated_data = array_ops.placeholder(dtypes.float32, shape=(None, None))
+    real_data = array_ops.placeholder(dtypes.float32, shape=(None, None))
+
+    loss = tfgan_losses.wasserstein_gradient_penalty(
+        generated_data,
+        real_data,
+        self._kwargs['generator_inputs'],
+        self._kwargs['discriminator_fn'],
+        self._kwargs['discriminator_scope'],
+        one_sided=True)
+    self.assertEqual(generated_data.dtype, loss.dtype)
+
+    with self.test_session() as sess:
+      variables.global_variables_initializer().run()
+      loss = sess.run(loss,
+                      feed_dict={
+                          generated_data: self._generated_data_np,
+                          real_data: self._real_data_np,
+                      })
+      self.assertAlmostEqual(self._expected_loss, loss, 5)
+
   def test_loss_with_gradient_norm_target(self):
     """Test loss value with non default gradient norm target."""
     generated_data = array_ops.placeholder(dtypes.float32, shape=(None, None))
diff --git a/tensorflow/contrib/gan/python/train.py b/tensorflow/contrib/gan/python/train.py
index 776eb11ecb1624544d24611d8fe6ca19768b8313..73acd05b60a5fb02601423fd9234a56a34f75276 100644
--- a/tensorflow/contrib/gan/python/train.py
+++ b/tensorflow/contrib/gan/python/train.py
@@ -461,6 +461,7 @@ def gan_loss(
     gradient_penalty_weight=None,
     gradient_penalty_epsilon=1e-10,
     gradient_penalty_target=1.0,
+    gradient_penalty_one_sided=False,
     mutual_information_penalty_weight=None,
     aux_cond_generator_weight=None,
     aux_cond_discriminator_weight=None,
@@ -485,6 +486,8 @@ def gan_loss(
     gradient_penalty_target: If `gradient_penalty_weight` is not None, a Python
       number or `Tensor` indicating the target value of gradient norm. See the
       CIFAR10 section of https://arxiv.org/abs/1710.10196. Defaults to 1.0.
+    gradient_penalty_one_sided: If `True`, penalty proposed in
+      https://arxiv.org/abs/1709.08894 is used. Defaults to `False`.
     mutual_information_penalty_weight: If not `None`, must be a non-negative
       Python number or Tensor indicating how much to weight the mutual
       information penalty. See https://arxiv.org/abs/1606.03657 for more
@@ -546,6 +549,7 @@ def gan_loss(
         model,
         epsilon=gradient_penalty_epsilon,
         target=gradient_penalty_target,
+        one_sided=gradient_penalty_one_sided,
         add_summaries=add_summaries)
     dis_loss += gradient_penalty_weight * gp_loss
   if _use_aux_loss(mutual_information_penalty_weight):
diff --git a/tensorflow/contrib/gan/python/train_test.py b/tensorflow/contrib/gan/python/train_test.py
index f9bdaa74c948ecee11d5cfd89f06087924f8dace..3ebbe55d059e5e72607bc4efdbf95a6c96d99f11 100644
--- a/tensorflow/contrib/gan/python/train_test.py
+++ b/tensorflow/contrib/gan/python/train_test.py
@@ -359,10 +359,12 @@ class GANLossTest(test.TestCase):
     self.assertGreater(len(ops.get_collection(ops.GraphKeys.SUMMARIES)), 0)
 
   # Test gradient penalty option.
-  def _test_grad_penalty_helper(self, create_gan_model_fn):
+  def _test_grad_penalty_helper(self, create_gan_model_fn, one_sided=False):
     model = create_gan_model_fn()
     loss = train.gan_loss(model)
-    loss_gp = train.gan_loss(model, gradient_penalty_weight=1.0)
+    loss_gp = train.gan_loss(model,
+                             gradient_penalty_weight=1.0,
+                             gradient_penalty_one_sided=one_sided)
     self.assertTrue(isinstance(loss_gp, namedtuples.GANLoss))
 
     # Check values.
@@ -394,6 +396,25 @@ class GANLossTest(test.TestCase):
   def test_grad_penalty_callable_acgan(self):
     self._test_grad_penalty_helper(create_callable_acgan_model)
 
+  def test_grad_penalty_one_sided_gan(self):
+    self._test_grad_penalty_helper(create_gan_model, one_sided=True)
+
+  def test_grad_penalty_one_sided_callable_gan(self):
+    self._test_grad_penalty_helper(create_callable_gan_model, one_sided=True)
+
+  def test_grad_penalty_one_sided_infogan(self):
+    self._test_grad_penalty_helper(create_infogan_model, one_sided=True)
+
+  def test_grad_penalty_one_sided_callable_infogan(self):
+    self._test_grad_penalty_helper(
+        create_callable_infogan_model, one_sided=True)
+
+  def test_grad_penalty_one_sided_acgan(self):
+    self._test_grad_penalty_helper(create_acgan_model, one_sided=True)
+
+  def test_grad_penalty_one_sided_callable_acgan(self):
+    self._test_grad_penalty_helper(create_callable_acgan_model, one_sided=True)
+
   # Test mutual information penalty option.
   def _test_mutual_info_penalty_helper(self, create_gan_model_fn):
     train.gan_loss(create_gan_model_fn(),
diff --git a/tensorflow/contrib/gdr/BUILD b/tensorflow/contrib/gdr/BUILD
index 707ae25d485c64f15694ee0e357f32b619d3cd33..e534fdc17749974ebe713c2730682bea6d7a85e4 100644
--- a/tensorflow/contrib/gdr/BUILD
+++ b/tensorflow/contrib/gdr/BUILD
@@ -9,18 +9,6 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 filegroup(
     name = "c_srcs",
     data = glob([
diff --git a/tensorflow/contrib/graph_editor/BUILD b/tensorflow/contrib/graph_editor/BUILD
index 967ad2fc090906e93f22c777816eede37f9a1b04..1711100e3a857dba0d15c5b4f6c96cddc568e800 100644
--- a/tensorflow/contrib/graph_editor/BUILD
+++ b/tensorflow/contrib/graph_editor/BUILD
@@ -39,18 +39,6 @@ py_library(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 py_library(
     name = "match",
     srcs = ["tests/match.py"],
diff --git a/tensorflow/contrib/graph_editor/tests/transform_test.py b/tensorflow/contrib/graph_editor/tests/transform_test.py
index ca00394388f67e2ed9508684a47b23c3ee9e79e8..2603de640735a612cbd883cc6227fe3cd9f11fca 100644
--- a/tensorflow/contrib/graph_editor/tests/transform_test.py
+++ b/tensorflow/contrib/graph_editor/tests/transform_test.py
@@ -23,6 +23,7 @@ from tensorflow.contrib import graph_editor as ge
 from tensorflow.contrib.graph_editor.tests import match
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -84,9 +85,9 @@ class TransformTest(test.TestCase):
   def test_transform(self):
     transformer = ge.Transformer()
 
-    def my_transform_op_handler(info, op):
+    def my_transform_op_handler(info, op, new_inputs):
       add_noise = op.name.startswith("Add")
-      op_, op_outputs_ = ge.transform.copy_op_handler(info, op)
+      op_, op_outputs_ = ge.transform.copy_op_handler(info, op, new_inputs)
       if not add_noise:
         return op_, op_outputs_
       # add some noise to op
@@ -201,15 +202,56 @@ class TransformTest(test.TestCase):
                         get_operation_by_name("res/grad/mul1_grad/Mul_1"))
 
     # Make sure _original_ops are as expected.
-    self.assertEquals(original_mul1_grad._original_op.name, u"mul1")
-    self.assertEquals(result_mul1_grad._original_op.name, u"res/mul1")
-    self.assertNotEquals(res.name, g.name)
+    self.assertEqual(original_mul1_grad._original_op.name, u"mul1")
+    self.assertEqual(result_mul1_grad._original_op.name, u"res/mul1")
+    self.assertNotEqual(res.name, g.name)
     with session.Session() as sess:
       sess.run(variables.global_variables_initializer())
       g_val, res_val = sess.run([g, res])
     self.assertNear(g_val, 0.0, ERROR_TOLERANCE)
     self.assertNear(res_val, 0.0, ERROR_TOLERANCE)
 
+  def test_graph_while_loop(self):
+    graph = ops.Graph()
+    with graph.as_default():
+      max_index = array_ops.placeholder(dtype=dtypes.int32, shape=tuple())
+      index_start = constant_op.constant(1)
+      sum_start = constant_op.constant(0)
+      _, result = control_flow_ops.while_loop(
+          cond=lambda i, unused_s: i <= max_index,
+          body=lambda i, s: (i + 1, s + i),
+          loop_vars=[index_start, sum_start])
+    copied_graph = ops.Graph()
+    _, copy_info = ge.copy(
+        graph, dst_graph=copied_graph, dst_scope="imported")
+    copied_result = copy_info.transformed(result)
+    copied_max_index = copy_info.transformed(max_index)
+    with copied_graph.as_default():
+      with session.Session() as sess:
+        n = 10
+        sum_val = sess.run(copied_result, feed_dict={copied_max_index: n})
+        self.assertEqual(sum_val, 55)
+
+  def test_graph_cond(self):
+    graph = ops.Graph()
+    with graph.as_default():
+      choice = array_ops.placeholder(shape=(), dtype=dtypes.bool)
+      result = control_flow_ops.cond(
+          choice,
+          lambda: constant_op.constant(1),
+          lambda: constant_op.constant(2))
+    copied_graph = ops.Graph()
+    _, copy_info = ge.copy(
+        graph, dst_graph=copied_graph, dst_scope="imported")
+    copied_result = copy_info.transformed(result)
+    copied_choice = copy_info.transformed(choice)
+    with copied_graph.as_default():
+      with session.Session() as sess:
+        res = sess.run(copied_result, feed_dict={copied_choice: True})
+        self.assertEqual(res, 1)
+        res = sess.run(copied_result, feed_dict={copied_choice: False})
+        self.assertEqual(res, 2)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/graph_editor/transform.py b/tensorflow/contrib/graph_editor/transform.py
index 14ac5296657d48c7f9e94d220c9e7e28af4d4353..d8a48387a745e7d88cc6a74c96cb21a2ba1cfa1f 100644
--- a/tensorflow/contrib/graph_editor/transform.py
+++ b/tensorflow/contrib/graph_editor/transform.py
@@ -129,20 +129,26 @@ def transform_op_if_inside_handler(info, op, keep_if_possible=True):
       return None
 
 
-def copy_op_handler(info, op, copy_shape=True):
+def copy_op_handler(info, op, new_inputs, copy_shape=True):
   """Copy a `tf.Operation`.
 
   Args:
     info: Transform._TmpInfo instance.
     op: the `tf.Operation` to be copied.
+    new_inputs: The new inputs for this op.
     copy_shape: also copy the shape of the tensor
   Returns:
     A `(op, op_outputs)` tuple containing the transformed op and its outputs.
   """
+  # The `new_inputs` was added to this function. For compatibility reason,
+  # let's raise an error if `new_inputs` is a boolean.
+  if isinstance(new_inputs, bool):
+    raise TypeError("the `new_inputs` argument must be an iterable.")
+
   # pylint: disable=protected-access
 
   # Clone the node def:
-  node_def_ = deepcopy(op._node_def)
+  node_def_ = deepcopy(op.node_def)
 
   # Transform name:
   name_ = info.new_name(op.name)
@@ -155,10 +161,10 @@ def copy_op_handler(info, op, copy_shape=True):
 
   # Make a copy of the op_def too.
   # Its unique to every _type_ of Operation.
-  op_def_ = deepcopy(op._op_def)
+  op_def_ = deepcopy(op.op_def)
 
   # Initialize a new Operation instance
-  op_ = tf_ops.Operation(node_def_, info.graph_, [], output_types_,
+  op_ = tf_ops.Operation(node_def_, info.graph_, new_inputs, output_types_,
                          [], input_types_, None, op_def_)
 
   # copy the shape over
@@ -170,6 +176,7 @@ def copy_op_handler(info, op, copy_shape=True):
   # attribute to exist, we will create a dummy original_op first and then
   # later finalise it with the actual original_op when all the ops have
   # been copied.
+  # TODO(fkp): Stop worrying about _original_op and remove this code?
   if op._original_op:
     op_._original_op = op._original_op
 
@@ -328,6 +335,14 @@ class _TmpInfo(object):
                             for key in self.graph.get_all_collection_keys())
     self.cyclic_ops = []
     self.transform_original_op_handler = transform_op_if_inside_handler
+    # The graph is transformed op by op, in the same order the original ops
+    # were created. However, this is sometimes not possible due to cycles
+    # (i.e. while loops). So when the transformer creates a new op whose
+    # inputs do not exist yet, temporary placeholders are created and stored
+    # in this `tmp_cyclic_ts` container. During a second pass,
+    # those temporary tensors are replaced by the proper transformed tensors
+    # (see the function `_finalize_cycles`).
+    self.tmp_cyclic_ts = []
 
   def new_name(self, name):
     """Compute a destination name from a source name.
@@ -428,10 +443,10 @@ class Transformer(object):
 
     # Create temporary info used during this transform call
     info = _TmpInfo(sgv, dst_graph, dst_scope, src_scope)
-    info.transform_original_op_handler = self.transform_original_op_handler
 
     self._copy_ops(info)
-    self._connect_ops(info)
+    self._finalize_cycles(info)
+    self._connect_control_inputs(info)
 
     # Compute information about the transformation
     res_info = TransformerInfo(info)
@@ -440,10 +455,10 @@ class Transformer(object):
 
   def _copy_ops(self, info):
     """Copy ops without connecting them."""
-    for op in info.sgv.ops:
-      logging.debug("Copying op: %s", op.name)
-      # TODO(fkp): return a subgraph?
-      op_, op_outputs_ = self.transform_op_handler(info, op)
+    sorted_ops = sorted(info.sgv.ops, key=lambda op: op._id)  # pylint: disable=protected-access
+    for op in sorted_ops:
+      new_inputs = [self._transformed_t(info, t, op) for t in op.inputs]
+      op_, op_outputs_ = self.transform_op_handler(info, op, new_inputs)
       if op is op_:
         raise ValueError("In-place transformation not allowed.")
 
@@ -456,27 +471,36 @@ class Transformer(object):
         info.transformed_ts[op_output] = op_output_
         self.assign_collections_handler(info, op_output, op_output_)
 
-  def _connect_ops(self, info):
+  def _finalize_cycles(self, info):
+    """Reconnects the cyclic tensors."""
+    for t, tmp_t_, consumer_op in info.tmp_cyclic_ts:
+      if t not in info.transformed_ts:
+        raise ValueError("The tensor {} should be transformed by now.".format(
+            t.name))
+      if consumer_op not in info.transformed_ops:
+        raise ValueError("The op {} should be transformed by now.".format(
+            consumer_op.name))
+      t_ = info.transformed_ts[t]
+      consumer_op_ = info.transformed_ops[consumer_op]
+      t_index_ = list(consumer_op_.inputs).index(tmp_t_)
+      consumer_op_._update_input(t_index_, t_, update_dtype=False)  # pylint: disable=protected-access
+
+  def _connect_control_inputs(self, info):
     """Connect the previously copied ops."""
     for op in info.sgv.ops:
-      logging.debug("Finalizing op: %s", op.name)
+      logging.debug("Connecting control inputs of op: %s", op.name)
       op_ = info.transformed_ops[op]
 
-      # pylint: disable=protected-access
-      if op_.inputs:
-        raise ValueError("The newly transformed op should not have "
-                         "any inputs yet: {}".format(op_.name))
-      inputs_ = [self._transformed_t(info, t) for t in op.inputs]
-      for t in inputs_:
-        op_._add_input(t)
-
       # Finalize original op.
+      # TODO(fkp): Stop worrying about _original_op and remove this code?
+      # pylint: disable=protected-access
       if op._original_op:
-        original_op = info.transform_original_op_handler(info, op._original_op)
+        original_op = self.transform_original_op_handler(info, op._original_op)
         if original_op is None:
           logging.debug("Could not find original op for: %s", op_.name)
         else:
           op_._original_op = original_op
+      # pylint: enable=protected-access
 
       # Finalize control inputs:
       control_inputs_ = [self.transform_control_input_handler(info, ci)
@@ -525,19 +549,38 @@ class Transformer(object):
 
     return sgv_.remap(input_map_, output_map_)
 
-  def _transformed_t(self, info, t):
+  def _transformed_t(self, info, t, consumer_op):
     """Return tre transformed tensor of `t`."""
-    if t not in info.transformed_ts:
-      # If op is not in the subgraph.
-      if t in info.sgv_inputs_set:
-        # t is an input of the subgraph.
-        return self.transform_external_input_handler(info, t)
+    if t in info.transformed_ts:
+      # If op is in the subgraph, just return its transformed counterpart.
+      return info.transformed_ts[t]
+
+    if t in info.sgv_inputs_set:
+      # `t` is an input of the subgraph.
+      return self.transform_external_input_handler(info, t)
+    elif t.op in info.ops:
+      # `t` is an internal tensor but is not transformed yet because it
+      # belongs to a graph cycle.
+      logging.debug("Cyclic tensor: t.name = %s", t.name)
+      # Try to find an existing tensor we can use for now,
+      # otherwise create one. We'll rewire this later.
+      if consumer_op.type == "Merge":
+        first_input = consumer_op.inputs[0]
+        tmp_t_ = self._transformed_t(info, first_input, consumer_op)
+      elif t.op.type == "Enter":
+        enter_input = t.op.inputs[0]
+        tmp_t_ = self._transformed_t(info, enter_input, consumer_op)
       else:
-        # t is a hidden input of the subgraph.
-        return self.transform_external_hidden_input_handler(info, t)
+        with info.graph_.as_default():
+          tmp_t_ = util.make_placeholder_from_tensor(t, scope=info.scope_,
+                                                     prefix="geph_tmp")
+        logging.debug("Created temporary placeholder: %s.", tmp_t_.name)
+      # Register as temporary and return.
+      info.tmp_cyclic_ts.append((t, tmp_t_, consumer_op))
+      return tmp_t_
     else:
-      # If op is in the subgraph, just return its transformed.
-      return info.transformed_ts[t]
+      # `t` is a hidden input of the subgraph.
+      return self.transform_external_hidden_input_handler(info, t)
 
 
 def copy(sgv, dst_graph=None, dst_scope="", src_scope="",
@@ -624,6 +667,40 @@ def copy_with_input_replacements(sgv, replacement_ts,
       sgv, dst_graph, dst_scope, src_scope, reuse_dst_scope=reuse_dst_scope)
 
 
+def _add_control_flow_ops(ops, control_ios):
+  """Complete `ops` so that the tranformed graph is valid.
+
+  Partially copying a graph can lead to a malformed graph. For instance,
+  copying half of a while construct is likely to result in an invalid graph.
+  This function attempts to add missing ops so that the transformation result
+  in a valid graph.
+
+  Args:
+    ops: list of ops (modifed in-place).
+    control_ios: object created by a call to `util.ControlOutputs`.
+  """
+  # Find while contexts.
+  control_flow_contexts = set()
+  for op in ops:
+    cfc = op._control_flow_context  # pylint: disable=protected-access
+    if cfc:
+      control_flow_contexts.add(cfc)
+  # Find new ops.
+  new_ops = []
+  for cfc in control_flow_contexts:
+    if cfc.IsWhileContext():
+      new_ops += select.get_walks_intersection_ops(
+          [enter_t.op for enter_t in cfc.loop_enters],
+          [exit_t.op for exit_t in cfc.loop_exits],
+          control_ios=control_ios)
+  # Add new ops.
+  new_ops_set = set(new_ops)
+  ops_set = frozenset(ops)
+  for op in new_ops_set:
+    if op not in ops_set:
+      ops.append(op)
+
+
 def graph_replace(target_ts, replacement_ts, dst_scope="",
                   src_scope="", reuse_dst_scope=False):
   """Create a new graph which compute the targets from the replaced Tensors.
@@ -657,8 +734,13 @@ def graph_replace(target_ts, replacement_ts, dst_scope="",
                                           control_ios=control_ios)
   if not ops:
     raise ValueError("Targets and replacements are not connected!")
+
+  # Complete ops to avoid malformed control flow.
+  # TODO(fkp): Consider moving this function deeper (in the transformer?).
+  _add_control_flow_ops(ops, control_ios)
+
   # Create a copy of the relevant subgraph
-  _, info = copy_with_input_replacements(
+  unused_sgv_, info = copy_with_input_replacements(
       ops, replacement_ts, None, dst_scope, src_scope, reuse_dst_scope)
   # Return the transformed targets but keep the original if the transformed
   # counterpart cannot be found
diff --git a/tensorflow/contrib/graph_editor/util.py b/tensorflow/contrib/graph_editor/util.py
index 30bc33b9ee42ba78bc7307c67c0fc0af9f3356ef..584f4509ccc0aab30edc2be3bad7a9cb938d6e6a 100644
--- a/tensorflow/contrib/graph_editor/util.py
+++ b/tensorflow/contrib/graph_editor/util.py
@@ -38,6 +38,11 @@ __all__ = [
 ]
 
 
+# The graph editor sometimes need to create placeholders, they are named
+# "geph_*". "geph" stands for Graph-Editor PlaceHolder.
+_DEFAULT_PLACEHOLDER_PREFIX = "geph"
+
+
 def concatenate_unique(la, lb):
   """Add all the elements of `lb` to `la` if they are not there already.
 
@@ -405,7 +410,7 @@ def scope_basename(scope):
   return scope[slash + 1:]
 
 
-def placeholder_name(t=None, scope=None):
+def placeholder_name(t=None, scope=None, prefix=_DEFAULT_PLACEHOLDER_PREFIX):
   """Create placeholder name for the graph editor.
 
   Args:
@@ -413,6 +418,7 @@ def placeholder_name(t=None, scope=None):
       on
     scope: absolute scope with which to prefix the placeholder's name. None
       means that the scope of t is preserved. "" means the root scope.
+    prefix: placeholder name prefix.
   Returns:
     A new placeholder name prefixed by "geph". Note that "geph" stands for
       Graph Editor PlaceHolder. This convention allows to quickly identify the
@@ -430,19 +436,20 @@ def placeholder_name(t=None, scope=None):
     if scope is None:
       scope = op_dirname
 
-    if op_basename.startswith("geph__"):
+    if op_basename.startswith("{}__".format(prefix)):
       ph_name = op_basename
     else:
-      ph_name = "geph__{}_{}".format(op_basename, t.value_index)
+      ph_name = "{}__{}_{}".format(prefix, op_basename, t.value_index)
 
     return scope + ph_name
   else:
     if scope is None:
       scope = ""
-    return scope + "geph"
+    return "{}{}".format(scope, prefix)
 
 
-def make_placeholder_from_tensor(t, scope=None):
+def make_placeholder_from_tensor(t, scope=None,
+                                 prefix=_DEFAULT_PLACEHOLDER_PREFIX):
   """Create a `tf.placeholder` for the Graph Editor.
 
   Note that the correct graph scope must be set by the calling function.
@@ -452,17 +459,19 @@ def make_placeholder_from_tensor(t, scope=None):
       (see function placeholder_name).
     scope: absolute scope within which to create the placeholder. None
       means that the scope of `t` is preserved. `""` means the root scope.
+    prefix: placeholder name prefix.
   Returns:
     A newly created `tf.placeholder`.
   Raises:
     TypeError: if `t` is not `None` or a `tf.Tensor`.
   """
   return tf_array_ops.placeholder(
-      dtype=t.dtype, shape=t.get_shape(), name=placeholder_name(
-          t, scope=scope))
+      dtype=t.dtype, shape=t.get_shape(),
+      name=placeholder_name(t, scope=scope, prefix=prefix))
 
 
-def make_placeholder_from_dtype_and_shape(dtype, shape=None, scope=None):
+def make_placeholder_from_dtype_and_shape(dtype, shape=None, scope=None,
+                                          prefix=_DEFAULT_PLACEHOLDER_PREFIX):
   """Create a tf.placeholder for the Graph Editor.
 
   Note that the correct graph scope must be set by the calling function.
@@ -474,11 +483,13 @@ def make_placeholder_from_dtype_and_shape(dtype, shape=None, scope=None):
     shape: the tensor shape (optional).
     scope: absolute scope within which to create the placeholder. None
       means that the scope of t is preserved. "" means the root scope.
+    prefix: placeholder name prefix.
   Returns:
     A newly created tf.placeholder.
   """
   return tf_array_ops.placeholder(
-      dtype=dtype, shape=shape, name=placeholder_name(scope=scope))
+      dtype=dtype, shape=shape,
+      name=placeholder_name(scope=scope, prefix=prefix))
 
 
 _INTERNAL_VARIABLE_RE = re.compile(r"^__\w+__$")
diff --git a/tensorflow/contrib/grid_rnn/BUILD b/tensorflow/contrib/grid_rnn/BUILD
index d601a1ec6f7a219bcd461d819ab2dfc64135a3ae..d0b44640667010b58c017d933d50ae5f87e8b275 100644
--- a/tensorflow/contrib/grid_rnn/BUILD
+++ b/tensorflow/contrib/grid_rnn/BUILD
@@ -41,15 +41,3 @@ cuda_py_tests(
         "//tensorflow/python:variables",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/hooks/BUILD b/tensorflow/contrib/hooks/BUILD
index 1b528d7afc1112f5dc0667ae299ade02bc8fd04b..d65b2d6026dd89959aa62b57e07b073eef84572c 100644
--- a/tensorflow/contrib/hooks/BUILD
+++ b/tensorflow/contrib/hooks/BUILD
@@ -23,14 +23,3 @@ py_library(
         "//tensorflow/python:util",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/hvx/clock_cycle_profiling/BUILD b/tensorflow/contrib/hvx/clock_cycle_profiling/BUILD
index 324035100df366b80f57af9052c4bd935655b248..e39c60b252a1b49a68d51302fff47734869dddfe 100644
--- a/tensorflow/contrib/hvx/clock_cycle_profiling/BUILD
+++ b/tensorflow/contrib/hvx/clock_cycle_profiling/BUILD
@@ -13,18 +13,6 @@ exports_files(["LICENSE"])
 
 package(default_visibility = ["//visibility:public"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tf_cc_binary(
     name = "clock_cycle_profiling",
     testonly = 1,
diff --git a/tensorflow/contrib/hvx/hvx_ops_support_checker/BUILD b/tensorflow/contrib/hvx/hvx_ops_support_checker/BUILD
index 909dc396a33b6fef1b2d51c3f52fab7782fc8ea5..0081fb61770075a2c36e92f65e01126f657edeb4 100644
--- a/tensorflow/contrib/hvx/hvx_ops_support_checker/BUILD
+++ b/tensorflow/contrib/hvx/hvx_ops_support_checker/BUILD
@@ -10,17 +10,6 @@ exports_files(["LICENSE"])
 
 load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
-
 tf_cc_binary(
     name = "hvx_ops_support_checker",
     testonly = 1,
diff --git a/tensorflow/contrib/image/BUILD b/tensorflow/contrib/image/BUILD
index 3ff02e085ee63fabf42b3cc4389f4605455f3800..da450480b30b548484e69c61c85667d6dd390417 100755
--- a/tensorflow/contrib/image/BUILD
+++ b/tensorflow/contrib/image/BUILD
@@ -78,7 +78,10 @@ tf_custom_op_py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        ":dense_image_warp_py",
         ":image_ops",
+        ":interpolate_spline_py",
+        ":sparse_image_warp_py",
         "//tensorflow/contrib/util:util_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:common_shapes",
@@ -194,6 +197,117 @@ cuda_py_test(
     ],
 )
 
+py_library(
+    name = "dense_image_warp_py",
+    srcs = [
+        "python/ops/dense_image_warp.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:util",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_library(
+    name = "interpolate_spline_py",
+    srcs = [
+        "python/ops/interpolate_spline.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:util",
+    ],
+)
+
+py_library(
+    name = "sparse_image_warp_py",
+    srcs = [
+        "python/ops/sparse_image_warp.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":dense_image_warp_py",
+        ":interpolate_spline_py",
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:util",
+    ],
+)
+
+cuda_py_test(
+    name = "sparse_image_warp_test",
+    size = "medium",
+    srcs = ["python/kernel_tests/sparse_image_warp_test.py"],
+    additional_deps = [
+        ":sparse_image_warp_py",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:clip_ops",
+        "//tensorflow/python:io_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:image_ops",
+        "//tensorflow/python:variables",
+        "//tensorflow/core:protos_all_py",
+    ],
+    data = [":sparse_image_warp_test_data"],
+    tags = ["no_pip"],
+)
+
+filegroup(
+    name = "sparse_image_warp_test_data",
+    srcs = glob(["python/kernel_tests/test_data/*.png"]),
+)
+
+cuda_py_test(
+    name = "dense_image_warp_test",
+    size = "medium",
+    srcs = ["python/kernel_tests/dense_image_warp_test.py"],
+    additional_deps = [
+        ":dense_image_warp_py",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:clip_ops",
+        "//tensorflow/python:io_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:image_ops",
+        "//tensorflow/python:variables",
+        "//tensorflow/core:protos_all_py",
+    ],
+)
+
+cuda_py_test(
+    name = "interpolate_spline_test",
+    size = "medium",
+    srcs = ["python/kernel_tests/interpolate_spline_test.py"],
+    additional_deps = [
+        ":interpolate_spline_py",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:clip_ops",
+        "//tensorflow/python:io_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:image_ops",
+        "//tensorflow/python:variables",
+        "//tensorflow/core:protos_all_py",
+    ],
+)
+
 tf_py_test(
     name = "segmentation_test",
     size = "medium",
@@ -270,15 +384,3 @@ cuda_py_test(
         "//tensorflow/python:platform_test",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/image/__init__.py b/tensorflow/contrib/image/__init__.py
index cc8ed117ba2edcc7a53e609381166f17a2fbb45e..e982030bc8959309e72d0f4e02b9755c48535a10 100755
--- a/tensorflow/contrib/image/__init__.py
+++ b/tensorflow/contrib/image/__init__.py
@@ -30,6 +30,9 @@ projective transforms (including rotation) are supported.
 @@transform
 @@translate
 @@translations_to_projective_transforms
+@@dense_image_warp
+@@interpolate_spline
+@@sparse_image_warp
 
 ## Image Segmentation `Ops`
 
@@ -47,6 +50,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.contrib.image.python.ops.dense_image_warp import dense_image_warp
+
 from tensorflow.contrib.image.python.ops.distort_image_ops import adjust_hsv_in_yiq
 from tensorflow.contrib.image.python.ops.distort_image_ops import random_hsv_in_yiq
 
@@ -57,7 +62,9 @@ from tensorflow.contrib.image.python.ops.image_ops import rotate
 from tensorflow.contrib.image.python.ops.image_ops import transform
 from tensorflow.contrib.image.python.ops.image_ops import translate
 from tensorflow.contrib.image.python.ops.image_ops import translations_to_projective_transforms
+from tensorflow.contrib.image.python.ops.interpolate_spline import interpolate_spline
 from tensorflow.contrib.image.python.ops.single_image_random_dot_stereograms import single_image_random_dot_stereograms
+from tensorflow.contrib.image.python.ops.sparse_image_warp import sparse_image_warp
 
 from tensorflow.python.util.all_util import remove_undocumented
 
diff --git a/tensorflow/contrib/image/python/kernel_tests/dense_image_warp_test.py b/tensorflow/contrib/image/python/kernel_tests/dense_image_warp_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a58b6a247ed6ae252db25a12f1e47c08c9a5c147
--- /dev/null
+++ b/tensorflow/contrib/image/python/kernel_tests/dense_image_warp_test.py
@@ -0,0 +1,267 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for dense_image_warp."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+import numpy as np
+
+from tensorflow.contrib.image.python.ops import dense_image_warp
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradients
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import googletest
+
+from tensorflow.python.training import adam
+
+
+class DenseImageWarpTest(test_util.TensorFlowTestCase):
+
+  def setUp(self):
+    np.random.seed(0)
+
+  def test_interpolate_small_grid_ij(self):
+    grid = constant_op.constant(
+        [[0., 1., 2.], [3., 4., 5.], [6., 7., 8.]], shape=[1, 3, 3, 1])
+    query_points = constant_op.constant(
+        [[0., 0.], [1., 0.], [2., 0.5], [1.5, 1.5]], shape=[1, 4, 2])
+    expected_results = np.reshape(np.array([0., 3., 6.5, 6.]), [1, 4, 1])
+
+    interp = dense_image_warp._interpolate_bilinear(grid, query_points)
+
+    with self.test_session() as sess:
+      predicted = sess.run(interp)
+      self.assertAllClose(expected_results, predicted)
+
+  def test_interpolate_small_grid_xy(self):
+    grid = constant_op.constant(
+        [[0., 1., 2.], [3., 4., 5.], [6., 7., 8.]], shape=[1, 3, 3, 1])
+    query_points = constant_op.constant(
+        [[0., 0.], [0., 1.], [0.5, 2.0], [1.5, 1.5]], shape=[1, 4, 2])
+    expected_results = np.reshape(np.array([0., 3., 6.5, 6.]), [1, 4, 1])
+
+    interp = dense_image_warp._interpolate_bilinear(
+        grid, query_points, indexing='xy')
+
+    with self.test_session() as sess:
+      predicted = sess.run(interp)
+      self.assertAllClose(expected_results, predicted)
+
+  def test_interpolate_small_grid_batched(self):
+    grid = constant_op.constant(
+        [[[0., 1.], [3., 4.]], [[5., 6.], [7., 8.]]], shape=[2, 2, 2, 1])
+    query_points = constant_op.constant([[[0., 0.], [1., 0.], [0.5, 0.5]],
+                                         [[0.5, 0.], [1., 0.], [1., 1.]]])
+    expected_results = np.reshape(
+        np.array([[0., 3., 2.], [6., 7., 8.]]), [2, 3, 1])
+
+    interp = dense_image_warp._interpolate_bilinear(grid, query_points)
+
+    with self.test_session() as sess:
+      predicted = sess.run(interp)
+      self.assertAllClose(expected_results, predicted)
+
+  def get_image_and_flow_placeholders(self, shape, image_type, flow_type):
+    batch_size, height, width, numchannels = shape
+    image_shape = [batch_size, height, width, numchannels]
+    flow_shape = [batch_size, height, width, 2]
+
+    tf_type = {
+        'float16': dtypes.half,
+        'float32': dtypes.float32,
+        'float64': dtypes.float64
+    }
+
+    image = array_ops.placeholder(dtype=tf_type[image_type], shape=image_shape)
+
+    flows = array_ops.placeholder(dtype=tf_type[flow_type], shape=flow_shape)
+    return image, flows
+
+  def get_random_image_and_flows(self, shape, image_type, flow_type):
+    batch_size, height, width, numchannels = shape
+    image_shape = [batch_size, height, width, numchannels]
+    image = np.random.normal(size=image_shape)
+    flow_shape = [batch_size, height, width, 2]
+    flows = np.random.normal(size=flow_shape) * 3
+    return image.astype(image_type), flows.astype(flow_type)
+
+  def assert_correct_interpolation_value(self,
+                                         image,
+                                         flows,
+                                         pred_interpolation,
+                                         batch_index,
+                                         y_index,
+                                         x_index,
+                                         low_precision=False):
+    """Assert that the tf interpolation matches hand-computed value."""
+
+    height = image.shape[1]
+    width = image.shape[2]
+    displacement = flows[batch_index, y_index, x_index, :]
+    float_y = y_index - displacement[0]
+    float_x = x_index - displacement[1]
+    floor_y = max(min(height - 2, math.floor(float_y)), 0)
+    floor_x = max(min(width - 2, math.floor(float_x)), 0)
+    ceil_y = floor_y + 1
+    ceil_x = floor_x + 1
+
+    alpha_y = min(max(0.0, float_y - floor_y), 1.0)
+    alpha_x = min(max(0.0, float_x - floor_x), 1.0)
+
+    floor_y = int(floor_y)
+    floor_x = int(floor_x)
+    ceil_y = int(ceil_y)
+    ceil_x = int(ceil_x)
+
+    top_left = image[batch_index, floor_y, floor_x, :]
+    top_right = image[batch_index, floor_y, ceil_x, :]
+    bottom_left = image[batch_index, ceil_y, floor_x, :]
+    bottom_right = image[batch_index, ceil_y, ceil_x, :]
+
+    interp_top = alpha_x * (top_right - top_left) + top_left
+    interp_bottom = alpha_x * (bottom_right - bottom_left) + bottom_left
+    interp = alpha_y * (interp_bottom - interp_top) + interp_top
+    atol = 1e-6
+    rtol = 1e-6
+    if low_precision:
+      atol = 1e-2
+      rtol = 1e-3
+    self.assertAllClose(
+        interp,
+        pred_interpolation[batch_index, y_index, x_index, :],
+        atol=atol,
+        rtol=rtol)
+
+  def check_zero_flow_correctness(self, shape, image_type, flow_type):
+    """Assert using zero flows doesn't change the input image."""
+
+    image, flows = self.get_image_and_flow_placeholders(shape, image_type,
+                                                        flow_type)
+    interp = dense_image_warp.dense_image_warp(image, flows)
+
+    with self.test_session() as sess:
+      rand_image, rand_flows = self.get_random_image_and_flows(
+          shape, image_type, flow_type)
+      rand_flows *= 0
+
+      predicted_interpolation = sess.run(
+          interp, feed_dict={
+              image: rand_image,
+              flows: rand_flows
+          })
+      self.assertAllClose(rand_image, predicted_interpolation)
+
+  def test_zero_flows(self):
+    """Apply check_zero_flow_correctness() for a few sizes and types."""
+
+    shapes_to_try = [[3, 4, 5, 6], [1, 2, 2, 1]]
+    for shape in shapes_to_try:
+      self.check_zero_flow_correctness(
+          shape, image_type='float32', flow_type='float32')
+
+  def check_interpolation_correctness(self,
+                                      shape,
+                                      image_type,
+                                      flow_type,
+                                      num_probes=5):
+    """Interpolate, and then assert correctness for a few query locations."""
+
+    image, flows = self.get_image_and_flow_placeholders(shape, image_type,
+                                                        flow_type)
+    interp = dense_image_warp.dense_image_warp(image, flows)
+    low_precision = image_type == 'float16' or flow_type == 'float16'
+    with self.test_session() as sess:
+      rand_image, rand_flows = self.get_random_image_and_flows(
+          shape, image_type, flow_type)
+
+      pred_interpolation = sess.run(
+          interp, feed_dict={
+              image: rand_image,
+              flows: rand_flows
+          })
+
+      for _ in range(num_probes):
+        batch_index = np.random.randint(0, shape[0])
+        y_index = np.random.randint(0, shape[1])
+        x_index = np.random.randint(0, shape[2])
+
+        self.assert_correct_interpolation_value(
+            rand_image,
+            rand_flows,
+            pred_interpolation,
+            batch_index,
+            y_index,
+            x_index,
+            low_precision=low_precision)
+
+  def test_interpolation(self):
+    """Apply check_interpolation_correctness() for a few sizes and types."""
+
+    shapes_to_try = [[3, 4, 5, 6], [1, 5, 5, 3], [1, 2, 2, 1]]
+    for im_type in ['float32', 'float64', 'float16']:
+      for flow_type in ['float32', 'float64', 'float16']:
+        for shape in shapes_to_try:
+          self.check_interpolation_correctness(shape, im_type, flow_type)
+
+  def test_gradients_exist(self):
+    """Check that backprop can run.
+
+    The correctness of the gradients is assumed, since the forward propagation
+    is tested to be correct and we only use built-in tf ops.
+    However, we perform a simple test to make sure that backprop can actually
+    run. We treat the flows as a tf.Variable and optimize them to minimize
+    the difference between the interpolated image and the input image.
+    """
+
+    batch_size, height, width, numchannels = [4, 5, 6, 7]
+    image_shape = [batch_size, height, width, numchannels]
+    image = random_ops.random_normal(image_shape)
+    flow_shape = [batch_size, height, width, 2]
+    init_flows = np.float32(np.random.normal(size=flow_shape) * 0.25)
+    flows = variables.Variable(init_flows)
+
+    interp = dense_image_warp.dense_image_warp(image, flows)
+    loss = math_ops.reduce_mean(math_ops.square(interp - image))
+
+    optimizer = adam.AdamOptimizer(1.0)
+    grad = gradients.gradients(loss, [flows])
+    opt_func = optimizer.apply_gradients(zip(grad, [flows]))
+    init_op = variables.global_variables_initializer()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for _ in range(10):
+        sess.run(opt_func)
+
+  def test_size_exception(self):
+    """Make sure it throws an exception for images that are too small."""
+
+    shape = [1, 2, 1, 1]
+    msg = 'Should have raised an exception for invalid image size'
+    with self.assertRaises(ValueError, msg=msg):
+      self.check_interpolation_correctness(shape, 'float32', 'float32')
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/contrib/image/python/kernel_tests/interpolate_spline_test.py b/tensorflow/contrib/image/python/kernel_tests/interpolate_spline_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1939caaa2d8586413cf9ecba6ce73cf64910d6fc
--- /dev/null
+++ b/tensorflow/contrib/image/python/kernel_tests/interpolate_spline_test.py
@@ -0,0 +1,264 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for interpolate_spline."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from scipy import interpolate as sc_interpolate
+
+from tensorflow.contrib.image.python.ops import interpolate_spline
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+
+from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import gradients
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import googletest
+
+from tensorflow.python.training import momentum
+
+
+class _InterpolationProblem(object):
+  """Abstract class for interpolation problem descriptions."""
+
+  def get_problem(self, optimizable=False, extrapolate=True, dtype='float32'):
+    """Make data for an interpolation problem where all x vectors are n-d.
+
+    Args:
+      optimizable: If True, then make train_points a tf.Variable.
+      extrapolate: If False, then clamp the query_points values to be within
+      the max and min of train_points.
+      dtype: The data type to use.
+
+    Returns:
+      query_points, query_values, train_points, train_values: training and
+      test tensors for interpolation problem
+    """
+
+    # The values generated here depend on a seed of 0.
+    np.random.seed(0)
+
+    batch_size = 1
+    num_training_points = 10
+    num_query_points = 4
+
+    init_points = np.random.uniform(
+        size=[batch_size, num_training_points, self.DATA_DIM])
+
+    init_points = init_points.astype(dtype)
+    train_points = (
+        variables.Variable(init_points)
+        if optimizable else constant_op.constant(init_points))
+    train_values = self.tf_function(train_points)
+
+    query_points_np = np.random.uniform(
+        size=[batch_size, num_query_points, self.DATA_DIM])
+    query_points_np = query_points_np.astype(dtype)
+    if not extrapolate:
+      query_points_np = np.clip(query_points_np, np.min(init_points),
+                                np.max(init_points))
+
+    query_points = constant_op.constant(query_points_np)
+    query_values = self.np_function(query_points_np)
+
+    return query_points, query_values, train_points, train_values
+
+
+class _QuadraticPlusSinProblem1D(_InterpolationProblem):
+  """1D interpolation problem used for regression testing."""
+  DATA_DIM = 1
+  HARDCODED_QUERY_VALUES = {
+      (1.0, 0.0): [6.2647187603, -7.84362604077, -5.63690142322, 1.42928896387],
+      (1.0,
+       0.01): [6.77688289946, -8.02163669853, -5.79491157027, 1.4063285693],
+      (2.0,
+       0.0): [8.67110264937, -8.41281390883, -5.80190044693, 1.50155606059],
+      (2.0,
+       0.01): [6.70797816797, -7.49709587663, -5.28965776238, 1.52284731741],
+      (3.0,
+       0.0): [9.37691802935, -8.50390141515, -5.80786417426, 1.63467762122],
+      (3.0,
+       0.01): [4.47106304758, -5.71266128361, -3.92529303296, 1.86755293857],
+      (4.0,
+       0.0): [9.58172461111, -8.51432104771, -5.80967675388, 1.63361164256],
+      (4.0, 0.01): [
+          -3.87902711352, -0.0253462273846, 1.79857618022, -0.769339675725
+      ]
+  }
+
+  def np_function(self, x):
+    """Takes np array, evaluates the test function, and returns np array."""
+    return np.sum(
+        np.power((x - 0.5), 3) - 0.25 * x + 10 * np.sin(x * 10),
+        axis=2,
+        keepdims=True)
+
+  def tf_function(self, x):
+    """Takes tf tensor, evaluates the test function,  and returns tf tensor."""
+    return math_ops.reduce_mean(
+        math_ops.pow((x - 0.5), 3) - 0.25 * x + 10 * math_ops.sin(x * 10),
+        2,
+        keepdims=True)
+
+
+class _QuadraticPlusSinProblemND(_InterpolationProblem):
+  """3D interpolation problem used for regression testing."""
+
+  DATA_DIM = 3
+  HARDCODED_QUERY_VALUES = {
+      (1.0, 0.0): [1.06609663962, 1.28894849357, 1.10882405595, 1.63966936885],
+      (1.0, 0.01): [1.03123780748, 1.2952930985, 1.10366822954, 1.65265118569],
+      (2.0, 0.0): [0.627787735064, 1.43802857251, 1.00194632358, 1.91667538215],
+      (2.0, 0.01): [0.730159985046, 1.41702471595, 1.0065827217, 1.85758519312],
+      (3.0, 0.0): [0.350460417862, 1.67223539464, 1.00475331246, 2.31580322491],
+      (3.0,
+       0.01): [0.624557250556, 1.63138876667, 0.976588193162, 2.12511237866],
+      (4.0,
+       0.0): [0.898129669986, 1.24434133638, -0.938056116931, 1.59910338833],
+      (4.0,
+       0.01): [0.0930360338179, -3.38791305538, -1.00969032567, 0.745535080382],
+  }
+
+  def np_function(self, x):
+    """Takes np array, evaluates the test function, and returns np array."""
+    return np.sum(
+        np.square(x - 0.5) + 0.25 * x + 1 * np.sin(x * 15),
+        axis=2,
+        keepdims=True)
+
+  def tf_function(self, x):
+    """Takes tf tensor, evaluates the test function,  and returns tf tensor."""
+    return math_ops.reduce_sum(
+        math_ops.square(x - 0.5) + 0.25 * x + 1 * math_ops.sin(x * 15),
+        2,
+        keepdims=True)
+
+
+class InterpolateSplineTest(test_util.TensorFlowTestCase):
+
+  def test_1d_linear_interpolation(self):
+    """For 1d linear interpolation, we can compare directly to scipy."""
+
+    tp = _QuadraticPlusSinProblem1D()
+    (query_points, _, train_points, train_values) = tp.get_problem(
+        extrapolate=False, dtype='float64')
+    interpolation_order = 1
+
+    with ops.name_scope('interpolator'):
+      interpolator = interpolate_spline.interpolate_spline(
+          train_points, train_values, query_points, interpolation_order)
+      with self.test_session() as sess:
+        fetches = [query_points, train_points, train_values, interpolator]
+        query_points_, train_points_, train_values_, interp_ = sess.run(fetches)
+
+        # Just look at the first element of the minibatch.
+        # Also, trim the final singleton dimension.
+        interp_ = interp_[0, :, 0]
+        query_points_ = query_points_[0, :, 0]
+        train_points_ = train_points_[0, :, 0]
+        train_values_ = train_values_[0, :, 0]
+
+        # Compute scipy interpolation.
+        scipy_interp_function = sc_interpolate.interp1d(
+            train_points_, train_values_, kind='linear')
+
+        scipy_interpolation = scipy_interp_function(query_points_)
+        scipy_interpolation_on_train = scipy_interp_function(train_points_)
+
+        # Even with float64 precision, the interpolants disagree with scipy a
+        # bit due to the fact that we add the EPSILON to prevent sqrt(0), etc.
+        tol = 1e-3
+
+        self.assertAllClose(
+            train_values_, scipy_interpolation_on_train, atol=tol, rtol=tol)
+        self.assertAllClose(interp_, scipy_interpolation, atol=tol, rtol=tol)
+
+  def test_1d_interpolation(self):
+    """Regression test for interpolation with 1-D points."""
+
+    tp = _QuadraticPlusSinProblem1D()
+    (query_points, _, train_points,
+     train_values) = tp.get_problem(dtype='float64')
+
+    for order in (1, 2, 3):
+      for reg_weight in (0, 0.01):
+        interpolator = interpolate_spline.interpolate_spline(
+            train_points, train_values, query_points, order, reg_weight)
+
+        target_interpolation = tp.HARDCODED_QUERY_VALUES[(order, reg_weight)]
+        target_interpolation = np.array(target_interpolation)
+        with self.test_session() as sess:
+          interp_val = sess.run(interpolator)
+          self.assertAllClose(interp_val[0, :, 0], target_interpolation)
+
+  def test_nd_linear_interpolation(self):
+    """Regression test for interpolation with N-D points."""
+
+    tp = _QuadraticPlusSinProblemND()
+    (query_points, _, train_points,
+     train_values) = tp.get_problem(dtype='float64')
+
+    for order in (1, 2, 3):
+      for reg_weight in (0, 0.01):
+        interpolator = interpolate_spline.interpolate_spline(
+            train_points, train_values, query_points, order, reg_weight)
+
+        target_interpolation = tp.HARDCODED_QUERY_VALUES[(order, reg_weight)]
+        target_interpolation = np.array(target_interpolation)
+        with self.test_session() as sess:
+          interp_val = sess.run(interpolator)
+          self.assertAllClose(interp_val[0, :, 0], target_interpolation)
+
+  def test_interpolation_gradient(self):
+    """Make sure that backprop can run. Correctness of gradients is assumed.
+
+    Here, we create a use a small 'training' set and a more densely-sampled
+    set of query points, for which we know the true value in advance. The goal
+    is to choose x locations for the training data such that interpolating using
+    this training data yields the best reconstruction for the function
+    values at the query points. The training data locations are optimized
+    iteratively using gradient descent.
+    """
+    tp = _QuadraticPlusSinProblemND()
+    (query_points, query_values, train_points,
+     train_values) = tp.get_problem(optimizable=True)
+
+    regularization = 0.001
+    for interpolation_order in (1, 2, 3, 4):
+      interpolator = interpolate_spline.interpolate_spline(
+          train_points, train_values, query_points, interpolation_order,
+          regularization)
+
+      loss = math_ops.reduce_mean(math_ops.square(query_values - interpolator))
+
+      optimizer = momentum.MomentumOptimizer(0.001, 0.9)
+      grad = gradients.gradients(loss, [train_points])
+      grad, _ = clip_ops.clip_by_global_norm(grad, 1.0)
+      opt_func = optimizer.apply_gradients(zip(grad, [train_points]))
+      init_op = variables.global_variables_initializer()
+
+      with self.test_session() as sess:
+        sess.run(init_op)
+        for _ in range(100):
+          sess.run([loss, opt_func])
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/contrib/image/python/kernel_tests/sparse_image_warp_test.py b/tensorflow/contrib/image/python/kernel_tests/sparse_image_warp_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0135c66e293693345c3da7fdb21e28ca6d160154
--- /dev/null
+++ b/tensorflow/contrib/image/python/kernel_tests/sparse_image_warp_test.py
@@ -0,0 +1,254 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for sparse_image_warp."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.image.python.ops import sparse_image_warp
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import gradients
+from tensorflow.python.ops import image_ops
+from tensorflow.python.ops import io_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import googletest
+from tensorflow.python.platform import test
+
+from tensorflow.python.training import momentum
+
+
+class SparseImageWarpTest(test_util.TensorFlowTestCase):
+
+  def setUp(self):
+    np.random.seed(0)
+
+  def testGetBoundaryLocations(self):
+    image_height = 11
+    image_width = 11
+    num_points_per_edge = 4
+    locs = sparse_image_warp._get_boundary_locations(image_height, image_width,
+                                                     num_points_per_edge)
+    num_points = locs.shape[0]
+    self.assertEqual(num_points, 4 + 4 * num_points_per_edge)
+    locs = [(locs[i, 0], locs[i, 1]) for i in range(num_points)]
+    for i in (0, image_height - 1):
+      for j in (0, image_width - 1):
+        self.assertIn((i, j), locs, '{},{} not in the locations'.format(i, j))
+
+      for i in (2, 4, 6, 8):
+        for j in (0, image_width - 1):
+          self.assertIn((i, j), locs, '{},{} not in the locations'.format(i, j))
+
+      for i in (0, image_height - 1):
+        for j in (2, 4, 6, 8):
+          self.assertIn((i, j), locs, '{},{} not in the locations'.format(i, j))
+
+  def testGetGridLocations(self):
+    image_height = 5
+    image_width = 3
+    grid = sparse_image_warp._get_grid_locations(image_height, image_width)
+    for i in range(image_height):
+      for j in range(image_width):
+        self.assertEqual(grid[i, j, 0], i)
+        self.assertEqual(grid[i, j, 1], j)
+
+  def testZeroShift(self):
+    """Run assertZeroShift for various hyperparameters."""
+    for order in (1, 2):
+      for regularization in (0, 0.01):
+        for num_boundary_points in (0, 1):
+          self.assertZeroShift(order, regularization, num_boundary_points)
+
+  def assertZeroShift(self, order, regularization, num_boundary_points):
+    """Check that warping with zero displacements doesn't change the image."""
+    batch_size = 1
+    image_height = 4
+    image_width = 4
+    channels = 3
+
+    image = np.random.uniform(
+        size=[batch_size, image_height, image_width, channels])
+
+    input_image_op = constant_op.constant(np.float32(image))
+
+    control_point_locations = [[1., 1.], [2., 2.], [2., 1.]]
+    control_point_locations = constant_op.constant(
+        np.float32(np.expand_dims(control_point_locations, 0)))
+
+    control_point_displacements = np.zeros(
+        control_point_locations.shape.as_list())
+    control_point_displacements = constant_op.constant(
+        np.float32(control_point_displacements))
+
+    (warped_image_op, flow_field) = sparse_image_warp.sparse_image_warp(
+        input_image_op,
+        control_point_locations,
+        control_point_locations + control_point_displacements,
+        interpolation_order=order,
+        regularization_weight=regularization,
+        num_boundary_points=num_boundary_points)
+
+    with self.test_session() as sess:
+      warped_image, input_image, _ = sess.run(
+          [warped_image_op, input_image_op, flow_field])
+
+      self.assertAllClose(warped_image, input_image)
+
+  def testMoveSinglePixel(self):
+    """Run assertMoveSinglePixel for various hyperparameters and data types."""
+    for order in (1, 2):
+      for num_boundary_points in (1, 2):
+        for type_to_use in (dtypes.float32, dtypes.float64):
+          self.assertMoveSinglePixel(order, num_boundary_points, type_to_use)
+
+  def assertMoveSinglePixel(self, order, num_boundary_points, type_to_use):
+    """Move a single block in a small grid using warping."""
+    batch_size = 1
+    image_height = 7
+    image_width = 7
+    channels = 3
+
+    image = np.zeros([batch_size, image_height, image_width, channels])
+    image[:, 3, 3, :] = 1.0
+    input_image_op = constant_op.constant(image, dtype=type_to_use)
+
+    # Place a control point at the one white pixel.
+    control_point_locations = [[3., 3.]]
+    control_point_locations = constant_op.constant(
+        np.float32(np.expand_dims(control_point_locations, 0)),
+        dtype=type_to_use)
+    # Shift it one pixel to the right.
+    control_point_displacements = [[0., 1.0]]
+    control_point_displacements = constant_op.constant(
+        np.float32(np.expand_dims(control_point_displacements, 0)),
+        dtype=type_to_use)
+
+    (warped_image_op, flow_field) = sparse_image_warp.sparse_image_warp(
+        input_image_op,
+        control_point_locations,
+        control_point_locations + control_point_displacements,
+        interpolation_order=order,
+        num_boundary_points=num_boundary_points)
+
+    with self.test_session() as sess:
+      warped_image, input_image, flow = sess.run(
+          [warped_image_op, input_image_op, flow_field])
+      # Check that it moved the pixel correctly.
+      self.assertAllClose(
+          warped_image[0, 4, 5, :],
+          input_image[0, 4, 4, :],
+          atol=1e-5,
+          rtol=1e-5)
+
+      # Test that there is no flow at the corners.
+      for i in (0, image_height - 1):
+        for j in (0, image_width - 1):
+          self.assertAllClose(
+              flow[0, i, j, :], np.zeros([2]), atol=1e-5, rtol=1e-5)
+
+  def load_image(self, image_file, sess):
+    image_op = image_ops.decode_png(
+        io_ops.read_file(image_file), dtype=dtypes.uint8, channels=4)[:, :, 0:3]
+    return sess.run(image_op)
+
+  def testSmileyFace(self):
+    """Check warping accuracy by comparing to hardcoded warped images."""
+
+    test_data_dir = test.test_src_dir_path('contrib/image/python/'
+                                           'kernel_tests/test_data/')
+    input_file = test_data_dir + 'Yellow_Smiley_Face.png'
+    with self.test_session() as sess:
+      input_image = self.load_image(input_file, sess)
+    control_points = np.asarray([[64, 59], [180 - 64, 59], [39, 111],
+                                 [180 - 39, 111], [90, 143], [58, 134],
+                                 [180 - 58, 134]])  # pyformat: disable
+    control_point_displacements = np.asarray(
+        [[-10.5, 10.5], [10.5, 10.5], [0, 0], [0, 0], [0, -10], [-20, 10.25],
+         [10, 10.75]])
+    control_points_op = constant_op.constant(
+        np.expand_dims(np.float32(control_points[:, [1, 0]]), 0))
+    control_point_displacements_op = constant_op.constant(
+        np.expand_dims(np.float32(control_point_displacements[:, [1, 0]]), 0))
+    float_image = np.expand_dims(np.float32(input_image) / 255, 0)
+    input_image_op = constant_op.constant(float_image)
+
+    for interpolation_order in (1, 2, 3):
+      for num_boundary_points in (0, 1, 4):
+        warp_op, _ = sparse_image_warp.sparse_image_warp(
+            input_image_op,
+            control_points_op,
+            control_points_op + control_point_displacements_op,
+            interpolation_order=interpolation_order,
+            num_boundary_points=num_boundary_points)
+        with self.test_session() as sess:
+          warped_image = sess.run(warp_op)
+          out_image = np.uint8(warped_image[0, :, :, :] * 255)
+          target_file = (
+              test_data_dir +
+              'Yellow_Smiley_Face_Warp-interp' + '-{}-clamp-{}.png'.format(
+                  interpolation_order, num_boundary_points))
+
+          target_image = self.load_image(target_file, sess)
+
+          # Check that the target_image and out_image difference is no
+          # bigger than 2 (on a scale of 0-255). Due to differences in
+          # floating point computation on different devices, the float
+          # output in warped_image may get rounded to a different int
+          # than that in the saved png file loaded into target_image.
+          self.assertAllClose(target_image, out_image, atol=2, rtol=1e-3)
+
+  def testThatBackpropRuns(self):
+    """Run optimization to ensure that gradients can be computed."""
+
+    batch_size = 1
+    image_height = 9
+    image_width = 12
+    image = variables.Variable(
+        np.float32(
+            np.random.uniform(size=[batch_size, image_height, image_width, 3])))
+    control_point_locations = [[3., 3.]]
+    control_point_locations = constant_op.constant(
+        np.float32(np.expand_dims(control_point_locations, 0)))
+    control_point_displacements = [[0.25, -0.5]]
+    control_point_displacements = constant_op.constant(
+        np.float32(np.expand_dims(control_point_displacements, 0)))
+    warped_image, _ = sparse_image_warp.sparse_image_warp(
+        image,
+        control_point_locations,
+        control_point_locations + control_point_displacements,
+        num_boundary_points=3)
+
+    loss = math_ops.reduce_mean(math_ops.abs(warped_image - image))
+    optimizer = momentum.MomentumOptimizer(0.001, 0.9)
+    grad = gradients.gradients(loss, [image])
+    grad, _ = clip_ops.clip_by_global_norm(grad, 1.0)
+    opt_func = optimizer.apply_gradients(zip(grad, [image]))
+    init_op = variables.global_variables_initializer()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for _ in range(5):
+        sess.run([loss, opt_func])
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face.png b/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face.png
new file mode 100644
index 0000000000000000000000000000000000000000..7e303881e213a82e412d18de9d9d86f368726f06
Binary files /dev/null and b/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face.png differ
diff --git a/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-1-clamp-0.png b/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-1-clamp-0.png
new file mode 100644
index 0000000000000000000000000000000000000000..7fd9e4e6d69f3120428d1d778846d495cea1a989
Binary files /dev/null and b/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-1-clamp-0.png differ
diff --git a/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-1-clamp-1.png b/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-1-clamp-1.png
new file mode 100644
index 0000000000000000000000000000000000000000..86d225e5d2158804f88dca881f69ed3ab287d866
Binary files /dev/null and b/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-1-clamp-1.png differ
diff --git a/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-1-clamp-4.png b/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-1-clamp-4.png
new file mode 100644
index 0000000000000000000000000000000000000000..37e8ffae114625d0cc6a07ab2b8dbbb7413a3829
Binary files /dev/null and b/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-1-clamp-4.png differ
diff --git a/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-2-clamp-0.png b/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-2-clamp-0.png
new file mode 100644
index 0000000000000000000000000000000000000000..e49b5816120d43a669264915f1b6747606e080e0
Binary files /dev/null and b/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-2-clamp-0.png differ
diff --git a/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-2-clamp-1.png b/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-2-clamp-1.png
new file mode 100644
index 0000000000000000000000000000000000000000..df3cf2004312ed0ed0ebf1f0340cbfec7fd9ac46
Binary files /dev/null and b/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-2-clamp-1.png differ
diff --git a/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-2-clamp-4.png b/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-2-clamp-4.png
new file mode 100644
index 0000000000000000000000000000000000000000..e1799a87c8542d7e515b6185d7e8f6f75fe73f3e
Binary files /dev/null and b/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-2-clamp-4.png differ
diff --git a/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-3-clamp-0.png b/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-3-clamp-0.png
new file mode 100644
index 0000000000000000000000000000000000000000..2c346e0ce5487e21d41aa4e6306fd83a7b4ffdb4
Binary files /dev/null and b/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-3-clamp-0.png differ
diff --git a/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-3-clamp-1.png b/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-3-clamp-1.png
new file mode 100644
index 0000000000000000000000000000000000000000..6f8b65451cc08a463e4305ddc4be0dbe2879fae9
Binary files /dev/null and b/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-3-clamp-1.png differ
diff --git a/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-3-clamp-4.png b/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-3-clamp-4.png
new file mode 100644
index 0000000000000000000000000000000000000000..8e78146d955ae8f02230121e6314f3285e87611e
Binary files /dev/null and b/tensorflow/contrib/image/python/kernel_tests/test_data/Yellow_Smiley_Face_Warp-interp-3-clamp-4.png differ
diff --git a/tensorflow/contrib/image/python/ops/dense_image_warp.py b/tensorflow/contrib/image/python/ops/dense_image_warp.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9b219ada492466919c615d8978e462e6c619d33
--- /dev/null
+++ b/tensorflow/contrib/image/python/ops/dense_image_warp.py
@@ -0,0 +1,201 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Image warping using per-pixel flow vectors."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+
+
+def _interpolate_bilinear(grid,
+                          query_points,
+                          name='interpolate_bilinear',
+                          indexing='ij'):
+  """Similar to Matlab's interp2 function.
+
+  Finds values for query points on a grid using bilinear interpolation.
+
+  Args:
+    grid: a 4-D float `Tensor` of shape `[batch, height, width, channels]`.
+    query_points: a 3-D float `Tensor` of N points with shape `[batch, N, 2]`.
+    name: a name for the operation (optional).
+    indexing: whether the query points are specified as row and column (ij),
+      or Cartesian coordinates (xy).
+
+  Returns:
+    values: a 3-D `Tensor` with shape `[batch, N, channels]`
+
+  Raises:
+    ValueError: if the indexing mode is invalid, or if the shape of the inputs
+      invalid.
+  """
+  if indexing != 'ij' and indexing != 'xy':
+    raise ValueError('Indexing mode must be \'ij\' or \'xy\'')
+
+  with ops.name_scope(name):
+    grid = ops.convert_to_tensor(grid)
+    query_points = ops.convert_to_tensor(query_points)
+    shape = grid.get_shape().as_list()
+    if len(shape) != 4:
+      msg = 'Grid must be 4 dimensional. Received size: '
+      raise ValueError(msg + str(grid.get_shape()))
+
+    batch_size, height, width, channels = shape
+    query_type = query_points.dtype
+    grid_type = grid.dtype
+
+    if (len(query_points.get_shape()) != 3 or
+        query_points.get_shape()[2].value != 2):
+      msg = ('Query points must be 3 dimensional and size 2 in dim 2. Received '
+             'size: ')
+      raise ValueError(msg + str(query_points.get_shape()))
+
+    _, num_queries, _ = query_points.get_shape().as_list()
+
+    if height < 2 or width < 2:
+      msg = 'Grid must be at least batch_size x 2 x 2 in size. Received size: '
+      raise ValueError(msg + str(grid.get_shape()))
+
+    alphas = []
+    floors = []
+    ceils = []
+
+    index_order = [0, 1] if indexing == 'ij' else [1, 0]
+    unstacked_query_points = array_ops.unstack(query_points, axis=2)
+
+    for dim in index_order:
+      with ops.name_scope('dim-' + str(dim)):
+        queries = unstacked_query_points[dim]
+
+        size_in_indexing_dimension = shape[dim + 1]
+
+        # max_floor is size_in_indexing_dimension - 2 so that max_floor + 1
+        # is still a valid index into the grid.
+        max_floor = math_ops.cast(size_in_indexing_dimension - 2, query_type)
+        min_floor = constant_op.constant(0.0, dtype=query_type)
+        floor = math_ops.minimum(
+            math_ops.maximum(min_floor, math_ops.floor(queries)), max_floor)
+        int_floor = math_ops.cast(floor, dtypes.int32)
+        floors.append(int_floor)
+        ceil = int_floor + 1
+        ceils.append(ceil)
+
+        # alpha has the same type as the grid, as we will directly use alpha
+        # when taking linear combinations of pixel values from the image.
+        alpha = math_ops.cast(queries - floor, grid_type)
+        min_alpha = constant_op.constant(0.0, dtype=grid_type)
+        max_alpha = constant_op.constant(1.0, dtype=grid_type)
+        alpha = math_ops.minimum(math_ops.maximum(min_alpha, alpha), max_alpha)
+
+        # Expand alpha to [b, n, 1] so we can use broadcasting
+        # (since the alpha values don't depend on the channel).
+        alpha = array_ops.expand_dims(alpha, 2)
+        alphas.append(alpha)
+
+    if batch_size * height * width > np.iinfo(np.int32).max / 8:
+      error_msg = """The image size or batch size is sufficiently large
+                     that the linearized addresses used by array_ops.gather
+                     may exceed the int32 limit."""
+      raise ValueError(error_msg)
+
+    flattened_grid = array_ops.reshape(grid,
+                                       [batch_size * height * width, channels])
+    batch_offsets = array_ops.reshape(
+        math_ops.range(batch_size) * height * width, [batch_size, 1])
+
+    # This wraps array_ops.gather. We reshape the image data such that the
+    # batch, y, and x coordinates are pulled into the first dimension.
+    # Then we gather. Finally, we reshape the output back. It's possible this
+    # code would be made simpler by using array_ops.gather_nd.
+    def gather(y_coords, x_coords, name):
+      with ops.name_scope('gather-' + name):
+        linear_coordinates = batch_offsets + y_coords * width + x_coords
+        gathered_values = array_ops.gather(flattened_grid, linear_coordinates)
+        return array_ops.reshape(gathered_values,
+                                 [batch_size, num_queries, channels])
+
+    # grab the pixel values in the 4 corners around each query point
+    top_left = gather(floors[0], floors[1], 'top_left')
+    top_right = gather(floors[0], ceils[1], 'top_right')
+    bottom_left = gather(ceils[0], floors[1], 'bottom_left')
+    bottom_right = gather(ceils[0], ceils[1], 'bottom_right')
+
+    # now, do the actual interpolation
+    with ops.name_scope('interpolate'):
+      interp_top = alphas[1] * (top_right - top_left) + top_left
+      interp_bottom = alphas[1] * (bottom_right - bottom_left) + bottom_left
+      interp = alphas[0] * (interp_bottom - interp_top) + interp_top
+
+    return interp
+
+
+def dense_image_warp(image, flow, name='dense_image_warp'):
+  """Image warping using per-pixel flow vectors.
+
+  Apply a non-linear warp to the image, where the warp is specified by a dense
+  flow field of offset vectors that define the correspondences of pixel values
+  in the output image back to locations in the  source image. Specifically, the
+  pixel value at output[b, j, i, c] is
+  images[b, j - flow[b, j, i, 0], i - flow[b, j, i, 1], c].
+
+  The locations specified by this formula do not necessarily map to an int
+  index. Therefore, the pixel value is obtained by bilinear
+  interpolation of the 4 nearest pixels around
+  (b, j - flow[b, j, i, 0], i - flow[b, j, i, 1]). For locations outside
+  of the image, we use the nearest pixel values at the image boundary.
+
+
+  Args:
+    image: 4-D float `Tensor` with shape `[batch, height, width, channels]`.
+    flow: A 4-D float `Tensor` with shape `[batch, height, width, 2]`.
+    name: A name for the operation (optional).
+
+    Note that image and flow can be of type tf.half, tf.float32, or tf.float64,
+    and do not necessarily have to be the same type.
+
+  Returns:
+    A 4-D float `Tensor` with shape`[batch, height, width, channels]`
+      and same type as input image.
+
+  Raises:
+    ValueError: if height < 2 or width < 2 or the inputs have the wrong number
+                of dimensions.
+  """
+  with ops.name_scope(name):
+    batch_size, height, width, channels = image.get_shape().as_list()
+    # The flow is defined on the image grid. Turn the flow into a list of query
+    # points in the grid space.
+    grid_x, grid_y = array_ops.meshgrid(
+        math_ops.range(width), math_ops.range(height))
+    stacked_grid = math_ops.cast(
+        array_ops.stack([grid_y, grid_x], axis=2), flow.dtype)
+    batched_grid = array_ops.expand_dims(stacked_grid, axis=0)
+    query_points_on_grid = batched_grid - flow
+    query_points_flattened = array_ops.reshape(query_points_on_grid,
+                                               [batch_size, height * width, 2])
+    # Compute values at the query points, then reshape the result back to the
+    # image grid.
+    interpolated = _interpolate_bilinear(image, query_points_flattened)
+    interpolated = array_ops.reshape(interpolated,
+                                     [batch_size, height, width, channels])
+    return interpolated
diff --git a/tensorflow/contrib/image/python/ops/interpolate_spline.py b/tensorflow/contrib/image/python/ops/interpolate_spline.py
new file mode 100644
index 0000000000000000000000000000000000000000..daf8c56456327f102f1409296a91f9f7b68ec799
--- /dev/null
+++ b/tensorflow/contrib/image/python/ops/interpolate_spline.py
@@ -0,0 +1,291 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Polyharmonic spline interpolation."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+
+EPSILON = 0.0000000001
+
+
+def _cross_squared_distance_matrix(x, y):
+  """Pairwise squared distance between two (batch) matrices' rows (2nd dim).
+
+  Computes the pairwise distances between rows of x and rows of y
+  Args:
+    x: [batch_size, n, d] float `Tensor`
+    y: [batch_size, m, d] float `Tensor`
+
+  Returns:
+    squared_dists: [batch_size, n, m] float `Tensor`, where
+    squared_dists[b,i,j] = ||x[b,i,:] - y[b,j,:]||^2
+  """
+  x_norm_squared = math_ops.reduce_sum(math_ops.square(x), 2)
+  y_norm_squared = math_ops.reduce_sum(math_ops.square(y), 2)
+
+  # Expand so that we can broadcast.
+  x_norm_squared_tile = array_ops.expand_dims(x_norm_squared, 2)
+  y_norm_squared_tile = array_ops.expand_dims(y_norm_squared, 1)
+
+  x_y_transpose = math_ops.matmul(x, y, adjoint_b=True)
+
+  # squared_dists[b,i,j] = ||x_bi - y_bj||^2 = x_bi'x_bi- 2x_bi'x_bj + x_bj'x_bj
+  squared_dists = x_norm_squared_tile - 2 * x_y_transpose + y_norm_squared_tile
+
+  return squared_dists
+
+
+def _pairwise_squared_distance_matrix(x):
+  """Pairwise squared distance among a (batch) matrix's rows (2nd dim).
+
+  This saves a bit of computation vs. using _cross_squared_distance_matrix(x,x)
+
+  Args:
+    x: `[batch_size, n, d]` float `Tensor`
+
+  Returns:
+    squared_dists: `[batch_size, n, n]` float `Tensor`, where
+    squared_dists[b,i,j] = ||x[b,i,:] - x[b,j,:]||^2
+  """
+
+  x_x_transpose = math_ops.matmul(x, x, adjoint_b=True)
+  x_norm_squared = array_ops.matrix_diag_part(x_x_transpose)
+  x_norm_squared_tile = array_ops.expand_dims(x_norm_squared, 2)
+
+  # squared_dists[b,i,j] = ||x_bi - x_bj||^2 = x_bi'x_bi- 2x_bi'x_bj + x_bj'x_bj
+  squared_dists = x_norm_squared_tile - 2 * x_x_transpose + array_ops.transpose(
+      x_norm_squared_tile, [0, 2, 1])
+
+  return squared_dists
+
+
+def _solve_interpolation(train_points, train_values, order,
+                         regularization_weight):
+  """Solve for interpolation coefficients.
+
+  Computes the coefficients of the polyharmonic interpolant for the 'training'
+  data defined by (train_points, train_values) using the kernel phi.
+
+  Args:
+    train_points: `[b, n, d]` interpolation centers
+    train_values: `[b, n, k]` function values
+    order: order of the interpolation
+    regularization_weight: weight to place on smoothness regularization term
+
+  Returns:
+    w: `[b, n, k]` weights on each interpolation center
+    v: `[b, d, k]` weights on each input dimension
+  """
+
+  b, n, d = train_points.get_shape().as_list()
+  _, _, k = train_values.get_shape().as_list()
+
+  # First, rename variables so that the notation (c, f, w, v, A, B, etc.)
+  # follows https://en.wikipedia.org/wiki/Polyharmonic_spline.
+  # To account for python style guidelines we use
+  # matrix_a for A and matrix_b for B.
+
+  c = train_points
+  f = train_values
+
+  # Next, construct the linear system.
+  with ops.name_scope('construct_linear_system'):
+
+    matrix_a = _phi(_pairwise_squared_distance_matrix(c), order)  # [b, n, n]
+    if regularization_weight > 0:
+      batch_identity_matrix = np.expand_dims(np.eye(n), 0)
+      batch_identity_matrix = constant_op.constant(
+          batch_identity_matrix, dtype=train_points.dtype)
+
+      matrix_a += regularization_weight * batch_identity_matrix
+
+    # Append ones to the feature values for the bias term in the linear model.
+    ones = array_ops.ones([b, n, 1], train_points.dtype)
+    matrix_b = array_ops.concat([c, ones], 2)  # [b, n, d + 1]
+
+    # [b, n + d + 1, n]
+    left_block = array_ops.concat(
+        [matrix_a, array_ops.transpose(matrix_b, [0, 2, 1])], 1)
+
+    num_b_cols = matrix_b.get_shape()[2]  # d + 1
+    lhs_zeros = array_ops.zeros([b, num_b_cols, num_b_cols], train_points.dtype)
+    right_block = array_ops.concat([matrix_b, lhs_zeros],
+                                   1)  # [b, n + d + 1, d + 1]
+    lhs = array_ops.concat([left_block, right_block],
+                           2)  # [b, n + d + 1, n + d + 1]
+
+    rhs_zeros = array_ops.zeros([b, d + 1, k], train_points.dtype)
+    rhs = array_ops.concat([f, rhs_zeros], 1)  # [b, n + d + 1, k]
+
+  # Then, solve the linear system and unpack the results.
+  with ops.name_scope('solve_linear_system'):
+    w_v = linalg_ops.matrix_solve(lhs, rhs)
+    w = w_v[:, :n, :]
+    v = w_v[:, n:, :]
+
+  return w, v
+
+
+def _apply_interpolation(query_points, train_points, w, v, order):
+  """Apply polyharmonic interpolation model to data.
+
+  Given coefficients w and v for the interpolation model, we evaluate
+  interpolated function values at query_points.
+
+  Args:
+    query_points: `[b, m, d]` x values to evaluate the interpolation at
+    train_points: `[b, n, d]` x values that act as the interpolation centers
+                    ( the c variables in the wikipedia article)
+    w: `[b, n, k]` weights on each interpolation center
+    v: `[b, d, k]` weights on each input dimension
+    order: order of the interpolation
+
+  Returns:
+    Polyharmonic interpolation evaluated at points defined in query_points.
+  """
+
+  batch_size = train_points.get_shape()[0].value
+  num_query_points = query_points.get_shape()[1].value
+
+  # First, compute the contribution from the rbf term.
+  pairwise_dists = _cross_squared_distance_matrix(query_points, train_points)
+  phi_pairwise_dists = _phi(pairwise_dists, order)
+
+  rbf_term = math_ops.matmul(phi_pairwise_dists, w)
+
+  # Then, compute the contribution from the linear term.
+  # Pad query_points with ones, for the bias term in the linear model.
+  query_points_pad = array_ops.concat([
+      query_points,
+      array_ops.ones([batch_size, num_query_points, 1], train_points.dtype)
+  ], 2)
+  linear_term = math_ops.matmul(query_points_pad, v)
+
+  return rbf_term + linear_term
+
+
+def _phi(r, order):
+  """Coordinate-wise nonlinearity used to define the order of the interpolation.
+
+  See https://en.wikipedia.org/wiki/Polyharmonic_spline for the definition.
+
+  Args:
+    r: input op
+    order: interpolation order
+
+  Returns:
+    phi_k evaluated coordinate-wise on r, for k = r
+  """
+
+  # using EPSILON prevents log(0), sqrt0), etc.
+  # sqrt(0) is well-defined, but its gradient is not
+  with ops.name_scope('phi'):
+    if order == 1:
+      r = math_ops.maximum(r, EPSILON)
+      r = math_ops.sqrt(r)
+      return r
+    elif order == 2:
+      return 0.5 * r * math_ops.log(math_ops.maximum(r, EPSILON))
+    elif order == 4:
+      return 0.5 * math_ops.square(r) * math_ops.log(
+          math_ops.maximum(r, EPSILON))
+    elif order % 2 == 0:
+      r = math_ops.maximum(r, EPSILON)
+      return 0.5 * math_ops.pow(r, 0.5 * order) * math_ops.log(r)
+    else:
+      r = math_ops.maximum(r, EPSILON)
+      return math_ops.pow(r, 0.5 * order)
+
+
+def interpolate_spline(train_points,
+                       train_values,
+                       query_points,
+                       order,
+                       regularization_weight=0.0,
+                       name='interpolate_spline'):
+  r"""Interpolate signal using polyharmonic interpolation.
+
+  The interpolant has the form
+  $$f(x) = \sum_{i = 1}^n w_i \phi(||x - c_i||) + v^T x + b.$$
+
+  This is a sum of two terms: (1) a weighted sum of radial basis function (RBF)
+  terms, with the centers \\(c_1, ... c_n\\), and (2) a linear term with a bias.
+  The \\(c_i\\) vectors are 'training' points. In the code, b is absorbed into v
+  by appending 1 as a final dimension to x. The coefficients w and v are
+  estimated such that the interpolant exactly fits the value of the function at
+  the \\(c_i\\) points, the vector w is orthogonal to each \\(c_i\\), and the
+  vector w sums to 0. With these constraints, the coefficients can be obtained
+  by solving a linear system.
+
+  \\(\phi\\) is an RBF, parametrized by an interpolation
+  order. Using order=2 produces the well-known thin-plate spline.
+
+  We also provide the option to perform regularized interpolation. Here, the
+  interpolant is selected to trade off between the squared loss on the training
+  data and a certain measure of its curvature
+  ([details](https://en.wikipedia.org/wiki/Polyharmonic_spline)).
+  Using a regularization weight greater than zero has the effect that the
+  interpolant will no longer exactly fit the training data. However, it may be
+  less vulnerable to overfitting, particularly for high-order interpolation.
+
+  Note the interpolation procedure is differentiable with respect to all inputs
+  besides the order parameter.
+
+  Args:
+    train_points: `[batch_size, n, d]` float `Tensor` of n d-dimensional
+      locations. These do not need to be regularly-spaced.
+    train_values: `[batch_size, n, k]` float `Tensor` of n c-dimensional values
+      evaluated at train_points.
+    query_points: `[batch_size, m, d]` `Tensor` of m d-dimensional locations
+      where we will output the interpolant's values.
+    order: order of the interpolation. Common values are 1 for
+      \\(\phi(r) = r\\), 2 for \\(\phi(r) = r^2 * log(r)\\) (thin-plate spline),
+       or 3 for \\(\phi(r) = r^3\\).
+    regularization_weight: weight placed on the regularization term.
+      This will depend substantially on the problem, and it should always be
+      tuned. For many problems, it is reasonable to use no regularization.
+      If using a non-zero value, we recommend a small value like 0.001.
+    name: name prefix for ops created by this function
+
+  Returns:
+    `[b, m, k]` float `Tensor` of query values. We use train_points and
+    train_values to perform polyharmonic interpolation. The query values are
+    the values of the interpolant evaluated at the locations specified in
+    query_points.
+  """
+  with ops.name_scope(name):
+    train_points = ops.convert_to_tensor(train_points)
+    train_values = ops.convert_to_tensor(train_values)
+    query_points = ops.convert_to_tensor(query_points)
+
+    # First, fit the spline to the observed data.
+    with ops.name_scope('solve'):
+      w, v = _solve_interpolation(train_points, train_values, order,
+                                  regularization_weight)
+
+    # Then, evaluate the spline at the query locations.
+    with ops.name_scope('predict'):
+      query_values = _apply_interpolation(query_points, train_points, w, v,
+                                          order)
+
+  return query_values
diff --git a/tensorflow/contrib/image/python/ops/sparse_image_warp.py b/tensorflow/contrib/image/python/ops/sparse_image_warp.py
new file mode 100644
index 0000000000000000000000000000000000000000..54a215d6db6ded56a1a4a018a7e176f35fe6397e
--- /dev/null
+++ b/tensorflow/contrib/image/python/ops/sparse_image_warp.py
@@ -0,0 +1,201 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Image warping using sparse flow defined at control points."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.image.python.ops import dense_image_warp
+from tensorflow.contrib.image.python.ops import interpolate_spline
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+
+
+def _get_grid_locations(image_height, image_width):
+  """Wrapper for np.meshgrid."""
+
+  y_range = np.linspace(0, image_height - 1, image_height)
+  x_range = np.linspace(0, image_width - 1, image_width)
+  y_grid, x_grid = np.meshgrid(y_range, x_range, indexing='ij')
+  return np.stack((y_grid, x_grid), -1)
+
+
+def _expand_to_minibatch(np_array, batch_size):
+  """Tile arbitrarily-sized np_array to include new batch dimension."""
+  tiles = [batch_size] + [1] * np_array.ndim
+  return np.tile(np.expand_dims(np_array, 0), tiles)
+
+
+def _get_boundary_locations(image_height, image_width, num_points_per_edge):
+  """Compute evenly-spaced indices along edge of image."""
+  y_range = np.linspace(0, image_height - 1, num_points_per_edge + 2)
+  x_range = np.linspace(0, image_width - 1, num_points_per_edge + 2)
+  ys, xs = np.meshgrid(y_range, x_range, indexing='ij')
+  is_boundary = np.logical_or(
+      np.logical_or(xs == 0, xs == image_width - 1),
+      np.logical_or(ys == 0, ys == image_height - 1))
+  return np.stack([ys[is_boundary], xs[is_boundary]], axis=-1)
+
+
+def _add_zero_flow_controls_at_boundary(control_point_locations,
+                                        control_point_flows, image_height,
+                                        image_width, boundary_points_per_edge):
+  """Add control points for zero-flow boundary conditions.
+
+   Augment the set of control points with extra points on the
+   boundary of the image that have zero flow.
+
+  Args:
+    control_point_locations: input control points
+    control_point_flows: their flows
+    image_height: image height
+    image_width: image width
+    boundary_points_per_edge: number of points to add in the middle of each
+                           edge (not including the corners).
+                           The total number of points added is
+                           4 + 4*(boundary_points_per_edge).
+
+  Returns:
+    merged_control_point_locations: augmented set of control point locations
+    merged_control_point_flows: augmented set of control point flows
+  """
+
+  batch_size = control_point_locations.get_shape()[0].value
+
+  boundary_point_locations = _get_boundary_locations(image_height, image_width,
+                                                     boundary_points_per_edge)
+
+  boundary_point_flows = np.zeros([boundary_point_locations.shape[0], 2])
+
+  type_to_use = control_point_locations.dtype
+  boundary_point_locations = constant_op.constant(
+      _expand_to_minibatch(boundary_point_locations, batch_size),
+      dtype=type_to_use)
+
+  boundary_point_flows = constant_op.constant(
+      _expand_to_minibatch(boundary_point_flows, batch_size), dtype=type_to_use)
+
+  merged_control_point_locations = array_ops.concat(
+      [control_point_locations, boundary_point_locations], 1)
+
+  merged_control_point_flows = array_ops.concat(
+      [control_point_flows, boundary_point_flows], 1)
+
+  return merged_control_point_locations, merged_control_point_flows
+
+
+def sparse_image_warp(image,
+                      source_control_point_locations,
+                      dest_control_point_locations,
+                      interpolation_order=2,
+                      regularization_weight=0.0,
+                      num_boundary_points=0,
+                      name='sparse_image_warp'):
+  """Image warping using correspondences between sparse control points.
+
+  Apply a non-linear warp to the image, where the warp is specified by
+  the source and destination locations of a (potentially small) number of
+  control points. First, we use a polyharmonic spline
+  (@{tf.contrib.image.interpolate_spline}) to interpolate the displacements
+  between the corresponding control points to a dense flow field.
+  Then, we warp the image using this dense flow field
+  (@{tf.contrib.image.dense_image_warp}).
+
+  Let t index our control points. For regularization_weight=0, we have:
+  warped_image[b, dest_control_point_locations[b, t, 0],
+                  dest_control_point_locations[b, t, 1], :] =
+  image[b, source_control_point_locations[b, t, 0],
+           source_control_point_locations[b, t, 1], :].
+
+  For regularization_weight > 0, this condition is met approximately, since
+  regularized interpolation trades off smoothness of the interpolant vs.
+  reconstruction of the interpolant at the control points.
+  See @{tf.contrib.image.interpolate_spline} for further documentation of the
+  interpolation_order and regularization_weight arguments.
+
+
+  Args:
+    image: `[batch, height, width, channels]` float `Tensor`
+    source_control_point_locations: `[batch, num_control_points, 2]` float
+      `Tensor`
+    dest_control_point_locations: `[batch, num_control_points, 2]` float
+      `Tensor`
+    interpolation_order: polynomial order used by the spline interpolation
+    regularization_weight: weight on smoothness regularizer in interpolation
+    num_boundary_points: How many zero-flow boundary points to include at
+      each image edge.Usage:
+        num_boundary_points=0: don't add zero-flow points
+        num_boundary_points=1: 4 corners of the image
+        num_boundary_points=2: 4 corners and one in the middle of each edge
+          (8 points total)
+        num_boundary_points=n: 4 corners and n-1 along each edge
+    name: A name for the operation (optional).
+
+    Note that image and offsets can be of type tf.half, tf.float32, or
+    tf.float64, and do not necessarily have to be the same type.
+
+  Returns:
+    warped_image: `[batch, height, width, channels]` float `Tensor` with same
+      type as input image.
+    flow_field: `[batch, height, width, 2]` float `Tensor` containing the dense
+      flow field produced by the interpolation.
+  """
+
+  image = ops.convert_to_tensor(image)
+  source_control_point_locations = ops.convert_to_tensor(
+      source_control_point_locations)
+  dest_control_point_locations = ops.convert_to_tensor(
+      dest_control_point_locations)
+
+  control_point_flows = (
+      dest_control_point_locations - source_control_point_locations)
+
+  clamp_boundaries = num_boundary_points > 0
+  boundary_points_per_edge = num_boundary_points - 1
+
+  with ops.name_scope(name):
+
+    batch_size, image_height, image_width, _ = image.get_shape().as_list()
+
+    # This generates the dense locations where the interpolant
+    # will be evaluated.
+    grid_locations = _get_grid_locations(image_height, image_width)
+
+    flattened_grid_locations = np.reshape(grid_locations,
+                                          [image_height * image_width, 2])
+
+    flattened_grid_locations = constant_op.constant(
+        _expand_to_minibatch(flattened_grid_locations, batch_size), image.dtype)
+
+    if clamp_boundaries:
+      (dest_control_point_locations,
+       control_point_flows) = _add_zero_flow_controls_at_boundary(
+           dest_control_point_locations, control_point_flows, image_height,
+           image_width, boundary_points_per_edge)
+
+    flattened_flows = interpolate_spline.interpolate_spline(
+        dest_control_point_locations, control_point_flows,
+        flattened_grid_locations, interpolation_order, regularization_weight)
+
+    dense_flows = array_ops.reshape(flattened_flows,
+                                    [batch_size, image_height, image_width, 2])
+
+    warped_image = dense_image_warp.dense_image_warp(image, dense_flows)
+
+    return warped_image, dense_flows
diff --git a/tensorflow/contrib/input_pipeline/BUILD b/tensorflow/contrib/input_pipeline/BUILD
index 9d6b4d5d87e24d72b29ab33ee805fe0d068cc30a..0e34315db45d61282af1882631dc769a72965c3e 100644
--- a/tensorflow/contrib/input_pipeline/BUILD
+++ b/tensorflow/contrib/input_pipeline/BUILD
@@ -114,14 +114,3 @@ tf_cc_tests(
         "//tensorflow/core:testlib",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/input_pipeline/kernels/BUILD b/tensorflow/contrib/input_pipeline/kernels/BUILD
index f20a6e38d4e80f869e9274d6fc49338a95fc6788..797605b8fe66e8375edcc70668a07a8d2a6d73f3 100644
--- a/tensorflow/contrib/input_pipeline/kernels/BUILD
+++ b/tensorflow/contrib/input_pipeline/kernels/BUILD
@@ -17,14 +17,3 @@ cc_library(
     ],
     alwayslink = 1,
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/integrate/BUILD b/tensorflow/contrib/integrate/BUILD
index 66948c1ea1f3f239d3f43a57626f8c229fe24ad9..0b7d64f4edd7587000ca5b9ecae257fe8fedd4a1 100644
--- a/tensorflow/contrib/integrate/BUILD
+++ b/tensorflow/contrib/integrate/BUILD
@@ -42,14 +42,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/kafka/BUILD b/tensorflow/contrib/kafka/BUILD
index 1c3974871c62911c0cb47677eb92d28286837142..3913c9dc7abfba2829bde5e86fe2927e8fc29a9d 100644
--- a/tensorflow/contrib/kafka/BUILD
+++ b/tensorflow/contrib/kafka/BUILD
@@ -119,17 +119,3 @@ tf_py_test(
         "notap",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        include = [
-            "**/*",
-        ],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/keras/BUILD b/tensorflow/contrib/keras/BUILD
index 7e0019ce4ad6c96e09ac9e222e2f4e2840273983..7a4cab20d1a3471af2a2a402a6d1443a90fa7f9b 100644
--- a/tensorflow/contrib/keras/BUILD
+++ b/tensorflow/contrib/keras/BUILD
@@ -52,15 +52,3 @@ py_library(
         "//tensorflow/python/keras",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/kernel_methods/BUILD b/tensorflow/contrib/kernel_methods/BUILD
index eff7dfeb4c1117e40f4faf43c5e92a52cffd6528..87c2dcd89b63fa9f92d93c87abce91fd3460d44e 100644
--- a/tensorflow/contrib/kernel_methods/BUILD
+++ b/tensorflow/contrib/kernel_methods/BUILD
@@ -90,15 +90,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/kernel_methods/python/losses.py b/tensorflow/contrib/kernel_methods/python/losses.py
index f182fef067b7f523bc5ca63227265be40528b171..4ef0a66a52429233c6e6f70667a451466493629c 100644
--- a/tensorflow/contrib/kernel_methods/python/losses.py
+++ b/tensorflow/contrib/kernel_methods/python/losses.py
@@ -43,10 +43,10 @@ def sparse_multiclass_hinge_loss(
 
   This is a generalization of standard (binary) hinge loss. For a given instance
   with correct label c*, the loss is given by:
-    loss = max_{c != c*} logits_c - logits_{c*} + 1.
+    $$loss = max_{c != c*} logits_c - logits_{c*} + 1.$$
   or equivalently
-    loss = max_c { logits_c - logits_{c*} + I_{c != c*} }
-  where I_{c != c*} = 1 if c != c* and 0 otherwise.
+    $$loss = max_c { logits_c - logits_{c*} + I_{c != c*} }$$
+  where \\(I_{c != c*} = 1\ \text{if}\ c != c*\\) and 0 otherwise.
 
   Args:
     labels: `Tensor` of shape [batch_size] or [batch_size, 1]. Corresponds to
diff --git a/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features.py b/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features.py
index 9dc01124ab195ae17b8795a11e4ebefe3f2c746b..091f0a109801065f06110e2a313c24486d38109f 100644
--- a/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features.py
+++ b/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features.py
@@ -35,23 +35,23 @@ class RandomFourierFeatureMapper(dkm.DenseKernelMapper):
 
   The RFFM mapping is used to approximate the Gaussian (RBF) kernel:
   ```
-  exp(-||x-y||_2^2 / (2 * sigma^2))
+  $$(exp(-||x-y||_2^2 / (2 * \sigma^2))$$
   ```
 
   The implementation of RFFM is based on the following paper:
   "Random Features for Large-Scale Kernel Machines" by Ali Rahimi and Ben Recht.
   (link: https://people.eecs.berkeley.edu/~brecht/papers/07.rah.rec.nips.pdf)
 
-  The mapping uses a matrix `Omega \in R^{d x D}` and a bias vector `b \in R^D`
-  where `d` is the input dimension (number of dense input features) and `D` is
-  the output dimension (i.e., dimension of the feature space the input is mapped
-  to). Each entry of `Omega` is sampled i.i.d. from a (scaled) Gaussian
-  distribution and each entry of `b` is sampled independently and uniformly from
-  [0, 2 * pi].
+  The mapping uses a matrix `\\(Omega \in R^{d x D}\\)` and a bias vector
+  `\\(b \in R^D\\)` where `d` is the input dimension (number of dense input
+  features) and `D` is the output dimension (i.e., dimension of the feature
+  space the input is mapped to). Each entry of `Omega` is sampled i.i.d. from a
+  (scaled) Gaussian distribution and each entry of `b` is sampled independently
+  and uniformly from [0, \\(2 * pi\\)].
 
   For a single input feature vector x in R^d, its RFFM is defined as:
   ```
-      sqrt(2/D) * cos(x * Omega + b)
+      $$sqrt(2/D) * cos(x * Omega + b)$$
   ```
   where `cos` is the element-wise cosine function and `x, b` are represented as
   row vectors. The aforementioned paper shows that the linear kernel of
diff --git a/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features_test.py b/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features_test.py
index 6f4a264485993ab737723171409042b4a9673669..91929184a2e6f3cccae92cb819501a7c6ef81673 100644
--- a/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features_test.py
+++ b/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features_test.py
@@ -34,7 +34,7 @@ def _inner_product(x, y):
   """Inner product between tensors x and y.
 
   The input tensors are assumed to be in ROW representation, that is, the method
-  returns x * y^T.
+  returns \\(x * y^T\\).
 
   Args:
     x: input tensor in row format
diff --git a/tensorflow/contrib/kfac/BUILD b/tensorflow/contrib/kfac/BUILD
index 9a5759bf14f753bbc50d3ef8f54ceab7daf745ab..b719046b37ac761d56e8d5aa34772103be691cd6 100644
--- a/tensorflow/contrib/kfac/BUILD
+++ b/tensorflow/contrib/kfac/BUILD
@@ -24,15 +24,3 @@ py_library(
         "//tensorflow/python:util",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/kfac/examples/BUILD b/tensorflow/contrib/kfac/examples/BUILD
index 89965eda374b2b403f680fc77eb923d0e660d1e2..8186fa1c62cb952f86614a96c3965bcddae1686e 100644
--- a/tensorflow/contrib/kfac/examples/BUILD
+++ b/tensorflow/contrib/kfac/examples/BUILD
@@ -28,8 +28,28 @@ py_library(
 )
 
 py_binary(
-    name = "convnet_mnist_main",
-    srcs = ["convnet_mnist_main.py"],
+    name = "convnet_mnist_single_main",
+    srcs = ["convnet_mnist_single_main.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":convnet",
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_binary(
+    name = "convnet_mnist_multi_tower_main",
+    srcs = ["convnet_mnist_multi_tower_main.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":convnet",
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_binary(
+    name = "convnet_mnist_distributed_main",
+    srcs = ["convnet_mnist_distributed_main.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":convnet",
@@ -58,15 +78,3 @@ py_library(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/kfac/examples/convnet.py b/tensorflow/contrib/kfac/examples/convnet.py
index 39d80addaac1fe855a37255b32bf4412b99df46a..e8e3353091df25e135b1247bf976bb9ce177d1a7 100644
--- a/tensorflow/contrib/kfac/examples/convnet.py
+++ b/tensorflow/contrib/kfac/examples/convnet.py
@@ -37,6 +37,8 @@ import tensorflow as tf
 
 from tensorflow.contrib.kfac.examples import mlp
 from tensorflow.contrib.kfac.examples import mnist
+from tensorflow.contrib.kfac.python.ops import optimizer as opt
+
 
 lc = tf.contrib.kfac.layer_collection
 oq = tf.contrib.kfac.op_queue
@@ -48,12 +50,18 @@ __all__ = [
     "linear_layer",
     "build_model",
     "minimize_loss_single_machine",
-    "minimize_loss_distributed",
+    "distributed_grads_only_and_ops_chief_worker",
+    "distributed_grads_and_ops_dedicated_workers",
     "train_mnist_single_machine",
-    "train_mnist_distributed",
+    "train_mnist_distributed_sync_replicas",
+    "train_mnist_multitower"
 ]
 
 
+# Inverse update ops will be run every _INVERT_EVRY iterations.
+_INVERT_EVERY = 10
+
+
 def conv_layer(layer_id, inputs, kernel_size, out_channels):
   """Builds a convolutional layer with ReLU non-linearity.
 
@@ -161,8 +169,9 @@ def build_model(examples, labels, num_labels, layer_collection):
   accuracy = tf.reduce_mean(
       tf.cast(tf.equal(labels, tf.argmax(logits, axis=1)), dtype=tf.float32))
 
-  tf.summary.scalar("loss", loss)
-  tf.summary.scalar("accuracy", accuracy)
+  with tf.device("/cpu:0"):
+    tf.summary.scalar("loss", loss)
+    tf.summary.scalar("accuracy", accuracy)
 
   # Register parameters. K-FAC needs to know about the inputs, outputs, and
   # parameters of each conv/fully connected layer and the logits powering the
@@ -181,41 +190,59 @@ def build_model(examples, labels, num_labels, layer_collection):
 def minimize_loss_single_machine(loss,
                                  accuracy,
                                  layer_collection,
+                                 device="/gpu:0",
                                  session_config=None):
   """Minimize loss with K-FAC on a single machine.
 
-  A single Session is responsible for running all of K-FAC's ops.
+  A single Session is responsible for running all of K-FAC's ops. The covariance
+  and inverse update ops are placed on `device`. All model variables are on CPU.
 
   Args:
     loss: 0-D Tensor. Loss to be minimized.
     accuracy: 0-D Tensor. Accuracy of classifier on current minibatch.
     layer_collection: LayerCollection instance describing model architecture.
       Used by K-FAC to construct preconditioner.
+    device: string, Either '/cpu:0' or '/gpu:0'. The covaraince and invserse
+      update ops are run on this device.
     session_config: None or tf.ConfigProto. Configuration for tf.Session().
 
   Returns:
     final value for 'accuracy'.
   """
   # Train with K-FAC.
-  global_step = tf.train.get_or_create_global_step()
+  g_step = tf.train.get_or_create_global_step()
   optimizer = opt.KfacOptimizer(
       learning_rate=0.0001,
       cov_ema_decay=0.95,
       damping=0.001,
       layer_collection=layer_collection,
+      placement_strategy="round_robin",
+      cov_devices=[device],
+      inv_devices=[device],
       momentum=0.9)
-  train_op = optimizer.minimize(loss, global_step=global_step)
+  (cov_update_thunks,
+   inv_update_thunks) = optimizer.make_vars_and_create_op_thunks()
+
+  with tf.device(device):
+    train_op = optimizer.minimize(loss, global_step=g_step)
+
+  def make_update_op(update_thunks):
+    update_op = [thunk() for thunk in update_thunks]
+    return tf.group(*update_op)
+
+  cov_update_op = make_update_op(cov_update_thunks)
+  with tf.control_dependencies([train_op, cov_update_op]):
+    inverse_op = tf.cond(
+        tf.equal(tf.mod(g_step + 1, _INVERT_EVERY), 0),
+        lambda: make_update_op(inv_update_thunks), tf.no_op)
 
   tf.logging.info("Starting training.")
   with tf.train.MonitoredTrainingSession(config=session_config) as sess:
     while not sess.should_stop():
-      global_step_, loss_, accuracy_, _, _ = sess.run(
-          [global_step, loss, accuracy, train_op, optimizer.cov_update_op])
-
-      if global_step_ % 100 == 0:
-        sess.run(optimizer.inv_update_op)
+      global_step_, loss_, accuracy_, _ = sess.run(
+          [g_step, loss, accuracy, inverse_op])
 
-      if global_step_ % 100 == 0:
+      if (global_step_ + 1) % _INVERT_EVERY == 0:
         tf.logging.info("global_step: %d | loss: %f | accuracy: %s",
                         global_step_, loss_, accuracy_)
 
@@ -250,16 +277,62 @@ def _num_gradient_tasks(num_tasks):
   return int(np.ceil(0.6 * num_tasks))
 
 
-def minimize_loss_distributed(task_id, num_worker_tasks, num_ps_tasks, master,
-                              checkpoint_dir, loss, accuracy, layer_collection):
-  """Minimize loss with an synchronous implementation of K-FAC.
+def _make_distributed_train_op(
+    task_id,
+    num_worker_tasks,
+    num_ps_tasks,
+    layer_collection
+):
+  """Creates optimizer and distributed training op.
 
-  Different tasks are responsible for different parts of K-FAC's Ops. The first
-  60% of tasks update weights; the next 20% accumulate covariance statistics;
-  the last 20% invert the matrices used to precondition gradients.
+  Constructs KFAC optimizer and wraps it in `sync_replicas` optimizer. Makes
+  the train op.
+
+  Args:
+   task_id: int. Integer in [0, num_worker_tasks). ID for this worker.
+    num_worker_tasks: int. Number of workers in this distributed training setup.
+    num_ps_tasks: int. Number of parameter servers holding variables. If 0,
+      parameter servers are not used.
+    layer_collection: LayerCollection instance describing model architecture.
+      Used by K-FAC to construct preconditioner.
+
+  Returns:
+    sync_optimizer: `tf.train.SyncReplicasOptimizer` instance which wraps KFAC
+      optimizer.
+    optimizer: Instance of `opt.KfacOptimizer`.
+    global_step: `tensor`, Global step.
+  """
+  tf.logging.info("Task id : %d", task_id)
+  with tf.device(tf.train.replica_device_setter(num_ps_tasks)):
+    global_step = tf.train.get_or_create_global_step()
+    optimizer = opt.KfacOptimizer(
+        learning_rate=0.0001,
+        cov_ema_decay=0.95,
+        damping=0.001,
+        layer_collection=layer_collection,
+        momentum=0.9)
+    sync_optimizer = tf.train.SyncReplicasOptimizer(
+        opt=optimizer,
+        replicas_to_aggregate=_num_gradient_tasks(num_worker_tasks),
+        total_num_replicas=num_worker_tasks)
+    return sync_optimizer, optimizer, global_step
+
+
+def distributed_grads_only_and_ops_chief_worker(
+    task_id, is_chief, num_worker_tasks, num_ps_tasks, master, checkpoint_dir,
+    loss, accuracy, layer_collection, invert_every=10):
+  """Minimize loss with a synchronous implementation of K-FAC.
+
+  All workers perform gradient computation. Chief worker applies gradient after
+  averaging the gradients obtained from all the workers. All workers block
+  execution untill the update is applied. Chief worker runs covariance and
+  inverse update ops. Covariance and inverse matrices are placed on parameter
+  servers in a round robin manner. For further details on synchronous
+  distributed optimization check `tf.train.SyncReplicasOptimizer`.
 
   Args:
     task_id: int. Integer in [0, num_worker_tasks). ID for this worker.
+    is_chief: `boolean`, `True` if the worker is chief worker.
     num_worker_tasks: int. Number of workers in this distributed training setup.
     num_ps_tasks: int. Number of parameter servers holding variables. If 0,
       parameter servers are not used.
@@ -271,6 +344,7 @@ def minimize_loss_distributed(task_id, num_worker_tasks, num_ps_tasks, master,
       run with each step.
     layer_collection: LayerCollection instance describing model architecture.
       Used by K-FAC to construct preconditioner.
+    invert_every: `int`, Number of steps between update the inverse.
 
   Returns:
     final value for 'accuracy'.
@@ -278,19 +352,80 @@ def minimize_loss_distributed(task_id, num_worker_tasks, num_ps_tasks, master,
   Raises:
     ValueError: if task_id >= num_worker_tasks.
   """
-  with tf.device(tf.train.replica_device_setter(num_ps_tasks)):
-    global_step = tf.train.get_or_create_global_step()
-    optimizer = opt.KfacOptimizer(
-        learning_rate=0.0001,
-        cov_ema_decay=0.95,
-        damping=0.001,
-        layer_collection=layer_collection,
-        momentum=0.9)
-    inv_update_queue = oq.OpQueue(optimizer.inv_update_ops)
-    sync_optimizer = tf.train.SyncReplicasOptimizer(
-        opt=optimizer,
-        replicas_to_aggregate=_num_gradient_tasks(num_worker_tasks))
-    train_op = sync_optimizer.minimize(loss, global_step=global_step)
+
+  sync_optimizer, optimizer, global_step = _make_distributed_train_op(
+      task_id, num_worker_tasks, num_ps_tasks, layer_collection)
+  (cov_update_thunks,
+   inv_update_thunks) = optimizer.make_vars_and_create_op_thunks()
+  train_op = sync_optimizer.minimize(loss, global_step=global_step)
+
+  tf.logging.info("Starting training.")
+  hooks = [sync_optimizer.make_session_run_hook(is_chief)]
+
+  def make_update_op(update_thunks):
+    update_op = [thunk() for thunk in update_thunks]
+    return tf.group(*update_op)
+
+  if is_chief:
+    cov_update_op = make_update_op(cov_update_thunks)
+    with tf.control_dependencies([train_op, cov_update_op]):
+      update_op = tf.cond(
+          tf.equal(tf.mod(global_step + 1, invert_every), 0),
+          lambda: make_update_op(inv_update_thunks),
+          tf.no_op)
+  else:
+    update_op = train_op
+
+  with tf.train.MonitoredTrainingSession(
+      master=master,
+      is_chief=is_chief,
+      checkpoint_dir=checkpoint_dir,
+      hooks=hooks,
+      stop_grace_period_secs=0) as sess:
+    while not sess.should_stop():
+      global_step_, loss_, accuracy_, _ = sess.run(
+          [global_step, loss, accuracy, update_op])
+      tf.logging.info("global_step: %d | loss: %f | accuracy: %s", global_step_,
+                      loss_, accuracy_)
+  return accuracy_
+
+
+def distributed_grads_and_ops_dedicated_workers(
+    task_id, is_chief, num_worker_tasks, num_ps_tasks, master, checkpoint_dir,
+    loss, accuracy, layer_collection):
+  """Minimize loss with a synchronous implementation of K-FAC.
+
+  Different workers are responsible for different parts of K-FAC's Ops. The
+  first 60% of tasks compute gradients; the next 20% accumulate covariance
+  statistics; the last 20% invert the matrices used to precondition gradients.
+  The chief worker applies the gradient .
+
+  Args:
+    task_id: int. Integer in [0, num_worker_tasks). ID for this worker.
+    is_chief: `boolean`, `True` if the worker is chief worker.
+    num_worker_tasks: int. Number of workers in this distributed training setup.
+    num_ps_tasks: int. Number of parameter servers holding variables. If 0,
+      parameter servers are not used.
+    master: string. IP and port of TensorFlow runtime process. Set to empty
+      string to run locally.
+    checkpoint_dir: string or None. Path to store checkpoints under.
+    loss: 0-D Tensor. Loss to be minimized.
+    accuracy: dict mapping strings to 0-D Tensors. Additional accuracy to
+      run with each step.
+    layer_collection: LayerCollection instance describing model architecture.
+      Used by K-FAC to construct preconditioner.
+
+  Returns:
+    final value for 'accuracy'.
+
+  Raises:
+    ValueError: if task_id >= num_worker_tasks.
+  """
+  sync_optimizer, optimizer, global_step = _make_distributed_train_op(
+      task_id, num_worker_tasks, num_ps_tasks, layer_collection)
+  _, cov_update_op, inv_update_ops, _, _, _ = optimizer.make_ops_and_vars()
+  train_op = sync_optimizer.minimize(loss, global_step=global_step)
+  inv_update_queue = oq.OpQueue(inv_update_ops)
 
   tf.logging.info("Starting training.")
   is_chief = (task_id == 0)
@@ -306,7 +441,7 @@ def minimize_loss_distributed(task_id, num_worker_tasks, num_ps_tasks, master,
       if _is_gradient_task(task_id, num_worker_tasks):
         learning_op = train_op
       elif _is_cov_update_task(task_id, num_worker_tasks):
-        learning_op = optimizer.cov_update_op
+        learning_op = cov_update_op
       elif _is_inv_update_task(task_id, num_worker_tasks):
         # TODO(duckworthd): Running this op before cov_update_op has been run a
         # few times can result in "InvalidArgumentError: Cholesky decomposition
@@ -324,13 +459,18 @@ def minimize_loss_distributed(task_id, num_worker_tasks, num_ps_tasks, master,
   return accuracy_
 
 
-def train_mnist_single_machine(data_dir, num_epochs, use_fake_data=False):
+def train_mnist_single_machine(data_dir,
+                               num_epochs,
+                               use_fake_data=False,
+                               device="/gpu:0"):
   """Train a ConvNet on MNIST.
 
   Args:
     data_dir: string. Directory to read MNIST examples from.
     num_epochs: int. Number of passes to make over the training set.
     use_fake_data: bool. If True, generate a synthetic dataset.
+    device: string, Either '/cpu:0' or '/gpu:0'. The covaraince and inverse
+      update ops are run on this device.
 
   Returns:
     accuracy of model on the final minibatch of training data.
@@ -350,22 +490,38 @@ def train_mnist_single_machine(data_dir, num_epochs, use_fake_data=False):
       examples, labels, num_labels=10, layer_collection=layer_collection)
 
   # Fit model.
-  return minimize_loss_single_machine(loss, accuracy, layer_collection)
+  return minimize_loss_single_machine(
+      loss, accuracy, layer_collection, device=device)
 
 
 def train_mnist_multitower(data_dir, num_epochs, num_towers,
-                           use_fake_data=True):
+                           use_fake_data=True, devices=None):
   """Train a ConvNet on MNIST.
 
+  Training data is split equally among the towers. Each tower computes loss on
+  its own batch of data and the loss is aggregated on the CPU. The model
+  variables are placed on first tower. The covariance and inverse update ops
+  and variables are placed on GPUs in a round robin manner.
+
   Args:
     data_dir: string. Directory to read MNIST examples from.
     num_epochs: int. Number of passes to make over the training set.
     num_towers: int. Number of CPUs to split inference across.
     use_fake_data: bool. If True, generate a synthetic dataset.
+    devices: string, Either list of CPU or GPU. The covaraince and inverse
+      update ops are run on this device.
 
   Returns:
     accuracy of model on the final minibatch of training data.
   """
+  if devices:
+    device_count = {"GPU": num_towers}
+  else:
+    device_count = {"CPU": num_towers}
+
+  devices = devices or [
+      "/cpu:{}".format(tower_id) for tower_id in range(num_towers)
+  ]
   # Load a dataset.
   tf.logging.info("Loading MNIST into memory.")
   tower_batch_size = 128
@@ -388,7 +544,7 @@ def train_mnist_multitower(data_dir, num_epochs, num_towers,
   layer_collection = lc.LayerCollection()
   tower_results = []
   for tower_id in range(num_towers):
-    with tf.device("/cpu:%d" % tower_id):
+    with tf.device(devices[tower_id]):
       with tf.name_scope("tower%d" % tower_id):
         with tf.variable_scope(tf.get_variable_scope(), reuse=(tower_id > 0)):
           tf.logging.info("Building tower %d." % tower_id)
@@ -402,34 +558,79 @@ def train_mnist_multitower(data_dir, num_epochs, num_towers,
   accuracy = tf.reduce_mean(accuracies)
 
   # Fit model.
+
   session_config = tf.ConfigProto(
-      allow_soft_placement=False, device_count={
-          "CPU": num_towers
-      })
-  return minimize_loss_single_machine(
-      loss, accuracy, layer_collection, session_config=session_config)
+      allow_soft_placement=False,
+      device_count=device_count,
+  )
+
+  g_step = tf.train.get_or_create_global_step()
+  optimizer = opt.KfacOptimizer(
+      learning_rate=0.0001,
+      cov_ema_decay=0.95,
+      damping=0.001,
+      layer_collection=layer_collection,
+      placement_strategy="round_robin",
+      cov_devices=devices,
+      inv_devices=devices,
+      momentum=0.9)
+  (cov_update_thunks,
+   inv_update_thunks) = optimizer.make_vars_and_create_op_thunks()
 
+  train_op = optimizer.minimize(loss, global_step=g_step)
 
-def train_mnist_distributed(task_id,
-                            num_worker_tasks,
-                            num_ps_tasks,
-                            master,
-                            data_dir,
-                            num_epochs,
-                            use_fake_data=False):
-  """Train a ConvNet on MNIST.
+  def make_update_op(update_thunks):
+    update_op = [thunk() for thunk in update_thunks]
+    return tf.group(*update_op)
+
+  cov_update_op = make_update_op(cov_update_thunks)
+  with tf.control_dependencies([train_op, cov_update_op]):
+    inverse_op = tf.cond(
+        tf.equal(tf.mod(g_step + 1, _INVERT_EVERY), 0),
+        lambda: make_update_op(inv_update_thunks), tf.no_op)
+
+  tf.logging.info("Starting training.")
+  with tf.train.MonitoredTrainingSession(config=session_config) as sess:
+    while not sess.should_stop():
+      global_step_, loss_, accuracy_, _ = sess.run(
+          [g_step, loss, accuracy, inverse_op])
+
+      if (global_step_ + 1) % _INVERT_EVERY == 0:
+        tf.logging.info("global_step: %d | loss: %f | accuracy: %s",
+                        global_step_, loss_, accuracy_)
+
+
+def train_mnist_distributed_sync_replicas(task_id,
+                                          is_chief,
+                                          num_worker_tasks,
+                                          num_ps_tasks,
+                                          master,
+                                          data_dir,
+                                          num_epochs,
+                                          op_strategy,
+                                          use_fake_data=False):
+  """Train a ConvNet on MNIST using Sync replicas optimizer.
 
   Args:
     task_id: int. Integer in [0, num_worker_tasks). ID for this worker.
+    is_chief: `boolean`, `True` if the worker is chief worker.
     num_worker_tasks: int. Number of workers in this distributed training setup.
     num_ps_tasks: int. Number of parameter servers holding variables.
     master: string. IP and port of TensorFlow runtime process.
     data_dir: string. Directory to read MNIST examples from.
     num_epochs: int. Number of passes to make over the training set.
+    op_strategy: `string`, Strategy to run the covariance and inverse
+      ops. If op_strategy == `chief_worker` then covaraiance and inverse
+      update ops are run on chief worker otherwise they are run on dedicated
+      workers.
+
     use_fake_data: bool. If True, generate a synthetic dataset.
 
   Returns:
     accuracy of model on the final minibatch of training data.
+
+  Raises:
+    ValueError: If `op_strategy` not in ["chief_worker", "dedicated_workers"].
   """
   # Load a dataset.
   tf.logging.info("Loading MNIST into memory.")
@@ -448,9 +649,17 @@ def train_mnist_distributed(task_id,
 
   # Fit model.
   checkpoint_dir = None if data_dir is None else os.path.join(data_dir, "kfac")
-  return minimize_loss_distributed(task_id, num_worker_tasks, num_ps_tasks,
-                                   master, checkpoint_dir, loss, accuracy,
-                                   layer_collection)
+  if op_strategy == "chief_worker":
+    return distributed_grads_only_and_ops_chief_worker(
+        task_id, is_chief, num_worker_tasks, num_ps_tasks, master,
+        checkpoint_dir, loss, accuracy, layer_collection)
+  elif op_strategy == "dedicated_workers":
+    return distributed_grads_and_ops_dedicated_workers(
+        task_id, is_chief, num_worker_tasks, num_ps_tasks, master,
+        checkpoint_dir, loss, accuracy, layer_collection)
+  else:
+    raise ValueError("Only supported op strategies are : {}, {}".format(
+        "chief_worker", "dedicated_workers"))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/kfac/examples/convnet_mnist_distributed_main.py b/tensorflow/contrib/kfac/examples/convnet_mnist_distributed_main.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4c2d4a9e9bfcc4bfb55a25d2f23e66afe5b1375
--- /dev/null
+++ b/tensorflow/contrib/kfac/examples/convnet_mnist_distributed_main.py
@@ -0,0 +1,62 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+r"""Train a ConvNet on MNIST using K-FAC.
+
+Distributed training with sync replicas optimizer. See
+`convnet.train_mnist_distributed_sync_replicas` for details.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+from absl import flags
+import tensorflow as tf
+
+from tensorflow.contrib.kfac.examples import convnet
+
+FLAGS = flags.FLAGS
+flags.DEFINE_integer("task", -1, "Task identifier")
+flags.DEFINE_string("data_dir", "/tmp/mnist", "local mnist dir")
+flags.DEFINE_string(
+    "cov_inv_op_strategy", "chief_worker",
+    "In dist training mode run the cov, inv ops on chief or dedicated workers."
+)
+flags.DEFINE_string("master", "local", "Session master.")
+flags.DEFINE_integer("ps_tasks", 2,
+                     "Number of tasks in the parameter server job.")
+flags.DEFINE_integer("replicas_to_aggregate", 5,
+                     "Number of replicas to aggregate.")
+flags.DEFINE_integer("worker_replicas", 5, "Number of replicas in worker job.")
+flags.DEFINE_integer("num_epochs", None, "Number of epochs.")
+
+
+def _is_chief():
+  """Determines whether a job is the chief worker."""
+  if "chief_worker" in FLAGS.brain_jobs:
+    return FLAGS.brain_job_name == "chief_worker"
+  else:
+    return FLAGS.task == 0
+
+
+def main(unused_argv):
+  _ = unused_argv
+  convnet.train_mnist_distributed_sync_replicas(
+      FLAGS.task, _is_chief(), FLAGS.worker_replicas, FLAGS.ps_tasks,
+      FLAGS.master, FLAGS.data_dir, FLAGS.num_epochs, FLAGS.cov_inv_op_strategy)
+
+if __name__ == "__main__":
+  tf.app.run(main=main)
diff --git a/tensorflow/contrib/kfac/examples/convnet_mnist_main.py b/tensorflow/contrib/kfac/examples/convnet_mnist_multi_tower_main.py
similarity index 57%
rename from tensorflow/contrib/kfac/examples/convnet_mnist_main.py
rename to tensorflow/contrib/kfac/examples/convnet_mnist_multi_tower_main.py
index b0c6fbde198850c76af0bc1600dc23e926227229..4249bf8a8d9d3a5beb87d4140a55b0ee6eadbc64 100644
--- a/tensorflow/contrib/kfac/examples/convnet_mnist_main.py
+++ b/tensorflow/contrib/kfac/examples/convnet_mnist_multi_tower_main.py
@@ -14,44 +14,35 @@
 # ==============================================================================
 r"""Train a ConvNet on MNIST using K-FAC.
 
-See convnet.py for details.
+Multi tower training mode. See `convnet.train_mnist_multitower` for details.
 """
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import argparse
-import sys
 
+from absl import flags
 import tensorflow as tf
 
 from tensorflow.contrib.kfac.examples import convnet
 
-FLAGS = None
+FLAGS = flags.FLAGS
+flags.DEFINE_string("data_dir", "/tmp/multitower_1/mnist", "local mnist dir")
+flags.DEFINE_integer("num_towers", 2,
+                     "Number of towers for multi tower training.")
 
 
-def main(argv):
-  _ = argv
-
-  if FLAGS.num_towers > 1:
-    convnet.train_mnist_multitower(
-        FLAGS.data_dir, num_epochs=200, num_towers=FLAGS.num_towers)
-  else:
-    convnet.train_mnist_single_machine(FLAGS.data_dir, num_epochs=200)
+def main(unused_argv):
+  _ = unused_argv
+  assert FLAGS.num_towers > 1
+  devices = ["/gpu:{}".format(tower_id) for tower_id in range(FLAGS.num_towers)]
+  convnet.train_mnist_multitower(
+      FLAGS.data_dir,
+      num_epochs=200,
+      num_towers=FLAGS.num_towers,
+      devices=devices)
 
 
 if __name__ == "__main__":
-  parser = argparse.ArgumentParser()
-  parser.add_argument(
-      "--data_dir",
-      type=str,
-      default="/tmp/mnist",
-      help="Directory to store dataset in.")
-  parser.add_argument(
-      "--num_towers",
-      type=int,
-      default=1,
-      help="Number of CPUs to split minibatch across.")
-  FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
+  tf.app.run(main=main)
diff --git a/tensorflow/contrib/bayesflow/python/ops/metropolis_hastings.py b/tensorflow/contrib/kfac/examples/convnet_mnist_single_main.py
similarity index 63%
rename from tensorflow/contrib/bayesflow/python/ops/metropolis_hastings.py
rename to tensorflow/contrib/kfac/examples/convnet_mnist_single_main.py
index e7fcbc65ef379e84a140a06e020549f74f905a99..3aa52aff196fd2699559f80b0c226f470c94b2a3 100644
--- a/tensorflow/contrib/bayesflow/python/ops/metropolis_hastings.py
+++ b/tensorflow/contrib/kfac/examples/convnet_mnist_single_main.py
@@ -12,23 +12,28 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Functions to create a Markov Chain Monte Carlo Metropolis step."""
+r"""Train a ConvNet on MNIST using K-FAC.
+
+Train on single machine. See `convnet.train_mnist_single_machine` for details.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.bayesflow.python.ops.metropolis_hastings_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
 
-_allowed_symbols = [
-    'kernel',
-    'evolve',
-    'proposal_uniform',
-    'proposal_normal',
-]
+from absl import flags
+import tensorflow as tf
+
+from tensorflow.contrib.kfac.examples import convnet
+
+FLAGS = flags.FLAGS
+flags.DEFINE_string("data_dir", "/tmp/mnist", "local mnist dir")
+
+
+def main(unused_argv):
+  convnet.train_mnist_single_gpu(FLAGS.data_dir, num_epochs=200)
+
 
-remove_undocumented(__name__, _allowed_symbols)
+if __name__ == "__main__":
+  tf.app.run(main=main)
diff --git a/tensorflow/contrib/kfac/examples/tests/BUILD b/tensorflow/contrib/kfac/examples/tests/BUILD
index ce7da95c124beaed4773d68ce0d0c41f187f7c9d..ede7f183fe24f26bd86e232e831dea5f8ea1fdc4 100644
--- a/tensorflow/contrib/kfac/examples/tests/BUILD
+++ b/tensorflow/contrib/kfac/examples/tests/BUILD
@@ -50,15 +50,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/kfac/examples/tests/convnet_test.py b/tensorflow/contrib/kfac/examples/tests/convnet_test.py
index 8d86c2bb5150cd4bc8a2b21ba050e904929e0fe9..6de775cc79953ba548c766e861d6d88e0455a508 100644
--- a/tensorflow/contrib/kfac/examples/tests/convnet_test.py
+++ b/tensorflow/contrib/kfac/examples/tests/convnet_test.py
@@ -112,15 +112,16 @@ class ConvNetTest(tf.test.TestCase):
   def testMinimizeLossSingleMachine(self):
     with tf.Graph().as_default():
       loss, accuracy, layer_collection = self._build_toy_problem()
-      accuracy_ = convnet.minimize_loss_single_machine(loss, accuracy,
-                                                       layer_collection)
-      self.assertLess(accuracy_, 1.0)
+      accuracy_ = convnet.minimize_loss_single_machine(
+          loss, accuracy, layer_collection, device="/cpu:0")
+      self.assertLess(accuracy_, 2.0)
 
   def testMinimizeLossDistributed(self):
     with tf.Graph().as_default():
       loss, accuracy, layer_collection = self._build_toy_problem()
-      accuracy_ = convnet.minimize_loss_distributed(
+      accuracy_ = convnet.distributed_grads_only_and_ops_chief_worker(
           task_id=0,
+          is_chief=True,
           num_worker_tasks=1,
           num_ps_tasks=0,
           master="",
@@ -128,7 +129,7 @@ class ConvNetTest(tf.test.TestCase):
           loss=loss,
           accuracy=accuracy,
           layer_collection=layer_collection)
-      self.assertLess(accuracy_, 1.0)
+      self.assertLess(accuracy_, 2.0)
 
   def testTrainMnistSingleMachine(self):
     with tf.Graph().as_default():
@@ -138,7 +139,7 @@ class ConvNetTest(tf.test.TestCase):
       # but there are too few parameters for the model to effectively memorize
       # the training set the way an MLP can.
       convnet.train_mnist_single_machine(
-          data_dir=None, num_epochs=1, use_fake_data=True)
+          data_dir=None, num_epochs=1, use_fake_data=True, device="/cpu:0")
 
   def testTrainMnistMultitower(self):
     with tf.Graph().as_default():
@@ -149,13 +150,15 @@ class ConvNetTest(tf.test.TestCase):
   def testTrainMnistDistributed(self):
     with tf.Graph().as_default():
       # Ensure model training doesn't crash.
-      convnet.train_mnist_distributed(
+      convnet.train_mnist_distributed_sync_replicas(
           task_id=0,
+          is_chief=True,
           num_worker_tasks=1,
           num_ps_tasks=0,
           master="",
           data_dir=None,
           num_epochs=1,
+          op_strategy="chief_worker",
           use_fake_data=True)
 
 
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/BUILD b/tensorflow/contrib/kfac/python/kernel_tests/BUILD
index d1c449402a697dd5f8876c82a6682dde2d18b4df..2477d2bfc12c2df64a672fd457e9634009ccd129 100644
--- a/tensorflow/contrib/kfac/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/kfac/python/kernel_tests/BUILD
@@ -156,15 +156,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/estimator_test.py b/tensorflow/contrib/kfac/python/kernel_tests/estimator_test.py
index 30c5404e03910eedb48132b0d69b2eabb89a9149..f22dbcf21566297340f3b4158a810f6d03af12f5 100644
--- a/tensorflow/contrib/kfac/python/kernel_tests/estimator_test.py
+++ b/tensorflow/contrib/kfac/python/kernel_tests/estimator_test.py
@@ -23,7 +23,6 @@ import numpy as np
 from tensorflow.contrib.kfac.python.ops import estimator
 from tensorflow.contrib.kfac.python.ops import layer_collection as lc
 from tensorflow.contrib.kfac.python.ops import utils
-from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -40,30 +39,6 @@ from tensorflow.python.training import training_util
 _ALL_ESTIMATION_MODES = ["gradients", "empirical", "curvature_prop", "exact"]
 
 
-class DeviceContextGeneratorTest(test.TestCase):
-
-  def testNoDevice(self):
-    device_context_generator = estimator._DeviceContextGenerator(None)
-    with ops.device("/device:CPU:0"):  # This is what will be used
-      with device_context_generator():  # Does nothing
-        a = constant_op.constant([2.0], name="a")
-    self.assertEqual("/device:CPU:0", a.op.device)
-
-  def testTwoDevices(self):
-    device_context_generator = estimator._DeviceContextGenerator(
-        ["/device:GPU:0", "/device:GPU:1"])
-    with ops.device("/device:CPU:0"):  # Will be over-ridden by the inner scopes
-      with device_context_generator():
-        a = constant_op.constant([2.0], name="a")
-      with device_context_generator():
-        b = constant_op.constant([2.0], name="b")
-      with device_context_generator():
-        c = constant_op.constant([2.0], name="c")
-    self.assertEqual("/device:GPU:0", a.op.device)
-    self.assertEqual("/device:GPU:1", b.op.device)
-    self.assertEqual("/device:GPU:0", c.op.device)
-
-
 class EstimatorTest(test.TestCase):
 
   def setUp(self):
@@ -90,68 +65,98 @@ class EstimatorTest(test.TestCase):
   def testEstimatorInitManualRegistration(self):
     with self._graph.as_default():
       # We should be able to build an estimator for only the registered vars.
-      estimator.FisherEstimator([self.weights], 0.1, 0.2,
-                                self.layer_collection)
+      estimator.FisherEstimatorRoundRobin(
+          variables=[self.weights],
+          cov_ema_decay=0.1,
+          damping=0.2,
+          layer_collection=self.layer_collection
+      )
 
       # Check that we throw an error if we try to build an estimator for vars
       # that were not manually registered.
       with self.assertRaises(ValueError):
-        est = estimator.FisherEstimator([self.weights, self.bias], 0.1, 0.2,
-                                        self.layer_collection)
+        est = estimator.FisherEstimatorRoundRobin(
+            variables=[self.weights, self.bias],
+            cov_ema_decay=0.1,
+            damping=0.2,
+            layer_collection=self.layer_collection
+        )
         est.make_ops_and_vars()
 
       # Check that we throw an error if we don't include registered variables,
       # i.e. self.weights
       with self.assertRaises(ValueError):
-        est = estimator.FisherEstimator([], 0.1, 0.2, self.layer_collection)
+        est = estimator.FisherEstimatorRoundRobin(
+            variables=[],
+            cov_ema_decay=0.1,
+            damping=0.2,
+            layer_collection=self.layer_collection)
         est.make_ops_and_vars()
 
   @test.mock.patch.object(utils.SubGraph, "variable_uses", return_value=42)
   def testVariableWrongNumberOfUses(self, mock_uses):
     with self.assertRaises(ValueError):
-      est = estimator.FisherEstimator([self.weights], 0.1, 0.2,
-                                      self.layer_collection)
+      est = estimator.FisherEstimatorRoundRobin(
+          variables=[self.weights],
+          cov_ema_decay=0.1,
+          damping=0.2,
+          layer_collection=self.layer_collection)
       est.make_ops_and_vars()
 
   def testInvalidEstimationMode(self):
     with self.assertRaises(ValueError):
-      est = estimator.FisherEstimator([self.weights], 0.1, 0.2,
-                                      self.layer_collection,
-                                      estimation_mode="not_a_real_mode")
+      est = estimator.FisherEstimatorRoundRobin(
+          variables=[self.weights],
+          cov_ema_decay=0.1,
+          damping=0.2,
+          layer_collection=self.layer_collection,
+          estimation_mode="not_a_real_mode")
       est.make_ops_and_vars()
 
   def testGradientsModeBuild(self):
     with self._graph.as_default():
-      est = estimator.FisherEstimator([self.weights], 0.1, 0.2,
-                                      self.layer_collection,
-                                      estimation_mode="gradients")
+      est = estimator.FisherEstimatorRoundRobin(
+          variables=[self.weights],
+          cov_ema_decay=0.1,
+          damping=0.2,
+          layer_collection=self.layer_collection,
+          estimation_mode="gradients")
       est.make_ops_and_vars()
 
   def testEmpiricalModeBuild(self):
     with self._graph.as_default():
-      est = estimator.FisherEstimator([self.weights], 0.1, 0.2,
-                                      self.layer_collection,
-                                      estimation_mode="empirical")
+      est = estimator.FisherEstimatorRoundRobin(
+          variables=[self.weights],
+          cov_ema_decay=0.1,
+          damping=0.2,
+          layer_collection=self.layer_collection,
+          estimation_mode="empirical")
       est.make_ops_and_vars()
 
   def testCurvaturePropModeBuild(self):
     with self._graph.as_default():
-      est = estimator.FisherEstimator([self.weights], 0.1, 0.2,
-                                      self.layer_collection,
-                                      estimation_mode="curvature_prop")
+      est = estimator.FisherEstimatorRoundRobin(
+          variables=[self.weights],
+          cov_ema_decay=0.1,
+          damping=0.2,
+          layer_collection=self.layer_collection,
+          estimation_mode="curvature_prop")
       est.make_ops_and_vars()
 
   def testExactModeBuild(self):
     with self._graph.as_default():
-      est = estimator.FisherEstimator([self.weights], 0.1, 0.2,
-                                      self.layer_collection,
-                                      estimation_mode="exact")
+      est = estimator.FisherEstimatorRoundRobin(
+          variables=[self.weights],
+          cov_ema_decay=0.1,
+          damping=0.2,
+          layer_collection=self.layer_collection,
+          estimation_mode="exact")
       est.make_ops_and_vars()
 
   def test_cov_update_thunks(self):
     """Ensures covariance update ops run once per global_step."""
     with self._graph.as_default(), self.test_session() as sess:
-      fisher_estimator = estimator.FisherEstimator(
+      fisher_estimator = estimator.FisherEstimatorRoundRobin(
           variables=[self.weights],
           layer_collection=self.layer_collection,
           damping=0.2,
@@ -159,8 +164,8 @@ class EstimatorTest(test.TestCase):
 
       # Construct an op that executes one covariance update per step.
       global_step = training_util.get_or_create_global_step()
-      (cov_variable_thunks, cov_update_op_thunks,
-       _, _) = fisher_estimator.create_ops_and_vars_thunks()
+      (cov_variable_thunks, cov_update_op_thunks, _,
+       _) = fisher_estimator.create_ops_and_vars_thunks()
       for thunk in cov_variable_thunks:
         thunk()
       cov_matrices = [
@@ -198,10 +203,43 @@ class EstimatorTest(test.TestCase):
         sess.run(cov_update_op)
         sess.run(increment_global_step)
 
+  def test_round_robin_placement(self):
+    """Check if the ops and variables are placed on devices correctly."""
+    with self._graph.as_default():
+      fisher_estimator = estimator.FisherEstimatorRoundRobin(
+          variables=[self.weights],
+          layer_collection=self.layer_collection,
+          damping=0.2,
+          cov_ema_decay=0.0,
+          cov_devices=["/cpu:{}".format(i) for i in range(2)],
+          inv_devices=["/cpu:{}".format(i) for i in range(2)])
+
+      # Construct an op that executes one covariance update per step.
+      (cov_update_ops, _, inv_update_ops, _, _,
+       _) = fisher_estimator.make_ops_and_vars(scope="test")
+      self.assertEqual(cov_update_ops[0].device, "/device:CPU:0")
+      self.assertEqual(cov_update_ops[1].device, "/device:CPU:1")
+      self.assertEqual(inv_update_ops[0].device, "/device:CPU:0")
+      self.assertEqual(inv_update_ops[1].device, "/device:CPU:1")
+      cov_matrices = [
+          fisher_factor.get_cov()
+          for fisher_factor in self.layer_collection.get_factors()
+      ]
+      inv_matrices = [
+          matrix
+          for fisher_factor in self.layer_collection.get_factors()
+          for matrix in fisher_factor._matpower_by_exp_and_damping.values()
+      ]
+      self.assertEqual(cov_matrices[0].device, "/device:CPU:0")
+      self.assertEqual(cov_matrices[1].device, "/device:CPU:1")
+      # Inverse matrices need to be explicitly placed.
+      self.assertEqual(inv_matrices[0].device, "")
+      self.assertEqual(inv_matrices[1].device, "")
+
   def test_inv_update_thunks(self):
     """Ensures inverse update ops run once per global_step."""
     with self._graph.as_default(), self.test_session() as sess:
-      fisher_estimator = estimator.FisherEstimator(
+      fisher_estimator = estimator.FisherEstimatorRoundRobin(
           variables=[self.weights],
           layer_collection=self.layer_collection,
           damping=0.2,
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/fisher_blocks_test.py b/tensorflow/contrib/kfac/python/kernel_tests/fisher_blocks_test.py
index b70c700f0936c2d8a2eca6e0836a3ee4ffe4e46d..6eda6c31e34370fd2bea1192ebf777924824c8e3 100644
--- a/tensorflow/contrib/kfac/python/kernel_tests/fisher_blocks_test.py
+++ b/tensorflow/contrib/kfac/python/kernel_tests/fisher_blocks_test.py
@@ -63,7 +63,7 @@ class FullFBTest(test.TestCase):
       random_seed.set_random_seed(200)
       params = (array_ops.constant([1., 2.]), array_ops.constant(3.))
       block = fb.FullFB(lc.LayerCollection(), params)
-      block.register_additional_minibatch(32)
+      block.register_additional_tower(32)
 
       self.assertAllEqual(params, block.tensors_to_compute_grads())
 
@@ -72,7 +72,7 @@ class FullFBTest(test.TestCase):
       random_seed.set_random_seed(200)
       params = (array_ops.constant([1., 2.]), array_ops.constant(3.))
       block = fb.FullFB(lc.LayerCollection(), params)
-      block.register_additional_minibatch(32)
+      block.register_additional_tower(32)
 
       self.assertAllEqual(params, block.tensors_to_compute_grads())
 
@@ -81,7 +81,7 @@ class FullFBTest(test.TestCase):
       random_seed.set_random_seed(200)
       params = (array_ops.constant([1., 2.]), array_ops.constant(3.))
       block = fb.FullFB(lc.LayerCollection(), params)
-      block.register_additional_minibatch(32)
+      block.register_additional_tower(32)
 
       grads = (params[0]**2, math_ops.sqrt(params[1]))
       block.instantiate_factors(grads, 0.5)
@@ -91,7 +91,7 @@ class FullFBTest(test.TestCase):
       random_seed.set_random_seed(200)
       params = (array_ops.constant([1., 2.]), array_ops.constant(3.))
       block = fb.FullFB(lc.LayerCollection(), params)
-      block.register_additional_minibatch(32)
+      block.register_additional_tower(32)
       grads = (params[0]**2, math_ops.sqrt(params[1]))
       block.instantiate_factors((grads,), 0.5)
       block._factor.instantiate_cov_variables()
@@ -112,7 +112,7 @@ class FullFBTest(test.TestCase):
       random_seed.set_random_seed(200)
       params = array_ops.constant([[1.], [2.]])
       block = fb.FullFB(lc.LayerCollection(), params)
-      block.register_additional_minibatch(32)
+      block.register_additional_tower(32)
       grads = params**2
       block.instantiate_factors((grads,), 0.5)
       block._factor.instantiate_cov_variables()
@@ -133,7 +133,7 @@ class FullFBTest(test.TestCase):
       random_seed.set_random_seed(200)
       params = (array_ops.constant([1., 2.]), array_ops.constant(3.))
       block = fb.FullFB(lc.LayerCollection(), params)
-      block.register_additional_minibatch(32)
+      block.register_additional_tower(32)
       grads = (array_ops.constant([2., 3.]), array_ops.constant(4.))
       damping = 0.5
       block.instantiate_factors((grads,), damping)
@@ -163,7 +163,7 @@ class NaiveDiagonalFBTest(test.TestCase):
       random_seed.set_random_seed(200)
       params = (array_ops.constant([1., 2.]), array_ops.constant(3.))
       block = fb.NaiveDiagonalFB(lc.LayerCollection(), params)
-      block.register_additional_minibatch(32)
+      block.register_additional_tower(32)
 
       self.assertAllEqual(params, block.tensors_to_compute_grads())
 
@@ -172,7 +172,7 @@ class NaiveDiagonalFBTest(test.TestCase):
       random_seed.set_random_seed(200)
       params = (array_ops.constant([1., 2.]), array_ops.constant(3.))
       block = fb.NaiveDiagonalFB(lc.LayerCollection(), params)
-      block.register_additional_minibatch(32)
+      block.register_additional_tower(32)
 
       self.assertAllEqual(params, block.tensors_to_compute_grads())
 
@@ -181,7 +181,7 @@ class NaiveDiagonalFBTest(test.TestCase):
       random_seed.set_random_seed(200)
       params = (array_ops.constant([1., 2.]), array_ops.constant(3.))
       block = fb.NaiveDiagonalFB(lc.LayerCollection(), params)
-      block.register_additional_minibatch(32)
+      block.register_additional_tower(32)
 
       grads = (params[0]**2, math_ops.sqrt(params[1]))
       block.instantiate_factors(grads, 0.5)
@@ -191,7 +191,7 @@ class NaiveDiagonalFBTest(test.TestCase):
       random_seed.set_random_seed(200)
       params = (array_ops.constant([1., 2.]), array_ops.constant(3.))
       block = fb.NaiveDiagonalFB(lc.LayerCollection(), params)
-      block.register_additional_minibatch(32)
+      block.register_additional_tower(32)
       grads = (params[0]**2, math_ops.sqrt(params[1]))
       block.instantiate_factors((grads,), 0.5)
       block._factor.instantiate_cov_variables()
@@ -210,7 +210,7 @@ class NaiveDiagonalFBTest(test.TestCase):
       random_seed.set_random_seed(200)
       params = array_ops.constant([[1.], [2.]])
       block = fb.NaiveDiagonalFB(lc.LayerCollection(), params)
-      block.register_additional_minibatch(32)
+      block.register_additional_tower(32)
       grads = params**2
       block.instantiate_factors((grads,), 0.5)
       block._factor.instantiate_cov_variables()
@@ -228,7 +228,7 @@ class NaiveDiagonalFBTest(test.TestCase):
       random_seed.set_random_seed(200)
       params = (array_ops.constant([1., 2.]), array_ops.constant(3.))
       block = fb.NaiveDiagonalFB(lc.LayerCollection(), params)
-      block.register_additional_minibatch(32)
+      block.register_additional_tower(32)
       grads = (params[0]**2, math_ops.sqrt(params[1]))
       damping = 0.5
       block.instantiate_factors((grads,), damping)
@@ -324,8 +324,8 @@ class FullyConnectedDiagonalFBTest(test.TestCase):
 
     self.assertAllClose(expected_result, result)
 
-  def testRegisterAdditionalMinibatch(self):
-    """Ensure 1 big minibatch and 2 small minibatches are equivalent."""
+  def testRegisterAdditionalTower(self):
+    """Ensure 1 big tower and 2 small towers are equivalent."""
     multiply_result_big, multiply_inverse_result_big = self.runFisherBlockOps(
         self.w, [self.inputs], [self.outputs], [self.output_grads])
     multiply_result_small, multiply_inverse_result_small = (
@@ -376,7 +376,7 @@ class FullyConnectedDiagonalFBTest(test.TestCase):
       block = fb.FullyConnectedDiagonalFB(
           lc.LayerCollection(), has_bias=isinstance(params, (tuple, list)))
       for (i, o) in zip(inputs, outputs):
-        block.register_additional_minibatch(i, o)
+        block.register_additional_tower(i, o)
 
       block.instantiate_factors((output_grads,), damping=0.0)
       block._factor.instantiate_cov_variables()
@@ -402,7 +402,7 @@ class EmbeddingKFACFBTest(test.TestCase):
       # Add some examples.
       inputs = array_ops.constant([[0, 1], [1, 2], [2, 3]])
       outputs = array_ops.constant([[0.], [1.], [2.]])
-      block.register_additional_minibatch(inputs, outputs)
+      block.register_additional_tower(inputs, outputs)
 
       # Instantiate factor's variables. Ensure it doesn't fail.
       grads = outputs**2.
@@ -420,7 +420,7 @@ class EmbeddingKFACFBTest(test.TestCase):
       # Add some examples.
       inputs = array_ops.constant([[0, 1], [1, 2], [2, 3]])
       outputs = array_ops.constant([[0.], [1.], [2.]])
-      block.register_additional_minibatch(inputs, outputs)
+      block.register_additional_tower(inputs, outputs)
 
       # Instantiate factor's variables. Ensure it doesn't fail.
       grads = outputs**2.
@@ -461,7 +461,7 @@ class FullyConnectedKFACBasicFBTest(test.TestCase):
       inputs = array_ops.constant([1., 2.])
       outputs = array_ops.constant([3., 4.])
       block = fb.FullyConnectedKFACBasicFB(lc.LayerCollection())
-      block.register_additional_minibatch(inputs, outputs)
+      block.register_additional_tower(inputs, outputs)
 
       self.assertAllEqual([outputs], block.tensors_to_compute_grads())
 
@@ -471,7 +471,7 @@ class FullyConnectedKFACBasicFBTest(test.TestCase):
       inputs = array_ops.constant([[1., 2.], [3., 4.]])
       outputs = array_ops.constant([[3., 4.], [5., 6.]])
       block = fb.FullyConnectedKFACBasicFB(lc.LayerCollection(), has_bias=True)
-      block.register_additional_minibatch(inputs, outputs)
+      block.register_additional_tower(inputs, outputs)
 
       grads = outputs**2
       block.instantiate_factors(((grads,),), 0.5)
@@ -482,7 +482,7 @@ class FullyConnectedKFACBasicFBTest(test.TestCase):
       inputs = array_ops.constant([[1., 2.], [3., 4.]])
       outputs = array_ops.constant([[3., 4.], [5., 6.]])
       block = fb.FullyConnectedKFACBasicFB(lc.LayerCollection(), has_bias=False)
-      block.register_additional_minibatch(inputs, outputs)
+      block.register_additional_tower(inputs, outputs)
 
       grads = outputs**2
       block.instantiate_factors(((grads,),), 0.5)
@@ -493,7 +493,7 @@ class FullyConnectedKFACBasicFBTest(test.TestCase):
       inputs = array_ops.constant([[1., 2., 3.], [3., 4., 5.], [5., 6., 7.]])
       outputs = array_ops.constant([[3., 4.], [5., 6.]])
       block = fb.FullyConnectedKFACBasicFB(lc.LayerCollection(), has_bias=False)
-      block.register_additional_minibatch(inputs, outputs)
+      block.register_additional_tower(inputs, outputs)
       grads = outputs**2
       block.instantiate_factors(((grads,),), 0.5)
 
@@ -525,7 +525,7 @@ class FullyConnectedKFACBasicFBTest(test.TestCase):
       inputs = array_ops.constant([[1., 2.], [3., 4.]])
       outputs = array_ops.constant([[3., 4.], [5., 6.]])
       block = fb.FullyConnectedKFACBasicFB(lc.LayerCollection(), has_bias=False)
-      block.register_additional_minibatch(inputs, outputs)
+      block.register_additional_tower(inputs, outputs)
       grads = outputs**2
       block.instantiate_factors(((grads,),), 0.5)
       block._input_factor.instantiate_cov_variables()
@@ -553,7 +553,7 @@ class FullyConnectedKFACBasicFBTest(test.TestCase):
       outputs = array_ops.zeros([32, output_dim])
       params = array_ops.zeros([input_dim, output_dim])
       block = fb.FullyConnectedKFACBasicFB(lc.LayerCollection(), has_bias=False)
-      block.register_additional_minibatch(inputs, outputs)
+      block.register_additional_tower(inputs, outputs)
       grads = outputs**2
       damping = 0.  # This test is only valid without damping.
       block.instantiate_factors(((grads,),), damping)
@@ -689,8 +689,8 @@ class ConvDiagonalFBTest(test.TestCase):
 
     self.assertAllClose(expected_result, result, atol=1e-3)
 
-  def testRegisterAdditionalMinibatch(self):
-    """Ensure 1 big minibatch and 2 small minibatches are equivalent."""
+  def testRegisterAdditionalTower(self):
+    """Ensure 1 big tower and 2 small towers are equivalent."""
     multiply_result_big, multiply_inverse_result_big = self.runFisherBlockOps(
         self.w, [self.inputs], [self.outputs], [self.output_grads])
     multiply_result_small, multiply_inverse_result_small = (
@@ -751,7 +751,7 @@ class ConvDiagonalFBTest(test.TestCase):
       block = fb.ConvDiagonalFB(
           lc.LayerCollection(), params, strides=[1, 1, 1, 1], padding='SAME')
       for (i, o) in zip(inputs, outputs):
-        block.register_additional_minibatch(i, o)
+        block.register_additional_tower(i, o)
 
       block.instantiate_factors((output_grads,), damping=0.0)
       block._factor.instantiate_cov_variables()
@@ -775,7 +775,7 @@ class DepthwiseConvKFCBasicFBTest(test.TestCase):
       layer_collection = lc.LayerCollection()
       block = fb.DepthwiseConvKFCBasicFB(
           layer_collection, params=params, strides=[1, 1, 1, 1], padding='SAME')
-      block.register_additional_minibatch(inputs, outputs)
+      block.register_additional_tower(inputs, outputs)
       grads = outputs**2
       block.instantiate_factors(([grads],), 0.5)
 
@@ -788,7 +788,7 @@ class DepthwiseConvKFCBasicFBTest(test.TestCase):
       layer_collection = lc.LayerCollection()
       block = fb.DepthwiseConvKFCBasicFB(
           layer_collection, params=params, strides=[1, 1, 1, 1], padding='SAME')
-      block.register_additional_minibatch(inputs, outputs)
+      block.register_additional_tower(inputs, outputs)
       grads = outputs**2
       block.instantiate_factors(([grads],), 0.5)
       block._input_factor.instantiate_cov_variables()
@@ -825,7 +825,7 @@ class ConvKFCBasicFBTest(test.TestCase):
       outputs = random_ops.random_normal((2, 2, 2))
       block = fb.ConvKFCBasicFB(
           lc.LayerCollection(), params=params, padding='SAME')
-      block.register_additional_minibatch(inputs, outputs)
+      block.register_additional_tower(inputs, outputs)
 
       self.assertAllEqual([outputs], block.tensors_to_compute_grads())
 
@@ -843,7 +843,7 @@ class ConvKFCBasicFBTest(test.TestCase):
       outputs = random_ops.random_normal((2, 2, 2, 2))
       block = fb.ConvKFCBasicFB(
           lc.LayerCollection(), params=params, padding='SAME')
-      block.register_additional_minibatch(inputs, outputs)
+      block.register_additional_tower(inputs, outputs)
       grads = outputs**2
       block.instantiate_factors(((grads,),), 0.5)
       block._input_factor.instantiate_cov_variables()
@@ -874,7 +874,7 @@ class ConvKFCBasicFBTest(test.TestCase):
       outputs = random_ops.random_normal((2, 2, 2, 2))
       block = fb.ConvKFCBasicFB(
           lc.LayerCollection(), params=params, padding='SAME')
-      block.register_additional_minibatch(inputs, outputs)
+      block.register_additional_tower(inputs, outputs)
       self.assertFalse(block._has_bias)
       grads = outputs**2
       block.instantiate_factors(((grads,),), 0.5)
@@ -902,7 +902,7 @@ class ConvKFCBasicFBTest(test.TestCase):
       outputs = random_ops.random_normal((2, 2, 2, 2))
       block = fb.ConvKFCBasicFB(
           lc.LayerCollection(), params=params, padding='SAME')
-      block.register_additional_minibatch(inputs, outputs)
+      block.register_additional_tower(inputs, outputs)
       self.assertTrue(block._has_bias)
       grads = outputs**2
       block.instantiate_factors(((grads,),), 0.5)
@@ -930,7 +930,7 @@ class ConvKFCBasicFBTest(test.TestCase):
       outputs = array_ops.zeros((2, 2, 2, 2))
       block = fb.ConvKFCBasicFB(
           lc.LayerCollection(), params=params, padding='SAME')
-      block.register_additional_minibatch(inputs, outputs)
+      block.register_additional_tower(inputs, outputs)
       grads = outputs**2
       damping = 0.  # This test is only valid without damping.
       block.instantiate_factors(((grads,),), damping)
@@ -964,7 +964,7 @@ class FullyConnectedSeriesFBTest(test.TestCase):
       inputs = array_ops.constant([1., 2.])
       outputs = array_ops.constant([3., 4.])
       block = fb.FullyConnectedSeriesFB(lc.LayerCollection())
-      block.register_additional_minibatch([inputs], [outputs])
+      block.register_additional_tower([inputs], [outputs])
       self.assertAllEqual([[outputs]], block.tensors_to_compute_grads())
 
   def testInstantiateFactorsHasBias(self):
@@ -975,7 +975,7 @@ class FullyConnectedSeriesFBTest(test.TestCase):
       block = fb.FullyConnectedSeriesFB(
           lc.LayerCollection(),
           has_bias=True)
-      block.register_additional_minibatch([inputs], [outputs])
+      block.register_additional_tower([inputs], [outputs])
       grads = outputs**2
       block.instantiate_factors((((grads,),),), 0.5)
 
@@ -987,7 +987,7 @@ class FullyConnectedSeriesFBTest(test.TestCase):
       block = fb.FullyConnectedSeriesFB(
           lc.LayerCollection(),
           has_bias=False)
-      block.register_additional_minibatch([inputs], [outputs])
+      block.register_additional_tower([inputs], [outputs])
       grads = outputs**2
       block.instantiate_factors((((grads,),),), 0.5)
 
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py b/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py
index 16f02f1199ad8a404b0e6944fc89df32ce08609c..2a3592c53fdda488561e504ba2712aadc3214cc4 100644
--- a/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py
+++ b/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py
@@ -85,6 +85,12 @@ class FisherFactorTestingDummy(ff.FisherFactor):
   def instantiate_inv_variables(self):
     return NotImplementedError
 
+  def _num_towers(self):
+    raise NotImplementedError
+
+  def _get_data_device(self):
+    raise NotImplementedError
+
 
 class InverseProvidingFactorTestingDummy(ff.InverseProvidingFactor):
   """Dummy class to test the non-abstract methods on ff.InverseProvidingFactor.
@@ -116,6 +122,12 @@ class InverseProvidingFactorTestingDummy(ff.InverseProvidingFactor):
   def instantiate_covariance(self):
     pass
 
+  def _num_towers(self):
+    raise NotImplementedError
+
+  def _get_data_device(self):
+    raise NotImplementedError
+
 
 class NumericalUtilsTest(test.TestCase):
 
@@ -430,7 +442,7 @@ class EmbeddingInputKroneckerFactorTest(test.TestCase):
     with tf_ops.Graph().as_default():
       input_ids = array_ops.constant([[0], [1], [4]])
       vocab_size = 5
-      factor = ff.EmbeddingInputKroneckerFactor(input_ids, vocab_size)
+      factor = ff.EmbeddingInputKroneckerFactor((input_ids,), vocab_size)
       factor.instantiate_cov_variables()
       cov = factor.get_cov_var()
       self.assertEqual(cov.shape.as_list(), [vocab_size])
@@ -439,7 +451,7 @@ class EmbeddingInputKroneckerFactorTest(test.TestCase):
     with tf_ops.Graph().as_default():
       input_ids = array_ops.constant([[0], [1], [4]])
       vocab_size = 5
-      factor = ff.EmbeddingInputKroneckerFactor(input_ids, vocab_size)
+      factor = ff.EmbeddingInputKroneckerFactor((input_ids,), vocab_size)
       factor.instantiate_cov_variables()
       cov_update_op = factor.make_covariance_update_op(0.0)
 
@@ -477,8 +489,8 @@ class ConvDiagonalFactorTest(test.TestCase):
       ]
 
       factor = ff.ConvDiagonalFactor(
-          inputs,
-          outputs_grads,
+          (inputs,),
+          (outputs_grads,),
           self.kernel_shape,
           self.strides,
           self.padding,
@@ -508,7 +520,8 @@ class ConvDiagonalFactorTest(test.TestCase):
           self.out_channels)
 
       factor = ff.ConvDiagonalFactor(
-          constant_op.constant(inputs), [constant_op.constant(outputs_grad)],
+          (constant_op.constant(inputs),),
+          ((constant_op.constant(outputs_grad),),),
           self.kernel_shape,
           strides=[1, 1, 1, 1],
           padding='VALID')
@@ -537,8 +550,8 @@ class ConvDiagonalFactorTest(test.TestCase):
       ]
 
       factor = ff.ConvDiagonalFactor(
-          inputs,
-          outputs_grads,
+          (inputs,),
+          (outputs_grads,),
           self.kernel_shape,
           self.strides,
           self.padding,
@@ -569,7 +582,7 @@ class FullyConnectedKroneckerFactorTest(test.TestCase):
     with tf_ops.Graph().as_default():
       random_seed.set_random_seed(200)
       tensor = array_ops.ones((2, 3), dtype=dtype, name='a/b/c')
-      factor = ff.FullyConnectedKroneckerFactor((tensor,), has_bias=has_bias)
+      factor = ff.FullyConnectedKroneckerFactor(((tensor,),), has_bias=has_bias)
       factor.instantiate_cov_variables()
       cov = factor.get_cov()
       self.assertEqual(cov.dtype, dtype)
@@ -587,7 +600,7 @@ class FullyConnectedKroneckerFactorTest(test.TestCase):
     with tf_ops.Graph().as_default(), self.test_session() as sess:
       random_seed.set_random_seed(200)
       tensor = array_ops.constant([[1., 2.], [3., 4.]], name='a/b/c')
-      factor = ff.FullyConnectedKroneckerFactor((tensor,), has_bias=True)
+      factor = ff.FullyConnectedKroneckerFactor(((tensor,),), has_bias=True)
       factor.instantiate_cov_variables()
 
       sess.run(tf_variables.global_variables_initializer())
@@ -598,7 +611,7 @@ class FullyConnectedKroneckerFactorTest(test.TestCase):
     with tf_ops.Graph().as_default(), self.test_session() as sess:
       random_seed.set_random_seed(200)
       tensor = array_ops.constant([[1., 2.], [3., 4.]], name='a/b/c')
-      factor = ff.FullyConnectedKroneckerFactor((tensor,))
+      factor = ff.FullyConnectedKroneckerFactor(((tensor,),))
       factor.instantiate_cov_variables()
 
       sess.run(tf_variables.global_variables_initializer())
@@ -629,8 +642,8 @@ class ConvInputKroneckerFactorTest(ConvFactorTestCase):
       out_channels = 4
 
       factor = ff.ConvInputKroneckerFactor(
-          inputs=random_ops.random_uniform(
-              (batch_size, width, width, width, in_channels), seed=0),
+          inputs=(random_ops.random_uniform(
+              (batch_size, width, width, width, in_channels), seed=0),),
           filter_shape=(width, width, width, in_channels, out_channels),
           padding='SAME',
           strides=(2, 2, 2),
@@ -661,8 +674,8 @@ class ConvInputKroneckerFactorTest(ConvFactorTestCase):
       out_channels = 4
 
       factor = ff.ConvInputKroneckerFactor(
-          inputs=random_ops.random_uniform(
-              (batch_size, width, width, in_channels), seed=0),
+          inputs=(random_ops.random_uniform(
+              (batch_size, width, width, in_channels), seed=0),),
           filter_shape=(1, 1, in_channels, out_channels),
           padding='SAME',
           strides=(1, 1, 1, 1),
@@ -691,8 +704,8 @@ class ConvInputKroneckerFactorTest(ConvFactorTestCase):
       out_channels = 4
 
       factor = ff.ConvInputKroneckerFactor(
-          inputs=random_ops.random_uniform(
-              (batch_size, width, width, in_channels), seed=0),
+          inputs=(random_ops.random_uniform(
+              (batch_size, width, width, in_channels), seed=0),),
           filter_shape=(1, 1, in_channels, out_channels),
           padding='SAME',
           strides=(1, 2, 1, 1),
@@ -716,8 +729,8 @@ class ConvInputKroneckerFactorTest(ConvFactorTestCase):
       out_channels = 4
 
       factor = ff.ConvInputKroneckerFactor(
-          inputs=random_ops.random_uniform(
-              (batch_size, width, width, in_channels), seed=0),
+          inputs=(random_ops.random_uniform(
+              (batch_size, width, width, in_channels), seed=0),),
           filter_shape=(3, 3, in_channels, out_channels),
           padding='SAME',
           extract_patches_fn='extract_image_patches',
@@ -739,7 +752,7 @@ class ConvInputKroneckerFactorTest(ConvFactorTestCase):
     with tf_ops.Graph().as_default():
       tensor = array_ops.ones((64, 1, 2, 3), name='a/b/c')
       factor = ff.ConvInputKroneckerFactor(
-          inputs=tensor,
+          inputs=(tensor,),
           filter_shape=(1, 2, 3, 4),
           padding='SAME',
           has_bias=False)
@@ -751,7 +764,7 @@ class ConvInputKroneckerFactorTest(ConvFactorTestCase):
     with tf_ops.Graph().as_default():
       tensor = array_ops.ones((64, 1, 2, 3), name='a/b/c')
       factor = ff.ConvInputKroneckerFactor(
-          tensor, filter_shape=(1, 2, 3, 4), padding='SAME', has_bias=True)
+          (tensor,), filter_shape=(1, 2, 3, 4), padding='SAME', has_bias=True)
       factor.instantiate_cov_variables()
       self.assertEqual([1 * 2 * 3 + 1, 1 * 2 * 3 + 1],
                        factor.get_cov().get_shape().as_list())
@@ -761,7 +774,7 @@ class ConvInputKroneckerFactorTest(ConvFactorTestCase):
       dtype = dtypes.float64_ref
       tensor = array_ops.ones((64, 1, 2, 3), name='a/b/c', dtype=dtypes.float64)
       factor = ff.ConvInputKroneckerFactor(
-          tensor, filter_shape=(1, 2, 3, 4), padding='SAME', has_bias=True)
+          (tensor,), filter_shape=(1, 2, 3, 4), padding='SAME', has_bias=True)
       factor.instantiate_cov_variables()
       cov = factor.get_cov()
       self.assertEqual(cov.dtype, dtype)
@@ -775,7 +788,7 @@ class ConvInputKroneckerFactorTest(ConvFactorTestCase):
           np.arange(1, 1 + np.prod(input_shape)).reshape(input_shape).astype(
               np.float32))
       factor = ff.ConvInputKroneckerFactor(
-          tensor, filter_shape=(1, 1, 1, 1), padding='SAME', has_bias=True)
+          (tensor,), filter_shape=(1, 1, 1, 1), padding='SAME', has_bias=True)
       factor.instantiate_cov_variables()
 
       sess.run(tf_variables.global_variables_initializer())
@@ -794,7 +807,7 @@ class ConvInputKroneckerFactorTest(ConvFactorTestCase):
           np.arange(1, 1 + np.prod(input_shape)).reshape(input_shape).astype(
               np.float32))
       factor = ff.ConvInputKroneckerFactor(
-          tensor, filter_shape=(1, 1, 1, 1), padding='SAME')
+          (tensor,), filter_shape=(1, 1, 1, 1), padding='SAME')
       factor.instantiate_cov_variables()
 
       sess.run(tf_variables.global_variables_initializer())
@@ -810,10 +823,10 @@ class ConvOutputKroneckerFactorTest(ConvFactorTestCase):
       width = 3
       out_channels = width**3
 
-      factor = ff.ConvOutputKroneckerFactor(outputs_grads=[
+      factor = ff.ConvOutputKroneckerFactor(outputs_grads=([
           random_ops.random_uniform(
               (batch_size, width, width, width, out_channels), seed=0)
-      ])
+      ],))
       factor.instantiate_cov_variables()
 
       with self.test_session() as sess:
@@ -829,7 +842,7 @@ class ConvOutputKroneckerFactorTest(ConvFactorTestCase):
     with tf_ops.Graph().as_default():
       random_seed.set_random_seed(200)
       tensor = array_ops.ones((2, 3, 4, 5), name='a/b/c')
-      factor = ff.ConvOutputKroneckerFactor((tensor,))
+      factor = ff.ConvOutputKroneckerFactor(((tensor,),))
       factor.instantiate_cov_variables()
       self.assertEqual([5, 5], factor.get_cov().get_shape().as_list())
 
@@ -838,7 +851,7 @@ class ConvOutputKroneckerFactorTest(ConvFactorTestCase):
       dtype = dtypes.float64_ref
       random_seed.set_random_seed(200)
       tensor = array_ops.ones((2, 3, 4, 5), dtype=dtype, name='a/b/c')
-      factor = ff.ConvOutputKroneckerFactor((tensor,))
+      factor = ff.ConvOutputKroneckerFactor(((tensor,),))
       factor.instantiate_cov_variables()
       cov = factor.get_cov()
       self.assertEqual(cov.dtype, dtype)
@@ -848,7 +861,7 @@ class ConvOutputKroneckerFactorTest(ConvFactorTestCase):
     with tf_ops.Graph().as_default(), self.test_session() as sess:
       random_seed.set_random_seed(200)
       tensor = np.arange(1, 17).reshape(2, 2, 2, 2).astype(np.float32)
-      factor = ff.ConvOutputKroneckerFactor((array_ops.constant(tensor),))
+      factor = ff.ConvOutputKroneckerFactor(((array_ops.constant(tensor),),))
       factor.instantiate_cov_variables()
 
       sess.run(tf_variables.global_variables_initializer())
@@ -862,8 +875,7 @@ class FullyConnectedMultiKFTest(test.TestCase):
     with tf_ops.Graph().as_default():
       random_seed.set_random_seed(200)
       tensor = array_ops.ones((2, 3), name='a/b/c')
-      tensor_list = [tensor]
-      factor = ff.FullyConnectedMultiKF((tensor_list,), has_bias=False)
+      factor = ff.FullyConnectedMultiKF(((tensor,),), has_bias=False)
       factor.instantiate_cov_variables()
       self.assertEqual([3, 3], factor.get_cov().get_shape().as_list())
 
@@ -872,8 +884,7 @@ class FullyConnectedMultiKFTest(test.TestCase):
       dtype = dtypes.float64_ref
       random_seed.set_random_seed(200)
       tensor = array_ops.ones((2, 3), dtype=dtype, name='a/b/c')
-      tensor_list = [tensor]
-      factor = ff.FullyConnectedMultiKF((tensor_list,), has_bias=False)
+      factor = ff.FullyConnectedMultiKF(((tensor,),), has_bias=False)
       factor.instantiate_cov_variables()
       cov = factor.get_cov()
       self.assertEqual(cov.dtype, dtype)
@@ -883,8 +894,7 @@ class FullyConnectedMultiKFTest(test.TestCase):
     with tf_ops.Graph().as_default(), self.test_session() as sess:
       random_seed.set_random_seed(200)
       tensor = array_ops.constant([[1., 2.], [3., 4.]], name='a/b/c')
-      tensor_list = [tensor]
-      factor = ff.FullyConnectedMultiKF((tensor_list,), has_bias=True)
+      factor = ff.FullyConnectedMultiKF(((tensor,),), has_bias=True)
       factor.instantiate_cov_variables()
 
       sess.run(tf_variables.global_variables_initializer())
@@ -895,8 +905,7 @@ class FullyConnectedMultiKFTest(test.TestCase):
     with tf_ops.Graph().as_default(), self.test_session() as sess:
       random_seed.set_random_seed(200)
       tensor = array_ops.constant([[1., 2.], [3., 4.]], name='a/b/c')
-      tensor_list = [tensor]
-      factor = ff.FullyConnectedMultiKF((tensor_list,))
+      factor = ff.FullyConnectedMultiKF(((tensor,),))
       factor.instantiate_cov_variables()
 
       sess.run(tf_variables.global_variables_initializer())
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/layer_collection_test.py b/tensorflow/contrib/kfac/python/kernel_tests/layer_collection_test.py
index bae6bd7a3bd47bc50378afe95d26d57535377f6f..cb80fca3705308f92e308e2a840336fb72d0fa62 100644
--- a/tensorflow/contrib/kfac/python/kernel_tests/layer_collection_test.py
+++ b/tensorflow/contrib/kfac/python/kernel_tests/layer_collection_test.py
@@ -35,7 +35,7 @@ from tensorflow.python.platform import test
 class MockFisherBlock(object):
   """A fake FisherBlock."""
 
-  num_registered_minibatches = 2
+  num_registered_towers = 2
 
   def __init__(self, name='MockFisherBlock'):
     self.name = name
@@ -135,8 +135,22 @@ class LayerCollectionTest(test.TestCase):
           array_ops.constant(6),
           16,
           approx=layer_collection.APPROX_DIAGONAL_NAME)
+      lc.register_fully_connected_multi(
+          array_ops.constant(1),
+          (array_ops.constant(2), array_ops.constant(3)),
+          (array_ops.constant(4), array_ops.constant(5)))
+      lc.register_conv2d_multi(
+          params=array_ops.ones((2, 3, 4, 5)),
+          strides=[1, 1, 1, 1],
+          padding='SAME',
+          inputs=(array_ops.ones((1, 2, 3, 4)), array_ops.ones((5, 6, 7, 8))),
+          outputs=(array_ops.ones((1, 1, 1, 5)), array_ops.ones((2, 2, 2, 10))))
+      lc.register_embedding_multi(
+          array_ops.constant((1,)),
+          (array_ops.constant(2), array_ops.constant(3)),
+          (array_ops.constant(4), array_ops.constant(5)))
 
-      self.assertEqual(9, len(lc.get_blocks()))
+      self.assertEqual(12, len(lc.get_blocks()))
 
   def testRegisterBlocksMultipleRegistrations(self):
     with ops.Graph().as_default():
@@ -454,13 +468,13 @@ class LayerCollectionTest(test.TestCase):
       b = variable_scope.get_variable('b', [3])
       lc = layer_collection.LayerCollection()
       lc.register_fully_connected(w, inputs, outputs)
-      self.assertEqual(lc.fisher_blocks[w].num_registered_minibatches, 1)
+      self.assertEqual(lc.fisher_blocks[w].num_registered_towers, 1)
       with self.assertRaises(KeyError):
         lc.register_fully_connected((w, b), inputs, outputs, reuse=True)
       self.assertNotIn((w, b), lc.fisher_blocks)
-      self.assertEqual(lc.fisher_blocks[w].num_registered_minibatches, 1)
+      self.assertEqual(lc.fisher_blocks[w].num_registered_towers, 1)
       lc.register_fully_connected(w, inputs, outputs, reuse=True)
-      self.assertEqual(lc.fisher_blocks[w].num_registered_minibatches, 2)
+      self.assertEqual(lc.fisher_blocks[w].num_registered_towers, 2)
 
   def testMakeOrGetFactor(self):
     with ops.Graph().as_default():
diff --git a/tensorflow/contrib/kfac/python/ops/BUILD b/tensorflow/contrib/kfac/python/ops/BUILD
index c26230c2a82ae9529ab13b523b9ec287d17debaf..b897fd68a080e819042cd36f2a1acfcf175e656b 100644
--- a/tensorflow/contrib/kfac/python/ops/BUILD
+++ b/tensorflow/contrib/kfac/python/ops/BUILD
@@ -171,6 +171,7 @@ py_library(
     name = "fisher_estimator",
     srcs = [
         "estimator.py",
+        "placement.py",
     ],
     srcs_version = "PY2AND3",
     deps = [
@@ -180,6 +181,7 @@ py_library(
         "//tensorflow/python:gradients",
         "//tensorflow/python:util",
         "//third_party/py/numpy",
+        "@six_archive//:six",
     ],
 )
 
@@ -242,15 +244,3 @@ py_library(
         "//tensorflow/python:util",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/kfac/python/ops/estimator.py b/tensorflow/contrib/kfac/python/ops/estimator.py
index 64755be65c4b5686397dbfd798fec1ed70ae61dc..ced1110676754b6c8bba813ace743b3f3daddb26 100644
--- a/tensorflow/contrib/kfac/python/ops/estimator.py
+++ b/tensorflow/contrib/kfac/python/ops/estimator.py
@@ -18,11 +18,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import contextlib
-import itertools
-
+import abc
 import numpy as np
+import six
 
+from tensorflow.contrib.kfac.python.ops import placement
 from tensorflow.contrib.kfac.python.ops import utils
 from tensorflow.python.framework import ops as tf_ops
 from tensorflow.python.ops import control_flow_ops
@@ -31,63 +31,46 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.util import nest
 
 
-class _DeviceContextGenerator(object):
-  """Class for generating device contexts in a round-robin fashion."""
-
-  def __init__(self, devices):
-    """Creates a _DeviceContextGenerator object.
-
-    Example usage:
+# The linter is confused.
+# pylint: disable=abstract-class-instantiated
+def make_fisher_estimator(placement_strategy=None, **kwargs):
+  """Creates Fisher estimator instances based on the placement strategy.
 
-    ```python
-    dcg = _DeviceContextGenerator(['/gpu:0', 'gpu:1'])
-    with dcg():
-      # All operations in this context will be placed on GPU 0
-      ...
-    with dcg():
-      # All operations in this context will be placed on GPU 1
-      ...
-    ```
-
-    Args:
-      devices: An iterable of device strings (or None). Successive calls to
-          __call__ will give contexts which place devices on these devices in
-          a round-robin fashion.
-    """
-    self._cycle = None if devices is None else itertools.cycle(devices)
+  For example if the `placement_strategy` is 'round_robin' then
+  `FisherEstimatorRoundRobin` instance is returned.
 
-  @contextlib.contextmanager
-  def __call__(self):
-    """Returns a context manager specifying the default device."""
-    if self._cycle is None:
-      yield
-    else:
-      with tf_ops.device(next(self._cycle)):
-        yield
+  Args:
+    placement_strategy: `string`, Strategy to be used for placing covariance
+      variables, covariance ops and inverse ops. Check
+      `placement.FisherEstimatorRoundRobin` for a concrete example.
+   **kwargs: Arguments to be passed into `FisherEstimator` class initializer.
 
+  Returns:
+    An instance of class which inherits from `FisherEstimator` and the mixin
+    which implements specific placement strategy. See,
+    `FisherEstimatorRoundRobin` which inherits from `FisherEstimator` and
+    `RoundRobinPlacementMixin`.
 
-def _make_thunk_on_device(func, device):
-  def thunk():
-    with tf_ops.device(device):
-      return func()
-  return thunk
+  Raises:
+    ValueError: If the `placement_strategy` is not equal to 'round_robin'.
+  """
+  if placement_strategy in [None, "round_robin"]:
+    return FisherEstimatorRoundRobin(**kwargs)
+  else:
+    raise ValueError("Unimplemented vars and ops placement strategy : %s",
+                     placement_strategy)
+# pylint: enable=abstract-class-instantiated
 
 
+@six.add_metaclass(abc.ABCMeta)
 class FisherEstimator(object):
   """Fisher estimator class supporting various approximations of the Fisher.
 
-  Attributes:
-    cov_update_thunks: list of no-arg functions. Executing a function adds
-      covariance update ops for a single FisherFactor to the graph.
-    cov_update_ops: List of Ops. Running an op updates covariance matrices for a
-      single FisherFactor.
-    cov_update_op: Op. Running updates covariance matrices for all
-      FisherFactors.
-    inv_update_thunks: list of no-arg functions.  Executing a function adds
-      inverse update ops for a single FisherFactor to the graph.
-    inv_update_ops: List of Ops. Running an op updates inverse matrices for a
-      single FisherFactor.
-    inv_update_op: Op. Running updates inverse matrices for all FisherFactors.
+  This is an abstract base class which does not implement a strategy for
+  placing covariance variables, covariance update ops and inverse update ops.
+  The placement strategies are implemented in `placement.py`. See
+  `FisherEstimatorRoundRobin` for example of a concrete subclass with
+  a round-robin placement strategy.
   """
 
   def __init__(self,
@@ -184,6 +167,77 @@ class FisherEstimator(object):
   def name(self):
     return self._name
 
+  @abc.abstractmethod
+  def make_ops_and_vars(self, scope=None):
+    """Make ops and vars with a specific placement strategy.
+
+    For each factor, all of that factor's cov variables and their associated
+    update ops will be placed on a particular device.  For example in case of
+    round robin placement a new device is chosen for each factor by cycling
+    through list of devices in the cov_devices argument. If cov_devices is None
+    then no explicit device placement occurs.
+
+    An analogous strategy is followed for inverse update ops, with the list of
+    devices being given by the inv_devices argument.
+
+    Inverse variables on the other hand are not placed on any specific device
+    (they will just use the current the device placement context, whatever
+    that happens to be).  The idea is that the inverse variable belong where
+    they will be accessed most often, which is the device that actually applies
+    the preconditioner to the gradient. The user will be responsible for setting
+    the device context for this.
+
+    Args:
+      scope: A string or None.  If None it will be set to the name of this
+        estimator (given by the name property). All variables will be created,
+        and all ops will execute, inside of a variable scope of the given
+        name. (Default: None)
+
+    Returns:
+      cov_update_ops: List of ops that compute the cov updates. Corresponds
+        one-to-one with the list of factors given by the "factors" property.
+      cov_update_op: cov_update_ops grouped into a single op.
+      inv_update_ops: List of ops that compute the inv updates. Corresponds
+        one-to-one with the list of factors given by the "factors" property.
+      inv_update_op: inv_update_ops grouped into a single op.
+      cov_update_thunks: Thunks that make the ops in cov_update_ops.
+      inv_update_thunks: Thunks that make the ops in inv_update_ops.
+    """
+    pass
+
+  @abc.abstractmethod
+  def make_vars_and_create_op_thunks(self, scope=None):
+    """Make vars and create op thunks with a specific placement strategy.
+
+    For each factor, all of that factor's cov variables and their associated
+    update ops will be placed on a particular device.  A new device is chosen
+    for each factor by cycling through list of devices in the cov_devices
+    argument. If cov_devices is None then no explicit device placement occurs.
+
+    An analogous strategy is followed for inverse update ops, with the list of
+    devices being given by the inv_devices argument.
+
+    Inverse variables on the other hand are not placed on any specific device
+    (they will just use the current the device placement context, whatever
+    that happens to be).  The idea is that the inverse variable belong where
+    they will be accessed most often, which is the device that actually applies
+    the preconditioner to the gradient. The user will be responsible for setting
+    the device context for this.
+
+    Args:
+      scope: A string or None.  If None it will be set to the name of this
+        estimator (given by the name property). All variables will be created,
+        and all thunks will execute, inside of a variable scope of the given
+        name. (Default: None)
+
+    Returns:
+      cov_update_thunks: List of cov update thunks. Corresponds one-to-one with
+        the list of factors given by the "factors" property.
+      inv_update_thunks: List of inv update thunks. Corresponds one-to-one with
+        the list of factors given by the "factors" property.
+    """
+    pass
+
   def _apply_transformation(self, vecs_and_vars, transform):
     """Applies an block-wise transformation to the corresponding vectors.
 
@@ -286,158 +340,6 @@ class FisherEstimator(object):
     self._instantiate_factors()
     self._register_matrix_functions()
 
-  def make_ops_and_vars(self, scope=None):
-    """Make ops and vars with no specific device placement.
-
-    See make_ops_and_vars_round_robin for further details.
-
-    Args:
-      scope: A string or None.  If None it will be set to the name of this
-        estimator (given by the name property). All variables will be created,
-        and all ops will execute, inside of a variable scope of the given
-        name. (Default: None)
-    Returns:
-      cov_update_ops: List of ops that compute the cov updates. Corresponds
-        one-to-one with the list of factors given by the "factors" property.
-      cov_update_op: cov_update_ops grouped into a single op.
-      inv_update_ops: List of ops that compute the inv updates. Corresponds
-        one-to-one with the list of factors given by the "factors" property.
-      inv_update_op: inv_update_ops grouped into a single op.
-      cov_update_thunks: Thunks that make the ops in cov_update_ops.
-      inv_update_thunks: Thunks that make the ops in inv_update_ops.
-    """
-    return self.make_ops_and_vars_round_robin(scope=scope)
-
-  # TODO(b/70674513): Factor device placement outside of this class.
-  def make_ops_and_vars_round_robin(self, scope=None, cov_devices=None,
-                                    inv_devices=None):
-    """Make ops and vars with a round-robin device placement strategy.
-
-    For each factor, all of that factor's cov variables and their associated
-    update ops will be placed on a particular device.  A new device is chosen
-    for each factor by cycling through list of devices in the cov_devices
-    argument. If cov_devices is None then no explicit device placement occurs.
-
-    An analogous strategy is followed for inverse update ops, with the list of
-    devices being given by the inv_devices argument.
-
-    Inverse variables on the other hand are not placed on any specific device
-    (they will just use the current the device placement context, whatever
-    that happens to be).  The idea is that the inverse variable belong where
-    they will be accessed most often, which is the device that actually applies
-    the preconditioner to the gradient. The user will be responsible for setting
-    the device context for this.
-
-    Args:
-      scope: A string or None.  If None it will be set to the name of this
-        estimator (given by the name property). All variables will be created,
-        and all ops will execute, inside of a variable scope of the given
-        name. (Default: None)
-      cov_devices: Iterable of device strings (e.g. '/gpu:0'). Covariance
-        computations will be placed on these devices in a round-robin fashion.
-        Can be None, which means that no devices are specified.
-      inv_devices: Iterable of device strings (e.g. '/gpu:0'). Inversion
-        computations will be placed on these devices in a round-robin fashion.
-        Can be None, which means that no devices are specified.
-
-    Returns:
-      cov_update_ops: List of ops that compute the cov updates. Corresponds
-        one-to-one with the list of factors given by the "factors" property.
-      cov_update_op: cov_update_ops grouped into a single op.
-      inv_update_ops: List of ops that compute the inv updates. Corresponds
-        one-to-one with the list of factors given by the "factors" property.
-      inv_update_op: inv_update_ops grouped into a single op.
-      cov_update_thunks: Thunks that make the ops in cov_update_ops.
-      inv_update_thunks: Thunks that make the ops in inv_update_ops.
-    """
-    (cov_update_thunks,
-     inv_update_thunks) = self.make_vars_and_create_op_thunks_round_robin(
-         scope=scope,
-         cov_devices=cov_devices,
-         inv_devices=inv_devices)
-    cov_update_ops = [thunk() for thunk in cov_update_thunks]
-    inv_update_ops = [thunk() for thunk in inv_update_thunks]
-
-    scope = self.name if scope is None else scope
-    with variable_scope.variable_scope(scope):
-      cov_update_op = control_flow_ops.group(cov_update_ops,
-                                             name="cov_update_op")
-      inv_update_op = control_flow_ops.group(inv_update_ops,
-                                             name="inv_update_op")
-
-    return (cov_update_ops, cov_update_op, inv_update_ops, inv_update_op,
-            cov_update_thunks, inv_update_thunks)
-
-  def make_vars_and_create_op_thunks_round_robin(self,
-                                                 scope=None,
-                                                 cov_devices=None,
-                                                 inv_devices=None):
-    """Make vars and create op thunks w/ a round-robin device placement strat.
-
-    For each factor, all of that factor's cov variables and their associated
-    update ops will be placed on a particular device.  A new device is chosen
-    for each factor by cycling through list of devices in the cov_devices
-    argument. If cov_devices is None then no explicit device placement occurs.
-
-    An analogous strategy is followed for inverse update ops, with the list of
-    devices being given by the inv_devices argument.
-
-    Inverse variables on the other hand are not placed on any specific device
-    (they will just use the current the device placement context, whatever
-    that happens to be).  The idea is that the inverse variable belong where
-    they will be accessed most often, which is the device that actually applies
-    the preconditioner to the gradient. The user will be responsible for setting
-    the device context for this.
-
-    Args:
-      scope: A string or None.  If None it will be set to the name of this
-        estimator (given by the name property). All variables will be created,
-        and all thunks will execute, inside of a variable scope of the given
-        name. (Default: None)
-      cov_devices: Iterable of device strings (e.g. '/gpu:0'). Covariance
-        computations will be placed on these devices in a round-robin fashion.
-        Can be None, which means that no devices are specified.
-      inv_devices: Iterable of device strings (e.g. '/gpu:0'). Inversion
-        computations will be placed on these devices in a round-robin fashion.
-        Can be None, which means that no devices are specified.
-    Returns:
-      cov_update_thunks: List of cov update thunks. Corresponds one-to-one with
-        the list of factors given by the "factors" property.
-      inv_update_thunks: List of inv update thunks. Corresponds one-to-one with
-        the list of factors given by the "factors" property.
-    """
-
-    (cov_variable_thunks_raw, cov_update_thunks_raw, inv_variable_thunks_raw,
-     inv_update_thunks_raw) = self.create_ops_and_vars_thunks(scope=scope)
-
-    if cov_devices:
-      cov_update_thunks = []
-      for cov_variable_thunk, cov_update_thunk, device in zip(
-          cov_variable_thunks_raw, cov_update_thunks_raw,
-          itertools.cycle(cov_devices)):
-        with tf_ops.device(device):
-          cov_variable_thunk()
-        cov_update_thunks.append(_make_thunk_on_device(cov_update_thunk,
-                                                       device))
-    else:
-      for cov_variable_thunk in cov_variable_thunks_raw:
-        cov_variable_thunk()
-      cov_update_thunks = cov_update_thunks_raw
-
-    for inv_variable_thunk in inv_variable_thunks_raw:
-      inv_variable_thunk()
-
-    if inv_devices:
-      inv_update_thunks = []
-      for inv_update_thunk, device in zip(inv_update_thunks_raw,
-                                          itertools.cycle(inv_devices)):
-        inv_update_thunks.append(_make_thunk_on_device(inv_update_thunk,
-                                                       device))
-    else:
-      inv_update_thunks = inv_update_thunks_raw
-
-    return cov_update_thunks, inv_update_thunks
-
   def create_ops_and_vars_thunks(self, scope=None):
     """Create thunks that make the ops and vars on demand.
 
@@ -582,3 +484,9 @@ class FisherEstimator(object):
               colocate_gradients_with_ops=self._colocate_gradients_with_ops)
           grads_all.append(nest.pack_sequence_as(tensors, grads_flat))
     return zip(*grads_all)
+
+
+class FisherEstimatorRoundRobin(placement.RoundRobinPlacementMixin,
+                                FisherEstimator):
+  """Fisher estimator which provides round robin device placement strategy."""
+  pass
diff --git a/tensorflow/contrib/kfac/python/ops/fisher_blocks.py b/tensorflow/contrib/kfac/python/ops/fisher_blocks.py
index 31f4689fbfbbf13872c237913a37478f3c2debe0..00b3673a742e92057b0a1673d3f42a19379111fe 100644
--- a/tensorflow/contrib/kfac/python/ops/fisher_blocks.py
+++ b/tensorflow/contrib/kfac/python/ops/fisher_blocks.py
@@ -19,11 +19,11 @@ Information matrix. Suppose one has a model that parameterizes a posterior
 distribution over 'y' given 'x' with parameters 'params', p(y | x, params). Its
 Fisher Information matrix is given by,
 
-  F(params) = E[ v(x, y, params) v(x, y, params)^T ]
+  $$F(params) = E[ v(x, y, params) v(x, y, params)^T ]$$
 
 where,
 
-  v(x, y, params) = (d / d params) log p(y | x, params)
+  $$v(x, y, params) = (d / d params) log p(y | x, params)$$
 
 and the expectation is taken with respect to the data's distribution for 'x' and
 the model's posterior distribution for 'y',
@@ -48,6 +48,7 @@ from tensorflow.contrib.kfac.python.ops import utils
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.util import nest
 
 # For blocks corresponding to convolutional layers, or any type of block where
 # the parameters can be thought of as being replicated in time or space,
@@ -84,7 +85,7 @@ def normalize_damping(damping, num_replications):
 def compute_pi_tracenorm(left_cov, right_cov):
   """Computes the scalar constant pi for Tikhonov regularization/damping.
 
-  pi = sqrt( (trace(A) / dim(A)) / (trace(B) / dim(B)) )
+  $$\pi = \sqrt{ (trace(A) / dim(A)) / (trace(B) / dim(B)) }$$
   See section 6.3 of https://arxiv.org/pdf/1503.05671.pdf for details.
 
   Args:
@@ -159,7 +160,7 @@ class FisherBlock(object):
   """Abstract base class for objects modeling approximate Fisher matrix blocks.
 
   Subclasses must implement register_matpower, multiply_matpower,
-  instantiate_factors, tensors_to_compute_grads, and num_registered_minibatches
+  instantiate_factors, tensors_to_compute_grads, and num_registered_towers
   methods.
   """
 
@@ -234,8 +235,8 @@ class FisherBlock(object):
     pass
 
   @abc.abstractproperty
-  def num_registered_minibatches(self):
-    """Number of minibatches registered for this FisherBlock.
+  def num_registered_towers(self):
+    """Number of towers registered for this FisherBlock.
 
     Typically equal to the number of towers in a multi-tower setup.
     """
@@ -287,8 +288,8 @@ class FullFB(FisherBlock):
   def tensors_to_compute_grads(self):
     return self._params
 
-  def register_additional_minibatch(self, batch_size):
-    """Register an additional minibatch.
+  def register_additional_tower(self, batch_size):
+    """Register an additional tower.
 
     Args:
       batch_size: The batch size, used in the covariance estimator.
@@ -296,7 +297,7 @@ class FullFB(FisherBlock):
     self._batch_sizes.append(batch_size)
 
   @property
-  def num_registered_minibatches(self):
+  def num_registered_towers(self):
     return len(self._batch_sizes)
 
   @property
@@ -349,8 +350,8 @@ class NaiveDiagonalFB(FisherBlock):
   def tensors_to_compute_grads(self):
     return self._params
 
-  def register_additional_minibatch(self, batch_size):
-    """Register an additional minibatch.
+  def register_additional_tower(self, batch_size):
+    """Register an additional tower.
 
     Args:
       batch_size: The batch size, used in the covariance estimator.
@@ -358,7 +359,7 @@ class NaiveDiagonalFB(FisherBlock):
     self._batch_sizes.append(batch_size)
 
   @property
-  def num_registered_minibatches(self):
+  def num_registered_towers(self):
     return len(self._batch_sizes)
 
   @property
@@ -366,24 +367,78 @@ class NaiveDiagonalFB(FisherBlock):
     return math_ops.reduce_sum(self._batch_sizes)
 
 
-class InputOutputMultiMinibatch(object):
+class InputOutputMultiTower(object):
   """Mix-in class for blocks with inputs & outputs and multiple mini-batches."""
 
   def __init__(self, *args, **kwargs):
     self.__inputs = []
     self.__outputs = []
-    super(InputOutputMultiMinibatch, self).__init__(*args, **kwargs)
+    super(InputOutputMultiTower, self).__init__(*args, **kwargs)
+
+  def _process_data(self, grads_list):
+    """Process data into the format used by the factors.
+
+    This function takes inputs and grads_lists data and processes it into
+    one of the formats expected by the FisherFactor classes (depending on
+    the value of the global configuration variable TOWER_STRATEGY).
+
+    The initial format of self._inputs is expected to be a list of Tensors
+    over towers. Similarly grads_lists is expected to be a list over sources
+    of such lists.
+
+    If TOWER_STRATEGY is "concat", 'inputs' becomes a tuple containing a single
+    tensor (represented as a PartitionedTensor object) equal to the
+    concatenation (across towers) of all of the elements of self._inputs. And
+    similarly grads_list is formatted into a tuple (over sources) of such
+    tensors (also represented as PartitionedTensors).
+
+    If TOWER_STRATEGY is "separate", formatting of inputs and grads_list
+    remains unchanged from the initial format (although possibly converting
+    from lists into tuples).
+
+    Args:
+      grads_list: grads_list in its initial format (see above).
+
+    Returns:
+      inputs: self._inputs transformed into the appropriate format (see
+        above).
+      grads_list: grads_list transformed into the appropriate format (see
+        above).
+
+    Raises:
+      ValueError: if TOWER_STRATEGY is not one of "separate" or "concat".
+    """
+    inputs = self._inputs
+    # inputs is a list over towers of Tensors
+    # grads_list is a list of list with the first index being sources and the
+    # second being towers.
+    if fisher_factors.TOWER_STRATEGY == "concat":
+      # Merge towers together into a PartitionedTensor. We package it in
+      # a singleton tuple since the factors will expect a list over towers
+      inputs = (utils.PartitionedTensor(inputs),)
+      # Do the same for grads_list but preserve leading sources dimension
+      grads_list = tuple((utils.PartitionedTensor(grads),)
+                         for grads in grads_list)
+    elif fisher_factors.TOWER_STRATEGY == "separate":
+      inputs = tuple(inputs)
+      grads_list = tuple(grads_list)
+
+    else:
+      raise ValueError("Global config variable TOWER_STRATEGY must be one of "
+                       "'concat' or 'separate'.")
+
+    return inputs, grads_list
 
   def tensors_to_compute_grads(self):
     """Tensors to compute derivative of loss with respect to."""
-    return self._outputs
+    return tuple(self._outputs)
 
-  def register_additional_minibatch(self, inputs, outputs):
+  def register_additional_tower(self, inputs, outputs):
     self._inputs.append(inputs)
     self._outputs.append(outputs)
 
   @property
-  def num_registered_minibatches(self):
+  def num_registered_towers(self):
     result = len(self._inputs)
     assert result == len(self._outputs)
     return result
@@ -396,59 +451,8 @@ class InputOutputMultiMinibatch(object):
   def _outputs(self):
     return self.__outputs
 
-  def _package_minibatches(self, grads_list):
-    """Constructs PartitionedTensor for inputs, grads_list.
-
-    The purpose of this method is to package up the towers/minibatch dimension
-    of these arrays into PartitionedTensor objects.
-
-    Args:
-      grads_list: 2-D list of Tensors. First index is for source, second
-        index for tower.
-
-    Returns:
-      inputs: PartitionedTensor.
-      grads_list: Tuple of PartitionedTensors, one per source.
-    """
-    inputs = utils.PartitionedTensor(self._inputs)
-    grads_list = tuple(utils.PartitionedTensor(grads) for grads in grads_list)
-
-    return inputs, grads_list
 
-  def _package_minibatches_multi(self, grads_list):
-    """Constructs PartitionedTensors for inputs, grads_list.
-
-    The purpose of this method is to package up the towers/minibatch dimension
-    of these arrays into PartitionedTensor objects.
-
-    This version of this function is for use with FisherBlocks that deal with
-    multiple uses or time-steps. One PartitionedTensor is created for each
-    use/time-step.
-
-    Args:
-      grads_list: 3-D tuple of Tensors. First index is for source, second
-        index is for tower, third is for use/time-step.
-
-    Returns:
-      inputs: A tuple of PartitionedTensor's, one per use/time-step.
-      grads_list: 2-D tuple of PartitionedTensors. First index is for source,
-        second is for use/time-step.
-    """
-    # self._inputs is a 2-D tuple.  First index is tower/mini-batch, second is
-    # use/time-step.
-    inputs = self._inputs
-    num_uses = len(inputs[0])
-    assert all(len(input_) == num_uses for input_ in inputs)
-    assert all(len(grad) == num_uses for grads in grads_list for grad in grads)
-
-    inputs = tuple(utils.PartitionedTensor(input_) for input_ in zip(*inputs))
-    grads_list = tuple(tuple(utils.PartitionedTensor(grad)
-                             for grad in zip(*grads)) for grads in grads_list)
-
-    return inputs, grads_list
-
-
-class FullyConnectedDiagonalFB(InputOutputMultiMinibatch, FisherBlock):
+class FullyConnectedDiagonalFB(InputOutputMultiTower, FisherBlock):
   """FisherBlock for fully-connected (dense) layers using a diagonal approx.
 
   Estimates the Fisher Information matrix's diagonal entries for a fully
@@ -458,14 +462,14 @@ class FullyConnectedDiagonalFB(InputOutputMultiMinibatch, FisherBlock):
   Let 'params' be a vector parameterizing a model and 'i' an arbitrary index
   into it. We are interested in Fisher(params)[i, i]. This is,
 
-    Fisher(params)[i, i] = E[ v(x, y, params) v(x, y, params)^T ][i, i]
-                         = E[ v(x, y, params)[i] ^ 2 ]
+    $$Fisher(params)[i, i] = E[ v(x, y, params) v(x, y, params)^T ][i, i]
+                         = E[ v(x, y, params)[i] ^ 2 ]$$
 
   Consider fully connected layer in this model with (unshared) weight matrix
   'w'. For an example 'x' that produces layer inputs 'a' and output
   preactivations 's',
 
-    v(x, y, w) = vec( a (d loss / d s)^T )
+    $$v(x, y, w) = vec( a (d loss / d s)^T )$$
 
   This FisherBlock tracks Fisher(params)[i, i] for all indices 'i' corresponding
   to the layer's parameters 'w'.
@@ -485,7 +489,7 @@ class FullyConnectedDiagonalFB(InputOutputMultiMinibatch, FisherBlock):
     super(FullyConnectedDiagonalFB, self).__init__(layer_collection)
 
   def instantiate_factors(self, grads_list, damping):
-    inputs, grads_list = self._package_minibatches(grads_list)
+    inputs, grads_list = self._process_data(grads_list)
 
     self._factor = self._layer_collection.make_or_get_factor(
         fisher_factors.FullyConnectedDiagonalFactor,
@@ -518,7 +522,7 @@ class FullyConnectedDiagonalFB(InputOutputMultiMinibatch, FisherBlock):
     return utils.mat2d_to_layer_params(vector, reshaped_out)
 
 
-class ConvDiagonalFB(InputOutputMultiMinibatch, FisherBlock):
+class ConvDiagonalFB(InputOutputMultiTower, FisherBlock):
   """FisherBlock for 2-D convolutional layers using a diagonal approx.
 
   Estimates the Fisher Information matrix's diagonal entries for a convolutional
@@ -528,14 +532,14 @@ class ConvDiagonalFB(InputOutputMultiMinibatch, FisherBlock):
   Let 'params' be a vector parameterizing a model and 'i' an arbitrary index
   into it. We are interested in Fisher(params)[i, i]. This is,
 
-    Fisher(params)[i, i] = E[ v(x, y, params) v(x, y, params)^T ][i, i]
-                         = E[ v(x, y, params)[i] ^ 2 ]
+    $$Fisher(params)[i, i] = E[ v(x, y, params) v(x, y, params)^T ][i, i]
+                         = E[ v(x, y, params)[i] ^ 2 ]$$
 
   Consider a convoluational layer in this model with (unshared) filter matrix
   'w'. For an example image 'x' that produces layer inputs 'a' and output
   preactivations 's',
 
-    v(x, y, w) = vec( sum_{loc} a_{loc} (d loss / d s_{loc})^T )
+    $$v(x, y, w) = vec( sum_{loc} a_{loc} (d loss / d s_{loc})^T )$$
 
   where 'loc' is a single (x, y) location in an image.
 
@@ -598,10 +602,10 @@ class ConvDiagonalFB(InputOutputMultiMinibatch, FisherBlock):
     super(ConvDiagonalFB, self).__init__(layer_collection)
 
   def instantiate_factors(self, grads_list, damping):
-    inputs, grads_list = self._package_minibatches(grads_list)
+    inputs, grads_list = self._process_data(grads_list)
 
     # Infer number of locations upon which convolution is applied.
-    self._num_locations = num_conv_locations(inputs.shape.as_list(),
+    self._num_locations = num_conv_locations(inputs[0].shape.as_list(),
                                              self._strides)
 
     self._factor = self._layer_collection.make_or_get_factor(
@@ -630,7 +634,7 @@ class ConvDiagonalFB(InputOutputMultiMinibatch, FisherBlock):
 
 
 class KroneckerProductFB(FisherBlock):
-  """A base class for FisherBlocks with separate input and output factors.
+  """A base class for blocks with separate input and output Kronecker factors.
 
   The Fisher block is approximated as a Kronecker product of the input and
   output factors.
@@ -708,10 +712,10 @@ class KroneckerProductFB(FisherBlock):
                                                         right_factor)
 
 
-class EmbeddingKFACFB(InputOutputMultiMinibatch, KroneckerProductFB):
+class EmbeddingKFACFB(InputOutputMultiTower, KroneckerProductFB):
   """K-FAC FisherBlock for embedding layers.
 
-  This FisherBlock is similar to EmbeddingKFACFB, except that its
+  This FisherBlock is similar to FullyConnectedKFACBasicFB, except that its
   input factor is approximated by a diagonal matrix. In the case that each
   example references exactly one embedding, this approximation is exact.
 
@@ -740,18 +744,17 @@ class EmbeddingKFACFB(InputOutputMultiMinibatch, KroneckerProductFB):
       damping: 0-D Tensor or float. 'damping' * identity is approximately added
         to this FisherBlock's Fisher approximation.
     """
-    inputs, grads_list = self._package_minibatches(grads_list)
+    inputs, grads_list = self._process_data(grads_list)
 
-    self._input_factor = self._layer_collection.make_or_get_factor(  #
-        fisher_factors.EmbeddingInputKroneckerFactor,  #
+    self._input_factor = self._layer_collection.make_or_get_factor(
+        fisher_factors.EmbeddingInputKroneckerFactor,
         (inputs, self._vocab_size))
-    self._output_factor = self._layer_collection.make_or_get_factor(  #
-        fisher_factors.FullyConnectedKroneckerFactor,  #
-        (grads_list,))
+    self._output_factor = self._layer_collection.make_or_get_factor(
+        fisher_factors.FullyConnectedKroneckerFactor, (grads_list,))
     self._setup_damping(damping)
 
 
-class FullyConnectedKFACBasicFB(InputOutputMultiMinibatch, KroneckerProductFB):
+class FullyConnectedKFACBasicFB(InputOutputMultiTower, KroneckerProductFB):
   """K-FAC FisherBlock for fully-connected (dense) layers.
 
   This uses the Kronecker-factorized approximation from the original
@@ -781,18 +784,18 @@ class FullyConnectedKFACBasicFB(InputOutputMultiMinibatch, KroneckerProductFB):
       damping: 0-D Tensor or float. 'damping' * identity is approximately added
         to this FisherBlock's Fisher approximation.
     """
-    inputs, grads_list = self._package_minibatches(grads_list)
+    inputs, grads_list = self._process_data(grads_list)
 
-    self._input_factor = self._layer_collection.make_or_get_factor(  #
-        fisher_factors.FullyConnectedKroneckerFactor,  #
+    self._input_factor = self._layer_collection.make_or_get_factor(
+        fisher_factors.FullyConnectedKroneckerFactor,
         ((inputs,), self._has_bias))
-    self._output_factor = self._layer_collection.make_or_get_factor(  #
-        fisher_factors.FullyConnectedKroneckerFactor,  #
+    self._output_factor = self._layer_collection.make_or_get_factor(
+        fisher_factors.FullyConnectedKroneckerFactor,
         (grads_list,))
     self._setup_damping(damping)
 
 
-class ConvKFCBasicFB(InputOutputMultiMinibatch, KroneckerProductFB):
+class ConvKFCBasicFB(InputOutputMultiTower, KroneckerProductFB):
   """FisherBlock for convolutional layers using the basic KFC approx.
 
   Estimates the Fisher Information matrix's blog for a convolutional
@@ -802,12 +805,12 @@ class ConvKFCBasicFB(InputOutputMultiMinibatch, KroneckerProductFB):
   'w'. For a minibatch that produces inputs 'a' and output preactivations 's',
   this FisherBlock estimates,
 
-    F(w) = #locations * kronecker(E[flat(a) flat(a)^T],
-                                  E[flat(ds) flat(ds)^T])
+    $$F(w) = \#locations * kronecker(E[flat(a) flat(a)^T],
+                                  E[flat(ds) flat(ds)^T])$$
 
   where
 
-    ds = (d / ds) log p(y | x, w)
+    $$ds = (d / ds) log p(y | x, w)$$
     #locations = number of (x, y) locations where 'w' is applied.
 
   where the expectation is taken over all examples and locations and flat()
@@ -858,10 +861,10 @@ class ConvKFCBasicFB(InputOutputMultiMinibatch, KroneckerProductFB):
     super(ConvKFCBasicFB, self).__init__(layer_collection)
 
   def instantiate_factors(self, grads_list, damping):
-    inputs, grads_list = self._package_minibatches(grads_list)
+    inputs, grads_list = self._process_data(grads_list)
 
     # Infer number of locations upon which convolution is applied.
-    self._num_locations = num_conv_locations(self._inputs[0].shape.as_list(),
+    self._num_locations = num_conv_locations(inputs[0].shape.as_list(),
                                              self._strides)
 
     self._input_factor = self._layer_collection.make_or_get_factor(
@@ -1137,42 +1140,327 @@ def num_conv_locations(input_shape, strides):
   return spatial_input_locations // spatial_strides_divisor
 
 
-class FullyConnectedMultiIndepFB(InputOutputMultiMinibatch, KroneckerProductFB):
+class InputOutputMultiTowerMultiUse(InputOutputMultiTower):
+  """Adds methods for multi-use/time-step case to InputOutputMultiTower."""
+
+  def __init__(self, num_uses=None, *args, **kwargs):
+    self._num_uses = num_uses
+    super(InputOutputMultiTowerMultiUse, self).__init__(*args, **kwargs)
+
+  def _process_data(self, grads_list):
+    """Process temporal/multi-use data into the format used by the factors.
+
+    This function takes inputs and grads_lists data and processes it into
+    one of the formats expected by the FisherFactor classes (depending on
+    the value of the global configuration variable TOWER_STRATEGY).
+
+    It accepts the data in one of two initial formats. The first possible
+    format is where self._inputs is a list of list of Tensors. The first index
+    is tower, the second is use/time-step. grads_list, meanwhile, is a list
+    over sources of such lists of lists.
+
+    The second possible data format is where self._inputs is a Tensor with
+    uses/times-steps folded into the batch dimension.  i.e. it is a Tensor
+    of shape [num_uses * size_batch, ...] which represents a reshape of a
+    Tensor of shape [num_uses, size_batch, ...].  And similarly grads_list is
+    a list over sources of such Tensors.
+
+    There are two possible formats which inputs and grads_list are transformed
+    into.
+
+    If TOWER_STRATEGY is "concat", 'inputs' becomes a tuple containing
+    a single tensor (represented as a PartitionedTensor object) with all of
+    the data from the towers, as well as the uses/time-steps, concatenated
+    together. In this tensor the leading dimension is the batch and
+    use/time-step dimensions folded together (with 'use' being the major of
+    these two, so that the tensors can be thought of as reshapes of ones of
+    shape [num_uses, batch_size, ...]). grads_list is similarly formatted as a
+    tuple over sources of such tensors.
+
+    If TOWER_STRATEGY is "separate" the inputs are formatted into lists of
+    tensors over towers. Each of these tensors has a similar format to
+    the tensor produced by the "concat" option, except that each contains
+    only the data from a single tower.  grads_list is similarly formatted
+    into a tuple over sources of such tuples.
+
+    Args:
+      grads_list: grads_list in its initial format (see above).
+
+    Returns:
+      inputs: self._inputs transformed into the appropriate format (see
+        above).
+      grads_list: grads_list transformed into the appropriate format (see
+        above).
+
+    Raises:
+      ValueError: If TOWER_STRATEGY is not one of "separate" or "concat".
+      ValueError: If the given/initial format of self._inputs and grads_list
+        isn't recognized, or doesn't agree with self._num_uses.
+    """
+
+    inputs = self._inputs
+
+    if isinstance(inputs[0], (list, tuple)):
+      num_uses = len(inputs[0])
+      if self._num_uses is not None and self._num_uses != num_uses:
+        raise ValueError("num_uses argument doesn't match length of inputs.")
+      else:
+        self._num_uses = num_uses
+
+      # Check that all mini-batches/towers have the same number of uses
+      if not all(len(input_) == num_uses for input_ in inputs):
+        raise ValueError("Length of inputs argument is inconsistent across "
+                         "towers.")
+
+      if fisher_factors.TOWER_STRATEGY == "concat":
+        # Reverse the tower and use/time-step indices, so that use is now first,
+        # and towers is second
+        inputs = tuple(zip(*inputs))
+
+        # Flatten the two dimensions
+        inputs = nest.flatten(inputs)
+
+        # Merge everything together into a PartitionedTensor. We package it in
+        # a singleton tuple since the factors will expect a list over towers
+        inputs = (utils.PartitionedTensor(inputs),)
+
+      elif fisher_factors.TOWER_STRATEGY == "separate":
+        # Merge together the uses/time-step dimension into PartitionedTensors,
+        # but keep the leading dimension (towers) intact for the factors to
+        # process individually.
+        inputs = tuple(utils.PartitionedTensor(input_) for input_ in inputs)
+
+      else:
+        raise ValueError("Global config variable TOWER_STRATEGY must be one of "
+                         "'concat' or 'separate'.")
+
+    # Now we perform the analogous processing for grads_list
+    if isinstance(grads_list[0][0], (list, tuple)):
+      num_uses = len(grads_list[0][0])
+      if self._num_uses is not None and self._num_uses != num_uses:
+        raise ValueError("num_uses argument doesn't match length of outputs, "
+                         "or length of outputs is inconsistent with length of "
+                         "inputs.")
+      else:
+        self._num_uses = num_uses
+
+      if not all(len(grad) == num_uses for grads in grads_list
+                 for grad in grads):
+        raise ValueError("Length of outputs argument is inconsistent across "
+                         "towers.")
+
+      if fisher_factors.TOWER_STRATEGY == "concat":
+        # Reverse the tower and use/time-step indices, so that use is now first,
+        # and towers is second
+        grads_list = tuple(tuple(zip(*grads)) for grads in grads_list)
+
+        # Flatten the two dimensions, leaving the leading dimension (source)
+        # intact
+        grads_list = tuple(nest.flatten(grads) for grads in grads_list)
+
+        # Merge inner dimensions together into PartitionedTensors. We package
+        # them in a singleton tuple since the factors will expect a list over
+        # towers
+        grads_list = tuple((utils.PartitionedTensor(grads),)
+                           for grads in grads_list)
+
+      elif fisher_factors.TOWER_STRATEGY == "separate":
+        # Merge together the uses/time-step dimension into PartitionedTensors,
+        # but keep the leading dimension (towers) intact for the factors to
+        # process individually.
+        grads_list = tuple(tuple(utils.PartitionedTensor(grad)
+                                 for grad in grads)
+                           for grads in grads_list)
+
+      else:
+        raise ValueError("Global config variable TOWER_STRATEGY must be one of "
+                         "'concat' or 'separate'.")
+
+    if self._num_uses is None:
+      raise ValueError("You must supply a value for the num_uses argument if "
+                       "the number of uses cannot be inferred from inputs or "
+                       "outputs arguments (e.g. if they are both given in the "
+                       "single Tensor format, instead of as lists of Tensors.")
+
+    return inputs, grads_list
+
+
+class FullyConnectedMultiIndepFB(InputOutputMultiTowerMultiUse,
+                                 KroneckerProductFB):
   """FisherBlock for fully-connected layers that share parameters.
+
+  This class implements the "independence across time" approximation from the
+  following paper:
+    https://openreview.net/pdf?id=HyMTkQZAb
   """
 
-  def __init__(self, layer_collection, has_bias=False):
+  def __init__(self, layer_collection, has_bias=False, num_uses=None):
     """Creates a FullyConnectedMultiIndepFB block.
 
     Args:
       layer_collection: LayerCollection instance.
       has_bias: bool. If True, estimates Fisher with respect to a bias
         parameter as well as the layer's parameters.
+      num_uses: int or None. Number of uses of the layer in the model's graph.
+        Only required if the data is formatted with uses/time folded into the
+        batch dimension (instead of uses/time being a list dimension).
+        (Default: None)
     """
     self._has_bias = has_bias
 
-    super(FullyConnectedMultiIndepFB, self).__init__(layer_collection)
+    super(FullyConnectedMultiIndepFB, self).__init__(
+        layer_collection=layer_collection,
+        num_uses=num_uses)
 
   def instantiate_factors(self, grads_list, damping):
-
-    self._num_uses = float(len(self._inputs[0]))
-    inputs, grads_list = self._package_minibatches_multi(grads_list)
+    inputs, grads_list = self._process_data(grads_list)
 
     self._input_factor = self._layer_collection.make_or_get_factor(
         fisher_factors.FullyConnectedMultiKF,
-        ((inputs,), self._has_bias))
+        ((inputs,), self._num_uses, self._has_bias))
 
     self._output_factor = self._layer_collection.make_or_get_factor(
-        fisher_factors.FullyConnectedMultiKF, (grads_list,))
+        fisher_factors.FullyConnectedMultiKF, (grads_list, self._num_uses))
 
     self._setup_damping(damping, normalization=self._num_uses)
 
   @property
   def _renorm_coeff(self):
-    return self._num_uses
+    return float(self._num_uses)
 
-  def tensors_to_compute_grads(self):
-    return self._outputs
+
+class ConvKFCBasicMultiIndepFB(InputOutputMultiTowerMultiUse,
+                               KroneckerProductFB):
+  """FisherBlock for 2D convolutional layers using the basic KFC approx.
+
+  Similar to ConvKFCBasicFB except that this version supports multiple
+  uses/time-steps via a standard independence approximation.  Similar to the
+  "independence across time" used in FullyConnectedMultiIndepFB but generalized
+  in the obvious way to conv layers.
+  """
+
+  def __init__(self,
+               layer_collection,
+               params,
+               padding,
+               strides=None,
+               dilation_rate=None,
+               data_format=None,
+               extract_patches_fn=None,
+               num_uses=None):
+    """Creates a ConvKFCBasicMultiIndepFB block.
+
+    Args:
+      layer_collection: The collection of all layers in the K-FAC approximate
+          Fisher information matrix to which this FisherBlock belongs.
+      params: The parameters (Tensor or tuple of Tensors) of this layer. If
+        kernel alone, a Tensor of shape [..spatial_filter_shape..,
+        in_channels, out_channels]. If kernel and bias, a tuple of 2 elements
+        containing the previous and a Tensor of shape [out_channels].
+      padding: str. Padding method.
+      strides: List of ints or None. Contains [..spatial_filter_strides..] if
+        'extract_patches_fn' is compatible with tf.nn.convolution(), else
+        [1, ..spatial_filter_strides, 1].
+      dilation_rate: List of ints or None. Rate for dilation along each spatial
+        dimension if 'extract_patches_fn' is compatible with
+        tf.nn.convolution(), else [1, ..spatial_dilation_rates.., 1].
+      data_format: str or None. Format of input data.
+      extract_patches_fn: str or None. Name of function that extracts image
+        patches. One of "extract_convolution_patches", "extract_image_patches",
+        "extract_pointwise_conv2d_patches".
+      num_uses: int or None. Number of uses of the layer in the model's graph.
+        Only required if the data is formatted with uses/time folded into the
+        batch dimension (instead of uses/time being a list dimension).
+        (Default: None)
+    """
+    self._padding = padding
+    self._strides = maybe_tuple(strides)
+    self._dilation_rate = maybe_tuple(dilation_rate)
+    self._data_format = data_format
+    self._extract_patches_fn = extract_patches_fn
+    self._has_bias = isinstance(params, (tuple, list))
+
+    fltr = params[0] if self._has_bias else params
+    self._filter_shape = tuple(fltr.shape.as_list())
+
+    super(ConvKFCBasicMultiIndepFB, self).__init__(
+        layer_collection=layer_collection,
+        num_uses=num_uses)
+
+  def instantiate_factors(self, grads_list, damping):
+    inputs, grads_list = self._process_data(grads_list)
+
+    # Infer number of locations upon which convolution is applied.
+    self._num_locations = num_conv_locations(inputs[0].shape.as_list(),
+                                             self._strides)
+
+    self._input_factor = self._layer_collection.make_or_get_factor(
+        fisher_factors.ConvInputKroneckerFactor,
+        (inputs, self._filter_shape, self._padding, self._strides,
+         self._dilation_rate, self._data_format, self._extract_patches_fn,
+         self._has_bias))
+    self._output_factor = self._layer_collection.make_or_get_factor(
+        fisher_factors.ConvOutputKroneckerFactor, (grads_list,))
+
+    self._setup_damping(damping, normalization=
+                        (self._num_locations * self._num_uses))
+
+  @property
+  def _renorm_coeff(self):
+    return self._num_locations * self._num_uses
+
+
+class EmbeddingKFACMultiIndepFB(InputOutputMultiTowerMultiUse,
+                                KroneckerProductFB):
+  """K-FAC FisherBlock for embedding layers used multiple times in the graph.
+
+  Similar to EmbeddingKFACFB except that this version supports multiple uses
+  of the parameter within a single model. These uses could correspond to time
+  steps in an RNN architecture, but they don't have to.
+
+  Does not support bias parameters.
+  """
+
+  def __init__(self, layer_collection, vocab_size, num_uses=None):
+    """Creates a EmbeddingKFACMultiIndepFB block.
+
+    Args:
+      layer_collection: The collection of all layers in the K-FAC approximate
+          Fisher information matrix to which this FisherBlock belongs.
+      vocab_size: int. Size of vocabulary for this embedding layer.
+      num_uses: int or None. Number of uses of the layer in the model's graph.
+        Only required if the data is formatted with time folded into the batch
+        dimension (instead of time being a list dimension). (Default: None)
+    """
+    self._vocab_size = vocab_size
+
+    super(EmbeddingKFACMultiIndepFB, self).__init__(
+        layer_collection=layer_collection,
+        num_uses=num_uses)
+
+  def instantiate_factors(self, grads_list, damping):
+    """Instantiate Kronecker Factors for this FisherBlock.
+
+    Args:
+      grads_list: List of list of list of Tensors. grads_list[i][j][k] is the
+        gradient of the loss with respect to 'outputs' from source 'i',
+        tower/mini-batch 'j', and use/time-step 'k'. Each Tensor has shape
+        [tower_minibatch_size, output_size].
+      damping: 0-D Tensor or float. 'damping' * identity is approximately added
+        to this FisherBlock's Fisher approximation.
+    """
+    inputs, grads_list = self._process_data(grads_list)
+
+    self._input_factor = self._layer_collection.make_or_get_factor(
+        fisher_factors.EmbeddingInputKroneckerFactor,
+        (inputs, self._vocab_size))
+    self._output_factor = self._layer_collection.make_or_get_factor(
+        fisher_factors.FullyConnectedMultiKF, (grads_list, self._num_uses))
+    self._setup_damping(damping, normalization=self._num_uses)
+
+  @property
+  def _renorm_coeff(self):
+    return float(self._num_uses)
 
 
 class SeriesFBApproximation(enum.IntEnum):
@@ -1181,10 +1469,12 @@ class SeriesFBApproximation(enum.IntEnum):
   option2 = 2
 
 
-class FullyConnectedSeriesFB(InputOutputMultiMinibatch, FisherBlock):
+class FullyConnectedSeriesFB(InputOutputMultiTowerMultiUse,
+                             KroneckerProductFB):
   """FisherBlock for fully-connected layers that share parameters across time.
 
-  See the following preprint for details:
+  This class implements the "Option 1" and "Option 2" approximation from the
+  following paper:
     https://openreview.net/pdf?id=HyMTkQZAb
 
   See the end of the appendix of the paper for a pseudo-code of the
@@ -1196,6 +1486,7 @@ class FullyConnectedSeriesFB(InputOutputMultiMinibatch, FisherBlock):
   def __init__(self,
                layer_collection,
                has_bias=False,
+               num_uses=None,
                option=SeriesFBApproximation.option2):
     """Constructs a new `FullyConnectedSeriesFB`.
 
@@ -1203,6 +1494,10 @@ class FullyConnectedSeriesFB(InputOutputMultiMinibatch, FisherBlock):
       layer_collection: The collection of all layers in the K-FAC approximate
         Fisher information matrix to which this FisherBlock belongs.
       has_bias: Whether the layer includes a bias parameter.
+      num_uses: int or None. Number of time-steps over which the layer
+        is used. Only required if the data is formatted with time folded into
+        the batch dimension (instead of time being a list dimension).
+        (Default: None)
       option: A `SeriesFBApproximation` specifying the simplifying assumption
         to be used in this block. `option1` approximates the cross-covariance
         over time as a symmetric matrix, while `option2` makes
@@ -1213,36 +1508,33 @@ class FullyConnectedSeriesFB(InputOutputMultiMinibatch, FisherBlock):
     self._has_bias = has_bias
     self._option = option
 
-    super(FullyConnectedSeriesFB, self).__init__(layer_collection)
+    super(FullyConnectedSeriesFB, self).__init__(
+        layer_collection=layer_collection,
+        num_uses=num_uses)
 
-  def instantiate_factors(self, grads_list, damping):
+  @property
+  def _num_timesteps(self):
+    return self._num_uses
+
+  @property
+  def _renorm_coeff(self):
+    # This should no longer be used since the multiply_X functions from the base
+    # class have been overridden
+    assert False
 
-    self._num_timesteps = len(self._inputs[0])
-    inputs, grads_list = self._package_minibatches_multi(grads_list)
+  def instantiate_factors(self, grads_list, damping):
+    inputs, grads_list = self._process_data(grads_list)
 
     self._input_factor = self._layer_collection.make_or_get_factor(
-        fisher_factors.FullyConnectedMultiKF, ((inputs,), self._has_bias))
+        fisher_factors.FullyConnectedMultiKF,
+        ((inputs,), self._num_uses, self._has_bias))
     self._input_factor.register_cov_dt1()
 
     self._output_factor = self._layer_collection.make_or_get_factor(
-        fisher_factors.FullyConnectedMultiKF, (grads_list,))
+        fisher_factors.FullyConnectedMultiKF, (grads_list, self._num_uses))
     self._output_factor.register_cov_dt1()
 
-    def compute_damping():
-      normalized_damping = normalize_damping(damping, self._num_timesteps)
-      return compute_pi_adjusted_damping(self._input_factor.get_cov(),
-                                         self._output_factor.get_cov(),
-                                         normalized_damping**0.5)
-
-    damping_id = ("compute_pi_adjusted_damping",
-                  "cov", self._input_factor.name,
-                  "cov", self._output_factor.name,
-                  "normalize_damping",
-                  damping, self._num_timesteps, "power", 0.5)
-    self._input_damping_func = _package_func(lambda: compute_damping()[0],
-                                             damping_id + ("ref", 0))
-    self._output_damping_func = _package_func(lambda: compute_damping()[1],
-                                              damping_id + ("ref", 1))
+    self._setup_damping(damping, normalization=self._num_uses)
 
   def register_matpower(self, exp):
     if exp != -1:
@@ -1275,7 +1567,7 @@ class FullyConnectedSeriesFB(InputOutputMultiMinibatch, FisherBlock):
 
     if self._option == SeriesFBApproximation.option1:
 
-      # Note that L_A = A0^(-1/2) * U_A and L_G = G0^(-1/2) * U_G.
+      # Note that \\(L_A = A0^{-1/2} * U_A and L_G = G0^{-1/2} * U_G.\\)
       L_A, psi_A = self._input_factor.get_option1quants(
           self._input_damping_func)
       L_G, psi_G = self._output_factor.get_option1quants(
@@ -1289,33 +1581,33 @@ class FullyConnectedSeriesFB(InputOutputMultiMinibatch, FisherBlock):
         T = self._num_timesteps
         return (1 - x)**2 / (T * (1 - x**2) - 2 * x * (1 - x**T))
 
-      # Y = gamma( psi_G*psi_A^T ) (computed element-wise)
+      # \\(Y = \gamma( psi_G*psi_A^T )\\) (computed element-wise)
       # Even though Y is Z-independent we are recomputing it from the psi's
       # each since Y depends on both A and G quantities, and it is relatively
       # cheap to compute.
       Y = gamma(array_ops.reshape(psi_G, [int(psi_G.shape[0]), -1]) * psi_A)
 
-      # Z = L_G^T * Z * L_A
+      # \\(Z = L_G^T * Z * L_A\\)
       # This is equivalent to the following computation from the original
       # pseudo-code:
-      # Z = G0^(-1/2) * Z * A0^(-1/2)
-      # Z = U_G^T * Z * U_A
+      # \\(Z = G0^{-1/2} * Z * A0^{-1/2}\\)
+      # \\(Z = U_G^T * Z * U_A\\)
       Z = math_ops.matmul(L_G, math_ops.matmul(Z, L_A), transpose_a=True)
 
-      # Z = Z .* Y
+      # \\(Z = Z .* Y\\)
       Z *= Y
 
-      # Z = L_G * Z * L_A^T
+      # \\(Z = L_G * Z * L_A^T\\)
       # This is equivalent to the following computation from the original
       # pseudo-code:
-      # Z = U_G * Z * U_A^T
-      # Z = G0^(-1/2) * Z * A0^(-1/2)
+      # \\(Z = U_G * Z * U_A^T\\)
+      # \\(Z = G0^{-1/2} * Z * A0^{-1/2}\\)
       Z = math_ops.matmul(L_G, math_ops.matmul(Z, L_A, transpose_b=True))
 
     elif self._option == SeriesFBApproximation.option2:
 
-      # Note that P_A = A_1^T * A_0^(-1) and P_G = G_1^T * G_0^(-1),
-      # and K_A = A_0^(-1/2) * E_A and K_G = G_0^(-1/2) * E_G.
+      # Note that \\(P_A = A_1^T * A_0^{-1} and P_G = G_1^T * G_0^{-1}\\),
+      # and \\(K_A = A_0^{-1/2} * E_A\ and\ K_G = G_0^{-1/2} * E_G.\\)
       P_A, K_A, mu_A = self._input_factor.get_option2quants(
           self._input_damping_func)
       P_G, K_G, mu_G = self._output_factor.get_option2quants(
@@ -1324,26 +1616,26 @@ class FullyConnectedSeriesFB(InputOutputMultiMinibatch, FisherBlock):
       # Our approach differs superficially from the pseudo-code in the paper
       # in order to reduce the total number of matrix-matrix multiplies.
       # In particular, the first three computations in the pseudo code are
-      # Z = G0^(-1/2) * Z * A0^(-1/2)
-      # Z = Z - hPsi_G^T * Z * hPsi_A
-      # Z = E_G^T * Z * E_A
-      # Noting that hPsi = C0^(-1/2) * C1 * C0^(-1/2), so that
-      # C0^(-1/2) * hPsi = C0^(-1) * C1 * C0^(-1/2) = P^T * C0^(-1/2)
+      # \\(Z = G0^{-1/2} * Z * A0^{-1/2}\\)
+      # \\(Z = Z - hPsi_G^T * Z * hPsi_A\\)
+      # \\(Z = E_G^T * Z * E_A\\)
+      # Noting that hPsi = C0^{-1/2} * C1 * C0^{-1/2}\\), so that
+      # \\(C0^{-1/2} * hPsi = C0^{-1} * C1 * C0^{-1/2} = P^T * C0^{-1/2}\\)
       # the entire computation can be written as
-      # Z = E_G^T * (G0^(-1/2) * Z * A0^(-1/2)
-      #     - hPsi_G^T * G0^(-1/2) * Z * A0^(-1/2) * hPsi_A) * E_A
-      #   = E_G^T * (G0^(-1/2) * Z * A0^(-1/2)
-      #     - G0^(-1/2) * P_G * Z * P_A^T * A0^(-1/2)) * E_A
-      #   = E_G^T * G0^(-1/2) * Z * A0^(-1/2) * E_A
-      #     -  E_G^T* G0^(-1/2) * P_G * Z * P_A^T * A0^(-1/2) * E_A
-      #   = K_G^T * Z * K_A  -  K_G^T * P_G * Z * P_A^T * K_A
+      # \\(Z = E_G^T * (G0^{-1/2} * Z * A0^{-1/2}\\)
+      # \\(    - hPsi_G^T * G0^{-1/2} * Z * A0^{-1/2} * hPsi_A) * E_A\\)
+      # \\(  = E_G^T * (G0^{-1/2} * Z * A0^{-1/2}\\)
+      # \\(    - G0^{-1/2} * P_G * Z * P_A^T * A0^{-1/2}) * E_A\\)
+      # \\(  = E_G^T * G0^{-1/2} * Z * A0^{-1/2} * E_A\\)
+      # \\(    -  E_G^T* G0^{-1/2} * P_G * Z * P_A^T * A0^{-1/2} * E_A\\)
+      # \\(  = K_G^T * Z * K_A  -  K_G^T * P_G * Z * P_A^T * K_A\\)
       # This final expression is computed by the following two lines:
-      # Z = Z - P_G * Z * P_A^T
+      # \\(Z = Z - P_G * Z * P_A^T\\)
       Z -= math_ops.matmul(P_G, math_ops.matmul(Z, P_A, transpose_b=True))
-      # Z = K_G^T * Z * K_A
+      # \\(Z = K_G^T * Z * K_A\\)
       Z = math_ops.matmul(K_G, math_ops.matmul(Z, K_A), transpose_a=True)
 
-      # Z = Z ./ (1*1^T - mu_G*mu_A^T)
+      # \\(Z = Z ./ (1*1^T - mu_G*mu_A^T)\\)
       # Be careful with the outer product.  We don't want to accidentally
       # make it an inner-product instead.
       tmp = 1.0 - array_ops.reshape(mu_G, [int(mu_G.shape[0]), -1]) * mu_A
@@ -1354,13 +1646,13 @@ class FullyConnectedSeriesFB(InputOutputMultiMinibatch, FisherBlock):
       # We now perform the transpose/reverse version of the operations
       # derived above, whose derivation from the original pseudo-code is
       # analgous.
-      # Z = K_G * Z * K_A^T
+      # \\(Z = K_G * Z * K_A^T\\)
       Z = math_ops.matmul(K_G, math_ops.matmul(Z, K_A, transpose_b=True))
 
-      # Z = Z - P_G^T * Z * P_A
+      # \\(Z = Z - P_G^T * Z * P_A\\)
       Z -= math_ops.matmul(P_G, math_ops.matmul(Z, P_A), transpose_a=True)
 
-      # Z = normalize (1/E[T]) * Z
+      # \\(Z = normalize (1/E[T]) * Z\\)
       # Note that this normalization is done because we compute the statistics
       # by averaging, not summing, over time. (And the gradient is presumably
       # summed over time, not averaged, and thus their scales are different.)
@@ -1372,6 +1664,3 @@ class FullyConnectedSeriesFB(InputOutputMultiMinibatch, FisherBlock):
     return utils.mat2d_to_layer_params(vector, Z)
 
     # pylint: enable=invalid-name
-
-  def tensors_to_compute_grads(self):
-    return self._outputs
diff --git a/tensorflow/contrib/kfac/python/ops/fisher_factors.py b/tensorflow/contrib/kfac/python/ops/fisher_factors.py
index 6fc163e2323666aca8489bf146ebc8582995cf06..0d40d265a1727075d0ba721b0d9a756c38269a96 100644
--- a/tensorflow/contrib/kfac/python/ops/fisher_factors.py
+++ b/tensorflow/contrib/kfac/python/ops/fisher_factors.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import abc
+import contextlib
 
 import numpy as np
 import six
@@ -37,6 +38,7 @@ from tensorflow.python.ops import variables
 from tensorflow.python.training import moving_averages
 from tensorflow.python.util import nest
 
+
 # Whether to initialize covariance estimators at a zero matrix (or the identity
 # matrix).
 INIT_COVARIANCES_AT_ZERO = False
@@ -53,16 +55,25 @@ EIGENVALUE_DECOMPOSITION_THRESHOLD = 2
 # matrix powers. Must be nonnegative.
 EIGENVALUE_CLIPPING_THRESHOLD = 0.0
 
+# TOWER_STRATEGY can be one of "concat" or "separate".  If "concat", the data
+# passed to the factors from the blocks will be concatenated across towers
+# (lazilly via PartitionedTensor objects).  Otherwise a tuple of tensors over
+# towers will be passed in, and the factors will iterate over this and do the
+# cov computations separately for each one, averaging the results together.
+TOWER_STRATEGY = "concat"
+
 
 def set_global_constants(init_covariances_at_zero=None,
                          zero_debias=None,
                          eigenvalue_decomposition_threshold=None,
-                         eigenvalue_clipping_threshold=None):
+                         eigenvalue_clipping_threshold=None,
+                         tower_strategy=None):
   """Sets various global constants used by the classes in this module."""
   global INIT_COVARIANCES_AT_ZERO
   global ZERO_DEBIAS
   global EIGENVALUE_DECOMPOSITION_THRESHOLD
   global EIGENVALUE_CLIPPING_THRESHOLD
+  global TOWER_STRATEGY
 
   if init_covariances_at_zero is not None:
     INIT_COVARIANCES_AT_ZERO = init_covariances_at_zero
@@ -72,6 +83,8 @@ def set_global_constants(init_covariances_at_zero=None,
     EIGENVALUE_DECOMPOSITION_THRESHOLD = eigenvalue_decomposition_threshold
   if eigenvalue_clipping_threshold is not None:
     EIGENVALUE_CLIPPING_THRESHOLD = eigenvalue_clipping_threshold
+  if tower_strategy is not None:
+    TOWER_STRATEGY = tower_strategy
 
 
 def inverse_initializer(shape, dtype, partition_info=None):  # pylint: disable=unused-argument
@@ -90,6 +103,15 @@ def diagonal_covariance_initializer(shape, dtype, partition_info):  # pylint: di
   return array_ops.ones(shape, dtype)
 
 
+@contextlib.contextmanager
+def place_on_device(device):
+  if device is not None and len(device):
+    with tf_ops.device(device):
+      yield
+  else:
+    yield
+
+
 def compute_cov(tensor, tensor_right=None, normalizer=None):
   """Compute the empirical second moment of the rows of a 2D Tensor.
 
@@ -256,6 +278,10 @@ class FisherFactor(object):
     """
     pass
 
+  @abc.abstractproperty
+  def _num_towers(self):
+    pass
+
   @abc.abstractproperty
   def _dtype(self):
     """dtype for variable backing this factor."""
@@ -278,12 +304,14 @@ class FisherFactor(object):
           dtype=self._dtype)
 
   @abc.abstractmethod
-  def _compute_new_cov(self, idx=0):
+  def _compute_new_cov(self, source, tower):
     """Computes minibatch-estimated covariance for a single source.
 
     Args:
-      idx: int in [0, self._num_sources). Which source to use when estimating
-        covariance.
+      source: int in [0, self._num_sources). Which source to use when computing
+        the cov update.
+      tower: int in [0, self._num_towers). Which tower to use when computing
+        the cov update.
 
     Returns:
       Tensor of same shape as self.get_cov_var().
@@ -298,15 +326,33 @@ class FisherFactor(object):
     Returns:
       An Op for updating the covariance Variable referenced by _cov.
     """
-    new_cov_contribs = tuple(self._compute_new_cov(idx)
-                             for idx in range(self._num_sources))
-    new_cov = math_ops.add_n(new_cov_contribs)
-    # Synchronize value across all TPU cores.
+    new_cov_contribs = []
+    for source in range(self._num_sources):
+      for tower in range(self._num_towers):
+        device = (self._get_data_device(tower)
+                  if TOWER_STRATEGY == "separate" else None)
+        with place_on_device(device):
+          new_cov_contribs.append(self._compute_new_cov(source, tower))
+
+    new_cov = math_ops.add_n(new_cov_contribs) / float(self._num_towers)
+
+    # Compute average of 'new_cov' across all TPU cores. On a TPU, each
+    # instance of 'new_cov' will be based on a different minibatch. This ensures
+    # that by the end of assign_moving_average(), all TPU cores see the same
+    # value for self._cov.
+    #
+    # Other implementations of make_covariance_update_op() that accumulate
+    # statistics in other variables should mimic this behavior.
     if utils.on_tpu():
       new_cov = utils.cross_replica_mean(new_cov)
+
     return moving_averages.assign_moving_average(
         self._cov, new_cov, ema_decay, zero_debias=ZERO_DEBIAS)
 
+  @abc.abstractmethod
+  def _get_data_device(self, tower):
+    pass
+
   @abc.abstractmethod
   def instantiate_inv_variables(self):
     """Makes the internal "inverse" variable(s)."""
@@ -597,17 +643,26 @@ class FullFactor(InverseProvidingFactor):
   def _num_sources(self):
     return len(self._params_grads)
 
+  @property
+  def _num_towers(self):
+    return 1
+
   @property
   def _dtype(self):
     return self._params_grads[0][0].dtype
 
-  def _compute_new_cov(self, idx=0):
+  def _compute_new_cov(self, source, tower):
+    assert tower == 0
+
     # This will be a very basic rank 1 estimate
-    params_grads_flat = utils.tensors_to_column(self._params_grads[idx])
+    params_grads_flat = utils.tensors_to_column(self._params_grads[source])
     return ((params_grads_flat * array_ops.transpose(
         params_grads_flat)) / math_ops.cast(self._batch_size,
                                             params_grads_flat.dtype))
 
+  def _get_data_device(self, tower):
+    return None
+
 
 class DiagonalFactor(FisherFactor):
   """A base class for FisherFactors that use diagonal approximations.
@@ -692,15 +747,24 @@ class NaiveDiagonalFactor(DiagonalFactor):
   def _num_sources(self):
     return len(self._params_grads)
 
+  @property
+  def _num_towers(self):
+    return 1
+
   @property
   def _dtype(self):
     return self._params_grads[0][0].dtype
 
-  def _compute_new_cov(self, idx=0):
-    params_grads_flat = utils.tensors_to_column(self._params_grads[idx])
+  def _compute_new_cov(self, source, tower):
+    assert tower == 0
+
+    params_grads_flat = utils.tensors_to_column(self._params_grads[source])
     return (math_ops.square(params_grads_flat) / math_ops.cast(
         self._batch_size, params_grads_flat.dtype))
 
+  def _get_data_device(self, tower):
+    return None
+
 
 class EmbeddingInputKroneckerFactor(DiagonalFactor):
   r"""FisherFactor for input to an embedding layer.
@@ -720,8 +784,8 @@ class EmbeddingInputKroneckerFactor(DiagonalFactor):
     """Instantiate EmbeddingInputKroneckerFactor.
 
     Args:
-      input_ids: Tensor of shape [batch_size, input_size] and dtype int32.
-        Indices into embedding matrix.
+      input_ids: List of Tensors of shape [batch_size, input_size] and dtype
+        int32. Indices into embedding matrix. List index is tower.
       vocab_size: int or 0-D Tensor. Maximum value for entries in 'input_ids'.
       dtype: dtype for covariance statistics. Must be a floating point type.
         Defaults to float32.
@@ -744,15 +808,18 @@ class EmbeddingInputKroneckerFactor(DiagonalFactor):
   def _num_sources(self):
     return 1
 
+  @property
+  def _num_towers(self):
+    return len(self._input_ids)
+
   @property
   def _dtype(self):
     return self._cov_dtype
 
-  def _compute_new_cov(self, idx=0):
-    if idx != 0:
-      raise ValueError("EmbeddingInputKroneckerFactor only supports idx = 0")
+  def _compute_new_cov(self, source, tower):
+    assert source == 0
 
-    input_ids = self._input_ids
+    input_ids = self._input_ids[tower]
 
     if len(input_ids.shape) > 2:
       raise ValueError(
@@ -782,6 +849,9 @@ class EmbeddingInputKroneckerFactor(DiagonalFactor):
 
     return new_cov
 
+  def _get_data_device(self, tower):
+    return self._input_ids[tower].device
+
 
 class FullyConnectedDiagonalFactor(DiagonalFactor):
   r"""FisherFactor for a diagonal approx of a fully-connected layer's Fisher.
@@ -801,10 +871,11 @@ class FullyConnectedDiagonalFactor(DiagonalFactor):
     """Instantiate FullyConnectedDiagonalFactor.
 
     Args:
-      inputs: Tensor of shape [batch_size, input_size]. Inputs to this layer.
+      inputs: List of Tensors of shape [batch_size, input_size]. Inputs to this
+        layer.  List index is towers.
       outputs_grads: List of Tensors, each of shape [batch_size, output_size],
         which are the gradients of the loss with respect to the layer's
-        outputs. One Tensor for each "source".
+        outputs. First index is source, second is tower.
 
       has_bias: bool. If True, append '1' to each input.
     """
@@ -818,47 +889,58 @@ class FullyConnectedDiagonalFactor(DiagonalFactor):
   @property
   def _var_scope(self):
     return "ff_diagfc_" + scope_string_from_params(
-        (self._inputs,) + tuple(self._outputs_grads))
+        tuple(self._inputs) + tuple(nest.flatten(self._outputs_grads)))
 
   @property
   def _cov_shape(self):
-    input_size = self._inputs.shape[1] + self._has_bias
-    output_size = self._outputs_grads[0].shape[1]
+    input_size = self._inputs[0].shape[1] + self._has_bias
+    output_size = self._outputs_grads[0][0].shape[1]
     return [input_size, output_size]
 
   @property
   def _num_sources(self):
     return len(self._outputs_grads)
 
+  @property
+  def _num_towers(self):
+    return len(self._inputs)
+
   @property
   def _dtype(self):
-    return self._outputs_grads[0].dtype
+    return self._outputs_grads[0][0].dtype
 
   def make_covariance_update_op(self, ema_decay):
-    inputs = self._inputs
 
-    if self._has_bias:
-      inputs = append_homog(inputs)
-    self._squared_inputs = math_ops.square(inputs)
+    self._squared_inputs = []
+    for tower in range(self._num_towers):
+      inputs = self._inputs[tower]
+
+      with place_on_device(self._get_data_device(tower)):
+        if self._has_bias:
+          inputs = append_homog(inputs)
+        self._squared_inputs.append(math_ops.square(inputs))
 
     return super(FullyConnectedDiagonalFactor, self).make_covariance_update_op(
         ema_decay)
 
-  def _compute_new_cov(self, idx=0):
-    batch_size = array_ops.shape(self._squared_inputs)[0]
-    outputs_grad = self._outputs_grads[idx]
+  def _compute_new_cov(self, source, tower):
+    batch_size = array_ops.shape(self._squared_inputs[tower])[0]
+    outputs_grad = self._outputs_grads[source][tower]
 
     # The well-known special formula that uses the fact that the entry-wise
     # square of an outer product is the outer-product of the entry-wise squares.
     # The gradient is the outer product of the input and the output gradients,
     # so we just square both and then take their outer-product.
     new_cov = math_ops.matmul(
-        self._squared_inputs,
+        self._squared_inputs[tower],
         math_ops.square(outputs_grad),
         transpose_a=True)
     new_cov /= math_ops.cast(batch_size, new_cov.dtype)
     return new_cov
 
+  def _get_data_device(self, tower):
+    return self._inputs[tower].device
+
 
 class ConvDiagonalFactor(DiagonalFactor):
   """FisherFactor for a diagonal approx of a convolutional layer's Fisher."""
@@ -875,11 +957,12 @@ class ConvDiagonalFactor(DiagonalFactor):
     """Creates a ConvDiagonalFactor object.
 
     Args:
-      inputs: Tensor of shape [batch_size, height, width, in_channels].
-        Input activations to this layer.
+      inputs: List of Tensors of shape [batch_size, height, width, in_channels].
+        Input activations to this layer.  List index is towers.
       outputs_grads: List of Tensors, each of shape [batch_size,
         height, width, out_channels], which are the gradients of the loss
-        with respect to the layer's outputs. One Tensor for each "source".
+        with respect to the layer's outputs.  First index is source, second
+        index is tower.
       filter_shape: Tuple of 4 ints: (kernel_height, kernel_width, in_channels,
         out_channels). Represents shape of kernel used in this layer.
       strides: The stride size in this layer (1-D Tensor of length 4).
@@ -897,14 +980,15 @@ class ConvDiagonalFactor(DiagonalFactor):
     """
     if not utils.is_data_format_channel_last(data_format):
       raise ValueError("Channel must be last.")
-    if inputs.shape.ndims != 4:
-      raise ValueError("inputs must be 4-D Tensor.")
-    if inputs.shape.as_list()[-1] != filter_shape[-2]:
+    if any(input_.shape.ndims != 4 for input_ in inputs):
+      raise ValueError("inputs must be a list of 4-D Tensors.")
+    if any(input_.shape.as_list()[-1] != filter_shape[-2] for input_ in inputs):
       raise ValueError("inputs and filter_shape must agree on in_channels.")
     for i, outputs_grad in enumerate(outputs_grads):
-      if outputs_grad.shape.ndims != 4:
+      if any(output_grad.shape.ndims != 4 for output_grad in outputs_grad):
         raise ValueError("outputs[%d] must be 4-D Tensor." % i)
-      if outputs_grad.shape.as_list()[-1] != filter_shape[-1]:
+      if any(output_grad.shape.as_list()[-1] != filter_shape[-1]
+             for output_grad in outputs_grad):
         raise ValueError(
             "outputs[%d] and filter_shape must agree on out_channels." % i)
     if len(strides) != 4:
@@ -927,7 +1011,7 @@ class ConvDiagonalFactor(DiagonalFactor):
   @property
   def _var_scope(self):
     return "ff_convdiag_" + scope_string_from_params(
-        (self._inputs,) + tuple(self._outputs_grads))
+        tuple(self._inputs) + tuple(nest.flatten(self._outputs_grads)))
 
   @property
   def _cov_shape(self):
@@ -941,9 +1025,13 @@ class ConvDiagonalFactor(DiagonalFactor):
   def _num_sources(self):
     return len(self._outputs_grads)
 
+  @property
+  def _num_towers(self):
+    return len(self._inputs)
+
   @property
   def _dtype(self):
-    return self._outputs_grads[0].dtype
+    return self._inputs[0].dtype
 
   def make_covariance_update_op(self, ema_decay):
     filter_height, filter_width, _, _ = self._filter_shape
@@ -954,25 +1042,30 @@ class ConvDiagonalFactor(DiagonalFactor):
       rates = (1, 1, 1, 1)
     else:
       rates = tuple(self._dilations)
-    patches = array_ops.extract_image_patches(
-        self._inputs,
-        ksizes=[1, filter_height, filter_width, 1],
-        strides=self._strides,
-        rates=rates,
-        padding=self._padding)
 
-    if self._has_bias:
-      patches = append_homog(patches)
+    self._patches = []
+    for tower in range(self._num_towers):
+      with place_on_device(self._get_data_device(tower)):
+        patches = array_ops.extract_image_patches(
+            self._inputs[tower],
+            ksizes=[1, filter_height, filter_width, 1],
+            strides=self._strides,
+            rates=rates,
+            padding=self._padding)
+
+        if self._has_bias:
+          patches = append_homog(patches)
 
-    self._patches = patches
+        self._patches.append(patches)
 
     return super(ConvDiagonalFactor, self).make_covariance_update_op(ema_decay)
 
-  def _compute_new_cov(self, idx=0):
-    batch_size = array_ops.shape(self._patches)[0]
-    outputs_grad = self._outputs_grads[idx]
+  def _compute_new_cov(self, source, tower):
+    patches = self._patches[tower]
+    batch_size = array_ops.shape(patches)[0]
+    outputs_grad = self._outputs_grads[source][tower]
 
-    new_cov = self._convdiag_sum_of_squares(self._patches, outputs_grad)
+    new_cov = self._convdiag_sum_of_squares(patches, outputs_grad)
     new_cov /= math_ops.cast(batch_size, new_cov.dtype)
 
     return new_cov
@@ -985,6 +1078,9 @@ class ConvDiagonalFactor(DiagonalFactor):
                                                   outputs_grad)
     return math_ops.reduce_sum(math_ops.square(case_wise_gradients), axis=0)
 
+  def _get_data_device(self, tower):
+    return self._inputs[tower].device
+
 
 class FullyConnectedKroneckerFactor(InverseProvidingFactor):
   """Kronecker factor for the input or output side of a fully-connected layer.
@@ -996,9 +1092,9 @@ class FullyConnectedKroneckerFactor(InverseProvidingFactor):
     """Instantiate FullyConnectedKroneckerFactor.
 
     Args:
-      tensors: List of Tensors, each of shape [batch_size, n], one for each
-      source.  The Tensors are typically either a layer's inputs or its
-      output's gradients.
+      tensors: List of list of Tensors, each of shape [batch_size, n]. The
+        Tensors are typically either a layer's inputs or its output's gradients.
+        The first list index is source, the second is tower.
       has_bias: bool. If True, append '1' to each row.
     """
     # The tensor argument is either a tensor of input activations or a tensor of
@@ -1010,27 +1106,34 @@ class FullyConnectedKroneckerFactor(InverseProvidingFactor):
   @property
   def _var_scope(self):
     return "ff_fckron_" + scope_string_from_params(
-        tuple(self._tensors) + (self._has_bias,))
+        tuple(nest.flatten(self._tensors)) + (self._has_bias,))
 
   @property
   def _cov_shape(self):
-    size = self._tensors[0].shape[1] + self._has_bias
+    size = self._tensors[0][0].shape[1] + self._has_bias
     return [size, size]
 
   @property
   def _num_sources(self):
     return len(self._tensors)
 
+  @property
+  def _num_towers(self):
+    return len(self._tensors[0])
+
   @property
   def _dtype(self):
-    return self._tensors[0].dtype
+    return self._tensors[0][0].dtype
 
-  def _compute_new_cov(self, idx=0):
-    tensor = self._tensors[idx]
+  def _compute_new_cov(self, source, tower):
+    tensor = self._tensors[source][tower]
     if self._has_bias:
       tensor = append_homog(tensor)
     return compute_cov(tensor)
 
+  def _get_data_device(self, tower):
+    return self._tensors[0][tower].device
+
 
 class ConvInputKroneckerFactor(InverseProvidingFactor):
   r"""Kronecker factor for the input side of a convolutional layer.
@@ -1054,8 +1157,8 @@ class ConvInputKroneckerFactor(InverseProvidingFactor):
     """Initializes ConvInputKroneckerFactor.
 
     Args:
-      inputs: Tensor of shape [batch_size, ..spatial_input_size.., in_channels].
-        Inputs to layer.
+      inputs: List of Tensors of shape [batch_size, ..spatial_input_size..,
+        in_channels]. Inputs to layer. List index is tower.
       filter_shape: List of ints. Contains [..spatial_filter_size..,
         in_channels, out_channels]. Shape of convolution kernel.
       padding: str. Padding method for layer. "SAME" or "VALID".
@@ -1084,10 +1187,10 @@ class ConvInputKroneckerFactor(InverseProvidingFactor):
 
   @property
   def _var_scope(self):
-    return "ff_convinkron_" + scope_string_from_params([
-        self._inputs, self._filter_shape, self._strides, self._padding,
-        self._dilation_rate, self._data_format, self._has_bias
-    ])
+    return "ff_convinkron_" + scope_string_from_params(
+        tuple(self._inputs) +
+        tuple((self._filter_shape, self._strides, self._padding,
+               self._dilation_rate, self._data_format, self._has_bias)))
 
   @property
   def _cov_shape(self):
@@ -1100,19 +1203,24 @@ class ConvInputKroneckerFactor(InverseProvidingFactor):
   def _num_sources(self):
     return 1
 
+  @property
+  def _num_towers(self):
+    return len(self._inputs)
+
   @property
   def _dtype(self):
-    return self._inputs.dtype
+    return self._inputs[0].dtype
 
-  def _compute_new_cov(self, idx=0):
-    if idx != 0:
-      raise ValueError("ConvInputKroneckerFactor only supports idx = 0")
+  def _compute_new_cov(self, source, tower):
+    assert source == 0
+
+    inputs = self._inputs[tower]
 
     # TODO(b/64144716): there is potential here for a big savings in terms of
     # memory use.
     if self._extract_patches_fn in [None, "extract_convolution_patches"]:
       patches = utils.extract_convolution_patches(
-          self._inputs,
+          inputs,
           self._filter_shape,
           padding=self._padding,
           strides=self._strides,
@@ -1120,7 +1228,7 @@ class ConvInputKroneckerFactor(InverseProvidingFactor):
           data_format=self._data_format)
 
     elif self._extract_patches_fn == "extract_image_patches":
-      assert self._inputs.shape.ndims == 4
+      assert inputs.shape.ndims == 4
       assert len(self._filter_shape) == 4
       assert len(self._strides) == 4, self._strides
       if self._dilation_rate is None:
@@ -1130,7 +1238,7 @@ class ConvInputKroneckerFactor(InverseProvidingFactor):
         assert len(rates) == 4
         assert rates[0] == rates[-1] == 1
       patches = array_ops.extract_image_patches(
-          self._inputs,
+          inputs,
           ksizes=[1] + list(self._filter_shape[0:-2]) + [1],
           strides=self._strides,
           rates=rates,
@@ -1140,7 +1248,7 @@ class ConvInputKroneckerFactor(InverseProvidingFactor):
       assert self._strides in [None, [1, 1, 1, 1], (1, 1, 1, 1)]
       assert self._filter_shape[0] == self._filter_shape[1] == 1
       patches = utils.extract_pointwise_conv2d_patches(
-          self._inputs, self._filter_shape, data_format=None)
+          inputs, self._filter_shape, data_format=None)
 
     else:
       raise NotImplementedError(self._extract_patches_fn)
@@ -1165,6 +1273,9 @@ class ConvInputKroneckerFactor(InverseProvidingFactor):
     # (Tilde omitted over A for clarity.)
     return compute_cov(patches_flat)
 
+  def _get_data_device(self, tower):
+    return self._inputs[tower].device
+
 
 class ConvOutputKroneckerFactor(InverseProvidingFactor):
   r"""Kronecker factor for the output side of a convolutional layer.
@@ -1181,9 +1292,9 @@ class ConvOutputKroneckerFactor(InverseProvidingFactor):
     """Initializes ConvOutputKroneckerFactor.
 
     Args:
-      outputs_grads: list of Tensors. Each Tensor is of shape
-          [batch_size, ..spatial_input_size.., out_channels]. One Tensor per
-          source.
+      outputs_grads: List of list of Tensors. Each Tensor is of shape
+          [batch_size, ..spatial_input_size.., out_channels].  First list index
+          is source, the second is tower.
       data_format: None or str. Format of outputs_grads.
 
     Raises:
@@ -1191,13 +1302,14 @@ class ConvOutputKroneckerFactor(InverseProvidingFactor):
     """
     if not utils.is_data_format_channel_last(data_format):
       raise ValueError("Channel must be last.")
-    self._out_channels = outputs_grads[0].shape.as_list()[-1]
+    self._out_channels = outputs_grads[0][0].shape.as_list()[-1]
     self._outputs_grads = outputs_grads
     super(ConvOutputKroneckerFactor, self).__init__()
 
   @property
   def _var_scope(self):
-    return "ff_convoutkron_" + scope_string_from_params(self._outputs_grads)
+    return "ff_convoutkron_" + scope_string_from_params(
+        nest.flatten(self._outputs_grads))
 
   @property
   def _cov_shape(self):
@@ -1208,12 +1320,16 @@ class ConvOutputKroneckerFactor(InverseProvidingFactor):
   def _num_sources(self):
     return len(self._outputs_grads)
 
+  @property
+  def _num_towers(self):
+    return len(self._outputs_grads[0])
+
   @property
   def _dtype(self):
-    return self._outputs_grads[0].dtype
+    return self._outputs_grads[0][0].dtype
 
-  def _compute_new_cov(self, idx=0):
-    outputs_grad = self._outputs_grads[idx]
+  def _compute_new_cov(self, source, tower):
+    outputs_grad = self._outputs_grads[source][tower]
 
     # reshaped_tensor below is the matrix DS_l defined in the KFC paper
     # (tilde omitted over S for clarity). It has shape M|T| x I, where
@@ -1226,28 +1342,30 @@ class ConvOutputKroneckerFactor(InverseProvidingFactor):
     # (Tilde omitted over S for clarity.)
     return compute_cov(reshaped_tensor)
 
+  def _get_data_device(self, tower):
+    return self._outputs_grads[0][tower].device
 
-class FullyConnectedMultiKF(InverseProvidingFactor):
+
+class FullyConnectedMultiKF(FullyConnectedKroneckerFactor):
   """Kronecker factor for a fully connected layer used multiple times."""
 
   def __init__(self,
-               tensor_lists,
+               tensors,
+               num_uses=None,
                has_bias=False):
     """Constructs a new `FullyConnectedMultiKF`.
 
     Args:
-      tensor_lists: 2D array (list of lists) of Tensors of shape
-        [batch_size, n]. Each of these tensors is usually a layer's inputs or
-        its output's gradients. The first dimension of the array is the source,
-        and the second is the use in the graph (which is sometimes a
-        "time-step").
+      tensors: List of list of Tensors of shape, each of shape
+        [num_uses * batch_size, n], and is a reshape version of a Tensor of
+        shape [num_uses, batch_size, n]. Each of these tensors is usually a
+        layer's inputs or its output's gradients. The first list index is
+        sources, the second is towers.
+      num_uses: int. The number of time-steps / uses.
       has_bias: bool. If True, '1' is appended to each row.
     """
 
-    self._tensor_lists = tensor_lists
-    self._has_bias = has_bias
-    self._num_timesteps = len(tensor_lists[0])
-    self._tensors = [None] * len(tensor_lists)
+    self._num_uses = num_uses
 
     self._cov_dt1 = None
     self._make_cov_dt1 = False
@@ -1256,29 +1374,38 @@ class FullyConnectedMultiKF(InverseProvidingFactor):
     self._option1quants_registrations = set()
     self._option2quants_registrations = set()
 
-    super(FullyConnectedMultiKF, self).__init__()
-
-  @property
-  def _var_scope(self):
-    return "ff_fc_multi_" + scope_string_from_params(
-        tuple(nest.flatten(self._tensor_lists)) + (self._has_bias,))
+    super(FullyConnectedMultiKF, self).__init__(tensors=tensors,
+                                                has_bias=has_bias)
 
   @property
-  def _num_sources(self):
-    return len(self._tensor_lists)
+  def _num_timesteps(self):
+    return self._num_uses
 
   @property
-  def _dtype(self):
-    return self._tensor_lists[0][0].dtype
+  def _var_scope(self):
+    return "ff_fc_multi_" + scope_string_from_params(
+        tuple(nest.flatten(self._tensors))
+        + (self._num_timesteps, self._has_bias,))
 
   def make_covariance_update_op(self, ema_decay):
 
     op = super(FullyConnectedMultiKF, self).make_covariance_update_op(ema_decay)
 
     if self._cov_dt1 is not None:
-      new_cov_dt1_contribs = tuple(self._compute_new_cov_dt1(idx)
-                                   for idx in range(self._num_sources))
-      new_cov_dt1 = math_ops.add_n(new_cov_dt1_contribs)
+      new_cov_dt1_contribs = []
+      for source in range(self._num_sources):
+        for tower in range(self._num_towers):
+          with place_on_device(self._get_data_device(tower)):
+            new_cov_dt1_contribs.append(self._compute_new_cov_dt1(source,
+                                                                  tower))
+
+      new_cov_dt1 = (math_ops.add_n(new_cov_dt1_contribs)
+                     / float(self._num_towers))
+
+      # See comments in FisherFactor.make_covariance_update_op() for details.
+      if utils.on_tpu():
+        new_cov_dt1 = utils.cross_replica_mean(new_cov_dt1)
+
       op2 = moving_averages.assign_moving_average(
           self._cov_dt1, new_cov_dt1, ema_decay, zero_debias=ZERO_DEBIAS)
 
@@ -1291,36 +1418,31 @@ class FullyConnectedMultiKF(InverseProvidingFactor):
 
     return op
 
-  def _compute_new_cov(self, idx=0):
-    # Concatenate across time/replications
-    tensor = array_ops.concat(self._tensor_lists[idx], 0)
+  def _compute_new_cov_dt1(self, source, tower):  # pylint: disable=missing-docstring
+    tensor = self._tensors[source][tower]
     if self._has_bias:
+      # This appending is technically done twice (the other time is for
+      # _compute_new_cov())
       tensor = append_homog(tensor)
-    # We save these so they can be used by _compute_new_cov_dt1
-    self._tensors[idx] = tensor
-    return compute_cov(tensor)
 
-  def _compute_new_cov_dt1(self, idx=0):  # pylint: disable=missing-docstring
-    tensor = self._tensors[idx]
-    batch_size = array_ops.shape(self._tensor_lists[idx][0])[0]
-    # Is there a more elegant way to do this computation?
+    total_len = array_ops.shape(tensor)[0]
+    batch_size = total_len // self._num_timesteps
+
     tensor_present = tensor[:-batch_size, :]
     tensor_future = tensor[batch_size:, :]
+
     # We specify a normalizer for this computation to ensure a PSD Fisher
     # block estimate.  This is equivalent to padding with zeros, as was done
     # in Section B.2 of the appendix.
-    normalizer = self._num_timesteps * batch_size
     return compute_cov(
-        tensor_future, tensor_right=tensor_present, normalizer=normalizer)
+        tensor_future, tensor_right=tensor_present, normalizer=total_len)
 
-  @property
-  def _cov_shape(self):
-    size = self._tensor_lists[0][0].shape[1] + self._has_bias
-    return [size, size]
+  def _get_data_device(self, tower):
+    return self._tensors[0][tower].device
 
   @property
   def _vec_shape(self):
-    size = self._tensor_lists[0][0].shape[1] + self._has_bias
+    size = self._tensors[0][0].shape[1] + self._has_bias
     return [size]
 
   def get_option1quants(self, damping_func):
diff --git a/tensorflow/contrib/kfac/python/ops/layer_collection.py b/tensorflow/contrib/kfac/python/ops/layer_collection.py
index 4eb5e4c092b50ff4a908a22312330c40ca93cbee..19608aca4716a08ec9f9bea35d07de3a434bbe3f 100644
--- a/tensorflow/contrib/kfac/python/ops/layer_collection.py
+++ b/tensorflow/contrib/kfac/python/ops/layer_collection.py
@@ -60,6 +60,10 @@ _CONV2D_APPROX_TO_BLOCK_TYPES = {
     APPROX_DIAGONAL_NAME: fb.ConvDiagonalFB,
 }
 
+_EMBEDDING_APPROX_TO_BLOCK_TYPES = {
+    APPROX_KRONECKER_NAME: fb.EmbeddingKFACFB
+}
+
 APPROX_KRONECKER_INDEP_NAME = "kron_indep"
 APPROX_KRONECKER_SERIES_1_NAME = "kron_series_1"
 APPROX_KRONECKER_SERIES_2_NAME = "kron_series_2"
@@ -72,6 +76,14 @@ _FULLY_CONNECTED_MULTI_APPROX_TO_BLOCK_TYPES = {
                                             option=2)
 }
 
+_CONV2D_MULTI_APPROX_TO_BLOCK_TYPES = {
+    APPROX_KRONECKER_INDEP_NAME: fb.ConvKFCBasicMultiIndepFB
+}
+
+_EMBEDDING_MULTI_APPROX_TO_BLOCK_TYPES = {
+    APPROX_KRONECKER_INDEP_NAME: fb.EmbeddingKFACMultiIndepFB
+}
+
 # Possible value for 'reuse' keyword argument. Sets 'reuse' to
 # tf.get_variable_scope().reuse.
 VARIABLE_SCOPE = "VARIABLE_SCOPE"
@@ -169,9 +181,12 @@ class LayerCollection(object):
     self._default_generic_approximation = APPROX_FULL_NAME
     self._default_embedding_approximation = APPROX_KRONECKER_NAME
     self._default_fully_connected_approximation = APPROX_KRONECKER_NAME
-    self._default_convolution_2d_approximation = APPROX_KRONECKER_NAME
+    self._default_conv2d_approximation = APPROX_KRONECKER_NAME
     self._default_fully_connected_multi_approximation = (
-        APPROX_KRONECKER_SERIES_2_NAME)
+        APPROX_KRONECKER_INDEP_NAME)
+    self._default_conv2d_multi_approximation = (
+        APPROX_KRONECKER_INDEP_NAME)
+    self._default_embedding_multi_approximation = APPROX_KRONECKER_INDEP_NAME
     self.loss_colocation_ops = {}
     self._vars_to_uses = defaultdict(lambda: 0)
 
@@ -245,14 +260,14 @@ class LayerCollection(object):
 
   @property
   def default_conv2d_approximation(self):
-    return self._default_convolution_2d_approximation
+    return self._default_conv2d_approximation
 
   def set_default_conv2d_approximation(self, value):
     if value not in _CONV2D_APPROX_TO_BLOCK_TYPES:
       raise ValueError(
           "{} is not a valid approximation for 2d convolutional layers.".format(
               value))
-    self._default_convolution_2d_approximation = value
+    self._default_conv2d_approximation = value
 
   @property
   def default_fully_connected_multi_approximation(self):
@@ -264,6 +279,14 @@ class LayerCollection(object):
                        "multi layer.".format(value))
     self._default_fully_connected_multi_approximation = value
 
+  @property
+  def default_conv2d_multi_approximation(self):
+    return self._default_conv2d_multi_approximation
+
+  @property
+  def default_embedding_multi_approximation(self):
+    return self._default_embedding_multi_approximation
+
   def register_block(self, layer_key, fisher_block, reuse=VARIABLE_SCOPE):
     """Validates and registers the layer_key associated with the fisher_block.
 
@@ -367,7 +390,7 @@ class LayerCollection(object):
       if name in self._loss_dict:
         raise KeyError(
             "Loss function named {} already exists. Set reuse=True to append "
-            "another minibatch/tower.".format(name))
+            "another tower.".format(name))
 
       loss_list = []
       self._loss_dict[name] = loss_list
@@ -526,45 +549,54 @@ class LayerCollection(object):
     else:
       return None
 
+  def _get_block_type(self, params, approx, default, approx_to_type):
+    if approx is None:
+      approx = self._get_linked_approx(params)
+      if approx is None:
+        approx = default
+
+    if approx not in approx_to_type:
+      raise ValueError("Bad value {} for approx.".format(approx))
+
+    return approx_to_type[approx], approx
+
   def register_embedding(self,
                          params,
                          inputs,
                          outputs,
                          approx=None,
                          reuse=VARIABLE_SCOPE):
-    """Registers a fully connnected layer.
+    """Registers an embedding layer.
 
     Args:
       params: Embedding matrix of shape [vocab_size, embedding_size].
       inputs: Tensor of shape [batch_size, input_size] and dtype int32. Indices
         into embedding matrix.
-      outputs: Tensor of shape [batch_size, output_size]. Outputs
+      outputs: Tensor of shape [batch_size, embedding_size]. Outputs
         produced by layer.
-      approx: str. Must be "kron".
-      reuse: bool or str.  If True, reuse an existing FisherBlock. If False,
-        create a new FisherBlock.  If "VARIABLE_SCOPE", use
-        tf.get_variable_scope().reuse.
+      approx: str or None. If not None must be "kron".  The Fisher
+        approximation to use. If None the default value is used. (Default: None)
+      reuse: bool or str.  If True, this adds 'inputs' and 'outputs' as an
+        additional mini-batch/tower of data to use when estimating the Fisher
+        block for this layer (which must have already been registered). If
+        "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.
+        (Default: "VARIABLE_SCOPE")
 
     Raises:
       ValueError: For improper value to 'approx'.
       KeyError: If reuse == True but no FisherBlock found for 'params'.
       ValueError: If reuse == True and FisherBlock found but of the wrong type.
     """
-    if approx is None:
-      approx = self._get_linked_approx(params)
-      if approx is None:
-        approx = self.default_embedding_approximation
-
-    if approx != APPROX_KRONECKER_NAME:
-      raise ValueError("Bad value {} for approx.".format(approx))
+    block_type, approx = self._get_block_type(
+        params, approx, self.default_embedding_approximation,
+        _EMBEDDING_APPROX_TO_BLOCK_TYPES)
 
     if isinstance(params, (tuple, list)):
       raise ValueError("Bias not supported.")
-
     vocab_size = int(params.shape[0])
     block = self.register_block(
-        params, fb.EmbeddingKFACFB(self, vocab_size), reuse=reuse)
-    block.register_additional_minibatch(inputs, outputs)
+        params, block_type(self, vocab_size), reuse=reuse)
+    block.register_additional_tower(inputs, outputs)
 
     self._add_uses(params, 1)
 
@@ -583,30 +615,29 @@ class LayerCollection(object):
       inputs: Tensor of shape [batch_size, input_size]. Inputs to layer.
       outputs: Tensor of shape [batch_size, output_size]. Outputs
         produced by layer.
-      approx: str. One of "kron" or "diagonal".
-      reuse: bool or str.  If True, reuse an existing FisherBlock. If False,
-        create a new FisherBlock.  If "VARIABLE_SCOPE", use
-        tf.get_variable_scope().reuse.
+      approx: str or None. If not None must be one of "kron" or "diagonal".
+        The Fisher approximation to use. If None the default value is used.
+        (Default: None)
+      reuse: bool or str.  If True, this adds 'inputs' and 'outputs' as an
+        additional mini-batch/tower of data to use when estimating the Fisher
+        block for this layer (which must have already been registered). If
+        "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.
+        (Default: "VARIABLE_SCOPE")
 
     Raises:
       ValueError: For improper value to 'approx'.
       KeyError: If reuse == True but no FisherBlock found for 'params'.
       ValueError: If reuse == True and FisherBlock found but of the wrong type.
     """
-    if approx is None:
-      approx = self._get_linked_approx(params)
-      if approx is None:
-        approx = self.default_fully_connected_approximation
 
-    if approx not in _FULLY_CONNECTED_APPROX_TO_BLOCK_TYPES:
-      raise ValueError("Bad value {} for approx.".format(approx))
+    block_type, approx = self._get_block_type(
+        params, approx, self.default_fully_connected_approximation,
+        _FULLY_CONNECTED_APPROX_TO_BLOCK_TYPES)
 
-    block_type = _FULLY_CONNECTED_APPROX_TO_BLOCK_TYPES[approx]
     has_bias = isinstance(params, (tuple, list))
-
     block = self.register_block(params, block_type(self, has_bias=has_bias),
                                 reuse=reuse)
-    block.register_additional_minibatch(inputs, outputs)
+    block.register_additional_tower(inputs, outputs)
 
     self._add_uses(params, 1)
 
@@ -635,10 +666,14 @@ class LayerCollection(object):
         Output produced by layer.
       data_format: str or None. Format of data.
       dilations: List of 4 ints. Dilations along each dimension.
-      approx: str. One of "kron" or "diagonal".
-      reuse: bool or str.  If True, reuse an existing FisherBlock. If False,
-        create a new FisherBlock.  If "VARIABLE_SCOPE", use
-        tf.get_variable_scope().reuse.
+      approx: str or None. If not None must be one of "kron" or "diagonal".
+        The Fisher approximation to use. If None the default value is used.
+        (Default: None)
+      reuse: bool or str.  If True, this adds 'inputs' and 'outputs' as an
+        additional mini-batch/tower of data to use when estimating the Fisher
+        block for this layer (which must have already been registered). If
+        "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.
+        (Default: "VARIABLE_SCOPE")
 
     Raises:
       ValueError: For improper value to 'approx'.
@@ -646,15 +681,14 @@ class LayerCollection(object):
       ValueError: If reuse == True and FisherBlock found but of the wrong type.
     """
 
-    if approx is None:
-      approx = self._get_linked_approx(params)
-      if approx is None:
-        approx = self.default_conv2d_approximation
+    block_type, approx = self._get_block_type(
+        params, approx, self.default_conv2d_approximation,
+        _CONV2D_APPROX_TO_BLOCK_TYPES)
 
-    if approx not in _CONV2D_APPROX_TO_BLOCK_TYPES:
-      raise ValueError("Bad value {} for approx.".format(approx))
-
-    block_type = _CONV2D_APPROX_TO_BLOCK_TYPES[approx]
+    # It feels bad to pass in configuration that has to do with the internal
+    # implementation.  And then we can't use the same constructor for both
+    # anymore and are thus forced to use this ugly if-statement.
+    # TODO(b/74793309): Clean this up?
     if approx == APPROX_KRONECKER_NAME:
       block = self.register_block(
           params,
@@ -680,9 +714,9 @@ class LayerCollection(object):
               data_format=data_format),
           reuse=reuse)
     else:
-      raise NotImplementedError
+      raise NotImplementedError(approx)
 
-    block.register_additional_minibatch(inputs, outputs)
+    block.register_additional_tower(inputs, outputs)
 
     self._add_uses(params, 1)
 
@@ -712,16 +746,22 @@ class LayerCollection(object):
       dilation_rate: List of ints of length len(..input_spatial_size..).
         Dilations along spatial dimension.
       data_format: str or None. Format of data.
-      approx: str. One of "kron" or "diagonal".
-      reuse: bool or str.  If True, reuse an existing FisherBlock. If False,
-        create a new FisherBlock.  If "VARIABLE_SCOPE", use
-        tf.get_variable_scope().reuse.
+      approx: str or None. If not None must be one of "kron" or "diagonal".
+        The Fisher approximation to use. If None the default value is used.
+        (Default: None)
+      reuse: bool or str.  If True, this adds 'inputs' and 'outputs' as an
+        additional mini-batch/tower of data to use when estimating the Fisher
+        block for this layer (which must have already been registered). If
+        "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.
+        (Default: "VARIABLE_SCOPE")
 
     Raises:
       ValueError: For improper value to 'approx'.
       KeyError: If reuse == True but no FisherBlock found for 'params'.
       ValueError: If reuse == True and FisherBlock found but of the wrong type.
     """
+    # TODO(b/74793309): Have this use _get_block_type like the other
+    # registration functions?
     assert approx is None or approx == APPROX_KRONECKER_NAME
 
     block = self.register_block(
@@ -734,7 +774,7 @@ class LayerCollection(object):
             dilation_rate=dilation_rate,
             data_format=data_format),
         reuse=reuse)
-    block.register_additional_minibatch(inputs, outputs)
+    block.register_additional_tower(inputs, outputs)
 
     self._add_uses(params, 1)
 
@@ -762,16 +802,21 @@ class LayerCollection(object):
       rate: None or List of ints of length 2. Dilation rates in spatial
         dimensions.
       data_format: str or None. Format of data.
-      approx: None or str. Must be "diagonal" if non-None.
-      reuse: bool or str.  If True, reuse an existing FisherBlock. If False,
-        create a new FisherBlock.  If "VARIABLE_SCOPE", use
-        tf.get_variable_scope().reuse.
+      approx: str or None. If not None must "diagonal".  The Fisher
+        approximation to use. If None the default value is used. (Default: None)
+      reuse: bool or str.  If True, this adds 'inputs' and 'outputs' as an
+        additional mini-batch/tower of data to use when estimating the Fisher
+        block for this layer (which must have already been registered). If
+        "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.
+        (Default: "VARIABLE_SCOPE")
 
     Raises:
       ValueError: For improper value to 'approx'.
       KeyError: If reuse == True but no FisherBlock found for 'params'.
       ValueError: If reuse == True and FisherBlock found but of the wrong type.
     """
+    # TODO(b/74793309): Have this use _get_block_type like the other
+    # registration functions?
     assert approx is None or approx == APPROX_DIAGONAL_NAME
     assert data_format in [None, "NHWC"]
 
@@ -785,7 +830,7 @@ class LayerCollection(object):
             rate=rate,
             data_format=data_format),
         reuse=reuse)
-    block.register_additional_minibatch(inputs, outputs)
+    block.register_additional_tower(inputs, outputs)
 
     self._add_uses(params, 1)
 
@@ -803,7 +848,7 @@ class LayerCollection(object):
                                 reuse=VARIABLE_SCOPE):
     """Register a call to tf.nn.separable_conv2d().
 
-    Note: This requires access to intermediate outputs betwee depthwise and
+    Note: This requires access to intermediate outputs between depthwise and
     pointwise convolutions.
 
     Args:
@@ -824,10 +869,14 @@ class LayerCollection(object):
       rate: None or List of ints of length 2. Dilation rate of depthwise conv2d
         kernel in spatial dimensions.
       data_format: str or None. Format of data.
-      approx: None or str. Must be "kron" if non-None.
-      reuse: bool or str.  If True, reuse an existing FisherBlock. If False,
-        create a new FisherBlock.  If "VARIABLE_SCOPE", use
-        tf.get_variable_scope().reuse.
+      approx: str or None. If not None must be one of "kron" or "diagonal".
+        The Fisher approximation to use. If None the default value is used.
+        (Default: None)
+      reuse: bool or str.  If True, this adds 'inputs' and 'outputs' as an
+        additional mini-batch/tower of data to use when estimating the Fisher
+        block for this layer (which must have already been registered). If
+        "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.
+        (Default: "VARIABLE_SCOPE")
 
     Raises:
       ValueError: For improper value to 'approx'.
@@ -864,34 +913,32 @@ class LayerCollection(object):
 
     Args:
       params: Tensor or tuple of Tensors corresponding to the parameters.
-      batch_size: 0-D Tensor. Size of the minibatch.
-      approx: str. One of "full" or "diagonal".
-      reuse: bool or str.  If True, reuse an existing FisherBlock. If False,
-        create a new FisherBlock.  If "VARIABLE_SCOPE", use
-        tf.get_variable_scope().reuse.
+      batch_size: 0-D Tensor. Size of the minibatch (for this tower).
+      approx: str or None. It not None, must be one of "full" or "diagonal".
+        The Fisher approximation to use. If None the default value is used.
+        (Default: None)
+      reuse: bool or str. If True, this adds 'batch_size' to the total
+        mini-batch size use when estimating the Fisher block for this layer
+        (which must have already been registered). If "VARIABLE_SCOPE", use
+        tf.get_variable_scope().reuse. (Default: "VARIABLE_SCOPE")
 
     Raises:
       ValueError: For improper value to 'approx'.
       KeyError: If reuse == True but no FisherBlock found for 'params'.
       ValueError: If reuse == True and FisherBlock found but of the wrong type.
     """
+    block_type, approx = self._get_block_type(
+        params, approx, self.default_generic_approximation,
+        _GENERIC_APPROX_TO_BLOCK_TYPES)
 
-    if approx is None:
-      approx = self._get_linked_approx(params)
-      if approx is None:
-        approx = self.default_generic_approximation
-
-    if approx not in _GENERIC_APPROX_TO_BLOCK_TYPES:
-      raise ValueError("Bad value {} for approx.".format(approx))
-
-    block_type = _GENERIC_APPROX_TO_BLOCK_TYPES[approx]
     block = self.register_block(params, block_type(self, params), reuse=reuse)
-    block.register_additional_minibatch(batch_size)
+    block.register_additional_tower(batch_size)
 
     self._add_uses(params, float("inf"))
 
   def register_fully_connected_multi(self, params, inputs, outputs,
-                                     approx=None, reuse=VARIABLE_SCOPE):
+                                     num_uses=None, approx=None,
+                                     reuse=VARIABLE_SCOPE):
     """Register fully connected layers with shared parameters.
 
     This can handle general fully-connected layers with shared parameters, but
@@ -902,41 +949,195 @@ class LayerCollection(object):
       params: Tensor or 2-tuple of Tensors corresponding to weight and bias of
         this layer. Weight matrix should have shape [input_size, output_size].
         Bias should have shape [output_size].
-      inputs: A list of tensors, each of shape [batch_size, input_size]. Inputs
-        to layer. In the case of RNNs, one Tensor per time step.
-      outputs: A list of tensors, the same length as 'inputs', each of shape
-        [batch_size, output_size]. Outputs produced by layer. In the case of
-        RNNs, one Tensor per time step.
-      approx: str. One of "kron_indep", "kron_series_1", or "kron_series_2".
-      reuse: bool or str.  If True, reuse an existing FisherBlock. If False,
-        create a new FisherBlock.  If "VARIABLE_SCOPE", use
-        tf.get_variable_scope().reuse.
+      inputs: A list of Tensors, each of shape [batch_size, input_size]. Inputs
+        to layer. The list indexes each use in the graph (which might
+        correspond to a "time-step" in an RNN). OR, can be single Tensor, of
+        shape [num_uses * batch_size , input_size], which is a reshaped version
+        of a Tensor of shape [num_uses, batch_size, input_size].
+      outputs: A list of Tensors, the same length as 'inputs', each of shape
+        [batch_size, output_size]. Outputs produced by layer. The list indexes
+        each use in the graph (which might correspond to a "time-step" in an
+        RNN). Needs to correspond with the order used in 'inputs'.  OR, can be
+        a single Tensor of shape [num_uses * batch_size, output_size], which is
+        a reshaped version of a Tensor of shape [num_uses, batch_size,
+        output_size].
+      num_uses: int or None. The number uses/time-steps in the graph where the
+        layer appears. Only needed if both inputs and outputs are given in the
+        single Tensor format. (Default: None)
+      approx: str or None. If not None, must be of "kron_indep", "kron_series_1"
+        or "kron_series_2". The Fisher approximation to use. If None the default
+        value is used. (Default: None)
+      reuse: bool or str.  If True, this adds inputs and outputs as an
+        additional mini-batch/tower of data to use when estimating the Fisher
+        block for this layer (which must have already been registered). If
+        "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.  (Note that the
+        word 'use' here has a completely different meaning to "use in the graph"
+        as it perturns to the 'inputs', 'outputs', and 'num_uses' arguments.)
+        (Default: "VARIABLE_SCOPE")
 
     Raises:
       ValueError: For improper value to 'approx'.
     """
-    if approx is None:
-      approx = self._get_linked_approx(params)
-      if approx is None:
-        approx = self.default_fully_connected_multi_approximation
-    has_bias = isinstance(params, (tuple, list))
+    block_type, approx = self._get_block_type(
+        params, approx, self.default_fully_connected_multi_approximation,
+        _FULLY_CONNECTED_MULTI_APPROX_TO_BLOCK_TYPES)
 
     # TODO(b/70283649): something along the lines of find_canonical_output
     # should be added back in here (and for the other block types, arguably).
 
-    if approx not in _FULLY_CONNECTED_MULTI_APPROX_TO_BLOCK_TYPES:
-      raise ValueError("Bad value {} for approx.".format(approx))
-    block_type = _FULLY_CONNECTED_MULTI_APPROX_TO_BLOCK_TYPES[approx]
-
-    block = self.register_block(params, block_type(self, has_bias=has_bias),
+    has_bias = isinstance(params, (tuple, list))
+    block = self.register_block(params, block_type(self, has_bias=has_bias,
+                                                   num_uses=num_uses),
                                 reuse=reuse)
-    block.register_additional_minibatch(inputs, outputs)
-    self._add_uses(params, len(inputs))
+    block.register_additional_tower(inputs, outputs)
+    if isinstance(inputs, (tuple, list)):
+      assert len(inputs) == len(outputs)
+      self._add_uses(params, len(inputs))
+    else:
+      self._add_uses(params, 1)
+
+  def register_conv2d_multi(self,
+                            params,
+                            strides,
+                            padding,
+                            inputs,
+                            outputs,
+                            num_uses=None,
+                            data_format=None,
+                            dilations=None,
+                            approx=None,
+                            reuse=VARIABLE_SCOPE):
+    """Registers convolutional layers with shared parameters.
+
+    Args:
+      params: Tensor or 2-tuple of Tensors corresponding to weight and bias of
+        this layer. Weight matrix should have shape [kernel_height,
+        kernel_width, in_channels, out_channels].  Bias should have shape
+        [out_channels].
+      strides: 1-D Tensor of length 4. Strides for convolution kernel.
+      padding: string. see tf.nn.conv2d for valid values.
+      inputs: A list of Tensors, each of shape [batch_size, height, width,
+        in_channels]. Inputs to layer. The list indexes each use in the graph
+        (which might correspond to a "time-step" in an RNN). OR, can be single
+        Tensor, of shape [num_uses * batch_size, height, width, in_channels],
+        which is a reshaped version of a Tensor of shape [num_uses, batch_size,
+        height, width, in_channels].
+      outputs: A list of Tensors, each of shape [batch_size, height, width,
+        out_channels]. Output produced by layer. The list indexes each use
+        in the graph (which might correspond to a "time-step" in an RNN).
+        Needs to correspond with the order used in 'inputs'.  OR, can be a
+        single Tensor, of shape [num_uses * batch_size, height, width,
+        out_channels], which is a reshaped version of a Tensor of shape
+        [num_uses, batch_size, height, width, out_channels].
+      num_uses: int or None. The number uses/time-steps in the graph where the
+        layer appears. Only needed if both inputs and outputs are given in the
+        single Tensor format. (Default: None)
+      data_format: str or None. Format of data.
+      dilations: List of 4 ints. Dilations along each dimension.
+      approx: str or None. If not None must by "kron_indep". The Fisher
+        approximation to use. If None the default value is used.
+        (Default: None)
+      reuse: bool or str.  If True, this adds inputs and outputs as an
+        additional mini-batch/tower of data to use when estimating the Fisher
+        block for this layer (which must have already been registered). If
+        "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.  (Note that the
+        word 'use' here has a completely different meaning to "use in the graph"
+        as it perturns to the 'inputs', 'outputs', and 'num_uses' arguments.)
+        (Default: "VARIABLE_SCOPE")
+
+    Raises:
+      ValueError: For improper value to 'approx'.
+      KeyError: If reuse == True but no FisherBlock found for 'params'.
+      ValueError: If reuse == True and FisherBlock found but of the wrong type.
+    """
+    block_type, approx = self._get_block_type(
+        params, approx, self.default_conv2d_multi_approximation,
+        _CONV2D_MULTI_APPROX_TO_BLOCK_TYPES)
+
+    block = self.register_block(
+        params,
+        block_type(
+            layer_collection=self,
+            params=params,
+            padding=padding,
+            strides=strides,
+            data_format=data_format,
+            dilation_rate=dilations,
+            extract_patches_fn="extract_image_patches",
+            num_uses=num_uses),
+        reuse=reuse)
+
+    block.register_additional_tower(inputs, outputs)
+    if isinstance(inputs, (tuple, list)):
+      assert len(inputs) == len(outputs)
+      self._add_uses(params, len(inputs))
+    else:
+      self._add_uses(params, 1)
 
   # TODO(b/74108452): change the loss registration functions names to refer
   # to "loss functions" instead of distributions.  Following naming convention
   # of the loss function classes themselves.
 
+  def register_embedding_multi(self,
+                               params,
+                               inputs,
+                               outputs,
+                               num_uses=None,
+                               approx=None,
+                               reuse=VARIABLE_SCOPE):
+    """Registers embedding layers with shared parameters.
+
+    Args:
+      params: Embedding matrix of shape [vocab_size, embedding_size].
+      inputs: A list of Tensors, each of shape [batch_size, input_size] and
+        dtype int32. Indices into embedding matrix. The list indexes each use
+        in the graph (which might correspond to a "time-step" in an RNN).
+        OR, can be single Tensor, of shape [num_uses*batch_size, input_size],
+        which is a reshaped version of a Tensor of shape [num_uses, batch_size,
+        input_size].
+      outputs: A list of Tensors, each of shape [batch_size, embedding_size].
+        Outputs produced by layer. The list indexes each use in the graph
+        (which might correspond to a "time-step" in an RNN). Needs to
+        correspond with the order used in 'inputs'. OR, can be a
+        single Tensor, of shape [num_uses * batch_size, embedding_size], which
+        is a reshaped version of a Tensor of shape [num_uses, batch_size,
+        embedding_size].
+      num_uses: int or None. The number uses/time-steps in the graph where the
+        layer appears. Only needed if both inputs and outputs are given in the
+        single Tensor format. (Default: None)
+      approx: str or None. If not None must by "kron_indep". The Fisher
+        approximation to use. If None the default value is used.
+        (Default: None)
+      reuse: bool or str.  If True, this adds inputs and outputs as an
+        additional mini-batch/tower of data to use when estimating the Fisher
+        block for this layer (which must have already been registered). If
+        "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.  (Note that the
+        word 'use' here has a completely different meaning to "use in the graph"
+        as it perturns to the 'inputs', 'outputs', and 'num_uses' arguments.)
+        (Default: "VARIABLE_SCOPE")
+
+    Raises:
+      ValueError: For improper value to 'approx'.
+      KeyError: If reuse == True but no FisherBlock found for 'params'.
+      ValueError: If reuse == True and FisherBlock found but of the wrong type.
+    """
+    block_type, approx = self._get_block_type(
+        params, approx, self.default_embedding_multi_approximation,
+        _EMBEDDING_MULTI_APPROX_TO_BLOCK_TYPES)
+
+    if isinstance(params, (tuple, list)):
+      raise ValueError("Bias not supported.")
+    vocab_size = int(params.shape[0])
+
+    block = self.register_block(
+        params, block_type(self, vocab_size, num_uses=num_uses), reuse=reuse)
+    block.register_additional_tower(inputs, outputs)
+
+    if isinstance(inputs, (tuple, list)):
+      self._add_uses(params, len(inputs))
+    else:
+      self._add_uses(params, 1)
+
   def register_categorical_predictive_distribution(self,
                                                    logits,
                                                    seed=None,
@@ -955,9 +1156,10 @@ class LayerCollection(object):
         (Default: None)
       name: (OPTIONAL) str or None. Unique name for this loss function. If None,
         a new name is generated. (Default: None)
-      reuse: (OPTIONAL) bool or str.  If True, reuse an existing FisherBlock.
-        If False, create a new FisherBlock.  If VARIABLE_SCOPE, use
-        tf.get_variable_scope().reuse.
+      reuse: bool or str.  If True, this adds 'logits' as an additional
+        mini-batch/tower of inputs to the loss-function/predictive distribution
+        (which must have already been registered). If "VARIABLE_SCOPE", use
+        tf.get_variable_scope().reuse. (Default: "VARIABLE_SCOPE")
     """
     loss = lf.CategoricalLogitsNegativeLogProbLoss(logits, targets=targets,
                                                    seed=seed)
@@ -988,9 +1190,10 @@ class LayerCollection(object):
         (Default: None)
       name: (OPTIONAL) str or None. Unique name for this loss function. If None,
         a new name is generated. (Default: None)
-      reuse: (OPTIONAL) bool or str.  If True, reuse an existing FisherBlock.
-        If False, create a new FisherBlock.  If VARIABLE_SCOPE, use
-        tf.get_variable_scope().reuse.
+      reuse: bool or str.  If True, this adds 'mean' and 'var' as an additional
+        mini-batch/tower of inputs to the loss-function/predictive distribution
+        (which must have already been registered). If "VARIABLE_SCOPE", use
+        tf.get_variable_scope().reuse. (Default: "VARIABLE_SCOPE")
     """
     loss = lf.NormalMeanNegativeLogProbLoss(mean, var, targets=targets,
                                             seed=seed)
@@ -1016,9 +1219,10 @@ class LayerCollection(object):
         (Default: None)
       name: (OPTIONAL) str or None. Unique name for this loss function. If None,
         a new name is generated. (Default: None)
-      reuse: (OPTIONAL) bool or str.  If True, reuse an existing FisherBlock.
-        If False, create a new FisherBlock.  If VARIABLE_SCOPE, use
-        tf.get_variable_scope().reuse.
+      reuse: bool or str.  If True, this adds 'logits' as an additional
+        mini-batch/tower of inputs to the loss-function/predictive distribution
+        (which must have already been registered). If "VARIABLE_SCOPE", use
+        tf.get_variable_scope().reuse. (Default: "VARIABLE_SCOPE")
     """
     loss = lf.MultiBernoulliNegativeLogProbLoss(logits, targets=targets,
                                                 seed=seed)
diff --git a/tensorflow/contrib/kfac/python/ops/optimizer.py b/tensorflow/contrib/kfac/python/ops/optimizer.py
index 083da768ec97aca3e63995491bb579835bb5377f..843aeef7d82df064b757ab4618f2b0ccbbec4cbe 100644
--- a/tensorflow/contrib/kfac/python/ops/optimizer.py
+++ b/tensorflow/contrib/kfac/python/ops/optimizer.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 import warnings
-
 # pylint disable=long-line
 from tensorflow.contrib.kfac.python.ops import curvature_matrix_vector_products as cmvp
 from tensorflow.contrib.kfac.python.ops import estimator as est
@@ -53,8 +52,8 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
                estimation_mode="gradients",
                colocate_gradients_with_ops=True,
                batch_size=None,
-               cov_devices=None,
-               inv_devices=None):
+               placement_strategy=None,
+               **kwargs):
     """Initializes the KFAC optimizer with the given settings.
 
     Args:
@@ -96,14 +95,11 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
           (Default: True)
       batch_size: The size of the mini-batch. Only needed when momentum_type
           == 'qmodel' or when automatic adjustment is used.  (Default: None)
-      cov_devices: Iterable of device strings (e.g. '/gpu:0'). Covariance
-          computations will be placed on these devices in a round-robin fashion.
-          Can be None, which means that no devices are specified. Only used
-          with (soon-to-be-depcrecated "convenience" properties).
-      inv_devices: Iterable of device strings (e.g. '/gpu:0'). Inversion
-          computations will be placed on these devices in a round-robin fashion.
-          Can be None, which means that no devices are specified. Only used
-          with (soon-to-be-depcrecated "convenience" properties).
+      placement_strategy: string, Device placement strategy used when creating
+        covariance variables, covariance ops, and inverse ops.
+        (Default: `None`)
+      **kwargs: Arguments to be passesd to specific placement
+        strategy mixin. Check `placement.RoundRobinPlacementMixin` for example.
 
     Raises:
       ValueError: If the momentum type is unsupported.
@@ -123,8 +119,6 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
     self._layers = layer_collection
     self._estimation_mode = estimation_mode
     self._colocate_gradients_with_ops = colocate_gradients_with_ops
-    self._cov_devices = cov_devices
-    self._inv_devices = inv_devices
 
     # The below paramaters are required only if damping needs to be adapated.
     # These parameters can be set by calling
@@ -164,16 +158,19 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
     self._momentum_type = momentum_type
     self._norm_constraint = norm_constraint
     self._batch_size = batch_size
+    self._placement_strategy = placement_strategy
 
     with variable_scope.variable_scope(name):
-      self._fisher_est = est.FisherEstimator(
-          self._variables,
-          self._cov_ema_decay,
-          self.damping,
-          self._layers,
+      self._fisher_est = est.make_fisher_estimator(
+          placement_strategy=placement_strategy,
+          variables=self._variables,
+          cov_ema_decay=self._cov_ema_decay,
+          damping=self.damping,
+          layer_collection=self._layers,
           exps=(-1,),
           estimation_mode=self._estimation_mode,
-          colocate_gradients_with_ops=self._colocate_gradients_with_ops)
+          colocate_gradients_with_ops=self._colocate_gradients_with_ops,
+          **kwargs)
 
     super(KfacOptimizer, self).__init__(learning_rate, name=name)
 
@@ -236,6 +233,21 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
       self._damping = variable_scope.get_variable(
           "damping", initializer=self._damping_constant, trainable=False)
 
+  @property
+  def variables(self):
+    return self._variables
+
+  @property
+  def damping(self):
+    if self._damping:
+      return self._damping
+    else:
+      return self._damping_constant
+
+  @property
+  def damping_adaptation_interval(self):
+    return self._damping_adaptation_interval
+
   @property
   def cov_update_thunks(self):
     self._maybe_make_and_save_everything()
@@ -266,37 +278,20 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
     self._maybe_make_and_save_everything()
     return self._inv_update_op
 
-  @property
-  def variables(self):
-    return self._variables
-
-  @property
-  def damping(self):
-    if self._damping:
-      return self._damping
-    else:
-      return self._damping_constant
-
-  @property
-  def damping_adaptation_interval(self):
-    return self._damping_adaptation_interval
-
   def _maybe_make_and_save_everything(self):
     if not self._fisher_est.made_vars():
       warnings.warn("These convenience properties will be depcrecated soon. "
                     "Please use explicit op/thunk creation methods instead "
-                    "(e.g. make_ops_and_vars_round_robin, etc).",
+                    "(e.g. make_ops_and_vars, etc).",
                     DeprecationWarning)
       (self._cov_update_ops, self._cov_update_op, self._inv_update_ops,
        self._inv_update_op, self._cov_update_thunks,
-       self._inv_update_thunks) = self.make_ops_and_vars_round_robin(
-           cov_devices=self._cov_devices,
-           inv_devices=self._inv_devices)
+       self._inv_update_thunks) = self.make_ops_and_vars()
 
   def make_ops_and_vars(self):
-    """Make ops and vars with no specific device placement.
+    """Make ops and vars with device placement `self._placement_strategy`.
 
-    See make_ops_and_vars_round_robin for details.
+    See `FisherEstimator.make_ops_and_vars` for details.
 
     Returns:
       cov_update_ops: List of ops that compute the cov updates. Corresponds
@@ -307,77 +302,11 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
       cov_update_op: cov_update_ops grouped into a single op.
       inv_update_op: inv_update_ops grouped into a single op.
     """
-    with variable_scope.variable_scope(self.get_name()):
-      return self._fisher_est.make_ops_and_vars()
-
-  def make_ops_and_vars_round_robin(self, cov_devices=None, inv_devices=None):
-    """Make ops and vars with a round-robin device placement strategy.
-
-    For each factor, all of that factor's cov variables and their associated
-    update ops will be placed on a particular device.  A new device is chosen
-    for each factor by cycling through list of devices in the cov_devices
-    argument. If cov_devices is None then no explicit device placement occurs.
-
-    An analogous strategy is followed for inverse update ops, with the list of
-    devices being given by the inv_devices argument.
+    return self._fisher_est.make_ops_and_vars(scope=self.get_name())
 
-    Inverse variables on the other hand are not placed on any specific device
-    (they will just use the current the device placement context, whatever
-    that happens to be).  The idea is that the inverse variable belong where
-    they will be accessed most often, which is the device that actually applies
-    the preconditioner to the gradient. The user will be responsible for setting
-    the device context for this.
-
-    Args:
-      cov_devices: Iterable of device strings (e.g. '/gpu:0'). Covariance
-        computations will be placed on these devices in a round-robin fashion.
-        Can be None, which means that no devices are specified.
-      inv_devices: Iterable of device strings (e.g. '/gpu:0'). Inversion
-        computations will be placed on these devices in a round-robin fashion.
-        Can be None, which means that no devices are specified.
+  def make_vars_and_create_op_thunks(self):
+    """Make vars and create op thunks.
 
-    Returns:
-      cov_update_ops: List of ops that compute the cov updates. Corresponds
-        one-to-one with the list of factors given by the "factors" property.
-      cov_update_op: cov_update_ops grouped into a single op.
-      inv_update_ops: List of ops that compute the inv updates. Corresponds
-        one-to-one with the list of factors given by the "factors" property.
-      cov_update_op: cov_update_ops grouped into a single op.
-      inv_update_op: inv_update_ops grouped into a single op.
-      cov_update_thunks: Thunks that make the ops in cov_update_ops.
-      inv_update_thunks: Thunks that make the ops in inv_update_ops.
-    """
-    with variable_scope.variable_scope(self.get_name()):
-      return self._fisher_est.make_ops_and_vars_round_robin(
-          cov_devices=cov_devices, inv_devices=inv_devices)
-
-  def make_vars_and_create_op_thunks_round_robin(self,
-                                                 cov_devices=None,
-                                                 inv_devices=None):
-    """Make vars and create op thunks w/ a round-robin device placement strat.
-
-    For each factor, all of that factor's cov variables and their associated
-    update ops will be placed on a particular device.  A new device is chosen
-    for each factor by cycling through list of devices in the cov_devices
-    argument. If cov_devices is None then no explicit device placement occurs.
-
-    An analogous strategy is followed for inverse update ops, with the list of
-    devices being given by the inv_devices argument.
-
-    Inverse variables on the other hand are not placed on any specific device
-    (they will just use the current the device placement context, whatever
-    that happens to be).  The idea is that the inverse variable belong where
-    they will be accessed most often, which is the device that actually applies
-    the preconditioner to the gradient. The user will be responsible for setting
-    the device context for this.
-
-    Args:
-      cov_devices: Iterable of device strings (e.g. '/gpu:0'). Covariance
-        computations will be placed on these devices in a round-robin fashion.
-        Can be None, which means that no devices are specified.
-      inv_devices: Iterable of device strings (e.g. '/gpu:0'). Inversion
-        computations will be placed on these devices in a round-robin fashion.
-        Can be None, which means that no devices are specified.
     Returns:
       cov_update_thunks: List of cov update thunks. Corresponds one-to-one with
         the list of factors given by the "factors" property.
@@ -385,10 +314,9 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
         the list of factors given by the "factors" property.
     """
     scope = self.get_name() + "/" + self._fisher_est.name
-    return self._fisher_est.make_vars_and_create_op_thunks_round_robin(
-        scope=scope, cov_devices=cov_devices, inv_devices=inv_devices)
+    return self._fisher_est.make_vars_and_create_op_thunks(scope=scope)
 
-  def ops_and_vars_thunks(self):
+  def create_ops_and_vars_thunks(self):
     """Create thunks that make the ops and vars on demand.
 
     This function returns 4 lists of thunks: cov_variable_thunks,
@@ -413,7 +341,7 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
       inv_update_thunks: A list of thunks that make the inv update ops.
     """
     scope = self.get_name() + "/" + self._fisher_est.name
-    return self._fisher_est.ops_and_vars_thunks(scope=scope)
+    return self._fisher_est.create_ops_and_vars_thunks(scope=scope)
 
   def minimize(self, *args, **kwargs):
     # Should this variable scope encompass everything below?  Or will the super-
@@ -462,7 +390,6 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
       An `Operation` that applies the specified gradients.
     """
     self._maybe_make_and_save_everything()
-
     # In Python 3, grads_and_vars can be a zip() object which can only be
     # iterated over once. By converting it to a list, we ensure that it can be
     # iterated over more than once.
@@ -618,7 +545,6 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
     # compute the matrix-vector products with the transposed Fisher factor
     fft_precon_grads = cmvpc.multiply_fisher_factor_transpose(precon_grads)
     fft_prev_updates = cmvpc.multiply_fisher_factor_transpose(prev_updates)
-
     batch_size = math_ops.cast(
         self._batch_size, dtype=fft_precon_grads[0].dtype)
 
@@ -802,7 +728,6 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
     # Go through variable and update its associated part of the velocity vector.
     return [_update_velocity(vec, var) for vec, var in vecs_and_vars]
 
-  # TODO(b/73448937): Move all update damping code to a separate class/function.
   def _update_damping(self, prev_batch, global_step):
     """Adapts damping parameter. Check KFAC (Section 6.5) for the details.
 
diff --git a/tensorflow/contrib/kfac/python/ops/placement.py b/tensorflow/contrib/kfac/python/ops/placement.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf12dbaa9adbaa4af1511034aef0b5ab59d53e26
--- /dev/null
+++ b/tensorflow/contrib/kfac/python/ops/placement.py
@@ -0,0 +1,167 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implements placement strategies for cov and inv ops, cov variables."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+
+from tensorflow.python.framework import ops as tf_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import variable_scope
+
+
+def _make_thunk_on_device(func, device):
+  def thunk():
+    with tf_ops.device(device):
+      return func()
+  return thunk
+
+
+class RoundRobinPlacementMixin(object):
+  """Implements round robin placement strategy for ops and variables."""
+
+  def __init__(self, cov_devices=None, inv_devices=None, *args, **kwargs):
+    """Initializes the RoundRobinPlacementMixin class.
+
+    Args:
+      cov_devices: Iterable of device strings (e.g. '/gpu:0'). Covariance
+        computations will be placed on these devices in a round-robin fashion.
+        Can be None, which means that no devices are specified.
+      inv_devices: Iterable of device strings (e.g. '/gpu:0'). Inversion
+        computations will be placed on these devices in a round-robin fashion.
+        Can be None, which means that no devices are specified.
+      *args:
+      **kwargs:
+
+    """
+    super(RoundRobinPlacementMixin, self).__init__(*args, **kwargs)
+    self._cov_devices = cov_devices
+    self._inv_devices = inv_devices
+
+  def make_ops_and_vars(self, scope=None):
+    """Make ops and vars with a round-robin device placement strategy.
+
+    For each factor, all of that factor's cov variables and their associated
+    update ops will be placed on a particular device.  A new device is chosen
+    for each factor by cycling through list of devices in the
+    `self._cov_devices` attribute. If `self._cov_devices` is `None` then no
+    explicit device placement occurs.
+
+    An analogous strategy is followed for inverse update ops, with the list of
+    devices being given by the `self._inv_devices` attribute.
+
+    Inverse variables on the other hand are not placed on any specific device
+    (they will just use the current the device placement context, whatever
+    that happens to be).  The idea is that the inverse variable belong where
+    they will be accessed most often, which is the device that actually applies
+    the preconditioner to the gradient. The user will be responsible for setting
+    the device context for this.
+
+    Args:
+      scope: A string or None.  If None it will be set to the name of this
+        estimator (given by the name property). All variables will be created,
+        and all ops will execute, inside of a variable scope of the given
+        name. (Default: None)
+
+    Returns:
+      cov_update_ops: List of ops that compute the cov updates. Corresponds
+        one-to-one with the list of factors given by the "factors" property.
+      cov_update_op: cov_update_ops grouped into a single op.
+      inv_update_ops: List of ops that compute the inv updates. Corresponds
+        one-to-one with the list of factors given by the "factors" property.
+      inv_update_op: inv_update_ops grouped into a single op.
+      cov_update_thunks: Thunks that make the ops in cov_update_ops.
+      inv_update_thunks: Thunks that make the ops in inv_update_ops.
+    """
+    (cov_update_thunks,
+     inv_update_thunks) = self.make_vars_and_create_op_thunks(scope=scope)
+    cov_update_ops = [thunk() for thunk in cov_update_thunks]
+    inv_update_ops = [thunk() for thunk in inv_update_thunks]
+
+    scope = self.name if scope is None else scope
+    with variable_scope.variable_scope(scope):
+      cov_update_op = control_flow_ops.group(cov_update_ops,
+                                             name="cov_update_op")
+      inv_update_op = control_flow_ops.group(inv_update_ops,
+                                             name="inv_update_op")
+
+    return (cov_update_ops, cov_update_op, inv_update_ops, inv_update_op,
+            cov_update_thunks, inv_update_thunks)
+
+  def make_vars_and_create_op_thunks(self, scope=None):
+    """Make vars and create op thunks w/ a round-robin device placement strat.
+
+    For each factor, all of that factor's cov variables and their associated
+    update ops will be placed on a particular device.  A new device is chosen
+    for each factor by cycling through list of devices in the
+    `self._cov_devices` attribute. If `self._cov_devices` is `Non`e then no
+    explicit device placement occurs.
+
+    An analogous strategy is followed for inverse update ops, with the list of
+    devices being given by the `self._inv_devices` attribute.
+
+    Inverse variables on the other hand are not placed on any specific device
+    (they will just use the current the device placement context, whatever
+    that happens to be).  The idea is that the inverse variable belong where
+    they will be accessed most often, which is the device that actually applies
+    the preconditioner to the gradient. The user will be responsible for setting
+    the device context for this.
+
+    Args:
+      scope: A string or None.  If None it will be set to the name of this
+        estimator (given by the name property). All variables will be created,
+        and all thunks will execute, inside of a variable scope of the given
+        name. (Default: None)
+
+    Returns:
+      cov_update_thunks: List of cov update thunks. Corresponds one-to-one with
+        the list of factors given by the "factors" property.
+      inv_update_thunks: List of inv update thunks. Corresponds one-to-one with
+        the list of factors given by the "factors" property.
+    """
+    # Note: `create_ops_and_vars_thunks` is implemented in `FisherEstimator`.
+    (cov_variable_thunks_raw, cov_update_thunks_raw, inv_variable_thunks_raw,
+     inv_update_thunks_raw) = self.create_ops_and_vars_thunks(scope=scope)
+
+    if self._cov_devices:
+      cov_update_thunks = []
+      for cov_variable_thunk, cov_update_thunk, device in zip(
+          cov_variable_thunks_raw, cov_update_thunks_raw,
+          itertools.cycle(self._cov_devices)):
+        with tf_ops.device(device):
+          cov_variable_thunk()
+        cov_update_thunks.append(_make_thunk_on_device(cov_update_thunk,
+                                                       device))
+    else:
+      for cov_variable_thunk in cov_variable_thunks_raw:
+        cov_variable_thunk()
+      cov_update_thunks = cov_update_thunks_raw
+
+    for inv_variable_thunk in inv_variable_thunks_raw:
+      inv_variable_thunk()
+
+    if self._inv_devices:
+      inv_update_thunks = []
+      for inv_update_thunk, device in zip(inv_update_thunks_raw,
+                                          itertools.cycle(self._inv_devices)):
+        inv_update_thunks.append(_make_thunk_on_device(inv_update_thunk,
+                                                       device))
+    else:
+      inv_update_thunks = inv_update_thunks_raw
+
+    return cov_update_thunks, inv_update_thunks
diff --git a/tensorflow/contrib/kfac/python/ops/utils.py b/tensorflow/contrib/kfac/python/ops/utils.py
index af26f5e56bf9bb22cc9bc2b409209d027477ed89..b6f42815e79fa5eb9c6a2aa9f99ac3ec5a70ad0a 100644
--- a/tensorflow/contrib/kfac/python/ops/utils.py
+++ b/tensorflow/contrib/kfac/python/ops/utils.py
@@ -649,9 +649,6 @@ class PartitionedTensor(object):
   def dtype(self):
     return self.tensors[0].dtype
 
-  def devices(self):
-    return set(tensor.device for tensor in self.tensors)
-
   def __str__(self):
     return "PartitionedTensor([%s, ...], dtype=%s, shape=%s)" % (
         self.tensors[0].name, self.dtype.name, tuple(self.shape.as_list()))
@@ -659,6 +656,17 @@ class PartitionedTensor(object):
   def __hash__(self):
     return hash(tuple(self.tensors))
 
+  def __eq__(self, other):
+    if not isinstance(other, PartitionedTensor):
+      return False
+    return self.tensors == other.tensors
+
+  def __ne__(self, other):
+    return not self == other  # pylint: disable=g-comparison-negation
+
+  def __getitem__(self, key):
+    return self.as_tensor()[key]
+
   def as_tensor(self, dtype=None, name=None, as_ref=False):
     with ops.name_scope(name, "PartitionedTensor.as_tensor", self.tensors):
       assert not as_ref
@@ -670,6 +678,15 @@ class PartitionedTensor(object):
         self._concats[result.device] = result
       return self._concats[result.device]
 
+  @property
+  def device(self):
+    # PartitionedTensors in general do not live on a single device.  If the
+    # device cannot be determined unambiguously this property will return None.
+    device = self.tensors[0].device
+    if all(tensor.device == device for tensor in self.tensors):
+      return device
+    return None
+
 
 ops.register_tensor_conversion_function(
     PartitionedTensor,
diff --git a/tensorflow/contrib/labeled_tensor/BUILD b/tensorflow/contrib/labeled_tensor/BUILD
index 544065dac6a10094a376c18e84521b1a26401cdd..c8812d4b23f94102d093db878a709b090a3318d6 100644
--- a/tensorflow/contrib/labeled_tensor/BUILD
+++ b/tensorflow/contrib/labeled_tensor/BUILD
@@ -214,14 +214,3 @@ py_test(
         "//tensorflow/python:math_ops",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/layers/BUILD b/tensorflow/contrib/layers/BUILD
index cc7bbabf210ded9a31eb789fa8b94e8bde62ea43..d5b3b279a1b7327602790c0260349cb0c758aa86 100644
--- a/tensorflow/contrib/layers/BUILD
+++ b/tensorflow/contrib/layers/BUILD
@@ -392,15 +392,3 @@ py_test(
         "//tensorflow/python:variables",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/layers/__init__.py b/tensorflow/contrib/layers/__init__.py
index 337c9e06b870b2cca53fcdbf3d94225660e193c4..00f03a111ae8be7f49761ef5fb5a82810bcca182 100644
--- a/tensorflow/contrib/layers/__init__.py
+++ b/tensorflow/contrib/layers/__init__.py
@@ -104,6 +104,7 @@ See the @{$python/contrib.layers} guide.
 @@infer_real_valued_columns
 @@sequence_input_from_feature_columns
 
+@@group_norm
 @@instance_norm
 """
 
@@ -122,6 +123,7 @@ _allowed_symbols = ['bias_add',
                     'conv3d',
                     'elu',
                     'feature_column',
+                    'group_norm',
                     'instance_norm',
                     'legacy_fully_connected',
                     'legacy_linear',
diff --git a/tensorflow/contrib/layers/kernels/BUILD b/tensorflow/contrib/layers/kernels/BUILD
index e407a9ce015603094c7bbab72856403e2f0eb1a1..7aae09ff3e9995b2d92b05211b3bf8a94a26ff43 100644
--- a/tensorflow/contrib/layers/kernels/BUILD
+++ b/tensorflow/contrib/layers/kernels/BUILD
@@ -18,14 +18,3 @@ cc_library(
     ],
     alwayslink = 1,
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index 350bcb3bca11b4cad18ce863ab1496076477aa3c..10d7f6d076b4b4c6578d7adcffc4e9cc44d77ac6 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -3045,16 +3045,16 @@ def legacy_fully_connected(x,
   `activation_fn` is `None`, the result of `y = w * x + b` is
   returned.
 
-  If `x` has shape [\\\(\\text{dim}_0, \\text{dim}_1, ..., \\text{dim}_n\\\)]
-  with more than 2 dimensions (\\\(n > 1\\\)), then we repeat the matrix
+  If `x` has shape [\\(\text{dim}_0, \text{dim}_1, ..., \text{dim}_n\\)]
+  with more than 2 dimensions (\\(n > 1\\)), then we repeat the matrix
   multiply along the first dimensions. The result r is a tensor of shape
-  [\\\(\\text{dim}_0, ..., \\text{dim}_{n-1},\\\) `num_output_units`],
-  where \\\( r_{i_0, ..., i_{n-1}, k} =
-  \\sum_{0 \\leq j < \\text{dim}_n} x_{i_0, ... i_{n-1}, j} \cdot w_{j, k}\\\).
+  [\\(\text{dim}_0, ..., \text{dim}_{n-1},\\) `num_output_units`],
+  where \\( r_{i_0, ..., i_{n-1}, k} =
+  \sum_{0 \leq j < \text{dim}_n} x_{i_0, ... i_{n-1}, j} \cdot w_{j, k}\\).
   This is accomplished by reshaping `x` to 2-D
-  [\\\(\\text{dim}_0 \\cdot ... \\cdot \\text{dim}_{n-1}, \\text{dim}_n\\\)]
+  [\\(\text{dim}_0 \cdot ... \cdot \text{dim}_{n-1}, \text{dim}_n\\)]
   before the matrix multiply and afterwards reshaping it to
-  [\\\(\\text{dim}_0, ..., \\text{dim}_{n-1},\\\) `num_output_units`].
+  [\\(\text{dim}_0, ..., \text{dim}_{n-1},\\) `num_output_units`].
 
   This op creates `w` and optionally `b`. Bias (`b`) can be disabled by setting
   `bias_init` to `None`.
diff --git a/tensorflow/contrib/layers/python/layers/normalization.py b/tensorflow/contrib/layers/python/layers/normalization.py
index e7d4080ff769327cc74b6629a7705ddfa552169b..c807ab0f2e5c8ac3ec2ae1d84a5b36b5f4ba76a4 100644
--- a/tensorflow/contrib/layers/python/layers/normalization.py
+++ b/tensorflow/contrib/layers/python/layers/normalization.py
@@ -24,11 +24,13 @@ from tensorflow.contrib.layers.python.layers import utils
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import variable_scope
 
 
 __all__ = [
+    'group_norm',
     'instance_norm',
 ]
 
@@ -158,3 +160,196 @@ def instance_norm(inputs,
     if activation_fn is not None:
       outputs = activation_fn(outputs)
     return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
+
+
+@add_arg_scope
+def group_norm(inputs,
+               groups=32,
+               channels_axis=-1,
+               reduction_axes=(-3, -2),
+               center=True,
+               scale=True,
+               epsilon=1e-6,
+               activation_fn=None,
+               param_initializers=None,
+               reuse=None,
+               variables_collections=None,
+               outputs_collections=None,
+               trainable=True,
+               scope=None):
+  """Functional interface for the group normalization layer.
+
+  Reference: https://arxiv.org/abs/1803.08494.
+
+    "Group Normalization", Yuxin Wu, Kaiming He
+
+  Args:
+    inputs: A Tensor with at least 2 dimensions one which is channels. All
+     shape dimensions must be fully defined.
+    groups: Integer. Divide the channels into this number of groups over which
+      normalization statistics are computed. This number must be commensurate
+      with the number of channels in `inputs`.
+    channels_axis: An integer. Specifies index of channels axis which will be
+      broken into `groups`, each of which whose statistics will be computed
+      across. Must be mutually exclusive with `reduction_axes`. Preferred usage
+      is to specify negative integers to be agnostic as to whether a batch
+      dimension is included.
+    reduction_axes: Tuple of integers. Specifies dimensions over which
+       statistics will be accumulated. Must be mutually exclusive with
+       `channels_axis`. Statistics will not be accumulated across axes not
+       specified in `reduction_axes` nor `channel_axis`. Preferred usage is to
+       specify negative integers to be agnostic to whether a batch dimension is
+       included.
+
+      Some sample usage cases:
+        NHWC format: channels_axis=-1, reduction_axes=[-3, -2]
+        NCHW format: channels_axis=-3, reduction_axes=[-2, -1]
+
+    center: If True, add offset of `beta` to normalized tensor. If False, `beta`
+      is ignored.
+    scale: If True, multiply by `gamma`. If False, `gamma` is
+      not used. When the next layer is linear (also e.g. `nn.relu`), this can be
+      disabled since the scaling can be done by the next layer.
+    epsilon: Small float added to variance to avoid dividing by zero.
+    activation_fn: Activation function, default set to None to skip it and
+      maintain a linear activation.
+    param_initializers: Optional initializers for beta, gamma, moving mean and
+      moving variance.
+    reuse: Whether or not the layer and its variables should be reused. To be
+      able to reuse the layer scope must be given.
+    variables_collections: Optional collections for the variables.
+    outputs_collections: Collections to add the outputs.
+    trainable: If `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    scope: Optional scope for `variable_scope`.
+
+  Returns:
+    A `Tensor` representing the output of the operation.
+
+  Raises:
+    ValueError: If the rank of `inputs` is undefined.
+    ValueError: If rank or channels dimension of `inputs` is undefined.
+    ValueError: If number of groups is not commensurate with number of channels.
+    ValueError: If reduction_axes or channels_axis are out of bounds.
+    ValueError: If reduction_axes are not mutually exclusive with channels_axis.
+  """
+  # TODO(shlens): Support partially defined shapes for the inputs.
+  inputs = ops.convert_to_tensor(inputs)
+  original_shape = inputs.shape
+
+  if inputs.shape.ndims is None:
+    raise ValueError('Inputs %s has undefined rank.' % inputs.name)
+  if channels_axis > (inputs.shape.ndims - 1):
+    raise ValueError('Axis is out of bounds.')
+
+  # Standardize the channels_axis to be positive and identify # of channels.
+  if channels_axis < 0:
+    channels_axis = inputs.shape.ndims + channels_axis
+  channels = inputs.shape[channels_axis].value
+
+  if channels is None:
+    raise ValueError('Inputs %s has undefined channel dimension: %d.' % (
+        inputs.name, channels_axis))
+
+  # Standardize the reduction_axes to be positive.
+  reduction_axes = list(reduction_axes)
+  for i in range(len(reduction_axes)):
+    if reduction_axes[i] < 0:
+      reduction_axes[i] += inputs.shape.ndims
+
+  for a in reduction_axes:
+    if a > inputs.shape.ndims:
+      raise ValueError('Axis is out of bounds.')
+    if inputs.shape[a].value is None:
+      raise ValueError('Inputs %s has undefined dimensions %d.' % (
+          inputs.name, a))
+    if channels_axis == a:
+      raise ValueError('reduction_axis must be mutually exclusive '
+                       'with channels_axis')
+  if groups > channels:
+    raise ValueError('Invalid groups %d for %d channels.' % (groups, channels))
+  if channels % groups != 0:
+    raise ValueError('%d channels is not commensurate with %d groups.' %
+                     (channels, groups))
+
+  # Determine axes before channels. Some examples of common image formats:
+  #  'NCHW': before = [N], after = [HW]
+  #  'NHWC': before = [NHW], after = []
+  axes_before_channels = inputs.shape.as_list()[:channels_axis]
+  axes_after_channels = inputs.shape.as_list()[channels_axis+1:]
+
+  # Manually broadcast the parameters to conform to the number of groups.
+  params_shape_broadcast = ([1] * len(axes_before_channels) +
+                            [groups, channels // groups] +
+                            [1] * len(axes_after_channels))
+
+  # Reshape the input by the group within the channel dimension.
+  inputs_shape = (axes_before_channels + [groups, channels // groups] +
+                  axes_after_channels)
+  inputs = array_ops.reshape(inputs, inputs_shape)
+
+  # Determine the dimensions across which moments are calculated.
+  moments_axes = [channels_axis + 1]
+  for a in reduction_axes:
+    if a > channels_axis:
+      moments_axes.append(a + 1)
+    else:
+      moments_axes.append(a)
+
+  with variable_scope.variable_scope(
+      scope, 'GroupNorm', [inputs], reuse=reuse) as sc:
+    # Note that the params_shape is the number of channels always.
+    params_shape = [channels]
+
+    # Allocate parameters for the beta and gamma of the normalization.
+    beta, gamma = None, None
+    dtype = inputs.dtype.base_dtype
+    if param_initializers is None:
+      param_initializers = {}
+    if center:
+      beta_collections = utils.get_variable_collections(
+          variables_collections, 'beta')
+      beta_initializer = param_initializers.get(
+          'beta', init_ops.zeros_initializer())
+      beta = variables.model_variable('beta',
+                                      shape=params_shape,
+                                      dtype=dtype,
+                                      initializer=beta_initializer,
+                                      collections=beta_collections,
+                                      trainable=trainable)
+      beta = array_ops.reshape(beta, params_shape_broadcast)
+
+    if scale:
+      gamma_collections = utils.get_variable_collections(
+          variables_collections, 'gamma')
+      gamma_initializer = param_initializers.get(
+          'gamma', init_ops.ones_initializer())
+      gamma = variables.model_variable('gamma',
+                                       shape=params_shape,
+                                       dtype=dtype,
+                                       initializer=gamma_initializer,
+                                       collections=gamma_collections,
+                                       trainable=trainable)
+      gamma = array_ops.reshape(gamma, params_shape_broadcast)
+
+    # Calculate the moments.
+    mean, variance = nn.moments(inputs, moments_axes, keep_dims=True)
+
+    # Compute normalization.
+    # TODO(shlens): Fix nn.batch_normalization to handle the 5-D Tensor
+    # appropriately so that this operation may be faster.
+    gain = math_ops.rsqrt(variance + epsilon)
+    offset = -mean * gain
+    if gamma is not None:
+      gain *= gamma
+      offset *= gamma
+    if beta is not None:
+      offset += beta
+    outputs = inputs * gain + offset
+
+    # Collapse the groups into the channel dimension.
+    outputs = array_ops.reshape(outputs, original_shape)
+
+    if activation_fn is not None:
+      outputs = activation_fn(outputs)
+    return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
diff --git a/tensorflow/contrib/layers/python/layers/normalization_test.py b/tensorflow/contrib/layers/python/layers/normalization_test.py
index 5cff1bf0ebb2fe8bc6933de882ecd47a9edf0f94..b6e96350db92baf4770683273be7e5dde73dbcec 100644
--- a/tensorflow/contrib/layers/python/layers/normalization_test.py
+++ b/tensorflow/contrib/layers/python/layers/normalization_test.py
@@ -166,5 +166,231 @@ class InstanceNormTest(test.TestCase):
   def testOutputBigInput5DNCHW(self):
     self.doOutputTest((1, 100, 100, 1, 1), 'NCHW', tol=1e-3)
 
+
+class GroupNormTest(test.TestCase):
+
+  def testInvalidGroupSize(self):
+    inputs = array_ops.placeholder(dtypes.float32, shape=(5, 2, 10, 10))
+    with self.assertRaisesRegexp(ValueError,
+                                 'Invalid groups 10 for 2 channels.'):
+      normalization.group_norm(inputs, groups=10,
+                               reduction_axes=[-2, -1], channels_axis=-3)
+
+  def testBadCommensurateGroup(self):
+    inputs = array_ops.placeholder(dtypes.float32, shape=(5, 4, 10, 10))
+    with self.assertRaisesRegexp(ValueError,
+                                 '4 channels is not commensurate with '
+                                 '3 groups.'):
+      normalization.group_norm(inputs, groups=3,
+                               reduction_axes=[-2, -1], channels_axis=-3)
+
+  def testAxisIsBad(self):
+    inputs = array_ops.placeholder(dtypes.float32, shape=(1, 2, 4, 5))
+    with self.assertRaisesRegexp(ValueError,
+                                 'Axis is out of bounds.'):
+      normalization.group_norm(inputs, channels_axis=5)
+    with self.assertRaisesRegexp(ValueError,
+                                 'Axis is out of bounds.'):
+      normalization.group_norm(inputs, reduction_axes=[1, 5])
+
+  def testNotMutuallyExclusiveAxis(self):
+    inputs = array_ops.placeholder(dtypes.float32, shape=(10, 32, 32, 32))
+    # Specify axis with negative values.
+    with self.assertRaisesRegexp(ValueError, 'mutually exclusive'):
+      normalization.group_norm(inputs, channels_axis=-2, reduction_axes=[-2])
+    # Specify axis with positive values.
+    with self.assertRaisesRegexp(ValueError, 'mutually exclusive'):
+      normalization.group_norm(inputs, channels_axis=1, reduction_axes=[1, 3])
+    # Specify axis with mixed positive and negative values.
+    with self.assertRaisesRegexp(ValueError, 'mutually exclusive'):
+      normalization.group_norm(inputs, channels_axis=-2, reduction_axes=[2])
+
+  def testUnknownShape(self):
+    inputs = array_ops.placeholder(dtypes.float32)
+    with self.assertRaisesRegexp(ValueError, 'undefined rank'):
+      normalization.group_norm(inputs)
+
+  def testParamsShapeNotFullyDefinedReductionAxes(self):
+    inputs = array_ops.placeholder(dtypes.float32, shape=(1, 32, None, 4))
+    with self.assertRaisesRegexp(ValueError, 'undefined dimensions'):
+      normalization.group_norm(inputs)
+
+  def testParamsShapeNotFullyDefinedChannelsAxis(self):
+    inputs = array_ops.placeholder(dtypes.float32, shape=(1, 3, 4, None))
+    with self.assertRaisesRegexp(ValueError, 'undefined channel dimension'):
+      normalization.group_norm(inputs, channels_axis=-1,
+                               reduction_axes=[-3, -2])
+
+  def testCreateOp(self):
+    height, width, groups = 3, 3, 4
+    images = random_ops.random_uniform((5, height, width, 2*groups), seed=1)
+    output = normalization.group_norm(images, groups=groups, channels_axis=-1,
+                                      reduction_axes=[-3, -2])
+    print('name: ', output.op.name)
+    self.assertListEqual([5, height, width, 2*groups], output.shape.as_list())
+
+  def testCreateOpFloat64(self):
+    height, width, groups = 3, 3, 5
+    images = random_ops.random_uniform(
+        (5, height, width, 4*groups), dtype=dtypes.float64, seed=1)
+    output = normalization.group_norm(images, groups=groups)
+    self.assertEqual(dtypes.float64, output.dtype)
+    self.assertListEqual([5, height, width, 4*groups], output.shape.as_list())
+
+  def testCreateOpNoScaleCenter(self):
+    height, width, groups = 3, 3, 7
+    images = random_ops.random_uniform(
+        (5, height, width, 3*groups), dtype=dtypes.float32, seed=1)
+    output = normalization.group_norm(images, groups=groups, center=False,
+                                      scale=False)
+    self.assertListEqual([5, height, width, 3*groups], output.shape.as_list())
+    self.assertEqual(0, len(contrib_variables.get_variables_by_name('beta')))
+    self.assertEqual(0, len(contrib_variables.get_variables_by_name('gamma')))
+
+  def testCreateVariables_NHWC(self):
+    height, width = 3, 3
+    images = random_ops.random_uniform((5, height, width, 8), seed=1)
+    normalization.group_norm(images, groups=4,
+                             channels_axis=-1, reduction_axes=(-3, -2),
+                             center=True, scale=True)
+    beta = contrib_variables.get_variables_by_name('beta')[0]
+    gamma = contrib_variables.get_variables_by_name('gamma')[0]
+    self.assertEqual('GroupNorm/beta', beta.op.name)
+    self.assertEqual('GroupNorm/gamma', gamma.op.name)
+
+  def testCreateVariables_NCHW(self):
+    height, width, groups = 3, 3, 4
+    images = random_ops.random_uniform((5, 2*groups, height, width), seed=1)
+    normalization.group_norm(images, groups=4,
+                             channels_axis=-3, reduction_axes=(-2, -1),
+                             center=True, scale=True)
+    beta = contrib_variables.get_variables_by_name('beta')[0]
+    gamma = contrib_variables.get_variables_by_name('gamma')[0]
+    self.assertEqual('GroupNorm/beta', beta.op.name)
+    self.assertEqual('GroupNorm/gamma', gamma.op.name)
+
+  def testReuseVariables(self):
+    height, width = 3, 3
+    images = random_ops.random_uniform((5, height, width, 4), seed=1)
+    normalization.group_norm(images, groups=2, scale=True, scope='IN')
+    normalization.group_norm(images, groups=2, scale=True, scope='IN',
+                             reuse=True)
+    beta = contrib_variables.get_variables_by_name('beta')
+    gamma = contrib_variables.get_variables_by_name('gamma')
+    self.assertEqual(1, len(beta))
+    self.assertEqual(1, len(gamma))
+
+  def testValueCorrectWithReuseVars(self):
+    height, width = 3, 3
+    image_shape = (10, height, width, 4)
+    images = random_ops.random_uniform(image_shape, seed=1)
+    output_train = normalization.group_norm(images, groups=2, scope='IN')
+    output_eval = normalization.group_norm(images, groups=2, scope='IN',
+                                           reuse=True)
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      # output_train and output_eval should be the same.
+      train_np, eval_np = sess.run([output_train, output_eval])
+      self.assertAllClose(train_np, eval_np)
+
+  def doOutputTest(self, input_shape, channels_axis=None, reduction_axes=None,
+                   groups=2, tol=1e-2):
+    # Select the axis for the channel and the dimensions along which statistics
+    # are accumulated.
+    if channels_axis < 0:
+      channels_axis += len(input_shape)
+    reduced_axes = [channels_axis + 1]
+    for a in reduction_axes:
+      if a < 0:
+        a += len(input_shape)
+      if a < channels_axis:
+        reduced_axes.append(a)
+      else:
+        reduced_axes.append(a+1)
+    reduced_axes = tuple(reduced_axes)
+
+    # Calculate the final shape for the output Tensor.
+    axes_before_channels = input_shape[:channels_axis]
+    axes_after_channels = input_shape[channels_axis+1:]
+    channels = input_shape[channels_axis]
+    outputs_shape = (axes_before_channels + [groups, channels // groups] +
+                     axes_after_channels)
+
+    # Calculate the final shape for the output statistics.
+    reduced_shape = []
+    for i, a in enumerate(outputs_shape):
+      if i not in reduced_axes:
+        reduced_shape.append(a)
+
+    for mu in (0.0, 1e2):
+      for sigma in (1.0, 0.1):
+        # Determine shape of Tensor after normalization.
+        expected_mean = np.zeros(reduced_shape)
+        expected_var = np.ones(reduced_shape)
+
+        inputs = random_ops.random_uniform(input_shape, seed=0) * sigma + mu
+        output_op = normalization.group_norm(
+            inputs, groups=groups, center=False, scale=False,
+            channels_axis=channels_axis,
+            reduction_axes=reduction_axes)
+        with self.test_session() as sess:
+          sess.run(variables.global_variables_initializer())
+          outputs = sess.run(output_op)
+          # Make sure that there are no NaNs
+          self.assertFalse(np.isnan(outputs).any())
+
+          outputs = np.reshape(outputs, outputs_shape)
+          mean = np.mean(outputs, axis=reduced_axes)
+          var = np.var(outputs, axis=reduced_axes)
+          # The mean and variance of each example should be close to 0 and 1
+          # respectively.
+          self.assertAllClose(expected_mean, mean, rtol=tol, atol=tol)
+          self.assertAllClose(expected_var, var, rtol=tol, atol=tol)
+
+  def testOutputSmallInput4D_NHWC(self):
+    input_shape = [10, 10, 10, 30]
+    # Specify axes with positive values.
+    self.doOutputTest(input_shape, channels_axis=3, reduction_axes=[1, 2])
+    # Specify axes with negative values.
+    self.doOutputTest(input_shape, channels_axis=-1, reduction_axes=[-3, -2])
+
+  def testOutputSmallInput3D_NHWC(self):
+    input_shape = [10, 10, 30]
+    # Specify axes with positive values.
+    self.doOutputTest(input_shape, channels_axis=2, reduction_axes=[0, 1])
+    # Specify axes with negative values.
+    self.doOutputTest(input_shape, channels_axis=-1, reduction_axes=[-3, -2])
+
+  def testOutputSmallInput4D_NCHW(self):
+    input_shape = [10, 10, 10, 30]
+    # Specify axes with positive values.
+    self.doOutputTest(input_shape, channels_axis=1, reduction_axes=[2, 3])
+    # Specify axes with negative values.
+    self.doOutputTest(input_shape, channels_axis=-3, reduction_axes=[-2, -1])
+
+  def testOutputSmallInput3D_NCHW(self):
+    input_shape = [10, 10, 30]
+    # Specify axes with positive values.
+    self.doOutputTest(input_shape, channels_axis=0, reduction_axes=[1, 2])
+    # Specify axes with negative values.
+    self.doOutputTest(input_shape, channels_axis=-3, reduction_axes=[-2, -1])
+
+  def testOutputBigInput4D_NHWC(self):
+    self.doOutputTest([5, 100, 100, 1], channels_axis=3, reduction_axes=[1, 2],
+                      groups=1)
+
+  def testOutputBigInput4D_NCHW(self):
+    self.doOutputTest([1, 100, 100, 4], channels_axis=1, reduction_axes=[2, 3],
+                      groups=4)
+
+  def testOutputSmallInput2D_NC(self):
+    self.doOutputTest([10, 7*100], channels_axis=1, reduction_axes=[], groups=7)
+
+  def testOutputSmallInput5D_NCXXX(self):
+    self.doOutputTest([10, 10, 20, 40, 5],
+                      channels_axis=1,
+                      reduction_axes=[2, 3, 4],
+                      groups=5)
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/layers/python/layers/rev_block_lib.py b/tensorflow/contrib/layers/python/layers/rev_block_lib.py
index 123275e1fde047cd3772528641b2e3b09742fbdc..e49589ddf627aa456496cebb2d0fc72fcdad710f 100644
--- a/tensorflow/contrib/layers/python/layers/rev_block_lib.py
+++ b/tensorflow/contrib/layers/python/layers/rev_block_lib.py
@@ -29,14 +29,17 @@ from __future__ import print_function
 import functools
 import re
 
+import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.contrib.framework.python import ops as contrib_framework_ops
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops as framework_ops
 from tensorflow.python.layers import base
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope
@@ -46,6 +49,7 @@ from tensorflow.python.util import nest
 __all__ = ["rev_block", "RevBlock", "recompute_grad"]
 
 LAYER_RE = re.compile(".*revlayer_([0-9]*)/([fg])/.*")
+_USE_DEFAULT = "__rev_block_lib_default"
 
 
 def _acc_grads(*lists_of_grads):
@@ -219,7 +223,13 @@ class RevBlock(base.Layer):
 
   def _efficient_grad_fn(self, inputs, variables, ys, grad_ys):
     """Custom gradient fn for a block of reversible residual layers."""
+    # Inputs have passed through an Identity. Recover the original Tensors to
+    # be able to match up side inputs.
+    assert [u"Identity"] == list(set([x.op.type for x in inputs]))
+    inputs = [x.op.inputs[0] for x in inputs]
     side_inputs = inputs[2:]
+    del inputs
+
     f_side_idxs = [None] * len(self.f_side_input)
     g_side_idxs = [None] * len(self.g_side_input)
     assert len(side_inputs) == len(self.f_side_input) + len(self.g_side_input)
@@ -405,12 +415,36 @@ def rev_block(x1,
   return block.forward(x1, x2)
 
 
-def recompute_grad(fn):
+def enable_with_args(dec):
+  """A decorator for decorators to enable their usage with or without args."""
+
+  @functools.wraps(dec)
+  def new_dec(*args, **kwargs):
+    if len(args) == 1 and not kwargs and callable(args[0]):
+      # Used as decorator without args
+      fn = args[0]
+      return dec(fn)
+    else:
+      return lambda fn: dec(fn, *args, **kwargs)
+
+  return new_dec
+
+
+@enable_with_args
+def recompute_grad(fn, use_data_dep=_USE_DEFAULT, tupleize_grads=False):
   """Decorator that recomputes the function on the backwards pass.
 
   Args:
     fn: a function that takes Tensors (all as positional arguments) and returns
       a tuple of Tensors.
+    use_data_dep: `bool`, if `True` will use a dummy data dependency to force
+      the recompute to happen. If `False` will use a control dependency. By
+      default will be `True` if in an XLA context and `False` otherwise. XLA
+      ignores control dependencies and so this data dependency is necessary.
+    tupleize_grads: `bool`, if `True` will use control dependencies to ensure
+      that all gradients are produced before any are consumed by downstream ops.
+      If `use_data_dep` is also `True`, will use a data dependency instead of
+      a control dependency.
 
   Returns:
     A wrapped fn that is identical to fn when called, but its activations will
@@ -420,13 +454,25 @@ def recompute_grad(fn):
 
   @functools.wraps(fn)
   def wrapped(*args):
-    return _recompute_grad(fn, args)
+    return _recompute_grad(
+        fn, args, use_data_dep=use_data_dep, tupleize_grads=tupleize_grads)
 
   return wrapped
 
 
-def _recompute_grad(fn, args):
+def _is_on_tpu():
+  ctxt = framework_ops.get_default_graph()._get_control_flow_context()  # pylint: disable=protected-access
+  return control_flow_util.GetContainingXLAContext(ctxt) is not None
+
+
+def _recompute_grad(fn, args, use_data_dep=_USE_DEFAULT, tupleize_grads=False):
   """See recompute_grad."""
+  for arg in args:
+    if not isinstance(arg, framework_ops.Tensor):
+      raise ValueError("All inputs to function must be Tensors")
+  use_data_dep_ = use_data_dep
+  if use_data_dep_ == _USE_DEFAULT:
+    use_data_dep_ = _is_on_tpu()
 
   cached_vs = []
   cached_arg_scope = []
@@ -436,6 +482,8 @@ def _recompute_grad(fn, args):
     del outputs
     # Recompute outputs
     with framework_ops.control_dependencies(output_grads):
+      if use_data_dep_:
+        inputs = _force_data_dependency(output_grads, inputs)
       with contrib_framework_ops.arg_scope(cached_arg_scope[0]):
         with variable_scope.variable_scope(cached_vs[0], reuse=True):
           outputs = fn(*inputs)
@@ -444,6 +492,13 @@ def _recompute_grad(fn, args):
       outputs = [outputs]
     outputs = list(outputs)
     grads = gradients_impl.gradients(outputs, inputs + variables, output_grads)
+
+    if tupleize_grads:
+      if use_data_dep_:
+        grads = _tuple_with_data_dep(grads)
+      else:
+        grads = control_flow_ops.tuple(grads)
+
     grad_inputs = grads[:len(inputs)]
     grad_vars = grads[len(inputs):]
     return grad_inputs, grad_vars
@@ -532,7 +587,7 @@ def _fn_with_custom_grad_internal(fn, inputs, grad_fn, use_global_vars=False):
   get_vars_fn = (
       vs.global_variables if use_global_vars else vs.trainable_variables)
   len_before_vars = len(get_vars_fn())
-  inputs = list(inputs)
+  inputs = [array_ops.identity(x) for x in inputs]
   outputs = fn(*inputs)
   train_vars = get_vars_fn()[len_before_vars:]
 
@@ -581,3 +636,48 @@ def _fn_with_custom_grad_internal(fn, inputs, grad_fn, use_global_vars=False):
   flat_inputs = nest.flatten(defun_inputs)
   id_out = identity(*flat_inputs)
   return id_out
+
+
+def _force_data_dependency(first_compute, then_compute):
+  """Force all of `then_compute` to depend on all of `first_compute`.
+
+  Uses a dummy data dependency, which is useful when running on TPUs because
+  XLA ignores control dependencies. Only supports float arguments.
+
+  Args:
+    first_compute: `list<Tensor>`. These will be made to run before the
+      `Tensor`s `then_compute`.
+    then_compute: `list<Tensor>`. These will run after all the `Tensor`s in
+      `first_compute`.
+
+  Returns:
+    `list<Tensor>`, same length as `then_compute`.
+
+  Raises:
+    ValueError: if ranks are unknown or types are not floating.
+  """
+
+  def _first_element(x):
+    if x.get_shape().ndims is None:
+      raise ValueError("Rank of Tensor %s must be known" % x)
+    ndims = x.get_shape().ndims
+    begin = framework_ops.convert_to_tensor([0] * ndims, dtype=dtypes.int32)
+    size = framework_ops.convert_to_tensor([1] * ndims, dtype=dtypes.int32)
+    return array_ops.reshape(array_ops.slice(x, begin, size), [])
+
+  first_compute_sum = math_ops.add_n(
+      [_first_element(x) for x in first_compute if x is not None])
+  dtype = first_compute_sum.dtype
+  if not dtype.is_floating:
+    raise ValueError("_force_data_dependency only supports floating dtypes.")
+  epsilon = np.finfo(dtype.as_numpy_dtype).tiny
+  zero = array_ops.stop_gradient(epsilon * first_compute_sum)
+
+  return [
+      array_ops.identity(x) + zero if x is not None else None
+      for x in then_compute
+  ]
+
+
+def _tuple_with_data_dep(tensors):
+  return _force_data_dependency(tensors, tensors)
diff --git a/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py b/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
index cbcbcd75114a522b95631e4e7e95c1641b0a9987..d1ad4e8c98de3e5c5ac212d55cc93707ba9c01cc 100644
--- a/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
+++ b/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
@@ -154,7 +154,7 @@ class RevBlockTest(test.TestCase):
       y_val, yd_val, gd_val, g_val = sess.run([y, y_rev, grads_rev, grads])
       self.assertAllClose(y_val, yd_val)
       for g1, g2 in zip(gd_val, g_val):
-        self.assertAllClose(g1, g2)
+        self.assertAllClose(g1, g2, rtol=1e-5)
 
   def testRevBlock(self):
     self._testRevBlock()
@@ -255,25 +255,54 @@ class RecomputeTest(test.TestCase):
     def fn_recompute(x):
       return fn(x)
 
+    @rev_block_lib.recompute_grad(use_data_dep=True)
+    def fn_use_data_dep(x):
+      return fn(x)
+
+    @rev_block_lib.recompute_grad(tupleize_grads=True)
+    def fn_tupleize(x):
+      return fn(x)
+
+    @rev_block_lib.recompute_grad(use_data_dep=True, tupleize_grads=True)
+    def fn_both(x):
+      return fn(x)
+
     x = random_ops.random_uniform((3, 1, 3))
-    recompute_vars = None
-    with variable_scope.variable_scope("recompute") as vs:
-      out1 = math_ops.reduce_sum(fn_recompute(x))
-      recompute_vars = vs.trainable_variables()
-    reg_vars = None
-    with variable_scope.variable_scope("regular") as vs:
-      out2 = math_ops.reduce_sum(fn(x))
-      reg_vars = vs.trainable_variables()
-
-    grad1 = gradients_impl.gradients(out1, recompute_vars)
-    grad2 = gradients_impl.gradients(out2, reg_vars)
+
+    names_and_fns = [
+        ("recompute", fn_recompute),
+        ("regular", fn),
+        ("use_data_dep", fn_use_data_dep),
+        ("tupleize", fn_tupleize),
+        ("tuple_and_data_dep", fn_both),
+    ]
+    outputs_and_vars = []
+    for name, wrapped_fn in names_and_fns:
+      with variable_scope.variable_scope(name) as vs:
+        out = math_ops.reduce_sum(wrapped_fn(x))
+        outputs_and_vars.append((out, vs.trainable_variables()))
+
+    all_grads = []
+    for out, scope_vars in outputs_and_vars:
+      all_grads.append(gradients_impl.gradients(out, scope_vars))
 
     with self.test_session() as sess:
       sess.run(variables.global_variables_initializer())
-      outs = sess.run([out1, out2, grad1, grad2])
-      self.assertAllClose(outs[0], outs[1])
-      for g1, g2 in zip(outs[2], outs[3]):
-        self.assertAllClose(g1, g2)
+      outputs = list(zip(*outputs_and_vars))[0]
+      outs, all_grads_val = sess.run([outputs, all_grads])
+
+      # All outputs are the same
+      current = outs[0]
+      for out in outs[1:]:
+        self.assertAllClose(current, out)
+        current = out
+
+      # All gradients are the same
+      for grads in zip(all_grads_val):
+        current = grads[0]
+        for g in grads[1:]:
+          self.assertAllClose(current, g)
+          current = g
 
 
 class FnWithCustomGradTest(test.TestCase):
diff --git a/tensorflow/contrib/learn/BUILD b/tensorflow/contrib/learn/BUILD
index b05f5eeaeee8fb927970b608f65495f33d63f764..d665fc9335cf22cdfa1e7330ab67003042502515 100644
--- a/tensorflow/contrib/learn/BUILD
+++ b/tensorflow/contrib/learn/BUILD
@@ -229,6 +229,7 @@ py_test(
     size = "small",
     srcs = ["python/learn/monitors_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_pip_gpu"],  # b/74437598
     deps = [
         ":learn",
         "//tensorflow/contrib/framework:framework_py",
@@ -878,15 +879,3 @@ py_binary(
         "//tensorflow/python:platform",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/learn/python/learn/datasets/BUILD b/tensorflow/contrib/learn/python/learn/datasets/BUILD
index 8bf372841d04dc9e1339925474801d5aa3af4ccd..2c7215bba3816ff3762e5b7927f650d1c9cbf617 100644
--- a/tensorflow/contrib/learn/python/learn/datasets/BUILD
+++ b/tensorflow/contrib/learn/python/learn/datasets/BUILD
@@ -44,18 +44,6 @@ py_binary(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 py_test(
     name = "base_test",
     size = "small",
diff --git a/tensorflow/contrib/learn/python/learn/datasets/base.py b/tensorflow/contrib/learn/python/learn/datasets/base.py
index 3b5c9b97c08a388e1f35249967b6cab26861f100..4676eedb206147d178c6a652aa7c2cb48ef888c0 100644
--- a/tensorflow/contrib/learn/python/learn/datasets/base.py
+++ b/tensorflow/contrib/learn/python/learn/datasets/base.py
@@ -139,15 +139,48 @@ def retry(initial_delay,
 
   Args:
     initial_delay: the initial delay.
+    max_delay: the maximum delay allowed (actual max is
+        max_delay * (1 + jitter).
     factor: each subsequent retry, the delay is multiplied by this value.
         (must be >= 1).
     jitter: to avoid lockstep, the returned delay is multiplied by a random
         number between (1-jitter) and (1+jitter). To add a 20% jitter, set
         jitter = 0.2. Must be < 1.
+    is_retriable: (optional) a function that takes an Exception as an argument
+        and returns true if retry should be applied.
+
+  Returns:
+    A function that wraps another function to automatically retry it.
+  """
+  return _internal_retry(
+      initial_delay=initial_delay,
+      max_delay=max_delay,
+      factor=factor,
+      jitter=jitter,
+      is_retriable=is_retriable)
+
+
+def _internal_retry(initial_delay,
+                    max_delay,
+                    factor=2.0,
+                    jitter=0.25,
+                    is_retriable=None):
+  """Simple decorator for wrapping retriable functions, for internal use only.
+
+  Args:
+    initial_delay: the initial delay.
     max_delay: the maximum delay allowed (actual max is
         max_delay * (1 + jitter).
+    factor: each subsequent retry, the delay is multiplied by this value.
+        (must be >= 1).
+    jitter: to avoid lockstep, the returned delay is multiplied by a random
+        number between (1-jitter) and (1+jitter). To add a 20% jitter, set
+        jitter = 0.2. Must be < 1.
     is_retriable: (optional) a function that takes an Exception as an argument
         and returns true if retry should be applied.
+
+  Returns:
+    A function that wraps another function to automatically retry it.
   """
   if factor < 1:
     raise ValueError('factor must be >= 1; was %f' % (factor,))
@@ -195,7 +228,7 @@ def _is_retriable(e):
 
 
 @deprecated(None, 'Please use urllib or similar directly.')
-@retry(initial_delay=1.0, max_delay=16.0, is_retriable=_is_retriable)
+@_internal_retry(initial_delay=1.0, max_delay=16.0, is_retriable=_is_retriable)
 def urlretrieve_with_retry(url, filename=None):
   return urllib.request.urlretrieve(url, filename)
 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/linear.py b/tensorflow/contrib/learn/python/learn/estimators/linear.py
index 64d7ecc68e7abb1d36a3eb098fedd8184d6e9d77..70b70af98c51dcb991c19152607272673953ee2a 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/linear.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/linear.py
@@ -243,8 +243,8 @@ def sdca_model_fn(features, labels, mode, params):
 
   parent_scope = "linear"
 
-  with variable_scope.variable_op_scope(
-      features.values(), parent_scope) as scope:
+  with variable_scope.variable_scope(
+      values=features.values(), name_or_scope=parent_scope) as scope:
     features = features.copy()
     features.update(layers.transform_features(features, feature_columns))
     logits, columns_to_variables, bias = (
diff --git a/tensorflow/contrib/learn/python/learn/estimators/run_config.py b/tensorflow/contrib/learn/python/learn/estimators/run_config.py
index 1d161093de01ef838d0c75ec9a39574c7529bd57..8c85c431be69caaca6872111896b9487faf9e679 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/run_config.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/run_config.py
@@ -290,8 +290,15 @@ class RunConfig(ClusterConfig, core_run_config.RunConfig):
         Note - using this argument, it is easy to provide settings which break
         otherwise perfectly good models. Use with care.
     """
-    super(RunConfig, self).__init__(
-        master=master, evaluation_master=evaluation_master)
+    # Neither parent class calls super().__init__(), so here we have to
+    # manually call their __init__() methods.
+    ClusterConfig.__init__(
+        self, master=master, evaluation_master=evaluation_master)
+    # For too long this code didn't call:
+    #   core_run_config.RunConfig.__init__(self)
+    # so instead of breaking compatibility with that assumption, we
+    # just manually initialize this field:
+    self._train_distribute = None
 
     gpu_options = config_pb2.GPUOptions(
         per_process_gpu_memory_fraction=gpu_memory_fraction)
diff --git a/tensorflow/contrib/legacy_seq2seq/BUILD b/tensorflow/contrib/legacy_seq2seq/BUILD
index 1fa55132b1fc0cd3367ca2eb331b6870edc30c3b..8c2c4fd29c0502d4199f27a65e4827b2db973c3d 100644
--- a/tensorflow/contrib/legacy_seq2seq/BUILD
+++ b/tensorflow/contrib/legacy_seq2seq/BUILD
@@ -60,15 +60,3 @@ cuda_py_tests(
     ],
     tags = ["noasan"],  # times out b/63678675
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/libsvm/BUILD b/tensorflow/contrib/libsvm/BUILD
index df96402a4ffd51840f77d58d8066487030362340..4dccb9be7cd2e603edcf10c020cc0ee1675f518a 100644
--- a/tensorflow/contrib/libsvm/BUILD
+++ b/tensorflow/contrib/libsvm/BUILD
@@ -88,15 +88,3 @@ tf_py_test(
         "//tensorflow/python:platform_test",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/linalg/BUILD b/tensorflow/contrib/linalg/BUILD
index d4f2e7063184d962f4654cf8df4ab966c1941139..a7812f74d1e69276a4bba597b41e442bc4dbbc4a 100644
--- a/tensorflow/contrib/linalg/BUILD
+++ b/tensorflow/contrib/linalg/BUILD
@@ -58,16 +58,6 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
+    shard_count = 4,
+    tags = ["noasan"],
 )
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_block_diag.py b/tensorflow/contrib/linalg/python/ops/linear_operator_block_diag.py
index 5d7a99664d38eca035bd5a86710050bce4b22c1e..9d3af66c92b59dd030d4b2a829ab733eec6cf0c1 100644
--- a/tensorflow/contrib/linalg/python/ops/linear_operator_block_diag.py
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator_block_diag.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops.linalg import linear_operator
 from tensorflow.python.ops.linalg import linear_operator_util
 
@@ -137,8 +138,7 @@ class LinearOperatorBlockDiag(linear_operator.LinearOperator):
         meaning the quadratic form `x^H A x` has positive real part for all
         nonzero `x`.  Note that we do not require the operator to be
         self-adjoint to be positive-definite.  See:
-        https://en.wikipedia.org/wiki/Positive-definite_matrix\
-            #Extension_for_non_symmetric_matrices
+        https://en.wikipedia.org/wiki/Positive-definite_matrix#Extension_for_non-symmetric_matrices
       is_square:  Expect that this operator acts like square [batch] matrices.
         This is true by default, and will raise a `ValueError` otherwise.
       name: A name for this `LinearOperator`.  Default is the individual
@@ -333,6 +333,18 @@ class LinearOperatorBlockDiag(linear_operator.LinearOperator):
     mat.set_shape(self.shape)
     return mat
 
+  def _assert_non_singular(self):
+    return control_flow_ops.group([
+        operator.assert_non_singular() for operator in self.operators])
+
+  def _assert_self_adjoint(self):
+    return control_flow_ops.group([
+        operator.assert_self_adjoint() for operator in self.operators])
+
+  def _assert_positive_definite(self):
+    return control_flow_ops.group([
+        operator.assert_positive_definite() for operator in self.operators])
+
   def _split_input_into_blocks(self, x, axis=-1):
     """Split `x` into blocks matching `operators`'s `domain_dimension`.
 
diff --git a/tensorflow/contrib/linear_optimizer/BUILD b/tensorflow/contrib/linear_optimizer/BUILD
index cea3627ed565f0de86d8d9bb6b45c4b19c5b5558..5b89c6cef9fa9fdef7c26ddee1efa03f3056d881 100644
--- a/tensorflow/contrib/linear_optimizer/BUILD
+++ b/tensorflow/contrib/linear_optimizer/BUILD
@@ -138,14 +138,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/linear_optimizer/python/sdca_estimator.py b/tensorflow/contrib/linear_optimizer/python/sdca_estimator.py
index 05794a42c5f2d0eece6adab36fb5610078cece31..d4e54c82f988e0adcd16aad29702ee9f8b16aea3 100644
--- a/tensorflow/contrib/linear_optimizer/python/sdca_estimator.py
+++ b/tensorflow/contrib/linear_optimizer/python/sdca_estimator.py
@@ -140,8 +140,8 @@ def sdca_model_fn(features, labels, mode, params, config=None):
 
   parent_scope = "linear"
 
-  with variable_scope.variable_op_scope(features.values(),
-                                        parent_scope) as scope:
+  with variable_scope.variable_scope(
+      values=features.values(), name_or_scope=parent_scope) as scope:
     features = features.copy()
     features.update(layers.transform_features(features, feature_columns))
     logits, columns_to_variables, bias = (
diff --git a/tensorflow/contrib/lite/BUILD b/tensorflow/contrib/lite/BUILD
index 5cfbb544b73991195a7bba9528ee9550104f3d78..9c4533079c72f5ed68c6f45582fb1cecaa3a3679 100644
--- a/tensorflow/contrib/lite/BUILD
+++ b/tensorflow/contrib/lite/BUILD
@@ -89,6 +89,7 @@ cc_library(
     hdrs = [
         "builtin_op_data.h",
     ],
+    deps = [":context"],
 )
 
 cc_library(
@@ -133,10 +134,10 @@ cc_library(
         ":schema_fbs_version",
         ":simple_memory_arena",
         ":util",
+        "//tensorflow/contrib/lite/kernels:eigen_support",
         "//tensorflow/contrib/lite/kernels:gemm_support",
         "//tensorflow/contrib/lite/nnapi:nnapi_lib",
         "//tensorflow/contrib/lite/schema:schema_fbs",
-        "//tensorflow/core:lib_platform",
     ],
 )
 
@@ -170,6 +171,7 @@ cc_test(
     deps = [
         ":framework",
         ":string_util",
+        "//tensorflow/contrib/lite/kernels:kernel_util",
         "//tensorflow/contrib/lite/kernels/internal:tensor_utils",
         "//tensorflow/contrib/lite/schema:schema_fbs",
         "//tensorflow/contrib/lite/testing:util",
@@ -270,18 +272,3 @@ cc_test(
 #        ],
 #    }),
 #)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "downloads",
-            "examples",
-            "gen",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/README.md b/tensorflow/contrib/lite/README.md
index 5194f015b5b84189e3a8caf5fb0bc0204deb7bb2..a676b705f143b393c7e5bfa9e40d23f9adb68dcc 100644
--- a/tensorflow/contrib/lite/README.md
+++ b/tensorflow/contrib/lite/README.md
@@ -1,235 +1,8 @@
 # TensorFlow Lite
-TensorFlow Lite is TensorFlow's lightweight solution for mobile and embedded devices. It enables low-latency inference of on-device machine learning models with a small binary size and fast performance supporting hardware acceleration.
 
-TensorFlow Lite uses many techniques for achieving low latency like optimizing the kernels for specific mobile apps, pre-fused activations, quantized kernels that allow smaller and faster (fixed-point math) models, and in the future, leverage specialized machine learning hardware to get the best possible performance for a particular model on a particular device.
+TensorFlow Lite is TensorFlow's lightweight solution for mobile and embedded
+devices. It enables low-latency inference of on-device machine learning models
+with a small binary size and fast performance supporting hardware acceleration.
 
-![image](g3doc/TFLite-Architecture.jpg)
-# Getting Started with an Android Demo App
-
-This section contains an example application using TensorFlow Lite for Android devices. The demo is a sample camera app that classifies images continuously using either a quantized Mobilenet model or a floating point Inception-v3 model. A device running Android 5.0 ( API 21) or higher is required to run the demo.
-
-There are 3 ways to get the demo app to your device
- - Download the prebuilt binary or
- - Use Android Studio to build the application or
- - Download the source code for TensorFlow Lite and the demo and build it using bazel
-
-## Description
-In the demo app, inference is done using the TensorFlow Lite Java API. The demo app classifies frames in real-time, displaying the top most probable classifications. It also displays the time taken to detect the object.
-
-## Downloading the pre-built binary
-The fastest path to trying the demo, is to download the pre-built binary
-[TfLiteCameraDemo.apk](https://storage.googleapis.com/download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk)
-
-Once the apk is installed, click the app icon to start the app. The first-time the app is opened, the app asks for runtime permissions to access the device camera. The demo app opens the back-camera of the device and recognizes the objects in the camera's field of view. At the bottom of the image (or at the left of the image if the device is in landscape mode), it shows the latency of classification and the top three objects classified.
-
-## Building in Android Studio using TensorFlow Lite AAR from JCenter
-The simplest way to compile the demo app, and try out changes to the project code is to use AndroidStudio.
-
- - Install the latest version of Android Studio 3 as specified [here](https://developer.android.com/studio/index.html).
- - Make sure the Android SDK version is greater than 26 and NDK version is greater than 14 (in the Android Studio Settings).
- - Import the `tensorflow/contrib/lite/java/demo` directory as a new Android Studio project.
- - Click through installing all the Gradle extensions it requests.
- - Either
-     - Download the quantized Mobilenet TensorFlow Lite model from [here](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip)
-         - unzip and copy mobilenet_quant_v1_224.tflite to the assets directory:
-           `tensorflow/contrib/lite/java/demo/app/src/main/assets/`
-     - Or download the floating point Inception-v3 model from [here](https://storage.googleapis.com/download.tensorflow.org/models/tflite/inception_v3_slim_2016_android_2017_11_10.zip)
-         - unzip and copy inceptionv3_non_slim_2015.tflite to the assets directory
-         - change the chosen classifier in [Camera2BasicFragment.java](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java) from
-         `classifier = new ImageClassifierQuantizedMobileNet(getActivity());`
-         to
-         `classifier = new ImageClassifierFloatInception(getActivity());`
- - Build and run the demo app
-
-## Building TensorFlow Lite and the demo app from source
-
-### Clone the TensorFlow repo
-- git clone
-  [https://github.com/tensorflow/tensorflow](https://github.com/tensorflow/tensorflow)
-
-### Install Bazel
-If bazel is not installed on your system, install it now by following [these directions](https://bazel.build/versions/master/docs/install.html)
-
-NOTE: Bazel does not fully support building Android on Windows yet. Full support for Gradle/CMake builds is coming soon, but in the meantime Windows users should download the [prebuilt binary](https://storage.googleapis.com/download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk) instead.
-
-### Install Android NDK and SDK
-Bazel is the primary build system for TensorFlow. Bazel and the Android NDK and SDK must be installed on your system.
- - Install the latest version of Bazel as per the instructions on the [Bazel website](https://bazel.build/versions/master/docs/install.html)
- - The Android NDK is required to build the native (C/C++) TensorFlow Lite code. The current recommended version is 14b, which can be found [here](https://developer.android.com/ndk/downloads/older_releases.html#ndk-14b-downloads).
- - The Android SDK and build tools may be obtained [here](https://developer.android.com/tools/revisions/build-tools.html), or alternatively as part of [Android Studio](https://developer.android.com/studio/index.html). Build tools API >= 23 is required to build the TF Android demo (though it will run on API >= 21 devices).
- - In the root of the TensorFlow repository update the `WORKSPACE` file with the `api_level` and location of the SDK and NDK. If you installed it with AndroidStudio the SDK path can be found in the SDK manager, and the default NDK path is:`{SDK path}/ndk-bundle.`
-
-```
-android_sdk_repository (
-    name = "androidsdk",
-    api_level = 23,
-    build_tools_version = "23.0.2",
-    path = "/home/xxxx/android-sdk-linux/",
-)
-
-android_ndk_repository(
-    name = "androidndk",
-    path = "/home/xxxx/android-ndk-r10e/",
-    api_level = 19,
-)
-```
-
-Additional details on building with Android can be found [here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/java/demo/README.md).
-
-### Build the source code
-Run bazel with the following command to build the demo.
-
-Build the demo app:
-
-```
-bazel build --cxxopt=--std=c++11 //tensorflow/contrib/lite/java/demo/app/src/main:TfLiteCameraDemo
-```
-
-### Note
-
-Currently, we only support building the Android demo app within a Python 2
-environment (due to a Bazel bug).
-
-### More about the demo
-The demo is resizing each camera image frame to (224 width * 224 height) to match the quantized Mobilenet model being used (299 * 299 for Inception-v3). The resized image is converted into a ByteBuffer row by row of size 1 * 224 * 224 * 3 bytes, where 1 is the number of images in a batch. 224 * 224 (299 * 299) is the width and height of the image. 3 bytes represents three colors of a pixel. This demo uses the TensorFlow Lite Java inference API for models which take a single input and provide a single output. This outputs a two-dimensional array, with the first dimension being the category index and the second dimension being the confidence of classification. Both models have 1001 unique categories and the app sorts the probabilities of all the categories and displays the top three. The model file must be downloaded and bundled within the assets directory of the app.
-
-# iOS Demo App
-
-Similar to the Android demo app, there's an iOS camera app that uses exactly the same model (224 * 224 quantized Mobilenet).
-
-This demo app requires a camera so it doesn't work with simulators. It need to be executed on a real iOS device. Follow the instructions to build and run the demo app:
-
-1.   Run `tensorflow/contrib/lite/examples/ios/download_models.sh` to download the model files used by the demo app.
-1.   Install [CocoaPods](https://cocoapods.org/) if it wasn't installed yet: `sudo gem install cocoapods`.
-1.   Run `pod install` in `tensorflow/contrib/lite/examples/ios/camera` to generate the workspace file.
-1.   Open the project by running `open tflite_camera_example.xcworkspace`, and build the app in XCode.
-
-# TensorFlow Lite Quick Start
-
-## Step 1. Decide which GraphDef to use
- Depending on the use case, the developer may choose to use one of the popular
- open-sourced models such as InceptionV3 or MobileNets, re-train these models
- with their own custom data set or even build their own custom model.
-
-### Using a pre-trained model
-
-[MobileNets](https://research.googleblog.com/2017/06/mobilenets-open-source-models-for.html) is a family of mobile-first computer vision models for [TensorFlow](https://www.tensorflow.org/) designed to effectively maximize accuracy while being mindful of the restricted resources for an on-device or embedded application. MobileNets are small, low-latency, low-power models parameterized to meet the resource constraints of a variety of use cases. They can be built upon for classification, detection, embeddings and segmentation similar to how other popular large scale models, such as [Inception](https://arxiv.org/pdf/1602.07261.pdf), are used. Google provides 16 pre-trained [ImageNet](http://www.image-net.org/challenges/LSVRC/)  classification checkpoints for MobileNets for use in mobile projects of all sizes.
-
-[Inception-v3](https://arxiv.org/abs/1512.00567) is an image recognition model which achieves fairly high accuracy in recognizing general objects with 1000 classes, like "Zebra", "Dalmatian", and "Dishwasher". The model extracts general features from input images using a convolutional neural network and classifies them based on those features with fully-connected and softmax layers.
-
-[On Device Smart Reply](https://research.googleblog.com/2017/02/on-device-machine-intelligence.html)  is an on-device model which provides one-touch replies for an incoming text message by suggesting contextually relevant messages. The model is built specifically for memory constrained devices such as watches & phones and it has been successfully used to surface [Smart Replies on Android Wear](https://research.googleblog.com/2017/02/on-device-machine-intelligence.html). Note that this model only works on Android as of now.
-
-These pre-trained models can be downloaded from [here](g3doc/models.md).
-
-### Retrain Inception-V3 or MobileNet for a custom data set
-The above pre-trained models have been trained on the ImageNet data set, which consists of 1000 predefined classes. A model will need to be re-trained if these classes are not relevant or useful for a given use case. This technique is called transfer learning, which starts with a model that has been already trained on a problem and will then be retrained on a similar problem. Deep learning from scratch can take days, but transfer learning can be done fairly quickly. In order to do this, a developer will need to generate their custom data set labeled with the relevant classes.
-
-The [TensorFlow for Poets](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/) codelab walks through this process step-by-step. The retraining code supports retraining for both floating point and quantized inference.
-
-
-### Train a custom model
-A developer may choose to train a custom model using Tensorflow. TensorFlow documentation has [several tutorials](https://www.tensorflow.org/tutorials/) for building and training models. If the user has written a model using TensorFlow's Slim Framework the first step is to export this to a GraphDef file. This is necessary because Slim does not store the model structure outside the code, so to communicate with other parts of the framework it needs to be exported. Documentation for the export can be found [here](https://github.com/tensorflow/models/tree/master/research/slim#Export). The output of this step will be a .pb file for the custom model.
-
-TensorFlow Lite currently supports a subset of TensorFlow operators. Please refer to [this document](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md) for details of supported operators and their usage. This
-set will continue to expand in future releases of Tensorflow Lite.
-
-
-## Step 2. Model format conversion
-
-The model generated in Step 1 is a standard Tensorflow model. After the completion of Step 1 a user should have a standard .pb or .pbtxt GraphDef file. If the application developer is using a pre-trained model (as defined in Step 1 above), they can download a ready to use, already converted model for use from [here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/models.md). Models generated using retraining (aka transfer learning) or custom models will need to be converted using the steps mentioned below.
-
-A prerequisite to converting the model to the Tensorflow Lite format is to freeze the graph.
-
-Since we employ several formats, the following definitions may be useful:
- - GraphDef (.pb) - a protobuf that represents the TensorFlow training and or computation graph. This contains operators, tensors, and variables definitions.
-
- - CheckPoint (.ckpt) - Serialized variables from a TensorFlow graph. Note, this does not contain the graph structure, so alone it cannot typically be interpreted.
-
- - FrozenGraphDef - a subclass of GraphDef that contains no variables. A GraphDef can be converted to a frozen graphdef by taking a checkpoint and a graphdef and converting every variable into a constant with the value looked up in the checkpoint.
-
- - SavedModel - A collection of GraphDef and CheckPoint together with a signature that labels input and output arguments to a model. A GraphDef and Checkpoint can be extracted from a saved model.
-
- - TensorFlow lite model (.tflite) - a serialized flatbuffer, containing TensorFlow lite operators and Tensors for the TensorFlow lite interpreter. This is most analogous to TensorFlow frozen GraphDefs.
-
-### Freeze Graph
-To use this .pb GraphDef file within TensorFlow Lite, the application developer will need checkpoints containing trained weight parameters. The .pb contains only the structure of the graph. The process of merging the checkpoint values with the graph structure is known as "freezing" the graph.
-
-The developer should know where the checkpoints folder is present or checkpoints can also be downloaded for a pre-trained model (Example: Here is a link to the [MobileNets](https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet_v1.md)).
-
-Graph freezing can be done using the command below (and modifying the arguments appropriately)
-
-```
-bazel build tensorflow/python/tools:freeze_graph
-
-bazel-bin/tensorflow/python/tools/freeze_graph\
-    --input_graph=/tmp/mobilenet_v1_224.pb \
-    --input_checkpoint=/tmp/checkpoints/mobilenet-10202.ckpt \
-    --input_binary=true --output_graph=/tmp/frozen_mobilenet_v1_224.pb \
-    --output_node_names=MobileNet/Predictions/Reshape_1
-```
-
-The user has to first build the freeze_graph script using bazel and then run the script.  The input_binary flag has to be enabled to ensure that the protobuf is read and written in binary format.  The user has to input the .pb and the .ckpt files to freeze the graph The output_node_names may not be obvious outside of the code that built the model. The easiest way to find them is to visualize the graph, either with
-graphviz, or [in tensorboard](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets-2/#3).
-
-This frozen Graphdef is now ready to be converted to flatbuffer format (.tflite) for use on Android or iOS.  On Android users have the flexibility to use either the float or quantized versions of the frozen graphdef, if available, using the Tensorflow Optimizing Converter tool.
-
-Here is a sample command line to convert the frozen Graphdef to '.tflite' format for  The Tensorflow Optimizing Converter supports both float and quantized models, however, different configuration parameters are needed depending on whether a FLOAT or QUANTIZED mode is being used.
-(Here is a link to the pb [file](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_1.0_224_frozen.tgz)).
-
-```
-bazel build tensorflow/contrib/lite/toco:toco
-
-bazel-bin/tensorflow/contrib/lite/toco/toco \
-  --input_file=$(pwd)/mobilenet_v1_1.0_224/frozen_graph.pb \
-  --input_format=TENSORFLOW_GRAPHDEF  --output_format=TFLITE \
-  --output_file=/tmp/mobilenet_v1_1.0_224.tflite --inference_type=FLOAT \
-  --input_type=FLOAT --input_arrays=input \
-  --output_arrays=MobilenetV1/Predictions/Reshape_1 --input_shapes=1,224,224,3
-```
-
-- The input_file argument should point to the frozen GraphDef file that holds the model architecture.
-- The output_file argument should point to where the TensorFlow Lite model file should be generated.
-- The input_type and inference_type arguments should be set to FLOAT, unless converted a [quantized](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/) model.
-- Setting the input_array, output_array and input_shape arguments are a bit trickier. The easiest way to find these values is to explore the graph in tensorboard .  The user should reuse the arguments that were used for specifying the output nodes for inference in the `freeze_graph`step.
-
-Note, it is also possible to use the Tensorflow Optimizing Converter through protos either from Python or from the command line see the
-documentation [here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/toco/python/toco_from_protos.py). A developer can then integrate the conversion step into their model design workflow to ensure that a model will be easily convertible to a mobile inference graph. For example,
-
-```python
-import tensorflow as tf
-
-img = tf.placeholder(name="img", dtype=tf.float32, shape=(1, 64, 64, 3))
-val = img + tf.constant([1., 2., 3.]) + tf.constant([1., 4., 4.])
-out = tf.identity(val, name="out")
-with tf.Session() as sess:
-  tflite_model = tf.contrib.lite.toco_convert(sess.graph_def, [img], [out])
-  open("converteds_model.tflite", "wb").write(tflite_model)
-
-```
-For detailed instructions on how to use the Tensorflow Optimizing Converter, please see [here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md).
-
-You may refer to the [Ops compatibility guide](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md) for troubleshooting help. If that doesn't help, please file an [issue](https://github.com/tensorflow/tensorflow/issues).
-
-If you would like to see a visual description of your TensorFlow Lite model after conversion, you can use tensorflow/contrib/lite/tools/visualize.py by running
-```sh
-bazel run tensorflow/contrib/lite/tools:visualize -- model.tflite model_viz.html
-```
-and then visualize the resulting HTML file in a browser.
-
-## Step 3. Use the TensorFlow Lite model for inference in a mobile app
-
-After completion of Step 2 the developer should have a .tflite model.
-
-### For Android
-Because Android apps need to be written in Java, and core TensorFlow is in C++, a JNI library is provided to interface between the two. Its interface is aimed only at inference, so it provides the ability to load a graph, set up inputs, and run the model to calculate particular outputs. The full documentation for the set of methods can be seen [here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/g3doc/). The demo app is also open sourced on [github](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/java/demo/app).
-
-The [demo app](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/java/demo/app)  uses this interface, so it's a good place to look for example usage. You can also download the prebuilt binary [here](http://download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk).
-
-Note that you'd need to follow instructions for installing TensorFlow on Android, setting up bazel and Android Studio outlined [here](https://www.tensorflow.org/mobile/android_build).
-
-### For iOS
-Follow the documentation [here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/ios.md) to get integrate a TFLite model into your app.
-
-## Core ML support
-
-Core ML is a machine learning framework used across Apple products. In addition to using Tensorflow Lite models directly in their applications, developers have the option to convert their trained Tensorflow models to the [CoreML](https://developer.apple.com/machine-learning/) format for use on Apple devices. For information on how to use the converter please refer to the [Tensorflow-CoreML converter documentation](https://github.com/tf-coreml/tf-coreml).
+See the documentation: https://www.tensorflow.org/mobile/tflite/
+Documentation edits can be made here: [tensorflow/docs_src/mobile/tflite](../../docs_src/mobile/tflite)
diff --git a/tensorflow/contrib/lite/arena_planner.h b/tensorflow/contrib/lite/arena_planner.h
index f84b3dad9550e789237c8e45971002c7d336b9d3..e9d0fbc5a9b5aec06e28da8757466b25f40da2f5 100644
--- a/tensorflow/contrib/lite/arena_planner.h
+++ b/tensorflow/contrib/lite/arena_planner.h
@@ -25,7 +25,7 @@ limitations under the License.
 
 namespace tflite {
 
-class AllocationInfo;
+struct AllocationInfo;
 
 // A memory planner that makes all the allocations using arenas.
 //
diff --git a/tensorflow/contrib/lite/build_ios_universal_lib.sh b/tensorflow/contrib/lite/build_ios_universal_lib.sh
index 4a9023ff33de15dd384531d51e39de4ffeecdb8b..9f398f4a9f3dcafd7bd49fd5d95e9991b8b36b75 100755
--- a/tensorflow/contrib/lite/build_ios_universal_lib.sh
+++ b/tensorflow/contrib/lite/build_ios_universal_lib.sh
@@ -19,11 +19,16 @@ set -e
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 cd "$SCRIPT_DIR/../../.."
 
-make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=x86_64 -j 8
-make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=i386 -j 8
-make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=armv7 -j 8
-make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=armv7s -j 8
-make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=arm64 -j 8
+make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=x86_64 -j 8 \
+$SCRIPT_DIR/gen/lib/ios_x86_64/libtensorflow-lite.a
+make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=i386 -j 8 \
+$SCRIPT_DIR/gen/lib/ios_i386/libtensorflow-lite.a
+make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=armv7 -j 8 \
+$SCRIPT_DIR/gen/lib/ios_armv7/libtensorflow-lite.a
+make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=armv7s -j 8 \
+$SCRIPT_DIR/gen/lib/ios_armv7s/libtensorflow-lite.a
+make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=arm64 -j 8 \
+$SCRIPT_DIR/gen/lib/ios_arm64/libtensorflow-lite.a
 
 lipo \
 tensorflow/contrib/lite/gen/lib/ios_x86_64/libtensorflow-lite.a \
diff --git a/tensorflow/contrib/lite/builtin_op_data.h b/tensorflow/contrib/lite/builtin_op_data.h
index 5fc8954743e5b3b458e5c2004f4378cbad6056c0..2b6c24768c0f35b91d0dabf8a5723e73f040cc3b 100644
--- a/tensorflow/contrib/lite/builtin_op_data.h
+++ b/tensorflow/contrib/lite/builtin_op_data.h
@@ -17,6 +17,8 @@ limitations under the License.
 
 #include <stdint.h>
 
+#include "tensorflow/contrib/lite/context.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif  // __cplusplus
@@ -174,6 +176,11 @@ typedef struct {
   int block_size;
 } TfLiteSpaceToDepthParams;
 
+typedef struct {
+  TfLiteType in_data_type;
+  TfLiteType out_data_type;
+} TfLiteCastParams;
+
 typedef enum {
   kTfLiteCombinerTypeSum = 0,
   kTfLiteCombinerTypeMean = 1,
diff --git a/tensorflow/contrib/lite/builtin_ops.h b/tensorflow/contrib/lite/builtin_ops.h
index ea3ae3489ecf07b22a02829c5235ad59264496af..17b791e4e2f38d9a1108d35d1298445a1c370727 100644
--- a/tensorflow/contrib/lite/builtin_ops.h
+++ b/tensorflow/contrib/lite/builtin_ops.h
@@ -24,8 +24,7 @@ extern "C" {
 #endif  // __cplusplus
 
 // The enum for builtin operators.
-// Note: CUSTOM and DELEGATE are 2 special ops which are not real builtin
-// ops.
+// Note: CUSTOM and DELEGATE are 2 special ops which are not real built-in ops.
 typedef enum {
   kTfLiteBuiltinAdd = 0,
   kTfLiteBuiltinAveragePool2d = 1,
@@ -79,6 +78,8 @@ typedef enum {
   kTfLiteBuiltinDelegate = 51,
   kTfLiteBuiltinBidirectionalSequenceLstm = 52,
   kTfLiteBuiltinCast = 53,
+  kTfLiteBuiltinPrelu = 54,
+  kTfLiteBuiltinMaximum = 55,
 } TfLiteBuiltinOperator;
 
 #ifdef __cplusplus
diff --git a/tensorflow/contrib/lite/examples/android/AndroidManifest.xml b/tensorflow/contrib/lite/examples/android/AndroidManifest.xml
new file mode 100644
index 0000000000000000000000000000000000000000..bc9574d646b7661de8ac9b745bd53cbba1eb9f31
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/AndroidManifest.xml
@@ -0,0 +1,65 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<manifest xmlns:android="http://schemas.android.com/apk/res/android"
+    package="org.tensorflow.lite.demo">
+
+    <uses-permission android:name="android.permission.CAMERA" />
+    <uses-feature android:name="android.hardware.camera" />
+    <uses-feature android:name="android.hardware.camera.autofocus" />
+    <uses-permission android:name="android.permission.WRITE_EXTERNAL_STORAGE"/>
+    <uses-permission android:name="android.permission.RECORD_AUDIO" />
+
+    <uses-sdk
+        android:minSdkVersion="21"
+        android:targetSdkVersion="23" />
+
+    <application android:allowBackup="true"
+        android:debuggable="true"
+        android:label="@string/app_name"
+        android:icon="@drawable/ic_launcher"
+        android:theme="@style/MaterialTheme">
+
+        <activity android:name="org.tensorflow.demo.ClassifierActivity"
+                  android:screenOrientation="portrait"
+                  android:label="@string/activity_name_classification">
+            <intent-filter>
+                <action android:name="android.intent.action.MAIN" />
+                <category android:name="android.intent.category.LAUNCHER" />
+            </intent-filter>
+        </activity>
+
+        <activity android:name="org.tensorflow.demo.DetectorActivity"
+                  android:screenOrientation="portrait"
+                  android:label="@string/activity_name_detection">
+            <intent-filter>
+                <action android:name="android.intent.action.MAIN" />
+                <category android:name="android.intent.category.LAUNCHER" />
+            </intent-filter>
+        </activity>
+
+        <activity android:name="org.tensorflow.demo.SpeechActivity"
+            android:screenOrientation="portrait"
+            android:label="@string/activity_name_speech">
+            <intent-filter>
+                <action android:name="android.intent.action.MAIN" />
+                <category android:name="android.intent.category.LAUNCHER" />
+            </intent-filter>
+        </activity>
+    </application>
+
+</manifest>
diff --git a/tensorflow/contrib/lite/examples/android/BUILD b/tensorflow/contrib/lite/examples/android/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..49280129971e38247c2216d9422bc5de9176e13d
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/BUILD
@@ -0,0 +1,86 @@
+# Description:
+#   TensorFlow camera demo app for Android.
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+# Build the demo native demo lib from the original directory to reduce code
+# reuse. Note that the Java counterparts (ObjectTracker.java and
+# ImageUtils.java) are still duplicated.
+cc_library(
+    name = "tensorflow_native_libs",
+    srcs = [
+        "//tensorflow/examples/android:libtensorflow_demo.so",
+    ],
+    tags = [
+        "manual",
+        "notap",
+    ],
+)
+
+android_binary(
+    name = "tflite_demo",
+    srcs = glob([
+        "src/**/*.java",
+    ]),
+    # Package assets from assets dir as well as all model targets.
+    # Remove undesired models (and corresponding Activities in source)
+    # to reduce APK size.
+    assets = [
+        "//tensorflow/contrib/lite/examples/android/assets:labels_mobilenet_quant_v1_224.txt",
+        "@tflite_mobilenet//:mobilenet_quant_v1_224.tflite",
+        "@tflite_conv_actions_frozen//:conv_actions_frozen.tflite",
+        "//tensorflow/contrib/lite/examples/android/assets:conv_actions_labels.txt",
+        "@tflite_mobilenet_ssd//:mobilenet_ssd.tflite",
+        "//tensorflow/contrib/lite/examples/android/assets:box_priors.txt",
+        "//tensorflow/contrib/lite/examples/android/assets:coco_labels_list.txt",
+    ],
+    assets_dir = "",
+    custom_package = "org.tensorflow.lite.demo",
+    inline_constants = 1,
+    manifest = "AndroidManifest.xml",
+    manifest_merger = "android",
+    nocompress_extensions = [
+        ".tflite",
+    ],
+    resource_files = glob(["res/**"]),
+    tags = [
+        "manual",
+        "notap",
+    ],
+    deps = [
+        ":tensorflow_native_libs",
+        "//tensorflow/contrib/lite/java:tensorflowlite",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+            "bin/**",
+            "gen/**",
+            "gradleBuild/**",
+            "libs/**",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+filegroup(
+    name = "java_files",
+    srcs = glob(["src/**/*.java"]),
+)
+
+filegroup(
+    name = "resource_files",
+    srcs = glob(["res/**"]),
+)
+
+exports_files(["AndroidManifest.xml"])
diff --git a/tensorflow/contrib/lite/examples/android/assets/BUILD b/tensorflow/contrib/lite/examples/android/assets/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..dd0cd6c98ff878e9c41875cab74c12191cadb173
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/assets/BUILD
@@ -0,0 +1,24 @@
+package(default_visibility = ["//visibility:private"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(
+    glob(
+        ["**/*"],
+        exclude = [
+            "BUILD",
+        ],
+    ),
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/lite/examples/android/assets/box_priors.txt b/tensorflow/contrib/lite/examples/android/assets/box_priors.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7246b073fe7fd8b1d1340536457c8aeac24cd5a3
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/assets/box_priors.txt
@@ -0,0 +1,5 @@
+        0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.049999997 0.049999997 0.049999997 0.05 0.050000012 0.049999997 0.049999997 0.049999997 0.049999997 0.05 0.050000012 0.049999997 0.049999997 0.049999997 0.049999997 0.05 0.050000012 0.049999997 0.049999997 0.049999997 0.049999997 0.05 0.050000012 0.049999997 0.049999997 0.049999997 0.049999997 0.05 0.050000012 0.049999997 0.049999997 0.049999997 0.049999997 0.05 0.050000012 0.049999997 0.049999997 0.049999997 0.049999997 0.05 0.050000012 0.049999997 0.049999997 0.049999997 0.049999997 0.05 0.050000012 0.049999997 0.049999997 0.049999997 0.049999997 0.05 0.050000012 0.049999997 0.049999997 0.049999997 0.049999997 0.05 0.050000012 0.049999997 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.25 0.25 0.25 0.25 0.25000003 0.25 0.25 0.25 0.25 0.25 0.25000003 0.25 0.25 0.25 0.25 0.25 0.25000003 0.25 0.25 0.25 0.25 0.25 0.25000003 0.25 0.25 0.25 0.25 0.25 0.25000003 0.25 0.25 0.25 0.25 0.25 0.25000003 0.25 0.25 0.25 0.25 0.25 0.25000003 0.25 0.25 0.25 0.25 0.25 0.25000003 0.25 0.25 0.25 0.25 0.25 0.25000003 0.25 0.25 0.25 0.25 0.25 0.25000003 0.25 0.35000002 0.35000002 0.35000002 0.35000002 0.35000005 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000005 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000005 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000005 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000005 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000005 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000005 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000005 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000005 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000005 0.35000002 0.45 0.45000002 0.45000002 0.45000002 0.45000002 0.45000002 0.45 0.45000002 0.45000002 0.45000002 0.45000002 0.45000002 0.45 0.45000002 0.45000002 0.45000002 0.45000002 0.45000002 0.45 0.45000002 0.45000002 0.45000002 0.45000002 0.45000002 0.45 0.45000002 0.45000002 0.45000002 0.45000002 0.45000002 0.45 0.45000002 0.45000002 0.45000002 0.45000002 0.45000002 0.45 0.45000002 0.45000002 0.45000002 0.45000002 0.45000002 0.45 0.45000002 0.45000002 0.45000002 0.45000002 0.45000002 0.45 0.45000002 0.45000002 0.45000002 0.45000002 0.45000002 0.45 0.45000002 0.45000002 0.45000002 0.45000002 0.45000002 0.55 0.55 0.55 0.55 0.54999995 0.55 0.55 0.55 0.55 0.55 0.54999995 0.55 0.55 0.55 0.55 0.55 0.54999995 0.55 0.55 0.55 0.55 0.55 0.54999995 0.55 0.55 0.55 0.55 0.55 0.54999995 0.55 0.55 0.55 0.55 0.55 0.54999995 0.55 0.55 0.55 0.55 0.55 0.54999995 0.55 0.55 0.55 0.55 0.55 0.54999995 0.55 0.55 0.55 0.55 0.55 0.54999995 0.55 0.55 0.55 0.55 0.55 0.54999995 0.55 0.65000004 0.65000004 0.65000004 0.65000004 0.65 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65 0.65000004 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.099999994 0.1 0.099999994 0.1 0.099999994 0.099999994 0.099999994 0.1 0.099999994 0.1 0.099999994 0.099999994 0.099999994 0.1 0.099999994 0.1 0.099999994 0.099999994 0.099999994 0.1 0.099999994 0.1 0.099999994 0.099999994 0.099999994 0.1 0.099999994 0.1 0.099999994 0.099999994 0.30000004 0.3 0.3 0.3 0.3 0.30000004 0.30000004 0.3 0.3 0.3 0.3 0.30000004 0.30000004 0.3 0.3 0.3 0.3 0.30000004 0.30000004 0.3 0.3 0.3 0.3 0.30000004 0.30000004 0.3 0.3 0.3 0.3 0.30000004 0.49999997 0.5 0.5 0.5 0.5 0.49999997 0.49999997 0.5 0.5 0.5 0.5 0.49999997 0.49999997 0.5 0.5 0.5 0.5 0.49999997 0.49999997 0.5 0.5 0.5 0.5 0.49999997 0.49999997 0.5 0.5 0.5 0.5 0.49999997 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.90000004 0.90000004 0.9 0.90000004 0.90000004 0.90000004 0.90000004 0.90000004 0.9 0.90000004 0.90000004 0.90000004 0.90000004 0.90000004 0.9 0.90000004 0.90000004 0.90000004 0.90000004 0.90000004 0.9 0.90000004 0.90000004 0.90000004 0.90000004 0.90000004 0.9 0.90000004 0.90000004 0.90000004 0.16666667 0.16666667 0.16666666 0.16666667 0.16666669 0.16666667 0.16666667 0.16666667 0.16666666 0.16666667 0.16666669 0.16666667 0.16666667 0.16666667 0.16666666 0.16666667 0.16666669 0.16666667 0.5 0.5 0.49999997 0.5 0.5 0.5 0.5 0.5 0.49999997 0.5 0.5 0.5 0.5 0.5 0.49999997 0.5 0.5 0.5 0.8333334 0.8333334 0.8333334 0.8333334 0.8333334 0.8333334 0.8333334 0.8333334 0.8333334 0.8333334 0.8333334 0.8333334 0.8333334 0.8333334 0.8333334 0.8333334 0.8333334 0.8333334 0.25 0.25 0.25 0.24999999 0.25 0.25 0.25 0.25 0.25 0.24999999 0.25 0.25 0.75 0.75 0.75 0.75 0.74999994 0.75 0.75 0.75 0.75 0.75 0.74999994 0.75 0.5 0.5 0.5 0.5 0.5 0.5 
+        0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.049999997 0.049999997 0.050000004 0.050000012 0.05 0.049999997 0.15 0.14999999 0.15 0.15 0.15 0.15 0.25 0.25 0.25 0.25 0.25 0.25 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.45000002 0.45 0.45000002 0.45000002 0.45 0.45000002 0.55 0.55 0.55 0.55 0.55 0.55 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.75 0.75 0.75 0.75 0.75 0.75 0.85 0.85 0.85 0.85 0.85 0.85 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.049999997 0.049999997 0.050000004 0.050000012 0.05 0.049999997 0.15 0.14999999 0.15 0.15 0.15 0.15 0.25 0.25 0.25 0.25 0.25 0.25 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.45000002 0.45 0.45000002 0.45000002 0.45 0.45000002 0.55 0.55 0.55 0.55 0.55 0.55 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.75 0.75 0.75 0.75 0.75 0.75 0.85 0.85 0.85 0.85 0.85 0.85 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.049999997 0.049999997 0.050000004 0.050000012 0.05 0.049999997 0.15 0.14999999 0.15 0.15 0.15 0.15 0.25 0.25 0.25 0.25 0.25 0.25 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.45000002 0.45 0.45000002 0.45000002 0.45 0.45000002 0.55 0.55 0.55 0.55 0.55 0.55 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.75 0.75 0.75 0.75 0.75 0.75 0.85 0.85 0.85 0.85 0.85 0.85 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.049999997 0.049999997 0.050000004 0.050000012 0.05 0.049999997 0.15 0.14999999 0.15 0.15 0.15 0.15 0.25 0.25 0.25 0.25 0.25 0.25 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.45000002 0.45 0.45000002 0.45000002 0.45 0.45000002 0.55 0.55 0.55 0.55 0.55 0.55 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.75 0.75 0.75 0.75 0.75 0.75 0.85 0.85 0.85 0.85 0.85 0.85 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.049999997 0.049999997 0.050000004 0.050000012 0.05 0.049999997 0.15 0.14999999 0.15 0.15 0.15 0.15 0.25 0.25 0.25 0.25 0.25 0.25 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.45000002 0.45 0.45000002 0.45000002 0.45 0.45000002 0.55 0.55 0.55 0.55 0.55 0.55 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.75 0.75 0.75 0.75 0.75 0.75 0.85 0.85 0.85 0.85 0.85 0.85 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.049999997 0.049999997 0.050000004 0.050000012 0.05 0.049999997 0.15 0.14999999 0.15 0.15 0.15 0.15 0.25 0.25 0.25 0.25 0.25 0.25 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.45000002 0.45 0.45000002 0.45000002 0.45 0.45000002 0.55 0.55 0.55 0.55 0.55 0.55 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.75 0.75 0.75 0.75 0.75 0.75 0.85 0.85 0.85 0.85 0.85 0.85 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.049999997 0.049999997 0.050000004 0.050000012 0.05 0.049999997 0.15 0.14999999 0.15 0.15 0.15 0.15 0.25 0.25 0.25 0.25 0.25 0.25 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.45000002 0.45 0.45000002 0.45000002 0.45 0.45000002 0.55 0.55 0.55 0.55 0.55 0.55 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.75 0.75 0.75 0.75 0.75 0.75 0.85 0.85 0.85 0.85 0.85 0.85 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.049999997 0.049999997 0.050000004 0.050000012 0.05 0.049999997 0.15 0.14999999 0.15 0.15 0.15 0.15 0.25 0.25 0.25 0.25 0.25 0.25 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.45000002 0.45 0.45000002 0.45000002 0.45 0.45000002 0.55 0.55 0.55 0.55 0.55 0.55 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.75 0.75 0.75 0.75 0.75 0.75 0.85 0.85 0.85 0.85 0.85 0.85 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.049999997 0.049999997 0.050000004 0.050000012 0.05 0.049999997 0.15 0.14999999 0.15 0.15 0.15 0.15 0.25 0.25 0.25 0.25 0.25 0.25 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.45000002 0.45 0.45000002 0.45000002 0.45 0.45000002 0.55 0.55 0.55 0.55 0.55 0.55 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.75 0.75 0.75 0.75 0.75 0.75 0.85 0.85 0.85 0.85 0.85 0.85 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.049999997 0.049999997 0.050000004 0.050000012 0.05 0.049999997 0.15 0.14999999 0.15 0.15 0.15 0.15 0.25 0.25 0.25 0.25 0.25 0.25 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.45000002 0.45 0.45000002 0.45000002 0.45 0.45000002 0.55 0.55 0.55 0.55 0.55 0.55 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.75 0.75 0.75 0.75 0.75 0.75 0.85 0.85 0.85 0.85 0.85 0.85 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.10000001 0.099999994 0.1 0.099999994 0.1 0.099999994 0.3 0.3 0.3 0.29999998 0.3 0.30000004 0.5 0.5 0.5 0.5 0.5 0.49999997 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.9 0.90000004 0.90000004 0.9 0.90000004 0.90000004 0.10000001 0.099999994 0.1 0.099999994 0.1 0.099999994 0.3 0.3 0.3 0.29999998 0.3 0.30000004 0.5 0.5 0.5 0.5 0.5 0.49999997 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.9 0.90000004 0.90000004 0.9 0.90000004 0.90000004 0.10000001 0.099999994 0.1 0.099999994 0.1 0.099999994 0.3 0.3 0.3 0.29999998 0.3 0.30000004 0.5 0.5 0.5 0.5 0.5 0.49999997 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.9 0.90000004 0.90000004 0.9 0.90000004 0.90000004 0.10000001 0.099999994 0.1 0.099999994 0.1 0.099999994 0.3 0.3 0.3 0.29999998 0.3 0.30000004 0.5 0.5 0.5 0.5 0.5 0.49999997 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.9 0.90000004 0.90000004 0.9 0.90000004 0.90000004 0.10000001 0.099999994 0.1 0.099999994 0.1 0.099999994 0.3 0.3 0.3 0.29999998 0.3 0.30000004 0.5 0.5 0.5 0.5 0.5 0.49999997 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.9 0.90000004 0.90000004 0.9 0.90000004 0.90000004 0.16666667 0.16666669 0.16666667 0.16666669 0.16666667 0.16666667 0.49999997 0.5 0.5 0.50000006 0.5 0.5 0.8333334 0.8333334 0.8333334 0.8333333 0.8333334 0.8333334 0.16666667 0.16666669 0.16666667 0.16666669 0.16666667 0.16666667 0.49999997 0.5 0.5 0.50000006 0.5 0.5 0.8333334 0.8333334 0.8333334 0.8333333 0.8333334 0.8333334 0.16666667 0.16666669 0.16666667 0.16666669 0.16666667 0.16666667 0.49999997 0.5 0.5 0.50000006 0.5 0.5 0.8333334 0.8333334 0.8333334 0.8333333 0.8333334 0.8333334 0.25 0.25 0.25 0.25 0.25 0.25 0.75 0.75 0.75 0.75 0.75 0.75 0.25 0.25 0.25 0.25 0.25 0.25 0.75 0.75 0.75 0.75 0.75 0.75 0.5 0.5 0.5 0.5 0.5 0.5 
+        0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.35000002 0.2474874 0.4949748 0.20207259 0.6062481 0.41833 0.35000002 0.2474874 0.4949748 0.20207259 0.6062481 0.41833 0.35000002 0.2474874 0.4949748 0.20207259 0.6062481 0.41833 0.35000002 0.2474874 0.4949748 0.20207259 0.6062481 0.41833 0.35000002 0.2474874 0.4949748 0.20207259 0.6062481 0.41833 0.35000002 0.2474874 0.4949748 0.20207259 0.6062481 0.41833 0.35000002 0.2474874 0.4949748 0.20207259 0.6062481 0.41833 0.35000002 0.2474874 0.4949748 0.20207259 0.6062481 0.41833 0.35000002 0.2474874 0.4949748 0.20207259 0.6062481 0.41833 0.35000002 0.2474874 0.4949748 0.20207259 0.6062481 0.41833 0.35000002 0.24748738 0.4949748 0.20207258 0.6062481 0.41833 0.35000002 0.24748738 0.4949748 0.20207258 0.6062481 0.41833 0.35000002 0.24748738 0.4949748 0.20207258 0.6062481 0.41833 0.35000002 0.24748738 0.4949748 0.20207258 0.6062481 0.41833 0.35000002 0.24748738 0.4949748 0.20207258 0.6062481 0.41833 0.35000002 0.24748738 0.4949748 0.20207258 0.6062481 0.41833 0.35000002 0.24748738 0.4949748 0.20207258 0.6062481 0.41833 0.35000002 0.24748738 0.4949748 0.20207258 0.6062481 0.41833 0.35000002 0.24748738 0.4949748 0.20207258 0.6062481 0.41833 0.35000002 0.24748738 0.4949748 0.20207258 0.6062481 0.41833 0.35000002 0.24748741 0.4949748 0.2020726 0.60624814 0.41833 0.35000002 0.24748741 0.4949748 0.2020726 0.60624814 0.41833 0.35000002 0.24748741 0.4949748 0.2020726 0.60624814 0.41833 0.35000002 0.24748741 0.4949748 0.2020726 0.60624814 0.41833 0.35000002 0.24748741 0.4949748 0.2020726 0.60624814 0.41833 0.35000002 0.24748741 0.4949748 0.2020726 0.60624814 0.41833 0.35000002 0.24748741 0.4949748 0.2020726 0.60624814 0.41833 0.35000002 0.24748741 0.4949748 0.2020726 0.60624814 0.41833 0.35000002 0.24748741 0.4949748 0.2020726 0.60624814 0.41833 0.35000002 0.24748741 0.4949748 0.2020726 0.60624814 0.41833 0.35000002 0.24748741 0.49497482 0.2020726 0.60624814 0.41832998 0.35000002 0.24748741 0.49497482 0.2020726 0.60624814 0.41832998 0.35000002 0.24748741 0.49497482 0.2020726 0.60624814 0.41832998 0.35000002 0.24748741 0.49497482 0.2020726 0.60624814 0.41832998 0.35000002 0.24748741 0.49497482 0.2020726 0.60624814 0.41832998 0.35000002 0.24748741 0.49497482 0.2020726 0.60624814 0.41832998 0.35000002 0.24748741 0.49497482 0.2020726 0.60624814 0.41832998 0.35000002 0.24748741 0.49497482 0.2020726 0.60624814 0.41832998 0.35000002 0.24748741 0.49497482 0.2020726 0.60624814 0.41832998 0.35000002 0.24748741 0.49497482 0.2020726 0.60624814 0.41832998 0.35 0.24748737 0.4949748 0.20207256 0.6062481 0.41833 0.35 0.24748737 0.4949748 0.20207256 0.6062481 0.41833 0.35 0.24748737 0.4949748 0.20207256 0.6062481 0.41833 0.35 0.24748737 0.4949748 0.20207256 0.6062481 0.41833 0.35 0.24748737 0.4949748 0.20207256 0.6062481 0.41833 0.35 0.24748737 0.4949748 0.20207256 0.6062481 0.41833 0.35 0.24748737 0.4949748 0.20207256 0.6062481 0.41833 0.35 0.24748737 0.4949748 0.20207256 0.6062481 0.41833 0.35 0.24748737 0.4949748 0.20207256 0.6062481 0.41833 0.35 0.24748737 0.4949748 0.20207256 0.6062481 0.41833 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000002 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000002 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000002 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000002 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000002 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000002 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000002 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000002 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000002 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000008 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000008 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000008 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000008 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000008 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000008 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000008 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000008 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000008 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000008 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.34999996 0.24748737 0.49497485 0.20207262 0.60624814 0.41832995 0.34999996 0.24748737 0.49497485 0.20207262 0.60624814 0.41832995 0.34999996 0.24748737 0.49497485 0.20207262 0.60624814 0.41832995 0.34999996 0.24748737 0.49497485 0.20207262 0.60624814 0.41832995 0.34999996 0.24748737 0.49497485 0.20207262 0.60624814 0.41832995 0.34999996 0.24748737 0.49497485 0.20207262 0.60624814 0.41832995 0.34999996 0.24748737 0.49497485 0.20207262 0.60624814 0.41832995 0.34999996 0.24748737 0.49497485 0.20207262 0.60624814 0.41832995 0.34999996 0.24748737 0.49497485 0.20207262 0.60624814 0.41832995 0.34999996 0.24748737 0.49497485 0.20207262 0.60624814 0.41832995 0.50000006 0.3535534 0.7071068 0.28867513 0.8660687 0.57008773 0.50000006 0.3535534 0.7071068 0.28867513 0.8660687 0.57008773 0.50000006 0.3535534 0.7071068 0.28867513 0.8660687 0.57008773 0.50000006 0.3535534 0.7071068 0.28867513 0.8660687 0.57008773 0.50000006 0.3535534 0.7071068 0.28867513 0.8660687 0.57008773 0.5000001 0.3535534 0.7071068 0.28867513 0.8660687 0.5700878 0.5000001 0.3535534 0.7071068 0.28867513 0.8660687 0.5700878 0.5000001 0.3535534 0.7071068 0.28867513 0.8660687 0.5700878 0.5000001 0.3535534 0.7071068 0.28867513 0.8660687 0.5700878 0.5000001 0.3535534 0.7071068 0.28867513 0.8660687 0.5700878 0.5 0.3535534 0.7071068 0.2886751 0.8660687 0.5700877 0.5 0.3535534 0.7071068 0.2886751 0.8660687 0.5700877 0.5 0.3535534 0.7071068 0.2886751 0.8660687 0.5700877 0.5 0.3535534 0.7071068 0.2886751 0.8660687 0.5700877 0.5 0.3535534 0.7071068 0.2886751 0.8660687 0.5700877 0.5 0.3535534 0.7071068 0.28867507 0.8660688 0.5700877 0.5 0.3535534 0.7071068 0.28867507 0.8660688 0.5700877 0.5 0.3535534 0.7071068 0.28867507 0.8660688 0.5700877 0.5 0.3535534 0.7071068 0.28867507 0.8660688 0.5700877 0.5 0.3535534 0.7071068 0.28867507 0.8660688 0.5700877 0.5000001 0.3535534 0.70710677 0.2886752 0.8660687 0.5700878 0.5000001 0.3535534 0.70710677 0.2886752 0.8660687 0.5700878 0.5000001 0.3535534 0.70710677 0.2886752 0.8660687 0.5700878 0.5000001 0.3535534 0.70710677 0.2886752 0.8660687 0.5700878 0.5000001 0.3535534 0.70710677 0.2886752 0.8660687 0.5700878 0.65000004 0.45961943 0.91923887 0.37527767 1.1258893 0.7211102 0.65000004 0.45961943 0.91923887 0.37527767 1.1258893 0.7211102 0.65000004 0.45961943 0.91923887 0.37527767 1.1258893 0.7211102 0.6500001 0.4596194 0.9192388 0.37527764 1.1258893 0.7211102 0.6500001 0.4596194 0.9192388 0.37527764 1.1258893 0.7211102 0.6500001 0.4596194 0.9192388 0.37527764 1.1258893 0.7211102 0.6500001 0.45961946 0.9192388 0.3752777 1.1258893 0.72111017 0.6500001 0.45961946 0.9192388 0.3752777 1.1258893 0.72111017 0.6500001 0.45961946 0.9192388 0.3752777 1.1258893 0.72111017 0.8000001 0.5656855 1.131371 0.4618802 1.3857099 0.8717798 0.8000001 0.5656855 1.131371 0.4618802 1.3857099 0.8717798 0.80000013 0.5656855 1.131371 0.4618802 1.3857098 0.87177986 0.80000013 0.5656855 1.131371 0.4618802 1.3857098 0.87177986 0.95000005 0.6717515 1.343503 0.5484828 1.6455305 0.97467947 
+        0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.34999996 0.4949747 0.24748735 0.60621774 0.20206249 0.41833 0.34999996 0.49497467 0.24748737 0.60621774 0.20206249 0.41833 0.34999996 0.49497473 0.24748737 0.60621774 0.20206249 0.41833 0.34999993 0.49497473 0.24748737 0.60621774 0.20206249 0.41832998 0.34999996 0.49497467 0.24748737 0.60621774 0.20206246 0.41833 0.35 0.49497473 0.24748734 0.60621774 0.20206249 0.41833004 0.35 0.49497473 0.2474873 0.60621774 0.20206249 0.41833004 0.3499999 0.49497473 0.2474873 0.6062178 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062177 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062178 0.20206255 0.41832995 0.34999996 0.4949747 0.24748735 0.60621774 0.20206249 0.41833 0.34999996 0.49497467 0.24748737 0.60621774 0.20206249 0.41833 0.34999996 0.49497473 0.24748737 0.60621774 0.20206249 0.41833 0.34999993 0.49497473 0.24748737 0.60621774 0.20206249 0.41832998 0.34999996 0.49497467 0.24748737 0.60621774 0.20206246 0.41833 0.35 0.49497473 0.24748734 0.60621774 0.20206249 0.41833004 0.35 0.49497473 0.2474873 0.60621774 0.20206249 0.41833004 0.3499999 0.49497473 0.2474873 0.6062178 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062177 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062178 0.20206255 0.41832995 0.34999996 0.4949747 0.24748735 0.60621774 0.20206249 0.41833 0.34999996 0.49497467 0.24748737 0.60621774 0.20206249 0.41833 0.34999996 0.49497473 0.24748737 0.60621774 0.20206249 0.41833 0.34999993 0.49497473 0.24748737 0.60621774 0.20206249 0.41832998 0.34999996 0.49497467 0.24748737 0.60621774 0.20206246 0.41833 0.35 0.49497473 0.24748734 0.60621774 0.20206249 0.41833004 0.35 0.49497473 0.2474873 0.60621774 0.20206249 0.41833004 0.3499999 0.49497473 0.2474873 0.6062178 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062177 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062178 0.20206255 0.41832995 0.34999996 0.4949747 0.24748735 0.60621774 0.20206249 0.41833 0.34999996 0.49497467 0.24748737 0.60621774 0.20206249 0.41833 0.34999996 0.49497473 0.24748737 0.60621774 0.20206249 0.41833 0.34999993 0.49497473 0.24748737 0.60621774 0.20206249 0.41832998 0.34999996 0.49497467 0.24748737 0.60621774 0.20206246 0.41833 0.35 0.49497473 0.24748734 0.60621774 0.20206249 0.41833004 0.35 0.49497473 0.2474873 0.60621774 0.20206249 0.41833004 0.3499999 0.49497473 0.2474873 0.6062178 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062177 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062178 0.20206255 0.41832995 0.34999996 0.4949747 0.24748735 0.60621774 0.20206249 0.41833 0.34999996 0.49497467 0.24748737 0.60621774 0.20206249 0.41833 0.34999996 0.49497473 0.24748737 0.60621774 0.20206249 0.41833 0.34999993 0.49497473 0.24748737 0.60621774 0.20206249 0.41832998 0.34999996 0.49497467 0.24748737 0.60621774 0.20206246 0.41833 0.35 0.49497473 0.24748734 0.60621774 0.20206249 0.41833004 0.35 0.49497473 0.2474873 0.60621774 0.20206249 0.41833004 0.3499999 0.49497473 0.2474873 0.6062178 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062177 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062178 0.20206255 0.41832995 0.34999996 0.4949747 0.24748735 0.60621774 0.20206249 0.41833 0.34999996 0.49497467 0.24748737 0.60621774 0.20206249 0.41833 0.34999996 0.49497473 0.24748737 0.60621774 0.20206249 0.41833 0.34999993 0.49497473 0.24748737 0.60621774 0.20206249 0.41832998 0.34999996 0.49497467 0.24748737 0.60621774 0.20206246 0.41833 0.35 0.49497473 0.24748734 0.60621774 0.20206249 0.41833004 0.35 0.49497473 0.2474873 0.60621774 0.20206249 0.41833004 0.3499999 0.49497473 0.2474873 0.6062178 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062177 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062178 0.20206255 0.41832995 0.34999996 0.4949747 0.24748735 0.60621774 0.20206249 0.41833 0.34999996 0.49497467 0.24748737 0.60621774 0.20206249 0.41833 0.34999996 0.49497473 0.24748737 0.60621774 0.20206249 0.41833 0.34999993 0.49497473 0.24748737 0.60621774 0.20206249 0.41832998 0.34999996 0.49497467 0.24748737 0.60621774 0.20206246 0.41833 0.35 0.49497473 0.24748734 0.60621774 0.20206249 0.41833004 0.35 0.49497473 0.2474873 0.60621774 0.20206249 0.41833004 0.3499999 0.49497473 0.2474873 0.6062178 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062177 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062178 0.20206255 0.41832995 0.34999996 0.4949747 0.24748735 0.60621774 0.20206249 0.41833 0.34999996 0.49497467 0.24748737 0.60621774 0.20206249 0.41833 0.34999996 0.49497473 0.24748737 0.60621774 0.20206249 0.41833 0.34999993 0.49497473 0.24748737 0.60621774 0.20206249 0.41832998 0.34999996 0.49497467 0.24748737 0.60621774 0.20206246 0.41833 0.35 0.49497473 0.24748734 0.60621774 0.20206249 0.41833004 0.35 0.49497473 0.2474873 0.60621774 0.20206249 0.41833004 0.3499999 0.49497473 0.2474873 0.6062178 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062177 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062178 0.20206255 0.41832995 0.34999996 0.4949747 0.24748735 0.60621774 0.20206249 0.41833 0.34999996 0.49497467 0.24748737 0.60621774 0.20206249 0.41833 0.34999996 0.49497473 0.24748737 0.60621774 0.20206249 0.41833 0.34999993 0.49497473 0.24748737 0.60621774 0.20206249 0.41832998 0.34999996 0.49497467 0.24748737 0.60621774 0.20206246 0.41833 0.35 0.49497473 0.24748734 0.60621774 0.20206249 0.41833004 0.35 0.49497473 0.2474873 0.60621774 0.20206249 0.41833004 0.3499999 0.49497473 0.2474873 0.6062178 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062177 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062178 0.20206255 0.41832995 0.34999996 0.4949747 0.24748735 0.60621774 0.20206249 0.41833 0.34999996 0.49497467 0.24748737 0.60621774 0.20206249 0.41833 0.34999996 0.49497473 0.24748737 0.60621774 0.20206249 0.41833 0.34999993 0.49497473 0.24748737 0.60621774 0.20206249 0.41832998 0.34999996 0.49497467 0.24748737 0.60621774 0.20206246 0.41833 0.35 0.49497473 0.24748734 0.60621774 0.20206249 0.41833004 0.35 0.49497473 0.2474873 0.60621774 0.20206249 0.41833004 0.3499999 0.49497473 0.2474873 0.6062178 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062177 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062178 0.20206255 0.41832995 0.49999997 0.7071067 0.35355335 0.8660254 0.2886607 0.57008773 0.5 0.7071067 0.35355335 0.8660253 0.2886607 0.5700878 0.5 0.7071067 0.35355332 0.86602545 0.28866073 0.5700877 0.5 0.70710665 0.3535533 0.86602545 0.28866076 0.5700877 0.49999994 0.7071067 0.3535534 0.8660253 0.28866065 0.5700878 0.49999997 0.7071067 0.35355335 0.8660254 0.2886607 0.57008773 0.5 0.7071067 0.35355335 0.8660253 0.2886607 0.5700878 0.5 0.7071067 0.35355332 0.86602545 0.28866073 0.5700877 0.5 0.70710665 0.3535533 0.86602545 0.28866076 0.5700877 0.49999994 0.7071067 0.3535534 0.8660253 0.28866065 0.5700878 0.49999997 0.7071067 0.35355335 0.8660254 0.2886607 0.57008773 0.5 0.7071067 0.35355335 0.8660253 0.2886607 0.5700878 0.5 0.7071067 0.35355332 0.86602545 0.28866073 0.5700877 0.5 0.70710665 0.3535533 0.86602545 0.28866076 0.5700877 0.49999994 0.7071067 0.3535534 0.8660253 0.28866065 0.5700878 0.49999997 0.7071067 0.35355335 0.8660254 0.2886607 0.57008773 0.5 0.7071067 0.35355335 0.8660253 0.2886607 0.5700878 0.5 0.7071067 0.35355332 0.86602545 0.28866073 0.5700877 0.5 0.70710665 0.3535533 0.86602545 0.28866076 0.5700877 0.49999994 0.7071067 0.3535534 0.8660253 0.28866065 0.5700878 0.49999997 0.7071067 0.35355335 0.8660254 0.2886607 0.57008773 0.5 0.7071067 0.35355335 0.8660253 0.2886607 0.5700878 0.5 0.7071067 0.35355332 0.86602545 0.28866073 0.5700877 0.5 0.70710665 0.3535533 0.86602545 0.28866076 0.5700877 0.49999994 0.7071067 0.3535534 0.8660253 0.28866065 0.5700878 0.6499999 0.9192387 0.45961934 1.1258329 0.3752589 0.7211102 0.64999986 0.9192387 0.4596193 1.125833 0.37525892 0.7211102 0.64999986 0.91923875 0.45961928 1.1258328 0.37525892 0.72111017 0.6499999 0.9192387 0.45961934 1.1258329 0.3752589 0.7211102 0.64999986 0.9192387 0.4596193 1.125833 0.37525892 0.7211102 0.64999986 0.91923875 0.45961928 1.1258328 0.37525892 0.72111017 0.6499999 0.9192387 0.45961934 1.1258329 0.3752589 0.7211102 0.64999986 0.9192387 0.4596193 1.125833 0.37525892 0.7211102 0.64999986 0.91923875 0.45961928 1.1258328 0.37525892 0.72111017 0.79999995 1.1313708 0.5656854 1.3856406 0.46185714 0.8717798 0.79999995 1.1313708 0.56568533 1.3856406 0.46185708 0.87177986 0.79999995 1.1313708 0.5656854 1.3856406 0.46185714 0.8717798 0.79999995 1.1313708 0.56568533 1.3856406 0.46185708 0.87177986 0.9499999 1.3435028 0.6717514 1.6454482 0.54845536 0.97467947 
+
diff --git a/tensorflow/contrib/lite/examples/android/assets/coco_labels_list.txt b/tensorflow/contrib/lite/examples/android/assets/coco_labels_list.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5a70ff82aa7b0fa7315ca591820e4cf7d2f5ad18
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/assets/coco_labels_list.txt
@@ -0,0 +1,91 @@
+???
+person
+bicycle
+car
+motorcycle
+airplane
+bus
+train
+truck
+boat
+traffic light
+fire hydrant
+???
+stop sign
+parking meter
+bench
+bird
+cat
+dog
+horse
+sheep
+cow
+elephant
+bear
+zebra
+giraffe
+???
+backpack
+umbrella
+???
+???
+handbag
+tie
+suitcase
+frisbee
+skis
+snowboard
+sports ball
+kite
+baseball bat
+baseball glove
+skateboard
+surfboard
+tennis racket
+bottle
+???
+wine glass
+cup
+fork
+knife
+spoon
+bowl
+banana
+apple
+sandwich
+orange
+broccoli
+carrot
+hot dog
+pizza
+donut
+cake
+chair
+couch
+potted plant
+bed
+???
+dining table
+???
+???
+toilet
+???
+tv
+laptop
+mouse
+remote
+keyboard
+cell phone
+microwave
+oven
+toaster
+sink
+refrigerator
+???
+book
+clock
+vase
+scissors
+teddy bear
+hair drier
+toothbrush
diff --git a/tensorflow/contrib/lite/examples/android/assets/conv_actions_labels.txt b/tensorflow/contrib/lite/examples/android/assets/conv_actions_labels.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ba416458b011a7f4b96739eb6fcb6275a6ab3bec
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/assets/conv_actions_labels.txt
@@ -0,0 +1,12 @@
+_silence_
+_unknown_
+yes
+no
+up
+down
+left
+right
+on
+off
+stop
+go
\ No newline at end of file
diff --git a/tensorflow/contrib/lite/examples/android/assets/labels_mobilenet_quant_v1_224.txt b/tensorflow/contrib/lite/examples/android/assets/labels_mobilenet_quant_v1_224.txt
new file mode 100644
index 0000000000000000000000000000000000000000..fe811239d8e2989de19fecabb1ebb0c9dddac514
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/assets/labels_mobilenet_quant_v1_224.txt
@@ -0,0 +1,1001 @@
+background
+tench
+goldfish
+great white shark
+tiger shark
+hammerhead
+electric ray
+stingray
+cock
+hen
+ostrich
+brambling
+goldfinch
+house finch
+junco
+indigo bunting
+robin
+bulbul
+jay
+magpie
+chickadee
+water ouzel
+kite
+bald eagle
+vulture
+great grey owl
+European fire salamander
+common newt
+eft
+spotted salamander
+axolotl
+bullfrog
+tree frog
+tailed frog
+loggerhead
+leatherback turtle
+mud turtle
+terrapin
+box turtle
+banded gecko
+common iguana
+American chameleon
+whiptail
+agama
+frilled lizard
+alligator lizard
+Gila monster
+green lizard
+African chameleon
+Komodo dragon
+African crocodile
+American alligator
+triceratops
+thunder snake
+ringneck snake
+hognose snake
+green snake
+king snake
+garter snake
+water snake
+vine snake
+night snake
+boa constrictor
+rock python
+Indian cobra
+green mamba
+sea snake
+horned viper
+diamondback
+sidewinder
+trilobite
+harvestman
+scorpion
+black and gold garden spider
+barn spider
+garden spider
+black widow
+tarantula
+wolf spider
+tick
+centipede
+black grouse
+ptarmigan
+ruffed grouse
+prairie chicken
+peacock
+quail
+partridge
+African grey
+macaw
+sulphur-crested cockatoo
+lorikeet
+coucal
+bee eater
+hornbill
+hummingbird
+jacamar
+toucan
+drake
+red-breasted merganser
+goose
+black swan
+tusker
+echidna
+platypus
+wallaby
+koala
+wombat
+jellyfish
+sea anemone
+brain coral
+flatworm
+nematode
+conch
+snail
+slug
+sea slug
+chiton
+chambered nautilus
+Dungeness crab
+rock crab
+fiddler crab
+king crab
+American lobster
+spiny lobster
+crayfish
+hermit crab
+isopod
+white stork
+black stork
+spoonbill
+flamingo
+little blue heron
+American egret
+bittern
+crane
+limpkin
+European gallinule
+American coot
+bustard
+ruddy turnstone
+red-backed sandpiper
+redshank
+dowitcher
+oystercatcher
+pelican
+king penguin
+albatross
+grey whale
+killer whale
+dugong
+sea lion
+Chihuahua
+Japanese spaniel
+Maltese dog
+Pekinese
+Shih-Tzu
+Blenheim spaniel
+papillon
+toy terrier
+Rhodesian ridgeback
+Afghan hound
+basset
+beagle
+bloodhound
+bluetick
+black-and-tan coonhound
+Walker hound
+English foxhound
+redbone
+borzoi
+Irish wolfhound
+Italian greyhound
+whippet
+Ibizan hound
+Norwegian elkhound
+otterhound
+Saluki
+Scottish deerhound
+Weimaraner
+Staffordshire bullterrier
+American Staffordshire terrier
+Bedlington terrier
+Border terrier
+Kerry blue terrier
+Irish terrier
+Norfolk terrier
+Norwich terrier
+Yorkshire terrier
+wire-haired fox terrier
+Lakeland terrier
+Sealyham terrier
+Airedale
+cairn
+Australian terrier
+Dandie Dinmont
+Boston bull
+miniature schnauzer
+giant schnauzer
+standard schnauzer
+Scotch terrier
+Tibetan terrier
+silky terrier
+soft-coated wheaten terrier
+West Highland white terrier
+Lhasa
+flat-coated retriever
+curly-coated retriever
+golden retriever
+Labrador retriever
+Chesapeake Bay retriever
+German short-haired pointer
+vizsla
+English setter
+Irish setter
+Gordon setter
+Brittany spaniel
+clumber
+English springer
+Welsh springer spaniel
+cocker spaniel
+Sussex spaniel
+Irish water spaniel
+kuvasz
+schipperke
+groenendael
+malinois
+briard
+kelpie
+komondor
+Old English sheepdog
+Shetland sheepdog
+collie
+Border collie
+Bouvier des Flandres
+Rottweiler
+German shepherd
+Doberman
+miniature pinscher
+Greater Swiss Mountain dog
+Bernese mountain dog
+Appenzeller
+EntleBucher
+boxer
+bull mastiff
+Tibetan mastiff
+French bulldog
+Great Dane
+Saint Bernard
+Eskimo dog
+malamute
+Siberian husky
+dalmatian
+affenpinscher
+basenji
+pug
+Leonberg
+Newfoundland
+Great Pyrenees
+Samoyed
+Pomeranian
+chow
+keeshond
+Brabancon griffon
+Pembroke
+Cardigan
+toy poodle
+miniature poodle
+standard poodle
+Mexican hairless
+timber wolf
+white wolf
+red wolf
+coyote
+dingo
+dhole
+African hunting dog
+hyena
+red fox
+kit fox
+Arctic fox
+grey fox
+tabby
+tiger cat
+Persian cat
+Siamese cat
+Egyptian cat
+cougar
+lynx
+leopard
+snow leopard
+jaguar
+lion
+tiger
+cheetah
+brown bear
+American black bear
+ice bear
+sloth bear
+mongoose
+meerkat
+tiger beetle
+ladybug
+ground beetle
+long-horned beetle
+leaf beetle
+dung beetle
+rhinoceros beetle
+weevil
+fly
+bee
+ant
+grasshopper
+cricket
+walking stick
+cockroach
+mantis
+cicada
+leafhopper
+lacewing
+dragonfly
+damselfly
+admiral
+ringlet
+monarch
+cabbage butterfly
+sulphur butterfly
+lycaenid
+starfish
+sea urchin
+sea cucumber
+wood rabbit
+hare
+Angora
+hamster
+porcupine
+fox squirrel
+marmot
+beaver
+guinea pig
+sorrel
+zebra
+hog
+wild boar
+warthog
+hippopotamus
+ox
+water buffalo
+bison
+ram
+bighorn
+ibex
+hartebeest
+impala
+gazelle
+Arabian camel
+llama
+weasel
+mink
+polecat
+black-footed ferret
+otter
+skunk
+badger
+armadillo
+three-toed sloth
+orangutan
+gorilla
+chimpanzee
+gibbon
+siamang
+guenon
+patas
+baboon
+macaque
+langur
+colobus
+proboscis monkey
+marmoset
+capuchin
+howler monkey
+titi
+spider monkey
+squirrel monkey
+Madagascar cat
+indri
+Indian elephant
+African elephant
+lesser panda
+giant panda
+barracouta
+eel
+coho
+rock beauty
+anemone fish
+sturgeon
+gar
+lionfish
+puffer
+abacus
+abaya
+academic gown
+accordion
+acoustic guitar
+aircraft carrier
+airliner
+airship
+altar
+ambulance
+amphibian
+analog clock
+apiary
+apron
+ashcan
+assault rifle
+backpack
+bakery
+balance beam
+balloon
+ballpoint
+Band Aid
+banjo
+bannister
+barbell
+barber chair
+barbershop
+barn
+barometer
+barrel
+barrow
+baseball
+basketball
+bassinet
+bassoon
+bathing cap
+bath towel
+bathtub
+beach wagon
+beacon
+beaker
+bearskin
+beer bottle
+beer glass
+bell cote
+bib
+bicycle-built-for-two
+bikini
+binder
+binoculars
+birdhouse
+boathouse
+bobsled
+bolo tie
+bonnet
+bookcase
+bookshop
+bottlecap
+bow
+bow tie
+brass
+brassiere
+breakwater
+breastplate
+broom
+bucket
+buckle
+bulletproof vest
+bullet train
+butcher shop
+cab
+caldron
+candle
+cannon
+canoe
+can opener
+cardigan
+car mirror
+carousel
+carpenter's kit
+carton
+car wheel
+cash machine
+cassette
+cassette player
+castle
+catamaran
+CD player
+cello
+cellular telephone
+chain
+chainlink fence
+chain mail
+chain saw
+chest
+chiffonier
+chime
+china cabinet
+Christmas stocking
+church
+cinema
+cleaver
+cliff dwelling
+cloak
+clog
+cocktail shaker
+coffee mug
+coffeepot
+coil
+combination lock
+computer keyboard
+confectionery
+container ship
+convertible
+corkscrew
+cornet
+cowboy boot
+cowboy hat
+cradle
+crane
+crash helmet
+crate
+crib
+Crock Pot
+croquet ball
+crutch
+cuirass
+dam
+desk
+desktop computer
+dial telephone
+diaper
+digital clock
+digital watch
+dining table
+dishrag
+dishwasher
+disk brake
+dock
+dogsled
+dome
+doormat
+drilling platform
+drum
+drumstick
+dumbbell
+Dutch oven
+electric fan
+electric guitar
+electric locomotive
+entertainment center
+envelope
+espresso maker
+face powder
+feather boa
+file
+fireboat
+fire engine
+fire screen
+flagpole
+flute
+folding chair
+football helmet
+forklift
+fountain
+fountain pen
+four-poster
+freight car
+French horn
+frying pan
+fur coat
+garbage truck
+gasmask
+gas pump
+goblet
+go-kart
+golf ball
+golfcart
+gondola
+gong
+gown
+grand piano
+greenhouse
+grille
+grocery store
+guillotine
+hair slide
+hair spray
+half track
+hammer
+hamper
+hand blower
+hand-held computer
+handkerchief
+hard disc
+harmonica
+harp
+harvester
+hatchet
+holster
+home theater
+honeycomb
+hook
+hoopskirt
+horizontal bar
+horse cart
+hourglass
+iPod
+iron
+jack-o'-lantern
+jean
+jeep
+jersey
+jigsaw puzzle
+jinrikisha
+joystick
+kimono
+knee pad
+knot
+lab coat
+ladle
+lampshade
+laptop
+lawn mower
+lens cap
+letter opener
+library
+lifeboat
+lighter
+limousine
+liner
+lipstick
+Loafer
+lotion
+loudspeaker
+loupe
+lumbermill
+magnetic compass
+mailbag
+mailbox
+maillot
+maillot
+manhole cover
+maraca
+marimba
+mask
+matchstick
+maypole
+maze
+measuring cup
+medicine chest
+megalith
+microphone
+microwave
+military uniform
+milk can
+minibus
+miniskirt
+minivan
+missile
+mitten
+mixing bowl
+mobile home
+Model T
+modem
+monastery
+monitor
+moped
+mortar
+mortarboard
+mosque
+mosquito net
+motor scooter
+mountain bike
+mountain tent
+mouse
+mousetrap
+moving van
+muzzle
+nail
+neck brace
+necklace
+nipple
+notebook
+obelisk
+oboe
+ocarina
+odometer
+oil filter
+organ
+oscilloscope
+overskirt
+oxcart
+oxygen mask
+packet
+paddle
+paddlewheel
+padlock
+paintbrush
+pajama
+palace
+panpipe
+paper towel
+parachute
+parallel bars
+park bench
+parking meter
+passenger car
+patio
+pay-phone
+pedestal
+pencil box
+pencil sharpener
+perfume
+Petri dish
+photocopier
+pick
+pickelhaube
+picket fence
+pickup
+pier
+piggy bank
+pill bottle
+pillow
+ping-pong ball
+pinwheel
+pirate
+pitcher
+plane
+planetarium
+plastic bag
+plate rack
+plow
+plunger
+Polaroid camera
+pole
+police van
+poncho
+pool table
+pop bottle
+pot
+potter's wheel
+power drill
+prayer rug
+printer
+prison
+projectile
+projector
+puck
+punching bag
+purse
+quill
+quilt
+racer
+racket
+radiator
+radio
+radio telescope
+rain barrel
+recreational vehicle
+reel
+reflex camera
+refrigerator
+remote control
+restaurant
+revolver
+rifle
+rocking chair
+rotisserie
+rubber eraser
+rugby ball
+rule
+running shoe
+safe
+safety pin
+saltshaker
+sandal
+sarong
+sax
+scabbard
+scale
+school bus
+schooner
+scoreboard
+screen
+screw
+screwdriver
+seat belt
+sewing machine
+shield
+shoe shop
+shoji
+shopping basket
+shopping cart
+shovel
+shower cap
+shower curtain
+ski
+ski mask
+sleeping bag
+slide rule
+sliding door
+slot
+snorkel
+snowmobile
+snowplow
+soap dispenser
+soccer ball
+sock
+solar dish
+sombrero
+soup bowl
+space bar
+space heater
+space shuttle
+spatula
+speedboat
+spider web
+spindle
+sports car
+spotlight
+stage
+steam locomotive
+steel arch bridge
+steel drum
+stethoscope
+stole
+stone wall
+stopwatch
+stove
+strainer
+streetcar
+stretcher
+studio couch
+stupa
+submarine
+suit
+sundial
+sunglass
+sunglasses
+sunscreen
+suspension bridge
+swab
+sweatshirt
+swimming trunks
+swing
+switch
+syringe
+table lamp
+tank
+tape player
+teapot
+teddy
+television
+tennis ball
+thatch
+theater curtain
+thimble
+thresher
+throne
+tile roof
+toaster
+tobacco shop
+toilet seat
+torch
+totem pole
+tow truck
+toyshop
+tractor
+trailer truck
+tray
+trench coat
+tricycle
+trimaran
+tripod
+triumphal arch
+trolleybus
+trombone
+tub
+turnstile
+typewriter keyboard
+umbrella
+unicycle
+upright
+vacuum
+vase
+vault
+velvet
+vending machine
+vestment
+viaduct
+violin
+volleyball
+waffle iron
+wall clock
+wallet
+wardrobe
+warplane
+washbasin
+washer
+water bottle
+water jug
+water tower
+whiskey jug
+whistle
+wig
+window screen
+window shade
+Windsor tie
+wine bottle
+wing
+wok
+wooden spoon
+wool
+worm fence
+wreck
+yawl
+yurt
+web site
+comic book
+crossword puzzle
+street sign
+traffic light
+book jacket
+menu
+plate
+guacamole
+consomme
+hot pot
+trifle
+ice cream
+ice lolly
+French loaf
+bagel
+pretzel
+cheeseburger
+hotdog
+mashed potato
+head cabbage
+broccoli
+cauliflower
+zucchini
+spaghetti squash
+acorn squash
+butternut squash
+cucumber
+artichoke
+bell pepper
+cardoon
+mushroom
+Granny Smith
+strawberry
+orange
+lemon
+fig
+pineapple
+banana
+jackfruit
+custard apple
+pomegranate
+hay
+carbonara
+chocolate sauce
+dough
+meat loaf
+pizza
+potpie
+burrito
+red wine
+espresso
+cup
+eggnog
+alp
+bubble
+cliff
+coral reef
+geyser
+lakeside
+promontory
+sandbar
+seashore
+valley
+volcano
+ballplayer
+groom
+scuba diver
+rapeseed
+daisy
+yellow lady's slipper
+corn
+acorn
+hip
+buckeye
+coral fungus
+agaric
+gyromitra
+stinkhorn
+earthstar
+hen-of-the-woods
+bolete
+ear
+toilet tissue
diff --git a/tensorflow/contrib/lite/examples/android/build.gradle b/tensorflow/contrib/lite/examples/android/build.gradle
new file mode 100644
index 0000000000000000000000000000000000000000..0d4de358156a5d139e35cc542b8d36ab24e763b9
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/build.gradle
@@ -0,0 +1,52 @@
+apply plugin: 'com.android.application'
+
+android {
+    compileSdkVersion 26
+    buildToolsVersion "26.0.1"
+    defaultConfig {
+        applicationId "org.tensorflow.lite.demo"
+        minSdkVersion 15
+        targetSdkVersion 26
+        versionCode 1
+        versionName "1.0"
+        testInstrumentationRunner "android.support.test.runner.AndroidJUnitRunner"
+
+        // Remove this block.
+        jackOptions {
+            enabled true
+        }
+    }
+    lintOptions {
+        abortOnError false
+    }
+    buildTypes {
+        release {
+            minifyEnabled false
+            proguardFiles getDefaultProguardFile('proguard-android.txt'), 'proguard-rules.pro'
+        }
+    }
+    aaptOptions {
+        noCompress "tflite"
+    }
+
+    compileOptions {
+        sourceCompatibility JavaVersion.VERSION_1_8
+        targetCompatibility JavaVersion.VERSION_1_8
+    }
+}
+
+repositories {
+    maven {
+        url 'https://google.bintray.com/tensorflow'
+    }
+}
+
+dependencies {
+    compile fileTree(dir: 'libs', include: ['*.jar'])
+    androidTestCompile('com.android.support.test.espresso:espresso-core:2.2.2', {
+        exclude group: 'com.android.support', module: 'support-annotations'
+    })
+    compile 'org.tensorflow:tensorflow-lite:+'
+
+    testCompile 'junit:junit:4.12'
+}
diff --git a/tensorflow/contrib/lite/examples/android/res/animator/color_animation.xml b/tensorflow/contrib/lite/examples/android/res/animator/color_animation.xml
new file mode 100644
index 0000000000000000000000000000000000000000..891d8cc1d4f3e59d0371030fd763c5ad468e7887
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/animator/color_animation.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+ Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<set xmlns:android="http://schemas.android.com/apk/res/android"
+  android:ordering="sequentially">
+  <objectAnimator
+    android:propertyName="backgroundColor"
+    android:duration="375"
+    android:valueFrom="0x00b3ccff"
+    android:valueTo="0xffb3ccff"
+    android:valueType="colorType"/>
+  <objectAnimator
+    android:propertyName="backgroundColor"
+    android:duration="375"
+    android:valueFrom="0xffb3ccff"
+    android:valueTo="0x00b3ccff"
+    android:valueType="colorType"/>
+</set>
diff --git a/tensorflow/contrib/lite/examples/android/res/drawable-hdpi/ic_action_info.png b/tensorflow/contrib/lite/examples/android/res/drawable-hdpi/ic_action_info.png
new file mode 100644
index 0000000000000000000000000000000000000000..32bd1aabcabb85ded957230533c00e735183a323
Binary files /dev/null and b/tensorflow/contrib/lite/examples/android/res/drawable-hdpi/ic_action_info.png differ
diff --git a/tensorflow/contrib/lite/examples/android/res/drawable-hdpi/ic_launcher.png b/tensorflow/contrib/lite/examples/android/res/drawable-hdpi/ic_launcher.png
new file mode 100644
index 0000000000000000000000000000000000000000..b3113cd15c3255405ee34c622a1e83674e6e5487
Binary files /dev/null and b/tensorflow/contrib/lite/examples/android/res/drawable-hdpi/ic_launcher.png differ
diff --git a/tensorflow/contrib/lite/examples/android/res/drawable-hdpi/tile.9.png b/tensorflow/contrib/lite/examples/android/res/drawable-hdpi/tile.9.png
new file mode 100644
index 0000000000000000000000000000000000000000..135862883e26eddce2b19db021adf62e10357ad0
Binary files /dev/null and b/tensorflow/contrib/lite/examples/android/res/drawable-hdpi/tile.9.png differ
diff --git a/tensorflow/contrib/lite/examples/android/res/drawable-mdpi/ic_action_info.png b/tensorflow/contrib/lite/examples/android/res/drawable-mdpi/ic_action_info.png
new file mode 100644
index 0000000000000000000000000000000000000000..8efbbf8b3c44418551699db9388cd77a88362112
Binary files /dev/null and b/tensorflow/contrib/lite/examples/android/res/drawable-mdpi/ic_action_info.png differ
diff --git a/tensorflow/contrib/lite/examples/android/res/drawable-mdpi/ic_launcher.png b/tensorflow/contrib/lite/examples/android/res/drawable-mdpi/ic_launcher.png
new file mode 100644
index 0000000000000000000000000000000000000000..51f87ee6507cebec6bff32b1a03b36ffc711689d
Binary files /dev/null and b/tensorflow/contrib/lite/examples/android/res/drawable-mdpi/ic_launcher.png differ
diff --git a/tensorflow/contrib/lite/examples/android/res/drawable-xhdpi/ic_action_info.png b/tensorflow/contrib/lite/examples/android/res/drawable-xhdpi/ic_action_info.png
new file mode 100644
index 0000000000000000000000000000000000000000..ba143ea7a80f03b0e850775ad672ccb2d6195e4c
Binary files /dev/null and b/tensorflow/contrib/lite/examples/android/res/drawable-xhdpi/ic_action_info.png differ
diff --git a/tensorflow/contrib/lite/examples/android/res/drawable-xhdpi/ic_launcher.png b/tensorflow/contrib/lite/examples/android/res/drawable-xhdpi/ic_launcher.png
new file mode 100644
index 0000000000000000000000000000000000000000..6361d792dacd8ce09a14258878b5ce6db5e0debb
Binary files /dev/null and b/tensorflow/contrib/lite/examples/android/res/drawable-xhdpi/ic_launcher.png differ
diff --git a/tensorflow/contrib/lite/examples/android/res/drawable-xxhdpi/ic_action_info.png b/tensorflow/contrib/lite/examples/android/res/drawable-xxhdpi/ic_action_info.png
new file mode 100644
index 0000000000000000000000000000000000000000..394eb7e534905e36fd24c3defac92c09b403ee39
Binary files /dev/null and b/tensorflow/contrib/lite/examples/android/res/drawable-xxhdpi/ic_action_info.png differ
diff --git a/tensorflow/contrib/lite/examples/android/res/drawable-xxhdpi/ic_launcher.png b/tensorflow/contrib/lite/examples/android/res/drawable-xxhdpi/ic_launcher.png
new file mode 100644
index 0000000000000000000000000000000000000000..2e27bec9785d4d51fe597bced7f04508994aa10c
Binary files /dev/null and b/tensorflow/contrib/lite/examples/android/res/drawable-xxhdpi/ic_launcher.png differ
diff --git a/tensorflow/contrib/lite/examples/android/res/drawable/border.xml b/tensorflow/contrib/lite/examples/android/res/drawable/border.xml
new file mode 100644
index 0000000000000000000000000000000000000000..dd1d64d1d61f359422c79533f726991c78e47d99
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/drawable/border.xml
@@ -0,0 +1,19 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+ Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<shape xmlns:android="http://schemas.android.com/apk/res/android" android:shape="rectangle" >
+  <solid android:color="#00000000" />
+  <stroke android:width="1dip" android:color="#cccccc" />
+</shape>
diff --git a/tensorflow/contrib/lite/examples/android/res/layout/activity_camera.xml b/tensorflow/contrib/lite/examples/android/res/layout/activity_camera.xml
new file mode 100644
index 0000000000000000000000000000000000000000..1a22d4b33ebbd755104272863c5cc6c93793b86b
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/layout/activity_camera.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+ Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<FrameLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:tools="http://schemas.android.com/tools"
+    android:id="@+id/container"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent"
+    android:background="#000"
+    tools:context="org.tensorflow.demo.CameraActivity" />
diff --git a/tensorflow/contrib/lite/examples/android/res/layout/activity_speech.xml b/tensorflow/contrib/lite/examples/android/res/layout/activity_speech.xml
new file mode 100644
index 0000000000000000000000000000000000000000..2fe1338da57122c7e26c64c653076b6746a25497
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/layout/activity_speech.xml
@@ -0,0 +1,55 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+ Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<FrameLayout
+    xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:app="http://schemas.android.com/apk/res-auto"
+    xmlns:tools="http://schemas.android.com/tools"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent"
+    tools:context="org.tensorflow.demo.SpeechActivity">
+
+    <TextView
+        android:layout_width="wrap_content"
+        android:layout_height="wrap_content"
+        android:text="Say one of the words below!"
+        android:id="@+id/textView"
+        android:textAlignment="center"
+        android:layout_gravity="top"
+        android:textSize="24dp"
+        android:layout_marginTop="10dp"
+        android:layout_marginLeft="10dp"
+        />
+
+    <ListView
+        android:id="@+id/list_view"
+        android:layout_width="240dp"
+        android:layout_height="wrap_content"
+        android:background="@drawable/border"
+        android:layout_gravity="top|center_horizontal"
+        android:textAlignment="center"
+        android:layout_marginTop="100dp"
+        />
+
+    <Button
+        android:id="@+id/quit"
+        android:layout_width="wrap_content"
+        android:layout_height="wrap_content"
+        android:text="Quit"
+        android:layout_gravity="bottom|center_horizontal"
+        android:layout_marginBottom="10dp"
+        />
+
+</FrameLayout>
diff --git a/tensorflow/contrib/lite/examples/android/res/layout/camera_connection_fragment.xml b/tensorflow/contrib/lite/examples/android/res/layout/camera_connection_fragment.xml
new file mode 100644
index 0000000000000000000000000000000000000000..a1bbdf1702cea79088715d30b8746f7fc8fdac56
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/layout/camera_connection_fragment.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+ Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<RelativeLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent">
+
+    <org.tensorflow.demo.AutoFitTextureView
+        android:id="@+id/texture"
+        android:layout_width="wrap_content"
+        android:layout_height="wrap_content"
+        android:layout_alignParentBottom="true" />
+
+    <org.tensorflow.demo.RecognitionScoreView
+        android:id="@+id/results"
+        android:layout_width="match_parent"
+        android:layout_height="112dp"
+        android:layout_alignParentTop="true" />
+
+    <org.tensorflow.demo.OverlayView
+        android:id="@+id/debug_overlay"
+        android:layout_width="match_parent"
+        android:layout_height="match_parent"
+        android:layout_alignParentBottom="true" />
+
+</RelativeLayout>
diff --git a/tensorflow/contrib/lite/examples/android/res/layout/camera_connection_fragment_stylize.xml b/tensorflow/contrib/lite/examples/android/res/layout/camera_connection_fragment_stylize.xml
new file mode 100644
index 0000000000000000000000000000000000000000..1cdb24cab03222934ca2aa326a765150d58aa6a8
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/layout/camera_connection_fragment_stylize.xml
@@ -0,0 +1,51 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+ Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<RelativeLayout xmlns:android="http://schemas.android.com/apk/res/android"
+      android:orientation="vertical"
+      android:layout_width="match_parent"
+      android:layout_height="match_parent">
+  <org.tensorflow.demo.AutoFitTextureView
+    android:id="@+id/texture"
+    android:layout_width="wrap_content"
+    android:layout_height="wrap_content"
+    android:layout_alignParentTop="true" />
+
+  <RelativeLayout
+    android:id="@+id/black"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent"
+    android:background="#FF000000" />
+
+  <GridView
+    android:id="@+id/grid_layout"
+    android:numColumns="7"
+    android:stretchMode="columnWidth"
+    android:layout_alignParentBottom="true"
+    android:layout_width="match_parent"
+    android:layout_height="wrap_content" />
+
+  <org.tensorflow.demo.OverlayView
+      android:id="@+id/overlay"
+      android:layout_width="match_parent"
+      android:layout_height="match_parent"
+      android:layout_alignParentTop="true" />
+
+  <org.tensorflow.demo.OverlayView
+    android:id="@+id/debug_overlay"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent"
+    android:layout_alignParentTop="true" />
+</RelativeLayout>
diff --git a/tensorflow/contrib/lite/examples/android/res/layout/camera_connection_fragment_tracking.xml b/tensorflow/contrib/lite/examples/android/res/layout/camera_connection_fragment_tracking.xml
new file mode 100644
index 0000000000000000000000000000000000000000..ca18ea075dbb65d93bc895bc33211a171e52d62e
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/layout/camera_connection_fragment_tracking.xml
@@ -0,0 +1,34 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+ Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<FrameLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent">
+
+      <org.tensorflow.demo.AutoFitTextureView
+          android:id="@+id/texture"
+          android:layout_width="wrap_content"
+          android:layout_height="wrap_content"/>
+
+      <org.tensorflow.demo.OverlayView
+          android:id="@+id/tracking_overlay"
+          android:layout_width="match_parent"
+          android:layout_height="match_parent"/>
+
+      <org.tensorflow.demo.OverlayView
+          android:id="@+id/debug_overlay"
+          android:layout_width="match_parent"
+          android:layout_height="match_parent"/>
+</FrameLayout>
diff --git a/tensorflow/contrib/lite/examples/android/res/layout/list_text_item.xml b/tensorflow/contrib/lite/examples/android/res/layout/list_text_item.xml
new file mode 100644
index 0000000000000000000000000000000000000000..526017fbb24ecfa6765a21378f1ae0890a97a004
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/layout/list_text_item.xml
@@ -0,0 +1,25 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+ Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<TextView
+    xmlns:android="http://schemas.android.com/apk/res/android"
+    android:id="@+id/list_text_item"
+    android:layout_width="match_parent"
+    android:layout_height="wrap_content"
+    android:text="TextView"
+    android:textSize="24dp"
+    android:textAlignment="center"
+    android:gravity="center_horizontal"
+    />
diff --git a/tensorflow/contrib/lite/examples/android/res/values-sw600dp/template-dimens.xml b/tensorflow/contrib/lite/examples/android/res/values-sw600dp/template-dimens.xml
new file mode 100644
index 0000000000000000000000000000000000000000..820eda0e5585284c4b3f2bbaebdfee9d074d4c19
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/values-sw600dp/template-dimens.xml
@@ -0,0 +1,24 @@
+<!--
+  Copyright 2013 The TensorFlow Authors. All Rights Reserved.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+  -->
+
+<resources>
+
+    <!-- Semantic definitions -->
+
+    <dimen name="horizontal_page_margin">@dimen/margin_huge</dimen>
+    <dimen name="vertical_page_margin">@dimen/margin_medium</dimen>
+
+</resources>
diff --git a/tensorflow/contrib/lite/examples/android/res/values-sw600dp/template-styles.xml b/tensorflow/contrib/lite/examples/android/res/values-sw600dp/template-styles.xml
new file mode 100644
index 0000000000000000000000000000000000000000..09303314e91eed623b5aca91189372aaea767c9e
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/values-sw600dp/template-styles.xml
@@ -0,0 +1,25 @@
+<!--
+  Copyright 2013 The TensorFlow Authors. All Rights Reserved.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+  -->
+
+<resources>
+
+    <style name="Widget.SampleMessage">
+        <item name="android:textAppearance">?android:textAppearanceLarge</item>
+        <item name="android:lineSpacingMultiplier">1.2</item>
+        <item name="android:shadowDy">-6.5</item>
+    </style>
+
+</resources>
diff --git a/tensorflow/contrib/lite/examples/android/res/values-v11/styles.xml b/tensorflow/contrib/lite/examples/android/res/values-v11/styles.xml
new file mode 100644
index 0000000000000000000000000000000000000000..c2d1babc121ec5680f85e9d8b6a8f65f8fefbb6e
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/values-v11/styles.xml
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="utf-8"?>
+<resources>
+
+  <!--
+        Base application theme for API 11+. This theme completely replaces
+        AppBaseTheme from res/values/styles.xml on API 11+ devices.
+  -->
+  <style name="AppBaseTheme" parent="android:Theme.Holo.Light">
+    <!-- API 11 theme customizations can go here. -->
+  </style>
+
+  <style name="FullscreenTheme" parent="android:Theme.Holo">
+    <item name="android:actionBarStyle">@style/FullscreenActionBarStyle</item>
+    <item name="android:windowActionBarOverlay">true</item>
+    <item name="android:windowBackground">@null</item>
+    <item name="metaButtonBarStyle">?android:attr/buttonBarStyle</item>
+    <item name="metaButtonBarButtonStyle">?android:attr/buttonBarButtonStyle</item>
+  </style>
+
+  <style name="FullscreenActionBarStyle" parent="android:Widget.Holo.ActionBar">
+    <!--  <item name="android:background">@color/black_overlay</item>  -->
+  </style>
+
+</resources>
diff --git a/tensorflow/contrib/lite/examples/android/res/values-v11/template-styles.xml b/tensorflow/contrib/lite/examples/android/res/values-v11/template-styles.xml
new file mode 100644
index 0000000000000000000000000000000000000000..1ad048439cf1c5207a609d4664674e9a4278ee6c
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/values-v11/template-styles.xml
@@ -0,0 +1,22 @@
+<!--
+  Copyright 2013 The TensorFlow Authors. All Rights Reserved.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+  -->
+
+<resources>
+
+    <!-- Activity themes -->
+    <style name="Theme.Base" parent="android:Theme.Holo.Light" />
+
+</resources>
diff --git a/tensorflow/contrib/lite/examples/android/res/values-v14/styles.xml b/tensorflow/contrib/lite/examples/android/res/values-v14/styles.xml
new file mode 100644
index 0000000000000000000000000000000000000000..cc370849c0f90627283345bcfa03d0bb0b40e1b2
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/values-v14/styles.xml
@@ -0,0 +1,12 @@
+<resources>
+
+  <!--
+        Base application theme for API 14+. This theme completely replaces
+        AppBaseTheme from BOTH res/values/styles.xml and
+        res/values-v11/styles.xml on API 14+ devices.
+  -->
+  <style name="AppBaseTheme" parent="android:Theme.Holo.Light.DarkActionBar">
+    <!-- API 14 theme customizations can go here. -->
+  </style>
+
+</resources>
diff --git a/tensorflow/contrib/lite/examples/android/res/values-v21/base-colors.xml b/tensorflow/contrib/lite/examples/android/res/values-v21/base-colors.xml
new file mode 100644
index 0000000000000000000000000000000000000000..c16da7c51ceeb3e634c349ff098f86eccb53b8f8
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/values-v21/base-colors.xml
@@ -0,0 +1,21 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Copyright 2013 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<resources>
+
+
+</resources>
diff --git a/tensorflow/contrib/lite/examples/android/res/values-v21/base-template-styles.xml b/tensorflow/contrib/lite/examples/android/res/values-v21/base-template-styles.xml
new file mode 100644
index 0000000000000000000000000000000000000000..8890d2f4a507e30c28457ea9692f03af5834c82f
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/values-v21/base-template-styles.xml
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<resources>
+
+    <!-- Activity themes -->
+    <style name="Theme.Base" parent="android:Theme.Material.Light">
+    </style>
+
+</resources>
diff --git a/tensorflow/contrib/lite/examples/android/res/values/attrs.xml b/tensorflow/contrib/lite/examples/android/res/values/attrs.xml
new file mode 100644
index 0000000000000000000000000000000000000000..56e5beae76f2a148c147d599fe0e02bd78a5f729
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/values/attrs.xml
@@ -0,0 +1,14 @@
+<resources>
+
+  <!--
+         Declare custom theme attributes that allow changing which styles are
+         used for button bars depending on the API level.
+         ?android:attr/buttonBarStyle is new as of API 11 so this is
+         necessary to support previous API levels.
+  -->
+  <declare-styleable name="ButtonBarContainerTheme">
+    <attr name="metaButtonBarStyle" format="reference" />
+    <attr name="metaButtonBarButtonStyle" format="reference" />
+  </declare-styleable>
+
+</resources>
diff --git a/tensorflow/contrib/lite/examples/android/res/values/base-strings.xml b/tensorflow/contrib/lite/examples/android/res/values/base-strings.xml
new file mode 100644
index 0000000000000000000000000000000000000000..ebc5dc8423ca6a481bbfcfabcbcd66e4367428eb
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/values/base-strings.xml
@@ -0,0 +1,23 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<resources>
+    <string name="app_name">TFLite Demo</string>
+    <string name="activity_name_classification">TFL Classify</string>
+    <string name="activity_name_detection">TFL Detect</string>
+    <string name="activity_name_speech">TFL Speech</string>
+</resources>
diff --git a/tensorflow/contrib/lite/examples/android/res/values/colors.xml b/tensorflow/contrib/lite/examples/android/res/values/colors.xml
new file mode 100644
index 0000000000000000000000000000000000000000..584ed6052d4746bce5b60fef0b25633777262a11
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/values/colors.xml
@@ -0,0 +1,19 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!--
+ Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<resources>
+    <color name="control_background">#cc4285f4</color>
+</resources>
diff --git a/tensorflow/contrib/lite/examples/android/res/values/strings.xml b/tensorflow/contrib/lite/examples/android/res/values/strings.xml
new file mode 100644
index 0000000000000000000000000000000000000000..ea20ee78e0b99c0ad7f1c315269a7fd5435cff98
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/values/strings.xml
@@ -0,0 +1,20 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+ Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<resources>
+    <string name="description_info">Info</string>
+    <string name="request_permission">This sample needs camera permission.</string>
+    <string name="camera_error">This device doesn\'t support Camera2 API.</string>
+</resources>
diff --git a/tensorflow/contrib/lite/examples/android/res/values/styles.xml b/tensorflow/contrib/lite/examples/android/res/values/styles.xml
new file mode 100644
index 0000000000000000000000000000000000000000..dd1d973e9be8c82b68e39f755650efec71d95005
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/values/styles.xml
@@ -0,0 +1,18 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+ Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<resources>
+    <style name="MaterialTheme" parent="android:Theme.Material.Light.NoActionBar.Fullscreen" />
+</resources>
diff --git a/tensorflow/contrib/lite/examples/android/res/values/template-dimens.xml b/tensorflow/contrib/lite/examples/android/res/values/template-dimens.xml
new file mode 100644
index 0000000000000000000000000000000000000000..069977b6a6f4c9d14ed859d4e8dd95d42f7ce74f
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/values/template-dimens.xml
@@ -0,0 +1,32 @@
+<!--
+  Copyright 2013 The TensorFlow Authors. All Rights Reserved.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+  -->
+
+<resources>
+
+    <!-- Define standard dimensions to comply with Holo-style grids and rhythm. -->
+
+    <dimen name="margin_tiny">4dp</dimen>
+    <dimen name="margin_small">8dp</dimen>
+    <dimen name="margin_medium">16dp</dimen>
+    <dimen name="margin_large">32dp</dimen>
+    <dimen name="margin_huge">64dp</dimen>
+
+    <!-- Semantic definitions -->
+
+    <dimen name="horizontal_page_margin">@dimen/margin_medium</dimen>
+    <dimen name="vertical_page_margin">@dimen/margin_medium</dimen>
+
+</resources>
diff --git a/tensorflow/contrib/lite/examples/android/res/values/template-styles.xml b/tensorflow/contrib/lite/examples/android/res/values/template-styles.xml
new file mode 100644
index 0000000000000000000000000000000000000000..1b87714a49409a40d3a4649d83f0d0ff0fd57b9d
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/values/template-styles.xml
@@ -0,0 +1,42 @@
+<!--
+  Copyright 2013 The TensorFlow Authors. All Rights Reserved.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+  -->
+
+<resources>
+
+    <!-- Activity themes -->
+
+    <style name="Theme.Base" parent="android:Theme.Light" />
+
+    <style name="Theme.Sample" parent="Theme.Base" />
+
+    <style name="AppTheme" parent="Theme.Sample" />
+    <!-- Widget styling -->
+
+    <style name="Widget" />
+
+    <style name="Widget.SampleMessage">
+        <item name="android:textAppearance">?android:textAppearanceMedium</item>
+        <item name="android:lineSpacingMultiplier">1.1</item>
+    </style>
+
+    <style name="Widget.SampleMessageTile">
+        <item name="android:background">@drawable/tile</item>
+        <item name="android:shadowColor">#7F000000</item>
+        <item name="android:shadowDy">-3.5</item>
+        <item name="android:shadowRadius">2</item>
+    </style>
+
+</resources>
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/AutoFitTextureView.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/AutoFitTextureView.java
new file mode 100644
index 0000000000000000000000000000000000000000..eff24afdba44e5d56c760c1692df5fc40f5c2f42
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/AutoFitTextureView.java
@@ -0,0 +1,74 @@
+/*
+ * Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.tensorflow.demo;
+
+import android.content.Context;
+import android.util.AttributeSet;
+import android.view.TextureView;
+
+/**
+ * A {@link TextureView} that can be adjusted to a specified aspect ratio.
+ */
+public class AutoFitTextureView extends TextureView {
+  private int ratioWidth = 0;
+  private int ratioHeight = 0;
+
+  public AutoFitTextureView(final Context context) {
+    this(context, null);
+  }
+
+  public AutoFitTextureView(final Context context, final AttributeSet attrs) {
+    this(context, attrs, 0);
+  }
+
+  public AutoFitTextureView(final Context context, final AttributeSet attrs, final int defStyle) {
+    super(context, attrs, defStyle);
+  }
+
+  /**
+   * Sets the aspect ratio for this view. The size of the view will be measured based on the ratio
+   * calculated from the parameters. Note that the actual sizes of parameters don't matter, that
+   * is, calling setAspectRatio(2, 3) and setAspectRatio(4, 6) make the same result.
+   *
+   * @param width  Relative horizontal size
+   * @param height Relative vertical size
+   */
+  public void setAspectRatio(final int width, final int height) {
+    if (width < 0 || height < 0) {
+      throw new IllegalArgumentException("Size cannot be negative.");
+    }
+    ratioWidth = width;
+    ratioHeight = height;
+    requestLayout();
+  }
+
+  @Override
+  protected void onMeasure(final int widthMeasureSpec, final int heightMeasureSpec) {
+    super.onMeasure(widthMeasureSpec, heightMeasureSpec);
+    final int width = MeasureSpec.getSize(widthMeasureSpec);
+    final int height = MeasureSpec.getSize(heightMeasureSpec);
+    if (0 == ratioWidth || 0 == ratioHeight) {
+      setMeasuredDimension(width, height);
+    } else {
+      if (width < height * ratioWidth / ratioHeight) {
+        setMeasuredDimension(width, width * ratioHeight / ratioWidth);
+      } else {
+        setMeasuredDimension(height * ratioWidth / ratioHeight, height);
+      }
+    }
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/CameraActivity.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/CameraActivity.java
new file mode 100644
index 0000000000000000000000000000000000000000..15d5456f027b27ae3cbb93f736dbb104af0218de
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/CameraActivity.java
@@ -0,0 +1,450 @@
+/*
+ * Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.tensorflow.demo;
+
+import android.Manifest;
+import android.app.Activity;
+import android.app.Fragment;
+import android.content.Context;
+import android.content.pm.PackageManager;
+import android.hardware.Camera;
+import android.hardware.camera2.CameraAccessException;
+import android.hardware.camera2.CameraCharacteristics;
+import android.hardware.camera2.CameraManager;
+import android.hardware.camera2.params.StreamConfigurationMap;
+import android.media.Image;
+import android.media.Image.Plane;
+import android.media.ImageReader;
+import android.media.ImageReader.OnImageAvailableListener;
+import android.os.Build;
+import android.os.Bundle;
+import android.os.Handler;
+import android.os.HandlerThread;
+import android.os.Trace;
+import android.util.Size;
+import android.view.KeyEvent;
+import android.view.Surface;
+import android.view.WindowManager;
+import android.widget.Toast;
+import java.nio.ByteBuffer;
+import org.tensorflow.demo.env.ImageUtils;
+import org.tensorflow.demo.env.Logger;
+import org.tensorflow.lite.demo.R; // Explicit import needed for internal Google builds.
+
+public abstract class CameraActivity extends Activity
+    implements OnImageAvailableListener, Camera.PreviewCallback {
+  private static final Logger LOGGER = new Logger();
+
+  private static final int PERMISSIONS_REQUEST = 1;
+
+  private static final String PERMISSION_CAMERA = Manifest.permission.CAMERA;
+  private static final String PERMISSION_STORAGE = Manifest.permission.WRITE_EXTERNAL_STORAGE;
+
+  private boolean debug = false;
+
+  private Handler handler;
+  private HandlerThread handlerThread;
+  private boolean useCamera2API;
+  private boolean isProcessingFrame = false;
+  private byte[][] yuvBytes = new byte[3][];
+  private int[] rgbBytes = null;
+  private int yRowStride;
+
+  protected int previewWidth = 0;
+  protected int previewHeight = 0;
+
+  private Runnable postInferenceCallback;
+  private Runnable imageConverter;
+
+  @Override
+  protected void onCreate(final Bundle savedInstanceState) {
+    LOGGER.d("onCreate " + this);
+    super.onCreate(null);
+    getWindow().addFlags(WindowManager.LayoutParams.FLAG_KEEP_SCREEN_ON);
+
+    setContentView(R.layout.activity_camera);
+
+    if (hasPermission()) {
+      setFragment();
+    } else {
+      requestPermission();
+    }
+  }
+
+
+  protected int[] getRgbBytes() {
+    imageConverter.run();
+    return rgbBytes;
+  }
+
+  protected int getLuminanceStride() {
+    return yRowStride;
+  }
+
+  protected byte[] getLuminance() {
+    return yuvBytes[0];
+  }
+
+  /**
+   * Callback for android.hardware.Camera API
+   */
+  @Override
+  public void onPreviewFrame(final byte[] bytes, final Camera camera) {
+    if (isProcessingFrame) {
+      LOGGER.w("Dropping frame!");
+      return;
+    }
+
+    try {
+      // Initialize the storage bitmaps once when the resolution is known.
+      if (rgbBytes == null) {
+        Camera.Size previewSize = camera.getParameters().getPreviewSize();
+        previewHeight = previewSize.height;
+        previewWidth = previewSize.width;
+        rgbBytes = new int[previewWidth * previewHeight];
+        onPreviewSizeChosen(new Size(previewSize.width, previewSize.height), 90);
+      }
+    } catch (final Exception e) {
+      LOGGER.e(e, "Exception!");
+      return;
+    }
+
+    isProcessingFrame = true;
+    yuvBytes[0] = bytes;
+    yRowStride = previewWidth;
+
+    imageConverter =
+        new Runnable() {
+          @Override
+          public void run() {
+            ImageUtils.convertYUV420SPToARGB8888(bytes, previewWidth, previewHeight, rgbBytes);
+          }
+        };
+
+    postInferenceCallback =
+        new Runnable() {
+          @Override
+          public void run() {
+            camera.addCallbackBuffer(bytes);
+            isProcessingFrame = false;
+          }
+        };
+    processImage();
+  }
+
+  /**
+   * Callback for Camera2 API
+   */
+  @Override
+  public void onImageAvailable(final ImageReader reader) {
+    //We need wait until we have some size from onPreviewSizeChosen
+    if (previewWidth == 0 || previewHeight == 0) {
+      return;
+    }
+    if (rgbBytes == null) {
+      rgbBytes = new int[previewWidth * previewHeight];
+    }
+    try {
+      final Image image = reader.acquireLatestImage();
+
+      if (image == null) {
+        return;
+      }
+
+      if (isProcessingFrame) {
+        image.close();
+        return;
+      }
+      isProcessingFrame = true;
+      Trace.beginSection("imageAvailable");
+      final Plane[] planes = image.getPlanes();
+      fillBytes(planes, yuvBytes);
+      yRowStride = planes[0].getRowStride();
+      final int uvRowStride = planes[1].getRowStride();
+      final int uvPixelStride = planes[1].getPixelStride();
+
+      imageConverter =
+          new Runnable() {
+            @Override
+            public void run() {
+              ImageUtils.convertYUV420ToARGB8888(
+                  yuvBytes[0],
+                  yuvBytes[1],
+                  yuvBytes[2],
+                  previewWidth,
+                  previewHeight,
+                  yRowStride,
+                  uvRowStride,
+                  uvPixelStride,
+                  rgbBytes);
+            }
+          };
+
+      postInferenceCallback =
+          new Runnable() {
+            @Override
+            public void run() {
+              image.close();
+              isProcessingFrame = false;
+            }
+          };
+
+      processImage();
+    } catch (final Exception e) {
+      LOGGER.e(e, "Exception!");
+      Trace.endSection();
+      return;
+    }
+    Trace.endSection();
+  }
+
+  @Override
+  public synchronized void onStart() {
+    LOGGER.d("onStart " + this);
+    super.onStart();
+  }
+
+  @Override
+  public synchronized void onResume() {
+    LOGGER.d("onResume " + this);
+    super.onResume();
+
+    handlerThread = new HandlerThread("inference");
+    handlerThread.start();
+    handler = new Handler(handlerThread.getLooper());
+  }
+
+  @Override
+  public synchronized void onPause() {
+    LOGGER.d("onPause " + this);
+
+    if (!isFinishing()) {
+      LOGGER.d("Requesting finish");
+      finish();
+    }
+
+    handlerThread.quitSafely();
+    try {
+      handlerThread.join();
+      handlerThread = null;
+      handler = null;
+    } catch (final InterruptedException e) {
+      LOGGER.e(e, "Exception!");
+    }
+
+    super.onPause();
+  }
+
+  @Override
+  public synchronized void onStop() {
+    LOGGER.d("onStop " + this);
+    super.onStop();
+  }
+
+  @Override
+  public synchronized void onDestroy() {
+    LOGGER.d("onDestroy " + this);
+    super.onDestroy();
+  }
+
+  protected synchronized void runInBackground(final Runnable r) {
+    if (handler != null) {
+      handler.post(r);
+    }
+  }
+
+  @Override
+  public void onRequestPermissionsResult(
+      final int requestCode, final String[] permissions, final int[] grantResults) {
+    if (requestCode == PERMISSIONS_REQUEST) {
+      if (grantResults.length > 0
+          && grantResults[0] == PackageManager.PERMISSION_GRANTED
+          && grantResults[1] == PackageManager.PERMISSION_GRANTED) {
+        setFragment();
+      } else {
+        requestPermission();
+      }
+    }
+  }
+
+  private boolean hasPermission() {
+    if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.M) {
+      return checkSelfPermission(PERMISSION_CAMERA) == PackageManager.PERMISSION_GRANTED &&
+          checkSelfPermission(PERMISSION_STORAGE) == PackageManager.PERMISSION_GRANTED;
+    } else {
+      return true;
+    }
+  }
+
+  private void requestPermission() {
+    if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.M) {
+      if (shouldShowRequestPermissionRationale(PERMISSION_CAMERA) ||
+          shouldShowRequestPermissionRationale(PERMISSION_STORAGE)) {
+        Toast.makeText(CameraActivity.this,
+            "Camera AND storage permission are required for this demo", Toast.LENGTH_LONG).show();
+      }
+      requestPermissions(new String[] {PERMISSION_CAMERA, PERMISSION_STORAGE}, PERMISSIONS_REQUEST);
+    }
+  }
+
+  // Returns true if the device supports the required hardware level, or better.
+  private boolean isHardwareLevelSupported(
+      CameraCharacteristics characteristics, int requiredLevel) {
+    int deviceLevel = characteristics.get(CameraCharacteristics.INFO_SUPPORTED_HARDWARE_LEVEL);
+    if (deviceLevel == CameraCharacteristics.INFO_SUPPORTED_HARDWARE_LEVEL_LEGACY) {
+      return requiredLevel == deviceLevel;
+    }
+    // deviceLevel is not LEGACY, can use numerical sort
+    return requiredLevel <= deviceLevel;
+  }
+
+  private String chooseCamera() {
+    final CameraManager manager = (CameraManager) getSystemService(Context.CAMERA_SERVICE);
+    try {
+      for (final String cameraId : manager.getCameraIdList()) {
+        final CameraCharacteristics characteristics = manager.getCameraCharacteristics(cameraId);
+
+        // We don't use a front facing camera in this sample.
+        final Integer facing = characteristics.get(CameraCharacteristics.LENS_FACING);
+        if (facing != null && facing == CameraCharacteristics.LENS_FACING_FRONT) {
+          continue;
+        }
+
+        final StreamConfigurationMap map =
+            characteristics.get(CameraCharacteristics.SCALER_STREAM_CONFIGURATION_MAP);
+
+        if (map == null) {
+          continue;
+        }
+
+        // Fallback to camera1 API for internal cameras that don't have full support.
+        // This should help with legacy situations where using the camera2 API causes
+        // distorted or otherwise broken previews.
+        useCamera2API = (facing == CameraCharacteristics.LENS_FACING_EXTERNAL)
+            || isHardwareLevelSupported(characteristics, 
+                                        CameraCharacteristics.INFO_SUPPORTED_HARDWARE_LEVEL_FULL);
+        LOGGER.i("Camera API lv2?: %s", useCamera2API);
+        return cameraId;
+      }
+    } catch (CameraAccessException e) {
+      LOGGER.e(e, "Not allowed to access camera");
+    }
+
+    return null;
+  }
+
+  protected void setFragment() {
+    String cameraId = chooseCamera();
+
+    Fragment fragment;
+    if (useCamera2API) {
+      CameraConnectionFragment camera2Fragment =
+          CameraConnectionFragment.newInstance(
+              new CameraConnectionFragment.ConnectionCallback() {
+                @Override
+                public void onPreviewSizeChosen(final Size size, final int rotation) {
+                  previewHeight = size.getHeight();
+                  previewWidth = size.getWidth();
+                  CameraActivity.this.onPreviewSizeChosen(size, rotation);
+                }
+              },
+              this,
+              getLayoutId(),
+              getDesiredPreviewFrameSize());
+
+      camera2Fragment.setCamera(cameraId);
+      fragment = camera2Fragment;
+    } else {
+      fragment =
+          new LegacyCameraConnectionFragment(this, getLayoutId(), getDesiredPreviewFrameSize());
+    }
+
+    getFragmentManager()
+        .beginTransaction()
+        .replace(R.id.container, fragment)
+        .commit();
+  }
+
+  protected void fillBytes(final Plane[] planes, final byte[][] yuvBytes) {
+    // Because of the variable row stride it's not possible to know in
+    // advance the actual necessary dimensions of the yuv planes.
+    for (int i = 0; i < planes.length; ++i) {
+      final ByteBuffer buffer = planes[i].getBuffer();
+      if (yuvBytes[i] == null) {
+        LOGGER.d("Initializing buffer %d at size %d", i, buffer.capacity());
+        yuvBytes[i] = new byte[buffer.capacity()];
+      }
+      buffer.get(yuvBytes[i]);
+    }
+  }
+
+  public boolean isDebug() {
+    return debug;
+  }
+
+  public void requestRender() {
+    final OverlayView overlay = (OverlayView) findViewById(R.id.debug_overlay);
+    if (overlay != null) {
+      overlay.postInvalidate();
+    }
+  }
+
+  public void addCallback(final OverlayView.DrawCallback callback) {
+    final OverlayView overlay = (OverlayView) findViewById(R.id.debug_overlay);
+    if (overlay != null) {
+      overlay.addCallback(callback);
+    }
+  }
+
+  public void onSetDebug(final boolean debug) {}
+
+  @Override
+  public boolean onKeyDown(final int keyCode, final KeyEvent event) {
+    if (keyCode == KeyEvent.KEYCODE_VOLUME_DOWN || keyCode == KeyEvent.KEYCODE_VOLUME_UP) {
+      debug = !debug;
+      requestRender();
+      onSetDebug(debug);
+      return true;
+    }
+    return super.onKeyDown(keyCode, event);
+  }
+
+  protected void readyForNextImage() {
+    if (postInferenceCallback != null) {
+      postInferenceCallback.run();
+    }
+  }
+
+  protected int getScreenOrientation() {
+    switch (getWindowManager().getDefaultDisplay().getRotation()) {
+      case Surface.ROTATION_270:
+        return 270;
+      case Surface.ROTATION_180:
+        return 180;
+      case Surface.ROTATION_90:
+        return 90;
+      default:
+        return 0;
+    }
+  }
+
+  protected abstract void processImage();
+
+  protected abstract void onPreviewSizeChosen(final Size size, final int rotation);
+  protected abstract int getLayoutId();
+  protected abstract Size getDesiredPreviewFrameSize();
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/CameraConnectionFragment.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/CameraConnectionFragment.java
new file mode 100644
index 0000000000000000000000000000000000000000..51a1adb538e48cad4807d35f3efc6feefe81309b
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/CameraConnectionFragment.java
@@ -0,0 +1,634 @@
+/*
+ * Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.tensorflow.demo;
+
+import android.app.Activity;
+import android.app.AlertDialog;
+import android.app.Dialog;
+import android.app.DialogFragment;
+import android.app.Fragment;
+import android.content.Context;
+import android.content.DialogInterface;
+import android.content.res.Configuration;
+import android.graphics.ImageFormat;
+import android.graphics.Matrix;
+import android.graphics.RectF;
+import android.graphics.SurfaceTexture;
+import android.hardware.camera2.CameraAccessException;
+import android.hardware.camera2.CameraCaptureSession;
+import android.hardware.camera2.CameraCharacteristics;
+import android.hardware.camera2.CameraDevice;
+import android.hardware.camera2.CameraManager;
+import android.hardware.camera2.CaptureRequest;
+import android.hardware.camera2.CaptureResult;
+import android.hardware.camera2.TotalCaptureResult;
+import android.hardware.camera2.params.StreamConfigurationMap;
+import android.media.ImageReader;
+import android.media.ImageReader.OnImageAvailableListener;
+import android.os.Bundle;
+import android.os.Handler;
+import android.os.HandlerThread;
+import android.text.TextUtils;
+import android.util.Size;
+import android.util.SparseIntArray;
+import android.view.LayoutInflater;
+import android.view.Surface;
+import android.view.TextureView;
+import android.view.View;
+import android.view.ViewGroup;
+import android.widget.Toast;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+import java.util.concurrent.Semaphore;
+import java.util.concurrent.TimeUnit;
+import org.tensorflow.demo.env.Logger;
+import org.tensorflow.lite.demo.R; // Explicit import needed for internal Google builds.
+
+public class CameraConnectionFragment extends Fragment {
+  private static final Logger LOGGER = new Logger();
+
+  /**
+   * The camera preview size will be chosen to be the smallest frame by pixel size capable of
+   * containing a DESIRED_SIZE x DESIRED_SIZE square.
+   */
+  private static final int MINIMUM_PREVIEW_SIZE = 320;
+
+  /**
+   * Conversion from screen rotation to JPEG orientation.
+   */
+  private static final SparseIntArray ORIENTATIONS = new SparseIntArray();
+  private static final String FRAGMENT_DIALOG = "dialog";
+
+  static {
+    ORIENTATIONS.append(Surface.ROTATION_0, 90);
+    ORIENTATIONS.append(Surface.ROTATION_90, 0);
+    ORIENTATIONS.append(Surface.ROTATION_180, 270);
+    ORIENTATIONS.append(Surface.ROTATION_270, 180);
+  }
+
+  /**
+   * {@link android.view.TextureView.SurfaceTextureListener} handles several lifecycle events on a
+   * {@link TextureView}.
+   */
+  private final TextureView.SurfaceTextureListener surfaceTextureListener =
+      new TextureView.SurfaceTextureListener() {
+        @Override
+        public void onSurfaceTextureAvailable(
+            final SurfaceTexture texture, final int width, final int height) {
+          openCamera(width, height);
+        }
+
+        @Override
+        public void onSurfaceTextureSizeChanged(
+            final SurfaceTexture texture, final int width, final int height) {
+          configureTransform(width, height);
+        }
+
+        @Override
+        public boolean onSurfaceTextureDestroyed(final SurfaceTexture texture) {
+          return true;
+        }
+
+        @Override
+        public void onSurfaceTextureUpdated(final SurfaceTexture texture) {}
+      };
+
+  /**
+   * Callback for Activities to use to initialize their data once the
+   * selected preview size is known.
+   */
+  public interface ConnectionCallback {
+    void onPreviewSizeChosen(Size size, int cameraRotation);
+  }
+
+  /**
+   * ID of the current {@link CameraDevice}.
+   */
+  private String cameraId;
+
+  /**
+   * An {@link AutoFitTextureView} for camera preview.
+   */
+  private AutoFitTextureView textureView;
+
+  /**
+   * A {@link CameraCaptureSession } for camera preview.
+   */
+  private CameraCaptureSession captureSession;
+
+  /**
+   * A reference to the opened {@link CameraDevice}.
+   */
+  private CameraDevice cameraDevice;
+
+  /**
+   * The rotation in degrees of the camera sensor from the display.
+   */
+  private Integer sensorOrientation;
+
+  /**
+   * The {@link android.util.Size} of camera preview.
+   */
+  private Size previewSize;
+
+  /**
+   * {@link android.hardware.camera2.CameraDevice.StateCallback}
+   * is called when {@link CameraDevice} changes its state.
+   */
+  private final CameraDevice.StateCallback stateCallback =
+      new CameraDevice.StateCallback() {
+        @Override
+        public void onOpened(final CameraDevice cd) {
+          // This method is called when the camera is opened.  We start camera preview here.
+          cameraOpenCloseLock.release();
+          cameraDevice = cd;
+          createCameraPreviewSession();
+        }
+
+        @Override
+        public void onDisconnected(final CameraDevice cd) {
+          cameraOpenCloseLock.release();
+          cd.close();
+          cameraDevice = null;
+        }
+
+        @Override
+        public void onError(final CameraDevice cd, final int error) {
+          cameraOpenCloseLock.release();
+          cd.close();
+          cameraDevice = null;
+          final Activity activity = getActivity();
+          if (null != activity) {
+            activity.finish();
+          }
+        }
+      };
+
+  /**
+   * An additional thread for running tasks that shouldn't block the UI.
+   */
+  private HandlerThread backgroundThread;
+
+  /**
+   * A {@link Handler} for running tasks in the background.
+   */
+  private Handler backgroundHandler;
+
+  /**
+   * An {@link ImageReader} that handles preview frame capture.
+   */
+  private ImageReader previewReader;
+
+  /**
+   * {@link android.hardware.camera2.CaptureRequest.Builder} for the camera preview
+   */
+  private CaptureRequest.Builder previewRequestBuilder;
+
+  /**
+   * {@link CaptureRequest} generated by {@link #previewRequestBuilder}
+   */
+  private CaptureRequest previewRequest;
+
+  /**
+   * A {@link Semaphore} to prevent the app from exiting before closing the camera.
+   */
+  private final Semaphore cameraOpenCloseLock = new Semaphore(1);
+
+  /**
+   * A {@link OnImageAvailableListener} to receive frames as they are available.
+   */
+  private final OnImageAvailableListener imageListener;
+
+  /** The input size in pixels desired by TensorFlow (width and height of a square bitmap). */
+  private final Size inputSize;
+
+  /**
+   * The layout identifier to inflate for this Fragment.
+   */
+  private final int layout;
+
+
+  private final ConnectionCallback cameraConnectionCallback;
+
+  private CameraConnectionFragment(
+      final ConnectionCallback connectionCallback,
+      final OnImageAvailableListener imageListener,
+      final int layout,
+      final Size inputSize) {
+    this.cameraConnectionCallback = connectionCallback;
+    this.imageListener = imageListener;
+    this.layout = layout;
+    this.inputSize = inputSize;
+  }
+
+  /**
+   * Shows a {@link Toast} on the UI thread.
+   *
+   * @param text The message to show
+   */
+  private void showToast(final String text) {
+    final Activity activity = getActivity();
+    if (activity != null) {
+      activity.runOnUiThread(
+          new Runnable() {
+            @Override
+            public void run() {
+              Toast.makeText(activity, text, Toast.LENGTH_SHORT).show();
+            }
+          });
+    }
+  }
+
+  /**
+   * Given {@code choices} of {@code Size}s supported by a camera, chooses the smallest one whose
+   * width and height are at least as large as the minimum of both, or an exact match if possible.
+   *
+   * @param choices The list of sizes that the camera supports for the intended output class
+   * @param width The minimum desired width
+   * @param height The minimum desired height
+   * @return The optimal {@code Size}, or an arbitrary one if none were big enough
+   */
+  protected static Size chooseOptimalSize(final Size[] choices, final int width, final int height) {
+    final int minSize = Math.max(Math.min(width, height), MINIMUM_PREVIEW_SIZE);
+    final Size desiredSize = new Size(width, height);
+
+    // Collect the supported resolutions that are at least as big as the preview Surface
+    boolean exactSizeFound = false;
+    final List<Size> bigEnough = new ArrayList<Size>();
+    final List<Size> tooSmall = new ArrayList<Size>();
+    for (final Size option : choices) {
+      if (option.equals(desiredSize)) {
+        // Set the size but don't return yet so that remaining sizes will still be logged.
+        exactSizeFound = true;
+      }
+
+      if (option.getHeight() >= minSize && option.getWidth() >= minSize) {
+        bigEnough.add(option);
+      } else {
+        tooSmall.add(option);
+      }
+    }
+
+    LOGGER.i("Desired size: " + desiredSize + ", min size: " + minSize + "x" + minSize);
+    LOGGER.i("Valid preview sizes: [" + TextUtils.join(", ", bigEnough) + "]");
+    LOGGER.i("Rejected preview sizes: [" + TextUtils.join(", ", tooSmall) + "]");
+
+    if (exactSizeFound) {
+      LOGGER.i("Exact size match found.");
+      return desiredSize;
+    }
+
+    // Pick the smallest of those, assuming we found any
+    if (bigEnough.size() > 0) {
+      final Size chosenSize = Collections.min(bigEnough, new CompareSizesByArea());
+      LOGGER.i("Chosen size: " + chosenSize.getWidth() + "x" + chosenSize.getHeight());
+      return chosenSize;
+    } else {
+      LOGGER.e("Couldn't find any suitable preview size");
+      return choices[0];
+    }
+  }
+
+  public static CameraConnectionFragment newInstance(
+      final ConnectionCallback callback,
+      final OnImageAvailableListener imageListener,
+      final int layout,
+      final Size inputSize) {
+    return new CameraConnectionFragment(callback, imageListener, layout, inputSize);
+  }
+
+  @Override
+  public View onCreateView(
+      final LayoutInflater inflater, final ViewGroup container, final Bundle savedInstanceState) {
+    return inflater.inflate(layout, container, false);
+  }
+
+  @Override
+  public void onViewCreated(final View view, final Bundle savedInstanceState) {
+    textureView = (AutoFitTextureView) view.findViewById(R.id.texture);
+  }
+
+  @Override
+  public void onActivityCreated(final Bundle savedInstanceState) {
+    super.onActivityCreated(savedInstanceState);
+  }
+
+  @Override
+  public void onResume() {
+    super.onResume();
+    startBackgroundThread();
+
+    // When the screen is turned off and turned back on, the SurfaceTexture is already
+    // available, and "onSurfaceTextureAvailable" will not be called. In that case, we can open
+    // a camera and start preview from here (otherwise, we wait until the surface is ready in
+    // the SurfaceTextureListener).
+    if (textureView.isAvailable()) {
+      openCamera(textureView.getWidth(), textureView.getHeight());
+    } else {
+      textureView.setSurfaceTextureListener(surfaceTextureListener);
+    }
+  }
+
+  @Override
+  public void onPause() {
+    closeCamera();
+    stopBackgroundThread();
+    super.onPause();
+  }
+
+  public void setCamera(String cameraId) {
+    this.cameraId = cameraId;
+  }
+
+  /**
+   * Sets up member variables related to camera.
+   */
+  private void setUpCameraOutputs() {
+    final Activity activity = getActivity();
+    final CameraManager manager = (CameraManager) activity.getSystemService(Context.CAMERA_SERVICE);
+    try {
+      final CameraCharacteristics characteristics = manager.getCameraCharacteristics(cameraId);
+
+      final StreamConfigurationMap map =
+          characteristics.get(CameraCharacteristics.SCALER_STREAM_CONFIGURATION_MAP);
+
+      sensorOrientation = characteristics.get(CameraCharacteristics.SENSOR_ORIENTATION);
+
+      // Danger, W.R.! Attempting to use too large a preview size could  exceed the camera
+      // bus' bandwidth limitation, resulting in gorgeous previews but the storage of
+      // garbage capture data.
+      previewSize =
+          chooseOptimalSize(map.getOutputSizes(SurfaceTexture.class),
+              inputSize.getWidth(),
+              inputSize.getHeight());
+
+      // We fit the aspect ratio of TextureView to the size of preview we picked.
+      final int orientation = getResources().getConfiguration().orientation;
+      if (orientation == Configuration.ORIENTATION_LANDSCAPE) {
+        textureView.setAspectRatio(previewSize.getWidth(), previewSize.getHeight());
+      } else {
+        textureView.setAspectRatio(previewSize.getHeight(), previewSize.getWidth());
+      }
+    } catch (final CameraAccessException e) {
+      LOGGER.e(e, "Exception!");
+    } catch (final NullPointerException e) {
+      // Currently an NPE is thrown when the Camera2API is used but not supported on the
+      // device this code runs.
+      // TODO(andrewharp): abstract ErrorDialog/RuntimeException handling out into new method and
+      // reuse throughout app.
+      ErrorDialog.newInstance(getString(R.string.camera_error))
+          .show(getChildFragmentManager(), FRAGMENT_DIALOG);
+      throw new RuntimeException(getString(R.string.camera_error));
+    }
+
+    cameraConnectionCallback.onPreviewSizeChosen(previewSize, sensorOrientation);
+  }
+
+  /**
+   * Opens the camera specified by {@link CameraConnectionFragment#cameraId}.
+   */
+  private void openCamera(final int width, final int height) {
+    setUpCameraOutputs();
+    configureTransform(width, height);
+    final Activity activity = getActivity();
+    final CameraManager manager = (CameraManager) activity.getSystemService(Context.CAMERA_SERVICE);
+    try {
+      if (!cameraOpenCloseLock.tryAcquire(2500, TimeUnit.MILLISECONDS)) {
+        throw new RuntimeException("Time out waiting to lock camera opening.");
+      }
+      manager.openCamera(cameraId, stateCallback, backgroundHandler);
+    } catch (final CameraAccessException e) {
+      LOGGER.e(e, "Exception!");
+    } catch (final InterruptedException e) {
+      throw new RuntimeException("Interrupted while trying to lock camera opening.", e);
+    }
+  }
+
+  /**
+   * Closes the current {@link CameraDevice}.
+   */
+  private void closeCamera() {
+    try {
+      cameraOpenCloseLock.acquire();
+      if (null != captureSession) {
+        captureSession.close();
+        captureSession = null;
+      }
+      if (null != cameraDevice) {
+        cameraDevice.close();
+        cameraDevice = null;
+      }
+      if (null != previewReader) {
+        previewReader.close();
+        previewReader = null;
+      }
+    } catch (final InterruptedException e) {
+      throw new RuntimeException("Interrupted while trying to lock camera closing.", e);
+    } finally {
+      cameraOpenCloseLock.release();
+    }
+  }
+
+  /**
+   * Starts a background thread and its {@link Handler}.
+   */
+  private void startBackgroundThread() {
+    backgroundThread = new HandlerThread("ImageListener");
+    backgroundThread.start();
+    backgroundHandler = new Handler(backgroundThread.getLooper());
+  }
+
+  /**
+   * Stops the background thread and its {@link Handler}.
+   */
+  private void stopBackgroundThread() {
+    backgroundThread.quitSafely();
+    try {
+      backgroundThread.join();
+      backgroundThread = null;
+      backgroundHandler = null;
+    } catch (final InterruptedException e) {
+      LOGGER.e(e, "Exception!");
+    }
+  }
+
+  private final CameraCaptureSession.CaptureCallback captureCallback =
+      new CameraCaptureSession.CaptureCallback() {
+        @Override
+        public void onCaptureProgressed(
+            final CameraCaptureSession session,
+            final CaptureRequest request,
+            final CaptureResult partialResult) {}
+
+        @Override
+        public void onCaptureCompleted(
+            final CameraCaptureSession session,
+            final CaptureRequest request,
+            final TotalCaptureResult result) {}
+      };
+
+  /**
+   * Creates a new {@link CameraCaptureSession} for camera preview.
+   */
+  private void createCameraPreviewSession() {
+    try {
+      final SurfaceTexture texture = textureView.getSurfaceTexture();
+      assert texture != null;
+
+      // We configure the size of default buffer to be the size of camera preview we want.
+      texture.setDefaultBufferSize(previewSize.getWidth(), previewSize.getHeight());
+
+      // This is the output Surface we need to start preview.
+      final Surface surface = new Surface(texture);
+
+      // We set up a CaptureRequest.Builder with the output Surface.
+      previewRequestBuilder = cameraDevice.createCaptureRequest(CameraDevice.TEMPLATE_PREVIEW);
+      previewRequestBuilder.addTarget(surface);
+
+      LOGGER.i("Opening camera preview: " + previewSize.getWidth() + "x" + previewSize.getHeight());
+
+      // Create the reader for the preview frames.
+      previewReader =
+          ImageReader.newInstance(
+              previewSize.getWidth(), previewSize.getHeight(), ImageFormat.YUV_420_888, 2);
+
+      previewReader.setOnImageAvailableListener(imageListener, backgroundHandler);
+      previewRequestBuilder.addTarget(previewReader.getSurface());
+
+      // Here, we create a CameraCaptureSession for camera preview.
+      cameraDevice.createCaptureSession(
+          Arrays.asList(surface, previewReader.getSurface()),
+          new CameraCaptureSession.StateCallback() {
+
+            @Override
+            public void onConfigured(final CameraCaptureSession cameraCaptureSession) {
+              // The camera is already closed
+              if (null == cameraDevice) {
+                return;
+              }
+
+              // When the session is ready, we start displaying the preview.
+              captureSession = cameraCaptureSession;
+              try {
+                // Auto focus should be continuous for camera preview.
+                previewRequestBuilder.set(
+                    CaptureRequest.CONTROL_AF_MODE,
+                    CaptureRequest.CONTROL_AF_MODE_CONTINUOUS_PICTURE);
+                // Flash is automatically enabled when necessary.
+                previewRequestBuilder.set(
+                    CaptureRequest.CONTROL_AE_MODE, CaptureRequest.CONTROL_AE_MODE_ON_AUTO_FLASH);
+
+                // Finally, we start displaying the camera preview.
+                previewRequest = previewRequestBuilder.build();
+                captureSession.setRepeatingRequest(
+                    previewRequest, captureCallback, backgroundHandler);
+              } catch (final CameraAccessException e) {
+                LOGGER.e(e, "Exception!");
+              }
+            }
+
+            @Override
+            public void onConfigureFailed(final CameraCaptureSession cameraCaptureSession) {
+              showToast("Failed");
+            }
+          },
+          null);
+    } catch (final CameraAccessException e) {
+      LOGGER.e(e, "Exception!");
+    }
+  }
+
+  /**
+   * Configures the necessary {@link android.graphics.Matrix} transformation to `mTextureView`.
+   * This method should be called after the camera preview size is determined in
+   * setUpCameraOutputs and also the size of `mTextureView` is fixed.
+   *
+   * @param viewWidth  The width of `mTextureView`
+   * @param viewHeight The height of `mTextureView`
+   */
+  private void configureTransform(final int viewWidth, final int viewHeight) {
+    final Activity activity = getActivity();
+    if (null == textureView || null == previewSize || null == activity) {
+      return;
+    }
+    final int rotation = activity.getWindowManager().getDefaultDisplay().getRotation();
+    final Matrix matrix = new Matrix();
+    final RectF viewRect = new RectF(0, 0, viewWidth, viewHeight);
+    final RectF bufferRect = new RectF(0, 0, previewSize.getHeight(), previewSize.getWidth());
+    final float centerX = viewRect.centerX();
+    final float centerY = viewRect.centerY();
+    if (Surface.ROTATION_90 == rotation || Surface.ROTATION_270 == rotation) {
+      bufferRect.offset(centerX - bufferRect.centerX(), centerY - bufferRect.centerY());
+      matrix.setRectToRect(viewRect, bufferRect, Matrix.ScaleToFit.FILL);
+      final float scale =
+          Math.max(
+              (float) viewHeight / previewSize.getHeight(),
+              (float) viewWidth / previewSize.getWidth());
+      matrix.postScale(scale, scale, centerX, centerY);
+      matrix.postRotate(90 * (rotation - 2), centerX, centerY);
+    } else if (Surface.ROTATION_180 == rotation) {
+      matrix.postRotate(180, centerX, centerY);
+    }
+    textureView.setTransform(matrix);
+  }
+
+  /**
+   * Compares two {@code Size}s based on their areas.
+   */
+  static class CompareSizesByArea implements Comparator<Size> {
+    @Override
+    public int compare(final Size lhs, final Size rhs) {
+      // We cast here to ensure the multiplications won't overflow
+      return Long.signum(
+          (long) lhs.getWidth() * lhs.getHeight() - (long) rhs.getWidth() * rhs.getHeight());
+    }
+  }
+
+  /**
+   * Shows an error message dialog.
+   */
+  public static class ErrorDialog extends DialogFragment {
+    private static final String ARG_MESSAGE = "message";
+
+    public static ErrorDialog newInstance(final String message) {
+      final ErrorDialog dialog = new ErrorDialog();
+      final Bundle args = new Bundle();
+      args.putString(ARG_MESSAGE, message);
+      dialog.setArguments(args);
+      return dialog;
+    }
+
+    @Override
+    public Dialog onCreateDialog(final Bundle savedInstanceState) {
+      final Activity activity = getActivity();
+      return new AlertDialog.Builder(activity)
+          .setMessage(getArguments().getString(ARG_MESSAGE))
+          .setPositiveButton(
+              android.R.string.ok,
+              new DialogInterface.OnClickListener() {
+                @Override
+                public void onClick(final DialogInterface dialogInterface, final int i) {
+                  activity.finish();
+                }
+              })
+          .create();
+    }
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/Classifier.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/Classifier.java
new file mode 100644
index 0000000000000000000000000000000000000000..07995febaf5caab65dd4dfcc262ccf3750cfa303
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/Classifier.java
@@ -0,0 +1,107 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.demo;
+
+import android.graphics.Bitmap;
+import android.graphics.RectF;
+import java.util.List;
+
+/**
+ * Generic interface for interacting with different recognition engines.
+ */
+public interface Classifier {
+  /**
+   * An immutable result returned by a Classifier describing what was recognized.
+   */
+  public class Recognition {
+    /**
+     * A unique identifier for what has been recognized. Specific to the class, not the instance of
+     * the object.
+     */
+    private final String id;
+
+    /**
+     * Display name for the recognition.
+     */
+    private final String title;
+
+    /**
+     * A sortable score for how good the recognition is relative to others. Higher should be better.
+     */
+    private final Float confidence;
+
+    /** Optional location within the source image for the location of the recognized object. */
+    private RectF location;
+
+    public Recognition(
+        final String id, final String title, final Float confidence, final RectF location) {
+      this.id = id;
+      this.title = title;
+      this.confidence = confidence;
+      this.location = location;
+    }
+
+    public String getId() {
+      return id;
+    }
+
+    public String getTitle() {
+      return title;
+    }
+
+    public Float getConfidence() {
+      return confidence;
+    }
+
+    public RectF getLocation() {
+      return new RectF(location);
+    }
+
+    public void setLocation(RectF location) {
+      this.location = location;
+    }
+
+    @Override
+    public String toString() {
+      String resultString = "";
+      if (id != null) {
+        resultString += "[" + id + "] ";
+      }
+
+      if (title != null) {
+        resultString += title + " ";
+      }
+
+      if (confidence != null) {
+        resultString += String.format("(%.1f%%) ", confidence * 100.0f);
+      }
+
+      if (location != null) {
+        resultString += location + " ";
+      }
+
+      return resultString.trim();
+    }
+  }
+
+  List<Recognition> recognizeImage(Bitmap bitmap);
+
+  void enableStatLogging(final boolean debug);
+
+  String getStatString();
+
+  void close();
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/ClassifierActivity.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/ClassifierActivity.java
new file mode 100644
index 0000000000000000000000000000000000000000..dcbbefbeab6627b37579902cd25841c0ae257dda
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/ClassifierActivity.java
@@ -0,0 +1,197 @@
+/*
+ * Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.tensorflow.demo;
+
+import android.graphics.Bitmap;
+import android.graphics.Bitmap.Config;
+import android.graphics.Canvas;
+import android.graphics.Matrix;
+import android.graphics.Paint;
+import android.graphics.Typeface;
+import android.media.ImageReader.OnImageAvailableListener;
+import android.os.SystemClock;
+import android.util.Size;
+import android.util.TypedValue;
+import java.util.List;
+import java.util.Vector;
+import org.tensorflow.demo.OverlayView.DrawCallback;
+import org.tensorflow.demo.env.BorderedText;
+import org.tensorflow.demo.env.ImageUtils;
+import org.tensorflow.demo.env.Logger;
+import org.tensorflow.lite.demo.R; // Explicit import needed for internal Google builds.
+
+public class ClassifierActivity extends CameraActivity implements OnImageAvailableListener {
+  private static final Logger LOGGER = new Logger();
+
+  protected static final boolean SAVE_PREVIEW_BITMAP = false;
+
+  private ResultsView resultsView;
+
+  private Bitmap rgbFrameBitmap = null;
+  private Bitmap croppedBitmap = null;
+  private Bitmap cropCopyBitmap = null;
+
+  private long lastProcessingTimeMs;
+
+  // These are the settings for the original v1 Inception model. If you want to
+  // use a model that's been produced from the TensorFlow for Poets codelab,
+  // you'll need to set IMAGE_SIZE = 299, IMAGE_MEAN = 128, IMAGE_STD = 128,
+  // INPUT_NAME = "Mul", and OUTPUT_NAME = "final_result".
+  // You'll also need to update the MODEL_FILE and LABEL_FILE paths to point to
+  // the ones you produced.
+  //
+  // To use v3 Inception model, strip the DecodeJpeg Op from your retrained
+  // model first:
+  //
+  // python strip_unused.py \
+  // --input_graph=<retrained-pb-file> \
+  // --output_graph=<your-stripped-pb-file> \
+  // --input_node_names="Mul" \
+  // --output_node_names="final_result" \
+  // --input_binary=true
+  private static final int INPUT_SIZE = 224;
+
+  private static final String MODEL_FILE = "mobilenet_quant_v1_224.tflite";
+  private static final String LABEL_FILE = "labels_mobilenet_quant_v1_224.txt";
+
+  private static final boolean MAINTAIN_ASPECT = true;
+
+  private static final Size DESIRED_PREVIEW_SIZE = new Size(640, 480);
+
+
+  private Integer sensorOrientation;
+  private Classifier classifier;
+  private Matrix frameToCropTransform;
+  private Matrix cropToFrameTransform;
+
+  private BorderedText borderedText;
+
+  @Override
+  protected int getLayoutId() {
+    return R.layout.camera_connection_fragment;
+  }
+
+  @Override
+  protected Size getDesiredPreviewFrameSize() {
+    return DESIRED_PREVIEW_SIZE;
+  }
+
+  private static final float TEXT_SIZE_DIP = 10;
+
+  @Override
+  public void onPreviewSizeChosen(final Size size, final int rotation) {
+    final float textSizePx = TypedValue.applyDimension(
+        TypedValue.COMPLEX_UNIT_DIP, TEXT_SIZE_DIP, getResources().getDisplayMetrics());
+    borderedText = new BorderedText(textSizePx);
+    borderedText.setTypeface(Typeface.MONOSPACE);
+
+    classifier = TFLiteImageClassifier.create(getAssets(), MODEL_FILE, LABEL_FILE, INPUT_SIZE);
+
+    previewWidth = size.getWidth();
+    previewHeight = size.getHeight();
+
+    sensorOrientation = rotation - getScreenOrientation();
+    LOGGER.i("Camera orientation relative to screen canvas: %d", sensorOrientation);
+
+    LOGGER.i("Initializing at size %dx%d", previewWidth, previewHeight);
+    rgbFrameBitmap = Bitmap.createBitmap(previewWidth, previewHeight, Config.ARGB_8888);
+    croppedBitmap = Bitmap.createBitmap(INPUT_SIZE, INPUT_SIZE, Config.ARGB_8888);
+
+    frameToCropTransform = ImageUtils.getTransformationMatrix(
+        previewWidth, previewHeight,
+        INPUT_SIZE, INPUT_SIZE,
+        sensorOrientation, MAINTAIN_ASPECT);
+
+    cropToFrameTransform = new Matrix();
+    frameToCropTransform.invert(cropToFrameTransform);
+
+    addCallback(
+        new DrawCallback() {
+          @Override
+          public void drawCallback(final Canvas canvas) {
+            renderDebug(canvas);
+          }
+        });
+  }
+
+  @Override
+  protected void processImage() {
+    rgbFrameBitmap.setPixels(getRgbBytes(), 0, previewWidth, 0, 0, previewWidth, previewHeight);
+    final Canvas canvas = new Canvas(croppedBitmap);
+    canvas.drawBitmap(rgbFrameBitmap, frameToCropTransform, null);
+
+    // For examining the actual TF input.
+    if (SAVE_PREVIEW_BITMAP) {
+      ImageUtils.saveBitmap(croppedBitmap);
+    }
+    runInBackground(
+        new Runnable() {
+          @Override
+          public void run() {
+            final long startTime = SystemClock.uptimeMillis();
+            final List<Classifier.Recognition> results = classifier.recognizeImage(croppedBitmap);
+            lastProcessingTimeMs = SystemClock.uptimeMillis() - startTime;
+            LOGGER.i("Detect: %s", results);
+            cropCopyBitmap = Bitmap.createBitmap(croppedBitmap);
+            if (resultsView == null) {
+              resultsView = (ResultsView) findViewById(R.id.results);
+            }
+            resultsView.setResults(results);
+            requestRender();
+            readyForNextImage();
+          }
+        });
+  }
+
+  @Override
+  public void onSetDebug(boolean debug) {
+    classifier.enableStatLogging(debug);
+  }
+
+  private void renderDebug(final Canvas canvas) {
+    if (!isDebug()) {
+      return;
+    }
+    final Bitmap copy = cropCopyBitmap;
+    if (copy != null) {
+      final Matrix matrix = new Matrix();
+      final float scaleFactor = 2;
+      matrix.postScale(scaleFactor, scaleFactor);
+      matrix.postTranslate(
+          canvas.getWidth() - copy.getWidth() * scaleFactor,
+          canvas.getHeight() - copy.getHeight() * scaleFactor);
+      canvas.drawBitmap(copy, matrix, new Paint());
+
+      final Vector<String> lines = new Vector<String>();
+      if (classifier != null) {
+        String statString = classifier.getStatString();
+        String[] statLines = statString.split("\n");
+        for (String line : statLines) {
+          lines.add(line);
+        }
+      }
+
+      lines.add("Frame: " + previewWidth + "x" + previewHeight);
+      lines.add("Crop: " + copy.getWidth() + "x" + copy.getHeight());
+      lines.add("View: " + canvas.getWidth() + "x" + canvas.getHeight());
+      lines.add("Rotation: " + sensorOrientation);
+      lines.add("Inference time: " + lastProcessingTimeMs + "ms");
+
+      borderedText.drawLines(canvas, 10, canvas.getHeight() - 10, lines);
+    }
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/DetectorActivity.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/DetectorActivity.java
new file mode 100644
index 0000000000000000000000000000000000000000..de997e454a1e33254cb7c2c932ca79d0072539fa
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/DetectorActivity.java
@@ -0,0 +1,296 @@
+/*
+ * Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.tensorflow.demo;
+
+import android.graphics.Bitmap;
+import android.graphics.Bitmap.Config;
+import android.graphics.Canvas;
+import android.graphics.Color;
+import android.graphics.Matrix;
+import android.graphics.Paint;
+import android.graphics.Paint.Style;
+import android.graphics.RectF;
+import android.graphics.Typeface;
+import android.media.ImageReader.OnImageAvailableListener;
+import android.os.SystemClock;
+import android.util.Size;
+import android.util.TypedValue;
+import android.widget.Toast;
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Vector;
+import org.tensorflow.demo.OverlayView.DrawCallback;
+import org.tensorflow.demo.env.BorderedText;
+import org.tensorflow.demo.env.ImageUtils;
+import org.tensorflow.demo.env.Logger;
+import org.tensorflow.demo.tracking.MultiBoxTracker;
+import org.tensorflow.lite.demo.R; // Explicit import needed for internal Google builds.
+
+/**
+ * An activity that uses a TensorFlowMultiBoxDetector and ObjectTracker to detect and then track
+ * objects.
+ */
+public class DetectorActivity extends CameraActivity implements OnImageAvailableListener {
+  private static final Logger LOGGER = new Logger();
+
+  // Configuration values for the prepackaged SSD model.
+  private static final int TF_OD_API_INPUT_SIZE = 300;
+  private static final String TF_OD_API_MODEL_FILE = "mobilenet_ssd.tflite";
+  private static final String TF_OD_API_LABELS_FILE = "file:///android_asset/coco_labels_list.txt";
+
+  // Which detection model to use: by default uses Tensorflow Object Detection API frozen
+  // checkpoints.
+  private enum DetectorMode {
+    TF_OD_API;
+  }
+
+  private static final DetectorMode MODE = DetectorMode.TF_OD_API;
+
+  // Minimum detection confidence to track a detection.
+  private static final float MINIMUM_CONFIDENCE_TF_OD_API = 0.6f;
+
+  private static final boolean MAINTAIN_ASPECT = false;
+
+  private static final Size DESIRED_PREVIEW_SIZE = new Size(640, 480);
+
+  private static final boolean SAVE_PREVIEW_BITMAP = false;
+  private static final float TEXT_SIZE_DIP = 10;
+
+  private Integer sensorOrientation;
+
+  private Classifier detector;
+
+  private long lastProcessingTimeMs;
+  private Bitmap rgbFrameBitmap = null;
+  private Bitmap croppedBitmap = null;
+  private Bitmap cropCopyBitmap = null;
+
+  private boolean computingDetection = false;
+
+  private long timestamp = 0;
+
+  private Matrix frameToCropTransform;
+  private Matrix cropToFrameTransform;
+
+  private MultiBoxTracker tracker;
+
+  private byte[] luminanceCopy;
+
+  private BorderedText borderedText;
+  @Override
+  public void onPreviewSizeChosen(final Size size, final int rotation) {
+    final float textSizePx =
+        TypedValue.applyDimension(
+            TypedValue.COMPLEX_UNIT_DIP, TEXT_SIZE_DIP, getResources().getDisplayMetrics());
+    borderedText = new BorderedText(textSizePx);
+    borderedText.setTypeface(Typeface.MONOSPACE);
+
+    tracker = new MultiBoxTracker(this);
+
+    int cropSize = TF_OD_API_INPUT_SIZE;
+
+    try {
+      detector =
+          TFLiteObjectDetectionAPIModel.create(
+              getAssets(), TF_OD_API_MODEL_FILE, TF_OD_API_LABELS_FILE, TF_OD_API_INPUT_SIZE);
+      cropSize = TF_OD_API_INPUT_SIZE;
+    } catch (final IOException e) {
+      LOGGER.e("Exception initializing classifier!", e);
+      Toast toast =
+          Toast.makeText(
+              getApplicationContext(), "Classifier could not be initialized", Toast.LENGTH_SHORT);
+      toast.show();
+      finish();
+    }
+
+
+    previewWidth = size.getWidth();
+    previewHeight = size.getHeight();
+
+    sensorOrientation = rotation - getScreenOrientation();
+    LOGGER.i("Camera orientation relative to screen canvas: %d", sensorOrientation);
+
+    LOGGER.i("Initializing at size %dx%d", previewWidth, previewHeight);
+    rgbFrameBitmap = Bitmap.createBitmap(previewWidth, previewHeight, Config.ARGB_8888);
+    croppedBitmap = Bitmap.createBitmap(cropSize, cropSize, Config.ARGB_8888);
+
+    frameToCropTransform =
+        ImageUtils.getTransformationMatrix(
+            previewWidth, previewHeight,
+            cropSize, cropSize,
+            sensorOrientation, MAINTAIN_ASPECT);
+
+    cropToFrameTransform = new Matrix();
+    frameToCropTransform.invert(cropToFrameTransform);
+
+    trackingOverlay = (OverlayView) findViewById(R.id.tracking_overlay);
+    trackingOverlay.addCallback(
+        new DrawCallback() {
+          @Override
+          public void drawCallback(final Canvas canvas) {
+            tracker.draw(canvas);
+            if (isDebug()) {
+              tracker.drawDebug(canvas);
+            }
+          }
+        });
+
+    addCallback(
+        new DrawCallback() {
+          @Override
+          public void drawCallback(final Canvas canvas) {
+            if (!isDebug()) {
+              return;
+            }
+            final Bitmap copy = cropCopyBitmap;
+            if (copy == null) {
+              return;
+            }
+
+            final int backgroundColor = Color.argb(100, 0, 0, 0);
+            canvas.drawColor(backgroundColor);
+
+            final Matrix matrix = new Matrix();
+            final float scaleFactor = 2;
+            matrix.postScale(scaleFactor, scaleFactor);
+            matrix.postTranslate(
+                canvas.getWidth() - copy.getWidth() * scaleFactor,
+                canvas.getHeight() - copy.getHeight() * scaleFactor);
+            canvas.drawBitmap(copy, matrix, new Paint());
+
+            final Vector<String> lines = new Vector<String>();
+            if (detector != null) {
+              final String statString = detector.getStatString();
+              final String[] statLines = statString.split("\n");
+              for (final String line : statLines) {
+                lines.add(line);
+              }
+            }
+            lines.add("");
+
+            lines.add("Frame: " + previewWidth + "x" + previewHeight);
+            lines.add("Crop: " + copy.getWidth() + "x" + copy.getHeight());
+            lines.add("View: " + canvas.getWidth() + "x" + canvas.getHeight());
+            lines.add("Rotation: " + sensorOrientation);
+            lines.add("Inference time: " + lastProcessingTimeMs + "ms");
+
+            borderedText.drawLines(canvas, 10, canvas.getHeight() - 10, lines);
+          }
+        });
+  }
+
+  OverlayView trackingOverlay;
+
+  @Override
+  protected void processImage() {
+    ++timestamp;
+    final long currTimestamp = timestamp;
+    byte[] originalLuminance = getLuminance();
+    tracker.onFrame(
+        previewWidth,
+        previewHeight,
+        getLuminanceStride(),
+        sensorOrientation,
+        originalLuminance,
+        timestamp);
+    trackingOverlay.postInvalidate();
+
+    // No mutex needed as this method is not reentrant.
+    if (computingDetection) {
+      readyForNextImage();
+      return;
+    }
+    computingDetection = true;
+    LOGGER.i("Preparing image " + currTimestamp + " for detection in bg thread.");
+
+    rgbFrameBitmap.setPixels(getRgbBytes(), 0, previewWidth, 0, 0, previewWidth, previewHeight);
+
+    if (luminanceCopy == null) {
+      luminanceCopy = new byte[originalLuminance.length];
+    }
+    System.arraycopy(originalLuminance, 0, luminanceCopy, 0, originalLuminance.length);
+    readyForNextImage();
+
+    final Canvas canvas = new Canvas(croppedBitmap);
+    canvas.drawBitmap(rgbFrameBitmap, frameToCropTransform, null);
+    // For examining the actual TF input.
+    if (SAVE_PREVIEW_BITMAP) {
+      ImageUtils.saveBitmap(croppedBitmap);
+    }
+
+    runInBackground(
+        new Runnable() {
+          @Override
+          public void run() {
+            LOGGER.i("Running detection on image " + currTimestamp);
+            final long startTime = SystemClock.uptimeMillis();
+            final List<Classifier.Recognition> results = detector.recognizeImage(croppedBitmap);
+            lastProcessingTimeMs = SystemClock.uptimeMillis() - startTime;
+
+            cropCopyBitmap = Bitmap.createBitmap(croppedBitmap);
+            final Canvas canvas = new Canvas(cropCopyBitmap);
+            final Paint paint = new Paint();
+            paint.setColor(Color.RED);
+            paint.setStyle(Style.STROKE);
+            paint.setStrokeWidth(2.0f);
+
+            float minimumConfidence = MINIMUM_CONFIDENCE_TF_OD_API;
+            switch (MODE) {
+              case TF_OD_API:
+                minimumConfidence = MINIMUM_CONFIDENCE_TF_OD_API;
+                break;
+            }
+
+            final List<Classifier.Recognition> mappedRecognitions =
+                new LinkedList<Classifier.Recognition>();
+
+            for (final Classifier.Recognition result : results) {
+              final RectF location = result.getLocation();
+              if (location != null && result.getConfidence() >= minimumConfidence) {
+                canvas.drawRect(location, paint);
+
+                cropToFrameTransform.mapRect(location);
+                result.setLocation(location);
+                mappedRecognitions.add(result);
+              }
+            }
+
+            tracker.trackResults(mappedRecognitions, luminanceCopy, currTimestamp);
+            trackingOverlay.postInvalidate();
+
+            requestRender();
+            computingDetection = false;
+          }
+        });
+  }
+
+  @Override
+  protected int getLayoutId() {
+    return R.layout.camera_connection_fragment_tracking;
+  }
+
+  @Override
+  protected Size getDesiredPreviewFrameSize() {
+    return DESIRED_PREVIEW_SIZE;
+  }
+
+  @Override
+  public void onSetDebug(final boolean debug) {
+    detector.enableStatLogging(debug);
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/LegacyCameraConnectionFragment.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/LegacyCameraConnectionFragment.java
new file mode 100644
index 0000000000000000000000000000000000000000..fd830297533bb8366e008a44a32255788d5e1ea6
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/LegacyCameraConnectionFragment.java
@@ -0,0 +1,216 @@
+package org.tensorflow.demo;
+
+/*
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import android.app.Fragment;
+import android.graphics.SurfaceTexture;
+import android.hardware.Camera;
+import android.hardware.Camera.CameraInfo;
+import android.os.Bundle;
+import android.os.Handler;
+import android.os.HandlerThread;
+import android.util.Size;
+import android.util.SparseIntArray;
+import android.view.LayoutInflater;
+import android.view.Surface;
+import android.view.TextureView;
+import android.view.View;
+import android.view.ViewGroup;
+import java.io.IOException;
+import java.util.List;
+import org.tensorflow.demo.env.ImageUtils;
+import org.tensorflow.demo.env.Logger;
+import org.tensorflow.lite.demo.R; // Explicit import needed for internal Google builds.
+
+public class LegacyCameraConnectionFragment extends Fragment {
+  private Camera camera;
+  private static final Logger LOGGER = new Logger();
+  private Camera.PreviewCallback imageListener;
+  private Size desiredSize;
+
+  /**
+   * The layout identifier to inflate for this Fragment.
+   */
+  private int layout;
+
+  public LegacyCameraConnectionFragment(
+      final Camera.PreviewCallback imageListener, final int layout, final Size desiredSize) {
+    this.imageListener = imageListener;
+    this.layout = layout;
+    this.desiredSize = desiredSize;
+  }
+
+  /**
+   * Conversion from screen rotation to JPEG orientation.
+   */
+  private static final SparseIntArray ORIENTATIONS = new SparseIntArray();
+
+  static {
+    ORIENTATIONS.append(Surface.ROTATION_0, 90);
+    ORIENTATIONS.append(Surface.ROTATION_90, 0);
+    ORIENTATIONS.append(Surface.ROTATION_180, 270);
+    ORIENTATIONS.append(Surface.ROTATION_270, 180);
+  }
+
+  /**
+   * {@link android.view.TextureView.SurfaceTextureListener} handles several lifecycle events on a
+   * {@link TextureView}.
+   */
+  private final TextureView.SurfaceTextureListener surfaceTextureListener =
+      new TextureView.SurfaceTextureListener() {
+        @Override
+        public void onSurfaceTextureAvailable(
+            final SurfaceTexture texture, final int width, final int height) {
+
+          int index = getCameraId();
+          camera = Camera.open(index);
+
+          try {
+            Camera.Parameters parameters = camera.getParameters();
+            List<String> focusModes = parameters.getSupportedFocusModes();
+            if (focusModes != null
+                && focusModes.contains(Camera.Parameters.FOCUS_MODE_CONTINUOUS_PICTURE)) {
+              parameters.setFocusMode(Camera.Parameters.FOCUS_MODE_CONTINUOUS_PICTURE);
+            }
+            List<Camera.Size> cameraSizes = parameters.getSupportedPreviewSizes();
+            Size[] sizes = new Size[cameraSizes.size()];
+            int i = 0;
+            for (Camera.Size size : cameraSizes) {
+              sizes[i++] = new Size(size.width, size.height);
+            }
+            Size previewSize =
+                CameraConnectionFragment.chooseOptimalSize(
+                    sizes, desiredSize.getWidth(), desiredSize.getHeight());
+            parameters.setPreviewSize(previewSize.getWidth(), previewSize.getHeight());
+            camera.setDisplayOrientation(90);
+            camera.setParameters(parameters);
+            camera.setPreviewTexture(texture);
+          } catch (IOException exception) {
+            camera.release();
+          }
+
+          camera.setPreviewCallbackWithBuffer(imageListener);
+          Camera.Size s = camera.getParameters().getPreviewSize();
+          camera.addCallbackBuffer(new byte[ImageUtils.getYUVByteSize(s.height, s.width)]);
+
+          textureView.setAspectRatio(s.height, s.width);
+
+          camera.startPreview();
+        }
+
+        @Override
+        public void onSurfaceTextureSizeChanged(
+            final SurfaceTexture texture, final int width, final int height) {}
+
+        @Override
+        public boolean onSurfaceTextureDestroyed(final SurfaceTexture texture) {
+          return true;
+        }
+
+        @Override
+        public void onSurfaceTextureUpdated(final SurfaceTexture texture) {}
+      };
+
+  /**
+   * An {@link AutoFitTextureView} for camera preview.
+   */
+  private AutoFitTextureView textureView;
+
+  /**
+   * An additional thread for running tasks that shouldn't block the UI.
+   */
+  private HandlerThread backgroundThread;
+
+  @Override
+  public View onCreateView(
+      final LayoutInflater inflater, final ViewGroup container, final Bundle savedInstanceState) {
+    return inflater.inflate(layout, container, false);
+  }
+
+  @Override
+  public void onViewCreated(final View view, final Bundle savedInstanceState) {
+    textureView = (AutoFitTextureView) view.findViewById(R.id.texture);
+  }
+
+  @Override
+  public void onActivityCreated(final Bundle savedInstanceState) {
+    super.onActivityCreated(savedInstanceState);
+  }
+
+  @Override
+  public void onResume() {
+    super.onResume();
+    startBackgroundThread();
+    // When the screen is turned off and turned back on, the SurfaceTexture is already
+    // available, and "onSurfaceTextureAvailable" will not be called. In that case, we can open
+    // a camera and start preview from here (otherwise, we wait until the surface is ready in
+    // the SurfaceTextureListener).
+
+    if (textureView.isAvailable()) {
+      camera.startPreview();
+    } else {
+      textureView.setSurfaceTextureListener(surfaceTextureListener);
+    }
+  }
+
+  @Override
+  public void onPause() {
+    stopCamera();
+    stopBackgroundThread();
+    super.onPause();
+  }
+
+  /**
+   * Starts a background thread and its {@link Handler}.
+   */
+  private void startBackgroundThread() {
+    backgroundThread = new HandlerThread("CameraBackground");
+    backgroundThread.start();
+  }
+
+  /**
+   * Stops the background thread and its {@link Handler}.
+   */
+  private void stopBackgroundThread() {
+    backgroundThread.quitSafely();
+    try {
+      backgroundThread.join();
+      backgroundThread = null;
+    } catch (final InterruptedException e) {
+      LOGGER.e(e, "Exception!");
+    }
+  }
+
+  protected void stopCamera() {
+    if (camera != null) {
+      camera.stopPreview();
+      camera.setPreviewCallback(null);
+      camera.release();
+      camera = null;
+    }
+  }
+
+  private int getCameraId() {
+    CameraInfo ci = new CameraInfo();
+    for (int i = 0; i < Camera.getNumberOfCameras(); i++) {
+      Camera.getCameraInfo(i, ci);
+      if (ci.facing == CameraInfo.CAMERA_FACING_BACK)
+        return i;
+    }
+    return -1; // No camera found
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/OverlayView.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/OverlayView.java
new file mode 100644
index 0000000000000000000000000000000000000000..0f8d109fb46d769d0ada9c9daa6292a80470be8a
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/OverlayView.java
@@ -0,0 +1,52 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.demo;
+
+import android.content.Context;
+import android.graphics.Canvas;
+import android.util.AttributeSet;
+import android.view.View;
+import java.util.LinkedList;
+import java.util.List;
+
+/**
+ * A simple View providing a render callback to other classes.
+ */
+public class OverlayView extends View {
+  private final List<DrawCallback> callbacks = new LinkedList<DrawCallback>();
+
+  public OverlayView(final Context context, final AttributeSet attrs) {
+    super(context, attrs);
+  }
+
+  /**
+   * Interface defining the callback for client classes.
+   */
+  public interface DrawCallback {
+    public void drawCallback(final Canvas canvas);
+  }
+
+  public void addCallback(final DrawCallback callback) {
+    callbacks.add(callback);
+  }
+
+  @Override
+  public synchronized void draw(final Canvas canvas) {
+    for (final DrawCallback callback : callbacks) {
+      callback.drawCallback(canvas);
+    }
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/RecognitionScoreView.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/RecognitionScoreView.java
new file mode 100644
index 0000000000000000000000000000000000000000..31a4b07c8387bf0b1da9e967f37628d0ce642dc4
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/RecognitionScoreView.java
@@ -0,0 +1,67 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.demo;
+
+import android.content.Context;
+import android.graphics.Canvas;
+import android.graphics.Paint;
+import android.util.AttributeSet;
+import android.util.TypedValue;
+import android.view.View;
+import java.util.List;
+import org.tensorflow.demo.Classifier.Recognition;
+
+public class RecognitionScoreView extends View implements ResultsView {
+  private static final float TEXT_SIZE_DIP = 24;
+  private List<Recognition> results;
+  private final float textSizePx;
+  private final Paint fgPaint;
+  private final Paint bgPaint;
+
+  public RecognitionScoreView(final Context context, final AttributeSet set) {
+    super(context, set);
+
+    textSizePx =
+        TypedValue.applyDimension(
+            TypedValue.COMPLEX_UNIT_DIP, TEXT_SIZE_DIP, getResources().getDisplayMetrics());
+    fgPaint = new Paint();
+    fgPaint.setTextSize(textSizePx);
+
+    bgPaint = new Paint();
+    bgPaint.setColor(0xcc4285f4);
+  }
+
+  @Override
+  public void setResults(final List<Recognition> results) {
+    this.results = results;
+    postInvalidate();
+  }
+
+  @Override
+  public void onDraw(final Canvas canvas) {
+    final int x = 10;
+    int y = (int) (fgPaint.getTextSize() * 1.5f);
+
+    canvas.drawPaint(bgPaint);
+
+    if (results != null) {
+      for (final Recognition recog : results) {
+        canvas.drawText(recog.getTitle() + ": " + recog.getConfidence(), x, y, fgPaint);
+        y += (int) (fgPaint.getTextSize() * 1.5f);
+      }
+    }
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/RecognizeCommands.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/RecognizeCommands.java
new file mode 100644
index 0000000000000000000000000000000000000000..9e91aea7efc8c1aea00913ba863eb57f0692343a
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/RecognizeCommands.java
@@ -0,0 +1,186 @@
+/*
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.tensorflow.demo;
+
+import android.util.Log;
+import android.util.Pair;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Deque;
+import java.util.List;
+
+/** Reads in results from an instantaneous audio recognition model and smoothes them over time. */
+public class RecognizeCommands {
+  // Configuration settings.
+  private List<String> labels = new ArrayList<String>();
+  private long averageWindowDurationMs;
+  private float detectionThreshold;
+  private int suppressionMs;
+  private int minimumCount;
+  private long minimumTimeBetweenSamplesMs;
+
+  // Working variables.
+  private Deque<Pair<Long, float[]>> previousResults = new ArrayDeque<Pair<Long, float[]>>();
+  private String previousTopLabel;
+  private int labelsCount;
+  private long previousTopLabelTime;
+  private float previousTopLabelScore;
+
+  private static final String SILENCE_LABEL = "_silence_";
+  private static final long MINIMUM_TIME_FRACTION = 4;
+
+  public RecognizeCommands(
+      List<String> inLabels,
+      long inAverageWindowDurationMs,
+      float inDetectionThreshold,
+      int inSuppressionMS,
+      int inMinimumCount,
+      long inMinimumTimeBetweenSamplesMS) {
+    labels = inLabels;
+    averageWindowDurationMs = inAverageWindowDurationMs;
+    detectionThreshold = inDetectionThreshold;
+    suppressionMs = inSuppressionMS;
+    minimumCount = inMinimumCount;
+    labelsCount = inLabels.size();
+    previousTopLabel = SILENCE_LABEL;
+    previousTopLabelTime = Long.MIN_VALUE;
+    previousTopLabelScore = 0.0f;
+    minimumTimeBetweenSamplesMs = inMinimumTimeBetweenSamplesMS;
+  }
+
+  /** Holds information about what's been recognized. */
+  public static class RecognitionResult {
+    public final String foundCommand;
+    public final float score;
+    public final boolean isNewCommand;
+
+    public RecognitionResult(String inFoundCommand, float inScore, boolean inIsNewCommand) {
+      foundCommand = inFoundCommand;
+      score = inScore;
+      isNewCommand = inIsNewCommand;
+    }
+  }
+
+  private static class ScoreForSorting implements Comparable<ScoreForSorting> {
+    public final float score;
+    public final int index;
+
+    public ScoreForSorting(float inScore, int inIndex) {
+      score = inScore;
+      index = inIndex;
+    }
+
+    @Override
+    public int compareTo(ScoreForSorting other) {
+      if (this.score > other.score) {
+        return -1;
+      } else if (this.score < other.score) {
+        return 1;
+      } else {
+        return 0;
+      }
+    }
+  }
+
+  public RecognitionResult processLatestResults(float[] currentResults, long currentTimeMS) {
+    if (currentResults.length != labelsCount) {
+      throw new RuntimeException(
+          "The results for recognition should contain "
+              + labelsCount
+              + " elements, but there are "
+              + currentResults.length);
+    }
+
+    if ((!previousResults.isEmpty()) && (currentTimeMS < previousResults.getFirst().first)) {
+      throw new RuntimeException(
+          "You must feed results in increasing time order, but received a timestamp of "
+              + currentTimeMS
+              + " that was earlier than the previous one of "
+              + previousResults.getFirst().first);
+    }
+
+    final int howManyResults = previousResults.size();
+    // Ignore any results that are coming in too frequently.
+    if (howManyResults > 1) {
+      final long timeSinceMostRecent = currentTimeMS - previousResults.getLast().first;
+      if (timeSinceMostRecent < minimumTimeBetweenSamplesMs) {
+        return new RecognitionResult(previousTopLabel, previousTopLabelScore, false);
+      }
+    }
+
+    // Add the latest results to the head of the queue.
+    previousResults.addLast(new Pair<Long, float[]>(currentTimeMS, currentResults));
+
+    // Prune any earlier results that are too old for the averaging window.
+    final long timeLimit = currentTimeMS - averageWindowDurationMs;
+    while (previousResults.getFirst().first < timeLimit) {
+      previousResults.removeFirst();
+    }
+
+    // If there are too few results, assume the result will be unreliable and
+    // bail.
+    final long earliestTime = previousResults.getFirst().first;
+    final long samplesDuration = currentTimeMS - earliestTime;
+    if ((howManyResults < minimumCount)
+        || (samplesDuration < (averageWindowDurationMs / MINIMUM_TIME_FRACTION))) {
+      Log.v("RecognizeResult", "Too few results");
+      return new RecognitionResult(previousTopLabel, 0.0f, false);
+    }
+
+    // Calculate the average score across all the results in the window.
+    float[] averageScores = new float[labelsCount];
+    for (Pair<Long, float[]> previousResult : previousResults) {
+      final float[] scoresTensor = previousResult.second;
+      int i = 0;
+      while (i < scoresTensor.length) {
+        averageScores[i] += scoresTensor[i] / howManyResults;
+        ++i;
+      }
+    }
+
+    // Sort the averaged results in descending score order.
+    ScoreForSorting[] sortedAverageScores = new ScoreForSorting[labelsCount];
+    for (int i = 0; i < labelsCount; ++i) {
+      sortedAverageScores[i] = new ScoreForSorting(averageScores[i], i);
+    }
+    Arrays.sort(sortedAverageScores);
+
+    // See if the latest top score is enough to trigger a detection.
+    final int currentTopIndex = sortedAverageScores[0].index;
+    final String currentTopLabel = labels.get(currentTopIndex);
+    final float currentTopScore = sortedAverageScores[0].score;
+    // If we've recently had another label trigger, assume one that occurs too
+    // soon afterwards is a bad result.
+    long timeSinceLastTop;
+    if (previousTopLabel.equals(SILENCE_LABEL) || (previousTopLabelTime == Long.MIN_VALUE)) {
+      timeSinceLastTop = Long.MAX_VALUE;
+    } else {
+      timeSinceLastTop = currentTimeMS - previousTopLabelTime;
+    }
+    boolean isNewCommand;
+    if ((currentTopScore > detectionThreshold) && (timeSinceLastTop > suppressionMs)) {
+      previousTopLabel = currentTopLabel;
+      previousTopLabelTime = currentTimeMS;
+      previousTopLabelScore = currentTopScore;
+      isNewCommand = true;
+    } else {
+      isNewCommand = false;
+    }
+    return new RecognitionResult(currentTopLabel, currentTopScore, isNewCommand);
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/ResultsView.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/ResultsView.java
new file mode 100644
index 0000000000000000000000000000000000000000..211d7e66fb20ce00e4e91ecc9134fbf2852e9f3d
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/ResultsView.java
@@ -0,0 +1,23 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.demo;
+
+import java.util.List;
+import org.tensorflow.demo.Classifier.Recognition;
+
+public interface ResultsView {
+  public void setResults(final List<Recognition> results);
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/SpeechActivity.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/SpeechActivity.java
new file mode 100644
index 0000000000000000000000000000000000000000..9c9c30bc0985e529b46c322fd0ff02590967afa2
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/SpeechActivity.java
@@ -0,0 +1,381 @@
+/*
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Demonstrates how to run an audio recognition model in Android.
+
+This example loads a simple speech recognition model trained by the tutorial at
+https://www.tensorflow.org/tutorials/audio_training
+
+The model files should be downloaded automatically from the TensorFlow website,
+but if you have a custom model you can update the LABEL_FILENAME and
+MODEL_FILENAME constants to point to your own files.
+
+The example application displays a list view with all of the known audio labels,
+and highlights each one when it thinks it has detected one through the
+microphone. The averaging of results to give a more reliable signal happens in
+the RecognizeCommands helper class.
+*/
+
+package org.tensorflow.demo;
+
+import android.animation.ValueAnimator;
+import android.app.Activity;
+import android.content.pm.PackageManager;
+import android.content.res.AssetFileDescriptor;
+import android.content.res.AssetManager;
+import android.media.AudioFormat;
+import android.media.AudioRecord;
+import android.media.MediaRecorder;
+import android.os.Build;
+import android.os.Bundle;
+import android.util.Log;
+import android.view.View;
+import android.widget.ArrayAdapter;
+import android.widget.Button;
+import android.widget.ListView;
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.locks.ReentrantLock;
+import org.tensorflow.lite.Interpreter;
+import org.tensorflow.lite.demo.R; // Explicit import needed for internal Google builds.
+
+/**
+ * An activity that listens for audio and then uses a TensorFlow model to detect particular classes,
+ * by default a small set of action words.
+ */
+public class SpeechActivity extends Activity {
+
+  // Constants that control the behavior of the recognition code and model
+  // settings. See the audio recognition tutorial for a detailed explanation of
+  // all these, but you should customize them to match your training settings if
+  // you are running your own model.
+  private static final int SAMPLE_RATE = 16000;
+  private static final int SAMPLE_DURATION_MS = 1000;
+  private static final int RECORDING_LENGTH = (int) (SAMPLE_RATE * SAMPLE_DURATION_MS / 1000);
+  private static final long AVERAGE_WINDOW_DURATION_MS = 500;
+  private static final float DETECTION_THRESHOLD = 0.70f;
+  private static final int SUPPRESSION_MS = 1500;
+  private static final int MINIMUM_COUNT = 3;
+  private static final long MINIMUM_TIME_BETWEEN_SAMPLES_MS = 30;
+  private static final String LABEL_FILENAME = "file:///android_asset/conv_actions_labels.txt";
+  private static final String MODEL_FILENAME = "file:///android_asset/conv_actions_frozen.tflite";
+
+  // UI elements.
+  private static final int REQUEST_RECORD_AUDIO = 13;
+  private Button quitButton;
+  private ListView labelsListView;
+  private static final String LOG_TAG = SpeechActivity.class.getSimpleName();
+
+  // Working variables.
+  short[] recordingBuffer = new short[RECORDING_LENGTH];
+  int recordingOffset = 0;
+  boolean shouldContinue = true;
+  private Thread recordingThread;
+  boolean shouldContinueRecognition = true;
+  private Thread recognitionThread;
+  private final ReentrantLock recordingBufferLock = new ReentrantLock();
+
+  private List<String> labels = new ArrayList<String>();
+  private List<String> displayedLabels = new ArrayList<>();
+  private RecognizeCommands recognizeCommands = null;
+
+  private Interpreter tfLite;
+
+  /** Memory-map the model file in Assets. */
+  private static MappedByteBuffer loadModelFile(AssetManager assets, String modelFilename)
+      throws IOException {
+    AssetFileDescriptor fileDescriptor = assets.openFd(modelFilename);
+    FileInputStream inputStream = new FileInputStream(fileDescriptor.getFileDescriptor());
+    FileChannel fileChannel = inputStream.getChannel();
+    long startOffset = fileDescriptor.getStartOffset();
+    long declaredLength = fileDescriptor.getDeclaredLength();
+    return fileChannel.map(FileChannel.MapMode.READ_ONLY, startOffset, declaredLength);
+  }
+
+  @Override
+  protected void onCreate(Bundle savedInstanceState) {
+    // Set up the UI.
+    super.onCreate(savedInstanceState);
+    setContentView(R.layout.activity_speech);
+    quitButton = (Button) findViewById(R.id.quit);
+    quitButton.setOnClickListener(
+        new View.OnClickListener() {
+          @Override
+          public void onClick(View view) {
+            moveTaskToBack(true);
+            android.os.Process.killProcess(android.os.Process.myPid());
+            System.exit(1);
+          }
+        });
+    labelsListView = (ListView) findViewById(R.id.list_view);
+
+    // Load the labels for the model, but only display those that don't start
+    // with an underscore.
+    String actualLabelFilename = LABEL_FILENAME.split("file:///android_asset/", -1)[1];
+    Log.i(LOG_TAG, "Reading labels from: " + actualLabelFilename);
+    BufferedReader br = null;
+    try {
+      br = new BufferedReader(new InputStreamReader(getAssets().open(actualLabelFilename)));
+      String line;
+      while ((line = br.readLine()) != null) {
+        labels.add(line);
+        if (line.charAt(0) != '_') {
+          displayedLabels.add(line.substring(0, 1).toUpperCase() + line.substring(1));
+        }
+      }
+      br.close();
+    } catch (IOException e) {
+      throw new RuntimeException("Problem reading label file!", e);
+    }
+
+    // Build a list view based on these labels.
+    ArrayAdapter<String> arrayAdapter =
+        new ArrayAdapter<String>(this, R.layout.list_text_item, displayedLabels);
+    labelsListView.setAdapter(arrayAdapter);
+
+    // Set up an object to smooth recognition results to increase accuracy.
+    recognizeCommands =
+        new RecognizeCommands(
+            labels,
+            AVERAGE_WINDOW_DURATION_MS,
+            DETECTION_THRESHOLD,
+            SUPPRESSION_MS,
+            MINIMUM_COUNT,
+            MINIMUM_TIME_BETWEEN_SAMPLES_MS);
+
+    String actualModelFilename = MODEL_FILENAME.split("file:///android_asset/", -1)[1];
+    try {
+      tfLite = new Interpreter(loadModelFile(getAssets(), actualModelFilename));
+    } catch (Exception e) {
+      throw new RuntimeException(e);
+    }
+
+    tfLite.resizeInput(0, new int[] {RECORDING_LENGTH, 1});
+    tfLite.resizeInput(1, new int[] {1});
+
+    // Start the recording and recognition threads.
+    requestMicrophonePermission();
+    startRecording();
+    startRecognition();
+  }
+
+  private void requestMicrophonePermission() {
+    if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.M) {
+      requestPermissions(
+          new String[]{android.Manifest.permission.RECORD_AUDIO}, REQUEST_RECORD_AUDIO);
+    }
+  }
+
+  @Override
+  public void onRequestPermissionsResult(
+      int requestCode, String[] permissions, int[] grantResults) {
+    if (requestCode == REQUEST_RECORD_AUDIO
+        && grantResults.length > 0
+        && grantResults[0] == PackageManager.PERMISSION_GRANTED) {
+      startRecording();
+      startRecognition();
+    }
+  }
+
+  public synchronized void startRecording() {
+    if (recordingThread != null) {
+      return;
+    }
+    shouldContinue = true;
+    recordingThread =
+        new Thread(
+            new Runnable() {
+              @Override
+              public void run() {
+                record();
+              }
+            });
+    recordingThread.start();
+  }
+
+  public synchronized void stopRecording() {
+    if (recordingThread == null) {
+      return;
+    }
+    shouldContinue = false;
+    recordingThread = null;
+  }
+
+  private void record() {
+    android.os.Process.setThreadPriority(android.os.Process.THREAD_PRIORITY_AUDIO);
+
+    // Estimate the buffer size we'll need for this device.
+    int bufferSize =
+        AudioRecord.getMinBufferSize(
+            SAMPLE_RATE, AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT);
+    if (bufferSize == AudioRecord.ERROR || bufferSize == AudioRecord.ERROR_BAD_VALUE) {
+      bufferSize = SAMPLE_RATE * 2;
+    }
+    short[] audioBuffer = new short[bufferSize / 2];
+
+    AudioRecord record =
+        new AudioRecord(
+            MediaRecorder.AudioSource.DEFAULT,
+            SAMPLE_RATE,
+            AudioFormat.CHANNEL_IN_MONO,
+            AudioFormat.ENCODING_PCM_16BIT,
+            bufferSize);
+
+    if (record.getState() != AudioRecord.STATE_INITIALIZED) {
+      Log.e(LOG_TAG, "Audio Record can't initialize!");
+      return;
+    }
+
+    record.startRecording();
+
+    Log.v(LOG_TAG, "Start recording");
+
+    // Loop, gathering audio data and copying it to a round-robin buffer.
+    while (shouldContinue) {
+      int numberRead = record.read(audioBuffer, 0, audioBuffer.length);
+      int maxLength = recordingBuffer.length;
+      int newRecordingOffset = recordingOffset + numberRead;
+      int secondCopyLength = Math.max(0, newRecordingOffset - maxLength);
+      int firstCopyLength = numberRead - secondCopyLength;
+      // We store off all the data for the recognition thread to access. The ML
+      // thread will copy out of this buffer into its own, while holding the
+      // lock, so this should be thread safe.
+      recordingBufferLock.lock();
+      try {
+        System.arraycopy(audioBuffer, 0, recordingBuffer, recordingOffset, firstCopyLength);
+        System.arraycopy(audioBuffer, firstCopyLength, recordingBuffer, 0, secondCopyLength);
+        recordingOffset = newRecordingOffset % maxLength;
+      } finally {
+        recordingBufferLock.unlock();
+      }
+    }
+
+    record.stop();
+    record.release();
+  }
+
+  public synchronized void startRecognition() {
+    if (recognitionThread != null) {
+      return;
+    }
+    shouldContinueRecognition = true;
+    recognitionThread =
+        new Thread(
+            new Runnable() {
+              @Override
+              public void run() {
+                recognize();
+              }
+            });
+    recognitionThread.start();
+  }
+
+  public synchronized void stopRecognition() {
+    if (recognitionThread == null) {
+      return;
+    }
+    shouldContinueRecognition = false;
+    recognitionThread = null;
+  }
+
+  private void recognize() {
+    Log.v(LOG_TAG, "Start recognition");
+
+    short[] inputBuffer = new short[RECORDING_LENGTH];
+    float[][] floatInputBuffer = new float[RECORDING_LENGTH][1];
+    float[][] outputScores = new float[1][labels.size()];
+    int[] sampleRateList = new int[] {SAMPLE_RATE};
+
+    // Loop, grabbing recorded data and running the recognition model on it.
+    while (shouldContinueRecognition) {
+      // The recording thread places data in this round-robin buffer, so lock to
+      // make sure there's no writing happening and then copy it to our own
+      // local version.
+      recordingBufferLock.lock();
+      try {
+        int maxLength = recordingBuffer.length;
+        int firstCopyLength = maxLength - recordingOffset;
+        int secondCopyLength = recordingOffset;
+        System.arraycopy(recordingBuffer, recordingOffset, inputBuffer, 0, firstCopyLength);
+        System.arraycopy(recordingBuffer, 0, inputBuffer, firstCopyLength, secondCopyLength);
+      } finally {
+        recordingBufferLock.unlock();
+      }
+
+      // We need to feed in float values between -1.0f and 1.0f, so divide the
+      // signed 16-bit inputs.
+      for (int i = 0; i < RECORDING_LENGTH; ++i) {
+        floatInputBuffer[i][0] = inputBuffer[i] / 32767.0f;
+      }
+
+      Object[] inputArray = {floatInputBuffer, sampleRateList};
+      Map<Integer, Object> outputMap = new HashMap<>();
+      outputMap.put(0, outputScores);
+
+      // Run the model.
+      tfLite.runForMultipleInputsOutputs(inputArray, outputMap);
+
+      // Use the smoother to figure out if we've had a real recognition event.
+      long currentTime = System.currentTimeMillis();
+      final RecognizeCommands.RecognitionResult result =
+          recognizeCommands.processLatestResults(outputScores[0], currentTime);
+
+      runOnUiThread(
+          new Runnable() {
+            @Override
+            public void run() {
+              // If we do have a new command, highlight the right list entry.
+              if (!result.foundCommand.startsWith("_") && result.isNewCommand) {
+                int labelIndex = -1;
+                for (int i = 0; i < labels.size(); ++i) {
+                  if (labels.get(i).equals(result.foundCommand)) {
+                    labelIndex = i;
+                  }
+                }
+                final View labelView = (View) labelsListView.getChildAt(labelIndex - 2);
+                ValueAnimator colorAnimation =
+                    ValueAnimator.ofArgb(0x00b3ccff, 0xffb3ccff, 0x00b3ccff);
+                colorAnimation.setDuration(750);
+                colorAnimation.addUpdateListener(
+                    new ValueAnimator.AnimatorUpdateListener() {
+                      @Override
+                      public void onAnimationUpdate(ValueAnimator animator) {
+                        labelView.setBackgroundColor((int) animator.getAnimatedValue());
+                      }
+                    });
+                colorAnimation.start();
+              }
+            }
+          });
+      try {
+        // We don't need to run too frequently, so snooze for a bit.
+        Thread.sleep(MINIMUM_TIME_BETWEEN_SAMPLES_MS);
+      } catch (InterruptedException e) {
+        // Ignore
+      }
+    }
+
+    Log.v(LOG_TAG, "End recognition");
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/TFLiteImageClassifier.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/TFLiteImageClassifier.java
new file mode 100644
index 0000000000000000000000000000000000000000..d75c3ceadabd2dad73b1e5feda3ae88181769e74
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/TFLiteImageClassifier.java
@@ -0,0 +1,209 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.demo;
+
+import android.content.res.AssetFileDescriptor;
+import android.content.res.AssetManager;
+import android.graphics.Bitmap;
+import android.os.SystemClock;
+import android.os.Trace;
+import android.util.Log;
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+import java.util.PriorityQueue;
+import java.util.Vector;
+import org.tensorflow.lite.Interpreter;
+
+/** A classifier specialized to label images using TensorFlow. */
+public class TFLiteImageClassifier implements Classifier {
+  private static final String TAG = "TFLiteImageClassifier";
+
+  // Only return this many results with at least this confidence.
+  private static final int MAX_RESULTS = 3;
+
+  private Interpreter tfLite;
+
+  /** Dimensions of inputs. */
+  private static final int DIM_BATCH_SIZE = 1;
+
+  private static final int DIM_PIXEL_SIZE = 3;
+
+  private static final int DIM_IMG_SIZE_X = 224;
+  private static final int DIM_IMG_SIZE_Y = 224;
+
+  byte[][] labelProb;
+
+  // Pre-allocated buffers.
+  private Vector<String> labels = new Vector<String>();
+  private int[] intValues;
+  private ByteBuffer imgData = null;
+
+  private TFLiteImageClassifier() {}
+
+  /** Memory-map the model file in Assets. */
+  private static MappedByteBuffer loadModelFile(AssetManager assets, String modelFilename)
+      throws IOException {
+    AssetFileDescriptor fileDescriptor = assets.openFd(modelFilename);
+    FileInputStream inputStream = new FileInputStream(fileDescriptor.getFileDescriptor());
+    FileChannel fileChannel = inputStream.getChannel();
+    long startOffset = fileDescriptor.getStartOffset();
+    long declaredLength = fileDescriptor.getDeclaredLength();
+    return fileChannel.map(FileChannel.MapMode.READ_ONLY, startOffset, declaredLength);
+  }
+
+  /**
+   * Initializes a native TensorFlow session for classifying images.
+   *
+   * @param assetManager The asset manager to be used to load assets.
+   * @param modelFilename The filepath of the model GraphDef protocol buffer.
+   * @param labelFilename The filepath of label file for classes.
+   * @param inputSize The input size. A square image of inputSize x inputSize is assumed.
+   * @throws IOException
+   */
+  public static Classifier create(
+      AssetManager assetManager, String modelFilename, String labelFilename, int inputSize) {
+    TFLiteImageClassifier c = new TFLiteImageClassifier();
+
+    // Read the label names into memory.
+    // TODO(andrewharp): make this handle non-assets.
+    Log.i(TAG, "Reading labels from: " + labelFilename);
+    BufferedReader br = null;
+    try {
+      br = new BufferedReader(new InputStreamReader(assetManager.open(labelFilename)));
+      String line;
+      while ((line = br.readLine()) != null) {
+        c.labels.add(line);
+      }
+      br.close();
+    } catch (IOException e) {
+      throw new RuntimeException("Problem reading label file!" , e);
+    }
+
+    c.imgData =
+        ByteBuffer.allocateDirect(
+            DIM_BATCH_SIZE * DIM_IMG_SIZE_X * DIM_IMG_SIZE_Y * DIM_PIXEL_SIZE);
+
+    c.imgData.order(ByteOrder.nativeOrder());
+    try {
+      c.tfLite = new Interpreter(loadModelFile(assetManager, modelFilename));
+    } catch (Exception e) {
+      throw new RuntimeException(e);
+    }
+
+    // The shape of the output is [N, NUM_CLASSES], where N is the batch size.
+    Log.i(TAG, "Read " + c.labels.size() + " labels");
+
+    // Pre-allocate buffers.
+    c.intValues = new int[inputSize * inputSize];
+
+    c.labelProb = new byte[1][c.labels.size()];
+
+    return c;
+  }
+
+  /** Writes Image data into a {@code ByteBuffer}. */
+  private void convertBitmapToByteBuffer(Bitmap bitmap) {
+    if (imgData == null) {
+      return;
+    }
+    imgData.rewind();
+    bitmap.getPixels(intValues, 0, bitmap.getWidth(), 0, 0, bitmap.getWidth(), bitmap.getHeight());
+    // Convert the image to floating point.
+    int pixel = 0;
+    long startTime = SystemClock.uptimeMillis();
+    for (int i = 0; i < DIM_IMG_SIZE_X; ++i) {
+      for (int j = 0; j < DIM_IMG_SIZE_Y; ++j) {
+        final int val = intValues[pixel++];
+        imgData.put((byte) ((val >> 16) & 0xFF));
+        imgData.put((byte) ((val >> 8) & 0xFF));
+        imgData.put((byte) (val & 0xFF));
+      }
+    }
+    long endTime = SystemClock.uptimeMillis();
+    Log.d(TAG, "Timecost to put values into ByteBuffer: " + Long.toString(endTime - startTime));
+  }
+
+  @Override
+  public List<Recognition> recognizeImage(final Bitmap bitmap) {
+    // Log this method so that it can be analyzed with systrace.
+    Trace.beginSection("recognizeImage");
+
+    Trace.beginSection("preprocessBitmap");
+
+    long startTime;
+    long endTime;
+    startTime = SystemClock.uptimeMillis();
+
+    convertBitmapToByteBuffer(bitmap);
+
+    // Run the inference call.
+    Trace.beginSection("run");
+    startTime = SystemClock.uptimeMillis();
+    tfLite.run(imgData, labelProb);
+    endTime = SystemClock.uptimeMillis();
+    Log.i(TAG, "Inf time: " + (endTime - startTime));
+    Trace.endSection();
+
+    // Find the best classifications.
+    PriorityQueue<Recognition> pq =
+        new PriorityQueue<Recognition>(
+            3,
+            new Comparator<Recognition>() {
+              @Override
+              public int compare(Recognition lhs, Recognition rhs) {
+                // Intentionally reversed to put high confidence at the head of the queue.
+                return Float.compare(rhs.getConfidence(), lhs.getConfidence());
+              }
+            });
+    for (int i = 0; i < labels.size(); ++i) {
+      pq.add(
+          new Recognition(
+              "" + i,
+              labels.size() > i ? labels.get(i) : "unknown",
+              (float) labelProb[0][i],
+              null));
+    }
+    final ArrayList<Recognition> recognitions = new ArrayList<Recognition>();
+    int recognitionsSize = Math.min(pq.size(), MAX_RESULTS);
+    for (int i = 0; i < recognitionsSize; ++i) {
+      recognitions.add(pq.poll());
+    }
+    Trace.endSection(); // "recognizeImage"
+    return recognitions;
+  }
+
+  @Override
+  public void enableStatLogging(boolean logStats) {
+  }
+
+  @Override
+  public String getStatString() {
+    return "";
+  }
+
+  @Override
+  public void close() {
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/TFLiteObjectDetectionAPIModel.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/TFLiteObjectDetectionAPIModel.java
new file mode 100644
index 0000000000000000000000000000000000000000..bfb4a0a04bc90566736864bf62340d1032961858
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/TFLiteObjectDetectionAPIModel.java
@@ -0,0 +1,292 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.demo;
+
+import android.content.res.AssetFileDescriptor;
+import android.content.res.AssetManager;
+import android.graphics.Bitmap;
+import android.graphics.RectF;
+import android.os.Trace;
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.PriorityQueue;
+import java.util.StringTokenizer;
+import java.util.Vector;
+import org.tensorflow.demo.env.Logger;
+import org.tensorflow.lite.Interpreter;
+
+/**
+ * Wrapper for frozen detection models trained using the Tensorflow Object Detection API:
+ * github.com/tensorflow/models/tree/master/research/object_detection
+ */
+public class TFLiteObjectDetectionAPIModel implements Classifier {
+  private static final Logger LOGGER = new Logger();
+
+  // Only return this many results.
+  private static final int NUM_RESULTS = 1917;
+  private static final int NUM_CLASSES = 91;
+
+  private static final float Y_SCALE = 10.0f;
+  private static final float X_SCALE = 10.0f;
+  private static final float H_SCALE = 5.0f;
+  private static final float W_SCALE = 5.0f;
+
+  // Config values.
+  private int inputSize;
+
+  private final float[][] boxPriors = new float[4][NUM_RESULTS];
+
+  // Pre-allocated buffers.
+  private Vector<String> labels = new Vector<String>();
+  private int[] intValues;
+  private float[][][] outputLocations;
+  private float[][][] outputClasses;
+
+  float[][][][] img;
+
+  private Interpreter tfLite;
+
+  private float expit(final float x) {
+    return (float) (1. / (1. + Math.exp(-x)));
+  }
+
+  /** Memory-map the model file in Assets. */
+  private static MappedByteBuffer loadModelFile(AssetManager assets, String modelFilename)
+      throws IOException {
+    AssetFileDescriptor fileDescriptor = assets.openFd(modelFilename);
+    FileInputStream inputStream = new FileInputStream(fileDescriptor.getFileDescriptor());
+    FileChannel fileChannel = inputStream.getChannel();
+    long startOffset = fileDescriptor.getStartOffset();
+    long declaredLength = fileDescriptor.getDeclaredLength();
+    return fileChannel.map(FileChannel.MapMode.READ_ONLY, startOffset, declaredLength);
+  }
+
+  private void loadCoderOptions(
+      final AssetManager assetManager, final String locationFilename, final float[][] boxPriors)
+      throws IOException {
+    // Try to be intelligent about opening from assets or sdcard depending on prefix.
+    final String assetPrefix = "file:///android_asset/";
+    InputStream is;
+    if (locationFilename.startsWith(assetPrefix)) {
+      is = assetManager.open(locationFilename.split(assetPrefix, -1)[1]);
+    } else {
+      is = new FileInputStream(locationFilename);
+    }
+
+    final BufferedReader reader = new BufferedReader(new InputStreamReader(is));
+
+    for (int lineNum = 0; lineNum < 4; ++lineNum) {
+      String line = reader.readLine();
+      final StringTokenizer st = new StringTokenizer(line, ", ");
+      int priorIndex = 0;
+      while (st.hasMoreTokens()) {
+        final String token = st.nextToken();
+        try {
+          final float number = Float.parseFloat(token);
+          boxPriors[lineNum][priorIndex++] = number;
+        } catch (final NumberFormatException e) {
+          // Silently ignore.
+        }
+      }
+      if (priorIndex != NUM_RESULTS) {
+        throw new RuntimeException(
+            "BoxPrior length mismatch: " + priorIndex + " vs " + NUM_RESULTS);
+      }
+    }
+
+    LOGGER.i("Loaded box priors!");
+  }
+
+  void decodeCenterSizeBoxes(float[][][] predictions) {
+    for (int i = 0; i < NUM_RESULTS; ++i) {
+      float ycenter = predictions[0][i][0] / Y_SCALE * boxPriors[2][i] + boxPriors[0][i];
+      float xcenter = predictions[0][i][1] / X_SCALE * boxPriors[3][i] + boxPriors[1][i];
+      float h = (float) Math.exp(predictions[0][i][2] / H_SCALE) * boxPriors[2][i];
+      float w = (float) Math.exp(predictions[0][i][3] / W_SCALE) * boxPriors[3][i];
+
+      float ymin = ycenter - h / 2.f;
+      float xmin = xcenter - w / 2.f;
+      float ymax = ycenter + h / 2.f;
+      float xmax = xcenter + w / 2.f;
+
+      predictions[0][i][0] = ymin;
+      predictions[0][i][1] = xmin;
+      predictions[0][i][2] = ymax;
+      predictions[0][i][3] = xmax;
+    }
+  }
+
+  /**
+   * Initializes a native TensorFlow session for classifying images.
+   *
+   * @param assetManager The asset manager to be used to load assets.
+   * @param modelFilename The filepath of the model GraphDef protocol buffer.
+   * @param labelFilename The filepath of label file for classes.
+   */
+  public static Classifier create(
+      final AssetManager assetManager,
+      final String modelFilename,
+      final String labelFilename,
+      final int inputSize) throws IOException {
+    final TFLiteObjectDetectionAPIModel d = new TFLiteObjectDetectionAPIModel();
+
+    d.loadCoderOptions(assetManager, "file:///android_asset/box_priors.txt", d.boxPriors);
+
+    InputStream labelsInput = null;
+    String actualFilename = labelFilename.split("file:///android_asset/")[1];
+    labelsInput = assetManager.open(actualFilename);
+    BufferedReader br = null;
+    br = new BufferedReader(new InputStreamReader(labelsInput));
+    String line;
+    while ((line = br.readLine()) != null) {
+      LOGGER.w(line);
+      d.labels.add(line);
+    }
+    br.close();
+
+    d.inputSize = inputSize;
+
+    try {
+      d.tfLite = new Interpreter(loadModelFile(assetManager, modelFilename));
+    } catch (Exception e) {
+      throw new RuntimeException(e);
+    }
+
+    // Pre-allocate buffers.
+    d.img = new float[1][inputSize][inputSize][3];
+
+    d.intValues = new int[d.inputSize * d.inputSize];
+    d.outputLocations = new float[1][NUM_RESULTS][4];
+    d.outputClasses = new float[1][NUM_RESULTS][NUM_CLASSES];
+    return d;
+  }
+
+  private TFLiteObjectDetectionAPIModel() {}
+
+  @Override
+  public List<Recognition> recognizeImage(final Bitmap bitmap) {
+    // Log this method so that it can be analyzed with systrace.
+    Trace.beginSection("recognizeImage");
+
+    Trace.beginSection("preprocessBitmap");
+    // Preprocess the image data from 0-255 int to normalized float based
+    // on the provided parameters.
+    bitmap.getPixels(intValues, 0, bitmap.getWidth(), 0, 0, bitmap.getWidth(), bitmap.getHeight());
+
+    for (int i = 0; i < inputSize; ++i) {
+      for (int j = 0; j < inputSize; ++j) {
+        int pixel = intValues[j * inputSize + i];
+        img[0][j][i][2] = (float) (pixel & 0xFF) / 128.0f - 1.0f;
+        img[0][j][i][1] = (float) ((pixel >> 8) & 0xFF) / 128.0f - 1.0f;
+        img[0][j][i][0] = (float) ((pixel >> 16) & 0xFF) / 128.0f - 1.0f;
+      }
+    }
+    Trace.endSection(); // preprocessBitmap
+
+    // Copy the input data into TensorFlow.
+    Trace.beginSection("feed");
+    outputLocations = new float[1][NUM_RESULTS][4];
+    outputClasses = new float[1][NUM_RESULTS][NUM_CLASSES];
+
+    Object[] inputArray = {img};
+    Map<Integer, Object> outputMap = new HashMap<>();
+    outputMap.put(0, outputLocations);
+    outputMap.put(1, outputClasses);
+    Trace.endSection();
+
+    // Run the inference call.
+    Trace.beginSection("run");
+    tfLite.runForMultipleInputsOutputs(inputArray, outputMap);
+    Trace.endSection();
+
+    decodeCenterSizeBoxes(outputLocations);
+
+    // Find the best detections.
+    final PriorityQueue<Recognition> pq =
+        new PriorityQueue<Recognition>(
+            1,
+            new Comparator<Recognition>() {
+              @Override
+              public int compare(final Recognition lhs, final Recognition rhs) {
+                // Intentionally reversed to put high confidence at the head of the queue.
+                return Float.compare(rhs.getConfidence(), lhs.getConfidence());
+              }
+            });
+
+    // Scale them back to the input size.
+    for (int i = 0; i < NUM_RESULTS; ++i) {
+      float topClassScore = -1000f;
+      int topClassScoreIndex = -1;
+
+      // Skip the first catch-all class.
+      for (int j = 1; j < NUM_CLASSES; ++j) {
+        float score = expit(outputClasses[0][i][j]);
+
+        if (score > topClassScore) {
+          topClassScoreIndex = j;
+          topClassScore = score;
+        }
+      }
+
+      if (topClassScore > 0.001f) {
+        final RectF detection =
+            new RectF(
+                outputLocations[0][i][1] * inputSize,
+                outputLocations[0][i][0] * inputSize,
+                outputLocations[0][i][3] * inputSize,
+                outputLocations[0][i][2] * inputSize);
+
+        pq.add(
+            new Recognition(
+                "" + i,
+                labels.get(topClassScoreIndex),
+                outputClasses[0][i][topClassScoreIndex],
+                detection));
+      }
+    }
+
+    final ArrayList<Recognition> recognitions = new ArrayList<Recognition>();
+    for (int i = 0; i < Math.min(pq.size(), 10); ++i) {
+      Recognition recog = pq.poll();
+      recognitions.add(recog);
+    }
+    Trace.endSection(); // "recognizeImage"
+    return recognitions;
+  }
+
+  @Override
+  public void enableStatLogging(final boolean logStats) {
+  }
+
+  @Override
+  public String getStatString() {
+    return "";
+  }
+
+  @Override
+  public void close() {
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/AssetUtils.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/AssetUtils.java
new file mode 100644
index 0000000000000000000000000000000000000000..c50efdf889145ad717445015fb94a37568939b73
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/AssetUtils.java
@@ -0,0 +1,78 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.demo.env;
+
+import android.content.Context;
+import android.content.res.AssetManager;
+import android.util.Log;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+
+/** Utilities for dealing with assets. */
+public class AssetUtils {
+
+  private static final String TAG = AssetUtils.class.getSimpleName();
+
+  private static final int BYTE_BUF_SIZE = 2048;
+
+  /**
+   * Copies a file from assets.
+   *
+   * @param context application context used to discover assets.
+   * @param assetName the relative file name within assets.
+   * @param targetName the target file name, always over write the existing file.
+   * @throws IOException if operation fails.
+   */
+  public static void copy(Context context, String assetName, String targetName) throws IOException {
+
+    Log.d(TAG, "creating file " + targetName + " from " + assetName);
+
+    File targetFile = null;
+    InputStream inputStream = null;
+    FileOutputStream outputStream = null;
+
+    try {
+      AssetManager assets = context.getAssets();
+      targetFile = new File(targetName);
+      inputStream = assets.open(assetName);
+      // TODO(kanlig): refactor log messages to make them more useful.
+      Log.d(TAG, "Creating outputstream");
+      outputStream = new FileOutputStream(targetFile, false /* append */);
+      copy(inputStream, outputStream);
+    } finally {
+      if (outputStream != null) {
+        outputStream.close();
+      }
+      if (inputStream != null) {
+        inputStream.close();
+      }
+    }
+  }
+
+  private static void copy(InputStream from, OutputStream to) throws IOException {
+    byte[] buf = new byte[BYTE_BUF_SIZE];
+    while (true) {
+      int r = from.read(buf);
+      if (r == -1) {
+        break;
+      }
+      to.write(buf, 0, r);
+    }
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/BorderedText.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/BorderedText.java
new file mode 100644
index 0000000000000000000000000000000000000000..decfc3d8793d127800feb5d58cdaf3f84512d840
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/BorderedText.java
@@ -0,0 +1,117 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.demo.env;
+
+import android.graphics.Canvas;
+import android.graphics.Color;
+import android.graphics.Paint;
+import android.graphics.Paint.Align;
+import android.graphics.Paint.Style;
+import android.graphics.Rect;
+import android.graphics.Typeface;
+import java.util.Vector;
+
+/**
+ * A class that encapsulates the tedious bits of rendering legible, bordered text onto a canvas.
+ */
+public class BorderedText {
+  private final Paint interiorPaint;
+  private final Paint exteriorPaint;
+
+  private final float textSize;
+
+  /**
+   * Creates a left-aligned bordered text object with a white interior, and a black exterior with
+   * the specified text size.
+   *
+   * @param textSize text size in pixels
+   */
+  public BorderedText(final float textSize) {
+    this(Color.WHITE, Color.BLACK, textSize);
+  }
+
+  /**
+   * Create a bordered text object with the specified interior and exterior colors, text size and
+   * alignment.
+   *
+   * @param interiorColor the interior text color
+   * @param exteriorColor the exterior text color
+   * @param textSize text size in pixels
+   */
+  public BorderedText(final int interiorColor, final int exteriorColor, final float textSize) {
+    interiorPaint = new Paint();
+    interiorPaint.setTextSize(textSize);
+    interiorPaint.setColor(interiorColor);
+    interiorPaint.setStyle(Style.FILL);
+    interiorPaint.setAntiAlias(false);
+    interiorPaint.setAlpha(255);
+
+    exteriorPaint = new Paint();
+    exteriorPaint.setTextSize(textSize);
+    exteriorPaint.setColor(exteriorColor);
+    exteriorPaint.setStyle(Style.FILL_AND_STROKE);
+    exteriorPaint.setStrokeWidth(textSize / 8);
+    exteriorPaint.setAntiAlias(false);
+    exteriorPaint.setAlpha(255);
+
+    this.textSize = textSize;
+  }
+
+  public void setTypeface(Typeface typeface) {
+    interiorPaint.setTypeface(typeface);
+    exteriorPaint.setTypeface(typeface);
+  }
+
+  public void drawText(final Canvas canvas, final float posX, final float posY, final String text) {
+    canvas.drawText(text, posX, posY, exteriorPaint);
+    canvas.drawText(text, posX, posY, interiorPaint);
+  }
+
+  public void drawLines(Canvas canvas, final float posX, final float posY, Vector<String> lines) {
+    int lineNum = 0;
+    for (final String line : lines) {
+      drawText(canvas, posX, posY - getTextSize() * (lines.size() - lineNum - 1), line);
+      ++lineNum;
+    }
+  }
+
+  public void setInteriorColor(final int color) {
+    interiorPaint.setColor(color);
+  }
+
+  public void setExteriorColor(final int color) {
+    exteriorPaint.setColor(color);
+  }
+
+  public float getTextSize() {
+    return textSize;
+  }
+
+  public void setAlpha(final int alpha) {
+    interiorPaint.setAlpha(alpha);
+    exteriorPaint.setAlpha(alpha);
+  }
+
+  public void getTextBounds(
+      final String line, final int index, final int count, final Rect lineBounds) {
+    interiorPaint.getTextBounds(line, index, count, lineBounds);
+  }
+
+  public void setTextAlign(final Align align) {
+    interiorPaint.setTextAlign(align);
+    exteriorPaint.setTextAlign(align);
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/ImageUtils.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/ImageUtils.java
new file mode 100644
index 0000000000000000000000000000000000000000..e02c6559176d40d3df42bccc0c374e60f70371b2
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/ImageUtils.java
@@ -0,0 +1,344 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.demo.env;
+
+import android.graphics.Bitmap;
+import android.graphics.Matrix;
+import android.os.Environment;
+import java.io.File;
+import java.io.FileOutputStream;
+
+/**
+ * Utility class for manipulating images.
+ **/
+public class ImageUtils {
+  @SuppressWarnings("unused")
+  private static final Logger LOGGER = new Logger();
+
+  static {
+    try {
+      System.loadLibrary("tensorflow_demo");
+    } catch (UnsatisfiedLinkError e) {
+      LOGGER.w("Native library not found, native RGB -> YUV conversion may be unavailable.");
+    }
+  }
+
+  /**
+   * Utility method to compute the allocated size in bytes of a YUV420SP image
+   * of the given dimensions.
+   */
+  public static int getYUVByteSize(final int width, final int height) {
+    // The luminance plane requires 1 byte per pixel.
+    final int ySize = width * height;
+
+    // The UV plane works on 2x2 blocks, so dimensions with odd size must be rounded up.
+    // Each 2x2 block takes 2 bytes to encode, one each for U and V.
+    final int uvSize = ((width + 1) / 2) * ((height + 1) / 2) * 2;
+
+    return ySize + uvSize;
+  }
+
+  /**
+   * Saves a Bitmap object to disk for analysis.
+   *
+   * @param bitmap The bitmap to save.
+   */
+  public static void saveBitmap(final Bitmap bitmap) {
+    saveBitmap(bitmap, "preview.png");
+  }
+
+  /**
+   * Saves a Bitmap object to disk for analysis.
+   *
+   * @param bitmap The bitmap to save.
+   * @param filename The location to save the bitmap to.
+   */
+  public static void saveBitmap(final Bitmap bitmap, final String filename) {
+    final String root =
+        Environment.getExternalStorageDirectory().getAbsolutePath() + File.separator + "tensorflow";
+    LOGGER.i("Saving %dx%d bitmap to %s.", bitmap.getWidth(), bitmap.getHeight(), root);
+    final File myDir = new File(root);
+
+    if (!myDir.mkdirs()) {
+      LOGGER.i("Make dir failed");
+    }
+
+    final String fname = filename;
+    final File file = new File(myDir, fname);
+    if (file.exists()) {
+      file.delete();
+    }
+    try {
+      final FileOutputStream out = new FileOutputStream(file);
+      bitmap.compress(Bitmap.CompressFormat.PNG, 99, out);
+      out.flush();
+      out.close();
+    } catch (final Exception e) {
+      LOGGER.e(e, "Exception!");
+    }
+  }
+
+  // This value is 2 ^ 18 - 1, and is used to clamp the RGB values before their ranges
+  // are normalized to eight bits.
+  static final int kMaxChannelValue = 262143;
+
+  // Always prefer the native implementation if available.
+  private static boolean useNativeConversion = false;
+
+  public static void convertYUV420SPToARGB8888(
+      byte[] input,
+      int width,
+      int height,
+      int[] output) {
+    if (useNativeConversion) {
+      try {
+        ImageUtils.convertYUV420SPToARGB8888(input, output, width, height, false);
+        return;
+      } catch (UnsatisfiedLinkError e) {
+        LOGGER.w(
+            "Native YUV420SP -> RGB implementation not found, falling back to Java implementation");
+        useNativeConversion = false;
+      }
+    }
+
+    // Java implementation of YUV420SP to ARGB8888 converting
+    final int frameSize = width * height;
+    for (int j = 0, yp = 0; j < height; j++) {
+      int uvp = frameSize + (j >> 1) * width;
+      int u = 0;
+      int v = 0;
+
+      for (int i = 0; i < width; i++, yp++) {
+        int y = 0xff & input[yp];
+        if ((i & 1) == 0) {
+          v = 0xff & input[uvp++];
+          u = 0xff & input[uvp++];
+        }
+
+        output[yp] = YUV2RGB(y, u, v);
+      }
+    }
+  }
+
+  private static int YUV2RGB(int y, int u, int v) {
+    // Adjust and check YUV values
+    y = (y - 16) < 0 ? 0 : (y - 16);
+    u -= 128;
+    v -= 128;
+
+    // This is the floating point equivalent. We do the conversion in integer
+    // because some Android devices do not have floating point in hardware.
+    // nR = (int)(1.164 * nY + 2.018 * nU);
+    // nG = (int)(1.164 * nY - 0.813 * nV - 0.391 * nU);
+    // nB = (int)(1.164 * nY + 1.596 * nV);
+    int y1192 = 1192 * y;
+    int r = (y1192 + 1634 * v);
+    int g = (y1192 - 833 * v - 400 * u);
+    int b = (y1192 + 2066 * u);
+
+    // Clipping RGB values to be inside boundaries [ 0 , kMaxChannelValue ]
+    r = r > kMaxChannelValue ? kMaxChannelValue : (r < 0 ? 0 : r);
+    g = g > kMaxChannelValue ? kMaxChannelValue : (g < 0 ? 0 : g);
+    b = b > kMaxChannelValue ? kMaxChannelValue : (b < 0 ? 0 : b);
+
+    return 0xff000000 | ((r << 6) & 0xff0000) | ((g >> 2) & 0xff00) | ((b >> 10) & 0xff);
+  }
+
+
+  public static void convertYUV420ToARGB8888(
+      byte[] yData,
+      byte[] uData,
+      byte[] vData,
+      int width,
+      int height,
+      int yRowStride,
+      int uvRowStride,
+      int uvPixelStride,
+      int[] out) {
+    if (useNativeConversion) {
+      try {
+        convertYUV420ToARGB8888(
+            yData, uData, vData, out, width, height, yRowStride, uvRowStride, uvPixelStride, false);
+        return;
+      } catch (UnsatisfiedLinkError e) {
+        LOGGER.w(
+            "Native YUV420 -> RGB implementation not found, falling back to Java implementation");
+        useNativeConversion = false;
+      }
+    }
+
+    int yp = 0;
+    for (int j = 0; j < height; j++) {
+      int pY = yRowStride * j;
+      int pUV = uvRowStride * (j >> 1);
+
+      for (int i = 0; i < width; i++) {
+        int uv_offset = pUV + (i >> 1) * uvPixelStride;
+
+        out[yp++] = YUV2RGB(
+            0xff & yData[pY + i],
+            0xff & uData[uv_offset],
+            0xff & vData[uv_offset]);
+      }
+    }
+  }
+
+
+  /**
+   * Converts YUV420 semi-planar data to ARGB 8888 data using the supplied width and height. The
+   * input and output must already be allocated and non-null. For efficiency, no error checking is
+   * performed.
+   *
+   * @param input The array of YUV 4:2:0 input data.
+   * @param output A pre-allocated array for the ARGB 8:8:8:8 output data.
+   * @param width The width of the input image.
+   * @param height The height of the input image.
+   * @param halfSize If true, downsample to 50% in each dimension, otherwise not.
+   */
+  private static native void convertYUV420SPToARGB8888(
+      byte[] input, int[] output, int width, int height, boolean halfSize);
+
+  /**
+   * Converts YUV420 semi-planar data to ARGB 8888 data using the supplied width
+   * and height. The input and output must already be allocated and non-null.
+   * For efficiency, no error checking is performed.
+   *
+   * @param y
+   * @param u
+   * @param v
+   * @param uvPixelStride
+   * @param width The width of the input image.
+   * @param height The height of the input image.
+   * @param halfSize If true, downsample to 50% in each dimension, otherwise not.
+   * @param output A pre-allocated array for the ARGB 8:8:8:8 output data.
+   */
+  private static native void convertYUV420ToARGB8888(
+      byte[] y,
+      byte[] u,
+      byte[] v,
+      int[] output,
+      int width,
+      int height,
+      int yRowStride,
+      int uvRowStride,
+      int uvPixelStride,
+      boolean halfSize);
+
+  /**
+   * Converts YUV420 semi-planar data to RGB 565 data using the supplied width
+   * and height. The input and output must already be allocated and non-null.
+   * For efficiency, no error checking is performed.
+   *
+   * @param input The array of YUV 4:2:0 input data.
+   * @param output A pre-allocated array for the RGB 5:6:5 output data.
+   * @param width The width of the input image.
+   * @param height The height of the input image.
+   */
+  private static native void convertYUV420SPToRGB565(
+      byte[] input, byte[] output, int width, int height);
+
+  /**
+   * Converts 32-bit ARGB8888 image data to YUV420SP data.  This is useful, for
+   * instance, in creating data to feed the classes that rely on raw camera
+   * preview frames.
+   *
+   * @param input An array of input pixels in ARGB8888 format.
+   * @param output A pre-allocated array for the YUV420SP output data.
+   * @param width The width of the input image.
+   * @param height The height of the input image.
+   */
+  private static native void convertARGB8888ToYUV420SP(
+      int[] input, byte[] output, int width, int height);
+
+  /**
+   * Converts 16-bit RGB565 image data to YUV420SP data.  This is useful, for
+   * instance, in creating data to feed the classes that rely on raw camera
+   * preview frames.
+   *
+   * @param input An array of input pixels in RGB565 format.
+   * @param output A pre-allocated array for the YUV420SP output data.
+   * @param width The width of the input image.
+   * @param height The height of the input image.
+   */
+  private static native void convertRGB565ToYUV420SP(
+      byte[] input, byte[] output, int width, int height);
+
+  /**
+   * Returns a transformation matrix from one reference frame into another.
+   * Handles cropping (if maintaining aspect ratio is desired) and rotation.
+   *
+   * @param srcWidth Width of source frame.
+   * @param srcHeight Height of source frame.
+   * @param dstWidth Width of destination frame.
+   * @param dstHeight Height of destination frame.
+   * @param applyRotation Amount of rotation to apply from one frame to another.
+   *  Must be a multiple of 90.
+   * @param maintainAspectRatio If true, will ensure that scaling in x and y remains constant,
+   * cropping the image if necessary.
+   * @return The transformation fulfilling the desired requirements.
+   */
+  public static Matrix getTransformationMatrix(
+      final int srcWidth,
+      final int srcHeight,
+      final int dstWidth,
+      final int dstHeight,
+      final int applyRotation,
+      final boolean maintainAspectRatio) {
+    final Matrix matrix = new Matrix();
+
+    if (applyRotation != 0) {
+      if (applyRotation % 90 != 0) {
+        LOGGER.w("Rotation of %d % 90 != 0", applyRotation);
+      }
+
+      // Translate so center of image is at origin.
+      matrix.postTranslate(-srcWidth / 2.0f, -srcHeight / 2.0f);
+
+      // Rotate around origin.
+      matrix.postRotate(applyRotation);
+    }
+
+    // Account for the already applied rotation, if any, and then determine how
+    // much scaling is needed for each axis.
+    final boolean transpose = (Math.abs(applyRotation) + 90) % 180 == 0;
+
+    final int inWidth = transpose ? srcHeight : srcWidth;
+    final int inHeight = transpose ? srcWidth : srcHeight;
+
+    // Apply scaling if necessary.
+    if (inWidth != dstWidth || inHeight != dstHeight) {
+      final float scaleFactorX = dstWidth / (float) inWidth;
+      final float scaleFactorY = dstHeight / (float) inHeight;
+
+      if (maintainAspectRatio) {
+        // Scale by minimum factor so that dst is filled completely while
+        // maintaining the aspect ratio. Some image may fall off the edge.
+        final float scaleFactor = Math.max(scaleFactorX, scaleFactorY);
+        matrix.postScale(scaleFactor, scaleFactor);
+      } else {
+        // Scale exactly to fill dst from src.
+        matrix.postScale(scaleFactorX, scaleFactorY);
+      }
+    }
+
+    if (applyRotation != 0) {
+      // Translate back from origin centered reference to destination frame.
+      matrix.postTranslate(dstWidth / 2.0f, dstHeight / 2.0f);
+    }
+
+    return matrix;
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/Logger.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/Logger.java
new file mode 100644
index 0000000000000000000000000000000000000000..0d984096a08cff5640a9dad3a33069fd9c77bbd0
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/Logger.java
@@ -0,0 +1,190 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.demo.env;
+
+import android.util.Log;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Wrapper for the platform log function, allows convenient message prefixing and log disabling.
+ */
+public final class Logger {
+  private static final String DEFAULT_TAG = "tensorflow";
+  private static final int DEFAULT_MIN_LOG_LEVEL = Log.DEBUG;
+
+  // Classes to be ignored when examining the stack trace
+  private static final Set<String> IGNORED_CLASS_NAMES;
+
+  static {
+    IGNORED_CLASS_NAMES = new HashSet<String>(3);
+    IGNORED_CLASS_NAMES.add("dalvik.system.VMStack");
+    IGNORED_CLASS_NAMES.add("java.lang.Thread");
+    IGNORED_CLASS_NAMES.add(Logger.class.getCanonicalName());
+  }
+
+  private final String tag;
+  private final String messagePrefix;
+  private int minLogLevel = DEFAULT_MIN_LOG_LEVEL;
+
+  /**
+   * Creates a Logger using the class name as the message prefix.
+   *
+   * @param clazz the simple name of this class is used as the message prefix.
+   */
+  public Logger(final Class<?> clazz) {
+    this(clazz.getSimpleName());
+  }
+
+  /**
+   * Creates a Logger using the specified message prefix.
+   *
+   * @param messagePrefix is prepended to the text of every message.
+   */
+  public Logger(final String messagePrefix) {
+    this(DEFAULT_TAG, messagePrefix);
+  }
+
+  /**
+   * Creates a Logger with a custom tag and a custom message prefix. If the message prefix
+   * is set to <pre>null</pre>, the caller's class name is used as the prefix.
+   *
+   * @param tag identifies the source of a log message.
+   * @param messagePrefix prepended to every message if non-null. If null, the name of the caller is
+   *                      being used
+   */
+  public Logger(final String tag, final String messagePrefix) {
+    this.tag = tag;
+    final String prefix = messagePrefix == null ? getCallerSimpleName() : messagePrefix;
+    this.messagePrefix = (prefix.length() > 0) ? prefix + ": " : prefix;
+  }
+
+  /**
+   * Creates a Logger using the caller's class name as the message prefix.
+   */
+  public Logger() {
+    this(DEFAULT_TAG, null);
+  }
+
+  /**
+   * Creates a Logger using the caller's class name as the message prefix.
+   */
+  public Logger(final int minLogLevel) {
+    this(DEFAULT_TAG, null);
+    this.minLogLevel = minLogLevel;
+  }
+
+  public void setMinLogLevel(final int minLogLevel) {
+    this.minLogLevel = minLogLevel;
+  }
+
+  public boolean isLoggable(final int logLevel) {
+    return logLevel >= minLogLevel || Log.isLoggable(tag, logLevel);
+  }
+
+  /**
+   * Return caller's simple name.
+   *
+   * Android getStackTrace() returns an array that looks like this:
+   *     stackTrace[0]: dalvik.system.VMStack
+   *     stackTrace[1]: java.lang.Thread
+   *     stackTrace[2]: com.google.android.apps.unveil.env.UnveilLogger
+   *     stackTrace[3]: com.google.android.apps.unveil.BaseApplication
+   *
+   * This function returns the simple version of the first non-filtered name.
+   *
+   * @return caller's simple name
+   */
+  private static String getCallerSimpleName() {
+    // Get the current callstack so we can pull the class of the caller off of it.
+    final StackTraceElement[] stackTrace = Thread.currentThread().getStackTrace();
+
+    for (final StackTraceElement elem : stackTrace) {
+      final String className = elem.getClassName();
+      if (!IGNORED_CLASS_NAMES.contains(className)) {
+        // We're only interested in the simple name of the class, not the complete package.
+        final String[] classParts = className.split("\\.");
+        return classParts[classParts.length - 1];
+      }
+    }
+
+    return Logger.class.getSimpleName();
+  }
+
+  private String toMessage(final String format, final Object... args) {
+    return messagePrefix + (args.length > 0 ? String.format(format, args) : format);
+  }
+
+  public void v(final String format, final Object... args) {
+    if (isLoggable(Log.VERBOSE)) {
+      Log.v(tag, toMessage(format, args));
+    }
+  }
+
+  public void v(final Throwable t, final String format, final Object... args) {
+    if (isLoggable(Log.VERBOSE)) {
+      Log.v(tag, toMessage(format, args), t);
+    }
+  }
+
+  public void d(final String format, final Object... args) {
+    if (isLoggable(Log.DEBUG)) {
+      Log.d(tag, toMessage(format, args));
+    }
+  }
+
+  public void d(final Throwable t, final String format, final Object... args) {
+    if (isLoggable(Log.DEBUG)) {
+      Log.d(tag, toMessage(format, args), t);
+    }
+  }
+
+  public void i(final String format, final Object... args) {
+    if (isLoggable(Log.INFO)) {
+      Log.i(tag, toMessage(format, args));
+    }
+  }
+
+  public void i(final Throwable t, final String format, final Object... args) {
+    if (isLoggable(Log.INFO)) {
+      Log.i(tag, toMessage(format, args), t);
+    }
+  }
+
+  public void w(final String format, final Object... args) {
+    if (isLoggable(Log.WARN)) {
+      Log.w(tag, toMessage(format, args));
+    }
+  }
+
+  public void w(final Throwable t, final String format, final Object... args) {
+    if (isLoggable(Log.WARN)) {
+      Log.w(tag, toMessage(format, args), t);
+    }
+  }
+
+  public void e(final String format, final Object... args) {
+    if (isLoggable(Log.ERROR)) {
+      Log.e(tag, toMessage(format, args));
+    }
+  }
+
+  public void e(final Throwable t, final String format, final Object... args) {
+    if (isLoggable(Log.ERROR)) {
+      Log.e(tag, toMessage(format, args), t);
+    }
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/Size.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/Size.java
new file mode 100644
index 0000000000000000000000000000000000000000..ef15d14daa841bf185d1839393c68c211d1e04d7
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/Size.java
@@ -0,0 +1,143 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.demo.env;
+
+import android.graphics.Bitmap;
+import android.text.TextUtils;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Size class independent of a Camera object.
+ */
+public class Size implements Comparable<Size>, Serializable {
+
+  // 1.4 went out with this UID so we'll need to maintain it to preserve pending queries when
+  // upgrading.
+  public static final long serialVersionUID = 7689808733290872361L;
+
+  public final int width;
+  public final int height;
+
+  public Size(final int width, final int height) {
+    this.width = width;
+    this.height = height;
+  }
+
+  public Size(final Bitmap bmp) {
+    this.width = bmp.getWidth();
+    this.height = bmp.getHeight();
+  }
+
+  /**
+   * Rotate a size by the given number of degrees.
+   * @param size Size to rotate.
+   * @param rotation Degrees {0, 90, 180, 270} to rotate the size.
+   * @return Rotated size.
+   */
+  public static Size getRotatedSize(final Size size, final int rotation) {
+    if (rotation % 180 != 0) {
+      // The phone is portrait, therefore the camera is sideways and frame should be rotated.
+      return new Size(size.height, size.width);
+    }
+    return size;
+  }
+
+  public static Size parseFromString(String sizeString) {
+    if (TextUtils.isEmpty(sizeString)) {
+      return null;
+    }
+
+    sizeString = sizeString.trim();
+
+    // The expected format is "<width>x<height>".
+    final String[] components = sizeString.split("x");
+    if (components.length == 2) {
+      try {
+        final int width = Integer.parseInt(components[0]);
+        final int height = Integer.parseInt(components[1]);
+        return new Size(width, height);
+      } catch (final NumberFormatException e) {
+        return null;
+      }
+    } else {
+      return null;
+    }
+  }
+
+  public static List<Size> sizeStringToList(final String sizes) {
+    final List<Size> sizeList = new ArrayList<Size>();
+    if (sizes != null) {
+      final String[] pairs = sizes.split(",");
+      for (final String pair : pairs) {
+        final Size size = Size.parseFromString(pair);
+        if (size != null) {
+          sizeList.add(size);
+        }
+      }
+    }
+    return sizeList;
+  }
+
+  public static String sizeListToString(final List<Size> sizes) {
+    String sizesString = "";
+    if (sizes != null && sizes.size() > 0) {
+      sizesString = sizes.get(0).toString();
+      for (int i = 1; i < sizes.size(); i++) {
+        sizesString += "," + sizes.get(i).toString();
+      }
+    }
+    return sizesString;
+  }
+
+  public final float aspectRatio() {
+    return (float) width / (float) height;
+  }
+
+  @Override
+  public int compareTo(final Size other) {
+    return width * height - other.width * other.height;
+  }
+
+  @Override
+  public boolean equals(final Object other) {
+    if (other == null) {
+      return false;
+    }
+
+    if (!(other instanceof Size)) {
+      return false;
+    }
+
+    final Size otherSize = (Size) other;
+    return (width == otherSize.width && height == otherSize.height);
+  }
+
+  @Override
+  public int hashCode() {
+    return width * 32713 + height;
+  }
+
+  @Override
+  public String toString() {
+    return dimensionsAsString(width, height);
+  }
+
+  public static final String dimensionsAsString(final int width, final int height) {
+    return width + "x" + height;
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/SplitTimer.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/SplitTimer.java
new file mode 100644
index 0000000000000000000000000000000000000000..459b0a0d4dbae0a9929f1a57d0b1f48b5d96b7ef
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/SplitTimer.java
@@ -0,0 +1,50 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.demo.env;
+
+import android.os.SystemClock;
+
+/**
+ * A simple utility timer for measuring CPU time and wall-clock splits.
+ */
+public class SplitTimer {
+  private final Logger logger;
+
+  private long lastWallTime;
+  private long lastCpuTime;
+
+  public SplitTimer(final String name) {
+    logger = new Logger(name);
+    newSplit();
+  }
+
+  public void newSplit() {
+    lastWallTime = SystemClock.uptimeMillis();
+    lastCpuTime = SystemClock.currentThreadTimeMillis();
+  }
+
+  public void endSplit(final String splitName) {
+    final long currWallTime = SystemClock.uptimeMillis();
+    final long currCpuTime = SystemClock.currentThreadTimeMillis();
+
+    logger.i(
+        "%s: cpu=%dms wall=%dms",
+        splitName, currCpuTime - lastCpuTime, currWallTime - lastWallTime);
+
+    lastWallTime = currWallTime;
+    lastCpuTime = currCpuTime;
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java
new file mode 100644
index 0000000000000000000000000000000000000000..af6af2bc8f508a70aa7e44a7236f0e7ea5e3d71c
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java
@@ -0,0 +1,421 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.demo.tracking;
+
+import android.content.Context;
+import android.graphics.Canvas;
+import android.graphics.Color;
+import android.graphics.Matrix;
+import android.graphics.Paint;
+import android.graphics.Paint.Cap;
+import android.graphics.Paint.Join;
+import android.graphics.Paint.Style;
+import android.graphics.RectF;
+import android.text.TextUtils;
+import android.util.Pair;
+import android.util.TypedValue;
+import android.widget.Toast;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Queue;
+import org.tensorflow.demo.Classifier.Recognition;
+import org.tensorflow.demo.env.BorderedText;
+import org.tensorflow.demo.env.ImageUtils;
+import org.tensorflow.demo.env.Logger;
+
+/**
+ * A tracker wrapping ObjectTracker that also handles non-max suppression and matching existing
+ * objects to new detections.
+ */
+public class MultiBoxTracker {
+  private final Logger logger = new Logger();
+
+  private static final float TEXT_SIZE_DIP = 18;
+
+  // Maximum percentage of a box that can be overlapped by another box at detection time. Otherwise
+  // the lower scored box (new or old) will be removed.
+  private static final float MAX_OVERLAP = 0.2f;
+
+  private static final float MIN_SIZE = 16.0f;
+
+  // Allow replacement of the tracked box with new results if
+  // correlation has dropped below this level.
+  private static final float MARGINAL_CORRELATION = 0.75f;
+
+  // Consider object to be lost if correlation falls below this threshold.
+  private static final float MIN_CORRELATION = 0.3f;
+
+  private static final int[] COLORS = {
+    Color.BLUE, Color.RED, Color.GREEN, Color.YELLOW, Color.CYAN, Color.MAGENTA, Color.WHITE,
+    Color.parseColor("#55FF55"), Color.parseColor("#FFA500"), Color.parseColor("#FF8888"),
+    Color.parseColor("#AAAAFF"), Color.parseColor("#FFFFAA"), Color.parseColor("#55AAAA"),
+    Color.parseColor("#AA33AA"), Color.parseColor("#0D0068")
+  };
+
+  private final Queue<Integer> availableColors = new LinkedList<Integer>();
+
+  public ObjectTracker objectTracker;
+
+  final List<Pair<Float, RectF>> screenRects = new LinkedList<Pair<Float, RectF>>();
+
+  private static class TrackedRecognition {
+    ObjectTracker.TrackedObject trackedObject;
+    RectF location;
+    float detectionConfidence;
+    int color;
+    String title;
+  }
+
+  private final List<TrackedRecognition> trackedObjects = new LinkedList<TrackedRecognition>();
+
+  private final Paint boxPaint = new Paint();
+
+  private final float textSizePx;
+  private final BorderedText borderedText;
+
+  private Matrix frameToCanvasMatrix;
+
+  private int frameWidth;
+  private int frameHeight;
+
+  private int sensorOrientation;
+  private Context context;
+
+  public MultiBoxTracker(final Context context) {
+    this.context = context;
+    for (final int color : COLORS) {
+      availableColors.add(color);
+    }
+
+    boxPaint.setColor(Color.RED);
+    boxPaint.setStyle(Style.STROKE);
+    boxPaint.setStrokeWidth(12.0f);
+    boxPaint.setStrokeCap(Cap.ROUND);
+    boxPaint.setStrokeJoin(Join.ROUND);
+    boxPaint.setStrokeMiter(100);
+
+    textSizePx =
+        TypedValue.applyDimension(
+            TypedValue.COMPLEX_UNIT_DIP, TEXT_SIZE_DIP, context.getResources().getDisplayMetrics());
+    borderedText = new BorderedText(textSizePx);
+  }
+
+  private Matrix getFrameToCanvasMatrix() {
+    return frameToCanvasMatrix;
+  }
+
+  public synchronized void drawDebug(final Canvas canvas) {
+    final Paint textPaint = new Paint();
+    textPaint.setColor(Color.WHITE);
+    textPaint.setTextSize(60.0f);
+
+    final Paint boxPaint = new Paint();
+    boxPaint.setColor(Color.RED);
+    boxPaint.setAlpha(200);
+    boxPaint.setStyle(Style.STROKE);
+
+    for (final Pair<Float, RectF> detection : screenRects) {
+      final RectF rect = detection.second;
+      canvas.drawRect(rect, boxPaint);
+      canvas.drawText("" + detection.first, rect.left, rect.top, textPaint);
+      borderedText.drawText(canvas, rect.centerX(), rect.centerY(), "" + detection.first);
+    }
+
+    if (objectTracker == null) {
+      return;
+    }
+
+    // Draw correlations.
+    for (final TrackedRecognition recognition : trackedObjects) {
+      final ObjectTracker.TrackedObject trackedObject = recognition.trackedObject;
+
+      final RectF trackedPos = trackedObject.getTrackedPositionInPreviewFrame();
+
+      if (getFrameToCanvasMatrix().mapRect(trackedPos)) {
+        final String labelString = String.format("%.2f", trackedObject.getCurrentCorrelation());
+        borderedText.drawText(canvas, trackedPos.right, trackedPos.bottom, labelString);
+      }
+    }
+
+    final Matrix matrix = getFrameToCanvasMatrix();
+    objectTracker.drawDebug(canvas, matrix);
+  }
+
+  public synchronized void trackResults(
+      final List<Recognition> results, final byte[] frame, final long timestamp) {
+    logger.i("Processing %d results from %d", results.size(), timestamp);
+    processResults(timestamp, results, frame);
+  }
+
+  public synchronized void draw(final Canvas canvas) {
+    final boolean rotated = sensorOrientation % 180 == 90;
+    final float multiplier =
+        Math.min(canvas.getHeight() / (float) (rotated ? frameWidth : frameHeight),
+                 canvas.getWidth() / (float) (rotated ? frameHeight : frameWidth));
+    frameToCanvasMatrix =
+        ImageUtils.getTransformationMatrix(
+            frameWidth,
+            frameHeight,
+            (int) (multiplier * (rotated ? frameHeight : frameWidth)),
+            (int) (multiplier * (rotated ? frameWidth : frameHeight)),
+            sensorOrientation,
+            false);
+    for (final TrackedRecognition recognition : trackedObjects) {
+      final RectF trackedPos =
+          (objectTracker != null)
+              ? recognition.trackedObject.getTrackedPositionInPreviewFrame()
+              : new RectF(recognition.location);
+
+      getFrameToCanvasMatrix().mapRect(trackedPos);
+      boxPaint.setColor(recognition.color);
+
+      final float cornerSize = Math.min(trackedPos.width(), trackedPos.height()) / 8.0f;
+      canvas.drawRoundRect(trackedPos, cornerSize, cornerSize, boxPaint);
+
+      final String labelString =
+          !TextUtils.isEmpty(recognition.title)
+              ? String.format("%s %.2f", recognition.title, recognition.detectionConfidence)
+              : String.format("%.2f", recognition.detectionConfidence);
+      borderedText.drawText(canvas, trackedPos.left + cornerSize, trackedPos.bottom, labelString);
+    }
+  }
+
+  private boolean initialized = false;
+
+  public synchronized void onFrame(
+      final int w,
+      final int h,
+      final int rowStride,
+      final int sensorOrientation,
+      final byte[] frame,
+      final long timestamp) {
+    if (objectTracker == null && !initialized) {
+      ObjectTracker.clearInstance();
+
+      logger.i("Initializing ObjectTracker: %dx%d", w, h);
+      objectTracker = ObjectTracker.getInstance(w, h, rowStride, true);
+      frameWidth = w;
+      frameHeight = h;
+      this.sensorOrientation = sensorOrientation;
+      initialized = true;
+
+      if (objectTracker == null) {
+        String message =
+            "Object tracking support not found. "
+                + "See tensorflow/examples/android/README.md for details.";
+        Toast.makeText(context, message, Toast.LENGTH_LONG).show();
+        logger.e(message);
+      }
+    }
+
+    if (objectTracker == null) {
+      return;
+    }
+
+    objectTracker.nextFrame(frame, null, timestamp, null, true);
+
+    // Clean up any objects not worth tracking any more.
+    final LinkedList<TrackedRecognition> copyList =
+        new LinkedList<TrackedRecognition>(trackedObjects);
+    for (final TrackedRecognition recognition : copyList) {
+      final ObjectTracker.TrackedObject trackedObject = recognition.trackedObject;
+      final float correlation = trackedObject.getCurrentCorrelation();
+      if (correlation < MIN_CORRELATION) {
+        logger.v("Removing tracked object %s because NCC is %.2f", trackedObject, correlation);
+        trackedObject.stopTracking();
+        trackedObjects.remove(recognition);
+
+        availableColors.add(recognition.color);
+      }
+    }
+  }
+
+  private void processResults(
+      final long timestamp, final List<Recognition> results, final byte[] originalFrame) {
+    final List<Pair<Float, Recognition>> rectsToTrack = new LinkedList<Pair<Float, Recognition>>();
+
+    screenRects.clear();
+    final Matrix rgbFrameToScreen = new Matrix(getFrameToCanvasMatrix());
+
+    for (final Recognition result : results) {
+      if (result.getLocation() == null) {
+        continue;
+      }
+      final RectF detectionFrameRect = new RectF(result.getLocation());
+
+      final RectF detectionScreenRect = new RectF();
+      rgbFrameToScreen.mapRect(detectionScreenRect, detectionFrameRect);
+
+      logger.v(
+          "Result! Frame: " + result.getLocation() + " mapped to screen:" + detectionScreenRect);
+
+      screenRects.add(new Pair<Float, RectF>(result.getConfidence(), detectionScreenRect));
+
+      if (detectionFrameRect.width() < MIN_SIZE || detectionFrameRect.height() < MIN_SIZE) {
+        logger.w("Degenerate rectangle! " + detectionFrameRect);
+        continue;
+      }
+
+      rectsToTrack.add(new Pair<Float, Recognition>(result.getConfidence(), result));
+    }
+
+    if (rectsToTrack.isEmpty()) {
+      logger.v("Nothing to track, aborting.");
+      return;
+    }
+
+    if (objectTracker == null) {
+      trackedObjects.clear();
+      for (final Pair<Float, Recognition> potential : rectsToTrack) {
+        final TrackedRecognition trackedRecognition = new TrackedRecognition();
+        trackedRecognition.detectionConfidence = potential.first;
+        trackedRecognition.location = new RectF(potential.second.getLocation());
+        trackedRecognition.trackedObject = null;
+        trackedRecognition.title = potential.second.getTitle();
+        trackedRecognition.color = COLORS[trackedObjects.size()];
+        trackedObjects.add(trackedRecognition);
+
+        if (trackedObjects.size() >= COLORS.length) {
+          break;
+        }
+      }
+      return;
+    }
+
+    logger.i("%d rects to track", rectsToTrack.size());
+    for (final Pair<Float, Recognition> potential : rectsToTrack) {
+      handleDetection(originalFrame, timestamp, potential);
+    }
+  }
+
+  private void handleDetection(
+      final byte[] frameCopy, final long timestamp, final Pair<Float, Recognition> potential) {
+    final ObjectTracker.TrackedObject potentialObject =
+        objectTracker.trackObject(potential.second.getLocation(), timestamp, frameCopy);
+
+    final float potentialCorrelation = potentialObject.getCurrentCorrelation();
+    logger.v(
+        "Tracked object went from %s to %s with correlation %.2f",
+        potential.second, potentialObject.getTrackedPositionInPreviewFrame(), potentialCorrelation);
+
+    if (potentialCorrelation < MARGINAL_CORRELATION) {
+      logger.v("Correlation too low to begin tracking %s.", potentialObject);
+      potentialObject.stopTracking();
+      return;
+    }
+
+    final List<TrackedRecognition> removeList = new LinkedList<TrackedRecognition>();
+
+    float maxIntersect = 0.0f;
+
+    // This is the current tracked object whose color we will take. If left null we'll take the
+    // first one from the color queue.
+    TrackedRecognition recogToReplace = null;
+
+    // Look for intersections that will be overridden by this object or an intersection that would
+    // prevent this one from being placed.
+    for (final TrackedRecognition trackedRecognition : trackedObjects) {
+      final RectF a = trackedRecognition.trackedObject.getTrackedPositionInPreviewFrame();
+      final RectF b = potentialObject.getTrackedPositionInPreviewFrame();
+      final RectF intersection = new RectF();
+      final boolean intersects = intersection.setIntersect(a, b);
+
+      final float intersectArea = intersection.width() * intersection.height();
+      final float totalArea = a.width() * a.height() + b.width() * b.height() - intersectArea;
+      final float intersectOverUnion = intersectArea / totalArea;
+
+      // If there is an intersection with this currently tracked box above the maximum overlap
+      // percentage allowed, either the new recognition needs to be dismissed or the old
+      // recognition needs to be removed and possibly replaced with the new one.
+      if (intersects && intersectOverUnion > MAX_OVERLAP) {
+        if (potential.first < trackedRecognition.detectionConfidence
+            && trackedRecognition.trackedObject.getCurrentCorrelation() > MARGINAL_CORRELATION) {
+          // If track for the existing object is still going strong and the detection score was
+          // good, reject this new object.
+          potentialObject.stopTracking();
+          return;
+        } else {
+          removeList.add(trackedRecognition);
+
+          // Let the previously tracked object with max intersection amount donate its color to
+          // the new object.
+          if (intersectOverUnion > maxIntersect) {
+            maxIntersect = intersectOverUnion;
+            recogToReplace = trackedRecognition;
+          }
+        }
+      }
+    }
+
+    // If we're already tracking the max object and no intersections were found to bump off,
+    // pick the worst current tracked object to remove, if it's also worse than this candidate
+    // object.
+    if (availableColors.isEmpty() && removeList.isEmpty()) {
+      for (final TrackedRecognition candidate : trackedObjects) {
+        if (candidate.detectionConfidence < potential.first) {
+          if (recogToReplace == null
+              || candidate.detectionConfidence < recogToReplace.detectionConfidence) {
+            // Save it so that we use this color for the new object.
+            recogToReplace = candidate;
+          }
+        }
+      }
+      if (recogToReplace != null) {
+        logger.v("Found non-intersecting object to remove.");
+        removeList.add(recogToReplace);
+      } else {
+        logger.v("No non-intersecting object found to remove");
+      }
+    }
+
+    // Remove everything that got intersected.
+    for (final TrackedRecognition trackedRecognition : removeList) {
+      logger.v(
+          "Removing tracked object %s with detection confidence %.2f, correlation %.2f",
+          trackedRecognition.trackedObject,
+          trackedRecognition.detectionConfidence,
+          trackedRecognition.trackedObject.getCurrentCorrelation());
+      trackedRecognition.trackedObject.stopTracking();
+      trackedObjects.remove(trackedRecognition);
+      if (trackedRecognition != recogToReplace) {
+        availableColors.add(trackedRecognition.color);
+      }
+    }
+
+    if (recogToReplace == null && availableColors.isEmpty()) {
+      logger.e("No room to track this object, aborting.");
+      potentialObject.stopTracking();
+      return;
+    }
+
+    // Finally safe to say we can track this object.
+    logger.v(
+        "Tracking object %s (%s) with detection confidence %.2f at position %s",
+        potentialObject,
+        potential.second.getTitle(),
+        potential.first,
+        potential.second.getLocation());
+    final TrackedRecognition trackedRecognition = new TrackedRecognition();
+    trackedRecognition.detectionConfidence = potential.first;
+    trackedRecognition.trackedObject = potentialObject;
+    trackedRecognition.title = potential.second.getTitle();
+
+    // Use the color from a replaced object before taking one from the color queue.
+    trackedRecognition.color =
+        recogToReplace != null ? recogToReplace.color : availableColors.poll();
+    trackedObjects.add(trackedRecognition);
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/tracking/ObjectTracker.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/tracking/ObjectTracker.java
new file mode 100644
index 0000000000000000000000000000000000000000..8b4248d8fbcfa2d58621fb429edbc9498956d273
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/tracking/ObjectTracker.java
@@ -0,0 +1,661 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.demo.tracking;
+
+import android.graphics.Canvas;
+import android.graphics.Color;
+import android.graphics.Matrix;
+import android.graphics.Paint;
+import android.graphics.PointF;
+import android.graphics.RectF;
+import android.graphics.Typeface;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Vector;
+import javax.microedition.khronos.opengles.GL10;
+import org.tensorflow.demo.env.Logger;
+import org.tensorflow.demo.env.Size;
+
+/**
+ * True object detector/tracker class that tracks objects across consecutive preview frames.
+ * It provides a simplified Java interface to the analogous native object defined by
+ * jni/client_vision/tracking/object_tracker.*.
+ *
+ * Currently, the ObjectTracker is a singleton due to native code restrictions, and so must
+ * be allocated by ObjectTracker.getInstance(). In addition, release() should be called
+ * as soon as the ObjectTracker is no longer needed, and before a new one is created.
+ *
+ * nextFrame() should be called as new frames become available, preferably as often as possible.
+ *
+ * After allocation, new TrackedObjects may be instantiated via trackObject(). TrackedObjects
+ * are associated with the ObjectTracker that created them, and are only valid while that
+ * ObjectTracker still exists.
+ */
+public class ObjectTracker {
+  private static final Logger LOGGER = new Logger();
+
+  private static boolean libraryFound = false;
+
+  static {
+    try {
+      System.loadLibrary("tensorflow_demo");
+      libraryFound = true;
+    } catch (UnsatisfiedLinkError e) {
+      LOGGER.e("libtensorflow_demo.so not found, tracking unavailable");
+    }
+  }
+
+  private static final boolean DRAW_TEXT = false;
+
+  /**
+   * How many history points to keep track of and draw in the red history line.
+   */
+  private static final int MAX_DEBUG_HISTORY_SIZE = 30;
+
+  /**
+   * How many frames of optical flow deltas to record.
+   * TODO(andrewharp): Push this down to the native level so it can be polled
+   * efficiently into a an array for upload, instead of keeping a duplicate
+   * copy in Java.
+   */
+  private static final int MAX_FRAME_HISTORY_SIZE = 200;
+
+  private static final int DOWNSAMPLE_FACTOR = 2;
+
+  private final byte[] downsampledFrame;
+
+  protected static ObjectTracker instance;
+
+  private final Map<String, TrackedObject> trackedObjects;
+
+  private long lastTimestamp;
+
+  private FrameChange lastKeypoints;
+
+  private final Vector<PointF> debugHistory;
+
+  private final LinkedList<TimestampedDeltas> timestampedDeltas;
+
+  protected final int frameWidth;
+  protected final int frameHeight;
+  private final int rowStride;
+  protected final boolean alwaysTrack;
+
+  private static class TimestampedDeltas {
+    final long timestamp;
+    final byte[] deltas;
+
+    public TimestampedDeltas(final long timestamp, final byte[] deltas) {
+      this.timestamp = timestamp;
+      this.deltas = deltas;
+    }
+  }
+
+  /**
+   * A simple class that records keypoint information, which includes
+   * local location, score and type. This will be used in calculating
+   * FrameChange.
+   */
+  public static class Keypoint {
+    public final float x;
+    public final float y;
+    public final float score;
+    public final int type;
+
+    public Keypoint(final float x, final float y) {
+      this.x = x;
+      this.y = y;
+      this.score = 0;
+      this.type = -1;
+    }
+
+    public Keypoint(final float x, final float y, final float score, final int type) {
+      this.x = x;
+      this.y = y;
+      this.score = score;
+      this.type = type;
+    }
+
+    Keypoint delta(final Keypoint other) {
+      return new Keypoint(this.x - other.x, this.y - other.y);
+    }
+  }
+
+  /**
+   * A simple class that could calculate Keypoint delta.
+   * This class will be used in calculating frame translation delta
+   * for optical flow.
+   */
+  public static class PointChange {
+    public final Keypoint keypointA;
+    public final Keypoint keypointB;
+    Keypoint pointDelta;
+    private final boolean wasFound;
+
+    public PointChange(final float x1, final float y1,
+                       final float x2, final float y2,
+                       final float score, final int type,
+                       final boolean wasFound) {
+      this.wasFound = wasFound;
+
+      keypointA = new Keypoint(x1, y1, score, type);
+      keypointB = new Keypoint(x2, y2);
+    }
+
+    public Keypoint getDelta() {
+      if (pointDelta == null) {
+        pointDelta = keypointB.delta(keypointA);
+      }
+      return pointDelta;
+    }
+  }
+
+  /** A class that records a timestamped frame translation delta for optical flow. */
+  public static class FrameChange {
+    public static final int KEYPOINT_STEP = 7;
+
+    public final Vector<PointChange> pointDeltas;
+
+    private final float minScore;
+    private final float maxScore;
+
+    public FrameChange(final float[] framePoints) {
+      float minScore = 100.0f;
+      float maxScore = -100.0f;
+
+      pointDeltas = new Vector<PointChange>(framePoints.length / KEYPOINT_STEP);
+
+      for (int i = 0; i < framePoints.length; i += KEYPOINT_STEP) {
+        final float x1 = framePoints[i + 0] * DOWNSAMPLE_FACTOR;
+        final float y1 = framePoints[i + 1] * DOWNSAMPLE_FACTOR;
+
+        final boolean wasFound = framePoints[i + 2] > 0.0f;
+
+        final float x2 = framePoints[i + 3] * DOWNSAMPLE_FACTOR;
+        final float y2 = framePoints[i + 4] * DOWNSAMPLE_FACTOR;
+        final float score = framePoints[i + 5];
+        final int type = (int) framePoints[i + 6];
+
+        minScore = Math.min(minScore, score);
+        maxScore = Math.max(maxScore, score);
+
+        pointDeltas.add(new PointChange(x1, y1, x2, y2, score, type, wasFound));
+      }
+
+      this.minScore = minScore;
+      this.maxScore = maxScore;
+    }
+  }
+
+  public static synchronized ObjectTracker getInstance(
+      final int frameWidth, final int frameHeight, final int rowStride, final boolean alwaysTrack) {
+    if (!libraryFound) {
+      LOGGER.e(
+          "Native object tracking support not found. "
+              + "See tensorflow/examples/android/README.md for details.");
+      return null;
+    }
+
+    if (instance == null) {
+      instance = new ObjectTracker(frameWidth, frameHeight, rowStride, alwaysTrack);
+      instance.init();
+    } else {
+      throw new RuntimeException(
+          "Tried to create a new objectracker before releasing the old one!");
+    }
+    return instance;
+  }
+
+  public static synchronized void clearInstance() {
+    if (instance != null) {
+      instance.release();
+    }
+  }
+
+  protected ObjectTracker(
+      final int frameWidth, final int frameHeight, final int rowStride, final boolean alwaysTrack) {
+    this.frameWidth = frameWidth;
+    this.frameHeight = frameHeight;
+    this.rowStride = rowStride;
+    this.alwaysTrack = alwaysTrack;
+    this.timestampedDeltas = new LinkedList<TimestampedDeltas>();
+
+    trackedObjects = new HashMap<String, TrackedObject>();
+
+    debugHistory = new Vector<PointF>(MAX_DEBUG_HISTORY_SIZE);
+
+    downsampledFrame =
+        new byte
+            [(frameWidth + DOWNSAMPLE_FACTOR - 1)
+                / DOWNSAMPLE_FACTOR
+                * (frameWidth + DOWNSAMPLE_FACTOR - 1)
+                / DOWNSAMPLE_FACTOR];
+  }
+
+  protected void init() {
+    // The native tracker never sees the full frame, so pre-scale dimensions
+    // by the downsample factor.
+    initNative(frameWidth / DOWNSAMPLE_FACTOR, frameHeight / DOWNSAMPLE_FACTOR, alwaysTrack);
+  }
+
+  private final float[] matrixValues = new float[9];
+
+  private long downsampledTimestamp;
+
+  @SuppressWarnings("unused")
+  public synchronized void drawOverlay(final GL10 gl,
+      final Size cameraViewSize, final Matrix matrix) {
+    final Matrix tempMatrix = new Matrix(matrix);
+    tempMatrix.preScale(DOWNSAMPLE_FACTOR, DOWNSAMPLE_FACTOR);
+    tempMatrix.getValues(matrixValues);
+    drawNative(cameraViewSize.width, cameraViewSize.height, matrixValues);
+  }
+
+  public synchronized void nextFrame(
+      final byte[] frameData, final byte[] uvData,
+      final long timestamp, final float[] transformationMatrix,
+      final boolean updateDebugInfo) {
+    if (downsampledTimestamp != timestamp) {
+      ObjectTracker.downsampleImageNative(
+          frameWidth, frameHeight, rowStride, frameData, DOWNSAMPLE_FACTOR, downsampledFrame);
+      downsampledTimestamp = timestamp;
+    }
+
+    // Do Lucas Kanade using the fullframe initializer.
+    nextFrameNative(downsampledFrame, uvData, timestamp, transformationMatrix);
+
+    timestampedDeltas.add(new TimestampedDeltas(timestamp, getKeypointsPacked(DOWNSAMPLE_FACTOR)));
+    while (timestampedDeltas.size() > MAX_FRAME_HISTORY_SIZE) {
+      timestampedDeltas.removeFirst();
+    }
+
+    for (final TrackedObject trackedObject : trackedObjects.values()) {
+      trackedObject.updateTrackedPosition();
+    }
+
+    if (updateDebugInfo) {
+      updateDebugHistory();
+    }
+
+    lastTimestamp = timestamp;
+  }
+
+  public synchronized void release() {
+    releaseMemoryNative();
+    synchronized (ObjectTracker.class) {
+      instance = null;
+    }
+  }
+
+  private void drawHistoryDebug(final Canvas canvas) {
+    drawHistoryPoint(
+        canvas, frameWidth * DOWNSAMPLE_FACTOR / 2, frameHeight * DOWNSAMPLE_FACTOR / 2);
+  }
+
+  private void drawHistoryPoint(final Canvas canvas, final float startX, final float startY) {
+    final Paint p = new Paint();
+    p.setAntiAlias(false);
+    p.setTypeface(Typeface.SERIF);
+
+    p.setColor(Color.RED);
+    p.setStrokeWidth(2.0f);
+
+    // Draw the center circle.
+    p.setColor(Color.GREEN);
+    canvas.drawCircle(startX, startY, 3.0f, p);
+
+    p.setColor(Color.RED);
+
+    // Iterate through in backwards order.
+    synchronized (debugHistory) {
+      final int numPoints = debugHistory.size();
+      float lastX = startX;
+      float lastY = startY;
+      for (int keypointNum = 0; keypointNum < numPoints; ++keypointNum) {
+        final PointF delta = debugHistory.get(numPoints - keypointNum - 1);
+        final float newX = lastX + delta.x;
+        final float newY = lastY + delta.y;
+        canvas.drawLine(lastX, lastY, newX, newY, p);
+        lastX = newX;
+        lastY = newY;
+      }
+    }
+  }
+
+  private static int floatToChar(final float value) {
+    return Math.max(0, Math.min((int) (value * 255.999f), 255));
+  }
+
+  private void drawKeypointsDebug(final Canvas canvas) {
+    final Paint p = new Paint();
+    if (lastKeypoints == null) {
+      return;
+    }
+    final int keypointSize = 3;
+
+    final float minScore = lastKeypoints.minScore;
+    final float maxScore = lastKeypoints.maxScore;
+
+    for (final PointChange keypoint : lastKeypoints.pointDeltas) {
+      if (keypoint.wasFound) {
+        final int r =
+            floatToChar((keypoint.keypointA.score - minScore) / (maxScore - minScore));
+        final int b =
+            floatToChar(1.0f - (keypoint.keypointA.score - minScore) / (maxScore - minScore));
+
+        final int color = 0xFF000000 | (r << 16) | b;
+        p.setColor(color);
+
+        final float[] screenPoints = {keypoint.keypointA.x, keypoint.keypointA.y,
+                                      keypoint.keypointB.x, keypoint.keypointB.y};
+        canvas.drawRect(screenPoints[2] - keypointSize,
+                        screenPoints[3] - keypointSize,
+                        screenPoints[2] + keypointSize,
+                        screenPoints[3] + keypointSize, p);
+        p.setColor(Color.CYAN);
+        canvas.drawLine(screenPoints[2], screenPoints[3],
+                        screenPoints[0], screenPoints[1], p);
+
+        if (DRAW_TEXT) {
+          p.setColor(Color.WHITE);
+          canvas.drawText(keypoint.keypointA.type + ": " + keypoint.keypointA.score,
+              keypoint.keypointA.x, keypoint.keypointA.y, p);
+        }
+      } else {
+        p.setColor(Color.YELLOW);
+        final float[] screenPoint = {keypoint.keypointA.x, keypoint.keypointA.y};
+        canvas.drawCircle(screenPoint[0], screenPoint[1], 5.0f, p);
+      }
+    }
+  }
+
+  private synchronized PointF getAccumulatedDelta(final long timestamp, final float positionX,
+      final float positionY, final float radius) {
+    final RectF currPosition = getCurrentPosition(timestamp,
+        new RectF(positionX - radius, positionY - radius, positionX + radius, positionY + radius));
+    return new PointF(currPosition.centerX() - positionX, currPosition.centerY() - positionY);
+  }
+
+  private synchronized RectF getCurrentPosition(final long timestamp, final RectF
+      oldPosition) {
+    final RectF downscaledFrameRect = downscaleRect(oldPosition);
+
+    final float[] delta = new float[4];
+    getCurrentPositionNative(timestamp, downscaledFrameRect.left, downscaledFrameRect.top,
+        downscaledFrameRect.right, downscaledFrameRect.bottom, delta);
+
+    final RectF newPosition = new RectF(delta[0], delta[1], delta[2], delta[3]);
+
+    return upscaleRect(newPosition);
+  }
+
+  private void updateDebugHistory() {
+    lastKeypoints = new FrameChange(getKeypointsNative(false));
+
+    if (lastTimestamp == 0) {
+      return;
+    }
+
+    final PointF delta =
+        getAccumulatedDelta(
+            lastTimestamp, frameWidth / DOWNSAMPLE_FACTOR, frameHeight / DOWNSAMPLE_FACTOR, 100);
+
+    synchronized (debugHistory) {
+      debugHistory.add(delta);
+
+      while (debugHistory.size() > MAX_DEBUG_HISTORY_SIZE) {
+        debugHistory.remove(0);
+      }
+    }
+  }
+
+  public synchronized void drawDebug(final Canvas canvas, final Matrix frameToCanvas) {
+    canvas.save();
+    canvas.setMatrix(frameToCanvas);
+
+    drawHistoryDebug(canvas);
+    drawKeypointsDebug(canvas);
+
+    canvas.restore();
+  }
+
+  public Vector<String> getDebugText() {
+    final Vector<String> lines = new Vector<String>();
+
+    if (lastKeypoints != null) {
+      lines.add("Num keypoints " + lastKeypoints.pointDeltas.size());
+      lines.add("Min score: " + lastKeypoints.minScore);
+      lines.add("Max score: " + lastKeypoints.maxScore);
+    }
+
+    return lines;
+  }
+
+  public synchronized List<byte[]> pollAccumulatedFlowData(final long endFrameTime) {
+    final List<byte[]> frameDeltas = new ArrayList<byte[]>();
+    while (timestampedDeltas.size() > 0) {
+      final TimestampedDeltas currentDeltas = timestampedDeltas.peek();
+      if (currentDeltas.timestamp <= endFrameTime) {
+        frameDeltas.add(currentDeltas.deltas);
+        timestampedDeltas.removeFirst();
+      } else {
+        break;
+      }
+    }
+
+    return frameDeltas;
+  }
+
+  private RectF downscaleRect(final RectF fullFrameRect) {
+    return new RectF(
+        fullFrameRect.left / DOWNSAMPLE_FACTOR,
+        fullFrameRect.top / DOWNSAMPLE_FACTOR,
+        fullFrameRect.right / DOWNSAMPLE_FACTOR,
+        fullFrameRect.bottom / DOWNSAMPLE_FACTOR);
+  }
+
+  private RectF upscaleRect(final RectF downsampledFrameRect) {
+    return new RectF(
+        downsampledFrameRect.left * DOWNSAMPLE_FACTOR,
+        downsampledFrameRect.top * DOWNSAMPLE_FACTOR,
+        downsampledFrameRect.right * DOWNSAMPLE_FACTOR,
+        downsampledFrameRect.bottom * DOWNSAMPLE_FACTOR);
+  }
+
+  /**
+   * A TrackedObject represents a native TrackedObject, and provides access to the
+   * relevant native tracking information available after every frame update. They may
+   * be safely passed around and accessed externally, but will become invalid after
+   * stopTracking() is called or the related creating ObjectTracker is deactivated.
+   *
+   * @author andrewharp@google.com (Andrew Harp)
+   */
+  public class TrackedObject {
+    private final String id;
+
+    private long lastExternalPositionTime;
+
+    private RectF lastTrackedPosition;
+    private boolean visibleInLastFrame;
+
+    private boolean isDead;
+
+    TrackedObject(final RectF position, final long timestamp, final byte[] data) {
+      isDead = false;
+
+      id = Integer.toString(this.hashCode());
+
+      lastExternalPositionTime = timestamp;
+
+      synchronized (ObjectTracker.this) {
+        registerInitialAppearance(position, data);
+        setPreviousPosition(position, timestamp);
+        trackedObjects.put(id, this);
+      }
+    }
+
+    public void stopTracking() {
+      checkValidObject();
+
+      synchronized (ObjectTracker.this) {
+        isDead = true;
+        forgetNative(id);
+        trackedObjects.remove(id);
+      }
+    }
+
+    public float getCurrentCorrelation() {
+      checkValidObject();
+      return ObjectTracker.this.getCurrentCorrelation(id);
+    }
+
+    void registerInitialAppearance(final RectF position, final byte[] data) {
+      final RectF externalPosition = downscaleRect(position);
+      registerNewObjectWithAppearanceNative(id,
+            externalPosition.left, externalPosition.top,
+            externalPosition.right, externalPosition.bottom,
+            data);
+    }
+
+    synchronized void setPreviousPosition(final RectF position, final long timestamp) {
+      checkValidObject();
+      synchronized (ObjectTracker.this) {
+        if (lastExternalPositionTime > timestamp) {
+          LOGGER.w("Tried to use older position time!");
+          return;
+        }
+        final RectF externalPosition = downscaleRect(position);
+        lastExternalPositionTime = timestamp;
+
+        setPreviousPositionNative(id,
+            externalPosition.left, externalPosition.top,
+            externalPosition.right, externalPosition.bottom,
+            lastExternalPositionTime);
+
+        updateTrackedPosition();
+      }
+    }
+
+    void setCurrentPosition(final RectF position) {
+      checkValidObject();
+      final RectF downsampledPosition = downscaleRect(position);
+      synchronized (ObjectTracker.this) {
+        setCurrentPositionNative(id,
+            downsampledPosition.left, downsampledPosition.top,
+            downsampledPosition.right, downsampledPosition.bottom);
+      }
+    }
+
+    private synchronized void updateTrackedPosition() {
+      checkValidObject();
+
+      final float[] delta = new float[4];
+      getTrackedPositionNative(id, delta);
+      lastTrackedPosition = new RectF(delta[0], delta[1], delta[2], delta[3]);
+
+      visibleInLastFrame = isObjectVisible(id);
+    }
+
+    public synchronized RectF getTrackedPositionInPreviewFrame() {
+      checkValidObject();
+
+      if (lastTrackedPosition == null) {
+        return null;
+      }
+      return upscaleRect(lastTrackedPosition);
+    }
+
+    synchronized long getLastExternalPositionTime() {
+      return lastExternalPositionTime;
+    }
+
+    public synchronized boolean visibleInLastPreviewFrame() {
+      return visibleInLastFrame;
+    }
+
+    private void checkValidObject() {
+      if (isDead) {
+        throw new RuntimeException("TrackedObject already removed from tracking!");
+      } else if (ObjectTracker.this != instance) {
+        throw new RuntimeException("TrackedObject created with another ObjectTracker!");
+      }
+    }
+  }
+
+  public synchronized TrackedObject trackObject(
+      final RectF position, final long timestamp, final byte[] frameData) {
+    if (downsampledTimestamp != timestamp) {
+      ObjectTracker.downsampleImageNative(
+          frameWidth, frameHeight, rowStride, frameData, DOWNSAMPLE_FACTOR, downsampledFrame);
+      downsampledTimestamp = timestamp;
+    }
+    return new TrackedObject(position, timestamp, downsampledFrame);
+  }
+
+  public synchronized TrackedObject trackObject(final RectF position, final byte[] frameData) {
+    return new TrackedObject(position, lastTimestamp, frameData);
+  }
+
+  /** ********************* NATIVE CODE ************************************ */
+
+  /** This will contain an opaque pointer to the native ObjectTracker */
+  private long nativeObjectTracker;
+
+  private native void initNative(int imageWidth, int imageHeight, boolean alwaysTrack);
+
+  protected native void registerNewObjectWithAppearanceNative(
+      String objectId, float x1, float y1, float x2, float y2, byte[] data);
+
+  protected native void setPreviousPositionNative(
+      String objectId, float x1, float y1, float x2, float y2, long timestamp);
+
+  protected native void setCurrentPositionNative(
+      String objectId, float x1, float y1, float x2, float y2);
+
+  protected native void forgetNative(String key);
+
+  protected native String getModelIdNative(String key);
+
+  protected native boolean haveObject(String key);
+  protected native boolean isObjectVisible(String key);
+  protected native float getCurrentCorrelation(String key);
+
+  protected native float getMatchScore(String key);
+
+  protected native void getTrackedPositionNative(String key, float[] points);
+
+  protected native void nextFrameNative(
+      byte[] frameData, byte[] uvData, long timestamp, float[] frameAlignMatrix);
+
+  protected native void releaseMemoryNative();
+
+  protected native void getCurrentPositionNative(long timestamp,
+      final float positionX1, final float positionY1,
+      final float positionX2, final float positionY2,
+      final float[] delta);
+
+  protected native byte[] getKeypointsPacked(float scaleFactor);
+
+  protected native float[] getKeypointsNative(boolean onlyReturnCorrespondingKeypoints);
+
+  protected native void drawNative(int viewWidth, int viewHeight, float[] frameToCanvas);
+
+  protected static native void downsampleImageNative(
+      int width, int height, int rowStride, byte[] input, int factor, byte[] output);
+}
diff --git a/tensorflow/contrib/lite/examples/label_image/BUILD b/tensorflow/contrib/lite/examples/label_image/BUILD
index 959347b5491514ddc13af57ea6f7385a0d39e418..9322e186a280e932a2441ab16ac8579d9ab67ee2 100644
--- a/tensorflow/contrib/lite/examples/label_image/BUILD
+++ b/tensorflow/contrib/lite/examples/label_image/BUILD
@@ -69,15 +69,3 @@ cc_library(
 #         "//testing/base/public:gunit",
 #     ],
 # )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/g3doc/TFLite-Architecture.jpg b/tensorflow/contrib/lite/g3doc/TFLite-Architecture.jpg
deleted file mode 100644
index bc83946647c6a923a8a0bd3a041b42e4febe6a31..0000000000000000000000000000000000000000
Binary files a/tensorflow/contrib/lite/g3doc/TFLite-Architecture.jpg and /dev/null differ
diff --git a/tensorflow/contrib/lite/g3doc/models.md b/tensorflow/contrib/lite/g3doc/models.md
index 5b393140d61544e6d6e40d4b6ee1872b22cc84b2..d8134d5a00097b3eef24d5583d7f114c34e3bef2 100644
--- a/tensorflow/contrib/lite/g3doc/models.md
+++ b/tensorflow/contrib/lite/g3doc/models.md
@@ -1,7 +1,13 @@
-#List of Hosted Models
+# List of Hosted Models
 
-*   [Inception V3 2015](https://storage.googleapis.com/download.tensorflow.org/models/tflite/inception_v3_2015_2017_11_10.zip)
-*   [Inception V3 Slim 2016](https://storage.googleapis.com/download.tensorflow.org/models/tflite/inception_v3_slim_2016_android_2017_11_10.zip)
+*   [NASNet large](https://storage.googleapis.com/download.tensorflow.org/models/tflite/nasnet_large_2018_03_27.zip)
+*   [NASNet mobile](https://storage.googleapis.com/download.tensorflow.org/models/tflite/nasnet_mobile_2018_03_27.zip)
+*   [ResNet v2 101](https://storage.googleapis.com/download.tensorflow.org/models/tflite/resnet_v2_101_2018_03_27.zip)
+*   [ResNet v2 50](https://storage.googleapis.com/download.tensorflow.org/models/tflite/resnet_v2_50_2018_03_27.zip)
+*   [Inception ResNet v2](https://storage.googleapis.com/download.tensorflow.org/models/tflite/inception_resnet_v2_2018_03_27.zip)
+*   [Inception v4](https://storage.googleapis.com/download.tensorflow.org/models/tflite/inception_v4_2018_03_27.zip)
+*   [Inception v3 2015](https://storage.googleapis.com/download.tensorflow.org/models/tflite/inception_v3_2015_2017_11_10.zip)
+*   [Inception v3 Slim 2016](https://storage.googleapis.com/download.tensorflow.org/models/tflite/inception_v3_slim_2016_android_2017_11_10.zip)
 *   [Mobilenet 0.25 128 Float](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_0.25_128_float_2017_11_08.zip)
 *   [Mobilenet 0.25 160 Float](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_0.25_160_float_2017_11_08.zip)
 *   [Mobilenet 0.25 192 Float](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_0.25_192_float_2017_11_08.zip)
diff --git a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
index b1bbb7c67013acfb575cc1e9f9390ba191cbd08e..61ea5231e352f5e014f9200eccae69548574c034 100644
--- a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
+++ b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
@@ -30,13 +30,18 @@ quantized training is necessary before conversion.
 ## Data Format and Broadcasting
 
 At the moment TensorFlow Lite supports only TensorFlow's "NHWC" format, and
-broadcasting in operations like tf.add and tf.mul is generally not supported.
+broadcasting is only support in a limited number of ops (tf.add, tf.mul, tf.sub,
+and tf.div).
 
 ## Compatible Operations
 
 The following TensorFlow operations are usually mapped to their TensorFlow Lite
 counterparts:
 
+*   [tf.batch_to_space_nd](https://www.tensorflow.org/api_docs/python/tf/batch_to_space_nd) -
+    *as long as the input tensor is 4D (1 batch + 2 spatial + 1 other) and the
+    crops attribute is not used*
+*   [tf.exp](https://www.tensorflow.org/api_docs/python/tf/exp)
 *   [tf.matmul](https://www.tensorflow.org/api_docs/python/tf/matmul) - *as long
     as the second argument is constant and transposition is not used*
 *   [tf.nn.avg_pool](https://www.tensorflow.org/api_docs/python/tf/nn/avg_pool)
@@ -47,12 +52,30 @@ counterparts:
 *   [tf.nn.l2_normalize](https://www.tensorflow.org/api_docs/python/tf/nn/l2_normalize) -
     *as long as normalization is done along the last dimension*
 *   [tf.nn.local_response_normalization](https://www.tensorflow.org/api_docs/python/tf/nn/local_response_normalization)
+*   [tf.nn.log_softmax](https://www.tensorflow.org/api_docs/python/tf/nn/log_softmax) -
+    *as long as axis is not provided*
 *   [tf.nn.max_pool](https://www.tensorflow.org/api_docs/python/tf/nn/max_pool)
 *   [tf.nn.softmax](https://www.tensorflow.org/api_docs/python/tf/nn/softmax) -
     *as long as tensors are 2D and axis is the last dimension*
+*   [tf.nn.top_k](https://www.tensorflow.org/api_docs/python/tf/nn/top_k)
+*   [tf.pad](https://www.tensorflow.org/api_docs/python/tf/pad) - *as long as
+    mode and constant_values are not used*
+*   [tf.reduce_mean](https://www.tensorflow.org/api_docs/python/tf/reduce_mean) -
+    *as long as the reduction_indices attribute is not used*
 *   [tf.reshape](https://www.tensorflow.org/api_docs/python/tf/reshape)
 *   [tf.sigmoid](https://www.tensorflow.org/api_docs/python/tf/sigmoid)
+*   [tf.space_to_batch_nd](https://www.tensorflow.org/api_docs/python/tf/space_to_batch_nd) -
+    *as long as the input tensor is 4D (1 batch + 2 spatial + 1 other)*
 *   [tf.space_to_depth](https://www.tensorflow.org/api_docs/python/tf/space_to_depth)
+*   [tf.split](https://www.tensorflow.org/api_docs/python/tf/split) - *as long
+    as num is not provided and num_or_size_split contains number of splits as a
+    0D tensor*
+*   [tf.squeeze](https://www.tensorflow.org/api_docs/python/tf/squeeze) - *as
+    long as axis is not provided*
+*   [tf.strided_slice](https://www.tensorflow.org/api_docs/python/tf/strided_slice) -
+    *as long as ellipsis_mask and new_axis_mask are not used*
+*   [tf.transpose](https://www.tensorflow.org/versions/master/api_docs/python/tf/transpose) -
+    *as long as conjugate is not used*
 
 ## Straightforward Conversions, Constant-Folding and Fusing
 
@@ -91,7 +114,6 @@ Here is a list of TensorFlow operations that are usually removed from the graph:
 *   [tf.shape](https://www.tensorflow.org/api_docs/python/tf/shape)
 *   [tf.sqrt](https://www.tensorflow.org/api_docs/python/tf/sqrt)
 *   [tf.square](https://www.tensorflow.org/api_docs/python/tf/square)
-*   [tf.squeeze](https://www.tensorflow.org/api_docs/python/tf/squeeze)
 *   [tf.subtract](https://www.tensorflow.org/api_docs/python/tf/subtract)
 *   [tf.tile](https://www.tensorflow.org/api_docs/python/tf/tile)
 *   [tf.nn.batch_norm_with_global_normalization](https://www.tensorflow.org/api_docs/python/tf/nn/batch_norm_with_global_normalization)
@@ -109,17 +131,11 @@ fused.
 TensorFlow operation not listed above are likely unsupported. Notably, the
 following common ops are not supported at the moment:
 
-*   [tf.batch_to_space_nd](https://www.tensorflow.org/api_docs/python/tf/batch_to_space_nd)
 *   [tf.depth_to_space](https://www.tensorflow.org/api_docs/python/tf/depth_to_space)
 *   [tf.floor](https://www.tensorflow.org/api_docs/python/tf/floor)
 *   [tf.gather](https://www.tensorflow.org/api_docs/python/tf/gather)
 *   [tf.image.resize_bilinear](https://www.tensorflow.org/api_docs/python/tf/image/resize_bilinear)
-*   [tf.pad](https://www.tensorflow.org/api_docs/python/tf/pad)
-*   [tf.reduce_mean](https://www.tensorflow.org/api_docs/python/tf/reduce_mean)
 *   [tf.slice](https://www.tensorflow.org/api_docs/python/tf/slice)
-*   [tf.space_to_batch_nd](https://www.tensorflow.org/api_docs/python/tf/space_to_batch_nd)
-*   [tf.split](https://www.tensorflow.org/api_docs/python/tf/split)
-*   [tf.strided_slice](https://www.tensorflow.org/api_docs/python/tf/strided_slice)
 *   [tf.tanh](https://www.tensorflow.org/api_docs/python/tf/tanh)
 
 ## TensorFlow Lite Operations
@@ -160,6 +176,20 @@ Options {
 }
 ```
 
+**BATCH_TO_SPACE_ND**
+
+```
+Inputs {
+  0: 4D tensor
+  1: 1D tensor
+  2: 2D tensor
+}
+Outputs {
+  0: tensor rearranged using block_shape. See tf.batch_to_space_nd for
+     details.
+}
+```
+
 **CONCATENATION**
 
 ```
@@ -213,6 +243,17 @@ Options {
 }
 ```
 
+**EXP**
+
+```
+Inputs {
+  0: tensor
+}
+Outputs {
+  0: result of computing element-wise exponential of the input tensor
+}
+```
+
 **FULLY_CONNECTED**
 
 ```
@@ -289,6 +330,17 @@ Outputs {
 }
 ```
 
+**LOG_SOFTMAX**
+
+```
+Inputs {
+  0: tensor
+}
+Outputs {
+  0: tensor equivalent to logits - log(reduce_sum(exp(logits), -1))
+}
+```
+
 **MAX_POOL_2D**
 
 ```
@@ -322,6 +374,34 @@ Options {
 }
 ```
 
+**PAD**
+
+```
+Inputs {
+  0: tensor
+  1: tensor
+}
+Outputs {
+  0: tensor where additional values are added before and after the contents of
+     each dimension
+}
+```
+
+**MEAN (tf.reduce_mean)**
+
+```
+Inputs {
+  0: tensor
+  1: tensor
+}
+Outputs {
+  0: tensor containing the mean of the elements
+}
+Options {
+  keep_dims: whether to retain reduced dimensions
+}
+```
+
 **RELU**
 
 ```
@@ -399,6 +479,93 @@ Options {
 }
 ```
 
+**SPACE_TO_BATCH_ND**
+
+```
+Inputs {
+  0: 4D tensor
+  1: 1D tensor
+  2: 2D tensor
+}
+Outputs {
+  0: a tensor rearranged using block_shape. See tf.space_to_batch_nd for
+     details.
+}
+```
+
+**SPLIT**
+
+```
+Inputs {
+  0: 0D tensor (axis)
+  1: tensor (input)
+}
+Outputs {
+  0-N: subtensors built from the input tensors
+}
+Options {
+  num_splits: Specifies number of outputs
+}
+```
+
+**SQUEEZE**
+
+```
+Inputs {
+  0: tensor
+}
+Outputs {
+  0: tensor without any dimensions of size 1
+}
+Options {
+  squeeze_dims
+}
+```
+
+**STRIDED_SLICE**
+
+```
+Inputs {
+  0: tensor
+  1: 1D tensor
+  2: 1D tensor
+  3: 1D tensor
+}
+Outputs {
+  0: slice of the input tensor of the given size
+}
+Options {
+  begin_mask: mask for begin indicies
+  end_mask: mask for end indices
+  shrink_axis_mask: mask that indicates which dimensions to remove
+}
+```
+
+**TOP_K**
+
+```
+Inputs {
+  0: tensor
+  1: OD tensor
+}
+Outputs {
+  0: k largest element along each last dimensional slice
+  1: indicies of values within the last dimension of the input ensor
+}
+```
+
+**TRANSPOSE**
+
+```
+Inputs {
+  0: tensor
+  1: tensor
+}
+Outputs {
+  0: tensor permuted according to perm
+}
+```
+
 And these are TensorFlow Lite operations that are present but not ready for
 custom models yet:
 
diff --git a/tensorflow/contrib/lite/interpreter.cc b/tensorflow/contrib/lite/interpreter.cc
index cee57bba5ec8a35b0935c171774eb2ba62946afe..4575fe884dc07963df5f0a26c5fe6680d92e409c 100644
--- a/tensorflow/contrib/lite/interpreter.cc
+++ b/tensorflow/contrib/lite/interpreter.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/contrib/lite/context.h"
 #include "tensorflow/contrib/lite/error_reporter.h"
 #include "tensorflow/contrib/lite/graph_info.h"
+#include "tensorflow/contrib/lite/kernels/eigen_support.h"
 #include "tensorflow/contrib/lite/kernels/gemm_support.h"
 #include "tensorflow/contrib/lite/memory_planner.h"
 #include "tensorflow/contrib/lite/nnapi_delegate.h"
@@ -356,7 +357,11 @@ TfLiteStatus Interpreter::AllocateTensors() {
   }
 
   TF_LITE_ENSURE_STATUS(PrepareOpsAndTensors());
-  invokable_ = true;
+  if (state_ == kStateUninvokable) {
+    state_ = kStateInvokable;
+  }
+  TF_LITE_ENSURE(&context_, state_ == kStateInvokable ||
+                                state_ == kStateInvokableAndImmutable);
   return kTfLiteOk;
 }
 
@@ -364,7 +369,12 @@ TfLiteStatus Interpreter::AddNodeWithParameters(
     const std::vector<int>& inputs, const std::vector<int>& outputs,
     const char* init_data, size_t init_data_size, void* builtin_data,
     const TfLiteRegistration* registration, int* node_index) {
-  invokable_ = false;
+  if (state_ == kStateInvokableAndImmutable) {
+    ReportError(&context_,
+                "AddNodeWithParameters is disallowed when graph is immutable.");
+    return kTfLiteError;
+  }
+  state_ = kStateUninvokable;
 
   std::unique_ptr<void, decltype(free)*> builtin_data_deleter(builtin_data,
                                                               free);
@@ -420,12 +430,17 @@ TfLiteStatus Interpreter::AddNodeWithParameters(
 
 TfLiteStatus Interpreter::ResizeInputTensor(int tensor_index,
                                             const std::vector<int>& dims) {
+  if (state_ == kStateInvokableAndImmutable) {
+    ReportError(&context_,
+                "ResizeInputTensor is disallowed when graph is immutable.");
+    return kTfLiteError;
+  }
+  state_ = kStateUninvokable;
+
   // TODO(aselle): All bounds checks can be implemented as one-sided bounds
   // checks by casting to unsigned for efficiency. Profile before doing this.
-
   TF_LITE_ENSURE(&context_,
                  tensor_index < context_.tensors_size && tensor_index >= 0);
-  invokable_ = false;
   TfLiteIntArray* dims_lite = ConvertVectorToTfLiteIntArray(dims);
   return ResizeTensorImpl(&context_.tensors[tensor_index], dims_lite);
 }
@@ -490,7 +505,7 @@ TfLiteStatus Interpreter::Invoke() {
     ReportError(&context_, "Invoke called on model that is not consistent.");
     return kTfLiteError;
   }
-  if (!invokable_) {
+  if (state_ == kStateUninvokable) {
     ReportError(&context_, "Invoke called on model that is not ready.");
     return kTfLiteError;
   }
@@ -622,6 +637,13 @@ TfLiteStatus Interpreter::SetTensorParametersReadOnly(
     int tensor_index, TfLiteType type, const char* name, const int rank,
     const int* dims, TfLiteQuantizationParams quantization, const char* buffer,
     size_t bytes, const Allocation* allocation) {
+  if (state_ == kStateInvokableAndImmutable) {
+    ReportError(
+        &context_,
+        "SetTensorParametersReadOnly is disallowed when graph is immutable.");
+    return kTfLiteError;
+  }
+
   TF_LITE_ENSURE(&context_,
                  tensor_index < context_.tensors_size && tensor_index >= 0);
   // For most tensors we know exactly how much memory is necessary so we can
@@ -645,7 +667,7 @@ TfLiteStatus Interpreter::SetTensorParametersReadOnly(
     tensor.allocation_type = kTfLiteMmapRo;
     tensor.allocation = allocation;
   } else {
-    invokable_ = false;
+    state_ = kStateUninvokable;
     TfLiteTensorReset(type, name, ConvertArrayToTfLiteIntArray(rank, dims),
                       quantization, const_cast<char*>(buffer), bytes,
                       kTfLiteMmapRo, allocation, &tensor);
@@ -660,7 +682,12 @@ TfLiteStatus Interpreter::SetTensorParametersReadOnly(
 TfLiteStatus Interpreter::SetTensorParametersReadWrite(
     int tensor_index, TfLiteType type, const char* name, const int rank,
     const int* dims, TfLiteQuantizationParams quantization) {
-  invokable_ = false;
+  if (state_ == kStateInvokableAndImmutable) {
+    ReportError(
+        &context_,
+        "SetTensorParametersReadWrite is disallowed when graph is immutable.");
+    return kTfLiteError;
+  }
   TF_LITE_ENSURE(&context_,
                  tensor_index < context_.tensors_size && tensor_index >= 0);
   size_t required_bytes = 0;
@@ -736,21 +763,62 @@ void Interpreter::UseNNAPI(bool enable) {
 
 void Interpreter::SetNumThreads(int num_threads) {
   context_.recommended_num_threads = num_threads;
-}
 
-TfLiteStatus Interpreter::ModifyGraphWithDelegate(TfLiteDelegate* delegate) {
+  // TODO(ahentz): find a way to avoid this. It causes gemmlowp and eigen to
+  // be required in order to compile the framework.
+  gemm_support::SetNumThreads(&context_, num_threads);
+  eigen_support::SetNumThreads(&context_, num_threads);
+}
+
+TfLiteStatus Interpreter::ModifyGraphWithDelegate(TfLiteDelegate* delegate,
+                                                  bool allow_dynamic_tensors) {
+  if (!allow_dynamic_tensors) {
+    int last_execution_plan_index_prepared;
+    TF_LITE_ENSURE_OK(&context_, PrepareOpsStartingAt(
+                                     0, &last_execution_plan_index_prepared));
+
+    bool has_dynamic_tensors = true;
+    // Dynamic tensors exist if not all nodes can be prepared.
+    if (last_execution_plan_index_prepared + 1 == execution_plan_.size()) {
+      // If all the nodes can be prepared, check if the last node has dynamic
+      // tensors.
+      int node_index = execution_plan_[last_execution_plan_index_prepared];
+      TfLiteNode& node = nodes_and_registration_[node_index].first;
+      if (!HasDynamicTensor(context_, node.outputs)) {
+        has_dynamic_tensors = false;
+      }
+    }
+    if (has_dynamic_tensors) {
+      ReportError(&context_, "Attempting to resize a fixed-size tensor.");
+      return kTfLiteError;
+    }
+  }
+
   // TODO(aselle): Consider if it is worth storing pointers to delegates.
-  // Setup additional context interface
+  // Setup additional context interface.
   context_.GetNodeAndRegistration = GetNodeAndRegistration;
   context_.ReplaceSubgraphsWithDelegateKernels =
       ReplaceSubgraphsWithDelegateKernels;
   context_.GetExecutionPlan = GetExecutionPlan;
 
   TfLiteStatus status = delegate->Prepare(&context_, delegate);
+
   // Remove additional context info.
   SetForbiddenContextFunction(&context_.GetNodeAndRegistration);
   SetForbiddenContextFunction(&context_.ReplaceSubgraphsWithDelegateKernels);
   SetForbiddenContextFunction(&context_.GetExecutionPlan);
+
+  TF_LITE_ENSURE_OK(&context_, status);
+
+  if (!allow_dynamic_tensors) {
+    TF_LITE_ENSURE_OK(&context_, AllocateTensors());
+    TF_LITE_ENSURE(&context_, state_ == kStateInvokable ||
+                                  state_ == kStateInvokableAndImmutable);
+    // After using a delegate which doesn't support dynamic tensors, make the
+    // entire graph immutable.
+    state_ = kStateInvokableAndImmutable;
+  }
+
   return status;
 }
 
diff --git a/tensorflow/contrib/lite/interpreter.h b/tensorflow/contrib/lite/interpreter.h
index b481ee0891ff8e69d186b995c2b75417d1e38c43..77db17878318276c6cf5067274a3af3be262c8e1 100644
--- a/tensorflow/contrib/lite/interpreter.h
+++ b/tensorflow/contrib/lite/interpreter.h
@@ -24,7 +24,6 @@ limitations under the License.
 #include "tensorflow/contrib/lite/context.h"
 #include "tensorflow/contrib/lite/error_reporter.h"
 #include "tensorflow/contrib/lite/memory_planner.h"
-#include "tensorflow/contrib/lite/schema/schema_generated.h"
 
 namespace tflite {
 
@@ -273,13 +272,9 @@ class Interpreter {
   // Allow a delegate to look at the graph and modify the graph to handle
   // parts of the graph themselves. After this is called, the graph may
   // contain new nodes that replace 1 more nodes.
-  TfLiteStatus ModifyGraphWithDelegate(TfLiteDelegate* delegate);
-
-  // WARNING: This is a deprecated interface and will be removed as soon as
-  // possible.  Please do not use it.
-  // TODO(impjdi): Remove this interface after resolving dependencies.
-  void set_model(const Model* model) { model_ = const_cast<Model*>(model); }
-  Model* model() const { return model_; }
+  // WARNING: This is an experimental API and subject to change.
+  TfLiteStatus ModifyGraphWithDelegate(TfLiteDelegate* delegate,
+                                       bool allow_dynamic_tensors = false);
 
   // Ensure the data in `tensor.data` is readable. In case delegate is used,
   // it might require to copy the data from delegate buffer to raw memory.
@@ -454,6 +449,20 @@ class Interpreter {
     }
   }
 
+  // The state of the Interpreter.
+  enum State {
+    // The interpreter isn't ready to be invoked.
+    // `AllocateTensor` need to be called to enter an invokable state.
+    kStateUninvokable = 0,
+    // The interpreter is ready to be invoked.
+    kStateInvokable,
+    // The interpreter is ready to be invoked, and graph can't be further
+    // modified. The interpreter will enter this state when calling
+    // `ModifyGraphWithDelegate` with `allow_dynamic_tensors=false`.
+    kStateInvokableAndImmutable,
+  };
+  State state_ = kStateUninvokable;
+
   // A pure C data structure used to communicate with the pure C plugin
   // interface. To avoid copying tensor metadata, this is also the definitive
   // structure to store tensors.
@@ -469,10 +478,6 @@ class Interpreter {
   // the tensor array.
   bool consistent_ = true;
 
-  // Whether the model is safe to invoke (if any errors occurred this
-  // will be false).
-  bool invokable_ = false;
-
   // Array of indices representing the tensors that are inputs to the
   // interpreter.
   std::vector<int> inputs_;
@@ -509,11 +514,6 @@ class Interpreter {
   std::unique_ptr<NNAPIDelegate> nnapi_delegate_;
 
   std::unique_ptr<MemoryPlanner> memory_planner_;
-
-  // WARNING: This is a deprecated interface and will be removed as soon as
-  // possible.  Please do not use it.
-  // TODO(impjdi): Remove this interface after resolving dependencies.
-  Model* model_ = nullptr;
 };
 
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/interpreter_test.cc b/tensorflow/contrib/lite/interpreter_test.cc
index 72d4acedbe72beea276c7c8fc1f7ddce01796d41..131e088079857af34478645b7f1559364d03a493 100644
--- a/tensorflow/contrib/lite/interpreter_test.cc
+++ b/tensorflow/contrib/lite/interpreter_test.cc
@@ -17,9 +17,11 @@ limitations under the License.
 #include <gtest/gtest.h>
 #include "tensorflow/contrib/lite/error_reporter.h"
 #include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
 #include "tensorflow/contrib/lite/schema/schema_generated.h"
 #include "tensorflow/contrib/lite/string_util.h"
 #include "tensorflow/contrib/lite/testing/util.h"
+
 namespace tflite {
 namespace {
 
@@ -439,12 +441,12 @@ TEST(BasicInterpreter, ThreeStepAllocate) {
   // String-in String-out node.
   TfLiteRegistration reg_copy = {nullptr, nullptr, nullptr, nullptr};
   reg_copy.invoke = [](TfLiteContext* context, TfLiteNode* node) {
-    TfLiteTensor* a0 = &context->tensors[node->inputs->data[0]];
-    TfLiteTensor* a1 = &context->tensors[node->outputs->data[0]];
+    TfLiteTensor* input = &context->tensors[node->inputs->data[0]];
+    TfLiteTensor* output = &context->tensors[node->outputs->data[0]];
     DynamicBuffer buf;
-    StringRef str_ref = GetString(a0, 0);
+    StringRef str_ref = GetString(input, 0);
     buf.AddString(str_ref);
-    buf.WriteToTensor(a1);
+    buf.WriteToTensor(output);
     return kTfLiteOk;
   };
 
@@ -778,13 +780,17 @@ TfLiteRegistration AddOpRegistration() {
 
   reg.prepare = [](TfLiteContext* context, TfLiteNode* node) {
     // Set output size to input size
-    TfLiteTensor* tensor0 = &context->tensors[node->inputs->data[0]];
-    TfLiteTensor* tensor1 = &context->tensors[node->inputs->data[1]];
-    TfLiteTensor* tensor2 = &context->tensors[node->outputs->data[0]];
-    TfLiteIntArray* newSize = TfLiteIntArrayCopy(tensor0->dims);
-    TfLiteIntArray* newSizeOther = TfLiteIntArrayCopy(tensor1->dims);
-    TF_LITE_ENSURE_EQ(context, newSize->size, newSizeOther->size);
-    TF_LITE_ENSURE_STATUS(context->ResizeTensor(context, tensor2, newSize));
+    TfLiteTensor* input1 = &context->tensors[node->inputs->data[0]];
+    TfLiteTensor* input2 = &context->tensors[node->inputs->data[1]];
+    TfLiteTensor* output = &context->tensors[node->outputs->data[0]];
+
+    TF_LITE_ENSURE_EQ(context, input1->dims->size, input2->dims->size);
+    for (int i = 0; i < input1->dims->size; ++i) {
+      TF_LITE_ENSURE_EQ(context, input1->dims->data[i], input2->dims->data[i]);
+    }
+
+    TF_LITE_ENSURE_STATUS(context->ResizeTensor(
+        context, output, TfLiteIntArrayCopy(input1->dims)));
     return kTfLiteOk;
   };
 
@@ -818,6 +824,8 @@ class TestDelegate : public ::testing::Test {
                                                quant);
     interpreter_->SetTensorParametersReadWrite(3, kTfLiteFloat32, "", {3},
                                                quant);
+    interpreter_->SetTensorParametersReadWrite(4, kTfLiteFloat32, "", {3},
+                                               quant);
     TfLiteRegistration reg = AddOpRegistration();
     interpreter_->AddNodeWithParameters({0, 0}, {2}, nullptr, 0, nullptr, &reg);
     interpreter_->AddNodeWithParameters({1, 1}, {3}, nullptr, 0, nullptr, &reg);
@@ -916,7 +924,6 @@ class TestDelegate : public ::testing::Test {
 };
 
 TEST_F(TestDelegate, BasicDelegate) {
-  interpreter_->Invoke();
   delegate_ = std::unique_ptr<SimpleDelegate>(new SimpleDelegate({0, 1, 2}));
   interpreter_->ModifyGraphWithDelegate(delegate_->get_tf_lite_delegate());
 
@@ -944,7 +951,6 @@ TEST_F(TestDelegate, BasicDelegate) {
 }
 
 TEST_F(TestDelegate, ComplexDeligate) {
-  interpreter_->Invoke();
   delegate_ = std::unique_ptr<SimpleDelegate>(new SimpleDelegate({1, 2}));
   interpreter_->ModifyGraphWithDelegate(delegate_->get_tf_lite_delegate());
 
@@ -959,7 +965,6 @@ TEST_F(TestDelegate, ComplexDeligate) {
 }
 
 TEST_F(TestDelegate, SetBufferHandleToInput) {
-  interpreter_->Invoke();
   delegate_ = std::unique_ptr<SimpleDelegate>(new SimpleDelegate({0, 1, 2}));
   TfLiteDelegate* delegate = delegate_->get_tf_lite_delegate();
   interpreter_->ModifyGraphWithDelegate(delegate);
@@ -978,7 +983,6 @@ TEST_F(TestDelegate, SetBufferHandleToInput) {
 }
 
 TEST_F(TestDelegate, SetBufferHandleToOutput) {
-  interpreter_->Invoke();
   delegate_ = std::unique_ptr<SimpleDelegate>(new SimpleDelegate({0, 1, 2}));
   TfLiteDelegate* delegate = delegate_->get_tf_lite_delegate();
   interpreter_->ModifyGraphWithDelegate(delegate);
@@ -1002,7 +1006,7 @@ TEST_F(TestDelegate, SetInvalidHandleToTensor) {
   interpreter_->Invoke();
   delegate_ = std::unique_ptr<SimpleDelegate>(new SimpleDelegate({0, 1, 2}));
   TfLiteDelegate* delegate = delegate_->get_tf_lite_delegate();
-  interpreter_->ModifyGraphWithDelegate(delegate);
+  interpreter_->ModifyGraphWithDelegate(delegate, true);
 
   SimpleDelegate another_simple_delegate({0, 1, 2});
 
@@ -1023,6 +1027,88 @@ TEST_F(TestDelegate, SetInvalidHandleToTensor) {
   EXPECT_EQ(tensor->buffer_handle, kTfLiteNullBufferHandle);
 }
 
+TEST_F(TestDelegate, ResizeInputWithNonDynamicDelegateShouldFail) {
+  delegate_ = std::unique_ptr<SimpleDelegate>(new SimpleDelegate({0, 1, 2}));
+  ASSERT_EQ(interpreter_->ResizeInputTensor(0, {1, 2}), kTfLiteOk);
+  ASSERT_EQ(interpreter_->ResizeInputTensor(1, {1, 2}), kTfLiteOk);
+  ASSERT_EQ(
+      interpreter_->ModifyGraphWithDelegate(delegate_->get_tf_lite_delegate()),
+      kTfLiteOk);
+  ASSERT_EQ(interpreter_->ResizeInputTensor(0, {1, 2}), kTfLiteError);
+}
+
+class TestDelegateWithDynamicTensors : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    interpreter_.reset(new Interpreter);
+
+    interpreter_->AddTensors(2);
+    interpreter_->SetInputs({0});
+    interpreter_->SetOutputs({1});
+    TfLiteQuantizationParams quant;
+    interpreter_->SetTensorParametersReadWrite(0, kTfLiteFloat32, "", {3},
+                                               quant);
+    interpreter_->SetTensorParametersReadWrite(1, kTfLiteFloat32, "", {3},
+                                               quant);
+    TfLiteRegistration reg = DynamicCopyOpRegistration();
+    interpreter_->AddNodeWithParameters({0}, {1}, nullptr, 0, nullptr, &reg);
+
+    delegate_.Prepare = [](TfLiteContext* context,
+                           TfLiteDelegate* delegate) -> TfLiteStatus {
+      // In this test, the delegate replaces all the nodes if this function is
+      // called.
+      TfLiteIntArray* execution_plan;
+      TF_LITE_ENSURE_STATUS(
+          context->GetExecutionPlan(context, &execution_plan));
+      context->ReplaceSubgraphsWithDelegateKernels(
+          context, DelegateRegistration(), execution_plan, delegate);
+      return kTfLiteOk;
+    };
+  }
+
+  static TfLiteRegistration DynamicCopyOpRegistration() {
+    TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr};
+
+    reg.prepare = [](TfLiteContext* context, TfLiteNode* node) {
+      TfLiteTensor* output = &context->tensors[node->outputs->data[0]];
+      SetTensorToDynamic(output);
+      return kTfLiteOk;
+    };
+
+    reg.invoke = [](TfLiteContext* context, TfLiteNode* node) {
+      // Not implemented since this isn't required in testing.
+      return kTfLiteOk;
+    };
+    return reg;
+  }
+
+  static TfLiteRegistration DelegateRegistration() {
+    TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr};
+    return reg;
+  }
+
+  std::unique_ptr<Interpreter> interpreter_;
+  TfLiteDelegate delegate_;
+};
+
+TEST_F(TestDelegateWithDynamicTensors, DisallowDynamicTensors) {
+  interpreter_->ModifyGraphWithDelegate(&delegate_, false);
+
+  ASSERT_EQ(interpreter_->execution_plan().size(), 1);
+  // The interpreter should not call delegate's `Prepare` when dynamic tensors
+  // exist. So the node ID isn't changed.
+  ASSERT_EQ(interpreter_->execution_plan()[0], 0);
+}
+
+TEST_F(TestDelegateWithDynamicTensors, AllowDynamicTensors) {
+  interpreter_->ModifyGraphWithDelegate(&delegate_, true);
+
+  ASSERT_EQ(interpreter_->execution_plan().size(), 1);
+  // The node should be replaced because dynamic tensors are allowed. Therefore
+  // only node ID in the execution plan is changed from 0 to 1.
+  ASSERT_EQ(interpreter_->execution_plan()[0], 1);
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/ios_makefile.inc b/tensorflow/contrib/lite/ios_makefile.inc
index fc6594c3a04ba6aabba99bb631f85737baf389f1..079320586ffd01fc77818a81e0c5962f1d28c1f1 100644
--- a/tensorflow/contrib/lite/ios_makefile.inc
+++ b/tensorflow/contrib/lite/ios_makefile.inc
@@ -31,9 +31,6 @@ ifeq ($(TARGET), IOS)
 		${IPHONEOS_SYSROOT} \
 		-arch $(IOS_ARCH) \
 		-O3
-	ifeq ($(IOS_ARCH), x86_64)
-		CXXFLAGS += -msse4.1
-	endif
 	CCFLAGS += -miphoneos-version-min=$(MIN_SDK_VERSION) \
 		-fembed-bitcode \
 		-mno-thumb \
diff --git a/tensorflow/contrib/lite/java/BUILD b/tensorflow/contrib/lite/java/BUILD
index f52d6ba6c5390e631d29e75f833aa4dd5bba1a68..7f7a2632dd7858deb861ebc66b3348c9eb32e090 100644
--- a/tensorflow/contrib/lite/java/BUILD
+++ b/tensorflow/contrib/lite/java/BUILD
@@ -167,15 +167,3 @@ tflite_jni_binary(
         "//tensorflow/contrib/lite/java/src/main/native",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/BUILD b/tensorflow/contrib/lite/java/demo/app/src/main/BUILD
index 654fa9d6d2799fc3cafa3e0e042cb2a5746bf2c5..d6fbef9cc938993b283103984307ab51e609dd6e 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/BUILD
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/BUILD
@@ -6,7 +6,7 @@ android_binary(
     name = "TfLiteCameraDemo",
     srcs = glob(["java/**/*.java"]),
     assets = [
-        "@tflite_mobilenet//:labels.txt",
+        "//tensorflow/contrib/lite/java/demo/app/src/main/assets:labels_mobilenet_quant_v1_224.txt",
         "@tflite_mobilenet//:mobilenet_quant_v1_224.tflite",
     ],
     assets_dir = "",
@@ -27,15 +27,3 @@ android_binary(
         "@androidsdk//com.android.support:support-v4-25.2.0",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/assets/BUILD b/tensorflow/contrib/lite/java/demo/app/src/main/assets/BUILD
index dd0cd6c98ff878e9c41875cab74c12191cadb173..ce68160b68efd446c1dfa4c70c37aaa4048e4f2f 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/assets/BUILD
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/assets/BUILD
@@ -10,15 +10,3 @@ exports_files(
         ],
     ),
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
index 9b9fdffab557060f0211a0ce361b002cc7d03956..300786c3ca01b12a46f7f9a6fe8fd720f97a79f4 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
@@ -299,7 +299,7 @@ public class Camera2BasicFragment extends Fragment
       // create either a new ImageClassifierQuantizedMobileNet or an ImageClassifierFloatInception
       classifier = new ImageClassifierQuantizedMobileNet(getActivity());
     } catch (IOException e) {
-      Log.e(TAG, "Failed to initialize an image classifier.");
+      Log.e(TAG, "Failed to initialize an image classifier.", e);
     }
     startBackgroundThread();
   }
@@ -433,7 +433,7 @@ public class Camera2BasicFragment extends Fragment
         return;
       }
     } catch (CameraAccessException e) {
-      e.printStackTrace();
+      Log.e(TAG, "Failed to access Camera", e);
     } catch (NullPointerException e) {
       // Currently an NPE is thrown when the Camera2API is used but not supported on the
       // device this code runs.
@@ -478,7 +478,7 @@ public class Camera2BasicFragment extends Fragment
       }
       manager.openCamera(cameraId, stateCallback, backgroundHandler);
     } catch (CameraAccessException e) {
-      e.printStackTrace();
+      Log.e(TAG, "Failed to open Camera", e);
     } catch (InterruptedException e) {
       throw new RuntimeException("Interrupted while trying to lock camera opening.", e);
     }
@@ -545,7 +545,7 @@ public class Camera2BasicFragment extends Fragment
         runClassifier = false;
       }
     } catch (InterruptedException e) {
-      e.printStackTrace();
+      Log.e(TAG, "Interrupted when stopping background thread", e);
     }
   }
 
@@ -604,7 +604,7 @@ public class Camera2BasicFragment extends Fragment
                 captureSession.setRepeatingRequest(
                     previewRequest, captureCallback, backgroundHandler);
               } catch (CameraAccessException e) {
-                e.printStackTrace();
+                Log.e(TAG, "Failed to set up config to capture Camera", e);
               }
             }
 
@@ -615,7 +615,7 @@ public class Camera2BasicFragment extends Fragment
           },
           null);
     } catch (CameraAccessException e) {
-      e.printStackTrace();
+      Log.e(TAG, "Failed to preview Camera", e);
     }
   }
 
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierQuantizedMobileNet.java b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierQuantizedMobileNet.java
index c533de7927050d3844089e4d74820611a340e573..e164ac75543ebab12e6b1c057c4ed487eb9accdf 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierQuantizedMobileNet.java
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierQuantizedMobileNet.java
@@ -16,7 +16,6 @@ limitations under the License.
 package com.example.android.tflitecamerademo;
 
 import android.app.Activity;
-
 import java.io.IOException;
 
 /**
@@ -49,7 +48,7 @@ public class ImageClassifierQuantizedMobileNet extends ImageClassifier {
 
   @Override
   protected String getLabelPath() {
-    return "labels.txt";
+    return "labels_mobilenet_quant_v1_224.txt";
   }
 
   @Override
diff --git a/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicBenchmarker.java b/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicBenchmarker.java
new file mode 100644
index 0000000000000000000000000000000000000000..d0102883e6b41f5c33a0061c5fd53b5f69b8ab54
--- /dev/null
+++ b/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicBenchmarker.java
@@ -0,0 +1,197 @@
+/*Copyright 2018 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+package org.tensorflow.ovic;
+
+import android.graphics.Bitmap;
+import android.os.SystemClock;
+import android.util.Log;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.MappedByteBuffer;
+
+/**
+ * Class that benchmarks image classifier models.
+ *
+ * <p>===================== General workflow =======================
+ *
+ * <pre>{@code
+ * benchmarker = new OvicBenchmarker();
+ * benchmarker.getReadyToTest(labelInputStream, model);
+ * while (!benchmarker.shouldStop()) {
+ *   Bitmap bitmap = ...
+ *   benchmarker.doTestIteration(bitmap);
+ * }
+ * }</pre>
+ */
+public class OvicBenchmarker {
+  /** Tag for the {@link Log}. */
+  private static final String TAG = "OvicBenchmarker";
+
+  /** Evaluation transformation parameters. */
+  private static final float CENTRAL_FRACTION = 0.875f;
+
+  /** Dimensions of inputs. */
+  private static final int DIM_BATCH_SIZE = 1;
+  private static final int DIM_PIXEL_SIZE = 3;
+  private int imgHeight = 224;
+  private int imgWidth = 224;
+
+  /* Preallocated buffers for storing image data in. */
+  private int[] intValues = null;
+
+  /** A ByteBuffer to hold image data, to be feed into classifier as inputs. */
+  private ByteBuffer imgData = null;
+
+  private OvicClassifier classifier;
+
+  /** Total runtime in ms. */
+  private double totalRuntime = 0.0;
+  /** Total allowed runtime in ms. */
+  private double wallTime = 20000 * 30.0;
+
+  private Boolean benchmarkStarted = null;
+
+  /**
+   * Initializes an {@link OvicBenchmarker}
+   *
+   * @param wallTime: a double number specifying the total amount of time to benchmark.
+   */
+  public OvicBenchmarker(double wallTime) {
+    benchmarkStarted = false;
+    totalRuntime = 0.0;
+    this.wallTime = wallTime;
+  }
+
+  /** Check whether the benchmarker should stop. */
+  public Boolean shouldStop() {
+    if (totalRuntime >= wallTime) {
+      Log.e(
+          TAG,
+          "Total runtime "
+              + Double.toString(totalRuntime)
+              + " exceeded walltime "
+              + Double.toString(wallTime));
+      return true;
+    }
+    return false;
+  }
+
+  /** Check whether the benchmarker is ready to start classifying images. */
+  public Boolean readyToTest() {
+    return (classifier != null);
+  }
+
+  /**
+   * Getting the benchmarker ready for classifying images.
+   *
+   * @param labelInputStream: an {@link InputStream} specifying where the list of labels should be
+   *     read from.
+   * @param model: a {@link MappedByteBuffer} model to benchmark.
+   */
+  public void getReadyToTest(InputStream labelInputStream, MappedByteBuffer model) {
+    try {
+      Log.i(TAG, "Creating classifier.");
+      classifier = new OvicClassifier(labelInputStream, model);
+      int [] inputDims = classifier.getInputDims();
+      imgHeight = inputDims[1];
+      imgWidth = inputDims[2];
+      // Only accept QUANTIZED_UINT8 input.
+      imgData = ByteBuffer.allocateDirect(DIM_BATCH_SIZE * imgHeight * imgWidth * DIM_PIXEL_SIZE);
+      imgData.order(ByteOrder.nativeOrder());
+      intValues = new int[imgHeight * imgWidth];
+    } catch (Exception e) {
+        Log.e(TAG, e.getMessage());
+        Log.e(TAG, "Failed to initialize ImageNet classifier for the benchmarker.");
+    }
+  }
+
+  /** Return how many classes are predicted per image. */
+  public int getNumPredictions() {
+    return classifier.getNumPredictions();
+  }
+
+  /**
+   * Perform test on a single bitmap image.
+   *
+   * @param bitmap: a {@link Bitmap} image to classify.
+   */
+  public OvicSingleImageResult doTestIteration(Bitmap bitmap)
+      throws IOException, InterruptedException {
+    if (shouldStop() || !readyToTest()) {
+      return null;
+    }
+    OvicSingleImageResult iterResult = null;
+    try {
+      Log.i(TAG, "Converting bitmap.");
+      convertBitmapToInput(bitmap);
+      Log.i(TAG, "Classifying image.");
+      iterResult = classifier.classifyByteBuffer(imgData);
+    } catch (RuntimeException e) {
+      Log.e(TAG, e.getMessage());
+      Log.e(TAG, "Failed to classify image.");
+    }
+    if (iterResult == null || iterResult.latency == null) {
+      throw new RuntimeException("Classification result or timing is invalid.");
+    }
+    Log.d(TAG, "Native inference latency: " + iterResult.latency);
+    Log.i(TAG, iterResult.toString());
+
+    if (!benchmarkStarted) {  // Skip the first image to discount warming-up time.
+      benchmarkStarted = true;
+    } else {
+      totalRuntime += (double) iterResult.latency;
+    }
+    return iterResult;
+  }
+
+  /**
+   * Writes Image data into a {@link ByteBuffer}.
+   *
+   * @param bitmap: a {@link Bitmap} source image.
+   */
+  private void convertBitmapToInput(Bitmap bitmap) throws RuntimeException {
+    if (imgData == null) {
+      throw new RuntimeException("Benchmarker is not yet ready to test.");
+    }
+    imgData.rewind();
+    // Perform transformations corresponding to evaluation mode.
+    float width = (float) bitmap.getWidth();
+    float height = (float) bitmap.getHeight();
+    int stWidth = Math.round((width - width * CENTRAL_FRACTION) / 2);
+    int stHeight = Math.round((height - height * CENTRAL_FRACTION) / 2);
+    int newWidth = Math.round(width - stWidth * 2);
+    int newHeight = Math.round(height - stHeight * 2);
+    bitmap = Bitmap.createBitmap(bitmap, stWidth, stHeight, newWidth, newHeight);
+    bitmap = Bitmap.createScaledBitmap(bitmap, imgWidth, imgHeight, true);
+    bitmap.getPixels(intValues, 0, bitmap.getWidth(), 0, 0, bitmap.getWidth(), bitmap.getHeight());
+
+    // Convert the image to ByteBuffer.
+    int pixel = 0;
+    long startTime = SystemClock.uptimeMillis();
+
+    for (int i = 0; i < imgHeight; ++i) {
+      for (int j = 0; j < imgWidth; ++j) {
+        final int val = intValues[pixel++];
+        imgData.put((byte) ((val >> 16) & 0xFF));
+        imgData.put((byte) ((val >> 8) & 0xFF));
+        imgData.put((byte) (val & 0xFF));
+      }
+    }
+    long endTime = SystemClock.uptimeMillis();
+    Log.d(TAG, "Timecost to put values into ByteBuffer: " + Long.toString(endTime - startTime));
+  }
+}
diff --git a/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicClassifier.java b/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicClassifier.java
new file mode 100644
index 0000000000000000000000000000000000000000..b2dfd8f2e710324f6c11a3098b858ffee8b28b3c
--- /dev/null
+++ b/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicClassifier.java
@@ -0,0 +1,209 @@
+/*Copyright 2018 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+package org.tensorflow.ovic;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.ByteBuffer;
+import java.nio.MappedByteBuffer;
+import java.nio.charset.StandardCharsets;
+import java.util.AbstractMap;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Map;
+import java.util.PriorityQueue;
+import org.tensorflow.lite.Interpreter;
+import org.tensorflow.lite.TestHelper;
+
+/** Benchmark ImageNet Classifier with Tensorflow Lite. */
+public class OvicClassifier {
+
+  /** Tag for the {@link Log}. */
+  private static final String TAG = "OvicClassifier";
+
+  /** Number of results to show (i.e. the "K" in top-K predictions). */
+  private static final int RESULTS_TO_SHOW = 5;
+
+  /** An instance of the driver class to run model inference with Tensorflow Lite. */
+  private Interpreter tflite;
+
+  /** Labels corresponding to the output of the vision model. */
+  private List<String> labelList;
+
+  /** An array to hold inference results, to be feed into Tensorflow Lite as outputs. */
+  private byte[][] inferenceOutputArray = null;
+  /** An array to hold final prediction probabilities. */
+  private float[][] labelProbArray = null;
+
+  /** Input resultion. */
+  private int[] inputDims = null;
+  /** Whether the model runs as float or quantized. */
+  private Boolean outputIsFloat = null;
+
+  private PriorityQueue<Map.Entry<Integer, Float>> sortedLabels =
+      new PriorityQueue<>(
+          RESULTS_TO_SHOW,
+          new Comparator<Map.Entry<Integer, Float>>() {
+            @Override
+            public int compare(Map.Entry<Integer, Float> o1, Map.Entry<Integer, Float> o2) {
+              return (o1.getValue()).compareTo(o2.getValue());
+            }
+          });
+
+  /** Initializes an {@code OvicClassifier}. */
+  OvicClassifier(InputStream labelInputStream, MappedByteBuffer model)
+      throws IOException, RuntimeException {
+    if (model == null) {
+      throw new RuntimeException("Input model is empty.");
+    }
+    labelList = loadLabelList(labelInputStream);
+    // OVIC uses one thread for CPU inference.
+    tflite = new Interpreter(model, 1);
+    inputDims = TestHelper.getInputDims(tflite, 0);
+    if (inputDims.length != 4) {
+      throw new RuntimeException("The model's input dimensions must be 4 (BWHC).");
+    }
+    if (inputDims[0] != 1) {
+      throw new RuntimeException("The model must have a batch size of 1, got "
+          + inputDims[0] + " instead.");
+    }
+    if (inputDims[3] != 3) {
+      throw new RuntimeException("The model must have three color channels, got "
+          + inputDims[3] + " instead.");
+    }
+    int minSide = Math.min(inputDims[1], inputDims[2]);
+    int maxSide = Math.max(inputDims[1], inputDims[2]);
+    if (minSide <= 0 || maxSide > 1000) {
+      throw new RuntimeException("The model's resolution must be between (0, 1000].");
+    }
+    String outputDataType = TestHelper.getOutputDataType(tflite, 0);
+    if (outputDataType.equals("float")) {
+      outputIsFloat = true;
+    } else if (outputDataType.equals("byte")) {
+      outputIsFloat = false;
+    } else {
+      throw new RuntimeException("Cannot process output type: " + outputDataType);
+    }
+    inferenceOutputArray = new byte[1][labelList.size()];
+    labelProbArray = new float[1][labelList.size()];
+  }
+
+  /** Classifies a {@link ByteBuffer} image. */
+  // @throws RuntimeException if model is uninitialized.
+  OvicSingleImageResult classifyByteBuffer(ByteBuffer imgData) throws RuntimeException {
+    if (tflite == null) {
+      throw new RuntimeException(TAG + ": ImageNet classifier has not been initialized; Failed.");
+    }
+    if (outputIsFloat == null) {
+      throw new RuntimeException(TAG + ": Classifier output type has not been resolved.");
+    }
+    if (outputIsFloat) {
+      tflite.run(imgData, labelProbArray);
+    } else {
+      tflite.run(imgData, inferenceOutputArray);
+      /** Convert results to float */
+      for (int i = 0; i < inferenceOutputArray[0].length; i++) {
+        labelProbArray[0][i] = (inferenceOutputArray[0][i] & 0xff) / 255.0f;
+      }
+    }
+    OvicSingleImageResult iterResult = computeTopKLabels();
+    iterResult.latency = getLastNativeInferenceLatencyMilliseconds();
+    return iterResult;
+  }
+
+  /** Return the probability array of all classes. */
+  public float[][] getlabelProbArray() {
+    return labelProbArray;
+  }
+
+  /** Return the number of top labels predicted by the classifier. */
+  public int getNumPredictions() {
+    return RESULTS_TO_SHOW;
+  }
+
+  /** Return the four dimensions of the input image. */
+  public int[] getInputDims() {
+    return inputDims;
+  }
+
+  /*
+   * Get native inference latency of last image classification run.
+   *  @throws RuntimeException if model is uninitialized.
+   */
+  public Long getLastNativeInferenceLatencyMilliseconds() {
+    if (tflite == null) {
+      throw new RuntimeException(TAG + ": ImageNet classifier has not been initialized; Failed.");
+    }
+    Long latency = tflite.getLastNativeInferenceDurationNanoseconds();
+    return (latency == null) ? null : (Long) (latency / 1000000);
+  }
+
+  /** Closes tflite to release resources. */
+  public void close() {
+    tflite.close();
+    tflite = null;
+  }
+
+  /** Reads label list from Assets. */
+  private static List<String> loadLabelList(InputStream labelInputStream) throws IOException {
+    List<String> labelList = new ArrayList<String>();
+    try (BufferedReader reader =
+        new BufferedReader(new InputStreamReader(labelInputStream, StandardCharsets.UTF_8))) {
+      String line;
+      while ((line = reader.readLine()) != null) {
+        labelList.add(line);
+      }
+    }
+    return labelList;
+  }
+
+  /** Computes top-K labels. */
+  private OvicSingleImageResult computeTopKLabels() {
+    if (labelList == null) {
+      throw new RuntimeException("Label file has not been loaded.");
+    }
+    for (int i = 0; i < labelList.size(); ++i) {
+      sortedLabels.add(new AbstractMap.SimpleEntry<>(i, labelProbArray[0][i]));
+      if (sortedLabels.size() > RESULTS_TO_SHOW) {
+        sortedLabels.poll();
+      }
+    }
+    OvicSingleImageResult singleImageResult = new OvicSingleImageResult();
+    if (sortedLabels.size() != RESULTS_TO_SHOW) {
+      throw new RuntimeException(
+          "Number of returned labels does not match requirement: "
+              + sortedLabels.size()
+              + " returned, but "
+              + RESULTS_TO_SHOW
+              + " required.");
+    }
+    for (int i = 0; i < RESULTS_TO_SHOW; ++i) {
+      Map.Entry<Integer, Float> label = sortedLabels.poll();
+      // ImageNet model prediction indices are 0-based.
+      singleImageResult.topKIndices.add(label.getKey());
+      singleImageResult.topKClasses.add(labelList.get(label.getKey()));
+      singleImageResult.topKProbs.add(label.getValue());
+    }
+    // Labels with lowest probability are returned first, hence need to reverse them.
+    Collections.reverse(singleImageResult.topKIndices);
+    Collections.reverse(singleImageResult.topKClasses);
+    Collections.reverse(singleImageResult.topKProbs);
+    return singleImageResult;
+  }
+}
diff --git a/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicSingleImageResult.java b/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicSingleImageResult.java
new file mode 100644
index 0000000000000000000000000000000000000000..4af9a65c2f45c57b979bf9629e34f52bb0853a44
--- /dev/null
+++ b/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicSingleImageResult.java
@@ -0,0 +1,54 @@
+/*Copyright 2018 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+package org.tensorflow.ovic;
+
+import java.util.ArrayList;
+
+/** Result class for inference run on a single image. */
+public class OvicSingleImageResult {
+
+  /** Top K classes and probabilities. */
+  public ArrayList<String> topKClasses;
+  public ArrayList<Float> topKProbs;
+  public ArrayList<Integer> topKIndices;
+
+  /** Latency (ms). */
+  public Long latency;
+
+  OvicSingleImageResult() {
+    topKClasses = new ArrayList<>();
+    topKProbs = new ArrayList<>();
+    topKIndices = new ArrayList<>();
+    latency = -1L;
+  }
+
+  @Override
+  public String toString() {
+    String textToShow = latency + "ms";
+    for (int k = 0; k < topKProbs.size(); ++k) {
+      textToShow +=
+          "\nPrediction ["
+              + k
+              + "] = Class "
+              + Integer.toString(topKIndices.get(k))
+              + " ("
+              + topKClasses.get(k)
+              + ") : "
+              + Float.toString(topKProbs.get(k));
+    }
+    return textToShow;
+  }
+
+}
diff --git a/tensorflow/contrib/lite/java/ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java b/tensorflow/contrib/lite/java/ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java
new file mode 100644
index 0000000000000000000000000000000000000000..4fd23a99d25d715530cf36f398d949f7e70598de
--- /dev/null
+++ b/tensorflow/contrib/lite/java/ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java
@@ -0,0 +1,176 @@
+/*Copyright 2018 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+package org.tensorflow.ovic;
+
+import static com.google.common.truth.Truth.assertThat;
+import static org.junit.Assert.fail;
+
+import java.awt.image.BufferedImage;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.file.Paths;
+import javax.imageio.ImageIO;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+/** Unit tests for {@link org.tensorflow.ovic.OvicClassifier}. */
+@RunWith(JUnit4.class)
+public final class OvicClassifierTest {
+
+  private OvicClassifier classifier;
+  private InputStream labelsInputStream = null;
+  private MappedByteBuffer quantizedModel = null;
+  private MappedByteBuffer floatModel = null;
+  private MappedByteBuffer lowResModel = null;
+  private ByteBuffer testImage = null;
+  private ByteBuffer lowResTestImage = null;
+  private OvicSingleImageResult testResult = null;
+  private static final String LABELS_PATH = "testdata/labels.txt";
+  private static final String QUANTIZED_MODEL_PATH = "testdata/quantized_model.lite";
+  private static final String LOW_RES_MODEL_PATH = "testdata/low_res_model.lite";
+  private static final String FLOAT_MODEL_PATH = "testdata/float_model.lite";
+  private static final String TEST_IMAGE_PATH = "testdata/test_image_224.jpg";
+  private static final String TEST_LOW_RES_IMAGE_PATH = "testdata/test_image_128.jpg";
+  private static final int TEST_IMAGE_GROUNDTRUTH = 653; // "military uniform"
+
+  @Before
+  public void setUp() {
+    try {
+      File labelsfile = new File(getTestDir(LABELS_PATH));
+      labelsInputStream = new FileInputStream(labelsfile);
+      quantizedModel = loadModelFile(getTestDir(QUANTIZED_MODEL_PATH));
+      floatModel = loadModelFile(getTestDir(FLOAT_MODEL_PATH));
+      lowResModel = loadModelFile(getTestDir(LOW_RES_MODEL_PATH));
+      File imageFile = new File(getTestDir(TEST_IMAGE_PATH));
+      BufferedImage img = ImageIO.read(imageFile);
+      testImage = toByteBuffer(img);
+      // Low res image and models.
+      imageFile = new File(getTestDir(TEST_LOW_RES_IMAGE_PATH));
+      img = ImageIO.read(imageFile);
+      lowResTestImage = toByteBuffer(img);
+    } catch (IOException e) {
+      System.out.print(e.getMessage());
+    }
+    System.out.println("Successful setup");
+  }
+
+  private static String getTestDir(String testfile) throws IOException {
+    return Paths.get("third_party/tensorflow/contrib/lite/java/ovic/src/", testfile).toString();
+  }
+
+  @Test
+  public void ovicClassifier_quantizedModelCreateSuccess() throws Exception {
+    classifier = new OvicClassifier(labelsInputStream, quantizedModel);
+    assertThat(classifier != null).isTrue();
+  }
+
+  @Test
+  public void ovicClassifier_floatModelCreateSuccess() throws Exception {
+    classifier = new OvicClassifier(labelsInputStream, floatModel);
+    assertThat(classifier != null).isTrue();
+  }
+
+  @Test
+  public void ovicClassifier_quantizedModelClassifySuccess() throws Exception {
+    classifier = new OvicClassifier(labelsInputStream, quantizedModel);
+    testResult = classifier.classifyByteBuffer(testImage);
+    assertCorrectTopK(testResult);
+  }
+
+  @Test
+  public void ovicClassifier_floatModelClassifySuccess() throws Exception {
+    classifier = new OvicClassifier(labelsInputStream, floatModel);
+    testResult = classifier.classifyByteBuffer(testImage);
+    assertCorrectTopK(testResult);
+  }
+
+  @Test
+  public void ovicClassifier_lowResModelClassifySuccess() throws Exception {
+    classifier = new OvicClassifier(labelsInputStream, lowResModel);
+    testResult = classifier.classifyByteBuffer(lowResTestImage);
+    assertCorrectTopK(testResult);
+  }
+
+  @Test
+  public void ovicClassifier_latencyNotNull() throws Exception {
+    classifier = new OvicClassifier(labelsInputStream, floatModel);
+    testResult = classifier.classifyByteBuffer(testImage);
+    assertThat(testResult.latency != null).isTrue();
+  }
+
+  @Test
+  public void ovicClassifier_mismatchedInputResolutionFails() throws Exception {
+    classifier = new OvicClassifier(labelsInputStream, lowResModel);
+    int[] inputDims = classifier.getInputDims();
+    assertThat((inputDims[1] == 128) && (inputDims[2] == 128)).isTrue();
+    try {
+      testResult = classifier.classifyByteBuffer(testImage);
+      fail();
+    } catch (RuntimeException e) {
+      assertThat(e)
+          .hasMessageThat()
+          .contains(
+              "Failed to get input dimensions. 0-th input should have 49152 bytes, "
+                  + "but found 150528 bytes.");
+    }
+  }
+
+  private static ByteBuffer toByteBuffer(BufferedImage image) {
+    ByteBuffer imgData = ByteBuffer.allocateDirect(
+        image.getHeight() * image.getWidth() * 3);
+    imgData.order(ByteOrder.nativeOrder());
+    for (int y = 0; y < image.getHeight(); y++) {
+      for (int x = 0; x < image.getWidth(); x++) {
+        int val = image.getRGB(x, y);
+        imgData.put((byte) ((val >> 16) & 0xFF));
+        imgData.put((byte) ((val >> 8) & 0xFF));
+        imgData.put((byte) (val & 0xFF));
+      }
+    }
+    return imgData;
+  }
+
+  private static void assertCorrectTopK(OvicSingleImageResult testResult) {
+    assertThat(testResult.topKClasses.size() > 0).isTrue();
+    Boolean topKAccurate = false;
+    // Assert that the correct class is in the top K.
+    for (int i = 0; i < testResult.topKIndices.size(); i++) {
+      if (testResult.topKIndices.get(i) == TEST_IMAGE_GROUNDTRUTH) {
+        topKAccurate = true;
+        break;
+      }
+    }
+    System.out.println(testResult.toString());
+    System.out.flush();
+    assertThat(topKAccurate).isTrue();
+  }
+
+  private static MappedByteBuffer loadModelFile(String modelFilePath) throws IOException {
+    File modelfile = new File(modelFilePath);
+    FileInputStream inputStream = new FileInputStream(modelfile);
+    FileChannel fileChannel = inputStream.getChannel();
+    long startOffset = 0L;
+    long declaredLength = fileChannel.size();
+    return fileChannel.map(FileChannel.MapMode.READ_ONLY, startOffset, declaredLength);
+  }
+}
diff --git a/tensorflow/contrib/lite/java/ovic/src/testdata/labels.txt b/tensorflow/contrib/lite/java/ovic/src/testdata/labels.txt
new file mode 100644
index 0000000000000000000000000000000000000000..fe811239d8e2989de19fecabb1ebb0c9dddac514
--- /dev/null
+++ b/tensorflow/contrib/lite/java/ovic/src/testdata/labels.txt
@@ -0,0 +1,1001 @@
+background
+tench
+goldfish
+great white shark
+tiger shark
+hammerhead
+electric ray
+stingray
+cock
+hen
+ostrich
+brambling
+goldfinch
+house finch
+junco
+indigo bunting
+robin
+bulbul
+jay
+magpie
+chickadee
+water ouzel
+kite
+bald eagle
+vulture
+great grey owl
+European fire salamander
+common newt
+eft
+spotted salamander
+axolotl
+bullfrog
+tree frog
+tailed frog
+loggerhead
+leatherback turtle
+mud turtle
+terrapin
+box turtle
+banded gecko
+common iguana
+American chameleon
+whiptail
+agama
+frilled lizard
+alligator lizard
+Gila monster
+green lizard
+African chameleon
+Komodo dragon
+African crocodile
+American alligator
+triceratops
+thunder snake
+ringneck snake
+hognose snake
+green snake
+king snake
+garter snake
+water snake
+vine snake
+night snake
+boa constrictor
+rock python
+Indian cobra
+green mamba
+sea snake
+horned viper
+diamondback
+sidewinder
+trilobite
+harvestman
+scorpion
+black and gold garden spider
+barn spider
+garden spider
+black widow
+tarantula
+wolf spider
+tick
+centipede
+black grouse
+ptarmigan
+ruffed grouse
+prairie chicken
+peacock
+quail
+partridge
+African grey
+macaw
+sulphur-crested cockatoo
+lorikeet
+coucal
+bee eater
+hornbill
+hummingbird
+jacamar
+toucan
+drake
+red-breasted merganser
+goose
+black swan
+tusker
+echidna
+platypus
+wallaby
+koala
+wombat
+jellyfish
+sea anemone
+brain coral
+flatworm
+nematode
+conch
+snail
+slug
+sea slug
+chiton
+chambered nautilus
+Dungeness crab
+rock crab
+fiddler crab
+king crab
+American lobster
+spiny lobster
+crayfish
+hermit crab
+isopod
+white stork
+black stork
+spoonbill
+flamingo
+little blue heron
+American egret
+bittern
+crane
+limpkin
+European gallinule
+American coot
+bustard
+ruddy turnstone
+red-backed sandpiper
+redshank
+dowitcher
+oystercatcher
+pelican
+king penguin
+albatross
+grey whale
+killer whale
+dugong
+sea lion
+Chihuahua
+Japanese spaniel
+Maltese dog
+Pekinese
+Shih-Tzu
+Blenheim spaniel
+papillon
+toy terrier
+Rhodesian ridgeback
+Afghan hound
+basset
+beagle
+bloodhound
+bluetick
+black-and-tan coonhound
+Walker hound
+English foxhound
+redbone
+borzoi
+Irish wolfhound
+Italian greyhound
+whippet
+Ibizan hound
+Norwegian elkhound
+otterhound
+Saluki
+Scottish deerhound
+Weimaraner
+Staffordshire bullterrier
+American Staffordshire terrier
+Bedlington terrier
+Border terrier
+Kerry blue terrier
+Irish terrier
+Norfolk terrier
+Norwich terrier
+Yorkshire terrier
+wire-haired fox terrier
+Lakeland terrier
+Sealyham terrier
+Airedale
+cairn
+Australian terrier
+Dandie Dinmont
+Boston bull
+miniature schnauzer
+giant schnauzer
+standard schnauzer
+Scotch terrier
+Tibetan terrier
+silky terrier
+soft-coated wheaten terrier
+West Highland white terrier
+Lhasa
+flat-coated retriever
+curly-coated retriever
+golden retriever
+Labrador retriever
+Chesapeake Bay retriever
+German short-haired pointer
+vizsla
+English setter
+Irish setter
+Gordon setter
+Brittany spaniel
+clumber
+English springer
+Welsh springer spaniel
+cocker spaniel
+Sussex spaniel
+Irish water spaniel
+kuvasz
+schipperke
+groenendael
+malinois
+briard
+kelpie
+komondor
+Old English sheepdog
+Shetland sheepdog
+collie
+Border collie
+Bouvier des Flandres
+Rottweiler
+German shepherd
+Doberman
+miniature pinscher
+Greater Swiss Mountain dog
+Bernese mountain dog
+Appenzeller
+EntleBucher
+boxer
+bull mastiff
+Tibetan mastiff
+French bulldog
+Great Dane
+Saint Bernard
+Eskimo dog
+malamute
+Siberian husky
+dalmatian
+affenpinscher
+basenji
+pug
+Leonberg
+Newfoundland
+Great Pyrenees
+Samoyed
+Pomeranian
+chow
+keeshond
+Brabancon griffon
+Pembroke
+Cardigan
+toy poodle
+miniature poodle
+standard poodle
+Mexican hairless
+timber wolf
+white wolf
+red wolf
+coyote
+dingo
+dhole
+African hunting dog
+hyena
+red fox
+kit fox
+Arctic fox
+grey fox
+tabby
+tiger cat
+Persian cat
+Siamese cat
+Egyptian cat
+cougar
+lynx
+leopard
+snow leopard
+jaguar
+lion
+tiger
+cheetah
+brown bear
+American black bear
+ice bear
+sloth bear
+mongoose
+meerkat
+tiger beetle
+ladybug
+ground beetle
+long-horned beetle
+leaf beetle
+dung beetle
+rhinoceros beetle
+weevil
+fly
+bee
+ant
+grasshopper
+cricket
+walking stick
+cockroach
+mantis
+cicada
+leafhopper
+lacewing
+dragonfly
+damselfly
+admiral
+ringlet
+monarch
+cabbage butterfly
+sulphur butterfly
+lycaenid
+starfish
+sea urchin
+sea cucumber
+wood rabbit
+hare
+Angora
+hamster
+porcupine
+fox squirrel
+marmot
+beaver
+guinea pig
+sorrel
+zebra
+hog
+wild boar
+warthog
+hippopotamus
+ox
+water buffalo
+bison
+ram
+bighorn
+ibex
+hartebeest
+impala
+gazelle
+Arabian camel
+llama
+weasel
+mink
+polecat
+black-footed ferret
+otter
+skunk
+badger
+armadillo
+three-toed sloth
+orangutan
+gorilla
+chimpanzee
+gibbon
+siamang
+guenon
+patas
+baboon
+macaque
+langur
+colobus
+proboscis monkey
+marmoset
+capuchin
+howler monkey
+titi
+spider monkey
+squirrel monkey
+Madagascar cat
+indri
+Indian elephant
+African elephant
+lesser panda
+giant panda
+barracouta
+eel
+coho
+rock beauty
+anemone fish
+sturgeon
+gar
+lionfish
+puffer
+abacus
+abaya
+academic gown
+accordion
+acoustic guitar
+aircraft carrier
+airliner
+airship
+altar
+ambulance
+amphibian
+analog clock
+apiary
+apron
+ashcan
+assault rifle
+backpack
+bakery
+balance beam
+balloon
+ballpoint
+Band Aid
+banjo
+bannister
+barbell
+barber chair
+barbershop
+barn
+barometer
+barrel
+barrow
+baseball
+basketball
+bassinet
+bassoon
+bathing cap
+bath towel
+bathtub
+beach wagon
+beacon
+beaker
+bearskin
+beer bottle
+beer glass
+bell cote
+bib
+bicycle-built-for-two
+bikini
+binder
+binoculars
+birdhouse
+boathouse
+bobsled
+bolo tie
+bonnet
+bookcase
+bookshop
+bottlecap
+bow
+bow tie
+brass
+brassiere
+breakwater
+breastplate
+broom
+bucket
+buckle
+bulletproof vest
+bullet train
+butcher shop
+cab
+caldron
+candle
+cannon
+canoe
+can opener
+cardigan
+car mirror
+carousel
+carpenter's kit
+carton
+car wheel
+cash machine
+cassette
+cassette player
+castle
+catamaran
+CD player
+cello
+cellular telephone
+chain
+chainlink fence
+chain mail
+chain saw
+chest
+chiffonier
+chime
+china cabinet
+Christmas stocking
+church
+cinema
+cleaver
+cliff dwelling
+cloak
+clog
+cocktail shaker
+coffee mug
+coffeepot
+coil
+combination lock
+computer keyboard
+confectionery
+container ship
+convertible
+corkscrew
+cornet
+cowboy boot
+cowboy hat
+cradle
+crane
+crash helmet
+crate
+crib
+Crock Pot
+croquet ball
+crutch
+cuirass
+dam
+desk
+desktop computer
+dial telephone
+diaper
+digital clock
+digital watch
+dining table
+dishrag
+dishwasher
+disk brake
+dock
+dogsled
+dome
+doormat
+drilling platform
+drum
+drumstick
+dumbbell
+Dutch oven
+electric fan
+electric guitar
+electric locomotive
+entertainment center
+envelope
+espresso maker
+face powder
+feather boa
+file
+fireboat
+fire engine
+fire screen
+flagpole
+flute
+folding chair
+football helmet
+forklift
+fountain
+fountain pen
+four-poster
+freight car
+French horn
+frying pan
+fur coat
+garbage truck
+gasmask
+gas pump
+goblet
+go-kart
+golf ball
+golfcart
+gondola
+gong
+gown
+grand piano
+greenhouse
+grille
+grocery store
+guillotine
+hair slide
+hair spray
+half track
+hammer
+hamper
+hand blower
+hand-held computer
+handkerchief
+hard disc
+harmonica
+harp
+harvester
+hatchet
+holster
+home theater
+honeycomb
+hook
+hoopskirt
+horizontal bar
+horse cart
+hourglass
+iPod
+iron
+jack-o'-lantern
+jean
+jeep
+jersey
+jigsaw puzzle
+jinrikisha
+joystick
+kimono
+knee pad
+knot
+lab coat
+ladle
+lampshade
+laptop
+lawn mower
+lens cap
+letter opener
+library
+lifeboat
+lighter
+limousine
+liner
+lipstick
+Loafer
+lotion
+loudspeaker
+loupe
+lumbermill
+magnetic compass
+mailbag
+mailbox
+maillot
+maillot
+manhole cover
+maraca
+marimba
+mask
+matchstick
+maypole
+maze
+measuring cup
+medicine chest
+megalith
+microphone
+microwave
+military uniform
+milk can
+minibus
+miniskirt
+minivan
+missile
+mitten
+mixing bowl
+mobile home
+Model T
+modem
+monastery
+monitor
+moped
+mortar
+mortarboard
+mosque
+mosquito net
+motor scooter
+mountain bike
+mountain tent
+mouse
+mousetrap
+moving van
+muzzle
+nail
+neck brace
+necklace
+nipple
+notebook
+obelisk
+oboe
+ocarina
+odometer
+oil filter
+organ
+oscilloscope
+overskirt
+oxcart
+oxygen mask
+packet
+paddle
+paddlewheel
+padlock
+paintbrush
+pajama
+palace
+panpipe
+paper towel
+parachute
+parallel bars
+park bench
+parking meter
+passenger car
+patio
+pay-phone
+pedestal
+pencil box
+pencil sharpener
+perfume
+Petri dish
+photocopier
+pick
+pickelhaube
+picket fence
+pickup
+pier
+piggy bank
+pill bottle
+pillow
+ping-pong ball
+pinwheel
+pirate
+pitcher
+plane
+planetarium
+plastic bag
+plate rack
+plow
+plunger
+Polaroid camera
+pole
+police van
+poncho
+pool table
+pop bottle
+pot
+potter's wheel
+power drill
+prayer rug
+printer
+prison
+projectile
+projector
+puck
+punching bag
+purse
+quill
+quilt
+racer
+racket
+radiator
+radio
+radio telescope
+rain barrel
+recreational vehicle
+reel
+reflex camera
+refrigerator
+remote control
+restaurant
+revolver
+rifle
+rocking chair
+rotisserie
+rubber eraser
+rugby ball
+rule
+running shoe
+safe
+safety pin
+saltshaker
+sandal
+sarong
+sax
+scabbard
+scale
+school bus
+schooner
+scoreboard
+screen
+screw
+screwdriver
+seat belt
+sewing machine
+shield
+shoe shop
+shoji
+shopping basket
+shopping cart
+shovel
+shower cap
+shower curtain
+ski
+ski mask
+sleeping bag
+slide rule
+sliding door
+slot
+snorkel
+snowmobile
+snowplow
+soap dispenser
+soccer ball
+sock
+solar dish
+sombrero
+soup bowl
+space bar
+space heater
+space shuttle
+spatula
+speedboat
+spider web
+spindle
+sports car
+spotlight
+stage
+steam locomotive
+steel arch bridge
+steel drum
+stethoscope
+stole
+stone wall
+stopwatch
+stove
+strainer
+streetcar
+stretcher
+studio couch
+stupa
+submarine
+suit
+sundial
+sunglass
+sunglasses
+sunscreen
+suspension bridge
+swab
+sweatshirt
+swimming trunks
+swing
+switch
+syringe
+table lamp
+tank
+tape player
+teapot
+teddy
+television
+tennis ball
+thatch
+theater curtain
+thimble
+thresher
+throne
+tile roof
+toaster
+tobacco shop
+toilet seat
+torch
+totem pole
+tow truck
+toyshop
+tractor
+trailer truck
+tray
+trench coat
+tricycle
+trimaran
+tripod
+triumphal arch
+trolleybus
+trombone
+tub
+turnstile
+typewriter keyboard
+umbrella
+unicycle
+upright
+vacuum
+vase
+vault
+velvet
+vending machine
+vestment
+viaduct
+violin
+volleyball
+waffle iron
+wall clock
+wallet
+wardrobe
+warplane
+washbasin
+washer
+water bottle
+water jug
+water tower
+whiskey jug
+whistle
+wig
+window screen
+window shade
+Windsor tie
+wine bottle
+wing
+wok
+wooden spoon
+wool
+worm fence
+wreck
+yawl
+yurt
+web site
+comic book
+crossword puzzle
+street sign
+traffic light
+book jacket
+menu
+plate
+guacamole
+consomme
+hot pot
+trifle
+ice cream
+ice lolly
+French loaf
+bagel
+pretzel
+cheeseburger
+hotdog
+mashed potato
+head cabbage
+broccoli
+cauliflower
+zucchini
+spaghetti squash
+acorn squash
+butternut squash
+cucumber
+artichoke
+bell pepper
+cardoon
+mushroom
+Granny Smith
+strawberry
+orange
+lemon
+fig
+pineapple
+banana
+jackfruit
+custard apple
+pomegranate
+hay
+carbonara
+chocolate sauce
+dough
+meat loaf
+pizza
+potpie
+burrito
+red wine
+espresso
+cup
+eggnog
+alp
+bubble
+cliff
+coral reef
+geyser
+lakeside
+promontory
+sandbar
+seashore
+valley
+volcano
+ballplayer
+groom
+scuba diver
+rapeseed
+daisy
+yellow lady's slipper
+corn
+acorn
+hip
+buckeye
+coral fungus
+agaric
+gyromitra
+stinkhorn
+earthstar
+hen-of-the-woods
+bolete
+ear
+toilet tissue
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
index cc17b491f2a1a42d704c0262095ce817f3828764..a33959dca4954e3c2aaed987839bdec1ba079b5e 100644
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
@@ -67,6 +67,19 @@ public final class Interpreter implements AutoCloseable {
     wrapper = new NativeInterpreterWrapper(modelFile.getAbsolutePath());
   }
 
+  /**
+   * Initializes a {@code Interpreter} and specifies the number of threads used for inference.
+   *
+   * @param modelFile: a file of a pre-trained TF Lite model
+   * @param numThreads: number of threads to use for inference
+   */
+  public Interpreter(@NonNull File modelFile, int numThreads) {
+    if (modelFile == null) {
+      return;
+    }
+    wrapper = new NativeInterpreterWrapper(modelFile.getAbsolutePath(), numThreads);
+  }
+
   /**
    * Initializes a {@code Interpreter} with a {@code MappedByteBuffer} to the model file.
    *
@@ -77,6 +90,17 @@ public final class Interpreter implements AutoCloseable {
     wrapper = new NativeInterpreterWrapper(mappedByteBuffer);
   }
 
+  /**
+   * Initializes a {@code Interpreter} with a {@code MappedByteBuffer} to the model file and
+   * specifies the number of threads used for inference.
+   *
+   * <p>The {@code MappedByteBuffer} should remain unchanged after the construction of a {@code
+   * Interpreter}.
+   */
+  public Interpreter(@NonNull MappedByteBuffer mappedByteBuffer, int numThreads) {
+    wrapper = new NativeInterpreterWrapper(mappedByteBuffer, numThreads);
+  }
+
   /**
    * Runs model inference if the model takes only one input, and provides only one output.
    *
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
index 014636ffff3afd3a9579081690cde792647118a3..fc8187acfebf272a72ceb7844333bd589359cc2e 100644
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
@@ -32,9 +32,13 @@ import java.util.Map;
 final class NativeInterpreterWrapper implements AutoCloseable {
 
   NativeInterpreterWrapper(String modelPath) {
+    this(modelPath, /* numThreads= */ -1);
+  }
+
+  NativeInterpreterWrapper(String modelPath, int numThreads) {
     errorHandle = createErrorReporter(ERROR_BUFFER_SIZE);
     modelHandle = createModel(modelPath, errorHandle);
-    interpreterHandle = createInterpreter(modelHandle, errorHandle);
+    interpreterHandle = createInterpreter(modelHandle, errorHandle, numThreads);
     isMemoryAllocated = true;
   }
 
@@ -44,10 +48,19 @@ final class NativeInterpreterWrapper implements AutoCloseable {
    * NativeInterpreterWrapper}.
    */
   NativeInterpreterWrapper(MappedByteBuffer mappedByteBuffer) {
+    this(mappedByteBuffer, /* numThreads= */ -1);
+  }
+
+  /**
+   * Initializes a {@code NativeInterpreterWrapper} with a {@code MappedByteBuffer} and specifies
+   * the number of inference threads. The MappedByteBuffer should not be modified after the
+   * construction of a {@code NativeInterpreterWrapper}.
+   */
+  NativeInterpreterWrapper(MappedByteBuffer mappedByteBuffer, int numThreads) {
     modelByteBuffer = mappedByteBuffer;
     errorHandle = createErrorReporter(ERROR_BUFFER_SIZE);
     modelHandle = createModelWithBuffer(modelByteBuffer, errorHandle);
-    interpreterHandle = createInterpreter(modelHandle, errorHandle);
+    interpreterHandle = createInterpreter(modelHandle, errorHandle, numThreads);
     isMemoryAllocated = true;
   }
 
@@ -314,7 +327,7 @@ final class NativeInterpreterWrapper implements AutoCloseable {
 
   private static native long createModelWithBuffer(MappedByteBuffer modelBuffer, long errorHandle);
 
-  private static native long createInterpreter(long modelHandle, long errorHandle);
+  private static native long createInterpreter(long modelHandle, long errorHandle, int numThreads);
 
   private static native void delete(long errorHandle, long modelHandle, long interpreterHandle);
 
diff --git a/tensorflow/contrib/lite/java/src/main/native/BUILD b/tensorflow/contrib/lite/java/src/main/native/BUILD
index 3571182ca92e959d54935cfdc76679ab69a8cfa9..4399ed202597082fba36c04a744bf6378e4539a2 100644
--- a/tensorflow/contrib/lite/java/src/main/native/BUILD
+++ b/tensorflow/contrib/lite/java/src/main/native/BUILD
@@ -95,15 +95,3 @@ exports_files(
         "version_script.lds",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
index 2870ffe8eb68610adf3bdd4d26218678e7404d2e..844226203bb02f4017b2f04da34ac81ac2b7a191 100644
--- a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
+++ b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
@@ -14,7 +14,6 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h"
-
 namespace {
 
 const int kByteBufferValue = 999;
@@ -324,6 +323,19 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createErrorReporter(
   return reinterpret_cast<jlong>(error_reporter);
 }
 
+// Verifies whether the model is a flatbuffer file.
+class JNIFlatBufferVerifier : public tflite::TfLiteVerifier {
+ public:
+  bool Verify(const char* data, int length,
+              tflite::ErrorReporter* reporter) override {
+    if (!VerifyModel(data, length)) {
+      reporter->Report("The model is not a valid Flatbuffer file");
+      return false;
+    }
+    return true;
+  }
+};
+
 JNIEXPORT jlong JNICALL
 Java_org_tensorflow_lite_NativeInterpreterWrapper_createModel(
     JNIEnv* env, jclass clazz, jstring model_file, jlong error_handle) {
@@ -332,17 +344,11 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createModel(
   if (error_reporter == nullptr) return 0;
   const char* path = env->GetStringUTFChars(model_file, nullptr);
 
-  {
-    tflite::FileCopyAllocation allocation(path, nullptr);
-    if (!VerifyModel(allocation.base(), allocation.bytes())) {
-      throwException(env, kIllegalArgumentException,
-                     "Contents of %s is not a valid flatbuffer model", path);
-      env->ReleaseStringUTFChars(model_file, path);
-      return 0;
-    }
-  }
+  std::unique_ptr<tflite::TfLiteVerifier> verifier;
+  verifier.reset(new JNIFlatBufferVerifier());
 
-  auto model = tflite::FlatBufferModel::BuildFromFile(path, error_reporter);
+  auto model = tflite::FlatBufferModel::VerifyAndBuildFromFile(
+      path, verifier.get(), error_reporter);
   if (!model) {
     throwException(env, kIllegalArgumentException,
                    "Contents of %s does not encode a valid TensorFlowLite "
@@ -384,7 +390,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createModelWithBuffer(
 
 JNIEXPORT jlong JNICALL
 Java_org_tensorflow_lite_NativeInterpreterWrapper_createInterpreter(
-    JNIEnv* env, jclass clazz, jlong model_handle, jlong error_handle) {
+    JNIEnv* env, jclass clazz, jlong model_handle, jlong error_handle,
+    jint num_threads) {
   tflite::FlatBufferModel* model = convertLongToModel(env, model_handle);
   if (model == nullptr) return 0;
   BufferErrorReporter* error_reporter =
@@ -392,8 +399,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createInterpreter(
   if (error_reporter == nullptr) return 0;
   auto resolver = ::tflite::CreateOpResolver();
   std::unique_ptr<tflite::Interpreter> interpreter;
-  TfLiteStatus status =
-      tflite::InterpreterBuilder(*model, *(resolver.get()))(&interpreter);
+  TfLiteStatus status = tflite::InterpreterBuilder(*model, *(resolver.get()))(
+      &interpreter, static_cast<int>(num_threads));
   if (status != kTfLiteOk) {
     throwException(env, kIllegalArgumentException,
                    "Cannot create interpreter: %s",
diff --git a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h
index d611ec7f38669341761068f571be6311c2f3cb6a..0e28a77feea41d72be126d6e60fffbe7ce374a76 100644
--- a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h
+++ b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h
@@ -99,11 +99,12 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createModelWithBuffer(
 /*
  *  Class:     org_tensorflow_lite_NativeInterpreterWrapper
  *  Method:
- *  Signature: (JJ)J
+ *  Signature: (JJI)J
  */
 JNIEXPORT jlong JNICALL
 Java_org_tensorflow_lite_NativeInterpreterWrapper_createInterpreter(
-    JNIEnv* env, jclass clazz, jlong model_handle, jlong error_handle);
+    JNIEnv* env, jclass clazz, jlong model_handle, jlong error_handle,
+    jint num_threads);
 
 /*
  *  Class:     org_tensorflow_lite_NativeInterpreterWrapper
diff --git a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java
index d6b4e9f438ab19a1c51b8fcc6047f75075a34693..dbe45e5a05b8227b441de7ca6747f61d010ae210 100644
--- a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java
+++ b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java
@@ -47,6 +47,9 @@ public final class NativeInterpreterWrapperTest {
   private static final String MODEL_WITH_CUSTOM_OP_PATH =
       "tensorflow/contrib/lite/java/src/testdata/with_custom_op.lite";
 
+  private static final String NONEXISTING_MODEL_PATH =
+      "tensorflow/contrib/lite/java/src/testdata/nonexisting_model.bin";
+
   @Test
   public void testConstructor() {
     NativeInterpreterWrapper wrapper = new NativeInterpreterWrapper(FLOAT_MODEL_PATH);
@@ -60,7 +63,18 @@ public final class NativeInterpreterWrapperTest {
       NativeInterpreterWrapper wrapper = new NativeInterpreterWrapper(INVALID_MODEL_PATH);
       fail();
     } catch (IllegalArgumentException e) {
-      assertThat(e).hasMessageThat().contains("is not a valid flatbuffer model");
+      assertThat(e).hasMessageThat().contains("The model is not a valid Flatbuffer file");
+    }
+  }
+
+  @Test
+  public void testConstructorWithNonexistingModel() {
+    try {
+      NativeInterpreterWrapper wrapper = new NativeInterpreterWrapper(NONEXISTING_MODEL_PATH);
+      fail();
+    } catch (IllegalArgumentException e) {
+      assertThat(e).hasMessageThat().contains("The model is not a valid Flatbuffer file");
+      assertThat(e).hasMessageThat().contains("Could not open");
     }
   }
 
diff --git a/tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite/BUILD b/tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite/BUILD
index 2b4f37bc6cfe1dbc0c178a56b892f545e8ad4f3b..b524246d436858bbf506809a38cead2897f78d93 100644
--- a/tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite/BUILD
+++ b/tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite/BUILD
@@ -16,15 +16,3 @@ android_library(
         "//tensorflow/contrib/lite/java:tensorflowlite_java",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD
index b8ab6d96a08679e0ee8ba8938f0fedb5a9e6c6fe..df0f3cbeb0e99c4a7cb6a9c610ce660f06454744 100644
--- a/tensorflow/contrib/lite/kernels/BUILD
+++ b/tensorflow/contrib/lite/kernels/BUILD
@@ -135,6 +135,7 @@ cc_library(
     srcs = [
         "activations.cc",
         "add.cc",
+        "audio_spectrogram.cc",
         "basic_rnn.cc",
         "batch_to_space_nd.cc",
         "bidirectional_sequence_lstm.cc",
@@ -155,7 +156,9 @@ cc_library(
         "local_response_norm.cc",
         "lsh_projection.cc",
         "lstm.cc",
+        "maximum.cc",
         "mean.cc",
+        "mfcc.cc",
         "mul.cc",
         "pad.cc",
         "pooling.cc",
@@ -196,15 +199,42 @@ cc_library(
         "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite:string_util",
         "//tensorflow/contrib/lite/kernels:gemm_support",
+        "//tensorflow/contrib/lite/kernels/internal:audio_utils",
         "//tensorflow/contrib/lite/kernels/internal:kernel_utils",
         "//tensorflow/contrib/lite/kernels/internal:optimized",
         "//tensorflow/contrib/lite/kernels/internal:optimized_base",
         "//tensorflow/contrib/lite/kernels/internal:quantization_util",
         "//tensorflow/contrib/lite/kernels/internal:reference",
         "//tensorflow/contrib/lite/kernels/internal:reference_base",
-        "//tensorflow/contrib/lite/kernels/internal:round",
         "//tensorflow/contrib/lite/kernels/internal:tensor_utils",
         "@farmhash_archive//:farmhash",
+        "@flatbuffers",
+    ],
+)
+
+tf_cc_test(
+    name = "audio_spectrogram_test",
+    size = "small",
+    srcs = ["audio_spectrogram_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers",
+    ],
+)
+
+tf_cc_test(
+    name = "mfcc_test",
+    size = "small",
+    srcs = ["mfcc_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers",
     ],
 )
 
@@ -240,6 +270,38 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "div_test",
+    size = "small",
+    srcs = ["div_test.cc"],
+    tags = [
+        "tflite_not_portable_ios_arm64",
+        "tflite_not_portable_ios_x86_64",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "sub_test",
+    size = "small",
+    srcs = ["sub_test.cc"],
+    tags = [
+        "tflite_not_portable_ios_arm64",
+        "tflite_not_portable_ios_x86_64",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 tf_cc_test(
     name = "transpose_test",
     size = "small",
@@ -475,6 +537,18 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "maximum_test",
+    size = "small",
+    srcs = ["maximum_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 tf_cc_test(
     name = "mean_test",
     size = "small",
@@ -649,6 +723,7 @@ tf_cc_test(
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_absl//absl/memory",
         "@com_google_googletest//:gtest",
     ],
 )
@@ -849,16 +924,4 @@ tf_cc_test(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tflite_portable_test_suite()
diff --git a/tensorflow/contrib/lite/kernels/activations.cc b/tensorflow/contrib/lite/kernels/activations.cc
index 093761c43c1cb41ddb2245da13c963014b51271c..39a54c93962b33f3a787b3387d9a133119d0e80a 100644
--- a/tensorflow/contrib/lite/kernels/activations.cc
+++ b/tensorflow/contrib/lite/kernels/activations.cc
@@ -150,6 +150,34 @@ TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) {
                                TfLiteIntArrayCopy(input->dims));
 }
 
+TfLiteStatus PreluPrepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+  TfLiteTensor* input = GetInput(context, node, 0);
+  TfLiteTensor* output = GetOutput(context, node, 0);
+  TfLiteTensor* alpha = GetInput(context, node, 1);
+
+  output->type = input->type;
+
+  // Currently only Float32 is supported
+  // TODO(ycling): Support other data types.
+  TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
+  TF_LITE_ENSURE_EQ(context, alpha->type, kTfLiteFloat32);
+
+  // Currently, only support 4D `input` and 3D `alpha` with shape
+  // (1, 1, channels).
+  // TODO(impjdi): Support other cases where `alpha` is broadcastable
+  // to `input`.
+  TF_LITE_ENSURE_EQ(context, input->dims->size, 4);
+  TF_LITE_ENSURE_EQ(context, alpha->dims->size, 3);
+  TF_LITE_ENSURE_EQ(context, alpha->dims->data[0], 1);
+  TF_LITE_ENSURE_EQ(context, alpha->dims->data[1], 1);
+  TF_LITE_ENSURE_EQ(context, alpha->dims->data[2], input->dims->data[3]);
+
+  return context->ResizeTensor(context, output,
+                               TfLiteIntArrayCopy(input->dims));
+}
+
 TfLiteStatus ReluEval(TfLiteContext* context, TfLiteNode* node) {
   TfLiteTensor* input = GetInput(context, node, 0);
   TfLiteTensor* output = GetOutput(context, node, 0);
@@ -388,6 +416,35 @@ TfLiteStatus LogSoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
   }
 }
 
+TfLiteStatus PreluEval(TfLiteContext* context, TfLiteNode* node) {
+  TfLiteTensor* input = GetInput(context, node, 0);
+  TfLiteTensor* alpha = GetInput(context, node, 1);
+  TfLiteTensor* output = GetOutput(context, node, 0);
+
+  if (input->type != kTfLiteFloat32) {
+    context->ReportError(context, "Only float32 supported currently.");
+    return kTfLiteError;
+  }
+  TF_LITE_ENSURE_EQ(context, input->dims->size, 4);
+  const int batches = input->dims->data[0];
+  const int height = input->dims->data[1];
+  const int width = input->dims->data[2];
+  const int channels = input->dims->data[3];
+
+  TF_LITE_ENSURE_EQ(context, alpha->dims->size, 3);
+  TF_LITE_ENSURE_EQ(context, alpha->dims->data[0], 1);
+  TF_LITE_ENSURE_EQ(context, alpha->dims->data[1], 1);
+  TF_LITE_ENSURE_EQ(context, alpha->dims->data[2], channels);
+
+  const int n = batches * height * width * channels;
+  for (int i = 0; i < n; ++i) {
+    const float x = input->data.f[i];
+    output->data.f[i] = x >= 0.0f ? x : alpha->data.f[i % channels] * x;
+  }
+
+  return kTfLiteOk;
+}
+
 }  // namespace activations
 
 TfLiteRegistration* Register_RELU() {
@@ -439,6 +496,13 @@ TfLiteRegistration* Register_LOG_SOFTMAX() {
   return &r;
 }
 
+TfLiteRegistration* Register_PRELU() {
+  static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
+                                 activations::PreluPrepare,
+                                 activations::PreluEval};
+  return &r;
+}
+
 }  // namespace builtin
 }  // namespace ops
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/activations_test.cc b/tensorflow/contrib/lite/kernels/activations_test.cc
index b9a96e3f79677c5a94ade86a6b334abb4c265fa1..50a84edd475c8051a563cf8ed9fc03099829b786 100644
--- a/tensorflow/contrib/lite/kernels/activations_test.cc
+++ b/tensorflow/contrib/lite/kernels/activations_test.cc
@@ -383,6 +383,49 @@ TEST(FloatActivationsOpTest, LogSoftmax) {
                               })));
 }
 
+class PReluOpModel : public SingleOpModel {
+ public:
+  PReluOpModel(const TensorData& input, const TensorData& alpha) {
+    input_ = AddInput(input);
+    alpha_ = AddInput(alpha);
+    output_ = AddOutput(input);
+    SetBuiltinOp(BuiltinOperator_PRELU, BuiltinOptions_NONE, 0);
+    BuildInterpreter({GetShape(input_), GetShape(alpha_)});
+  }
+  void SetInput(std::initializer_list<float> data) {
+    PopulateTensor(input_, data);
+  }
+  void SetAlpha(std::initializer_list<float> data) {
+    PopulateTensor(alpha_, data);
+  }
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+ protected:
+  int input_;
+  int alpha_;
+  int output_;
+};
+
+TEST(FloatActivationsOpTest, PRelu) {
+  PReluOpModel m({TensorType_FLOAT32, {1, 2, 2, 3}},
+                 {TensorType_FLOAT32, {1, 1, 3}});
+
+  m.SetInput({
+      0.0f, 0.0f, 0.0f,     // Row 1, Column 1
+      1.0f, 1.0f, 1.0f,     // Row 1, Column 2
+      -1.0f, -1.0f, -1.0f,  // Row 2, Column 1
+      -2.0f, -2.0f, -2.0f,  // Row 1, Column 2
+  });
+  m.SetAlpha({0.0f, 1.0f, 2.0f});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+                                 0.0f, 0.0f, 0.0f,    // Row 1, Column 1
+                                 1.0f, 1.0f, 1.0f,    // Row 1, Column 2
+                                 0.0f, -1.0f, -2.0f,  // Row 2, Column 1
+                                 0.0f, -2.0f, -4.0f,  // Row 1, Column 2
+                             }));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/audio_spectrogram.cc b/tensorflow/contrib/lite/kernels/audio_spectrogram.cc
new file mode 100644
index 0000000000000000000000000000000000000000..602f3888c10b3790dc0328c817bdd83276544b25
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/audio_spectrogram.cc
@@ -0,0 +1,165 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/spectrogram.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+#include "flatbuffers/flexbuffers.h"
+
+namespace tflite {
+namespace ops {
+namespace custom {
+namespace audio_spectrogram {
+
+constexpr int kInputTensor = 0;
+constexpr int kOutputTensor = 0;
+
+enum KernelType {
+  kReference,
+};
+
+typedef struct {
+  int window_size;
+  int stride;
+  bool magnitude_squared;
+  int output_height;
+  internal::Spectrogram* spectrogram;
+} TfLiteAudioSpectrogramParams;
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  auto* data = new TfLiteAudioSpectrogramParams;
+
+  const uint8_t* buffer_t = reinterpret_cast<const uint8_t*>(buffer);
+
+  const flexbuffers::Map& m = flexbuffers::GetRoot(buffer_t, length).AsMap();
+  data->window_size = m["window_size"].AsInt64();
+  data->stride = m["stride"].AsInt64();
+  data->magnitude_squared = m["magnitude_squared"].AsBool();
+
+  data->spectrogram = new internal::Spectrogram;
+
+  return data;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  auto* params = reinterpret_cast<TfLiteAudioSpectrogramParams*>(buffer);
+  delete params->spectrogram;
+  delete params;
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  auto* params =
+      reinterpret_cast<TfLiteAudioSpectrogramParams*>(node->user_data);
+
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  TF_LITE_ENSURE_EQ(context, NumDimensions(input), 2);
+
+  TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32);
+  TF_LITE_ENSURE_EQ(context, input->type, output->type);
+
+  TF_LITE_ENSURE(context, params->spectrogram->Initialize(params->window_size,
+                                                          params->stride));
+  const int64_t sample_count = input->dims->data[0];
+  const int64_t length_minus_window = (sample_count - params->window_size);
+  if (length_minus_window < 0) {
+    params->output_height = 0;
+  } else {
+    params->output_height = 1 + (length_minus_window / params->stride);
+  }
+  TfLiteIntArray* output_size = TfLiteIntArrayCreate(3);
+  output_size->data[0] = input->dims->data[1];
+  output_size->data[1] = params->output_height;
+  output_size->data[2] = params->spectrogram->output_frequency_channels();
+
+  return context->ResizeTensor(context, output, output_size);
+}
+
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params =
+      reinterpret_cast<TfLiteAudioSpectrogramParams*>(node->user_data);
+
+  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  TF_LITE_ENSURE(context, params->spectrogram->Initialize(params->window_size,
+                                                          params->stride));
+
+  const float* input_data = GetTensorData<float>(input);
+
+  const int64_t sample_count = input->dims->data[0];
+  const int64_t channel_count = input->dims->data[1];
+
+  const int64_t output_width = params->spectrogram->output_frequency_channels();
+
+  float* output_flat = GetTensorData<float>(output);
+
+  std::vector<float> input_for_channel(sample_count);
+  for (int64_t channel = 0; channel < channel_count; ++channel) {
+    float* output_slice =
+        output_flat + (channel * params->output_height * output_width);
+    for (int i = 0; i < sample_count; ++i) {
+      input_for_channel[i] = input_data[i * channel_count + channel];
+    }
+    std::vector<std::vector<float>> spectrogram_output;
+    TF_LITE_ENSURE(context,
+                   params->spectrogram->ComputeSquaredMagnitudeSpectrogram(
+                       input_for_channel, &spectrogram_output));
+    TF_LITE_ENSURE_EQ(context, spectrogram_output.size(),
+                      params->output_height);
+    TF_LITE_ENSURE(context, spectrogram_output.empty() ||
+                                (spectrogram_output[0].size() == output_width));
+    for (int row_index = 0; row_index < params->output_height; ++row_index) {
+      const std::vector<float>& spectrogram_row = spectrogram_output[row_index];
+      TF_LITE_ENSURE_EQ(context, spectrogram_row.size(), output_width);
+      float* output_row = output_slice + (row_index * output_width);
+      if (params->magnitude_squared) {
+        for (int i = 0; i < output_width; ++i) {
+          output_row[i] = spectrogram_row[i];
+        }
+      } else {
+        for (int i = 0; i < output_width; ++i) {
+          output_row[i] = sqrtf(spectrogram_row[i]);
+        }
+      }
+    }
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace audio_spectrogram
+
+TfLiteRegistration* Register_AUDIO_SPECTROGRAM() {
+  static TfLiteRegistration r = {
+      audio_spectrogram::Init, audio_spectrogram::Free,
+      audio_spectrogram::Prepare,
+      audio_spectrogram::Eval<audio_spectrogram::kReference>};
+  return &r;
+}
+
+}  // namespace custom
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/audio_spectrogram_test.cc b/tensorflow/contrib/lite/kernels/audio_spectrogram_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8d460fdfc610ef9a867acd492ca0558fb6eab8c3
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/audio_spectrogram_test.cc
@@ -0,0 +1,122 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "flatbuffers/flexbuffers.h"
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace ops {
+namespace custom {
+
+TfLiteRegistration* Register_AUDIO_SPECTROGRAM();
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
+
+class BaseAudioSpectrogramOpModel : public SingleOpModel {
+ public:
+  BaseAudioSpectrogramOpModel(const TensorData& input1,
+                              const TensorData& output, int window_size,
+                              int stride, bool magnitude_squared) {
+    input1_ = AddInput(input1);
+    output_ = AddOutput(output);
+
+    flexbuffers::Builder fbb;
+    fbb.Map([&]() {
+      fbb.Int("window_size", window_size);
+      fbb.Int("stride", stride);
+      fbb.Bool("magnitude_squared", magnitude_squared);
+    });
+    fbb.Finish();
+    SetCustomOp("AudioSpectrogram", fbb.GetBuffer(),
+                Register_AUDIO_SPECTROGRAM);
+    BuildInterpreter({GetShape(input1_)});
+  }
+
+  int input1() { return input1_; }
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ protected:
+  int input1_;
+  int output_;
+};
+
+TEST(BaseAudioSpectrogramOpModel, NonSquaredTest) {
+  BaseAudioSpectrogramOpModel m({TensorType_FLOAT32, {8, 1}},
+                                {TensorType_FLOAT32, {}}, 8, 1, false);
+  m.PopulateTensor<float>(m.input1(),
+                          {-1.0f, 0.0f, 1.0f, 0.0f, -1.0f, 0.0f, 1.0f, 0.0f});
+
+  m.Invoke();
+
+  std::vector<int> output_shape = m.GetOutputShape();
+  EXPECT_EQ(3, output_shape.size());
+  EXPECT_THAT(output_shape, ElementsAre(1, 1, 5));
+
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(
+                                 {0.0f, 1.0f, 2.0f, 1.0f, 0.0f}, 1e-3)));
+}
+
+TEST(SpectrogramOpTest, SquaredTest) {
+  BaseAudioSpectrogramOpModel m({TensorType_FLOAT32, {8, 1}},
+                                {TensorType_FLOAT32, {}}, 8, 1, true);
+  m.PopulateTensor<float>(m.input1(),
+                          {-1.0f, 0.0f, 1.0f, 0.0f, -1.0f, 0.0f, 1.0f, 0.0f});
+
+  m.Invoke();
+
+  std::vector<int> output_shape = m.GetOutputShape();
+  EXPECT_EQ(3, output_shape.size());
+  EXPECT_THAT(output_shape, ElementsAre(1, 1, 5));
+
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(
+                                 {0.f, 1.f, 4.f, 1.f, 0.f}, 1e-3)));
+}
+
+TEST(SpectrogramOpTest, StrideTest) {
+  BaseAudioSpectrogramOpModel m({TensorType_FLOAT32, {10, 1}},
+                                {TensorType_FLOAT32, {}}, 8, 2, true);
+  m.PopulateTensor<float>(m.input1(), {-1.0f, 0.0f, 1.0f, 0.0f, -1.0f, 0.0f,
+                                       1.0f, 0.0f, 1.0f, 0.0f});
+
+  m.Invoke();
+
+  std::vector<int> output_shape = m.GetOutputShape();
+  EXPECT_THAT(output_shape, ElementsAre(1, 2, 5));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(
+                                 {0, 1, 4, 1, 0, 1, 2, 1, 2, 1}, 1e-3)));
+}
+
+}  // namespace
+}  // namespace custom
+}  // namespace ops
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/cast.cc b/tensorflow/contrib/lite/kernels/cast.cc
index 19942de7bc0c083f192a4b337b224b778d991140..17ef2c572ebbfa54ba6856f7eebbcd6fd9e63868 100644
--- a/tensorflow/contrib/lite/kernels/cast.cc
+++ b/tensorflow/contrib/lite/kernels/cast.cc
@@ -34,6 +34,15 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
   TfLiteTensor* input = GetInput(context, node, kInputTensor);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  // TODO(ahentz): these two checks would make the new implementation
+  // incompatible with some existing models, where params is not specified. It
+  // is OK not to have them because toco would have set input and output types
+  // to match the parameters.
+  // auto* params = reinterpret_cast<TfLiteCastParams*>(node->builtin_data);
+  // TF_LITE_ENSURE_EQ(context, input->type, params->in_data_type);
+  // TF_LITE_ENSURE_EQ(context, output->type, params->out_data_type);
+
   return context->ResizeTensor(context, output,
                                TfLiteIntArrayCopy(input->dims));
 }
diff --git a/tensorflow/contrib/lite/kernels/conv.cc b/tensorflow/contrib/lite/kernels/conv.cc
index e0cd12f1b4042c3d8b28159e288166bf1437e6ef..18ff33bf9f55ac1d25bb3392e714686c5305c2b8 100644
--- a/tensorflow/contrib/lite/kernels/conv.cc
+++ b/tensorflow/contrib/lite/kernels/conv.cc
@@ -89,9 +89,6 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) {
   auto* data = new OpData;
   gemm_support::IncrementUsageCounter(context);
   eigen_support::IncrementUsageCounter(context);
-
-  data->run_multithreaded_kernel = context->recommended_num_threads != 1;
-
   return data;
 }
 
@@ -176,6 +173,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
   OpData* data = reinterpret_cast<OpData*>(node->user_data);
 
+  data->run_multithreaded_kernel = context->recommended_num_threads != 1;
+
   TF_LITE_ENSURE_STATUS(AllocateTemporaryTensorsIfRequired(context, node));
 
   bool hasBias = node->inputs->size == 3;
diff --git a/tensorflow/contrib/lite/kernels/div.cc b/tensorflow/contrib/lite/kernels/div.cc
index 44bd0dc85d50c98ec6b6888e05064a8f2e2731c0..6dd243ad62ece3e094529d923ce80d1d4a0c19ca 100644
--- a/tensorflow/contrib/lite/kernels/div.cc
+++ b/tensorflow/contrib/lite/kernels/div.cc
@@ -37,7 +37,23 @@ constexpr int kInputTensor1 = 0;
 constexpr int kInputTensor2 = 1;
 constexpr int kOutputTensor = 0;
 
+struct OpData {
+  bool requires_broadcast;
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  auto* data = new OpData;
+  data->requires_broadcast = false;
+  return data;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<OpData*>(buffer);
+}
+
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
@@ -45,35 +61,47 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
-  TF_LITE_ENSURE_EQ(context, NumDimensions(input1), NumDimensions(input2));
-  for (int i = 0; i < NumDimensions(input1); ++i) {
-    TF_LITE_ENSURE_EQ(context, SizeOfDimension(input1, i),
-                      SizeOfDimension(input2, i));
-  }
+  TF_LITE_ENSURE_EQ(context, input1->type, input2->type);
+  output->type = input2->type;
+
+  data->requires_broadcast = !HaveSameShapes(input1, input2);
 
-  TF_LITE_ENSURE_EQ(context, input1->type, output->type);
-  TF_LITE_ENSURE_EQ(context, input2->type, output->type);
+  TfLiteIntArray* output_size = nullptr;
+  if (data->requires_broadcast) {
+    TF_LITE_ENSURE_OK(context, CalculateShapeForBroadcast(
+                                   context, input1, input2, &output_size));
+  } else {
+    output_size = TfLiteIntArrayCopy(input1->dims);
+  }
 
-  TfLiteIntArray* output_size = TfLiteIntArrayCopy(input1->dims);
   return context->ResizeTensor(context, output, output_size);
 }
 
 template <KernelType kernel_type>
-void EvalDivFloat(TfLiteContext* context, TfLiteNode* node,
-                  TfLiteDivParams* params, TfLiteTensor* input1,
-                  TfLiteTensor* input2, TfLiteTensor* output) {
+void EvalFloat(TfLiteContext* context, TfLiteNode* node,
+               TfLiteDivParams* params, const OpData* data,
+               TfLiteTensor* input1, TfLiteTensor* input2,
+               TfLiteTensor* output) {
   float output_activation_min, output_activation_max;
   CalculateActivationRangeFloat(params->activation, &output_activation_min,
                                 &output_activation_max);
-#define TF_LITE_DIV(type)                                        \
-  type::Div(GetTensorData<float>(input1), GetTensorDims(input1), \
-            GetTensorData<float>(input2), GetTensorDims(input2), \
-            output_activation_min, output_activation_max,        \
-            GetTensorData<float>(output), GetTensorDims(output))
+#define TF_LITE_DIV(type, opname)                                   \
+  type::opname(GetTensorData<float>(input1), GetTensorDims(input1), \
+               GetTensorData<float>(input2), GetTensorDims(input2), \
+               output_activation_min, output_activation_max,        \
+               GetTensorData<float>(output), GetTensorDims(output))
   if (kernel_type == kReference) {
-    TF_LITE_DIV(reference_ops);
+    if (data->requires_broadcast) {
+      TF_LITE_DIV(reference_ops, BroadcastDiv);
+    } else {
+      TF_LITE_DIV(reference_ops, Div);
+    }
   } else {
-    TF_LITE_DIV(optimized_ops);
+    if (data->requires_broadcast) {
+      TF_LITE_DIV(optimized_ops, BroadcastDiv);
+    } else {
+      TF_LITE_DIV(optimized_ops, Div);
+    }
   }
 #undef TF_LITE_DIV
 }
@@ -81,13 +109,14 @@ void EvalDivFloat(TfLiteContext* context, TfLiteNode* node,
 template <KernelType kernel_type>
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   auto* params = reinterpret_cast<TfLiteDivParams*>(node->builtin_data);
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
 
   TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
   TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   if (output->type == kTfLiteFloat32) {
-    EvalDivFloat<kernel_type>(context, node, params, input1, input2, output);
+    EvalFloat<kernel_type>(context, node, params, data, input1, input2, output);
   } else {
     context->ReportError(context, "Inputs and outputs not all float types.");
     return kTfLiteError;
@@ -99,19 +128,19 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 }  // namespace div
 
 TfLiteRegistration* Register_DIV_REF() {
-  static TfLiteRegistration r = {nullptr, nullptr, div::Prepare,
+  static TfLiteRegistration r = {div::Init, div::Free, div::Prepare,
                                  div::Eval<div::kReference>};
   return &r;
 }
 
 TfLiteRegistration* Register_DIV_GENERIC_OPT() {
-  static TfLiteRegistration r = {nullptr, nullptr, div::Prepare,
+  static TfLiteRegistration r = {div::Init, div::Free, div::Prepare,
                                  div::Eval<div::kGenericOptimized>};
   return &r;
 }
 
 TfLiteRegistration* Register_DIV_NEON_OPT() {
-  static TfLiteRegistration r = {nullptr, nullptr, div::Prepare,
+  static TfLiteRegistration r = {div::Init, div::Free, div::Prepare,
                                  div::Eval<div::kNeonOptimized>};
   return &r;
 }
diff --git a/tensorflow/contrib/lite/kernels/div_test.cc b/tensorflow/contrib/lite/kernels/div_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..276b8289fbc1b4dcbf4624b76b854300d0fd4912
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/div_test.cc
@@ -0,0 +1,118 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class BaseDivOpModel : public SingleOpModel {
+ public:
+  BaseDivOpModel(const TensorData& input1, const TensorData& input2,
+                 const TensorData& output,
+                 ActivationFunctionType activation_type) {
+    input1_ = AddInput(input1);
+    input2_ = AddInput(input2);
+    output_ = AddOutput(output);
+    SetBuiltinOp(BuiltinOperator_DIV, BuiltinOptions_DivOptions,
+                 CreateDivOptions(builder_, activation_type).Union());
+    BuildInterpreter({GetShape(input1_), GetShape(input2_)});
+  }
+
+  int input1() { return input1_; }
+  int input2() { return input2_; }
+
+ protected:
+  int input1_;
+  int input2_;
+  int output_;
+};
+
+class FloatDivOpModel : public BaseDivOpModel {
+ public:
+  using BaseDivOpModel::BaseDivOpModel;
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+};
+
+TEST(FloatDivOpTest, NoActivation) {
+  FloatDivOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+  m.PopulateTensor<float>(m.input1(), {-0.2, 0.2, -1.2, 0.8});
+  m.PopulateTensor<float>(m.input2(), {0.5, 0.2, -1.5, 0.5});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear({-0.4, 1.0, 0.8, 1.6})));
+}
+
+TEST(FloatDivOpTest, ActivationRELU_N1_TO_1) {
+  FloatDivOpModel m(
+      {TensorType_FLOAT32, {1, 2, 2, 1}}, {TensorType_FLOAT32, {1, 2, 2, 1}},
+      {TensorType_FLOAT32, {}}, ActivationFunctionType_RELU_N1_TO_1);
+  m.PopulateTensor<float>(m.input1(), {-0.2, 0.2, -1.2, 0.8});
+  m.PopulateTensor<float>(m.input2(), {0.1, 0.2, -1.5, 0.5});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear({-1.0, 1.0, 0.8, 1.0})));
+}
+
+TEST(FloatDivOpTest, VariousInputShapes) {
+  std::vector<std::initializer_list<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    FloatDivOpModel m({TensorType_FLOAT32, test_shapes[i]},
+                      {TensorType_FLOAT32, test_shapes[i]},
+                      {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+    m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.3, 0.8, 1.1, -2.0});
+    m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.6, 0.5, -1.1, -0.1});
+    m.Invoke();
+    EXPECT_THAT(
+        m.GetOutput(),
+        ElementsAreArray(ArrayFloatNear({-20.0, 1.0, 0.5, 1.6, -1.0, 20.0})))
+        << "With shape number " << i;
+  }
+}
+
+TEST(FloatDivOpTest, WithBroadcast) {
+  std::vector<std::initializer_list<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    FloatDivOpModel m({TensorType_FLOAT32, test_shapes[i]},
+                      {TensorType_FLOAT32, {}},  // always a scalar
+                      {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+    m.PopulateTensor<float>(m.input1(), {-0.2, 0.2, 0.07, 0.08, 0.11, -0.123});
+    m.PopulateTensor<float>(m.input2(), {0.1});
+    m.Invoke();
+    EXPECT_THAT(
+        m.GetOutput(),
+        ElementsAreArray(ArrayFloatNear({-2.0, 2.0, 0.7, 0.8, 1.1, -1.23})))
+        << "With shape number " << i;
+  }
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/eigen_support.cc b/tensorflow/contrib/lite/kernels/eigen_support.cc
index 213e46555210102b8faeb2e4d9900f924a023366..f1fdb42624073717fb70423ff70dfad08e578ca6 100644
--- a/tensorflow/contrib/lite/kernels/eigen_support.cc
+++ b/tensorflow/contrib/lite/kernels/eigen_support.cc
@@ -46,8 +46,15 @@ void DecrementUsageCounter(TfLiteContext* context) {
   }
   if (--ptr->num_references == 0) {
     delete ptr;
+    context->eigen_context = nullptr;
   }
 }
 
+void SetNumThreads(TfLiteContext* context, int num_threads) {
+  IncrementUsageCounter(context);
+  Eigen::setNbThreads(num_threads);
+  DecrementUsageCounter(context);
+}
+
 }  // namespace eigen_support
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/eigen_support.h b/tensorflow/contrib/lite/kernels/eigen_support.h
index d47e691123282a8a8cc53c29be1d95af037e3939..aa8c351fd8e8dae45f7d4807ce24d80bb393c41c 100644
--- a/tensorflow/contrib/lite/kernels/eigen_support.h
+++ b/tensorflow/contrib/lite/kernels/eigen_support.h
@@ -28,6 +28,9 @@ void IncrementUsageCounter(TfLiteContext* context);
 // usages all temporary Eigen objects will be deleted.
 void DecrementUsageCounter(TfLiteContext* context);
 
+// Set the number of threads that can be used by Eigen.
+void SetNumThreads(TfLiteContext* context, int num_threads);
+
 }  // namespace eigen_support
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/fully_connected_test.cc b/tensorflow/contrib/lite/kernels/fully_connected_test.cc
index a0f766c4f4580d7679275c0b63aa200410fcb5ad..87413000a93a0a361d81b1f0eb46550b5b90f9ac 100644
--- a/tensorflow/contrib/lite/kernels/fully_connected_test.cc
+++ b/tensorflow/contrib/lite/kernels/fully_connected_test.cc
@@ -19,12 +19,25 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "absl/memory/memory.h"
 #include "tensorflow/contrib/lite/interpreter.h"
 #include "tensorflow/contrib/lite/kernels/register.h"
 #include "tensorflow/contrib/lite/kernels/test_util.h"
 #include "tensorflow/contrib/lite/model.h"
 
 namespace tflite {
+
+namespace ops {
+namespace builtin {
+
+TfLiteRegistration* Register_FULLY_CONNECTED_REF();
+TfLiteRegistration* Register_FULLY_CONNECTED_NEON_OPT();
+TfLiteRegistration* Register_FULLY_CONNECTED_GENERIC_OPT();
+TfLiteRegistration* Register_FULLY_CONNECTED_PIE();
+
+}  // namespace builtin
+}  // namespace ops
+
 namespace {
 
 using ::testing::ElementsAre;
@@ -119,7 +132,8 @@ static float fully_connected_golden_output[] = {
 class BaseFullyConnectedOpModel : public SingleOpModel {
  public:
   // TODO(ahentz): test different activation types too.
-  BaseFullyConnectedOpModel(int units, int batches, const TensorData& input,
+  BaseFullyConnectedOpModel(TfLiteRegistration* registration, int units,
+                            int batches, const TensorData& input,
                             const TensorData& output = {TensorType_FLOAT32})
       : batches_(batches), units_(units) {
     int total_input_size = 1;
@@ -149,6 +163,8 @@ class BaseFullyConnectedOpModel : public SingleOpModel {
         BuiltinOperator_FULLY_CONNECTED, BuiltinOptions_FullyConnectedOptions,
         CreateFullyConnectedOptions(builder_, ActivationFunctionType_RELU)
             .Union());
+    resolver_ = absl::make_unique<SingleOpResolver>(
+        BuiltinOperator_FULLY_CONNECTED, registration);
     BuildInterpreter({GetShape(input_), GetShape(weights_), GetShape(bias_)});
   }
 
@@ -208,10 +224,25 @@ class QuantizedFullyConnectedOpModel : public BaseFullyConnectedOpModel {
   }
 };
 
+const auto kKernelMap = new std::map<string, TfLiteRegistration*>({
+    {"Reference", ops::builtin::Register_FULLY_CONNECTED_REF()},
+    {"NeonOptimized", ops::builtin::Register_FULLY_CONNECTED_NEON_OPT()},
+    {"GenericOptimized", ops::builtin::Register_FULLY_CONNECTED_GENERIC_OPT()},
+    {"Pie", ops::builtin::Register_FULLY_CONNECTED_PIE()},
+});
+
+class FullyConnectedOpTest : public SingleOpTest {
+ protected:
+  const std::map<string, TfLiteRegistration*>& GetKernelMap() override {
+    return *kKernelMap;
+  }
+};
+
 // TODO(ahentz): add more small tests like this one, focused on making sure the
 // calculations are correct.
-TEST(FullyConnectedOpTest, SimpleTest) {
-  FloatFullyConnectedOpModel m(3, 2, {TensorType_FLOAT32, {2, 10}});
+TEST_P(FullyConnectedOpTest, SimpleTest) {
+  FloatFullyConnectedOpModel m(GetRegistration(), 3, 2,
+                               {TensorType_FLOAT32, {2, 10}});
   m.SetWeights({
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
@@ -229,9 +260,9 @@ TEST(FullyConnectedOpTest, SimpleTest) {
   EXPECT_THAT(m.GetOutput(), ElementsAre(24, 25, 26, 58, 59, 60));
 }
 
-TEST(FullyConnectedOpTest, SimpleTestQuantized) {
+TEST_P(FullyConnectedOpTest, SimpleTestQuantized) {
   QuantizedFullyConnectedOpModel m(
-      3, 2,
+      GetRegistration(), 3, 2,
       /*input=*/{TensorType_UINT8, {2, 10}, -63.5, 64},
       /*output=*/{TensorType_UINT8, {}, -127, 128});
 
@@ -261,7 +292,8 @@ TEST(FullyConnectedOpTest, SimpleTest4DInput) {
   // Note that it is not required that the first dimension be the number of
   // batches. All we care is that the input can be evenly distributed in
   // batches. In this case, we need the input to have multiples of '2'.
-  FloatFullyConnectedOpModel m(/*units=*/3,
+  FloatFullyConnectedOpModel m(ops::builtin::Register_FULLY_CONNECTED_PIE(),
+                               /*units=*/3,
                                /*batches=*/2,
                                /*input=*/{TensorType_FLOAT32, {4, 1, 5, 1}});
   m.SetWeights({
@@ -284,9 +316,9 @@ TEST(FullyConnectedOpTest, SimpleTest4DInput) {
                              }));
 }
 
-TEST(FullyConnectedOpTest, SimpleTest4dInputQuantized) {
+TEST_P(FullyConnectedOpTest, SimpleTest4dInputQuantized) {
   QuantizedFullyConnectedOpModel m(
-      3, 2,
+      GetRegistration(), 3, 2,
       /*input=*/{TensorType_UINT8, {4, 1, 5, 1}, -63.5, 64},
       /*output=*/{TensorType_UINT8, {}, -127, 128});
 
@@ -312,10 +344,15 @@ TEST(FullyConnectedOpTest, SimpleTest4dInputQuantized) {
   EXPECT_THAT(m.GetOutput(), ElementsAre(151, 152, 153, 185, 186, 187));
 }
 
+INSTANTIATE_TEST_CASE_P(
+    FullyConnectedOpTest, FullyConnectedOpTest,
+    ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
+
 // TODO(ahentz): Reconsider this test. Having arbitrary weights makes it hard
 // to debug errors and doesn't necessarily test all the important details.
-TEST(FullyConnectedOpTest, BlackBoxTest) {
-  FloatFullyConnectedOpModel m(16, 2, {TensorType_FLOAT32, {2, 8}});
+TEST_P(FullyConnectedOpTest, BlackBoxTest) {
+  FloatFullyConnectedOpModel m(GetRegistration(), 16, 2,
+                               {TensorType_FLOAT32, {2, 8}});
   m.SetWeights(
       {0.091327,  0.103366,  -0.316505, -0.083120, 0.149366,  -0.196636,
        -0.123672, 0.062800,  0.063031,  0.191670,  -0.062001, -0.061504,
diff --git a/tensorflow/contrib/lite/kernels/gemm_support.cc b/tensorflow/contrib/lite/kernels/gemm_support.cc
index 76a5165d148c6c1829580a47456cebce321d7c5a..95f45ea768be7f9bae9570563f161792afbff436 100644
--- a/tensorflow/contrib/lite/kernels/gemm_support.cc
+++ b/tensorflow/contrib/lite/kernels/gemm_support.cc
@@ -61,5 +61,11 @@ gemmlowp::GemmContext* GetFromContext(TfLiteContext* context) {
   return ptr->gemm_context_;
 }
 
+void SetNumThreads(TfLiteContext* context, int num_threads) {
+  IncrementUsageCounter(context);
+  GetFromContext(context)->set_max_num_threads(num_threads);
+  DecrementUsageCounter(context);
+}
+
 }  // namespace gemm_support
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/gemm_support.h b/tensorflow/contrib/lite/kernels/gemm_support.h
index 37af772c6846f2f8124faabf1a0f0987e2e9393d..f033501cb6e341aa014fa4d956b531bd79aa555b 100644
--- a/tensorflow/contrib/lite/kernels/gemm_support.h
+++ b/tensorflow/contrib/lite/kernels/gemm_support.h
@@ -45,6 +45,9 @@ void IncrementUsageCounter(TfLiteContext* context);
 // 'context'. If there are no more usages the GemmContext will be deleted.
 void DecrementUsageCounter(TfLiteContext* context);
 
+// Set the number of threads that can be used by gemmlowp.
+void SetNumThreads(TfLiteContext* context, int num_threads);
+
 }  // namespace gemm_support
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/internal/BUILD b/tensorflow/contrib/lite/kernels/internal/BUILD
index aa3957bee133c8b51a82e9c62884ce365e086d2e..167c0f1fde9202452a915cea69cbb935fa1af7b6 100644
--- a/tensorflow/contrib/lite/kernels/internal/BUILD
+++ b/tensorflow/contrib/lite/kernels/internal/BUILD
@@ -431,15 +431,3 @@ cc_library(
 )
 
 exports_files(["optimized/eigen_tensor_reduced_instantiations_oss.h"])
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h
index 08674a6c59192a71bae48eac467fecc7b7dff7b7..0f78e0f728585ab27a8116a4707ac9614a6ea060 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h
@@ -1694,11 +1694,11 @@ inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
   TFLITE_DCHECK(output_depth == input_depth * depth_multiplier);
 
 #ifdef __aarch64__
-  // Call kernel optimized for depthwise convolutions using 3x3 filters,
-  // stride = 1, no padding, depth_multiplier = 1 and depth a multiple of 16.
-  if (filter_width == 3 && filter_height == 3 && depth_multiplier == 1 &&
-      stride_width == 1 && stride_height == 1 && pad_width == 0 &&
-      pad_height == 0 && (input_depth % 16) == 0) {
+  // Call kernel optimized for depthwise convolutions using 3x3 filters if
+  // parameters are supported.
+  if (Fast3by3FilterKernelSupported(input_dims, filter_dims, stride_width,
+                                    stride_height, pad_width, pad_height,
+                                    depth_multiplier, output_dims)) {
     DepthwiseConv3by3FilterDepth16(
         input_data, input_dims, input_offset, filter_data, filter_dims,
         filter_offset, bias_data, bias_dims, stride_width, stride_height,
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
index e0335b2c74cbf6d9d4f17d816f5802286628964f..a349892076fcc4989e2f4cad188b383d2b31d470 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
@@ -440,6 +440,47 @@ struct ConvKernel3x3FilterDepth16<1, 1> {
   }
 };
 
+inline bool Fast3by3FilterKernelSupported(const Dims<4>& input_dims,
+                                          const Dims<4>& filter_dims,
+                                          int stride_width, int stride_height,
+                                          int pad_width, int pad_height,
+                                          int depth_multiplier,
+                                          const Dims<4>& output_dims) {
+  const int input_height = ArraySize(input_dims, 2);
+  const int input_width = ArraySize(input_dims, 1);
+  const int input_depth = ArraySize(input_dims, 0);
+  const int filter_height = ArraySize(filter_dims, 2);
+  const int filter_width = ArraySize(filter_dims, 1);
+  const int output_height = ArraySize(output_dims, 2);
+  const int output_width = ArraySize(output_dims, 1);
+
+  bool supported = filter_width == 3 && filter_height == 3 &&
+                   depth_multiplier == 1 &&
+                   (stride_width == 1 || stride_width == 2) &&
+                   (stride_height == 1 || stride_height == 2) &&
+                   pad_width == 0 && pad_height == 0 && (input_depth % 16) == 0;
+
+  if (!supported) {
+    return false;
+  }
+
+  // Handle case where padding is zero but type is not kValid. This would
+  // require special boundary case handling that is not supported yet.
+
+  const int out_x = output_width - 1;
+  const int out_y = output_height - 1;
+
+  const int in_x_origin = (out_x * stride_width) - pad_width;
+  const int in_y_origin = (out_y * stride_height) - pad_height;
+
+  const int in_x_end = in_x_origin + filter_width;
+  const int in_y_end = in_y_origin + filter_height;
+
+  // Supported only if filter on the right and bottom boundary lies completely
+  // within the input.
+  return in_x_end <= input_width && in_y_end <= input_height;
+}
+
 inline void DepthwiseConv3by3FilterDepth16(
     const uint8* input_data, const Dims<4>& input_dims, int32 input_offset,
     const uint8* filter_data, const Dims<4>& filter_dims, int32 filter_offset,
@@ -466,8 +507,8 @@ inline void DepthwiseConv3by3FilterDepth16(
   TFLITE_DCHECK(filter_width == 3);
   TFLITE_DCHECK(pad_height == 0);
   TFLITE_DCHECK(pad_width == 0);
-  TFLITE_DCHECK(stride_width == 1);
-  TFLITE_DCHECK(stride_height == 1);
+  TFLITE_DCHECK(stride_width == 1 || stride_width == 2);
+  TFLITE_DCHECK(stride_height == 1 || stride_height == 2);
 
   // The number of outputs to process in the main loop.
   const int num_x_outputs = 1;
@@ -513,6 +554,16 @@ inline void DepthwiseConv3by3FilterDepth16(
     }
   }
 
+  using dot_product_func_t =
+      decltype(&ConvKernel3x3FilterDepth16<1, 2, 1>::Run);
+  dot_product_func_t dot_product_func = nullptr;
+
+  if (stride_width == 1 && stride_height == 1) {
+    dot_product_func = ConvKernel3x3FilterDepth16<1, 2, 1>::Run;
+  } else {
+    dot_product_func = ConvKernel3x3FilterDepth16<1, 2, 2>::Run;
+  }
+
   // Offsets for preloading inputs.
   const int i0 = 0;
   const int i1 = input_depth;
@@ -526,6 +577,9 @@ inline void DepthwiseConv3by3FilterDepth16(
   const int i9 = 3 * input_row_width;
   const int i10 = 3 * input_row_width + input_depth;
   const int i11 = 3 * input_row_width + 2 * input_depth;
+  const int i12 = 4 * input_row_width;
+  const int i13 = 4 * input_row_width + input_depth;
+  const int i14 = 4 * input_row_width + 2 * input_depth;
 
   for (int b = 0; b < batches; ++b) {
     const int32* bias_ptr = bias_data;
@@ -551,10 +605,6 @@ inline void DepthwiseConv3by3FilterDepth16(
         const uint8* input_ptr =
             input_data + depth + in_x_offset + in_y_offset + in_batch_offset;
 
-        uint8* output_ptr = output_data + depth + (out_x * output_depth) +
-                            (output_depth * output_width * out_y) +
-                            out_batch_offset;
-
         // Preload inputs. If input depth is large, preload every value of the
         // input for this depth range. Otherwise, preload only the first values
         // of each row.
@@ -571,19 +621,33 @@ inline void DepthwiseConv3by3FilterDepth16(
           preload_l1_keep(input_ptr + i9);
           preload_l1_keep(input_ptr + i10);
           preload_l1_keep(input_ptr + i11);
+
+          if (stride_height == 2) {
+            preload_l1_keep(input_ptr + i12);
+            preload_l1_keep(input_ptr + i13);
+            preload_l1_keep(input_ptr + i14);
+          }
         } else {
           preload_l1_keep(input_ptr + i0);
           preload_l1_keep(input_ptr + i3);
           preload_l1_keep(input_ptr + i6);
           preload_l1_keep(input_ptr + i9);
+
+          if (stride_height == 2) {
+            preload_l1_keep(input_ptr + i12);
+          }
         }
 
+        uint8* output_ptr = output_data + depth + (out_x * output_depth) +
+                            (output_depth * output_width * out_y) +
+                            out_batch_offset;
+
         for (; out_x < out_x_end; out_x += num_x_outputs) {
-          ConvKernel3x3FilterDepth16<1, 2, 1>::Run(
-              filter, input_ptr, input_depth, input_offset, input_row_width,
-              bias_ptr, output_offset, output_multiplier, output_shift,
-              output_activation_min, output_activation_max, output_ptr,
-              output_depth, output_width);
+          dot_product_func(filter, input_ptr, input_depth, input_offset,
+                           input_row_width, bias_ptr, output_offset,
+                           output_multiplier, output_shift,
+                           output_activation_min, output_activation_max,
+                           output_ptr, output_depth, output_width);
 
           input_ptr += input_ptr_x_increment * num_x_outputs;
           output_ptr += output_depth * num_x_outputs;
@@ -603,13 +667,15 @@ inline void DepthwiseConv3by3FilterDepth16(
             preload_l1_keep(input_ptr + i8);
             preload_l1_keep(input_ptr + i10);
             preload_l1_keep(input_ptr + i11);
+            preload_l1_keep(input_ptr + i13);
+            preload_l1_keep(input_ptr + i14);
           }
         }
 
         // Handle the rest of the right side.
         for (; out_x < output_width; out_x++) {
           // This code path can only be reached if we're handling >1 x outputs
-          // at a time or support padding.
+          // at a time or support kSame padding.
         }
       }
 
@@ -624,6 +690,21 @@ inline void DepthwiseConv3by3FilterDepth16(
         const uint8* input_ptr =
             input_data + depth + in_x_offset + in_y_offset + in_batch_offset;
 
+        if (input_depth >= 32) {
+          preload_l1_keep(input_ptr + i0);
+          preload_l1_keep(input_ptr + i1);
+          preload_l1_keep(input_ptr + i2);
+          preload_l1_keep(input_ptr + i3);
+          preload_l1_keep(input_ptr + i4);
+          preload_l1_keep(input_ptr + i5);
+          preload_l1_keep(input_ptr + i6);
+          preload_l1_keep(input_ptr + i7);
+        } else {
+          preload_l1_keep(input_ptr + i0);
+          preload_l1_keep(input_ptr + i3);
+          preload_l1_keep(input_ptr + i6);
+        }
+
         uint8* output_ptr = output_data + depth + (out_x * output_depth) +
                             (output_depth * output_width * out_y) +
                             out_batch_offset;
@@ -637,6 +718,19 @@ inline void DepthwiseConv3by3FilterDepth16(
 
           input_ptr += input_ptr_x_increment;
           output_ptr += output_depth;
+
+          if (stride_width == 1) {
+            preload_l1_keep(input_ptr + i2);
+            preload_l1_keep(input_ptr + i5);
+            preload_l1_keep(input_ptr + i8);
+          } else if (stride_width == 2) {
+            preload_l1_keep(input_ptr + i1);
+            preload_l1_keep(input_ptr + i2);
+            preload_l1_keep(input_ptr + i4);
+            preload_l1_keep(input_ptr + i5);
+            preload_l1_keep(input_ptr + i7);
+            preload_l1_keep(input_ptr + i8);
+          }
         }
       }
       filter_ptr += 16;
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index 6bbc213cc67f1421a4782636a0f5142c9a4664f3..3642da311cbe938f672d64338a335181764c4175 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -324,6 +324,332 @@ void Gemm(const Eigen::MatrixBase<Lhs>& lhs, const Eigen::MatrixBase<Rhs>& rhs,
   }
 }
 
+inline void optimized_ops_preload_l1_stream(const uint8* ptr) {
+#ifdef GEMMLOWP_ARM_64
+  asm volatile("prfm pldl1strm, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
+#else
+  gemmlowp::Prefetch(ptr);
+#endif
+}
+
+inline void optimized_ops_preload_l1_keep(const uint8* ptr) {
+#ifdef GEMMLOWP_ARM_64
+  asm volatile("prfm pldl1keep, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
+#else
+  gemmlowp::Prefetch(ptr);
+#endif
+}
+
+#ifdef GEMMLOWP_NEON
+// In the common case of batch size 1, a fully-connected node degenerates
+// to a matrix*vector product. LSTM cells contain a fully-connected node;
+// when quantized, this becomes a special type of GEMV operation where
+// the output is 16bit-quantized, thus needs its own special path.
+inline void GEMVForLstmCell(const uint8* input_data, const Dims<4>& input_dims,
+                            const uint8* weights_data,
+                            const Dims<4>& weights_dims,
+                            uint8 weights_zero_point, const int32* bias_data,
+                            const Dims<4>& bias_dims, int32 accum_multiplier,
+                            int accum_shift, int16* output_data,
+                            const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("GEMVForLstmCell");
+  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(weights_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(bias_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
+  TFLITE_DCHECK_EQ(ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
+                       ArraySize(output_dims, 3),
+                   1);
+  const int input_size = input_dims.strides[3];
+  const int output_size = MatchingArraySize(weights_dims, 1, output_dims, 0);
+  // This special fast path for quantized LSTM cells does not try to support
+  // odd sizes that we haven't encountered in any LSTM cell, that would
+  // require special code (that would go untested until any LSTM cell
+  // exercises it). We just guard our assumptions about size evenness with
+  // the following assertions.
+  TFLITE_DCHECK(!(output_size % 4));
+  TFLITE_DCHECK(!(input_size % 8));
+  const int32* bias_ptr = bias_data;
+  int16* output_ptr = output_data;
+  for (int out = 0; out < output_size; out += 4) {
+    int32x4_t acc_0 = vdupq_n_s32(0);
+    int32x4_t acc_1 = vdupq_n_s32(0);
+    int32x4_t acc_2 = vdupq_n_s32(0);
+    int32x4_t acc_3 = vdupq_n_s32(0);
+    const int16x8_t input_offset_vec = vdupq_n_s16(-128);
+    const int16x8_t weights_offset_vec = vdupq_n_s16(-weights_zero_point);
+    int in = 0;
+    // Handle 16 levels of depth at a time.
+    for (; in <= input_size - 16; in += 16) {
+      const uint8x16_t input_val_u8 = vld1q_u8(input_data + in);
+      const uint8* weights_ptr = weights_data + in + out * input_size;
+      uint8x16_t weights_val_u8_0 = vld1q_u8(weights_ptr + 0 * input_size);
+      uint8x16_t weights_val_u8_1 = vld1q_u8(weights_ptr + 1 * input_size);
+      uint8x16_t weights_val_u8_2 = vld1q_u8(weights_ptr + 2 * input_size);
+      uint8x16_t weights_val_u8_3 = vld1q_u8(weights_ptr + 3 * input_size);
+      int16x8_t input_val_0, input_val_1;
+      const uint8x8_t low = vget_low_u8(input_val_u8);
+      const uint8x8_t high = vget_high_u8(input_val_u8);
+      input_val_0 = vreinterpretq_s16_u16(vmovl_u8(low));
+      input_val_1 = vreinterpretq_s16_u16(vmovl_u8(high));
+      input_val_0 = vaddq_s16(input_val_0, input_offset_vec);
+      input_val_1 = vaddq_s16(input_val_1, input_offset_vec);
+      int16x8_t weights_val_0_0, weights_val_1_0, weights_val_2_0,
+          weights_val_3_0;
+      int16x8_t weights_val_0_1, weights_val_1_1, weights_val_2_1,
+          weights_val_3_1;
+      weights_val_0_0 = vaddq_s16(
+          vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(weights_val_u8_0))),
+          weights_offset_vec);
+      weights_val_0_1 = vaddq_s16(
+          vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(weights_val_u8_0))),
+          weights_offset_vec);
+      weights_val_1_0 = vaddq_s16(
+          vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(weights_val_u8_1))),
+          weights_offset_vec);
+      weights_val_1_1 = vaddq_s16(
+          vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(weights_val_u8_1))),
+          weights_offset_vec);
+      weights_val_2_0 = vaddq_s16(
+          vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(weights_val_u8_2))),
+          weights_offset_vec);
+      weights_val_2_1 = vaddq_s16(
+          vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(weights_val_u8_2))),
+          weights_offset_vec);
+      weights_val_3_0 = vaddq_s16(
+          vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(weights_val_u8_3))),
+          weights_offset_vec);
+      weights_val_3_1 = vaddq_s16(
+          vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(weights_val_u8_3))),
+          weights_offset_vec);
+      acc_0 = vmlal_s16(acc_0, vget_low_s16(weights_val_0_0),
+                        vget_low_s16(input_val_0));
+      acc_1 = vmlal_s16(acc_1, vget_low_s16(weights_val_1_0),
+                        vget_low_s16(input_val_0));
+      acc_2 = vmlal_s16(acc_2, vget_low_s16(weights_val_2_0),
+                        vget_low_s16(input_val_0));
+      acc_3 = vmlal_s16(acc_3, vget_low_s16(weights_val_3_0),
+                        vget_low_s16(input_val_0));
+      acc_0 = vmlal_s16(acc_0, vget_high_s16(weights_val_0_0),
+                        vget_high_s16(input_val_0));
+      acc_1 = vmlal_s16(acc_1, vget_high_s16(weights_val_1_0),
+                        vget_high_s16(input_val_0));
+      acc_2 = vmlal_s16(acc_2, vget_high_s16(weights_val_2_0),
+                        vget_high_s16(input_val_0));
+      acc_3 = vmlal_s16(acc_3, vget_high_s16(weights_val_3_0),
+                        vget_high_s16(input_val_0));
+      acc_0 = vmlal_s16(acc_0, vget_low_s16(weights_val_0_1),
+                        vget_low_s16(input_val_1));
+      acc_1 = vmlal_s16(acc_1, vget_low_s16(weights_val_1_1),
+                        vget_low_s16(input_val_1));
+      acc_2 = vmlal_s16(acc_2, vget_low_s16(weights_val_2_1),
+                        vget_low_s16(input_val_1));
+      acc_3 = vmlal_s16(acc_3, vget_low_s16(weights_val_3_1),
+                        vget_low_s16(input_val_1));
+      acc_0 = vmlal_s16(acc_0, vget_high_s16(weights_val_0_1),
+                        vget_high_s16(input_val_1));
+      acc_1 = vmlal_s16(acc_1, vget_high_s16(weights_val_1_1),
+                        vget_high_s16(input_val_1));
+      acc_2 = vmlal_s16(acc_2, vget_high_s16(weights_val_2_1),
+                        vget_high_s16(input_val_1));
+      acc_3 = vmlal_s16(acc_3, vget_high_s16(weights_val_3_1),
+                        vget_high_s16(input_val_1));
+    }
+    // Handle 8 levels of depth at a time.
+    for (; in < input_size; in += 8) {
+      const uint8x8_t input_val_u8 = vld1_u8(input_data + in);
+      const uint8* weights_ptr = weights_data + in + out * input_size;
+      uint8x8_t weights_val_u8_0 = vld1_u8(weights_ptr + 0 * input_size);
+      uint8x8_t weights_val_u8_1 = vld1_u8(weights_ptr + 1 * input_size);
+      uint8x8_t weights_val_u8_2 = vld1_u8(weights_ptr + 2 * input_size);
+      uint8x8_t weights_val_u8_3 = vld1_u8(weights_ptr + 3 * input_size);
+      int16x8_t input_val;
+      input_val = vreinterpretq_s16_u16(vmovl_u8(input_val_u8));
+      input_val = vaddq_s16(input_val, input_offset_vec);
+      int16x8_t weights_val_0, weights_val_1, weights_val_2, weights_val_3;
+      weights_val_0 =
+          vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(weights_val_u8_0)),
+                    weights_offset_vec);
+      weights_val_1 =
+          vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(weights_val_u8_1)),
+                    weights_offset_vec);
+      weights_val_2 =
+          vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(weights_val_u8_2)),
+                    weights_offset_vec);
+      weights_val_3 =
+          vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(weights_val_u8_3)),
+                    weights_offset_vec);
+      acc_0 = vmlal_s16(acc_0, vget_low_s16(weights_val_0),
+                        vget_low_s16(input_val));
+      acc_1 = vmlal_s16(acc_1, vget_low_s16(weights_val_1),
+                        vget_low_s16(input_val));
+      acc_2 = vmlal_s16(acc_2, vget_low_s16(weights_val_2),
+                        vget_low_s16(input_val));
+      acc_3 = vmlal_s16(acc_3, vget_low_s16(weights_val_3),
+                        vget_low_s16(input_val));
+      acc_0 = vmlal_s16(acc_0, vget_high_s16(weights_val_0),
+                        vget_high_s16(input_val));
+      acc_1 = vmlal_s16(acc_1, vget_high_s16(weights_val_1),
+                        vget_high_s16(input_val));
+      acc_2 = vmlal_s16(acc_2, vget_high_s16(weights_val_2),
+                        vget_high_s16(input_val));
+      acc_3 = vmlal_s16(acc_3, vget_high_s16(weights_val_3),
+                        vget_high_s16(input_val));
+    }
+    // Horizontally reduce accumulators
+    int32x2_t pairwise_reduced_acc_0, pairwise_reduced_acc_1,
+        pairwise_reduced_acc_2, pairwise_reduced_acc_3;
+    pairwise_reduced_acc_0 =
+        vpadd_s32(vget_low_s32(acc_0), vget_high_s32(acc_0));
+    pairwise_reduced_acc_1 =
+        vpadd_s32(vget_low_s32(acc_1), vget_high_s32(acc_1));
+    pairwise_reduced_acc_2 =
+        vpadd_s32(vget_low_s32(acc_2), vget_high_s32(acc_2));
+    pairwise_reduced_acc_3 =
+        vpadd_s32(vget_low_s32(acc_3), vget_high_s32(acc_3));
+    const int32x2_t reduced_lo =
+        vpadd_s32(pairwise_reduced_acc_0, pairwise_reduced_acc_1);
+    const int32x2_t reduced_hi =
+        vpadd_s32(pairwise_reduced_acc_2, pairwise_reduced_acc_3);
+    int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi);
+    // Add bias values.
+    int32x4_t bias_vec = vld1q_s32(bias_ptr);
+    bias_ptr += 4;
+    reduced = vaddq_s32(reduced, bias_vec);
+    int left_shift = accum_shift > 0 ? accum_shift : 0;
+    int right_shift = accum_shift > 0 ? 0 : -accum_shift;
+    reduced = vshlq_s32(reduced, vdupq_n_s32(left_shift));
+    // Multiply by the fixed-point multiplier.
+    reduced = vqrdmulhq_n_s32(reduced, accum_multiplier);
+    // Rounding-shift-right.
+    using gemmlowp::RoundingDivideByPOT;
+    reduced = RoundingDivideByPOT(reduced, right_shift);
+    // Narrow values down to 16 bit signed.
+    const int16x4_t res16 = vqmovn_s32(reduced);
+    vst1_s16(output_ptr, res16);
+    output_ptr += 4;
+  }
+}
+#endif
+
+#ifdef GEMMLOWP_NEON
+inline void GEMVForLstmCellWithSymmetricRange(
+    const uint8* input_data, const Dims<4>& input_dims,
+    const uint8* weights_data, const Dims<4>& weights_dims,
+    const int32* bias_data, const Dims<4>& bias_dims, int32 accum_multiplier,
+    int accum_shift, int16* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("GEMVForLstmCellWithSymmetricRange");
+  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(weights_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(bias_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
+  TFLITE_DCHECK_EQ(ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
+                       ArraySize(output_dims, 3),
+                   1);
+  const int input_size = input_dims.strides[3];
+  const int output_size = MatchingArraySize(weights_dims, 1, output_dims, 0);
+  // This special fast path for quantized LSTM cells does not try to support
+  // odd sizes that we haven't encountered in any LSTM cell, that would
+  // require special code (that would go untested until any LSTM cell
+  // exercises it). We just guard our assumptions about size evenness with
+  // the following assertions.
+  TFLITE_DCHECK(!(output_size % 4));
+  TFLITE_DCHECK(!(input_size % 8));
+  const int32* bias_ptr = bias_data;
+  int16* output_ptr = output_data;
+  const uint8x16_t signbit = vdupq_n_u8(0x80);
+  for (int in = 0; in < input_size; in += 32) {
+    optimized_ops_preload_l1_keep(input_data + in);
+  }
+  for (int out = 0; out < output_size; out += 4) {
+    const uint8* weights_ptr_0 = weights_data + out * input_size;
+    const uint8* weights_ptr_1 = weights_ptr_0 + 1 * input_size;
+    const uint8* weights_ptr_2 = weights_ptr_0 + 2 * input_size;
+    const uint8* weights_ptr_3 = weights_ptr_0 + 3 * input_size;
+
+    int32x4_t acc_0 = vdupq_n_s32(0);
+    int32x4_t acc_1 = vdupq_n_s32(0);
+    int32x4_t acc_2 = vdupq_n_s32(0);
+    int32x4_t acc_3 = vdupq_n_s32(0);
+    int in = 0;
+    const int kReadAhead = 256;
+    // Handle 16 levels of depth at a time.
+    for (; in < input_size; in += 16) {
+      int8x16_t weights_val_0 =
+          vreinterpretq_s8_u8(veorq_u8(signbit, vld1q_u8(weights_ptr_0)));
+      int8x16_t weights_val_1 =
+          vreinterpretq_s8_u8(veorq_u8(signbit, vld1q_u8(weights_ptr_1)));
+      int8x16_t weights_val_2 =
+          vreinterpretq_s8_u8(veorq_u8(signbit, vld1q_u8(weights_ptr_2)));
+      int8x16_t weights_val_3 =
+          vreinterpretq_s8_u8(veorq_u8(signbit, vld1q_u8(weights_ptr_3)));
+      int8x16_t input_val =
+          vreinterpretq_s8_u8(veorq_u8(signbit, vld1q_u8(input_data + in)));
+      int16x8_t acc16_0 =
+          vmull_s8(vget_low_s8(weights_val_0), vget_low_s8(input_val));
+      int16x8_t acc16_1 =
+          vmull_s8(vget_low_s8(weights_val_1), vget_low_s8(input_val));
+      int16x8_t acc16_2 =
+          vmull_s8(vget_low_s8(weights_val_2), vget_low_s8(input_val));
+      int16x8_t acc16_3 =
+          vmull_s8(vget_low_s8(weights_val_3), vget_low_s8(input_val));
+      acc16_0 = vmlal_s8(acc16_0, vget_high_s8(weights_val_0),
+                         vget_high_s8(input_val));
+      acc16_1 = vmlal_s8(acc16_1, vget_high_s8(weights_val_1),
+                         vget_high_s8(input_val));
+      acc16_2 = vmlal_s8(acc16_2, vget_high_s8(weights_val_2),
+                         vget_high_s8(input_val));
+      acc16_3 = vmlal_s8(acc16_3, vget_high_s8(weights_val_3),
+                         vget_high_s8(input_val));
+      acc_0 = vpadalq_s16(acc_0, acc16_0);
+      acc_1 = vpadalq_s16(acc_1, acc16_1);
+      acc_2 = vpadalq_s16(acc_2, acc16_2);
+      acc_3 = vpadalq_s16(acc_3, acc16_3);
+      weights_ptr_0 += 16;
+      weights_ptr_1 += 16;
+      weights_ptr_2 += 16;
+      weights_ptr_3 += 16;
+      optimized_ops_preload_l1_stream(weights_ptr_0 + kReadAhead);
+      optimized_ops_preload_l1_stream(weights_ptr_1 + kReadAhead);
+      optimized_ops_preload_l1_stream(weights_ptr_2 + kReadAhead);
+      optimized_ops_preload_l1_stream(weights_ptr_3 + kReadAhead);
+    }
+    // Horizontally reduce accumulators
+    int32x2_t pairwise_reduced_acc_0, pairwise_reduced_acc_1,
+        pairwise_reduced_acc_2, pairwise_reduced_acc_3;
+    pairwise_reduced_acc_0 =
+        vpadd_s32(vget_low_s32(acc_0), vget_high_s32(acc_0));
+    pairwise_reduced_acc_1 =
+        vpadd_s32(vget_low_s32(acc_1), vget_high_s32(acc_1));
+    pairwise_reduced_acc_2 =
+        vpadd_s32(vget_low_s32(acc_2), vget_high_s32(acc_2));
+    pairwise_reduced_acc_3 =
+        vpadd_s32(vget_low_s32(acc_3), vget_high_s32(acc_3));
+    const int32x2_t reduced_lo =
+        vpadd_s32(pairwise_reduced_acc_0, pairwise_reduced_acc_1);
+    const int32x2_t reduced_hi =
+        vpadd_s32(pairwise_reduced_acc_2, pairwise_reduced_acc_3);
+    int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi);
+    // Add bias values.
+    int32x4_t bias_vec = vld1q_s32(bias_ptr);
+    bias_ptr += 4;
+    reduced = vaddq_s32(reduced, bias_vec);
+    int left_shift = accum_shift > 0 ? accum_shift : 0;
+    int right_shift = accum_shift > 0 ? 0 : -accum_shift;
+    reduced = vshlq_s32(reduced, vdupq_n_s32(left_shift));
+    // Multiply by the fixed-point multiplier.
+    reduced = vqrdmulhq_n_s32(reduced, accum_multiplier);
+    // Rounding-shift-right.
+    using gemmlowp::RoundingDivideByPOT;
+    reduced = RoundingDivideByPOT(reduced, right_shift);
+    // Narrow values down to 16 bit signed.
+    const int16x4_t res16 = vqmovn_s32(reduced);
+    vst1_s16(output_ptr, res16);
+    output_ptr += 4;
+  }
+}
+#endif
+
 inline void FullyConnected(const float* input_data, const Dims<4>& input_dims,
                            const float* weights_data,
                            const Dims<4>& weights_dims, const float* bias_data,
@@ -367,14 +693,6 @@ void FullyConnected(const float* input_data, const Dims<4>& input_dims,
                  output_data, output_dims);
 }
 
-inline void preload_l1_stream(const uint8* ptr) {
-#ifdef GEMMLOWP_ARM_64
-  asm volatile("prfm pldl1strm, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
-#else
-  gemmlowp::Prefetch(ptr);
-#endif
-}
-
 #ifdef USE_NEON
 inline void FullyConnectedAsGEMV(
     const uint8* input_data, const Dims<4>& input_dims, int32 input_offset,
@@ -395,10 +713,10 @@ inline void FullyConnectedAsGEMV(
   const int output_size = MatchingArraySize(filter_dims, 1, output_dims, 0);
   static constexpr int kPeel = 4;
   for (int k = 0; k < input_size; k += 64) {
-    preload_l1_stream(input_data + k);
+    optimized_ops_preload_l1_stream(input_data + k);
   }
   for (int k = 0; k < kPeel * input_size; k += 64) {
-    preload_l1_stream(filter_data + k);
+    optimized_ops_preload_l1_stream(filter_data + k);
   }
   TFLITE_DCHECK(!(output_size % kPeel));
   const int32* bias_ptr = bias_data;
@@ -417,7 +735,7 @@ inline void FullyConnectedAsGEMV(
       for (int k = 0; k < kPeel; k++) {
         const uint8* filter_ptr = filter_data + in + (out + k) * input_size;
         filter_val_u8[k] = vld1q_u8(filter_ptr);
-        preload_l1_stream(filter_ptr + 64);
+        optimized_ops_preload_l1_stream(filter_ptr + 64);
       }
       int16x8_t input_val[2];
       const uint8x8_t low = vget_low_u8(input_val_u8);
@@ -610,21 +928,20 @@ inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
       input_offset, output_pipeline);
 }
 
-inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
-                           int32 input_offset, const uint8* filter_data,
-                           const Dims<4>& filter_dims, int32 filter_offset,
-                           const int32* bias_data, const Dims<4>& bias_dims,
-                           int32 output_offset, int32 output_multiplier,
-                           int output_shift, int32 output_activation_min,
-                           int32 output_activation_max, int16* output_data,
-                           const Dims<4>& output_dims,
-                           gemmlowp::GemmContext* gemm_context) {
+inline void FullyConnected(
+    const uint8* input_data, const Dims<4>& input_dims, int32 input_offset,
+    const uint8* filter_data, const Dims<4>& filter_dims, int32 filter_offset,
+    const int32* bias_data_int32, const Dims<4>& bias_dims, int32 output_offset,
+    int32 output_multiplier, int output_shift, int32 output_activation_min,
+    int32 output_activation_max, int16* output_data, const Dims<4>& output_dims,
+    gemmlowp::GemmContext* gemm_context) {
   gemmlowp::ScopedProfilingLabel label("FullyConnected/Uint8Int16");
   // This is a copy of the reference implementation. We do not currently have a
   // properly optimized version.
   (void)gemm_context;  // only used in properly optimized code.
   TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
   TFLITE_DCHECK_EQ(output_offset, 0);
+
   // TODO(benoitjacob): This really should be:
   //     const int batches = ArraySize(output_dims, 1);
   // but the current --variable_batch hack consists in overwriting the 3rd
@@ -636,30 +953,58 @@ inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
   const int accum_depth = ArraySize(filter_dims, 0);
   TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
   TFLITE_DCHECK(IsPackedWithoutStrides(filter_dims));
-  for (int b = 0; b < batches; ++b) {
-    for (int out_c = 0; out_c < output_depth; ++out_c) {
-      // Internal accumulation.
-      // Initialize accumulator with the bias-value.
-      int32 accum = bias_data[out_c];
-      // Accumulation loop.
-      for (int d = 0; d < accum_depth; ++d) {
-        int16 input_val = input_data[b * accum_depth + d] + input_offset;
-        int16 filter_val = filter_data[out_c * accum_depth + d] + filter_offset;
-        accum += filter_val * input_val;
-      }
-      // Down-scale the final int32 accumulator to the scale used by our
-      // (16-bit, typically 3 integer bits) fixed-point format. The quantized
-      // multiplier and shift here have been pre-computed offline
-      // (e.g. by toco).
-      accum = MultiplyByQuantizedMultiplier(accum, output_multiplier,
-                                            -output_shift);
-      // Saturate, cast to int16, and store to output array.
-      accum = std::max(accum, output_activation_min - output_offset);
-      accum = std::min(accum, output_activation_max - output_offset);
-      accum += output_offset;
-      output_data[out_c + output_depth * b] = accum;
+
+  // Implementation of the fully connected node suited to the inside of an LSTM
+  // cell. The operands are 8-bit integers, the accumulators are internally
+  // 32bit integers, and the output is 16-bit fixed-point with 3 integer bits so
+  // the output range is [-2^3, 2^3] == [-8, 8]. The rationale for that
+  // is explained in the function comment above.
+#ifdef GEMMLOWP_NEON
+  if (batches == 1 && input_offset == -128 && output_activation_min == -32768 &&
+      output_activation_max == 32767) {
+    if (filter_offset == -128 && !(output_depth % 4) && !(accum_depth % 16)) {
+      GEMVForLstmCellWithSymmetricRange(input_data, input_dims, filter_data,
+                                        filter_dims, bias_data_int32, bias_dims,
+                                        output_multiplier, -output_shift,
+                                        output_data, output_dims);
+      return;
+    }
+    if (!(output_depth % 4) && !(accum_depth % 8)) {
+      GEMVForLstmCell(input_data, input_dims, filter_data, filter_dims,
+                      filter_offset, bias_data_int32, bias_dims,
+                      output_multiplier, -output_shift, output_data,
+                      output_dims);
+      return;
     }
   }
+#endif
+  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> weights_matrix(
+      filter_data, output_depth, accum_depth);
+  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix(
+      input_data, accum_depth, batches);
+  gemmlowp::MatrixMap<int16, gemmlowp::MapOrder::ColMajor> output_matrix(
+      output_data, output_depth, batches);
+  typedef gemmlowp::VectorMap<const int32, gemmlowp::VectorShape::Col>
+      ColVectorMap;
+  ColVectorMap bias_vector(bias_data_int32, output_depth);
+  gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
+  bias_addition_stage.bias_vector = bias_vector;
+  gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent scale_stage;
+  scale_stage.result_offset_after_shift = 0;
+  scale_stage.result_fixedpoint_multiplier = output_multiplier;
+  // Note that this shift is negated wrt ordinary FC.
+  scale_stage.result_exponent = -output_shift;
+  gemmlowp::OutputStageClamp clamp_stage;
+  clamp_stage.min = output_activation_min;
+  clamp_stage.max = output_activation_max;
+  gemmlowp::OutputStageSaturatingCastToInt16 saturating_cast_int16_stage;
+  auto output_pipeline =
+      std::make_tuple(bias_addition_stage, scale_stage, clamp_stage,
+                      saturating_cast_int16_stage);
+  gemmlowp::GemmWithOutputPipeline<uint8, int16,
+                                   gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
+      gemm_context, weights_matrix, input_matrix, &output_matrix, filter_offset,
+      input_offset, output_pipeline);
 }
 
 // legacy, for compatibility with old checked-in code
@@ -1583,6 +1928,8 @@ inline void Add(int left_shift, const uint8* input1_data,
   TFLITE_DCHECK_LT(input1_offset, 256);
   TFLITE_DCHECK_LT(input2_offset, 256);
 #ifdef USE_NEON
+  const auto output_activation_min_vector = vdup_n_u8(output_activation_min);
+  const auto output_activation_max_vector = vdup_n_u8(output_activation_max);
   for (; i <= size - 8; i += 8) {
     const auto input1_val_original = vld1_u8(input1_data + i);
     const auto input2_val_original = vld1_u8(input2_data + i);
@@ -1628,7 +1975,10 @@ inline void Add(int left_shift, const uint8* input1_data,
     const auto s2_narrowed = vmovn_s32(s2);
     const auto s = vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed),
                              vdupq_n_s16(output_offset));
-    vst1_u8(output_data + i, vqmovun_s16(s));
+    const auto clamped =
+        vmax_u8(output_activation_min_vector,
+                vmin_u8(output_activation_max_vector, vqmovun_s16(s)));
+    vst1_u8(output_data + i, clamped);
   }
 #endif  // NEON
 
@@ -1655,11 +2005,21 @@ template <FusedActivationFunctionType Ac>
 inline void Add(const int16* input1_data, const Dims<4>& input1_dims,
                 int input1_shift, const int16* input2_data,
                 const Dims<4>& input2_dims, int input2_shift,
+                int16 output_activation_min, int16 output_activation_max,
                 int16* output_data, const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("Add/Int16");
   // This is a copy of the reference implementation. We do not currently have a
   // properly optimized version.
-  static_assert(Ac == FusedActivationFunctionType::kNone, "");
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  if (Ac == FusedActivationFunctionType::kNone) {
+    TFLITE_DCHECK_EQ(output_activation_min, -32768);
+    TFLITE_DCHECK_EQ(output_activation_max, 32767);
+  }
 
   const int flat_size = RequiredBufferSizeForDims(output_dims);
   TFLITE_DCHECK_EQ(RequiredBufferSizeForDims(input1_dims), flat_size);
@@ -1680,7 +2040,10 @@ inline void Add(const int16* input1_data, const Dims<4>& input1_dims,
     F0 scaled_input =
         F0::FromRaw(gemmlowp::RoundingDivideByPOT(shift_input[i], input_shift));
     F0 result = gemmlowp::SaturatingAdd(scaled_input, input_ready_scaled);
-    output_data[i] = result.raw();
+    const int16 raw_output = result.raw();
+    const int16 clamped_output = std::min(
+        output_activation_max, std::max(output_activation_min, raw_output));
+    output_data[i] = clamped_output;
   }
 }
 
@@ -2157,6 +2520,51 @@ inline void Div(const float* input1_data, const Dims<4>& input1_dims,
   }
 }
 
+// TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary
+// dimensionality if the runtime code does a single loop over one dimension
+// that handles broadcasting as the base case. The code generator would then
+// generate max(D1, D2) nested for loops.
+// TODO(benoitjacob): BroadcastDiv is intentionally duplicated from
+// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
+// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
+// reference_ops.h.
+template <typename T>
+void BroadcastDiv(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T output_activation_min, T output_activation_max,
+                  T* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastDiv");
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] /
+                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
+                  output_activation_min, output_activation_max);
+        }
+      }
+    }
+  }
+}
+
 // TODO(aselle): This is not actually optimized yet.
 inline void Sub(const float* input1_data, const Dims<4>& input1_dims,
                 const float* input2_data, const Dims<4>& input2_dims,
@@ -2175,15 +2583,120 @@ inline void Sub(const float* input1_data, const Dims<4>& input1_dims,
       for (int x = 0; x < width; ++x) {
         for (int c = 0; c < depth; ++c) {
           output_data[Offset(output_dims, c, x, y, b)] =
-              ActivationFunctionWithMinMax(
-                  input1_data[Offset(input1_dims, c, x, y, b)] -
-                      input2_data[Offset(input2_dims, c, x, y, b)],
-                  output_activation_min, output_activation_max);
+              ActivationFunctionWithMinMax(
+                  input1_data[Offset(input1_dims, c, x, y, b)] -
+                      input2_data[Offset(input2_dims, c, x, y, b)],
+                  output_activation_min, output_activation_max);
+        }
+      }
+    }
+  }
+}
+
+// TODO(jiawen): We can implement BroadcastSub on buffers of arbitrary
+// dimensionality if the runtime code does a single loop over one dimension
+// that handles broadcasting as the base case. The code generator would then
+// generate max(D1, D2) nested for loops.
+// TODO(benoitjacob): BroadcastSub is intentionally duplicated from
+// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
+// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
+// reference_ops.h.
+template <typename T>
+void BroadcastSub(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T output_activation_min, T output_activation_max,
+                  T* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastSub");
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] -
+                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
+                  output_activation_min, output_activation_max);
+        }
+      }
+    }
+  }
+}
+
+inline void BroadcastSub(int left_shift, const uint8* input1_data,
+                         const Dims<4>& input1_dims, int32 input1_offset,
+                         int32 input1_multiplier, int input1_shift,
+                         const uint8* input2_data, const Dims<4>& input2_dims,
+                         int32 input2_offset, int32 input2_multiplier,
+                         int input2_shift, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastSub/8bit");
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          const int32 input1_val =
+              input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
+          const int32 input2_val =
+              input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
+          const int32 shifted_input1_val = input1_val * (1 << left_shift);
+          const int32 shifted_input2_val = input2_val * (1 << left_shift);
+          const int32 scaled_input1_val =
+              MultiplyByQuantizedMultiplierSmallerThanOne(
+                  shifted_input1_val, input1_multiplier, input1_shift);
+          const int32 scaled_input2_val =
+              MultiplyByQuantizedMultiplierSmallerThanOne(
+                  shifted_input2_val, input2_multiplier, input2_shift);
+          const int32 raw_sub = scaled_input1_val - scaled_input2_val;
+          const int32 raw_output =
+              MultiplyByQuantizedMultiplierSmallerThanOne(
+                  raw_sub, output_multiplier, output_shift) +
+              output_offset;
+          const int32 clamped_output =
+              std::min(output_activation_max,
+                       std::max(output_activation_min, raw_output));
+          output_data[Offset(output_dims, c, x, y, b)] =
+              static_cast<uint8>(clamped_output);
         }
       }
     }
   }
 }
+
 template <FusedActivationFunctionType Ac, typename Scalar>
 void Concatenation(int concat_dim, const Scalar* const* input_data,
                    const Dims<4>* const* input_dims, int inputs_count,
@@ -2310,198 +2823,6 @@ inline void LstmCell(const float* input_data, const Dims<4>& input_dims,
       output_state_map.tanh();
 }
 
-#ifdef GEMMLOWP_NEON
-// In the common case of batch size 1, a fully-connected node degenerates
-// to a matrix*vector product. LSTM cells contain a fully-connected node;
-// when quantized, this becomes a special type of GEMV operation where
-// the output is 16bit-quantized, thus needs its own special path.
-inline void GEMVForLstmCell(const uint8* input_data, const Dims<4>& input_dims,
-                            const uint8* weights_data,
-                            const Dims<4>& weights_dims,
-                            uint8 weights_zero_point, const int32* bias_data,
-                            const Dims<4>& bias_dims, int32 accum_multiplier,
-                            int accum_shift, int16* output_data,
-                            const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("GEMVForLstmCell");
-  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
-  TFLITE_DCHECK(IsPackedWithoutStrides(weights_dims));
-  TFLITE_DCHECK(IsPackedWithoutStrides(bias_dims));
-  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
-  TFLITE_DCHECK_EQ(ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
-                       ArraySize(output_dims, 3),
-                   1);
-  const int input_size = input_dims.strides[3];
-  const int output_size = MatchingArraySize(weights_dims, 1, output_dims, 0);
-  // This special fast path for quantized LSTM cells does not try to support
-  // odd sizes that we haven't encountered in any LSTM cell, that would
-  // require special code (that would go untested until any LSTM cell
-  // exercises it). We just guard our assumptions about size evenness with
-  // the following assertions.
-  TFLITE_DCHECK(!(output_size % 4));
-  TFLITE_DCHECK(!(input_size % 8));
-  const int32* bias_ptr = bias_data;
-  int16* output_ptr = output_data;
-  for (int out = 0; out < output_size; out += 4) {
-    int32x4_t acc_0 = vdupq_n_s32(0);
-    int32x4_t acc_1 = vdupq_n_s32(0);
-    int32x4_t acc_2 = vdupq_n_s32(0);
-    int32x4_t acc_3 = vdupq_n_s32(0);
-    const int16x8_t input_offset_vec = vdupq_n_s16(-128);
-    const int16x8_t weights_offset_vec = vdupq_n_s16(-weights_zero_point);
-    int in = 0;
-    // Handle 16 levels of depth at a time.
-    for (; in <= input_size - 16; in += 16) {
-      const uint8x16_t input_val_u8 = vld1q_u8(input_data + in);
-      const uint8* weights_ptr = weights_data + in + out * input_size;
-      uint8x16_t weights_val_u8_0 = vld1q_u8(weights_ptr + 0 * input_size);
-      uint8x16_t weights_val_u8_1 = vld1q_u8(weights_ptr + 1 * input_size);
-      uint8x16_t weights_val_u8_2 = vld1q_u8(weights_ptr + 2 * input_size);
-      uint8x16_t weights_val_u8_3 = vld1q_u8(weights_ptr + 3 * input_size);
-      int16x8_t input_val_0, input_val_1;
-      const uint8x8_t low = vget_low_u8(input_val_u8);
-      const uint8x8_t high = vget_high_u8(input_val_u8);
-      input_val_0 = vreinterpretq_s16_u16(vmovl_u8(low));
-      input_val_1 = vreinterpretq_s16_u16(vmovl_u8(high));
-      input_val_0 = vaddq_s16(input_val_0, input_offset_vec);
-      input_val_1 = vaddq_s16(input_val_1, input_offset_vec);
-      int16x8_t weights_val_0_0, weights_val_1_0, weights_val_2_0,
-          weights_val_3_0;
-      int16x8_t weights_val_0_1, weights_val_1_1, weights_val_2_1,
-          weights_val_3_1;
-      weights_val_0_0 = vaddq_s16(
-          vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(weights_val_u8_0))),
-          weights_offset_vec);
-      weights_val_0_1 = vaddq_s16(
-          vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(weights_val_u8_0))),
-          weights_offset_vec);
-      weights_val_1_0 = vaddq_s16(
-          vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(weights_val_u8_1))),
-          weights_offset_vec);
-      weights_val_1_1 = vaddq_s16(
-          vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(weights_val_u8_1))),
-          weights_offset_vec);
-      weights_val_2_0 = vaddq_s16(
-          vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(weights_val_u8_2))),
-          weights_offset_vec);
-      weights_val_2_1 = vaddq_s16(
-          vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(weights_val_u8_2))),
-          weights_offset_vec);
-      weights_val_3_0 = vaddq_s16(
-          vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(weights_val_u8_3))),
-          weights_offset_vec);
-      weights_val_3_1 = vaddq_s16(
-          vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(weights_val_u8_3))),
-          weights_offset_vec);
-      acc_0 = vmlal_s16(acc_0, vget_low_s16(weights_val_0_0),
-                        vget_low_s16(input_val_0));
-      acc_1 = vmlal_s16(acc_1, vget_low_s16(weights_val_1_0),
-                        vget_low_s16(input_val_0));
-      acc_2 = vmlal_s16(acc_2, vget_low_s16(weights_val_2_0),
-                        vget_low_s16(input_val_0));
-      acc_3 = vmlal_s16(acc_3, vget_low_s16(weights_val_3_0),
-                        vget_low_s16(input_val_0));
-      acc_0 = vmlal_s16(acc_0, vget_high_s16(weights_val_0_0),
-                        vget_high_s16(input_val_0));
-      acc_1 = vmlal_s16(acc_1, vget_high_s16(weights_val_1_0),
-                        vget_high_s16(input_val_0));
-      acc_2 = vmlal_s16(acc_2, vget_high_s16(weights_val_2_0),
-                        vget_high_s16(input_val_0));
-      acc_3 = vmlal_s16(acc_3, vget_high_s16(weights_val_3_0),
-                        vget_high_s16(input_val_0));
-      acc_0 = vmlal_s16(acc_0, vget_low_s16(weights_val_0_1),
-                        vget_low_s16(input_val_1));
-      acc_1 = vmlal_s16(acc_1, vget_low_s16(weights_val_1_1),
-                        vget_low_s16(input_val_1));
-      acc_2 = vmlal_s16(acc_2, vget_low_s16(weights_val_2_1),
-                        vget_low_s16(input_val_1));
-      acc_3 = vmlal_s16(acc_3, vget_low_s16(weights_val_3_1),
-                        vget_low_s16(input_val_1));
-      acc_0 = vmlal_s16(acc_0, vget_high_s16(weights_val_0_1),
-                        vget_high_s16(input_val_1));
-      acc_1 = vmlal_s16(acc_1, vget_high_s16(weights_val_1_1),
-                        vget_high_s16(input_val_1));
-      acc_2 = vmlal_s16(acc_2, vget_high_s16(weights_val_2_1),
-                        vget_high_s16(input_val_1));
-      acc_3 = vmlal_s16(acc_3, vget_high_s16(weights_val_3_1),
-                        vget_high_s16(input_val_1));
-    }
-    // Handle 8 levels of depth at a time.
-    for (; in < input_size; in += 8) {
-      const uint8x8_t input_val_u8 = vld1_u8(input_data + in);
-      const uint8* weights_ptr = weights_data + in + out * input_size;
-      uint8x8_t weights_val_u8_0 = vld1_u8(weights_ptr + 0 * input_size);
-      uint8x8_t weights_val_u8_1 = vld1_u8(weights_ptr + 1 * input_size);
-      uint8x8_t weights_val_u8_2 = vld1_u8(weights_ptr + 2 * input_size);
-      uint8x8_t weights_val_u8_3 = vld1_u8(weights_ptr + 3 * input_size);
-      int16x8_t input_val;
-      input_val = vreinterpretq_s16_u16(vmovl_u8(input_val_u8));
-      input_val = vaddq_s16(input_val, input_offset_vec);
-      int16x8_t weights_val_0, weights_val_1, weights_val_2, weights_val_3;
-      weights_val_0 =
-          vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(weights_val_u8_0)),
-                    weights_offset_vec);
-      weights_val_1 =
-          vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(weights_val_u8_1)),
-                    weights_offset_vec);
-      weights_val_2 =
-          vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(weights_val_u8_2)),
-                    weights_offset_vec);
-      weights_val_3 =
-          vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(weights_val_u8_3)),
-                    weights_offset_vec);
-      acc_0 = vmlal_s16(acc_0, vget_low_s16(weights_val_0),
-                        vget_low_s16(input_val));
-      acc_1 = vmlal_s16(acc_1, vget_low_s16(weights_val_1),
-                        vget_low_s16(input_val));
-      acc_2 = vmlal_s16(acc_2, vget_low_s16(weights_val_2),
-                        vget_low_s16(input_val));
-      acc_3 = vmlal_s16(acc_3, vget_low_s16(weights_val_3),
-                        vget_low_s16(input_val));
-      acc_0 = vmlal_s16(acc_0, vget_high_s16(weights_val_0),
-                        vget_high_s16(input_val));
-      acc_1 = vmlal_s16(acc_1, vget_high_s16(weights_val_1),
-                        vget_high_s16(input_val));
-      acc_2 = vmlal_s16(acc_2, vget_high_s16(weights_val_2),
-                        vget_high_s16(input_val));
-      acc_3 = vmlal_s16(acc_3, vget_high_s16(weights_val_3),
-                        vget_high_s16(input_val));
-    }
-    // Horizontally reduce accumulators
-    int32x2_t pairwise_reduced_acc_0, pairwise_reduced_acc_1,
-        pairwise_reduced_acc_2, pairwise_reduced_acc_3;
-    pairwise_reduced_acc_0 =
-        vpadd_s32(vget_low_s32(acc_0), vget_high_s32(acc_0));
-    pairwise_reduced_acc_1 =
-        vpadd_s32(vget_low_s32(acc_1), vget_high_s32(acc_1));
-    pairwise_reduced_acc_2 =
-        vpadd_s32(vget_low_s32(acc_2), vget_high_s32(acc_2));
-    pairwise_reduced_acc_3 =
-        vpadd_s32(vget_low_s32(acc_3), vget_high_s32(acc_3));
-    const int32x2_t reduced_lo =
-        vpadd_s32(pairwise_reduced_acc_0, pairwise_reduced_acc_1);
-    const int32x2_t reduced_hi =
-        vpadd_s32(pairwise_reduced_acc_2, pairwise_reduced_acc_3);
-    int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi);
-    // Add bias values.
-    int32x4_t bias_vec = vld1q_s32(bias_ptr);
-    bias_ptr += 4;
-    reduced = vaddq_s32(reduced, bias_vec);
-    int left_shift = accum_shift > 0 ? accum_shift : 0;
-    int right_shift = accum_shift > 0 ? 0 : -accum_shift;
-    reduced = vshlq_s32(reduced, vdupq_n_s32(left_shift));
-    // Multiply by the fixed-point multiplier.
-    reduced = vqrdmulhq_n_s32(reduced, accum_multiplier);
-    // Rounding-shift-right.
-    using gemmlowp::RoundingDivideByPOT;
-    reduced = RoundingDivideByPOT(reduced, right_shift);
-    // Narrow values down to 16 bit signed.
-    const int16x4_t res16 = vqmovn_s32(reduced);
-    vst1_s16(output_ptr, res16);
-    output_ptr += 4;
-  }
-}
-#endif
-
 // Quantized LSTM cell. Currently just a copy of the reference impl in
 // reference_ops.h. See the big function comment there, not replicating it
 // here.
@@ -3771,12 +4092,46 @@ inline void Logistic(const uint8* input_data, const Dims<4>& input_dims,
 inline void Logistic(const int16* input_data, const Dims<4>& input_dims,
                      int16* output_data, const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("Logistic/Int16");
-  // This is a copy of the reference implementation. We do not currently have a
-  // properly optimized version.
   const int flat_size = RequiredBufferSizeForDims(output_dims);
   TFLITE_DCHECK_EQ(RequiredBufferSizeForDims(input_dims), flat_size);
 
   for (int i = 0; i < flat_size; i++) {
+  }
+
+  int c = 0;
+  const int16* input_data_ptr = input_data;
+  int16* output_data_ptr = output_data;
+#ifdef GEMMLOWP_NEON
+  {
+    // F0 uses 0 integer bits, range [-1, 1].
+    // This is the return type of math functions such as tanh, logistic,
+    // whose range is in [-1, 1].
+    using F0 = gemmlowp::FixedPoint<int16x8_t, 0>;
+    // F3 uses 3 integer bits, range [-8, 8], the input range expected here.
+    using F3 = gemmlowp::FixedPoint<int16x8_t, 3>;
+
+    for (; c <= flat_size - 16; c += 16) {
+      F3 input0 = F3::FromRaw(vld1q_s16(input_data_ptr));
+      F3 input1 = F3::FromRaw(vld1q_s16(input_data_ptr + 8));
+      F0 output0 = gemmlowp::logistic(input0);
+      F0 output1 = gemmlowp::logistic(input1);
+      vst1q_s16(output_data_ptr, output0.raw());
+      vst1q_s16(output_data_ptr + 8, output1.raw());
+
+      input_data_ptr += 16;
+      output_data_ptr += 16;
+    }
+    for (; c <= flat_size - 8; c += 8) {
+      F3 input = F3::FromRaw(vld1q_s16(input_data_ptr));
+      F0 output = gemmlowp::logistic(input);
+      vst1q_s16(output_data_ptr, output.raw());
+
+      input_data_ptr += 8;
+      output_data_ptr += 8;
+    }
+  }
+#endif
+  {
     // F0 uses 0 integer bits, range [-1, 1].
     // This is the return type of math functions such as tanh, logistic,
     // whose range is in [-1, 1].
@@ -3784,9 +4139,14 @@ inline void Logistic(const int16* input_data, const Dims<4>& input_dims,
     // F3 uses 3 integer bits, range [-8, 8], the input range expected here.
     using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
 
-    const F3 input = F3::FromRaw(input_data[i]);
-    F0 output = gemmlowp::logistic(input);
-    output_data[i] = output.raw();
+    for (; c < flat_size; ++c) {
+      F3 input = F3::FromRaw(*input_data_ptr);
+      F0 output = gemmlowp::logistic(input);
+      *output_data_ptr = output.raw();
+
+      ++input_data_ptr;
+      ++output_data_ptr;
+    }
   }
 }
 
@@ -3953,9 +4313,6 @@ inline void Tanh(const int16* input_data, const Dims<4>& input_dims,
                  int input_left_shift, int16* output_data,
                  const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("Tanh/Int16");
-  // This is a copy of the reference implementation. We do not currently have a
-  // properly optimized version.
-
   // Support for shifts is limited until we have a parameterized version of
   // SaturatingRoundingMultiplyByPOT().
   TFLITE_DCHECK_GE(input_left_shift, 0);
@@ -3964,25 +4321,91 @@ inline void Tanh(const int16* input_data, const Dims<4>& input_dims,
   const int flat_size = RequiredBufferSizeForDims(output_dims);
   TFLITE_DCHECK_EQ(RequiredBufferSizeForDims(input_dims), flat_size);
 
-  // F0 uses 0 integer bits, range [-1, 1].
-  // This is the return type of math functions such as tanh, logistic,
-  // whose range is in [-1, 1].
-  using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
-  // F3 uses 3 integer bits, range [-8, 8], the input range expected here.
-  using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
-
-  if (input_left_shift == 0) {
-    for (int i = 0; i < flat_size; i++) {
-      F3 input = F3::FromRaw(input_data[i]);
-      F0 output = gemmlowp::tanh(input);
-      output_data[i] = output.raw();
+  int c = 0;
+  const int16* input_data_ptr = input_data;
+  int16* output_data_ptr = output_data;
+#ifdef GEMMLOWP_NEON
+  {
+    // F0 uses 0 integer bits, range [-1, 1].
+    // This is the return type of math functions such as tanh, logistic,
+    // whose range is in [-1, 1].
+    using F0 = gemmlowp::FixedPoint<int16x8_t, 0>;
+    // F3 uses 3 integer bits, range [-8, 8], the input range expected here.
+    using F3 = gemmlowp::FixedPoint<int16x8_t, 3>;
+
+    if (input_left_shift == 0) {
+      for (; c <= flat_size - 16; c += 16) {
+        F3 input0 = F3::FromRaw(vld1q_s16(input_data_ptr));
+        F3 input1 = F3::FromRaw(vld1q_s16(input_data_ptr + 8));
+        F0 output0 = gemmlowp::tanh(input0);
+        F0 output1 = gemmlowp::tanh(input1);
+        vst1q_s16(output_data_ptr, output0.raw());
+        vst1q_s16(output_data_ptr + 8, output1.raw());
+
+        input_data_ptr += 16;
+        output_data_ptr += 16;
+      }
+      for (; c <= flat_size - 8; c += 8) {
+        F3 input = F3::FromRaw(vld1q_s16(input_data_ptr));
+        F0 output = gemmlowp::tanh(input);
+        vst1q_s16(output_data_ptr, output.raw());
+
+        input_data_ptr += 8;
+        output_data_ptr += 8;
+      }
+    } else {
+      for (; c <= flat_size - 16; c += 16) {
+        F3 input0 = F3::FromRaw(gemmlowp::SaturatingRoundingMultiplyByPOT<1>(
+            vld1q_s16(input_data_ptr)));
+        F3 input1 = F3::FromRaw(gemmlowp::SaturatingRoundingMultiplyByPOT<1>(
+            vld1q_s16(input_data_ptr + 8)));
+        F0 output0 = gemmlowp::tanh(input0);
+        F0 output1 = gemmlowp::tanh(input1);
+        vst1q_s16(output_data_ptr, output0.raw());
+        vst1q_s16(output_data_ptr + 8, output1.raw());
+
+        input_data_ptr += 16;
+        output_data_ptr += 16;
+      }
+      for (; c <= flat_size - 8; c += 8) {
+        F3 input = F3::FromRaw(gemmlowp::SaturatingRoundingMultiplyByPOT<1>(
+            vld1q_s16(input_data_ptr)));
+        F0 output = gemmlowp::tanh(input);
+        vst1q_s16(output_data_ptr, output.raw());
+
+        input_data_ptr += 8;
+        output_data_ptr += 8;
+      }
     }
-  } else {
-    for (int i = 0; i < flat_size; i++) {
-      F3 input = F3::FromRaw(
-          gemmlowp::SaturatingRoundingMultiplyByPOT<1>(input_data[i]));
-      F0 output = gemmlowp::tanh(input);
-      output_data[i] = output.raw();
+  }
+#endif
+  {
+    // F0 uses 0 integer bits, range [-1, 1].
+    // This is the return type of math functions such as tanh, logistic,
+    // whose range is in [-1, 1].
+    using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+    // F3 uses 3 integer bits, range [-8, 8], the input range expected here.
+    using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
+
+    if (input_left_shift == 0) {
+      for (; c < flat_size; ++c) {
+        F3 input = F3::FromRaw(*input_data_ptr);
+        F0 output = gemmlowp::tanh(input);
+        *output_data_ptr = output.raw();
+
+        ++input_data_ptr;
+        ++output_data_ptr;
+      }
+    } else {
+      for (; c < flat_size; ++c) {
+        F3 input = F3::FromRaw(
+            gemmlowp::SaturatingRoundingMultiplyByPOT<1>(*input_data_ptr));
+        F0 output = gemmlowp::tanh(input);
+        *output_data_ptr = output.raw();
+
+        ++input_data_ptr;
+        ++output_data_ptr;
+      }
     }
   }
 }
diff --git a/tensorflow/contrib/lite/kernels/internal/quantization_util.h b/tensorflow/contrib/lite/kernels/internal/quantization_util.h
index f7706c793883f012d2b66cf3d3167a59afe31f91..9a04b76e56b2527b06f5b0ec1e75e991fd1cbdea 100644
--- a/tensorflow/contrib/lite/kernels/internal/quantization_util.h
+++ b/tensorflow/contrib/lite/kernels/internal/quantization_util.h
@@ -97,6 +97,71 @@ QuantizationParams ChooseQuantizationParams(double rmin, double rmax) {
   return quantization_params;
 }
 
+// Converts a floating-point number to an integer. For all inputs x where
+// static_cast<IntOut>(x) is legal according to the C++ standard, the result
+// is identical to that cast (i.e. the result is x with its fractional part
+// truncated whenever that is representable as IntOut).
+//
+// static_cast would cause undefined behavior for the following cases, which
+// have well-defined behavior for this function:
+//
+//  1. If x is NaN, the result is zero.
+//
+//  2. If the truncated form of x is above the representable range of IntOut,
+//     the result is std::numeric_limits<IntOut>::max().
+//
+//  3. If the truncated form of x is below the representable range of IntOut,
+//     the result is std::numeric_limits<IntOut>::min().
+//
+// Note that cases #2 and #3 cover infinities as well as finite numbers.
+//
+// The range of FloatIn must include the range of IntOut, otherwise
+// the results are undefined.
+// TODO(sfeuz): Replace by absl::SafeCast once available.
+template <class IntOut, class FloatIn>
+IntOut SafeCast(FloatIn x) {
+  static_assert(!std::numeric_limits<FloatIn>::is_integer,
+                "FloatIn is integer");
+  static_assert(std::numeric_limits<IntOut>::is_integer,
+                "IntOut is not integer");
+  static_assert(std::numeric_limits<IntOut>::radix == 2, "IntOut is base 2");
+
+  // Special case NaN, for which the logic below doesn't work.
+  if (std::isnan(x)) {
+    return 0;
+  }
+
+  // Negative values all clip to zero for unsigned results.
+  if (!std::numeric_limits<IntOut>::is_signed && x < 0) {
+    return 0;
+  }
+
+  // Handle infinities.
+  if (std::isinf(x)) {
+    return x < 0 ? std::numeric_limits<IntOut>::min()
+                 : std::numeric_limits<IntOut>::max();
+  }
+
+  // Set exp such that x == f * 2^exp for some f with |f| in [0.5, 1.0),
+  // unless x is zero in which case exp == 0. Note that this implies that the
+  // magnitude of x is strictly less than 2^exp.
+  int exp = 0;
+  std::frexp(x, &exp);
+
+  // Let N be the number of non-sign bits in the representation of IntOut. If
+  // the magnitude of x is strictly less than 2^N, the truncated version of x
+  // is representable as IntOut. The only representable integer for which this
+  // is not the case is kMin for signed types (i.e. -2^N), but that is covered
+  // by the fall-through below.
+  if (exp <= std::numeric_limits<IntOut>::digits) {
+    return x;
+  }
+
+  // Handle numbers with magnitude >= 2^N.
+  return x < 0 ? std::numeric_limits<IntOut>::min()
+               : std::numeric_limits<IntOut>::max();
+}
+
 // Decompose a double multiplier into a Q0.31 int32 representation of its
 // significand, and shift representation of NEGATIVE its exponent ---
 // this is intended as a RIGHT-shift.
@@ -135,8 +200,8 @@ void PreprocessSoftmaxScaling(double beta, double input_scale,
 // Calculate the largest input that will result in a within-bounds intermediate
 // result within MultiplyByQuantizedMultiplierGreaterThanOne.  In other words,
 // it must not overflow before we reduce the value by multiplication by the
-// input multiplier.  The negative radius is used as the minimum difference
-// in Softmax.
+// input multiplier.  The negative radius is used as the minimum difference in
+// Softmax.
 int CalculateInputRadius(int input_integer_bits, int input_left_shift);
 
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc b/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc
index 4ae2085c30ed08790eaff27c7921909d47687707..3e9a3c29ee26e96612bb05eb9cd1e1badad10c7a 100644
--- a/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc
+++ b/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc
@@ -22,6 +22,132 @@ namespace {
 
 using ::testing::Pair;
 
+template <class FloatIn, class IntOut>
+void RunSafeCastTests() {
+  const IntOut imax = std::numeric_limits<IntOut>::max();
+  EXPECT_GT(imax, 0);
+  const IntOut imin = std::numeric_limits<IntOut>::min();
+  const bool s = std::numeric_limits<IntOut>::is_signed;
+  if (s) {
+    EXPECT_LT(imin, 0);
+  } else {
+    EXPECT_EQ(0, imin);
+  }
+
+  // Some basic tests.
+  EXPECT_EQ(SafeCast<IntOut>(static_cast<FloatIn>(0.0)), 0);
+  EXPECT_EQ(SafeCast<IntOut>(static_cast<FloatIn>(-0.0)), 0);
+  EXPECT_EQ(SafeCast<IntOut>(static_cast<FloatIn>(0.99)), 0);
+  EXPECT_EQ(SafeCast<IntOut>(static_cast<FloatIn>(1.0)), 1);
+  EXPECT_EQ(SafeCast<IntOut>(static_cast<FloatIn>(1.01)), 1);
+  EXPECT_EQ(SafeCast<IntOut>(static_cast<FloatIn>(1.99)), 1);
+  EXPECT_EQ(SafeCast<IntOut>(static_cast<FloatIn>(2.0)), 2);
+  EXPECT_EQ(SafeCast<IntOut>(static_cast<FloatIn>(2.01)), 2);
+  EXPECT_EQ(SafeCast<IntOut>(static_cast<FloatIn>(-0.99)), 0);
+  EXPECT_EQ(SafeCast<IntOut>(static_cast<FloatIn>(-1.0)), s ? -1 : 0);
+  EXPECT_EQ(SafeCast<IntOut>(static_cast<FloatIn>(-1.01)), s ? -1 : 0);
+  EXPECT_EQ(SafeCast<IntOut>(static_cast<FloatIn>(-1.99)), s ? -1 : 0);
+  EXPECT_EQ(SafeCast<IntOut>(static_cast<FloatIn>(-2.0)), s ? -2 : 0);
+  EXPECT_EQ(SafeCast<IntOut>(static_cast<FloatIn>(-2.01)), s ? -2 : 0);
+  EXPECT_EQ(SafeCast<IntOut>(static_cast<FloatIn>(117.9)), 117);
+  EXPECT_EQ(SafeCast<IntOut>(static_cast<FloatIn>(118.0)), 118);
+  EXPECT_EQ(SafeCast<IntOut>(static_cast<FloatIn>(118.1)), 118);
+  EXPECT_EQ(SafeCast<IntOut>(static_cast<FloatIn>(-117.9)), s ? -117 : 0);
+  EXPECT_EQ(SafeCast<IntOut>(static_cast<FloatIn>(-118.0)), s ? -118 : 0);
+  EXPECT_EQ(SafeCast<IntOut>(static_cast<FloatIn>(-118.1)), s ? -118 : 0);
+
+  // Some edge cases.
+  EXPECT_EQ(SafeCast<IntOut>(std::numeric_limits<FloatIn>::max()), imax);
+  EXPECT_EQ(SafeCast<IntOut>(std::numeric_limits<FloatIn>::lowest()), imin);
+  EXPECT_EQ(SafeCast<IntOut>(std::numeric_limits<FloatIn>::infinity()), imax);
+  EXPECT_EQ(SafeCast<IntOut>(-std::numeric_limits<FloatIn>::infinity()), imin);
+  EXPECT_EQ(SafeCast<IntOut>(std::numeric_limits<FloatIn>::quiet_NaN()), 0);
+
+  // Some larger numbers.
+  if (sizeof(IntOut) >= 4 && sizeof(FloatIn) > 4) {
+    EXPECT_EQ(SafeCast<IntOut>(static_cast<FloatIn>(0x76543210)), 0x76543210);
+  }
+
+  if (sizeof(FloatIn) > sizeof(IntOut)) {
+    // Check values near imax.
+    EXPECT_EQ(SafeCast<IntOut>(
+                  static_cast<FloatIn>(static_cast<FloatIn>(imax) + 0.1)),
+              imax);
+    EXPECT_EQ(SafeCast<IntOut>(
+                  static_cast<FloatIn>(static_cast<FloatIn>(imax) + 0.99)),
+              imax);
+    EXPECT_EQ(SafeCast<IntOut>(
+                  static_cast<FloatIn>(static_cast<FloatIn>(imax) + 1.0)),
+              imax);
+    EXPECT_EQ(SafeCast<IntOut>(
+                  static_cast<FloatIn>(static_cast<FloatIn>(imax) + 1.99)),
+              imax);
+    EXPECT_EQ(SafeCast<IntOut>(
+                  static_cast<FloatIn>(static_cast<FloatIn>(imax) + 2.0)),
+              imax);
+    EXPECT_EQ(SafeCast<IntOut>(
+                  static_cast<FloatIn>(static_cast<FloatIn>(imax) - 0.1)),
+              imax - 1);
+    EXPECT_EQ(SafeCast<IntOut>(
+                  static_cast<FloatIn>(static_cast<FloatIn>(imax) - 0.99)),
+              imax - 1);
+    EXPECT_EQ(SafeCast<IntOut>(
+                  static_cast<FloatIn>(static_cast<FloatIn>(imax) - 1.0)),
+              imax - 1);
+    EXPECT_EQ(SafeCast<IntOut>(
+                  static_cast<FloatIn>(static_cast<FloatIn>(imax) - 1.01)),
+              imax - 2);
+    EXPECT_EQ(SafeCast<IntOut>(
+                  static_cast<FloatIn>(static_cast<FloatIn>(imax) - 1.99)),
+              imax - 2);
+    EXPECT_EQ(SafeCast<IntOut>(
+                  static_cast<FloatIn>(static_cast<FloatIn>(imax) - 2.0)),
+              imax - 2);
+    EXPECT_EQ(SafeCast<IntOut>(
+                  static_cast<FloatIn>(static_cast<FloatIn>(imax) - 2.01)),
+              imax - 3);
+  }
+
+  // Check values considerably larger in magnitude than imin and imax
+  EXPECT_EQ(
+      SafeCast<IntOut>(static_cast<FloatIn>(static_cast<FloatIn>(imax) * 2)),
+      imax);
+  EXPECT_EQ(
+      SafeCast<IntOut>(static_cast<FloatIn>(static_cast<FloatIn>(imax) * 20)),
+      imax);
+  EXPECT_EQ(
+      SafeCast<IntOut>(static_cast<FloatIn>(static_cast<FloatIn>(imax) * 100)),
+      imax);
+  EXPECT_EQ(
+      SafeCast<IntOut>(static_cast<FloatIn>(static_cast<FloatIn>(imin) * 2)),
+      imin);
+  EXPECT_EQ(
+      SafeCast<IntOut>(static_cast<FloatIn>(static_cast<FloatIn>(imin) * 20)),
+      imin);
+  EXPECT_EQ(
+      SafeCast<IntOut>(static_cast<FloatIn>(static_cast<FloatIn>(imin) * 100)),
+      imin);
+}
+
+TEST(QuantizationUtilTest, SafeCast) {
+  RunSafeCastTests<float, int8>();
+  RunSafeCastTests<double, int8>();
+  RunSafeCastTests<float, int16>();
+  RunSafeCastTests<double, int16>();
+  RunSafeCastTests<float, int32>();
+  RunSafeCastTests<double, int32>();
+  RunSafeCastTests<float, int64>();
+  RunSafeCastTests<double, int64>();
+  RunSafeCastTests<float, uint8>();
+  RunSafeCastTests<double, uint8>();
+  RunSafeCastTests<float, uint16>();
+  RunSafeCastTests<double, uint16>();
+  RunSafeCastTests<float, uint32>();
+  RunSafeCastTests<double, uint32>();
+  RunSafeCastTests<float, uint64>();
+  RunSafeCastTests<double, uint64>();
+}
+
 // Example taken from http://www.tensorflow.org/performance/quantization
 //
 //  Quantized | Float
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index d3d15edf4c0c904697817cc7c191f723bd52b7e2..3575974ae9362e53592c50c8d4a0675bee8f8034 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -404,6 +404,7 @@ inline void DepthToSpace(const T* input_data, const Dims<4>& input_dims,
           const int in_d =
               out_d + ((out_h % block_size) * block_size + out_w % block_size) *
                           output_depth;
+
           const int in_w = out_w / block_size;
           const int in_h = out_h / block_size;
           const int in_b = out_b;
@@ -956,8 +957,18 @@ template <FusedActivationFunctionType Ac>
 inline void Add(const int16* input1_data, const Dims<4>& input1_dims,
                 int input1_shift, const int16* input2_data,
                 const Dims<4>& input2_dims, int input2_shift,
+                int16 output_activation_min, int16 output_activation_max,
                 int16* output_data, const Dims<4>& output_dims) {
-  static_assert(Ac == FusedActivationFunctionType::kNone, "");
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  if (Ac == FusedActivationFunctionType::kNone) {
+    TFLITE_DCHECK_EQ(output_activation_min, -32768);
+    TFLITE_DCHECK_EQ(output_activation_max, 32767);
+  }
 
   const int flat_size = RequiredBufferSizeForDims(output_dims);
   TFLITE_DCHECK_EQ(RequiredBufferSizeForDims(input1_dims), flat_size);
@@ -978,7 +989,10 @@ inline void Add(const int16* input1_data, const Dims<4>& input1_dims,
     F0 scaled_input =
         F0::FromRaw(gemmlowp::RoundingDivideByPOT(shift_input[i], input_shift));
     F0 result = gemmlowp::SaturatingAdd(scaled_input, input_ready_scaled);
-    output_data[i] = result.raw();
+    const int16 raw_output = result.raw();
+    const int16 clamped_output = std::min(
+        output_activation_max, std::max(output_activation_min, raw_output));
+    output_data[i] = clamped_output;
   }
 }
 
@@ -1325,6 +1339,47 @@ inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
                output_data, output_dims);
 }
 
+// TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary
+// dimensionality if the runtime code does a single loop over one dimension
+// that handles broadcasting as the base case. The code generator would then
+// generate max(D1, D2) nested for loops.
+template <typename T>
+void BroadcastDiv(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T output_activation_min, T output_activation_max,
+                  T* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastDiv");
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest
+  // stride, typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for
+  // the best cache behavior.
+  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] /
+                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
+                  output_activation_min, output_activation_max);
+        }
+      }
+    }
+  }
+}
+
 inline void Div(const float* input1_data, const Dims<4>& input1_dims,
                 const float* input2_data, const Dims<4>& input2_dims,
                 float output_activation_min, float output_activation_max,
@@ -1379,6 +1434,106 @@ inline void Sub(const float* input1_data, const Dims<4>& input1_dims,
   }
 }
 
+// TODO(jiawen): We can implement BroadcastSub on buffers of arbitrary
+// dimensionality if the runtime code does a single loop over one dimension
+// that handles broadcasting as the base case. The code generator would then
+// generate max(D1, D2) nested for loops.
+template <typename T>
+void BroadcastSub(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T output_activation_min, T output_activation_max,
+                  T* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastSub");
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] -
+                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
+                  output_activation_min, output_activation_max);
+        }
+      }
+    }
+  }
+}
+
+inline void BroadcastSub(int left_shift, const uint8* input1_data,
+                         const Dims<4>& input1_dims, int32 input1_offset,
+                         int32 input1_multiplier, int input1_shift,
+                         const uint8* input2_data, const Dims<4>& input2_dims,
+                         int32 input2_offset, int32 input2_multiplier,
+                         int input2_shift, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastSub/8bit");
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          const int32 input1_val =
+              input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
+          const int32 input2_val =
+              input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
+          const int32 shifted_input1_val = input1_val * (1 << left_shift);
+          const int32 shifted_input2_val = input2_val * (1 << left_shift);
+          const int32 scaled_input1_val =
+              MultiplyByQuantizedMultiplierSmallerThanOne(
+                  shifted_input1_val, input1_multiplier, input1_shift);
+          const int32 scaled_input2_val =
+              MultiplyByQuantizedMultiplierSmallerThanOne(
+                  shifted_input2_val, input2_multiplier, input2_shift);
+          const int32 raw_sub = scaled_input1_val - scaled_input2_val;
+          const int32 raw_output =
+              MultiplyByQuantizedMultiplierSmallerThanOne(
+                  raw_sub, output_multiplier, output_shift) +
+              output_offset;
+          const int32 clamped_output =
+              std::min(output_activation_max,
+                       std::max(output_activation_min, raw_output));
+          output_data[Offset(output_dims, c, x, y, b)] =
+              static_cast<uint8>(clamped_output);
+        }
+      }
+    }
+  }
+}
+
 template <FusedActivationFunctionType Ac, typename Scalar>
 void Concatenation(int concat_dim, const Scalar* const* input_data,
                    const Dims<4>* const* input_dims, int inputs_count,
@@ -3029,19 +3184,20 @@ inline void Exp(const T* input_data, const size_t num_elements,
   }
 }
 
-template <typename T>
-inline void Mean(T* input_data, const int* input_dims, const int input_num_dims,
+template <typename T, typename U>
+inline bool Mean(T* input_data, const int* input_dims, const int input_num_dims,
                  T* output_data, const int* output_dims,
                  const int output_num_dims, const int* axis,
                  const int num_axis_dimensions, bool keep_dims, int* temp_index,
-                 int* resolved_axis) {
+                 int* resolved_axis, U* temp_sum) {
   // resets output data.
   size_t num_outputs = 1;
   for (int idx = 0; idx < output_num_dims; ++idx) {
     num_outputs *= static_cast<size_t>(output_dims[idx]);
   }
   for (size_t idx = 0; idx < num_outputs; ++idx) {
-    output_data[idx] = 0;
+    output_data[idx] = T();
+    temp_sum[idx] = U();
   }
   // resets temp index.
   for (int idx = 0; idx < input_num_dims; ++idx) {
@@ -3074,19 +3230,24 @@ inline void Mean(T* input_data, const int* input_dims, const int input_num_dims,
     size_t output_offset =
         ReducedOutputOffset(input_num_dims, input_dims, temp_index,
                             num_resolved_axis, resolved_axis);
-    output_data[output_offset] += input_data[input_offset];
+    temp_sum[output_offset] += static_cast<U>(input_data[input_offset]);
   }
   // takes average by num of elements added to get mean.
   size_t num_elements_in_axis = 1;
   for (int idx = 0; idx < num_resolved_axis; ++idx) {
-    num_elements_in_axis *= static_cast<size_t>(input_dims[resolved_axis[idx]]);
+    size_t current = static_cast<size_t>(input_dims[resolved_axis[idx]]);
+    if (current > (std::numeric_limits<U>::max() / num_elements_in_axis)) {
+      return false;
+    }
+    num_elements_in_axis *= current;
   }
   if (num_elements_in_axis > 0) {
     for (size_t idx = 0; idx < num_outputs; ++idx) {
-      output_data[idx] = static_cast<T>(static_cast<float>(output_data[idx]) /
-                                        num_elements_in_axis);
+      output_data[idx] =
+          static_cast<T>(temp_sum[idx] / static_cast<U>(num_elements_in_axis));
     }
   }
+  return true;
 }
 
 template <typename T>
@@ -3203,6 +3364,30 @@ void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims,
   }
 }
 
+template <typename T>
+void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims,
+                       const T* input2_data, const Dims<4>& input2_dims,
+                       T* output_data, const Dims<4>& output_dims) {
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          auto out_idx = Offset(output_dims, c, x, y, b);
+          auto in1_idx = SubscriptToIndex(desc1, c, x, y, b);
+          auto in2_idx = SubscriptToIndex(desc2, c, x, y, b);
+          auto in1_val = input1_data[in1_idx];
+          auto in2_val = input2_data[in2_idx];
+          output_data[out_idx] = in1_val > in2_val ? in1_val : in2_val;
+        }
+      }
+    }
+  }
+}
+
 template <typename T1, typename T2, typename T3>
 void ArgMax(const T3* axis, const T1* input_data, const Dims<4>& input_dims,
             T2* output_data, const Dims<4>& output_dims) {
diff --git a/tensorflow/contrib/lite/kernels/internal/spectrogram.cc b/tensorflow/contrib/lite/kernels/internal/spectrogram.cc
index b366bdc9aebf3784248a39225a442475a4f0d550..4eddf7bf0a2cbca695dae20ba8ba56a9cd72e4ba 100644
--- a/tensorflow/contrib/lite/kernels/internal/spectrogram.cc
+++ b/tensorflow/contrib/lite/kernels/internal/spectrogram.cc
@@ -55,7 +55,6 @@ inline int Log2Floor(uint n) {
       log += shift;
     }
   }
-  assert(value == 1);
   return log;
 }
 
diff --git a/tensorflow/contrib/lite/kernels/kernel_util.h b/tensorflow/contrib/lite/kernels/kernel_util.h
index 21da1daff7783bc98bc36814f1f9605a83eb65cf..2f407b5da31594335dba31b3057737e67a974057 100644
--- a/tensorflow/contrib/lite/kernels/kernel_util.h
+++ b/tensorflow/contrib/lite/kernels/kernel_util.h
@@ -53,13 +53,13 @@ inline TfLiteTensor* GetOptionalInputTensor(TfLiteContext* context,
 }
 
 // Determines whether tensor is constant.
-inline bool IsConstantTensor(TfLiteTensor* tensor) {
+inline bool IsConstantTensor(const TfLiteTensor* tensor) {
   return tensor->allocation_type == kTfLiteMmapRo;
 }
 
 // Determines whether tensor is dynamic. Note that a tensor can be non-const and
 // not dynamic. This function specifically checks for a dynamic tensor.
-inline bool IsDynamicTensor(TfLiteTensor* tensor) {
+inline bool IsDynamicTensor(const TfLiteTensor* tensor) {
   return tensor->allocation_type == kTfLiteDynamic;
 }
 
diff --git a/tensorflow/contrib/lite/kernels/l2norm.cc b/tensorflow/contrib/lite/kernels/l2norm.cc
index ee8bfe56d95e9f383ef49b40b8f58b63d61da3e1..e67f4e06f3680f8c9447a9e831b63415994ea176 100644
--- a/tensorflow/contrib/lite/kernels/l2norm.cc
+++ b/tensorflow/contrib/lite/kernels/l2norm.cc
@@ -45,10 +45,15 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
   TF_LITE_ENSURE(context, NumDimensions(input) <= 4);
 
-  // TODO(ahentz): Our current implementations only support float32.
-  TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32);
+  TF_LITE_ENSURE(
+      context, output->type == kTfLiteFloat32 || output->type == kTfLiteUInt8);
   TF_LITE_ENSURE_EQ(context, input->type, output->type);
 
+  if (output->type == kTfLiteUInt8) {
+    TF_LITE_ENSURE_EQ(context, output->params.scale, (1. / 128.));
+    TF_LITE_ENSURE_EQ(context, output->params.zero_point, 128);
+  }
+
   // TODO(ahentz): For some reason our implementations don't support
   // activations.
   TF_LITE_ENSURE_EQ(context, params->activation, kTfLiteActNone);
@@ -75,6 +80,19 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       TF_LITE_L2NORM(optimized_ops);
     }
 #undef TF_LITE_L2NORM
+  } else if (output->type == kTfLiteUInt8) {
+#define TF_LITE_L2NORM(type)                                               \
+  type::L2Normalization(GetTensorData<uint8>(input), GetTensorDims(input), \
+                        input->params.zero_point,                          \
+                        GetTensorData<uint8>(output), GetTensorDims(output))
+
+    if (kernel_type == kReference) {
+      TF_LITE_L2NORM(reference_ops);
+    }
+    if (kernel_type == kGenericOptimized) {
+      TF_LITE_L2NORM(optimized_ops);
+    }
+#undef TF_LITE_L2NORM
   } else {
     context->ReportError(context, "Inputs and outputs not all float types.");
     return kTfLiteError;
diff --git a/tensorflow/contrib/lite/kernels/l2norm_test.cc b/tensorflow/contrib/lite/kernels/l2norm_test.cc
index 30e103f3303484c339ef98e6a68e0438291c102f..042314ccf55cb6de12c743448fbe040f35e7baab 100644
--- a/tensorflow/contrib/lite/kernels/l2norm_test.cc
+++ b/tensorflow/contrib/lite/kernels/l2norm_test.cc
@@ -25,10 +25,22 @@ using ::testing::ElementsAreArray;
 
 class L2NormOpModel : public SingleOpModel {
  public:
-  L2NormOpModel(std::initializer_list<int> input_shape,
-                ActivationFunctionType activation_type) {
-    input_ = AddInput(TensorType_FLOAT32);
-    output_ = AddOutput(TensorType_FLOAT32);
+  L2NormOpModel(const std::initializer_list<int> input_shape,
+                const TensorType tensor_type,
+                const ActivationFunctionType activation_type) {
+    TensorData data = TensorData{tensor_type};
+    if (tensor_type != TensorType_FLOAT32) {
+      data.min = -2.0;
+      data.max = 2.0;
+      data.scale = 2.0;
+      data.zero_point = 128;
+    }
+    input_ = AddInput(data);
+    if (tensor_type != TensorType_FLOAT32) {
+      data.min = -1.0;
+      data.max = 127.0 / 128.0;
+    }
+    output_ = AddOutput(data);
     SetBuiltinOp(BuiltinOperator_L2_NORMALIZATION, BuiltinOptions_L2NormOptions,
                  CreateL2NormOptions(builder_, activation_type).Union());
     BuildInterpreter({input_shape});
@@ -38,7 +50,17 @@ class L2NormOpModel : public SingleOpModel {
     PopulateTensor(input_, data);
   }
 
-  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+  template <typename T>
+  std::vector<T> GetOutput() {
+    return ExtractVector<T>(output_);
+  }
+
+  std::vector<float> GetDequantizedOutput() {
+    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+                               GetScale(output_), GetZeroPoint(output_));
+  }
+
+  int input() const { return input_; }
 
  private:
   int input_;
@@ -46,13 +68,26 @@ class L2NormOpModel : public SingleOpModel {
 };
 
 TEST(L2NormOpTest, SimpleTest) {
-  L2NormOpModel m({1, 1, 1, 6}, ActivationFunctionType_NONE);
+  L2NormOpModel m({1, 1, 1, 6}, TensorType_FLOAT32,
+                  ActivationFunctionType_NONE);
   m.SetInput({-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
   m.Invoke();
-  EXPECT_THAT(m.GetOutput(),
+  EXPECT_THAT(m.GetOutput<float>(),
               ElementsAreArray({-0.55, 0.3, 0.35, 0.6, -0.35, 0.05}));
 }
 
+TEST(L2NormOpTest, SimpleUint8Test) {
+  L2NormOpModel m({1, 1, 1, 6}, TensorType_UINT8, ActivationFunctionType_NONE);
+
+  m.QuantizeAndPopulate<uint8_t>(m.input(), {-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<uint8_t>(),
+              ElementsAreArray({58, 166, 173, 205, 83, 134}));
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(
+                  ArrayFloatNear({-0.55, 0.3, 0.35, 0.6, -0.35, 0.05}, 0.1)));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/maximum.cc b/tensorflow/contrib/lite/kernels/maximum.cc
new file mode 100644
index 0000000000000000000000000000000000000000..13c40603ced6338086809b908539156e2c0985e7
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/maximum.cc
@@ -0,0 +1,118 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string.h>
+#include <vector>
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace maximum {
+
+// This file has a reference implemenation of TFMaximum.
+enum KernelType {
+  kReference,
+};
+
+constexpr int kInputTensor1 = 0;
+constexpr int kInputTensor2 = 1;
+constexpr int kOutputTensor = 0;
+
+struct MaximumContext {
+  MaximumContext(TfLiteContext* context, TfLiteNode* node) {
+    input1 = GetInput(context, node, kInputTensor1);
+    input2 = GetInput(context, node, kInputTensor2);
+    output = GetOutput(context, node, kOutputTensor);
+  }
+  TfLiteTensor* input1;
+  TfLiteTensor* input2;
+  TfLiteTensor* output;
+};
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  MaximumContext op_context(context, node);
+  TF_LITE_ENSURE_EQ(context, op_context.input1->type, op_context.input2->type);
+  op_context.output->type = op_context.input1->type;
+
+  bool requires_broadcast =
+      !HaveSameShapes(op_context.input1, op_context.input2);
+
+  TfLiteIntArray* output_size = nullptr;
+  if (requires_broadcast) {
+    TF_LITE_ENSURE_OK(
+        context, CalculateShapeForBroadcast(context, op_context.input1,
+                                            op_context.input2, &output_size));
+  } else {
+    output_size = TfLiteIntArrayCopy(op_context.input1->dims);
+  }
+
+  return context->ResizeTensor(context, op_context.output, output_size);
+}
+
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  MaximumContext op_context(context, node);
+
+#define TF_LITE_MAXIMUM(kernel_type, data_type)    \
+  kernel_type::TensorFlowMaximum<data_type>(       \
+      GetTensorData<data_type>(op_context.input1), \
+      GetTensorDims(op_context.input1),            \
+      GetTensorData<data_type>(op_context.input2), \
+      GetTensorDims(op_context.input2),            \
+      GetTensorData<data_type>(op_context.output), \
+      GetTensorDims(op_context.output))
+
+  if (kernel_type == kReference) {
+    switch (op_context.output->type) {
+      case kTfLiteFloat32:
+        TF_LITE_MAXIMUM(reference_ops, float);
+        break;
+      default:
+        context->ReportError(context,
+                             "Type %d is currently not supported by Maximum.",
+                             op_context.output->type);
+        return kTfLiteError;
+    }
+  } else {
+    context->ReportError(context,
+                         "Type %d is currently not supported by Maximum.",
+                         op_context.output->type);
+    return kTfLiteError;
+  }
+#undef TF_LITE_MAXIMUM
+  return kTfLiteOk;
+}
+
+}  // namespace maximum
+
+TfLiteRegistration* Register_MAXIMUM_REF() {
+  static TfLiteRegistration r = {nullptr, nullptr, maximum::Prepare,
+                                 maximum::Eval<maximum::kReference>};
+  return &r;
+}
+
+TfLiteRegistration* Register_MAXIMUM() { return Register_MAXIMUM_REF(); }
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/maximum_test.cc b/tensorflow/contrib/lite/kernels/maximum_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..df2bf29c205e0a3ff6ea5df2bba8ca721a09e626
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/maximum_test.cc
@@ -0,0 +1,95 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class MaximumOpModel : public SingleOpModel {
+ public:
+  MaximumOpModel(const TensorData& input1, const TensorData& input2,
+                 const TensorType& output) {
+    input1_ = AddInput(input1);
+    input2_ = AddInput(input2);
+    output_ = AddOutput(output);
+    SetBuiltinOp(BuiltinOperator_MAXIMUM, BuiltinOptions_MaximumOptions,
+                 CreateMaximumOptions(builder_).Union());
+    BuildInterpreter({GetShape(input1_), GetShape(input2_)});
+  }
+
+  template <class T>
+  void SetInput1(std::initializer_list<T> data) {
+    PopulateTensor(input1_, data);
+  }
+
+  template <class T>
+  void SetInput2(std::initializer_list<T> data) {
+    PopulateTensor(input2_, data);
+  }
+
+  template <class T>
+  std::vector<T> GetOutput() {
+    return ExtractVector<T>(output_);
+  }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ protected:
+  int input1_;
+  int input2_;
+  int output_;
+};
+
+TEST(MaximumOpTest, FloatTest) {
+  std::initializer_list<float> data1 = {1.0, 0.0, -1.0, 11.0, -2.0, -1.44};
+  std::initializer_list<float> data2 = {-1.0, 0.0, 1.0, 12.0, -3.0, -1.43};
+  MaximumOpModel m({TensorType_FLOAT32, {3, 1, 2}},
+                   {TensorType_FLOAT32, {3, 1, 2}}, TensorType_FLOAT32);
+  m.SetInput1<float>(data1);
+  m.SetInput2<float>(data2);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1, 2}));
+  EXPECT_THAT(
+      m.GetOutput<float>(),
+      ElementsAreArray(ArrayFloatNear({1.0, 0.0, 1.0, 12.0, -2.0, -1.43})));
+}
+
+TEST(MaximumOpTest, FloatWithBroadcastTest) {
+  std::initializer_list<float> data1 = {1.0, 0.0, -1.0, -2.0, -1.44, 11.0};
+  std::initializer_list<float> data2 = {0.5, 2.0};
+  MaximumOpModel m({TensorType_FLOAT32, {3, 1, 2}}, {TensorType_FLOAT32, {2}},
+                   TensorType_FLOAT32);
+  m.SetInput1<float>(data1);
+  m.SetInput2<float>(data2);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1, 2}));
+  EXPECT_THAT(
+      m.GetOutput<float>(),
+      ElementsAreArray(ArrayFloatNear({1.0, 2.0, 0.5, 2.0, 0.5, 11.0})));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/mean.cc b/tensorflow/contrib/lite/kernels/mean.cc
index aff19581ea56f94c08638b7b388ae181f566cf4f..047bdd1039b993783ce8f2d69d83864e609c13fd 100644
--- a/tensorflow/contrib/lite/kernels/mean.cc
+++ b/tensorflow/contrib/lite/kernels/mean.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include <vector>
 #include "tensorflow/contrib/lite/builtin_op_data.h"
 #include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
 #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
 #include "tensorflow/contrib/lite/kernels/internal/tensor.h"
 #include "tensorflow/contrib/lite/kernels/kernel_util.h"
@@ -48,7 +49,7 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) {
   // Creates two temp tensors to store index and axis for internal
   // implementation only.
   auto* scratch_tensor_index = new int;
-  context->AddTensors(context, 2, scratch_tensor_index);
+  context->AddTensors(context, 3, scratch_tensor_index);
   return scratch_tensor_index;
 }
 
@@ -64,6 +65,14 @@ TfLiteStatus ResizeTempAxis(TfLiteContext* context, MeanContext* op_context,
   return context->ResizeTensor(context, resolved_axis, axis_size);
 }
 
+// Resizes the temp tensor that stores temp sum of reduced elements.
+TfLiteStatus ResizeTempSum(TfLiteContext* context, MeanContext* op_context,
+                           TfLiteTensor* temp_sum) {
+  TfLiteIntArray* size = TfLiteIntArrayCreate(1);
+  size->data[0] = static_cast<int>(NumElements(op_context->output));
+  return context->ResizeTensor(context, temp_sum, size);
+}
+
 // Resizes output array based on the input size and resolved axis.
 TfLiteStatus ResizeOutputTensor(TfLiteContext* context,
                                 MeanContext* op_context) {
@@ -135,7 +144,7 @@ TfLiteStatus InitializeTemporaries(TfLiteContext* context, TfLiteNode* node,
   // Creates a temp index to iterate through input data.
   int* scratch_tensor_index = reinterpret_cast<int*>(node->user_data);
   TfLiteIntArrayFree(node->temporaries);
-  node->temporaries = TfLiteIntArrayCreate(2);
+  node->temporaries = TfLiteIntArrayCreate(3);
   node->temporaries->data[0] = *scratch_tensor_index;
   TfLiteTensor* scratch_tensor = &context->tensors[node->temporaries->data[0]];
   scratch_tensor->type = kTfLiteInt32;
@@ -149,6 +158,25 @@ TfLiteStatus InitializeTemporaries(TfLiteContext* context, TfLiteNode* node,
   node->temporaries->data[1] = *scratch_tensor_index + 1;
   TfLiteTensor* resolved_axis = &context->tensors[node->temporaries->data[1]];
   resolved_axis->type = kTfLiteInt32;
+  // Creates a temp tensor to store temp sums when calculating mean.
+  node->temporaries->data[2] = *scratch_tensor_index + 2;
+  TfLiteTensor* temp_sum = &context->tensors[node->temporaries->data[2]];
+  switch (op_context->input->type) {
+    case kTfLiteFloat32:
+      temp_sum->type = kTfLiteFloat32;
+      break;
+    case kTfLiteInt32:
+      temp_sum->type = kTfLiteInt64;
+      break;
+    case kTfLiteInt64:
+      temp_sum->type = kTfLiteInt64;
+      break;
+    case kTfLiteUInt8:
+      temp_sum->type = kTfLiteInt32;
+      break;
+    default:
+      return kTfLiteError;
+  }
   return kTfLiteOk;
 }
 
@@ -160,16 +188,20 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_OK(context, InitializeTemporaries(context, node, &op_context));
 
   TfLiteTensor* resolved_axis = &context->tensors[node->temporaries->data[1]];
+  TfLiteTensor* temp_sum = &context->tensors[node->temporaries->data[2]];
   // Leaves work to Eval if axis is not constant; else resizes output.
   if (!IsConstantTensor(op_context.axis)) {
     SetTensorToDynamic(op_context.output);
     SetTensorToDynamic(resolved_axis);
+    SetTensorToDynamic(temp_sum);
     return kTfLiteOk;
   }
   resolved_axis->allocation_type = kTfLiteArenaRw;
   TF_LITE_ENSURE_OK(context,
                     ResizeTempAxis(context, &op_context, resolved_axis));
-  return ResizeOutputTensor(context, &op_context);
+  TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context));
+  temp_sum->allocation_type = kTfLiteArenaRw;
+  return ResizeTempSum(context, &op_context, temp_sum);
 }
 
 template <KernelType kernel_type>
@@ -178,14 +210,16 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   int num_axis = static_cast<int>(NumElements(op_context.axis));
   TfLiteTensor* temp_index = &context->tensors[node->temporaries->data[0]];
   TfLiteTensor* resolved_axis = &context->tensors[node->temporaries->data[1]];
+  TfLiteTensor* temp_sum = &context->tensors[node->temporaries->data[2]];
   // Resize the output tensor if the output tensor is dynamic.
   if (IsDynamicTensor(op_context.output)) {
     TF_LITE_ENSURE_OK(context,
                       ResizeTempAxis(context, &op_context, resolved_axis));
     TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context));
+    TF_LITE_ENSURE_OK(context, ResizeTempSum(context, &op_context, temp_sum));
   }
 
-#define TF_LITE_MEAN(kernel_type, data_type)                        \
+#define TF_LITE_MEAN(kernel_type, data_type, temp_data_type)        \
   kernel_type::Mean<>(                                              \
       GetTensorData<data_type>(op_context.input),                   \
       op_context.input->dims->data, op_context.input->dims->size,   \
@@ -193,21 +227,26 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       op_context.output->dims->data, op_context.output->dims->size, \
       GetTensorData<int>(op_context.axis), num_axis,                \
       op_context.params->keep_dims, GetTensorData<int>(temp_index), \
-      GetTensorData<int>(resolved_axis))
+      GetTensorData<int>(resolved_axis),                            \
+      GetTensorData<temp_data_type>(temp_sum))
 
   if (kernel_type == kReference) {
     switch (op_context.input->type) {
       case kTfLiteFloat32:
-        TF_LITE_MEAN(reference_ops, float);
+        TF_LITE_ENSURE(context, TF_LITE_MEAN(reference_ops, float, float));
         break;
       case kTfLiteInt32:
-        TF_LITE_MEAN(reference_ops, int);
-        break;
-      case kTfLiteUInt8:
-        TF_LITE_MEAN(reference_ops, uint8_t);
+        TF_LITE_ENSURE(context, TF_LITE_MEAN(reference_ops, int, int64_t));
         break;
       case kTfLiteInt64:
-        TF_LITE_MEAN(reference_ops, int64_t);
+        TF_LITE_ENSURE(context, TF_LITE_MEAN(reference_ops, int64_t, int64_t));
+        break;
+      case kTfLiteUInt8:
+        TF_LITE_ENSURE_EQ(context, op_context.input->params.scale,
+                          op_context.output->params.scale);
+        TF_LITE_ENSURE_EQ(context, op_context.input->params.zero_point,
+                          op_context.output->params.zero_point);
+        TF_LITE_ENSURE(context, TF_LITE_MEAN(reference_ops, uint8_t, int));
         break;
       default:
         return kTfLiteError;
@@ -216,7 +255,6 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 #undef TF_LITE_MEAN
   return kTfLiteOk;
 }
-
 }  // namespace mean
 
 TfLiteRegistration* Register_MEAN_REF() {
diff --git a/tensorflow/contrib/lite/kernels/mean_test.cc b/tensorflow/contrib/lite/kernels/mean_test.cc
index 2d6d4bc2da4b75289ee27c3f2a12787216716d44..79c9957f76fdb994be0a71f2e90b883435de4815 100644
--- a/tensorflow/contrib/lite/kernels/mean_test.cc
+++ b/tensorflow/contrib/lite/kernels/mean_test.cc
@@ -37,8 +37,15 @@ class BaseMeanOpModel : public SingleOpModel {
     return ExtractVector<T>(output_);
   }
 
+  std::vector<float> GetDequantizedOutput() {
+    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+                               GetScale(output_), GetZeroPoint(output_));
+  }
+
   std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
 
+  int Input() { return input_; }
+
  protected:
   int input_;
   int axis_;
@@ -142,56 +149,64 @@ TEST(DynamicFloatMeanOpTest, Scale) {
   EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({9.527})));
 }
 
+// for quantized Add, the error shouldn't exceed step
+float GetTolerance(int min, int max) { return (max - min) / 255.0; }
+
 TEST(ConstUint8MeanOpTest, NotKeepDims) {
-  std::initializer_list<uint8_t> data = {1,  2,  3,  4,  5,  6,  7,  8,
-                                         9,  10, 11, 12, 13, 14, 15, 16,
-                                         17, 18, 19, 20, 21, 22, 23, 24};
-  MeanOpConstModel m({TensorType_UINT8, {4, 3, 2}}, {TensorType_UINT8, {2}},
-                     {4}, {1, 0, -3, -3}, false);
-  m.SetInput(data);
+  float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
+  std::initializer_list<float> data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6};
+  MeanOpConstModel m({TensorType_UINT8, {1, 3, 2}, -1.0, 1.0},
+                     {TensorType_UINT8, {2}, -1.0, 1.0}, {1}, {1}, false);
+  m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
-  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
-  EXPECT_THAT(m.GetOutput<uint8_t>(), ElementsAreArray({12, 13}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
+  EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear(
+                                            {0.4, 0.4}, kQuantizedTolerance)));
 }
 
 TEST(ConstUint8MeanOpTest, KeepDims) {
-  std::initializer_list<uint8_t> data = {1,  2,  3,  4,  5,  6,  7,  8,
-                                         9,  10, 11, 12, 13, 14, 15, 16,
-                                         17, 18, 19, 20, 21, 22, 23, 24};
-  MeanOpConstModel m({TensorType_UINT8, {4, 3, 2}}, {TensorType_UINT8, {3}},
-                     {2}, {0, 2}, true);
-  m.SetInput(data);
+  float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
+  std::initializer_list<float> data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6};
+  MeanOpConstModel m({TensorType_UINT8, {3, 2}, -1.0, 1.0},
+                     {TensorType_UINT8, {3}, -1.0, 1.0}, {1}, {1}, true);
+  m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
-  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3, 1}));
-  EXPECT_THAT(m.GetOutput<uint8_t>(), ElementsAreArray({10, 12, 14}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1}));
+  EXPECT_THAT(
+      m.GetDequantizedOutput(),
+      ElementsAreArray(ArrayFloatNear({0.3, 0.35, 0.55}, kQuantizedTolerance)));
 }
 
 TEST(DynamicUint8MeanOpTest, NotKeepDims) {
-  std::initializer_list<uint8_t> data = {1,  2,  3,  4,  5,  6,  7,  8,
-                                         9,  10, 11, 12, 13, 14, 15, 16,
-                                         17, 18, 19, 20, 21, 22, 23, 24};
-  MeanOpDynamicModel m({TensorType_UINT8, {4, 3, 2}}, {TensorType_UINT8, {2}},
-                       {TensorType_INT32, {4}}, false);
-  std::initializer_list<int> axis = {1, 0, -3, -3};
+  float kQuantizedTolerance = GetTolerance(-5.0, 2.0);
+  std::initializer_list<float> data = {1.3, -4.8, -3.6, 0.24};
+  MeanOpDynamicModel m({TensorType_UINT8, {2, 2}, -5.0, 2.0},
+                       {TensorType_UINT8, {2}, -5.0, 2.0},
+                       {TensorType_INT32, {1}}, false);
+  std::initializer_list<int> axis = {1};
   m.SetAxis(axis);
-  m.SetInput(data);
+  m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
-  EXPECT_THAT(m.GetOutput<uint8_t>(), ElementsAreArray({12, 13}));
+  EXPECT_THAT(
+      m.GetDequantizedOutput(),
+      ElementsAreArray(ArrayFloatNear({-1.75, -1.68}, kQuantizedTolerance)));
 }
 
 TEST(DynamicUint8MeanOpTest, KeepDims) {
-  std::initializer_list<uint8_t> data = {1,  2,  3,  4,  5,  6,  7,  8,
-                                         9,  10, 11, 12, 13, 14, 15, 16,
-                                         17, 18, 19, 20, 21, 22, 23, 24};
-  MeanOpDynamicModel m({TensorType_UINT8, {4, 3, 2}}, {TensorType_UINT8, {3}},
-                       {TensorType_INT32, {2}}, true);
-  std::initializer_list<int> axis = {0, 2};
+  float kQuantizedTolerance = GetTolerance(-10.0, 12.0);
+  std::initializer_list<float> data = {11.14, -0.14, 7.423, 0.879};
+  MeanOpDynamicModel m({TensorType_UINT8, {2, 2}, -10.0, 12.0},
+                       {TensorType_UINT8, {2}, -10.0, 12.0},
+                       {TensorType_INT32, {1}}, true);
+  std::initializer_list<int> axis = {0};
   m.SetAxis(axis);
-  m.SetInput(data);
+  m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
-  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3, 1}));
-  EXPECT_THAT(m.GetOutput<uint8_t>(), ElementsAreArray({10, 12, 14}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
+  EXPECT_THAT(
+      m.GetDequantizedOutput(),
+      ElementsAreArray(ArrayFloatNear({9.2815, 0.3695}, kQuantizedTolerance)));
 }
 
 }  // namespace
diff --git a/tensorflow/contrib/lite/kernels/mfcc.cc b/tensorflow/contrib/lite/kernels/mfcc.cc
new file mode 100644
index 0000000000000000000000000000000000000000..018db0dc54c5d281bf3fb3ff8a1f111b427fe76b
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/mfcc.cc
@@ -0,0 +1,154 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/kernels/internal/mfcc.h"
+#include "flatbuffers/flexbuffers.h"
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/mfcc_dct.h"
+#include "tensorflow/contrib/lite/kernels/internal/mfcc_mel_filterbank.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace custom {
+namespace mfcc {
+
+enum KernelType {
+  kReference,
+};
+
+typedef struct {
+  float upper_frequency_limit;
+  float lower_frequency_limit;
+  int filterbank_channel_count;
+  int dct_coefficient_count;
+} TfLiteMfccParams;
+
+constexpr int kInputTensorWav = 0;
+constexpr int kInputTensorRate = 1;
+constexpr int kOutputTensor = 0;
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  auto* data = new TfLiteMfccParams;
+
+  const uint8_t* buffer_t = reinterpret_cast<const uint8_t*>(buffer);
+
+  const flexbuffers::Map& m = flexbuffers::GetRoot(buffer_t, length).AsMap();
+  data->upper_frequency_limit = m["upper_frequency_limit"].AsInt64();
+  data->lower_frequency_limit = m["lower_frequency_limit"].AsInt64();
+  data->filterbank_channel_count = m["filterbank_channel_count"].AsInt64();
+  data->dct_coefficient_count = m["dct_coefficient_count"].AsInt64();
+  return data;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<TfLiteMfccParams*>(buffer);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLiteMfccParams*>(node->user_data);
+
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  TfLiteTensor* inputWav = GetInput(context, node, kInputTensorWav);
+  TfLiteTensor* inputRate = GetInput(context, node, kInputTensorRate);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  TF_LITE_ENSURE_EQ(context, NumDimensions(inputWav), 3);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(inputRate), 1);
+
+  TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32);
+  TF_LITE_ENSURE_EQ(context, inputWav->type, output->type);
+
+  TfLiteIntArray* output_size = TfLiteIntArrayCreate(3);
+  output_size->data[0] = inputWav->dims->data[0];
+  output_size->data[1] = inputWav->dims->data[1];
+  output_size->data[2] = params->dct_coefficient_count;
+
+  return context->ResizeTensor(context, output, output_size);
+}
+
+// Input is a single squared-magnitude spectrogram frame. The input spectrum
+// is converted to linear magnitude and weighted into bands using a
+// triangular mel filterbank, and a discrete cosine transform (DCT) of the
+// values is taken. Output is populated with the lowest dct_coefficient_count
+// of these values.
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLiteMfccParams*>(node->user_data);
+
+  TfLiteTensor* inputWav = GetInput(context, node, kInputTensorWav);
+  TfLiteTensor* inputRate = GetInput(context, node, kInputTensorRate);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  const int32 sample_rate = *GetTensorData<int>(inputRate);
+
+  const int spectrogram_channels = inputWav->dims->data[2];
+  const int spectrogram_samples = inputWav->dims->data[1];
+  const int audio_channels = inputWav->dims->data[0];
+
+  internal::Mfcc mfcc;
+  mfcc.set_upper_frequency_limit(params->upper_frequency_limit);
+  mfcc.set_lower_frequency_limit(params->lower_frequency_limit);
+  mfcc.set_filterbank_channel_count(params->filterbank_channel_count);
+  mfcc.set_dct_coefficient_count(params->dct_coefficient_count);
+
+  mfcc.Initialize(spectrogram_channels, sample_rate);
+
+  const float* spectrogram_flat = GetTensorData<float>(inputWav);
+  float* output_flat = GetTensorData<float>(output);
+
+  for (int audio_channel = 0; audio_channel < audio_channels; ++audio_channel) {
+    for (int spectrogram_sample = 0; spectrogram_sample < spectrogram_samples;
+         ++spectrogram_sample) {
+      const float* sample_data =
+          spectrogram_flat +
+          (audio_channel * spectrogram_samples * spectrogram_channels) +
+          (spectrogram_sample * spectrogram_channels);
+      std::vector<double> mfcc_input(sample_data,
+                                     sample_data + spectrogram_channels);
+      std::vector<double> mfcc_output;
+      mfcc.Compute(mfcc_input, &mfcc_output);
+      TF_LITE_ENSURE_EQ(context, params->dct_coefficient_count,
+                        mfcc_output.size());
+      float* output_data = output_flat +
+                           (audio_channel * spectrogram_samples *
+                            params->dct_coefficient_count) +
+                           (spectrogram_sample * params->dct_coefficient_count);
+      for (int i = 0; i < params->dct_coefficient_count; ++i) {
+        output_data[i] = mfcc_output[i];
+      }
+    }
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace mfcc
+
+TfLiteRegistration* Register_MFCC() {
+  static TfLiteRegistration r = {mfcc::Init, mfcc::Free, mfcc::Prepare,
+                                 mfcc::Eval<mfcc::kReference>};
+  return &r;
+}
+
+}  // namespace custom
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/mfcc_test.cc b/tensorflow/contrib/lite/kernels/mfcc_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0291ca8c1c58ea6ab3bb7c22bc436ed3404cba74
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/mfcc_test.cc
@@ -0,0 +1,104 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "flatbuffers/flexbuffers.h"
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace ops {
+namespace custom {
+
+TfLiteRegistration* Register_MFCC();
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
+
+class BaseMfccOpModel : public SingleOpModel {
+ public:
+  BaseMfccOpModel(const TensorData& input1, const TensorData& input2,
+                  const TensorData& output) {
+    input1_ = AddInput(input1);
+    input2_ = AddInput(input2);
+    output_ = AddOutput(output);
+
+    flexbuffers::Builder fbb;
+    fbb.Map([&]() {
+      fbb.Int("upper_frequency_limit", 4000);
+      fbb.Int("lower_frequency_limit", 20);
+      fbb.Int("filterbank_channel_count", 40);
+      fbb.Int("dct_coefficient_count", 13);
+    });
+    fbb.Finish();
+    SetCustomOp("Mfcc", fbb.GetBuffer(), Register_MFCC);
+
+    BuildInterpreter({GetShape(input1_), GetShape(input2_)});
+  }
+
+  int input1() { return input1_; }
+  int input2() { return input2_; }
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ protected:
+  int input1_;
+  int input2_;
+  int output_;
+};
+
+TEST(MfccOpTest, SimpleTest) {
+  BaseMfccOpModel m({TensorType_FLOAT32, {1, 1, 513}}, {TensorType_INT32, {1}},
+                    {TensorType_FLOAT32, {}});
+
+  std::vector<float> data(513);
+  for (int i = 0; i < data.size(); ++i) {
+    data[i] = i + 1;
+  }
+  m.PopulateTensor<float>(m.input1(), 0, data.data(),
+                          data.data() + data.size());
+  m.PopulateTensor<int>(m.input2(), {22050});
+
+  m.Invoke();
+
+  std::vector<int> output_shape = m.GetOutputShape();
+  EXPECT_THAT(output_shape, ElementsAre(1, 1, 13));
+  EXPECT_THAT(
+      m.GetOutput(),
+      ElementsAreArray(ArrayFloatNear(
+          {29.13970072, -6.41568601, -0.61903012, -0.96778652, -0.26819878,
+           -0.40907028, -0.15614748, -0.23203119, -0.10481487, -0.1543029,
+           -0.0769791, -0.10806114, -0.06047613},
+          1e-3)));
+}
+
+}  // namespace
+}  // namespace custom
+}  // namespace ops
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/register.cc b/tensorflow/contrib/lite/kernels/register.cc
index 9537b79a9ae4cb1eebaecb3fb2cb446726eb2baa..0f98154b904b1f776016e6bbee3263027f815244 100644
--- a/tensorflow/contrib/lite/kernels/register.cc
+++ b/tensorflow/contrib/lite/kernels/register.cc
@@ -17,6 +17,14 @@ limitations under the License.
 
 namespace tflite {
 namespace ops {
+
+namespace custom {
+
+TfLiteRegistration* Register_AUDIO_SPECTROGRAM();
+TfLiteRegistration* Register_MFCC();
+
+}  // namespace custom
+
 namespace builtin {
 
 TfLiteRegistration* Register_RELU();
@@ -67,6 +75,8 @@ TfLiteRegistration* Register_TOPK_V2();
 TfLiteRegistration* Register_LOG_SOFTMAX();
 TfLiteRegistration* Register_CAST();
 TfLiteRegistration* Register_DEQUANTIZE();
+TfLiteRegistration* Register_PRELU();
+TfLiteRegistration* Register_MAXIMUM();
 
 BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_RELU, Register_RELU());
@@ -123,6 +133,14 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_LOG_SOFTMAX, Register_LOG_SOFTMAX());
   AddBuiltin(BuiltinOperator_CAST, Register_CAST());
   AddBuiltin(BuiltinOperator_DEQUANTIZE, Register_DEQUANTIZE());
+  AddBuiltin(BuiltinOperator_PRELU, Register_PRELU());
+  AddBuiltin(BuiltinOperator_MAXIMUM, Register_MAXIMUM());
+
+  // TODO(andrewharp, ahentz): Move these somewhere more appropriate so that
+  // custom ops aren't always included by default.
+  AddCustom("Mfcc", tflite::ops::custom::Register_MFCC());
+  AddCustom("AudioSpectrogram",
+            tflite::ops::custom::Register_AUDIO_SPECTROGRAM());
 }
 
 TfLiteRegistration* BuiltinOpResolver::FindOp(
diff --git a/tensorflow/contrib/lite/kernels/strided_slice.cc b/tensorflow/contrib/lite/kernels/strided_slice.cc
index eb374d903182f46b40f5c80bfd769a19a5594742..e6d5c300dcd47821b0572e3239b36f14bd6ea3d0 100644
--- a/tensorflow/contrib/lite/kernels/strided_slice.cc
+++ b/tensorflow/contrib/lite/kernels/strided_slice.cc
@@ -228,6 +228,11 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
         TF_LITE_STRIDED_SLICE(reference_ops, int64_t);
       }
       break;
+    case kTfLiteUInt8:
+      if (kernel_type == kReference) {
+        TF_LITE_STRIDED_SLICE(reference_ops, uint8_t);
+      }
+      break;
     default:
       context->ReportError(context,
                            "Type is currently not supported "
diff --git a/tensorflow/contrib/lite/kernels/strided_slice_test.cc b/tensorflow/contrib/lite/kernels/strided_slice_test.cc
index 5c98c5f43181fe75f35716dae5682113bde883ec..22d7b097cbd4e1349516eae9fd378aa186e06de7 100644
--- a/tensorflow/contrib/lite/kernels/strided_slice_test.cc
+++ b/tensorflow/contrib/lite/kernels/strided_slice_test.cc
@@ -24,6 +24,8 @@ namespace {
 using ::int32;
 using ::testing::ElementsAreArray;
 
+template <typename input_type = float,
+          TensorType tensor_input_type = TensorType_FLOAT32>
 class StridedSliceOpModel : public SingleOpModel {
  public:
   StridedSliceOpModel(std::initializer_list<int> input_shape,
@@ -32,11 +34,11 @@ class StridedSliceOpModel : public SingleOpModel {
                       std::initializer_list<int> strides_shape, int begin_mask,
                       int end_mask, int ellipsis_mask, int new_axis_mask,
                       int shrink_axis_mask) {
-    input_ = AddInput(TensorType_FLOAT32);
+    input_ = AddInput(tensor_input_type);
     begin_ = AddInput(TensorType_INT32);
     end_ = AddInput(TensorType_INT32);
     strides_ = AddInput(TensorType_INT32);
-    output_ = AddOutput(TensorType_FLOAT32);
+    output_ = AddOutput(tensor_input_type);
     SetBuiltinOp(
         BuiltinOperator_STRIDED_SLICE, BuiltinOptions_StridedSliceOptions,
         CreateStridedSliceOptions(builder_, begin_mask, end_mask, ellipsis_mask,
@@ -45,8 +47,8 @@ class StridedSliceOpModel : public SingleOpModel {
     BuildInterpreter({input_shape, begin_shape, end_shape, strides_shape});
   }
 
-  void SetInput(std::initializer_list<float> data) {
-    PopulateTensor<float>(input_, data);
+  void SetInput(std::initializer_list<input_type> data) {
+    PopulateTensor<input_type>(input_, data);
   }
   void SetBegin(std::initializer_list<int32> data) {
     PopulateTensor<int32>(begin_, data);
@@ -58,7 +60,9 @@ class StridedSliceOpModel : public SingleOpModel {
     PopulateTensor<int32>(strides_, data);
   }
 
-  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+  std::vector<input_type> GetOutput() {
+    return ExtractVector<input_type>(output_);
+  }
   std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
 
  private:
@@ -71,19 +75,19 @@ class StridedSliceOpModel : public SingleOpModel {
 
 TEST(StridedSliceOpTest, UnsupportedInputSize) {
   EXPECT_DEATH(
-      StridedSliceOpModel({2, 2, 2, 2, 2}, {5}, {5}, {5}, 0, 0, 0, 0, 0),
+      StridedSliceOpModel<>({2, 2, 2, 2, 2}, {5}, {5}, {5}, 0, 0, 0, 0, 0),
       "StridedSlice op only supports 1D-4D input arrays.");
 }
 
 TEST(StridedSliceOpTest, UnssupportedArgs) {
-  EXPECT_DEATH(StridedSliceOpModel({3, 2}, {2}, {2}, {2}, 0, 0, 1, 0, 0),
+  EXPECT_DEATH(StridedSliceOpModel<>({3, 2}, {2}, {2}, {2}, 0, 0, 1, 0, 0),
                "ellipsis_mask is not implemented yet.");
-  EXPECT_DEATH(StridedSliceOpModel({3, 2}, {2}, {2}, {2}, 0, 0, 0, 1, 0),
+  EXPECT_DEATH(StridedSliceOpModel<>({3, 2}, {2}, {2}, {2}, 0, 0, 0, 1, 0),
                "new_axis_mask is not implemented yet.");
 }
 
 TEST(StridedSliceOpTest, In1D) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({1});
   m.SetEnd({3});
@@ -94,7 +98,7 @@ TEST(StridedSliceOpTest, In1D) {
 }
 
 TEST(StridedSliceOpTest, In1D_EmptyOutput) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({10});
   m.SetEnd({3});
@@ -104,7 +108,7 @@ TEST(StridedSliceOpTest, In1D_EmptyOutput) {
 }
 
 TEST(StridedSliceOpTest, In1D_NegativeBegin) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({-3});
   m.SetEnd({3});
@@ -115,7 +119,7 @@ TEST(StridedSliceOpTest, In1D_NegativeBegin) {
 }
 
 TEST(StridedSliceOpTest, In1D_OutOfRangeBegin) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({-5});
   m.SetEnd({3});
@@ -126,7 +130,7 @@ TEST(StridedSliceOpTest, In1D_OutOfRangeBegin) {
 }
 
 TEST(StridedSliceOpTest, In1D_NegativeEnd) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({1});
   m.SetEnd({-2});
@@ -137,7 +141,7 @@ TEST(StridedSliceOpTest, In1D_NegativeEnd) {
 }
 
 TEST(StridedSliceOpTest, In1D_OutOfRangeEnd) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({-3});
   m.SetEnd({5});
@@ -148,7 +152,7 @@ TEST(StridedSliceOpTest, In1D_OutOfRangeEnd) {
 }
 
 TEST(StridedSliceOpTest, In1D_BeginMask) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 1, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 1, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({1});
   m.SetEnd({3});
@@ -159,7 +163,7 @@ TEST(StridedSliceOpTest, In1D_BeginMask) {
 }
 
 TEST(StridedSliceOpTest, In1D_NegativeBeginNegativeStride) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({-2});
   m.SetEnd({-3});
@@ -170,7 +174,7 @@ TEST(StridedSliceOpTest, In1D_NegativeBeginNegativeStride) {
 }
 
 TEST(StridedSliceOpTest, In1D_OutOfRangeBeginNegativeStride) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({5});
   m.SetEnd({2});
@@ -181,7 +185,7 @@ TEST(StridedSliceOpTest, In1D_OutOfRangeBeginNegativeStride) {
 }
 
 TEST(StridedSliceOpTest, In1D_NegativeEndNegativeStride) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({2});
   m.SetEnd({-4});
@@ -192,7 +196,7 @@ TEST(StridedSliceOpTest, In1D_NegativeEndNegativeStride) {
 }
 
 TEST(StridedSliceOpTest, In1D_OutOfRangeEndNegativeStride) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({-3});
   m.SetEnd({-5});
@@ -203,7 +207,7 @@ TEST(StridedSliceOpTest, In1D_OutOfRangeEndNegativeStride) {
 }
 
 TEST(StridedSliceOpTest, In1D_EndMask) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 1, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 1, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({1});
   m.SetEnd({3});
@@ -214,7 +218,7 @@ TEST(StridedSliceOpTest, In1D_EndMask) {
 }
 
 TEST(StridedSliceOpTest, In1D_NegStride) {
-  StridedSliceOpModel m({3}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({3}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3});
   m.SetBegin({-1});
   m.SetEnd({-4});
@@ -225,7 +229,7 @@ TEST(StridedSliceOpTest, In1D_NegStride) {
 }
 
 TEST(StridedSliceOpTest, In1D_EvenLenStride2) {
-  StridedSliceOpModel m({2}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2});
   m.SetBegin({0});
   m.SetEnd({2});
@@ -236,7 +240,7 @@ TEST(StridedSliceOpTest, In1D_EvenLenStride2) {
 }
 
 TEST(StridedSliceOpTest, In1D_OddLenStride2) {
-  StridedSliceOpModel m({3}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({3}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3});
   m.SetBegin({0});
   m.SetEnd({3});
@@ -247,7 +251,7 @@ TEST(StridedSliceOpTest, In1D_OddLenStride2) {
 }
 
 TEST(StridedSliceOpTest, In2D_Identity) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({0, 0});
   m.SetEnd({2, 3});
@@ -258,7 +262,7 @@ TEST(StridedSliceOpTest, In2D_Identity) {
 }
 
 TEST(StridedSliceOpTest, In2D) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({1, 0});
   m.SetEnd({2, 2});
@@ -269,7 +273,7 @@ TEST(StridedSliceOpTest, In2D) {
 }
 
 TEST(StridedSliceOpTest, In2D_Stride2) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({0, 0});
   m.SetEnd({2, 3});
@@ -280,7 +284,7 @@ TEST(StridedSliceOpTest, In2D_Stride2) {
 }
 
 TEST(StridedSliceOpTest, In2D_NegStride) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({1, -1});
   m.SetEnd({2, -4});
@@ -291,7 +295,7 @@ TEST(StridedSliceOpTest, In2D_NegStride) {
 }
 
 TEST(StridedSliceOpTest, In2D_BeginMask) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 1, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 1, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({1, 0});
   m.SetEnd({2, 2});
@@ -302,7 +306,7 @@ TEST(StridedSliceOpTest, In2D_BeginMask) {
 }
 
 TEST(StridedSliceOpTest, In2D_EndMask) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 2, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 2, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({1, 0});
   m.SetEnd({2, 2});
@@ -313,7 +317,7 @@ TEST(StridedSliceOpTest, In2D_EndMask) {
 }
 
 TEST(StridedSliceOpTest, In2D_NegStrideBeginMask) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 2, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 2, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({1, -2});
   m.SetEnd({2, -4});
@@ -324,7 +328,7 @@ TEST(StridedSliceOpTest, In2D_NegStrideBeginMask) {
 }
 
 TEST(StridedSliceOpTest, In2D_NegStrideEndMask) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 2, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 2, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({1, -2});
   m.SetEnd({2, -3});
@@ -335,7 +339,7 @@ TEST(StridedSliceOpTest, In2D_NegStrideEndMask) {
 }
 
 TEST(StridedSliceOpTest, In3D_Identity) {
-  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
   m.SetEnd({2, 3, 2});
@@ -347,7 +351,7 @@ TEST(StridedSliceOpTest, In3D_Identity) {
 }
 
 TEST(StridedSliceOpTest, In3D_NegStride) {
-  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({-1, -1, -1});
   m.SetEnd({-3, -4, -3});
@@ -359,7 +363,7 @@ TEST(StridedSliceOpTest, In3D_NegStride) {
 }
 
 TEST(StridedSliceOpTest, In3D_Strided2) {
-  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
   m.SetEnd({2, 3, 2});
@@ -370,7 +374,7 @@ TEST(StridedSliceOpTest, In3D_Strided2) {
 }
 
 TEST(StridedSliceOpTest, In1D_ShrinkAxisMask1) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({1});
   m.SetEnd({3});
@@ -381,7 +385,7 @@ TEST(StridedSliceOpTest, In1D_ShrinkAxisMask1) {
 }
 
 TEST(StridedSliceOpTest, In1D_EmptyOutputShrinkAxisMask1) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({2});
   m.SetEnd({1});
@@ -392,7 +396,7 @@ TEST(StridedSliceOpTest, In1D_EmptyOutputShrinkAxisMask1) {
 }
 
 TEST(StridedSliceOpTest, In1D_BeginMaskShrinkAxisMask1) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 1, 0, 0, 0, 1);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 1, 0, 0, 0, 1);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({1});
   m.SetEnd({3});
@@ -403,7 +407,7 @@ TEST(StridedSliceOpTest, In1D_BeginMaskShrinkAxisMask1) {
 }
 
 TEST(StridedSliceOpTest, In1D_NegativeBeginNegativeStrideShrinkAxisMask1) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({-2});
   m.SetEnd({-3});
@@ -414,7 +418,7 @@ TEST(StridedSliceOpTest, In1D_NegativeBeginNegativeStrideShrinkAxisMask1) {
 }
 
 TEST(StridedSliceOpTest, In2D_ShrinkAxisMask1) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 1);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 1);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({0, 0});
   m.SetEnd({2, 3});
@@ -425,7 +429,7 @@ TEST(StridedSliceOpTest, In2D_ShrinkAxisMask1) {
 }
 
 TEST(StridedSliceOpTest, In2D_ShrinkAxisMask2) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 2);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 2);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({0, 0});
   m.SetEnd({2, 3});
@@ -436,7 +440,7 @@ TEST(StridedSliceOpTest, In2D_ShrinkAxisMask2) {
 }
 
 TEST(StridedSliceOpTest, In2D_ShrinkAxisMask3) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 3);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 3);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({0, 0});
   m.SetEnd({2, 3});
@@ -447,7 +451,7 @@ TEST(StridedSliceOpTest, In2D_ShrinkAxisMask3) {
 }
 
 TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis1) {
-  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 1);
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 1);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
   m.SetEnd({2, 3, 2});
@@ -458,7 +462,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis1) {
 }
 
 TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis2) {
-  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 2);
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 2);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
   m.SetEnd({2, 3, 2});
@@ -469,7 +473,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis2) {
 }
 
 TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis3) {
-  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 3);
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 3);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
   m.SetEnd({2, 3, 2});
@@ -480,7 +484,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis3) {
 }
 
 TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis4) {
-  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 4);
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 4);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
   m.SetEnd({2, 3, 2});
@@ -491,7 +495,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis4) {
 }
 
 TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis5) {
-  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 5);
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 5);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
   m.SetEnd({2, 3, 2});
@@ -502,7 +506,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis5) {
 }
 
 TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis6) {
-  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 6);
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 6);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
   m.SetEnd({2, 3, 2});
@@ -513,7 +517,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis6) {
 }
 
 TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis7) {
-  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 7);
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 7);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
   m.SetEnd({2, 3, 2});
@@ -525,7 +529,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis7) {
 
 // This tests catches a very subtle bug that was fixed by cl/188403234.
 TEST(StridedSliceOpTest, RunTwice) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 1, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 1, 0, 0, 0, 0);
 
   auto setup_inputs = [&m]() {
     m.SetInput({1, 2, 3, 4, 5, 6});
@@ -544,6 +548,17 @@ TEST(StridedSliceOpTest, RunTwice) {
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 4, 5}));
 }
 
+TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis1Uint8) {
+  StridedSliceOpModel<uint8, TensorType_UINT8> m({2, 3, 2}, {3}, {3}, {3}, 0, 0,
+                                                 0, 0, 1);
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  m.SetBegin({0, 0, 0});
+  m.SetEnd({2, 3, 2});
+  m.SetStrides({1, 1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6}));
+}
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/sub.cc b/tensorflow/contrib/lite/kernels/sub.cc
index ddaf498d5bac0109429224e7cf66cb3debcabc22..66b06aeaec52dd3d2d98acfec8218ffdd0ae6bf3 100644
--- a/tensorflow/contrib/lite/kernels/sub.cc
+++ b/tensorflow/contrib/lite/kernels/sub.cc
@@ -26,7 +26,7 @@ namespace ops {
 namespace builtin {
 namespace sub {
 
-// This file has three implementation of Div.
+// This file has three implementation of Sub.
 enum KernelType {
   kReference,
   kGenericOptimized,  // Neon-free
@@ -37,7 +37,23 @@ constexpr int kInputTensor1 = 0;
 constexpr int kInputTensor2 = 1;
 constexpr int kOutputTensor = 0;
 
+struct OpData {
+  bool requires_broadcast;
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  auto* data = new OpData;
+  data->requires_broadcast = false;
+  return data;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<OpData*>(buffer);
+}
+
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
@@ -45,49 +61,118 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
-  TF_LITE_ENSURE_EQ(context, NumDimensions(input1), NumDimensions(input2));
-  for (int i = 0; i < NumDimensions(input1); ++i) {
-    TF_LITE_ENSURE_EQ(context, SizeOfDimension(input1, i),
-                      SizeOfDimension(input2, i));
-  }
+  TF_LITE_ENSURE_EQ(context, input1->type, input2->type);
+  output->type = input2->type;
 
-  TF_LITE_ENSURE_EQ(context, input1->type, output->type);
-  TF_LITE_ENSURE_EQ(context, input2->type, output->type);
+  data->requires_broadcast = !HaveSameShapes(input1, input2);
+
+  TfLiteIntArray* output_size = nullptr;
+  if (data->requires_broadcast) {
+    TF_LITE_ENSURE_OK(context, CalculateShapeForBroadcast(
+                                   context, input1, input2, &output_size));
+  } else {
+    output_size = TfLiteIntArrayCopy(input1->dims);
+  }
 
-  TfLiteIntArray* output_size = TfLiteIntArrayCopy(input1->dims);
   return context->ResizeTensor(context, output, output_size);
 }
 
 template <KernelType kernel_type>
-void EvalSubFloat(TfLiteContext* context, TfLiteNode* node,
-                  TfLiteSubParams* params, TfLiteTensor* input1,
-                  TfLiteTensor* input2, TfLiteTensor* output) {
+void EvalFloat(TfLiteContext* context, TfLiteNode* node,
+               TfLiteSubParams* params, const OpData* data,
+               TfLiteTensor* input1, TfLiteTensor* input2,
+               TfLiteTensor* output) {
   float output_activation_min, output_activation_max;
   CalculateActivationRangeFloat(params->activation, &output_activation_min,
                                 &output_activation_max);
-#define TF_LITE_Sub(type)                                        \
-  type::Sub(GetTensorData<float>(input1), GetTensorDims(input1), \
-            GetTensorData<float>(input2), GetTensorDims(input2), \
-            output_activation_min, output_activation_max,        \
-            GetTensorData<float>(output), GetTensorDims(output))
+#define TF_LITE_SUB(type, opname)                                   \
+  type::opname(GetTensorData<float>(input1), GetTensorDims(input1), \
+               GetTensorData<float>(input2), GetTensorDims(input2), \
+               output_activation_min, output_activation_max,        \
+               GetTensorData<float>(output), GetTensorDims(output))
+  if (kernel_type == kReference) {
+    if (data->requires_broadcast) {
+      TF_LITE_SUB(reference_ops, BroadcastSub);
+    } else {
+      TF_LITE_SUB(reference_ops, Sub);
+    }
+  } else {
+    if (data->requires_broadcast) {
+      TF_LITE_SUB(optimized_ops, BroadcastSub);
+    } else {
+      TF_LITE_SUB(optimized_ops, Sub);
+    }
+  }
+#undef TF_LITE_SUB
+}
+
+template <KernelType kernel_type>
+void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
+                   TfLiteSubParams* params, const OpData* data,
+                   TfLiteTensor* input1, TfLiteTensor* input2,
+                   TfLiteTensor* output) {
+  auto input1_offset = -input1->params.zero_point;
+  auto input2_offset = -input2->params.zero_point;
+  auto output_offset = output->params.zero_point;
+  const int left_shift = 20;
+  const double twice_max_input_scale =
+      2 * std::max(input1->params.scale, input2->params.scale);
+  const double real_input1_multiplier =
+      input1->params.scale / twice_max_input_scale;
+  const double real_input2_multiplier =
+      input2->params.scale / twice_max_input_scale;
+  const double real_output_multiplier =
+      twice_max_input_scale / ((1 << left_shift) * output->params.scale);
+
+  int32 input1_multiplier;
+  int input1_shift;
+  QuantizeMultiplierSmallerThanOne(real_input1_multiplier, &input1_multiplier,
+                                   &input1_shift);
+  int32 input2_multiplier;
+  int input2_shift;
+  QuantizeMultiplierSmallerThanOne(real_input2_multiplier, &input2_multiplier,
+                                   &input2_shift);
+  int32 output_multiplier;
+  int output_shift;
+  QuantizeMultiplierSmallerThanOne(real_output_multiplier, &output_multiplier,
+                                   &output_shift);
+
+  int32 output_activation_min, output_activation_max;
+  CalculateActivationRangeUint8(params->activation, output,
+                                &output_activation_min, &output_activation_max);
+
+#define TF_LITE_SUB(type, opname)                                            \
+  type::opname(left_shift, GetTensorData<uint8_t>(input1),                   \
+               GetTensorDims(input1), input1_offset, input1_multiplier,      \
+               input1_shift, GetTensorData<uint8_t>(input2),                 \
+               GetTensorDims(input2), input2_offset, input2_multiplier,      \
+               input2_shift, output_offset, output_multiplier, output_shift, \
+               output_activation_min, output_activation_max,                 \
+               GetTensorData<uint8_t>(output), GetTensorDims(output));
+  // The quantized version of Sub doesn't support activations, so we
+  // always use BroadcastSub.
   if (kernel_type == kReference) {
-    TF_LITE_Sub(reference_ops);
+    TF_LITE_SUB(reference_ops, BroadcastSub);
   } else {
-    TF_LITE_Sub(optimized_ops);
+    TF_LITE_SUB(optimized_ops, BroadcastSub);
   }
-#undef TF_LITE_Sub
+#undef TF_LITE_SUB
 }
 
 template <KernelType kernel_type>
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   auto* params = reinterpret_cast<TfLiteSubParams*>(node->builtin_data);
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
 
   TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
   TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   if (output->type == kTfLiteFloat32) {
-    EvalSubFloat<kernel_type>(context, node, params, input1, input2, output);
+    EvalFloat<kernel_type>(context, node, params, data, input1, input2, output);
+  } else if (output->type == kTfLiteUInt8) {
+    EvalQuantized<kernel_type>(context, node, params, data, input1, input2,
+                               output);
   } else {
     context->ReportError(context, "Inputs and outputs not all float types.");
     return kTfLiteError;
@@ -99,19 +184,19 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 }  // namespace sub
 
 TfLiteRegistration* Register_SUB_REF() {
-  static TfLiteRegistration r = {nullptr, nullptr, sub::Prepare,
+  static TfLiteRegistration r = {sub::Init, sub::Free, sub::Prepare,
                                  sub::Eval<sub::kReference>};
   return &r;
 }
 
 TfLiteRegistration* Register_SUB_GENERIC_OPT() {
-  static TfLiteRegistration r = {nullptr, nullptr, sub::Prepare,
+  static TfLiteRegistration r = {sub::Init, sub::Free, sub::Prepare,
                                  sub::Eval<sub::kGenericOptimized>};
   return &r;
 }
 
 TfLiteRegistration* Register_SUB_NEON_OPT() {
-  static TfLiteRegistration r = {nullptr, nullptr, sub::Prepare,
+  static TfLiteRegistration r = {sub::Init, sub::Free, sub::Prepare,
                                  sub::Eval<sub::kNeonOptimized>};
   return &r;
 }
diff --git a/tensorflow/contrib/lite/kernels/sub_test.cc b/tensorflow/contrib/lite/kernels/sub_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ff07aeec49dbfcc0e1f65df3d674d5ec30f1b54c
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/sub_test.cc
@@ -0,0 +1,218 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class BaseSubOpModel : public SingleOpModel {
+ public:
+  BaseSubOpModel(const TensorData& input1, const TensorData& input2,
+                 const TensorData& output,
+                 ActivationFunctionType activation_type) {
+    input1_ = AddInput(input1);
+    input2_ = AddInput(input2);
+    output_ = AddOutput(output);
+    SetBuiltinOp(BuiltinOperator_SUB, BuiltinOptions_SubOptions,
+                 CreateSubOptions(builder_, activation_type).Union());
+    BuildInterpreter({GetShape(input1_), GetShape(input2_)});
+  }
+
+  int input1() { return input1_; }
+  int input2() { return input2_; }
+
+ protected:
+  int input1_;
+  int input2_;
+  int output_;
+};
+
+class FloatSubOpModel : public BaseSubOpModel {
+ public:
+  using BaseSubOpModel::BaseSubOpModel;
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+};
+
+class QuantizedSubOpModel : public BaseSubOpModel {
+ public:
+  using BaseSubOpModel::BaseSubOpModel;
+
+  std::vector<float> GetDequantizedOutput() {
+    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+                               GetScale(output_), GetZeroPoint(output_));
+  }
+};
+
+// for quantized Sub, the error shouldn't exceed 2*step
+float GetTolerance(int min, int max) {
+  float kQuantizedStep = (max - min) / 255.0;
+  float kQuantizedTolerance = 2.0 * kQuantizedStep;
+  return kQuantizedTolerance;
+}
+
+TEST(FloatSubOpModel, NoActivation) {
+  FloatSubOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+  m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 1.7, 0.5});
+  m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.8});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear({-2.1, 0.0, 1.4, -0.3})));
+}
+
+TEST(FloatSubOpModel, ActivationRELU_N1_TO_1) {
+  FloatSubOpModel m(
+      {TensorType_FLOAT32, {1, 2, 2, 1}}, {TensorType_FLOAT32, {1, 2, 2, 1}},
+      {TensorType_FLOAT32, {}}, ActivationFunctionType_RELU_N1_TO_1);
+  m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 1.7, 0.5});
+  m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.8});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear({-1.0, 0.0, 1.0, -0.3})));
+}
+
+TEST(FloatSubOpModel, VariousInputShapes) {
+  std::vector<std::initializer_list<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    FloatSubOpModel m({TensorType_FLOAT32, test_shapes[i]},
+                      {TensorType_FLOAT32, test_shapes[i]},
+                      {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+    m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 1.7, 0.5, -1.1, 2.0});
+    m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.8, -1.1, 0.1});
+    m.Invoke();
+    EXPECT_THAT(
+        m.GetOutput(),
+        ElementsAreArray(ArrayFloatNear({-2.1, 0.0, 1.4, -0.3, 0.0, 1.9})))
+        << "With shape number " << i;
+  }
+}
+
+TEST(FloatSubOpModel, WithBroadcast) {
+  std::vector<std::initializer_list<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    FloatSubOpModel m({TensorType_FLOAT32, test_shapes[i]},
+                      {TensorType_FLOAT32, {}},  // always a scalar
+                      {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+    m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 1.7, 0.5, -1.1, 2.0});
+    m.PopulateTensor<float>(m.input2(), {0.5});
+    m.Invoke();
+    EXPECT_THAT(
+        m.GetOutput(),
+        ElementsAreArray(ArrayFloatNear({-2.5, -0.3, 1.2, 0.0, -1.6, 1.5})))
+        << "With shape number " << i;
+  }
+}
+
+TEST(QuantizedSubOpModel, QuantizedTestsNoActivation) {
+  float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
+  std::vector<std::initializer_list<float>> inputs1 = {
+      {0.1, 0.2, 0.3, 0.4}, {-0.2, 0.2, 0.4, 0.7}, {-0.01, 0.2, 0.7, 0.3}};
+  std::vector<std::initializer_list<float>> inputs2 = {
+      {0.6, 0.4, 0.3, 0.1}, {0.6, 0.4, 0.5, -0.2}, {0.6, 0.4, -0.18, 0.5}};
+  std::vector<std::initializer_list<float>> results = {
+      {-0.5, -0.2, 0.0, 0.3},
+      {-0.8, -0.2, -0.1, 0.9},
+      {-0.61, -0.2, 0.88, -0.2}};
+  for (int i = 0; i < inputs1.size(); ++i) {
+    QuantizedSubOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
+                          {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
+                          {TensorType_UINT8, {}, -1.0, 1.0},
+                          ActivationFunctionType_NONE);
+    m.QuantizeAndPopulate<uint8_t>(m.input1(), inputs1[i]);
+    m.QuantizeAndPopulate<uint8_t>(m.input2(), inputs2[i]);
+    m.Invoke();
+    EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear(
+                                              results[i], kQuantizedTolerance)))
+        << "With test number " << i;
+  }
+}
+
+TEST(QuantizedSubOpModel, QuantizedTestsActivationRELU_N1_TO_1) {
+  float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
+  std::vector<std::initializer_list<float>> inputs1 = {{-0.8, 0.2, 0.9, 0.7},
+                                                       {-0.8, 0.2, 0.7, 0.5}};
+  std::vector<std::initializer_list<float>> inputs2 = {{0.6, 0.4, 0.9, -0.8},
+                                                       {0.6, 0.4, -0.8, 0.3}};
+  std::vector<std::initializer_list<float>> results = {{-1.0, -0.2, 0.0, 1.0},
+                                                       {-1.0, -0.2, 1.0, 0.2}};
+  for (int i = 0; i < inputs1.size(); ++i) {
+    QuantizedSubOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
+                          {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
+                          {TensorType_UINT8, {}, -1.0, 1.0},
+                          ActivationFunctionType_RELU_N1_TO_1);
+    m.QuantizeAndPopulate<uint8_t>(m.input1(), inputs1[i]);
+    m.QuantizeAndPopulate<uint8_t>(m.input2(), inputs2[i]);
+    m.Invoke();
+    EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear(
+                                              results[i], kQuantizedTolerance)))
+        << "With test number " << i;
+  }
+}
+
+TEST(QuantizedSubOpModel, QuantizedVariousInputShapes) {
+  float kQuantizedTolerance = GetTolerance(-3.0, 3.0);
+  std::vector<std::initializer_list<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    QuantizedSubOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0},
+                          {TensorType_UINT8, test_shapes[i], -3.0, 3.0},
+                          {TensorType_UINT8, {}, -3.0, 3.0},
+                          ActivationFunctionType_NONE);
+    m.QuantizeAndPopulate<uint8_t>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
+    m.QuantizeAndPopulate<uint8_t>(m.input2(), {0.1, 0.3, 0.3, 0.5, 1.1, 0.1});
+    m.Invoke();
+    EXPECT_THAT(m.GetDequantizedOutput(),
+                ElementsAreArray(ArrayFloatNear(
+                    {-2.1, -0.1, 0.4, 0.3, 0.0, 1.9}, kQuantizedTolerance)))
+        << "With shape number " << i;
+  }
+}
+
+TEST(QuantizedSubOpModel, QuantizedWithBroadcast) {
+  float kQuantizedTolerance = GetTolerance(-3.0, 3.0);
+  std::vector<std::initializer_list<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    QuantizedSubOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0},
+                          {TensorType_UINT8, {}, -3.0, 3.0},
+                          {TensorType_UINT8, {}, -3.0, 3.0},
+                          ActivationFunctionType_NONE);
+    m.QuantizeAndPopulate<uint8_t>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
+    m.QuantizeAndPopulate<uint8_t>(m.input2(), {0.7});
+    m.Invoke();
+    EXPECT_THAT(m.GetDequantizedOutput(),
+                ElementsAreArray(ArrayFloatNear(
+                    {-2.7, -0.5, 0.0, 0.1, 0.4, 1.3}, kQuantizedTolerance)))
+        << "With shape number " << i;
+  }
+}
+
+}  // namespace
+}  // namespace tflite
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index 8c456e70da2842c9972be723ad764f1658bd310d..606f4a563581775529610c43f1eb40cb1a176469 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -32,11 +32,72 @@ namespace tflite {
 
 const char* kEmptyTensorName = "";
 
+TfLiteStatus ConvertTensorType(TensorType tensor_type, TfLiteType* type,
+                               ErrorReporter* error_reporter) {
+  switch (tensor_type) {
+    case TensorType_FLOAT32:
+      *type = kTfLiteFloat32;
+      break;
+    case TensorType_INT32:
+      *type = kTfLiteInt32;
+      break;
+    case TensorType_UINT8:
+      *type = kTfLiteUInt8;
+      break;
+    case TensorType_INT64:
+      *type = kTfLiteInt64;
+      break;
+    case TensorType_STRING:
+      *type = kTfLiteString;
+      break;
+    default:
+      error_reporter->Report("Unimplemented data type %s (%d) in tensor\n",
+                             EnumNameTensorType(tensor_type), tensor_type);
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+// Loads a model from `filename`. If `mmap_file` is true then use mmap,
+// otherwise make a copy of the model in a buffer.
+std::unique_ptr<Allocation> GetAllocationFromFile(const char* filename,
+                                                  bool mmap_file,
+                                                  ErrorReporter* error_reporter,
+                                                  bool use_nnapi) {
+  std::unique_ptr<Allocation> allocation;
+  if (mmap_file) {
+    if (use_nnapi && NNAPIExists())
+      allocation.reset(new NNAPIAllocation(filename, error_reporter));
+    else
+      allocation.reset(new MMAPAllocation(filename, error_reporter));
+  } else {
+    allocation.reset(new FileCopyAllocation(filename, error_reporter));
+  }
+  return allocation;
+}
+
 std::unique_ptr<FlatBufferModel> FlatBufferModel::BuildFromFile(
     const char* filename, ErrorReporter* error_reporter) {
   std::unique_ptr<FlatBufferModel> model;
-  model.reset(new FlatBufferModel(filename, /*mmap_file=*/true, error_reporter,
-                                  /*use_nnapi=*/true));
+  auto allocation = GetAllocationFromFile(filename, /*mmap_file=*/true,
+                                          error_reporter, /*use_nnapi=*/true);
+  model.reset(new FlatBufferModel(allocation.release(), error_reporter));
+  if (!model->initialized()) model.reset();
+  return model;
+}
+
+std::unique_ptr<FlatBufferModel> FlatBufferModel::VerifyAndBuildFromFile(
+    const char* filename, TfLiteVerifier* verifier,
+    ErrorReporter* error_reporter) {
+  std::unique_ptr<FlatBufferModel> model;
+  auto allocation = GetAllocationFromFile(filename, /*mmap_file=*/true,
+                                          error_reporter, /*use_nnapi=*/true);
+  if (verifier &&
+      !verifier->Verify(static_cast<const char*>(allocation->base()),
+                        allocation->bytes(), error_reporter)) {
+    return model;
+  }
+  model.reset(new FlatBufferModel(allocation.release(), error_reporter));
   if (!model->initialized()) model.reset();
   return model;
 }
@@ -44,7 +105,9 @@ std::unique_ptr<FlatBufferModel> FlatBufferModel::BuildFromFile(
 std::unique_ptr<FlatBufferModel> FlatBufferModel::BuildFromBuffer(
     const char* buffer, size_t buffer_size, ErrorReporter* error_reporter) {
   std::unique_ptr<FlatBufferModel> model;
-  model.reset(new FlatBufferModel(buffer, buffer_size, error_reporter));
+  Allocation* allocation =
+      new MemoryAllocation(buffer, buffer_size, error_reporter);
+  model.reset(new FlatBufferModel(allocation, error_reporter));
   if (!model->initialized()) model.reset();
   return model;
 }
@@ -57,23 +120,6 @@ std::unique_ptr<FlatBufferModel> FlatBufferModel::BuildFromModel(
   return model;
 }
 
-FlatBufferModel::FlatBufferModel(const char* filename, bool mmap_file,
-                                 ErrorReporter* error_reporter, bool use_nnapi)
-    : error_reporter_(error_reporter ? error_reporter
-                                     : DefaultErrorReporter()) {
-  if (mmap_file) {
-    if (use_nnapi && NNAPIExists())
-      allocation_ = new NNAPIAllocation(filename, error_reporter);
-    else
-      allocation_ = new MMAPAllocation(filename, error_reporter);
-  } else {
-    allocation_ = new FileCopyAllocation(filename, error_reporter);
-  }
-  if (!allocation_->valid() || !CheckModelIdentifier()) return;
-
-  model_ = ::tflite::GetModel(allocation_->base());
-}
-
 bool FlatBufferModel::CheckModelIdentifier() const {
   if (!tflite::ModelBufferHasIdentifier(allocation_->base())) {
     const char* ident = flatbuffers::GetBufferIdentifier(allocation_->base());
@@ -85,21 +131,21 @@ bool FlatBufferModel::CheckModelIdentifier() const {
   return true;
 }
 
-FlatBufferModel::FlatBufferModel(const char* ptr, size_t num_bytes,
+FlatBufferModel::FlatBufferModel(const Model* model,
                                  ErrorReporter* error_reporter)
     : error_reporter_(error_reporter ? error_reporter
                                      : DefaultErrorReporter()) {
-  allocation_ = new MemoryAllocation(ptr, num_bytes, error_reporter);
-  if (!allocation_->valid()) return;
-
-  model_ = ::tflite::GetModel(allocation_->base());
+  model_ = model;
 }
 
-FlatBufferModel::FlatBufferModel(const Model* model,
+FlatBufferModel::FlatBufferModel(Allocation* allocation,
                                  ErrorReporter* error_reporter)
     : error_reporter_(error_reporter ? error_reporter
                                      : DefaultErrorReporter()) {
-  model_ = model;
+  allocation_ = allocation;
+  if (!allocation_->valid() || !CheckModelIdentifier()) return;
+
+  model_ = ::tflite::GetModel(allocation_->base());
 }
 
 FlatBufferModel::~FlatBufferModel() { delete allocation_; }
@@ -287,9 +333,25 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
     case BuiltinOperator_EXP:
     case BuiltinOperator_TOPK_V2:
     case BuiltinOperator_LOG_SOFTMAX:
-    case BuiltinOperator_CAST:
     case BuiltinOperator_DEQUANTIZE:
+    case BuiltinOperator_PRELU:
+      break;
+    case BuiltinOperator_CAST: {
+      TfLiteCastParams* params = MallocPOD<TfLiteCastParams>();
+      if (auto* schema_params = op->builtin_options_as_CastOptions()) {
+        auto in_status =
+            ConvertTensorType(schema_params->in_data_type(),
+                              &params->in_data_type, error_reporter);
+        auto out_status =
+            ConvertTensorType(schema_params->out_data_type(),
+                              &params->out_data_type, error_reporter);
+        if (in_status != kTfLiteOk || out_status != kTfLiteOk) {
+          break;
+        }
+      }
+      builtin_data = reinterpret_cast<void*>(params);
       break;
+    }
     case BuiltinOperator_LSH_PROJECTION: {
       TfLiteLSHProjectionParams* params =
           MallocPOD<TfLiteLSHProjectionParams>();
@@ -576,6 +638,9 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
       builtin_data = reinterpret_cast<void*>(params);
       break;
     }
+    case BuiltinOperator_MAXIMUM: {
+      break;
+    }
     case BuiltinOperator_DELEGATE: {
       // TODO(ycling): Revisit when supporting saving delegated models.
       error_reporter->Report("DELEGATE op shouldn't exist in model.");
@@ -659,35 +724,34 @@ TfLiteStatus InterpreterBuilder::ParseTensors(
       // but we really only support one value for the whole tensor.
       // TODO(aselle): This breaks as well if these are nullptr's.
       // TODO(aselle): This assumes non per-channel quantization.
-      if (q_params->scale()) quantization.scale = q_params->scale()->Get(0);
-      if (q_params->zero_point())
+
+      if (q_params->scale()) {
+        if (q_params->scale()->size() != 1) {
+          error_reporter_->Report(
+              "QuantizationParam has %d scale values (only 1 is supported).",
+              q_params->scale()->size());
+          return kTfLiteError;
+        }
+        quantization.scale = q_params->scale()->Get(0);
+      }
+
+      if (q_params->zero_point()) {
+        if (q_params->zero_point()->size() != 1) {
+          error_reporter_->Report(
+              "QuantizationParam has %d zero_point values"
+              " (only 1 is supported).",
+              q_params->zero_point()->size());
+          return kTfLiteError;
+        }
         quantization.zero_point = q_params->zero_point()->Get(0);
+      }
     }
 
     TfLiteType type;
-    switch (tensor->type()) {
-      case TensorType_FLOAT32:
-        type = kTfLiteFloat32;
-        break;
-      case TensorType_INT32:
-        type = kTfLiteInt32;
-        break;
-      case TensorType_UINT8:
-        type = kTfLiteUInt8;
-        break;
-      case TensorType_INT64:
-        type = kTfLiteInt64;
-        break;
-      case TensorType_STRING:
-        type = kTfLiteString;
-        break;
-      default:
-        // tensorType = ArrayType::NONE;
-        error_reporter_->Report("Unimplemented data type %s (%d) in tensor\n",
-                                EnumNameTensorType(tensor->type()),
-                                tensor->type());
-        status = kTfLiteError;
-        continue;
+    if (ConvertTensorType(tensor->type(), &type, error_reporter_) !=
+        kTfLiteOk) {
+      status = kTfLiteError;
+      continue;
     }
     auto get_readonly_data = [&](const char** buffer_data,
                                  size_t* buffer_size) {
@@ -739,6 +803,11 @@ TfLiteStatus InterpreterBuilder::ParseTensors(
 
 TfLiteStatus InterpreterBuilder::operator()(
     std::unique_ptr<Interpreter>* interpreter) {
+  return operator()(interpreter, /*num_threads=*/-1);
+}
+
+TfLiteStatus InterpreterBuilder::operator()(
+    std::unique_ptr<Interpreter>* interpreter, int num_threads) {
   if (!interpreter) {
     error_reporter_->Report(
         "Null output pointer passed to InterpreterBuilder.");
@@ -793,9 +862,8 @@ TfLiteStatus InterpreterBuilder::operator()(
   if ((**interpreter).AddTensors(tensors->Length()) != kTfLiteOk) {
     return cleanup_and_error();
   }
-
-  (**interpreter).set_model(model_);
-
+  // Set num threads
+  (**interpreter).SetNumThreads(num_threads);
   // Parse inputs/outputs
   (**interpreter).SetInputs(FlatBufferIntArrayToVector(subgraph->inputs()));
   (**interpreter).SetOutputs(FlatBufferIntArrayToVector(subgraph->outputs()));
diff --git a/tensorflow/contrib/lite/model.h b/tensorflow/contrib/lite/model.h
index 8dc1c794dce6f9df4b5d0c686c177ac94a086e2f..036dc46e03f565c40791aee55d4158cef5c832e0 100644
--- a/tensorflow/contrib/lite/model.h
+++ b/tensorflow/contrib/lite/model.h
@@ -41,6 +41,17 @@ limitations under the License.
 
 namespace tflite {
 
+// Abstract interface that verifies whether a given model is legit.
+// It facilitates the use-case to verify and build a model without loading it
+// twice.
+class TfLiteVerifier {
+ public:
+  // Returns true if the model is legit.
+  virtual bool Verify(const char* data, int length,
+                      ErrorReporter* reporter) = 0;
+  virtual ~TfLiteVerifier() {}
+};
+
 // An RAII object that represents a read-only tflite model, copied from disk,
 // or mmapped. This uses flatbuffers as the serialization format.
 class FlatBufferModel {
@@ -50,6 +61,12 @@ class FlatBufferModel {
       const char* filename,
       ErrorReporter* error_reporter = DefaultErrorReporter());
 
+  // Verifies whether the content of the file is legit, then builds a model
+  // based on the file. Returns a nullptr in case of failure.
+  static std::unique_ptr<FlatBufferModel> VerifyAndBuildFromFile(
+      const char* filename, TfLiteVerifier* verifier = nullptr,
+      ErrorReporter* error_reporter = DefaultErrorReporter());
+
   // Builds a model based on a pre-loaded flatbuffer. The caller retains
   // ownership of the buffer and should keep it alive until the returned object
   // is destroyed. Returns a nullptr in case of failure.
@@ -82,23 +99,9 @@ class FlatBufferModel {
   bool CheckModelIdentifier() const;
 
  private:
-  // Loads a model from `filename`. If `mmap_file` is true then use mmap,
-  // otherwise make a copy of the model in a buffer.
-  //
-  // Note, if `error_reporter` is null, then a DefaultErrorReporter() will be
-  // used.
-  explicit FlatBufferModel(
-      const char* filename, bool mmap_file = true,
-      ErrorReporter* error_reporter = DefaultErrorReporter(),
-      bool use_nnapi = false);
-
-  // Loads a model from `ptr` and `num_bytes` of the model file. The `ptr` has
-  // to remain alive and unchanged until the end of this flatbuffermodel's
-  // lifetime.
-  //
-  // Note, if `error_reporter` is null, then a DefaultErrorReporter() will be
-  // used.
-  FlatBufferModel(const char* ptr, size_t num_bytes,
+  // Loads a model from a given allocation. FlatBufferModel will take over the
+  // ownership of `allocation`, and delete it in desctructor.
+  FlatBufferModel(Allocation* allocation,
                   ErrorReporter* error_reporter = DefaultErrorReporter());
 
   // Loads a model from Model flatbuffer. The `model` has to remain alive and
@@ -151,6 +154,8 @@ class InterpreterBuilder {
   InterpreterBuilder(const InterpreterBuilder&) = delete;
   InterpreterBuilder& operator=(const InterpreterBuilder&) = delete;
   TfLiteStatus operator()(std::unique_ptr<Interpreter>* interpreter);
+  TfLiteStatus operator()(std::unique_ptr<Interpreter>* interpreter,
+                          int num_threads);
 
  private:
   TfLiteStatus BuildLocalIndexToRegistrationMapping();
diff --git a/tensorflow/contrib/lite/model_test.cc b/tensorflow/contrib/lite/model_test.cc
index 66f22fd66a9ae0d35553a1f780ef73a5c5994c99..ae6c1ece18963f11f48a6f07bea4065ce39687e0 100644
--- a/tensorflow/contrib/lite/model_test.cc
+++ b/tensorflow/contrib/lite/model_test.cc
@@ -209,6 +209,38 @@ TEST(BasicFlatBufferModel, TestNullModel) {
   ASSERT_EQ(interpreter.get(), nullptr);
 }
 
+// Mocks the verifier by setting the result in ctor.
+class FakeVerifier : public tflite::TfLiteVerifier {
+ public:
+  explicit FakeVerifier(bool result) : result_(result) {}
+  bool Verify(const char* data, int length,
+              tflite::ErrorReporter* reporter) override {
+    return result_;
+  }
+
+ private:
+  bool result_;
+};
+
+TEST(BasicFlatBufferModel, TestWithTrueVerifier) {
+  FakeVerifier verifier(true);
+  ASSERT_TRUE(FlatBufferModel::VerifyAndBuildFromFile(
+      "tensorflow/contrib/lite/testdata/test_model.bin",
+      &verifier));
+}
+
+TEST(BasicFlatBufferModel, TestWithFalseVerifier) {
+  FakeVerifier verifier(false);
+  ASSERT_FALSE(FlatBufferModel::VerifyAndBuildFromFile(
+      "tensorflow/contrib/lite/testdata/test_model.bin",
+      &verifier));
+}
+
+TEST(BasicFlatBufferModel, TestWithNullVerifier) {
+  ASSERT_TRUE(FlatBufferModel::VerifyAndBuildFromFile(
+      "tensorflow/contrib/lite/testdata/test_model.bin", nullptr));
+}
+
 struct TestErrorReporter : public ErrorReporter {
   int Report(const char* format, va_list args) override {
     calls++;
diff --git a/tensorflow/contrib/lite/models/BUILD b/tensorflow/contrib/lite/models/BUILD
index 6a1255b586ef04b80159156a78f0c4569a4661c5..efa47b06fa7f06cc6312535713ec582af4705d85 100644
--- a/tensorflow/contrib/lite/models/BUILD
+++ b/tensorflow/contrib/lite/models/BUILD
@@ -12,15 +12,3 @@ load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts")
 exports_files(glob([
     "testdata/*",
 ]))
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/models/smartreply/BUILD b/tensorflow/contrib/lite/models/smartreply/BUILD
index 733c3f4c7fa0605f24a1e6b4c458e34310c079c4..a82d1f2eb673b9b7211581f5a9f9febc140d4d1e 100644
--- a/tensorflow/contrib/lite/models/smartreply/BUILD
+++ b/tensorflow/contrib/lite/models/smartreply/BUILD
@@ -86,15 +86,3 @@ cc_test(
         "@com_google_googletest//:gtest",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/nnapi/BUILD b/tensorflow/contrib/lite/nnapi/BUILD
index 402f1e949b7bb576de4970a8ebb41541fcee1cb2..467a2b7a7bc9a40135428240585cd2c2a133cf9f 100644
--- a/tensorflow/contrib/lite/nnapi/BUILD
+++ b/tensorflow/contrib/lite/nnapi/BUILD
@@ -11,15 +11,3 @@ cc_library(
     ],
     linkopts = ["-ldl"],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
index bd49d327c995ef53dc6cf9f8301ab749c925b2c7..85aca3687402a89b557d76ab5ace80dea8f8b23d 100644
--- a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
+++ b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
@@ -22,7 +22,7 @@ limitations under the License.
 
 // helpers
 
-#define NNAPI_LOG(format, ...) printf(format "\n", __VA_ARGS__);
+#define NNAPI_LOG(format, ...) fprintf(stderr, format "\n", __VA_ARGS__);
 #define LOAD_FUNCTION(name) \
   static name##_fn fn = reinterpret_cast<name##_fn>(loadFunction(#name));
 #define EXECUTE_FUNCTION(...) \
diff --git a/tensorflow/contrib/lite/nnapi_delegate.cc b/tensorflow/contrib/lite/nnapi_delegate.cc
index 9d00d965d397d3af0dd454609901685ba965ded6..bc13444dc70f27e3360774e843985b6294be6996 100644
--- a/tensorflow/contrib/lite/nnapi_delegate.cc
+++ b/tensorflow/contrib/lite/nnapi_delegate.cc
@@ -162,7 +162,7 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
     };
 
     auto duplicate_state_tensor_float32 =
-        [interpreter, &nn_model, &augmented_inputs, &next_id](int tensor_id) {
+        [interpreter, &nn_model, &augmented_inputs](int tensor_id) {
           const TfLiteTensor* tensor = interpreter->tensor(tensor_id);
           CHECK_NN(ANeuralNetworksModel_setOperandValue(
               nn_model, tensor_id, tensor->data.raw, tensor->bytes));
@@ -349,6 +349,8 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
       case tflite::BuiltinOperator_DEQUANTIZE:
       case tflite::BuiltinOperator_DELEGATE:
       case tflite::BuiltinOperator_CAST:
+      case tflite::BuiltinOperator_PRELU:
+      case tflite::BuiltinOperator_MAXIMUM:
         FATAL("Op code %d is currently not delegated to NNAPI", builtin);
         nn_op_type = -1;  // set to invalid
         break;
diff --git a/tensorflow/contrib/lite/python/BUILD b/tensorflow/contrib/lite/python/BUILD
index 76607af07986f93abd8f7bf22f222d0153a3087c..e735062a7f2749c1e1c43e9c5f4971b3c7383387 100644
--- a/tensorflow/contrib/lite/python/BUILD
+++ b/tensorflow/contrib/lite/python/BUILD
@@ -69,7 +69,10 @@ py_test(
     name = "lite_test",
     srcs = ["lite_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_oss"],
+    tags = [
+        "no-internal-py3",
+        "no_oss",
+    ],
     deps = [
         ":lite",
         ":op_hint",
@@ -81,14 +84,38 @@ py_test(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
+py_binary(
+    name = "convert_saved_model",
+    srcs = ["convert_saved_model.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":lite",
+        "//tensorflow/contrib/saved_model:saved_model_py",
+        "//tensorflow/python:graph_util",
+        "//tensorflow/python/tools:freeze_graph_lib",
+    ],
+)
+
+py_test(
+    name = "convert_saved_model_test",
+    srcs = ["convert_saved_model_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_windows"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":convert_saved_model",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:session",
+        "//tensorflow/python/saved_model",
+    ],
+)
+
+# Transitive dependencies of this target will be included in the pip package.
+py_library(
+    name = "tf_lite_py_pip",
+    deps = [
+        ":convert_saved_model",
+    ],
 )
diff --git a/tensorflow/contrib/lite/python/convert_saved_model.py b/tensorflow/contrib/lite/python/convert_saved_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2b5ef488ec1feb455b2c8d5d1c4005c3b2f60d6
--- /dev/null
+++ b/tensorflow/contrib/lite/python/convert_saved_model.py
@@ -0,0 +1,262 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+r"""TensorFlow Lite flatbuffer generation from saved_models.
+
+Example:
+
+bazel run third_party/tensorflow/contrib/lite/python:convert_saved_model -- \
+  --saved_model_dir=/tmp/test_saved_model/1519865537 \
+  --output_tflite=/tmp/test.lite
+
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.lite.python import lite
+from tensorflow.contrib.saved_model.python.saved_model import reader
+from tensorflow.contrib.saved_model.python.saved_model import signature_def_utils
+from tensorflow.core.framework import types_pb2
+from tensorflow.python.client import session
+from tensorflow.python.framework import graph_util as tf_graph_util
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import app
+from tensorflow.python.platform import flags
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.saved_model import loader
+from tensorflow.python.saved_model import signature_constants
+from tensorflow.python.saved_model import tag_constants
+
+flags.DEFINE_string("saved_model_dir", "", "Saved model directory to convert.")
+flags.DEFINE_string("output_tflite", None, "File path to write flatbuffer.")
+flags.DEFINE_string("output_arrays", None,
+                    "List of output tensor names, the default value is None, "
+                    "which means the conversion will keep all outputs.")
+flags.DEFINE_integer("batch_size", 1,
+                     "If input tensor shape has None at first dimension, "
+                     "e.g. (None,224,224,3), replace None with batch_size.")
+flags.DEFINE_string("tag_set", tag_constants.SERVING,
+                    "Group of tag(s) of the MetaGraphDef in the saved_model, "
+                    "in string format, separated by ','. For tag-set contains "
+                    "multiple tags, all tags must be passed in.")
+flags.DEFINE_string("signature_key",
+                    signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY,
+                    "This is signature key to extract inputs, outputs.")
+
+
+def log_tensor_details(tensor_info):
+  """Log tensor details: name, shape, and type."""
+  for key in tensor_info:
+    val = tensor_info[key]
+    dtype = types_pb2.DataType.Name(val.dtype)
+    if val.tensor_shape.unknown_rank:
+      shape = "unknown_rank"
+    else:
+      dims = [str(dim.size) for dim in val.tensor_shape.dim]
+      shape = "({})".format(", ".join(dims))
+
+    logging.info("Tensor's key in saved_model's tensor_map: %s", key)
+    logging.info(" tensor name: %s, shape: %s, type: %s", val.name, shape,
+                 dtype)
+
+
+def get_meta_graph_def(saved_model_dir, tag_set):
+  """Validate saved_model and extract MetaGraphDef.
+
+  Args:
+    saved_model_dir: saved_model path to convert.
+    tag_set: Set of tag(s) of the MetaGraphDef to load.
+
+  Returns:
+    The meta_graph_def used for tflite conversion.
+
+  Raises:
+    ValueError: No valid MetaGraphDef for given tag_set.
+  """
+  saved_model = reader.read_saved_model(saved_model_dir)
+  tag_sets = []
+  result_meta_graph_def = None
+  for meta_graph_def in saved_model.meta_graphs:
+    meta_graph_tag_set = set(meta_graph_def.meta_info_def.tags)
+    tag_sets.append(meta_graph_tag_set)
+    if meta_graph_tag_set == tag_set:
+      result_meta_graph_def = meta_graph_def
+  logging.info("The given saved_model contains the following tags: %s",
+               tag_sets)
+  if result_meta_graph_def is not None:
+    return result_meta_graph_def
+  else:
+    raise ValueError("No valid MetaGraphDef for this tag_set '{}'. Possible "
+                     "values are '{}'. ".format(tag_set, tag_sets))
+
+
+def get_signature_def(meta_graph, signature_key):
+  """Get the signature def from meta_graph with given signature_key.
+
+  Args:
+    meta_graph: meta_graph_def.
+    signature_key: signature_def in the meta_graph_def.
+
+  Returns:
+    The signature_def used for tflite conversion.
+
+  Raises:
+    ValueError: Given signature_key is not valid for this meta_graph.
+  """
+  signature_def_map = meta_graph.signature_def
+  signature_def_keys = set(signature_def_map.keys())
+  logging.info(
+      "The given saved_model MetaGraphDef contains SignatureDefs with the "
+      "following keys: %s", signature_def_keys)
+  if signature_key not in signature_def_keys:
+    raise ValueError("No '{}' in the saved_model\'s SignatureDefs. Possible "
+                     "values are '{}'. ".format(signature_key,
+                                                signature_def_keys))
+  signature_def = signature_def_utils.get_signature_def_by_key(
+      meta_graph, signature_key)
+  return signature_def
+
+
+def get_inputs_outputs(signature_def):
+  """Get inputs and outputs from signature def.
+
+  Args:
+    signature_def: signatuer def in the meta_graph_def for conversion.
+
+  Returns:
+    The inputs and outputs in the graph for conversion.
+  """
+  inputs_tensor_info = signature_def.inputs
+  outputs_tensor_info = signature_def.outputs
+  logging.info("input tensors info: ")
+  log_tensor_details(inputs_tensor_info)
+  logging.info("output tensors info: ")
+  log_tensor_details(outputs_tensor_info)
+
+  def gather_names(tensor_info):
+    return [tensor_info[key].name for key in tensor_info]
+
+  inputs = gather_names(inputs_tensor_info)
+  outputs = gather_names(outputs_tensor_info)
+  return inputs, outputs
+
+
+def convert(saved_model_dir,
+            output_tflite=None,
+            output_arrays=None,
+            tag_set=None,
+            signature_key=signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY,
+            batch_size=1):
+  """Convert a saved_model to tflite flatbuffer.
+
+  Args:
+    saved_model_dir: Saved model directory to convert.
+    output_tflite: File path to write result flatbuffer.
+    output_arrays: List of output tensor names, the default value is None, which
+      means conversion keeps all output tensors. This is also used to filter
+      tensors that are from Op currently not supported in tflite, e.g., Argmax).
+    tag_set: This is the set of tags to get meta_graph_def in saved_model.
+    signature_key: This is the signature key to extract inputs, outputs.
+    batch_size: If input tensor shape has None at first dimension,
+      e.g. (None,224,224,3), replace None with batch_size.
+
+  Returns:
+    The converted data. For example if tflite was the destination, then
+    this will be a tflite flatbuffer in a bytes array.
+
+  Raises:
+    ValueError: If tag_set does not indicate any meta_graph_def in saved_model,
+      or signature_key is not in relevant meta_graph_def,
+      or input shape has None beyond 1st dimension, e.g., (1,None, None, 3),
+      or given output_arrays are not valid causing empty outputs.
+  """
+  if tag_set is None:
+    tag_set = set([tag_constants.SERVING])
+
+  meta_graph = get_meta_graph_def(saved_model_dir, tag_set)
+  signature_def = get_signature_def(meta_graph, signature_key)
+  inputs, outputs = get_inputs_outputs(signature_def)
+
+  graph = ops.Graph()
+  with session.Session(graph=graph) as sess:
+
+    loader.load(sess, meta_graph.meta_info_def.tags, saved_model_dir)
+
+    in_tensors = [graph.get_tensor_by_name(input_) for input_ in inputs]
+
+    # Users can use output_arrays to filter output tensors for conversion.
+    # If output_arrays is None, we keep all output tensors. In future, we may
+    # use tflite supported Op list and check whether op is custom Op to
+    # automatically filter output arrays.
+    # TODO(zhixianyan): Use tflite supported Op list to filter outputs.
+    if output_arrays is not None:
+      output_arrays = output_arrays.split(",")
+      out_tensors = [
+          graph.get_tensor_by_name(output)
+          for output in outputs
+          if output.split(":")[0] in output_arrays
+      ]
+    else:
+      out_tensors = [graph.get_tensor_by_name(output) for output in outputs]
+
+    output_names = [node.split(":")[0] for node in outputs]
+
+    if not out_tensors:
+      raise ValueError(
+          "No valid output tensors for '{}', possible values are '{}'".format(
+              output_arrays, output_names))
+
+    frozen_graph_def = tf_graph_util.convert_variables_to_constants(
+        sess, graph.as_graph_def(), output_names)
+
+    # Toco requires fully defined tensor shape, for input tensor with None in
+    # their shape, e.g., (None, 224, 224, 3), we need to replace first None with
+    # a given batch size. For shape with more None, e.g. (None, None, None, 3),
+    # still be able to replace and convert, but require further investigation.
+    # TODO(zhixianyan): Add supports for input tensor with more None in shape.
+    for i in range(len(in_tensors)):
+      shape = in_tensors[i].get_shape().as_list()
+      if shape[0] is None:
+        shape[0] = batch_size
+      if None in shape[1:]:
+        raise ValueError(
+            "Only support None shape at 1st dim as batch_size. But tensor "
+            "'{}' 's shape '{}' has None at other dimension. ".format(
+                inputs[i], shape))
+      in_tensors[i].set_shape(shape)
+
+    result = lite.toco_convert(frozen_graph_def, in_tensors, out_tensors)
+
+    if output_tflite is not None:
+      with gfile.Open(output_tflite, "wb") as f:
+        f.write(result)
+      logging.info("Successfully converted to: %s", output_tflite)
+
+    return result
+
+
+def main(_):
+  convert(
+      saved_model_dir=flags.FLAGS.saved_model_dir,
+      output_tflite=flags.FLAGS.output_tflite,
+      output_arrays=flags.FLAGS.output_arrays,
+      batch_size=flags.FLAGS.batch_size,
+      tag_set=set(flags.FLAGS.tag_set.split(",")),
+      signature_key=flags.FLAGS.signature_key)
+
+
+if __name__ == "__main__":
+  app.run(main)
diff --git a/tensorflow/contrib/lite/python/convert_saved_model_test.py b/tensorflow/contrib/lite/python/convert_saved_model_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d87fbeb91cc3d2779c0ae01aff488f88bd340c1c
--- /dev/null
+++ b/tensorflow/contrib/lite/python/convert_saved_model_test.py
@@ -0,0 +1,276 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TF Lite SavedModel Conversion test cases.
+
+ - test on generated saved_models from simple graphs (sanity check)
+ - test mnist savedmodel generated on-the-fly
+
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+from tensorflow.contrib.lite.python import convert_saved_model
+from tensorflow.python import estimator
+from tensorflow.python import keras
+from tensorflow.python import layers
+from tensorflow.python import losses
+from tensorflow.python import nn
+from tensorflow.python import saved_model
+from tensorflow.python import train
+from tensorflow.python.client import session
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.platform import test
+
+
+class ConvertSavedModelTestBasicGraph(test_util.TensorFlowTestCase):
+
+  def _createSimpleSavedModel(self, shape):
+    """Create a simple savedmodel on the fly."""
+    saved_model_dir = os.path.join(self.get_temp_dir(), "simple_savedmodel")
+    with session.Session() as sess:
+      in_tensor = array_ops.placeholder(shape=shape, dtype=dtypes.float32)
+      out_tensor = in_tensor + in_tensor
+      inputs = {"x": in_tensor}
+      outputs = {"y": out_tensor}
+      saved_model.simple_save(sess, saved_model_dir, inputs, outputs)
+    return saved_model_dir
+
+  def testSimpleSavedModel(self):
+    """Test a simple savedmodel created on the fly."""
+    # Create a simple savedmodel
+    saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3])
+    # Convert to tflite
+    result = convert_saved_model.convert(saved_model_dir=saved_model_dir)
+    self.assertTrue(result)
+
+  def testSimpleSavedModelWithNoneBatchSizeInShape(self):
+    """Test a simple savedmodel, with None in input tensor's shape."""
+    saved_model_dir = self._createSimpleSavedModel(shape=[None, 16, 16, 3])
+    result = convert_saved_model.convert(saved_model_dir=saved_model_dir)
+    self.assertTrue(result)
+
+  def testSimpleSavedModelWithMoreNoneInShape(self):
+    """Test a simple savedmodel, fail as more None in input shape."""
+    saved_model_dir = self._createSimpleSavedModel(shape=[None, 16, None, 3])
+    # Convert to tflite: this should raise ValueError, as 3rd dim is None.
+    with self.assertRaises(ValueError):
+      convert_saved_model.convert(saved_model_dir=saved_model_dir)
+
+  def testSimpleSavedModelWithWrongSignatureKey(self):
+    """Test a simple savedmodel, fail as given signature is invalid."""
+    saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3])
+    # Convert to tflite: this should raise ValueError, as
+    # signature_key does not exit in the saved_model.
+    with self.assertRaises(ValueError):
+      convert_saved_model.convert(
+          saved_model_dir=saved_model_dir, signature_key="wrong-key")
+
+  def testSimpleSavedModelWithWrongOutputArray(self):
+    """Test a simple savedmodel, fail as given output_arrays is invalid."""
+    # Create a simple savedmodel
+    saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3])
+    # Convert to tflite: this should raise ValueError, as
+    # output_arrays is not valid for the saved_model.
+    with self.assertRaises(ValueError):
+      convert_saved_model.convert(
+          saved_model_dir=saved_model_dir, output_arrays="wrong-output")
+
+  def testMultipleMetaGraphDef(self):
+    """Test saved model with multiple MetaGraphDef."""
+    saved_model_dir = os.path.join(self.get_temp_dir(), "savedmodel_two_mgd")
+    builder = saved_model.builder.SavedModelBuilder(saved_model_dir)
+    with session.Session(graph=ops.Graph()) as sess:
+      # MetaGraphDef 1
+      in_tensor = array_ops.placeholder(shape=[1, 28, 28], dtype=dtypes.float32)
+      out_tensor = in_tensor + in_tensor
+      sig_input_tensor = saved_model.utils.build_tensor_info(in_tensor)
+      sig_input_tensor_signature = {"x": sig_input_tensor}
+      sig_output_tensor = saved_model.utils.build_tensor_info(out_tensor)
+      sig_output_tensor_signature = {"y": sig_output_tensor}
+      predict_signature_def = (
+          saved_model.signature_def_utils.build_signature_def(
+              sig_input_tensor_signature, sig_output_tensor_signature,
+              saved_model.signature_constants.PREDICT_METHOD_NAME))
+      signature_def_map = {
+          saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
+              predict_signature_def
+      }
+      builder.add_meta_graph_and_variables(
+          sess,
+          tags=[saved_model.tag_constants.SERVING, "additional_test_tag"],
+          signature_def_map=signature_def_map)
+      # MetaGraphDef 2
+      builder.add_meta_graph(tags=["tflite"])
+      builder.save(True)
+
+    # Convert to tflite
+    convert_saved_model.convert(
+        saved_model_dir=saved_model_dir,
+        tag_set=set([saved_model.tag_constants.SERVING, "additional_test_tag"]))
+
+
+class Model(keras.Model):
+  """Model to recognize digits in the MNIST dataset.
+
+  Train and export savedmodel, used for testOnflyTrainMnistSavedModel
+
+  Network structure is equivalent to:
+  https://github.com/tensorflow/tensorflow/blob/r1.5/tensorflow/examples/tutorials/mnist/mnist_deep.py
+  and
+  https://github.com/tensorflow/models/blob/master/tutorials/image/mnist/convolutional.py
+
+  But written as a ops.keras.Model using the layers API.
+  """
+
+  def __init__(self, data_format):
+    """Creates a model for classifying a hand-written digit.
+
+    Args:
+      data_format: Either "channels_first" or "channels_last".
+        "channels_first" is typically faster on GPUs while "channels_last" is
+        typically faster on CPUs. See
+        https://www.tensorflow.org/performance/performance_guide#data_formats
+    """
+    super(Model, self).__init__()
+    self._input_shape = [-1, 28, 28, 1]
+
+    self.conv1 = layers.Conv2D(
+        32, 5, padding="same", data_format=data_format, activation=nn.relu)
+    self.conv2 = layers.Conv2D(
+        64, 5, padding="same", data_format=data_format, activation=nn.relu)
+    self.fc1 = layers.Dense(1024, activation=nn.relu)
+    self.fc2 = layers.Dense(10)
+    self.dropout = layers.Dropout(0.4)
+    self.max_pool2d = layers.MaxPooling2D(
+        (2, 2), (2, 2), padding="same", data_format=data_format)
+
+  def __call__(self, inputs, training):
+    """Add operations to classify a batch of input images.
+
+    Args:
+      inputs: A Tensor representing a batch of input images.
+      training: A boolean. Set to True to add operations required only when
+        training the classifier.
+
+    Returns:
+      A logits Tensor with shape [<batch_size>, 10].
+    """
+    y = array_ops.reshape(inputs, self._input_shape)
+    y = self.conv1(y)
+    y = self.max_pool2d(y)
+    y = self.conv2(y)
+    y = self.max_pool2d(y)
+    y = layers.flatten(y)
+    y = self.fc1(y)
+    y = self.dropout(y, training=training)
+    return self.fc2(y)
+
+
+def model_fn(features, labels, mode, params):
+  """The model_fn argument for creating an Estimator."""
+  model = Model(params["data_format"])
+  image = features
+  if isinstance(image, dict):
+    image = features["image"]
+
+  if mode == estimator.ModeKeys.PREDICT:
+    logits = model(image, training=False)
+    predictions = {
+        "classes": math_ops.argmax(logits, axis=1),
+        "probabilities": nn.softmax(logits),
+    }
+    return estimator.EstimatorSpec(
+        mode=estimator.ModeKeys.PREDICT,
+        predictions=predictions,
+        export_outputs={
+            "classify": estimator.export.PredictOutput(predictions)
+        })
+
+  elif mode == estimator.ModeKeys.TRAIN:
+    optimizer = train.AdamOptimizer(learning_rate=1e-4)
+
+    logits = model(image, training=True)
+    loss = losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
+    return estimator.EstimatorSpec(
+        mode=estimator.ModeKeys.TRAIN,
+        loss=loss,
+        train_op=optimizer.minimize(loss, train.get_or_create_global_step()))
+
+  elif mode == estimator.ModeKeys.EVAL:
+    logits = model(image, training=False)
+    loss = losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
+    return estimator.EstimatorSpec(
+        mode=estimator.ModeKeys.EVAL,
+        loss=loss,
+        eval_metric_ops={
+            "accuracy":
+                ops.metrics.accuracy(
+                    labels=labels, predictions=math_ops.argmax(logits, axis=1)),
+        })
+
+
+def dummy_input_fn():
+  image = random_ops.random_uniform([100, 784])
+  labels = random_ops.random_uniform([100, 1], maxval=9, dtype=dtypes.int32)
+  return image, labels
+
+
+class ConvertSavedModelTestTrainGraph(test_util.TensorFlowTestCase):
+
+  def testTrainedMnistSavedModel(self):
+    """Test mnist savedmodel, trained with dummy data and small steps."""
+    # Build classifier
+    classifier = estimator.Estimator(
+        model_fn=model_fn,
+        params={
+            "data_format": "channels_last"  # tflite format
+        })
+
+    # Train and pred for serving
+    classifier.train(input_fn=dummy_input_fn, steps=2)
+    image = array_ops.placeholder(dtypes.float32, [None, 28, 28])
+    pred_input_fn = estimator.export.build_raw_serving_input_receiver_fn({
+        "image": image,
+    })
+
+    # Export savedmodel
+    saved_model_dir = os.path.join(self.get_temp_dir(), "mnist_savedmodel")
+    classifier.export_savedmodel(saved_model_dir, pred_input_fn)
+
+    # Convert to tflite and test output
+    saved_model_name = os.listdir(saved_model_dir)[0]
+    saved_model_final_dir = os.path.join(saved_model_dir, saved_model_name)
+    output_tflite = os.path.join(saved_model_dir,
+                                 saved_model_final_dir + ".lite")
+    # TODO(zhixianyan): no need to limit output_arrays to `Softmax'
+    # once b/74205001 fixed and argmax implemented in tflite.
+    result = convert_saved_model.convert(
+        saved_model_dir=saved_model_final_dir,
+        output_arrays="Softmax",
+        output_tflite=output_tflite)
+
+    self.assertTrue(result)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/lite/python/interpreter.py b/tensorflow/contrib/lite/python/interpreter.py
index 5b5a7c3199a480d943aa4a8ce34710a703017d28..b8638007f7e49737726d9939a00e8cb1d6a41281 100644
--- a/tensorflow/contrib/lite/python/interpreter.py
+++ b/tensorflow/contrib/lite/python/interpreter.py
@@ -23,19 +23,33 @@ from tensorflow.contrib.lite.python.interpreter_wrapper import tensorflow_wrap_i
 class Interpreter(object):
   """Interpreter inferace for TF-Lite Models."""
 
-  def __init__(self, model_path):
+  def __init__(self, model_path=None, model_content=None):
     """Constructor.
 
     Args:
       model_path: Path to TF-Lite Flatbuffer file.
+      model_content: Content of model.
 
     Raises:
-      ValueError: If the interpreter was unable to open the model.
+      ValueError: If the interpreter was unable to create.
     """
-    self._interpreter = (
-        interpreter_wrapper.InterpreterWrapper_CreateWrapperCPP(model_path))
-    if not self._interpreter:
-      raise ValueError('Failed to open {}'.format(model_path))
+    if model_path and not model_content:
+      self._interpreter = (
+          interpreter_wrapper.InterpreterWrapper_CreateWrapperCPPFromFile(
+              model_path))
+      if not self._interpreter:
+        raise ValueError('Failed to open {}'.format(model_path))
+    elif model_content and not model_path:
+      self._interpreter = (
+          interpreter_wrapper.InterpreterWrapper_CreateWrapperCPPFromBuffer(
+              model_content, len(model_content)))
+      if not self._interpreter:
+        raise ValueError(
+            'Failed to create model from {} bytes'.format(len(model_content)))
+    elif not model_path and not model_path:
+      raise ValueError('`model_path` or `model_content` must be specified.')
+    else:
+      raise ValueError('Can\'t both provide `model_path` and `model_content`')
 
   def allocate_tensors(self):
     if not self._interpreter.AllocateTensors():
@@ -57,6 +71,7 @@ class Interpreter(object):
     tensor_name = self._interpreter.TensorName(tensor_index)
     tensor_size = self._interpreter.TensorSize(tensor_index)
     tensor_type = self._interpreter.TensorType(tensor_index)
+    tensor_quantization = self._interpreter.TensorQuantization(tensor_index)
 
     if not tensor_name or not tensor_type:
       raise ValueError('Could not get tensor details')
@@ -66,6 +81,7 @@ class Interpreter(object):
         'index': tensor_index,
         'shape': tensor_size,
         'dtype': tensor_type,
+        'quantization': tensor_quantization,
     }
 
     return details
diff --git a/tensorflow/contrib/lite/python/interpreter_test.py b/tensorflow/contrib/lite/python/interpreter_test.py
index e0215b721c1a7627ced1080def0fbd426c94c2f4..cd2386f5263f24e1e034015ec6880e71f0608c7c 100644
--- a/tensorflow/contrib/lite/python/interpreter_test.py
+++ b/tensorflow/contrib/lite/python/interpreter_test.py
@@ -17,6 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import io
 import numpy as np
 
 from tensorflow.contrib.lite.python import interpreter as interpreter_wrapper
@@ -29,7 +30,8 @@ class InterpreterTest(test_util.TensorFlowTestCase):
 
   def testFloat(self):
     interpreter = interpreter_wrapper.Interpreter(
-        resource_loader.get_path_to_datafile('testdata/permute_float.tflite'))
+        model_path=resource_loader.get_path_to_datafile(
+            'testdata/permute_float.tflite'))
     interpreter.allocate_tensors()
 
     input_details = interpreter.get_input_details()
@@ -37,12 +39,14 @@ class InterpreterTest(test_util.TensorFlowTestCase):
     self.assertEqual('input', input_details[0]['name'])
     self.assertEqual(np.float32, input_details[0]['dtype'])
     self.assertTrue(([1, 4] == input_details[0]['shape']).all())
+    self.assertEqual((0.0, 0), input_details[0]['quantization'])
 
     output_details = interpreter.get_output_details()
     self.assertEqual(1, len(output_details))
     self.assertEqual('output', output_details[0]['name'])
     self.assertEqual(np.float32, output_details[0]['dtype'])
     self.assertTrue(([1, 4] == output_details[0]['shape']).all())
+    self.assertEqual((0.0, 0), output_details[0]['quantization'])
 
     test_input = np.array([[1.0, 2.0, 3.0, 4.0]], dtype=np.float32)
     expected_output = np.array([[4.0, 3.0, 2.0, 1.0]], dtype=np.float32)
@@ -53,8 +57,12 @@ class InterpreterTest(test_util.TensorFlowTestCase):
     self.assertTrue((expected_output == output_data).all())
 
   def testUint8(self):
-    interpreter = interpreter_wrapper.Interpreter(
-        resource_loader.get_path_to_datafile('testdata/permute_uint8.tflite'))
+    model_path = resource_loader.get_path_to_datafile(
+        'testdata/permute_uint8.tflite')
+    with io.open(model_path, 'rb') as model_file:
+      data = model_file.read()
+
+    interpreter = interpreter_wrapper.Interpreter(model_content=data)
     interpreter.allocate_tensors()
 
     input_details = interpreter.get_input_details()
@@ -62,12 +70,14 @@ class InterpreterTest(test_util.TensorFlowTestCase):
     self.assertEqual('input', input_details[0]['name'])
     self.assertEqual(np.uint8, input_details[0]['dtype'])
     self.assertTrue(([1, 4] == input_details[0]['shape']).all())
+    self.assertEqual((1.0, 0), input_details[0]['quantization'])
 
     output_details = interpreter.get_output_details()
     self.assertEqual(1, len(output_details))
     self.assertEqual('output', output_details[0]['name'])
     self.assertEqual(np.uint8, output_details[0]['dtype'])
     self.assertTrue(([1, 4] == output_details[0]['shape']).all())
+    self.assertEqual((1.0, 0), output_details[0]['quantization'])
 
     test_input = np.array([[1, 2, 3, 4]], dtype=np.uint8)
     expected_output = np.array([[4, 3, 2, 1]], dtype=np.uint8)
diff --git a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc
index f30067de94cf631260d42ca1f0179ee4f65582c8..35ad226b78c906f0819afd5b029a1a0d438d69af 100644
--- a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc
+++ b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc
@@ -109,6 +109,13 @@ PyObject* PyArrayFromIntVector(const int* data, npy_intp size) {
   return PyArray_SimpleNewFromData(1, &size, NPY_INT32, pydata);
 }
 
+PyObject* PyTupleFromQuantizationParam(const TfLiteQuantizationParams& param) {
+  PyObject* result = PyTuple_New(2);
+  PyTuple_SET_ITEM(result, 0, PyFloat_FromDouble(param.scale));
+  PyTuple_SET_ITEM(result, 1, PyInt_FromLong(param.zero_point));
+  return result;
+}
+
 }  // namespace
 
 InterpreterWrapper::InterpreterWrapper(
@@ -214,6 +221,16 @@ PyObject* InterpreterWrapper::TensorSize(int i) const {
   return PyArray_Return(reinterpret_cast<PyArrayObject*>(np_array));
 }
 
+PyObject* InterpreterWrapper::TensorQuantization(int i) const {
+  if (!interpreter_ || i >= interpreter_->tensors_size() || i < 0) {
+    Py_INCREF(Py_None);
+    return Py_None;
+  }
+
+  const TfLiteTensor* tensor = interpreter_->tensor(i);
+  return PyTupleFromQuantizationParam(tensor->params);
+}
+
 bool InterpreterWrapper::SetTensor(int i, PyObject* value) {
   if (!interpreter_) {
     LOG(ERROR) << "Invalid interpreter.";
@@ -302,12 +319,19 @@ PyObject* InterpreterWrapper::GetTensor(int i) const {
   return PyArray_Return(reinterpret_cast<PyArrayObject*>(np_array));
 }
 
-InterpreterWrapper* InterpreterWrapper::CreateWrapperCPP(
+InterpreterWrapper* InterpreterWrapper::CreateWrapperCPPFromFile(
     const char* model_path) {
   std::unique_ptr<tflite::FlatBufferModel> model =
       tflite::FlatBufferModel::BuildFromFile(model_path);
   return model ? new InterpreterWrapper(std::move(model)) : nullptr;
 }
 
+InterpreterWrapper* InterpreterWrapper::CreateWrapperCPPFromBuffer(
+    const char* data, size_t len) {
+  std::unique_ptr<tflite::FlatBufferModel> model =
+      tflite::FlatBufferModel::BuildFromBuffer(data, len);
+  return model ? new InterpreterWrapper(std::move(model)) : nullptr;
+}
+
 }  // namespace interpreter_wrapper
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h
index dea71ca8794da3aea4a4266721df0a9ebbe6bb58..0972c572595f5044a305a81afaccbea5f131247c 100644
--- a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h
+++ b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h
@@ -37,7 +37,11 @@ namespace interpreter_wrapper {
 class InterpreterWrapper {
  public:
   // SWIG caller takes ownership of pointer.
-  static InterpreterWrapper* CreateWrapperCPP(const char* model_path);
+  static InterpreterWrapper* CreateWrapperCPPFromFile(const char* model_path);
+
+  // SWIG caller takes ownership of pointer.
+  static InterpreterWrapper* CreateWrapperCPPFromBuffer(const char* data,
+                                                        size_t len);
 
   ~InterpreterWrapper();
   bool AllocateTensors();
@@ -50,6 +54,7 @@ class InterpreterWrapper {
   std::string TensorName(int i) const;
   PyObject* TensorType(int i) const;
   PyObject* TensorSize(int i) const;
+  PyObject* TensorQuantization(int i) const;
   bool SetTensor(int i, PyObject* value);
   PyObject* GetTensor(int i) const;
 
diff --git a/tensorflow/contrib/lite/python/lite.py b/tensorflow/contrib/lite/python/lite.py
index 35d224924ee4d8cd94543e10c082afee25b7630e..ed6dd036f9fd9f39b74e902498d815793943924b 100644
--- a/tensorflow/contrib/lite/python/lite.py
+++ b/tensorflow/contrib/lite/python/lite.py
@@ -25,9 +25,9 @@ EXPERIMENTAL: APIs here are unstable and likely to change without notice.
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-import os
-import subprocess
-import tempfile
+import os as _os
+import subprocess as _subprocess
+import tempfile as _tempfile
 
 # pylint: disable=unused-import
 from tensorflow.contrib.lite.python.op_hint import convert_op_hints_to_stubs
@@ -74,7 +74,7 @@ else:
   _toco_from_proto_bin = _resource_loader.get_path_to_datafile(
       "../toco/python/toco_from_protos")
 
-if _toco_from_proto_bin and not os.path.exists(_toco_from_proto_bin):
+if _toco_from_proto_bin and not _os.path.exists(_toco_from_proto_bin):
   _toco_from_proto_bin = "toco_from_protos"
 
 
@@ -102,10 +102,10 @@ def toco_convert_protos(model_flags_str, toco_flags_str, input_data_str):
     return _toco_python.TocoConvert(
         model_flags_str, toco_flags_str, input_data_str)
 
-  with tempfile.NamedTemporaryFile() as fp_toco, \
-           tempfile.NamedTemporaryFile() as fp_model, \
-           tempfile.NamedTemporaryFile() as fp_input, \
-           tempfile.NamedTemporaryFile() as fp_output:
+  with _tempfile.NamedTemporaryFile() as fp_toco, \
+           _tempfile.NamedTemporaryFile() as fp_model, \
+           _tempfile.NamedTemporaryFile() as fp_input, \
+           _tempfile.NamedTemporaryFile() as fp_output:
     fp_model.write(model_flags_str)
     fp_toco.write(toco_flags_str)
     fp_input.write(input_data_str)
@@ -118,11 +118,11 @@ def toco_convert_protos(model_flags_str, toco_flags_str, input_data_str):
         fp_output.name
     ]
     cmdline = " ".join(cmd)
-    proc = subprocess.Popen(
+    proc = _subprocess.Popen(
         cmdline,
         shell=True,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.STDOUT,
+        stdout=_subprocess.PIPE,
+        stderr=_subprocess.STDOUT,
         close_fds=True)
     stdout, stderr = proc.communicate()
     exitcode = proc.returncode
diff --git a/tensorflow/contrib/lite/schema/BUILD b/tensorflow/contrib/lite/schema/BUILD
index a758c5e7e1434447e83c993034d6496bff8b2a48..246ec85fe47e496e157a91ab4ff84f6b1eeab4a4 100644
--- a/tensorflow/contrib/lite/schema/BUILD
+++ b/tensorflow/contrib/lite/schema/BUILD
@@ -1,8 +1,6 @@
-package(
-    default_visibility = [
-        "//visibility:public",
-    ],
-)
+package(default_visibility = [
+    "//visibility:public",
+])
 
 licenses(["notice"])  # Apache 2.0
 
@@ -72,16 +70,4 @@ cc_test(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tflite_portable_test_suite()
diff --git a/tensorflow/contrib/lite/schema/builtin_ops_header/generator.cc b/tensorflow/contrib/lite/schema/builtin_ops_header/generator.cc
index 08bcfe451685f488be2c3bc180f2dfc43dfe4f05..ac408d2f94b98d505afe4c951d7cc2ff960606fb 100644
--- a/tensorflow/contrib/lite/schema/builtin_ops_header/generator.cc
+++ b/tensorflow/contrib/lite/schema/builtin_ops_header/generator.cc
@@ -46,8 +46,7 @@ extern "C" {
 #endif  // __cplusplus
 
 // The enum for builtin operators.
-// Note: CUSTOM and DELEGATE are 2 special ops which are not real biultin
-// ops.
+// Note: CUSTOM and DELEGATE are 2 special ops which are not real built-in ops.
 typedef enum {
 )";
 
diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs
index 04387fed33fb8d99cd862a5c5fe0c8465f84337e..c63bfb28cc66494c3a42250632485c67861c5e9b 100644
--- a/tensorflow/contrib/lite/schema/schema.fbs
+++ b/tensorflow/contrib/lite/schema/schema.fbs
@@ -130,6 +130,8 @@ enum BuiltinOperator : byte {
   DELEGATE = 51,
   BIDIRECTIONAL_SEQUENCE_LSTM = 52,
   CAST = 53,
+  PRELU = 54,
+  MAXIMUM = 55,
 }
 
 // Options for the builtin operators.
@@ -172,6 +174,7 @@ union BuiltinOptions {
   LogSoftmaxOptions,
   CastOptions,
   DequantizeOptions,
+  MaximumOptions,
 }
 
 enum Padding : byte { SAME, VALID }
@@ -378,11 +381,16 @@ table LogSoftmaxOptions {
 }
 
 table CastOptions {
+  in_data_type: TensorType;
+  out_data_type: TensorType;
 }
 
 table DequantizeOptions {
 }
 
+table MaximumOptions {
+}
+
 // An OperatorCode can be an enum value (BuiltinOperator) if the operator is a
 // builtin, or a string if the operator is custom.
 table OperatorCode {
diff --git a/tensorflow/contrib/lite/schema/schema_generated.h b/tensorflow/contrib/lite/schema/schema_generated.h
index b922de20816bafc6e01c4bb66ddeef839be2bbd8..0735be5c8f1b1c8a87c3d47839ce54595d58af7d 100755
--- a/tensorflow/contrib/lite/schema/schema_generated.h
+++ b/tensorflow/contrib/lite/schema/schema_generated.h
@@ -145,6 +145,9 @@ struct CastOptionsT;
 struct DequantizeOptions;
 struct DequantizeOptionsT;
 
+struct MaximumOptions;
+struct MaximumOptionsT;
+
 struct OperatorCode;
 struct OperatorCodeT;
 
@@ -254,11 +257,13 @@ enum BuiltinOperator {
   BuiltinOperator_DELEGATE = 51,
   BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM = 52,
   BuiltinOperator_CAST = 53,
+  BuiltinOperator_PRELU = 54,
+  BuiltinOperator_MAXIMUM = 55,
   BuiltinOperator_MIN = BuiltinOperator_ADD,
-  BuiltinOperator_MAX = BuiltinOperator_CAST
+  BuiltinOperator_MAX = BuiltinOperator_MAXIMUM
 };
 
-inline BuiltinOperator (&EnumValuesBuiltinOperator())[52] {
+inline BuiltinOperator (&EnumValuesBuiltinOperator())[54] {
   static BuiltinOperator values[] = {
     BuiltinOperator_ADD,
     BuiltinOperator_AVERAGE_POOL_2D,
@@ -311,7 +316,9 @@ inline BuiltinOperator (&EnumValuesBuiltinOperator())[52] {
     BuiltinOperator_LOG_SOFTMAX,
     BuiltinOperator_DELEGATE,
     BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM,
-    BuiltinOperator_CAST
+    BuiltinOperator_CAST,
+    BuiltinOperator_PRELU,
+    BuiltinOperator_MAXIMUM
   };
   return values;
 }
@@ -372,6 +379,8 @@ inline const char **EnumNamesBuiltinOperator() {
     "DELEGATE",
     "BIDIRECTIONAL_SEQUENCE_LSTM",
     "CAST",
+    "PRELU",
+    "MAXIMUM",
     nullptr
   };
   return names;
@@ -422,11 +431,12 @@ enum BuiltinOptions {
   BuiltinOptions_LogSoftmaxOptions = 36,
   BuiltinOptions_CastOptions = 37,
   BuiltinOptions_DequantizeOptions = 38,
+  BuiltinOptions_MaximumOptions = 39,
   BuiltinOptions_MIN = BuiltinOptions_NONE,
-  BuiltinOptions_MAX = BuiltinOptions_DequantizeOptions
+  BuiltinOptions_MAX = BuiltinOptions_MaximumOptions
 };
 
-inline BuiltinOptions (&EnumValuesBuiltinOptions())[39] {
+inline BuiltinOptions (&EnumValuesBuiltinOptions())[40] {
   static BuiltinOptions values[] = {
     BuiltinOptions_NONE,
     BuiltinOptions_Conv2DOptions,
@@ -466,7 +476,8 @@ inline BuiltinOptions (&EnumValuesBuiltinOptions())[39] {
     BuiltinOptions_SplitOptions,
     BuiltinOptions_LogSoftmaxOptions,
     BuiltinOptions_CastOptions,
-    BuiltinOptions_DequantizeOptions
+    BuiltinOptions_DequantizeOptions,
+    BuiltinOptions_MaximumOptions
   };
   return values;
 }
@@ -512,6 +523,7 @@ inline const char **EnumNamesBuiltinOptions() {
     "LogSoftmaxOptions",
     "CastOptions",
     "DequantizeOptions",
+    "MaximumOptions",
     nullptr
   };
   return names;
@@ -678,6 +690,10 @@ template<> struct BuiltinOptionsTraits<DequantizeOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_DequantizeOptions;
 };
 
+template<> struct BuiltinOptionsTraits<MaximumOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_MaximumOptions;
+};
+
 struct BuiltinOptionsUnion {
   BuiltinOptions type;
   void *value;
@@ -1013,6 +1029,14 @@ struct BuiltinOptionsUnion {
     return type == BuiltinOptions_DequantizeOptions ?
       reinterpret_cast<const DequantizeOptionsT *>(value) : nullptr;
   }
+  MaximumOptionsT *AsMaximumOptions() {
+    return type == BuiltinOptions_MaximumOptions ?
+      reinterpret_cast<MaximumOptionsT *>(value) : nullptr;
+  }
+  const MaximumOptionsT *AsMaximumOptions() const {
+    return type == BuiltinOptions_MaximumOptions ?
+      reinterpret_cast<const MaximumOptionsT *>(value) : nullptr;
+  }
 };
 
 bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *obj, BuiltinOptions type);
@@ -3678,14 +3702,30 @@ flatbuffers::Offset<LogSoftmaxOptions> CreateLogSoftmaxOptions(flatbuffers::Flat
 
 struct CastOptionsT : public flatbuffers::NativeTable {
   typedef CastOptions TableType;
-  CastOptionsT() {
+  TensorType in_data_type;
+  TensorType out_data_type;
+  CastOptionsT()
+      : in_data_type(TensorType_FLOAT32),
+        out_data_type(TensorType_FLOAT32) {
   }
 };
 
 struct CastOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   typedef CastOptionsT NativeTableType;
+  enum {
+    VT_IN_DATA_TYPE = 4,
+    VT_OUT_DATA_TYPE = 6
+  };
+  TensorType in_data_type() const {
+    return static_cast<TensorType>(GetField<int8_t>(VT_IN_DATA_TYPE, 0));
+  }
+  TensorType out_data_type() const {
+    return static_cast<TensorType>(GetField<int8_t>(VT_OUT_DATA_TYPE, 0));
+  }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
+           VerifyField<int8_t>(verifier, VT_IN_DATA_TYPE) &&
+           VerifyField<int8_t>(verifier, VT_OUT_DATA_TYPE) &&
            verifier.EndTable();
   }
   CastOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
@@ -3696,6 +3736,12 @@ struct CastOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
 struct CastOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
+  void add_in_data_type(TensorType in_data_type) {
+    fbb_.AddElement<int8_t>(CastOptions::VT_IN_DATA_TYPE, static_cast<int8_t>(in_data_type), 0);
+  }
+  void add_out_data_type(TensorType out_data_type) {
+    fbb_.AddElement<int8_t>(CastOptions::VT_OUT_DATA_TYPE, static_cast<int8_t>(out_data_type), 0);
+  }
   explicit CastOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
         : fbb_(_fbb) {
     start_ = fbb_.StartTable();
@@ -3709,8 +3755,12 @@ struct CastOptionsBuilder {
 };
 
 inline flatbuffers::Offset<CastOptions> CreateCastOptions(
-    flatbuffers::FlatBufferBuilder &_fbb) {
+    flatbuffers::FlatBufferBuilder &_fbb,
+    TensorType in_data_type = TensorType_FLOAT32,
+    TensorType out_data_type = TensorType_FLOAT32) {
   CastOptionsBuilder builder_(_fbb);
+  builder_.add_out_data_type(out_data_type);
+  builder_.add_in_data_type(in_data_type);
   return builder_.Finish();
 }
 
@@ -3756,6 +3806,46 @@ inline flatbuffers::Offset<DequantizeOptions> CreateDequantizeOptions(
 
 flatbuffers::Offset<DequantizeOptions> CreateDequantizeOptions(flatbuffers::FlatBufferBuilder &_fbb, const DequantizeOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+struct MaximumOptionsT : public flatbuffers::NativeTable {
+  typedef MaximumOptions TableType;
+  MaximumOptionsT() {
+  }
+};
+
+struct MaximumOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef MaximumOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  MaximumOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(MaximumOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<MaximumOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const MaximumOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct MaximumOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit MaximumOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  MaximumOptionsBuilder &operator=(const MaximumOptionsBuilder &);
+  flatbuffers::Offset<MaximumOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<MaximumOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<MaximumOptions> CreateMaximumOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  MaximumOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<MaximumOptions> CreateMaximumOptions(flatbuffers::FlatBufferBuilder &_fbb, const MaximumOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct OperatorCodeT : public flatbuffers::NativeTable {
   typedef OperatorCode TableType;
   BuiltinOperator builtin_code;
@@ -3987,6 +4077,9 @@ struct Operator FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   const DequantizeOptions *builtin_options_as_DequantizeOptions() const {
     return builtin_options_type() == BuiltinOptions_DequantizeOptions ? static_cast<const DequantizeOptions *>(builtin_options()) : nullptr;
   }
+  const MaximumOptions *builtin_options_as_MaximumOptions() const {
+    return builtin_options_type() == BuiltinOptions_MaximumOptions ? static_cast<const MaximumOptions *>(builtin_options()) : nullptr;
+  }
   const flatbuffers::Vector<uint8_t> *custom_options() const {
     return GetPointer<const flatbuffers::Vector<uint8_t> *>(VT_CUSTOM_OPTIONS);
   }
@@ -4165,6 +4258,10 @@ template<> inline const DequantizeOptions *Operator::builtin_options_as<Dequanti
   return builtin_options_as_DequantizeOptions();
 }
 
+template<> inline const MaximumOptions *Operator::builtin_options_as<MaximumOptions>() const {
+  return builtin_options_as_MaximumOptions();
+}
+
 struct OperatorBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
@@ -5656,6 +5753,8 @@ inline CastOptionsT *CastOptions::UnPack(const flatbuffers::resolver_function_t
 inline void CastOptions::UnPackTo(CastOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
+  { auto _e = in_data_type(); _o->in_data_type = _e; };
+  { auto _e = out_data_type(); _o->out_data_type = _e; };
 }
 
 inline flatbuffers::Offset<CastOptions> CastOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const CastOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
@@ -5666,8 +5765,12 @@ inline flatbuffers::Offset<CastOptions> CreateCastOptions(flatbuffers::FlatBuffe
   (void)_rehasher;
   (void)_o;
   struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const CastOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  auto _in_data_type = _o->in_data_type;
+  auto _out_data_type = _o->out_data_type;
   return tflite::CreateCastOptions(
-      _fbb);
+      _fbb,
+      _in_data_type,
+      _out_data_type);
 }
 
 inline DequantizeOptionsT *DequantizeOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
@@ -5693,6 +5796,29 @@ inline flatbuffers::Offset<DequantizeOptions> CreateDequantizeOptions(flatbuffer
       _fbb);
 }
 
+inline MaximumOptionsT *MaximumOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new MaximumOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void MaximumOptions::UnPackTo(MaximumOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<MaximumOptions> MaximumOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const MaximumOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateMaximumOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<MaximumOptions> CreateMaximumOptions(flatbuffers::FlatBufferBuilder &_fbb, const MaximumOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const MaximumOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateMaximumOptions(
+      _fbb);
+}
+
 inline OperatorCodeT *OperatorCode::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new OperatorCodeT();
   UnPackTo(_o, _resolver);
@@ -6025,6 +6151,10 @@ inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *ob
       auto ptr = reinterpret_cast<const DequantizeOptions *>(obj);
       return verifier.VerifyTable(ptr);
     }
+    case BuiltinOptions_MaximumOptions: {
+      auto ptr = reinterpret_cast<const MaximumOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
     default: return false;
   }
 }
@@ -6195,6 +6325,10 @@ inline void *BuiltinOptionsUnion::UnPack(const void *obj, BuiltinOptions type, c
       auto ptr = reinterpret_cast<const DequantizeOptions *>(obj);
       return ptr->UnPack(resolver);
     }
+    case BuiltinOptions_MaximumOptions: {
+      auto ptr = reinterpret_cast<const MaximumOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
     default: return nullptr;
   }
 }
@@ -6353,6 +6487,10 @@ inline flatbuffers::Offset<void> BuiltinOptionsUnion::Pack(flatbuffers::FlatBuff
       auto ptr = reinterpret_cast<const DequantizeOptionsT *>(value);
       return CreateDequantizeOptions(_fbb, ptr, _rehasher).Union();
     }
+    case BuiltinOptions_MaximumOptions: {
+      auto ptr = reinterpret_cast<const MaximumOptionsT *>(value);
+      return CreateMaximumOptions(_fbb, ptr, _rehasher).Union();
+    }
     default: return 0;
   }
 }
@@ -6511,6 +6649,10 @@ inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u) FL
       value = new DequantizeOptionsT(*reinterpret_cast<DequantizeOptionsT *>(u.value));
       break;
     }
+    case BuiltinOptions_MaximumOptions: {
+      value = new MaximumOptionsT(*reinterpret_cast<MaximumOptionsT *>(u.value));
+      break;
+    }
     default:
       break;
   }
@@ -6708,6 +6850,11 @@ inline void BuiltinOptionsUnion::Reset() {
       delete ptr;
       break;
     }
+    case BuiltinOptions_MaximumOptions: {
+      auto ptr = reinterpret_cast<MaximumOptionsT *>(value);
+      delete ptr;
+      break;
+    }
     default: break;
   }
   value = nullptr;
diff --git a/tensorflow/contrib/lite/testing/BUILD b/tensorflow/contrib/lite/testing/BUILD
index 631601656dbad59b7cf0b9b81276d42ea9322cb6..62f20638bac943e9f674087e46c18233e8b09d63 100644
--- a/tensorflow/contrib/lite/testing/BUILD
+++ b/tensorflow/contrib/lite/testing/BUILD
@@ -35,11 +35,12 @@ gen_zipped_test_files(
         "l2norm.zip",
         "local_response_norm.zip",
         "log_softmax.zip",
-        "lstm.zip",
         "max_pool.zip",
+        "maximum.zip",
         "mean.zip",
         "mul.zip",
         "pad.zip",
+        "prelu.zip",
         "relu.zip",
         "relu1.zip",
         "relu6.zip",
@@ -373,16 +374,4 @@ tf_cc_test(
     }),
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tflite_portable_test_suite()
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index 420bdb41f140d2ef35c66f47f386b016eb9e60c0..80450524520892bb6f9d41f0d2c79f355ca3af15 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -617,6 +617,54 @@ def make_relu6_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
+def make_prelu_tests(zip_path):
+  """Make a set of tests to do PReLU."""
+
+  test_parameters = [{
+      # The canonical case for image processing is having a 4D `input` (NHWC)
+      # and `shared_axes`=[1, 2], so the alpha parameter is per channel.
+      "input_shape": [[1, 10, 10, 3], [3, 3, 3, 3]],
+      "shared_axes": [[1, 2], [1]],
+  }]
+
+  def build_graph(parameters):
+    """Build the graph for the test case."""
+
+    input_tensor = tf.placeholder(
+        dtype=tf.float32, name="input", shape=parameters["input_shape"])
+    prelu = tf.keras.layers.PReLU(shared_axes=parameters["shared_axes"])
+    out = prelu(input_tensor)
+    return [input_tensor], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    """Build the inputs for the test case."""
+
+    input_shape = parameters["input_shape"]
+    input_values = create_tensor_data(
+        np.float32, input_shape, min_value=-10, max_value=10)
+    shared_axes = parameters["shared_axes"]
+
+    alpha_shape = []
+    for dim in range(1, len(input_shape)):
+      alpha_shape.append(1 if dim in shared_axes else input_shape[dim])
+
+    alpha_values = create_tensor_data(np.float32, alpha_shape)
+
+    with tf.variable_scope("", reuse=True):
+      alpha = tf.get_variable("p_re_lu/alpha")
+      sess.run(alpha.assign(alpha_values))
+
+    return [input_values], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_values])))
+
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      use_frozen_graph=True)
+
+
 # This function tests various TensorFLow functions that generates Const op,
 # including `tf.ones`, `tf.zeros` and random functions.
 def make_constant_tests(zip_path):
@@ -706,7 +754,7 @@ def make_mean_tests(zip_path):
           [-1, -2, -3], [0, 0, 0], [2, 2, 0], [1, 0, -3, -3]
       ],
       "const_axis": [True, False],
-      "keep_dims": [True, False],
+      "keepdims": [True, False],
   }, {
       "input_dtype": [tf.float32, tf.int32, tf.int64],
       "input_shape": [[1, 224, 224, 3]],
@@ -717,7 +765,7 @@ def make_mean_tests(zip_path):
           [2, 2, 3], [-3, -3, -4], [-3, 2, 1]
       ],
       "const_axis": [True, False],
-      "keep_dims": [True, False],
+      "keepdims": [True, False],
   }]
 
   def build_graph(parameters):
@@ -740,7 +788,7 @@ def make_mean_tests(zip_path):
       input_tensors = [input_tensor, axis]
 
     out = tf.reduce_mean(
-        input_tensor, axis=axis, keep_dims=parameters["keep_dims"])
+        input_tensor, axis=axis, keepdims=parameters["keepdims"])
     return input_tensors, [out]
 
   def build_inputs(parameters, sess, inputs, outputs):
@@ -814,6 +862,41 @@ def make_log_softmax_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
+def make_maximum_tests(zip_path):
+  """Make a set of tests to do maximum."""
+
+  test_parameters = [{
+      "input_dtype": [tf.float32],
+      "input_shape_1": [[3], [1, 100], [4, 2, 3], [5, 224, 224, 3]],
+      "input_shape_2": [[3], [1, 100], [4, 2, 3], [5, 224, 224, 3]],
+  }]
+
+  def build_graph(parameters):
+    """Build the maximum op testing graph."""
+    input_tensor_1 = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input_1",
+        shape=parameters["input_shape_1"])
+    input_tensor_2 = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input_2",
+        shape=parameters["input_shape_2"])
+
+    out = tf.maximum(input_tensor_1, input_tensor_2)
+    return [input_tensor_1, input_tensor_2], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    values = [
+        create_tensor_data(parameters["input_dtype"],
+                           parameters["input_shape_1"]),
+        create_tensor_data(parameters["input_dtype"],
+                           parameters["input_shape_2"])
+    ]
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 def make_binary_op_tests_func(binary_operator):
   """Return a function that does a test on a binary operator."""
   return lambda zip_path: make_binary_op_tests(zip_path, binary_operator)
@@ -826,12 +909,11 @@ def make_gather_tests(zip_path):
       # TODO(mgubin): add string tests when they are supported by Toco.
       # TODO(mgubin): add tests for Nd indices when they are supported by
       # TfLite.
-      # TODO(mgubin): add tests for axis != 0 when it is supported by TfLite.
       "params_dtype": [tf.float32, tf.int32],
       "params_shape": [[10], [1, 2, 20]],
       "indices_dtype": [tf.int32],
       "indices_shape": [[3], [5]],
-      "axis": [0],  # axis!=0 is GatherV2
+      "axis": [0, 1],
   }]
 
   def build_graph(parameters):
@@ -1911,6 +1993,7 @@ def main(unused_args):
         "relu.zip": make_relu_tests,
         "relu1.zip": make_relu1_tests,
         "relu6.zip": make_relu6_tests,
+        "prelu.zip": make_prelu_tests,
         "l2_pool.zip": make_pool_tests(make_l2_pool),
         "avg_pool.zip": make_pool_tests(tf.nn.avg_pool),
         "max_pool.zip": make_pool_tests(tf.nn.max_pool),
@@ -1929,6 +2012,7 @@ def main(unused_args):
         "exp.zip": make_exp_tests,
         "log_softmax.zip": make_log_softmax_tests,
         "lstm.zip": make_lstm_tests,
+        "maximum.zip": make_maximum_tests,
     }
     out = FLAGS.zip_to_output
     bin_path = FLAGS.toco
diff --git a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
index 86606d12393b94567fbe1fceb6d708b266efe4a8..6697b86e798756bf3273e36dc105eee17d146aa6 100644
--- a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
+++ b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
@@ -47,10 +47,6 @@ tensorflow::Env* env = tensorflow::Env::Default();
 // Key is a substring of the test name and value is a bug number.
 // TODO(ahentz): make sure we clean this list up frequently.
 std::map<string, string> kBrokenTests = {
-    // Sub and Div don't support broadcasting.
-    {R"(^\/diva.*input_shape_1=\[1,3,4,3\],input_shape_2=\[3\])", "68500195"},
-    {R"(^\/suba.*input_shape_1=\[1,3,4,3\],input_shape_2=\[3\])", "68500195"},
-
     // Add only supports float32. (and "constant" tests use Add)
     {R"(^\/adda.*int32)", "68808744"},
     {R"(^\/constant.*int32)", "68808744"},
@@ -93,8 +89,11 @@ std::map<string, string> kBrokenTests = {
     // Transpose only supports 1D-4D input tensors.
     {R"(^\/transpose.*input_shape=\[.,.,.,.,.\])", "71545879"},
 
-    // Lstm kernel gets different results on tsan, asan, msan.
-    {R"(^\/lstmdtype=tf.float32.*)", "73830845"},
+    // PRelu only supports 4D input with (1, 1, channels) 3D alpha now.
+    {R"(^\/prelu.*shared_axes=\[1\])", "75975192"},
+
+    // No support for axis!=0 in GatherV2.
+    {R"(^\/gather.*axis=1)", "76910444"},
 };
 
 // Allows test data to be unzipped into a temporary directory and makes
@@ -238,41 +237,42 @@ TEST_P(OpsTest, RunStuff) {
 
 INSTANTIATE_TESTS(add)
 INSTANTIATE_TESTS(avg_pool)
-INSTANTIATE_TESTS(space_to_batch_nd)
 INSTANTIATE_TESTS(batch_to_space_nd)
 INSTANTIATE_TESTS(concat)
 INSTANTIATE_TESTS(constant)
 INSTANTIATE_TESTS(control_dep)
 INSTANTIATE_TESTS(conv)
 INSTANTIATE_TESTS(depthwiseconv)
+INSTANTIATE_TESTS(div)
 INSTANTIATE_TESTS(exp)
 INSTANTIATE_TESTS(fully_connected)
 INSTANTIATE_TESTS(fused_batch_norm)
 INSTANTIATE_TESTS(gather)
 INSTANTIATE_TESTS(global_batch_norm)
-INSTANTIATE_TESTS(l2norm)
 INSTANTIATE_TESTS(l2_pool)
+INSTANTIATE_TESTS(l2norm)
 INSTANTIATE_TESTS(local_response_norm)
 INSTANTIATE_TESTS(log_softmax)
+INSTANTIATE_TESTS(maximum)
 INSTANTIATE_TESTS(max_pool)
+INSTANTIATE_TESTS(mean)
 INSTANTIATE_TESTS(mul)
 INSTANTIATE_TESTS(pad)
 INSTANTIATE_TESTS(relu)
 INSTANTIATE_TESTS(relu1)
+INSTANTIATE_TESTS(prelu)
 INSTANTIATE_TESTS(relu6)
 INSTANTIATE_TESTS(reshape)
 INSTANTIATE_TESTS(resize_bilinear)
 INSTANTIATE_TESTS(sigmoid)
 INSTANTIATE_TESTS(softmax)
+INSTANTIATE_TESTS(space_to_batch_nd)
 INSTANTIATE_TESTS(space_to_depth)
-INSTANTIATE_TESTS(sub)
 INSTANTIATE_TESTS(split)
-INSTANTIATE_TESTS(div)
-INSTANTIATE_TESTS(transpose)
-INSTANTIATE_TESTS(lstm)
-INSTANTIATE_TESTS(mean)
 INSTANTIATE_TESTS(squeeze)
 INSTANTIATE_TESTS(strided_slice)
+INSTANTIATE_TESTS(sub)
+INSTANTIATE_TESTS(transpose)
 
 }  // namespace testing
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/testing/tflite_driver.cc b/tensorflow/contrib/lite/testing/tflite_driver.cc
index 613223f3d4ff212cb8672494243b2d7a1d06b3db..c399f4f2b78d7420ac6ea7098ed44b2122216279 100644
--- a/tensorflow/contrib/lite/testing/tflite_driver.cc
+++ b/tensorflow/contrib/lite/testing/tflite_driver.cc
@@ -56,12 +56,16 @@ void SetTensorData(const std::vector<T>& values, TfLitePtrUnion* data) {
 
 class TfLiteDriver::Expectation {
  public:
-  Expectation() { data_.raw = nullptr; }
+  Expectation() {
+    data_.raw = nullptr;
+    num_elements_ = 0;
+  }
   ~Expectation() { delete[] data_.raw; }
   template <typename T>
   void SetData(const string& csv_values) {
     const auto& values = testing::Split<T>(csv_values, ",");
-    data_.raw = new char[values.size() * sizeof(T)];
+    num_elements_ = values.size();
+    data_.raw = new char[num_elements_ * sizeof(T)];
     SetTensorData(values, &data_);
   }
 
@@ -88,7 +92,13 @@ class TfLiteDriver::Expectation {
     constexpr double kRelativeThreshold = 1e-2f;
     constexpr double kAbsoluteThreshold = 1e-4f;
 
-    int tensor_size = tensor.bytes / sizeof(T);
+    size_t tensor_size = tensor.bytes / sizeof(T);
+
+    if (tensor_size != num_elements_) {
+      std::cerr << "Expected a tensor with " << num_elements_
+                << " elements, got " << tensor_size << std::endl;
+      return false;
+    }
 
     bool good_output = true;
     for (int i = 0; i < tensor_size; ++i) {
@@ -115,6 +125,7 @@ class TfLiteDriver::Expectation {
   }
 
   TfLitePtrUnion data_;
+  size_t num_elements_;
 };
 
 TfLiteDriver::TfLiteDriver(bool use_nnapi) : use_nnapi_(use_nnapi) {}
diff --git a/tensorflow/contrib/lite/toco/BUILD b/tensorflow/contrib/lite/toco/BUILD
index 395abc532607d3cab49be6e995648f2839c1555c..8a35fb9034ca9cd1b9eb87956aed1eb96485dc9b 100644
--- a/tensorflow/contrib/lite/toco/BUILD
+++ b/tensorflow/contrib/lite/toco/BUILD
@@ -124,6 +124,7 @@ cc_library(
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
     ],
 )
 
@@ -142,6 +143,7 @@ cc_library(
         ":toco_graphviz_dump_options",
         ":toco_port",
         ":types_proto_cc",
+        "//tensorflow/cc/saved_model:tag_constants",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/strings",
@@ -167,6 +169,41 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "toco_saved_model",
+    srcs = [
+        "toco_saved_model.cc",
+    ],
+    hdrs = [
+        "toco_saved_model.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":model_cmdline_flags",
+        ":model_flags_proto_cc",
+        ":toco_flags_proto_cc",
+        ":types_proto_cc",
+        "//tensorflow/cc/tools:freeze_saved_model",
+        "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+tf_cc_test(
+    name = "toco_saved_model_test",
+    srcs = ["toco_saved_model_test.cc"],
+    deps = [
+        ":model_cmdline_flags",
+        ":toco_cmdline_flags",
+        ":toco_saved_model",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:scope",
+        "//tensorflow/core:test",
+        "@com_google_absl//absl/strings",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
 cc_library(
     name = "graph_transformations",
     srcs = [
@@ -193,9 +230,11 @@ cc_library(
         "graph_transformations/identify_lstm.cc",
         "graph_transformations/identify_lstm_merge_inputs.cc",
         "graph_transformations/identify_lstm_split_inputs.cc",
+        "graph_transformations/identify_prelu.cc",
         "graph_transformations/identify_relu1.cc",
         "graph_transformations/lstm_utils.cc",
         "graph_transformations/make_initial_dequantize_operator.cc",
+        "graph_transformations/merge_reshape_into_preceding_transpose.cc",
         "graph_transformations/propagate_activation_function_into_constants.cc",
         "graph_transformations/propagate_array_data_types.cc",
         "graph_transformations/propagate_fixed_sizes.cc",
@@ -213,7 +252,8 @@ cc_library(
         "graph_transformations/remove_trivial_reshape.cc",
         "graph_transformations/remove_trivial_slice.cc",
         "graph_transformations/remove_unused_op.cc",
-        "graph_transformations/reorder_activation_functions.cc",
+        "graph_transformations/reorder_elementwise_unary.cc",
+        "graph_transformations/reorder_reshape_transpose.cc",
         "graph_transformations/resolve_batch_normalization.cc",
         "graph_transformations/resolve_batch_to_space_nd_attributes.cc",
         "graph_transformations/resolve_constant_binary.cc",
@@ -221,6 +261,7 @@ cc_library(
         "graph_transformations/resolve_constant_fake_quant.cc",
         "graph_transformations/resolve_constant_fill.cc",
         "graph_transformations/resolve_constant_gather.cc",
+        "graph_transformations/resolve_constant_random_uniform.cc",
         "graph_transformations/resolve_constant_range.cc",
         "graph_transformations/resolve_constant_shape_or_rank.cc",
         "graph_transformations/resolve_constant_stack.cc",
@@ -362,6 +403,7 @@ tf_cc_binary(
         ":toco_cmdline_flags",
         ":toco_flags_proto_cc",
         ":toco_port",
+        ":toco_saved_model",
         ":toco_tooling",
         ":types_proto_cc",
         "//tensorflow/core:lib",
@@ -380,15 +422,3 @@ tf_cc_test(
         "@com_google_googletest//:gtest_main",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/toco/README.md b/tensorflow/contrib/lite/toco/README.md
index 281b2ea5e4c5553ff7aa240cdef3cb9819f19b49..522e260ad2a14c5f8e080c0a0f538f4192b7ed2d 100644
--- a/tensorflow/contrib/lite/toco/README.md
+++ b/tensorflow/contrib/lite/toco/README.md
@@ -1,26 +1,27 @@
-# The TensorFlow Lite Optimizing Converter
+# TOCO: TensorFlow Lite Optimizing Converter
 
-The TensorFlow Lite Optimizing Converter's most typical use is converting from the TensorFlow GraphDef to the TensorFlow Lite
-format, but it supports much more than that.
+The TensorFlow Lite Optimizing Converter converts TensorFlow graphs into
+TensorFlow Lite graphs. There are additional usages that are also detailed in
+the usage documentation.
 
 ## Usage documentation
 
 Usage information is given in these documents:
 
+*   [Command-line glossary](g3doc/cmdline_reference.md)
 *   [Command-line examples](g3doc/cmdline_examples.md)
-*   [Command-line reference](g3doc/cmdline_reference.md)
-*   [Python API](g3doc/python_api.md)
-
-## Design documentation
-
-Coming soon!
+*   [Python API examples](g3doc/python_api.md)
 
 ## Where the converter fits in the TensorFlow landscape
 
-In the typical case, an application developer is using TensorFlow to design and
-train models, then uses TensorFlow's freeze_graph.py to generate a frozen
-inference graph, then uses the converter to convert that into a TensorFlow Lite flatbuffer file,
-then ships that file to client devices where the TensorFlow Lite interpreter handles them
-on-device. This is represented in the following diagram:
-
-![drawing](https://storage.googleapis.com/download.tensorflow.org/example_images/tensorflow_landscape.svg)
+Once an application developer has a trained TensorFlow model, TOCO will accept
+that model and generate a TensorFlow Lite
+[FlatBuffer](https://google.github.io/flatbuffers/) file. TOCO currently supports
+[SavedModels](https://www.tensorflow.org/programmers_guide/saved_model#using_savedmodel_with_estimators)
+and frozen graphs (models generated via
+[freeze_graph.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/freeze_graph.py)).
+The TensorFlow Lite FlatBuffer file can be shipped to client devices, generally
+mobile devices, where the TensorFlow Lite interpreter handles them on-device.
+This flow is represented in the diagram below.
+
+![drawing](g3doc/toco_landscape.svg)
diff --git a/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc b/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
index 49cc1fc2aa365925cde86ceb658ff2b354d06911..621fbcb98db049f819ebbbda8816ad4e30538530 100644
--- a/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
+++ b/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
@@ -248,29 +248,49 @@ void AllocateTransientArrays(Model* model,
        op_index++) {
     const auto& op = model->operators[op_index];
     // Allocate those arrays whose lifespan starts exactly here.
+    std::vector<string> arrays_to_allocate;
     for (const auto& input : op->inputs) {
       if (StartsAt(array_lifespans[input], op_index)) {
-        AllocateTransientArray(*model, input, &allocator,
-                               transient_data_alignment);
+        if (std::find(arrays_to_allocate.begin(), arrays_to_allocate.end(),
+                      input) == arrays_to_allocate.end()) {
+          arrays_to_allocate.push_back(input);
+        }
       }
     }
     for (const auto& output : op->outputs) {
       if (StartsAt(array_lifespans[output], op_index)) {
-        AllocateTransientArray(*model, output, &allocator,
-                               transient_data_alignment);
+        if (std::find(arrays_to_allocate.begin(), arrays_to_allocate.end(),
+                      output) == arrays_to_allocate.end()) {
+          arrays_to_allocate.push_back(output);
+        }
       }
     }
+    for (const string& array : arrays_to_allocate) {
+      AllocateTransientArray(*model, array, &allocator,
+                             transient_data_alignment);
+    }
+
     // Deallocate those arrays whose lifespan ends exactly here.
+    std::vector<string> arrays_to_deallocate;
     for (const auto& input : op->inputs) {
       if (EndsAt(array_lifespans[input], op_index)) {
-        DeallocateTransientArray(*model, input, &allocator);
+        if (std::find(arrays_to_deallocate.begin(), arrays_to_deallocate.end(),
+                      input) == arrays_to_deallocate.end()) {
+          arrays_to_deallocate.push_back(input);
+        }
       }
     }
     for (const auto& output : op->outputs) {
       if (EndsAt(array_lifespans[output], op_index)) {
-        DeallocateTransientArray(*model, output, &allocator);
+        if (std::find(arrays_to_deallocate.begin(), arrays_to_deallocate.end(),
+                      output) == arrays_to_deallocate.end()) {
+          arrays_to_deallocate.push_back(output);
+        }
       }
     }
+    for (const string& array : arrays_to_deallocate) {
+      DeallocateTransientArray(*model, array, &allocator);
+    }
   }
 
   // Just out of curiosity (not used in the actual allocation process)
diff --git a/tensorflow/contrib/lite/toco/args.h b/tensorflow/contrib/lite/toco/args.h
index 59a6115920614d38900c0370708324c122384420..39e49bc347abaf151a1e3f9483a83eeac4e49c58 100644
--- a/tensorflow/contrib/lite/toco/args.h
+++ b/tensorflow/contrib/lite/toco/args.h
@@ -26,6 +26,7 @@ limitations under the License.
 #endif
 #include "absl/strings/numbers.h"
 #include "absl/strings/str_split.h"
+#include "tensorflow/cc/saved_model/tag_constants.h"
 #include "tensorflow/contrib/lite/toco/toco_port.h"
 #include "tensorflow/contrib/lite/toco/toco_types.h"
 
@@ -190,6 +191,7 @@ struct ParsedModelFlags {
   Arg<string> output_array;
   Arg<string> output_arrays;
   Arg<string> input_shapes;
+  Arg<int> batch_size = Arg<int>(1);
   Arg<float> mean_value = Arg<float>(0.f);
   Arg<string> mean_values;
   Arg<float> std_value = Arg<float>(1.f);
@@ -209,15 +211,18 @@ struct ParsedModelFlags {
   Arg<bool> allow_nonexistent_arrays = Arg<bool>(false);
   Arg<bool> allow_nonascii_arrays = Arg<bool>(false);
   Arg<string> arrays_extra_info_file;
+  Arg<string> model_flags_file;
 };
 
 // Flags that describe the operation you would like to do (what conversion
 // you want). See toco_cmdline_flags.cc for details.
 struct ParsedTocoFlags {
   Arg<string> input_file;
+  Arg<string> savedmodel_directory;
   Arg<string> output_file;
-  Arg<string> input_format;
-  Arg<string> output_format;
+  Arg<string> input_format = Arg<string>("TENSORFLOW_GRAPHDEF");
+  Arg<string> output_format = Arg<string>("TFLITE");
+  Arg<string> savedmodel_tagset = Arg<string>(tensorflow::kSavedModelTagServe);
   // TODO(aselle): command_line_flags  doesn't support doubles
   Arg<float> default_ranges_min = Arg<float>(0.);
   Arg<float> default_ranges_max = Arg<float>(0.);
diff --git a/tensorflow/contrib/lite/toco/export_tensorflow.cc b/tensorflow/contrib/lite/toco/export_tensorflow.cc
index 22a23357b36c16ea937e726f1e49aa95d7f964e3..5d51431005f38b7a21d268f04fed3280de25c313 100644
--- a/tensorflow/contrib/lite/toco/export_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/export_tensorflow.cc
@@ -357,6 +357,14 @@ void ConvertConvOperator(const Model& model, const ConvOperator& src_op,
   strides.mutable_list()->add_i(src_op.stride_height);
   strides.mutable_list()->add_i(src_op.stride_width);
   strides.mutable_list()->add_i(1);
+  if ((src_op.dilation_width_factor != 1) ||
+      (src_op.dilation_height_factor != 1)) {
+    auto& dilations = (*conv2d_op->mutable_attr())["dilations"];
+    dilations.mutable_list()->add_i(1);
+    dilations.mutable_list()->add_i(src_op.dilation_height_factor);
+    dilations.mutable_list()->add_i(src_op.dilation_width_factor);
+    dilations.mutable_list()->add_i(1);
+  }
   string padding;
   if (src_op.padding.type == PaddingType::kSame) {
     padding = "SAME";
@@ -391,84 +399,6 @@ void ConvertConvOperator(const Model& model, const ConvOperator& src_op,
   }
 }
 
-void ConvertDilatedConvOperator(const Model& model, const ConvOperator& src_op,
-                                GraphDef* tensorflow_graph) {
-  CHECK((src_op.dilation_width_factor > 1) ||
-        (src_op.dilation_height_factor > 1))
-      << "Conv operator must have height or width dilation factor > 1. "
-         "Otherwise, use regular conv op.";
-  CHECK_EQ(src_op.stride_width, 1)
-      << "Dilated AND strided convolution is unsupported";
-  CHECK_EQ(src_op.stride_height, 1)
-      << "Dilated AND strided convolution is unsupported";
-
-  // Emulate dilated convolution with a chain of SpaceToBatchND -> Conv ->
-  // BatchToSpaceND ops.
-
-  // Compute padding
-  const auto& input_array = model.GetArray(src_op.inputs[0]);
-  const auto& input_shape = input_array.shape();
-  CHECK_EQ(input_shape.dimensions_count(), 4);
-  int height_mod_dilation = input_shape.dims(1) % src_op.dilation_height_factor;
-  int pad_height;
-  if (height_mod_dilation) {
-    pad_height = src_op.dilation_height_factor - height_mod_dilation;
-  } else {
-    pad_height = 0;
-  }
-  int pad_width;
-  int width_mod_dilation = input_shape.dims(2) % src_op.dilation_width_factor;
-  if (width_mod_dilation) {
-    pad_width = src_op.dilation_width_factor - width_mod_dilation;
-  } else {
-    pad_width = 0;
-  }
-
-  // SpaceToBatchND op "collapses" the spatially separated elements together
-  string stb_output = src_op.outputs[0] + "/dilated_conv_SpaceToBatch";
-  auto* stb_op = tensorflow_graph->add_node();
-  stb_op->set_op("SpaceToBatchND");
-  stb_op->set_name(stb_output);
-  *stb_op->add_input() = src_op.inputs[0];
-  (*stb_op->mutable_attr())["T"].set_type(DT_FLOAT);
-  string block_shape = src_op.outputs[0] + "/dilated_conv_block_shape";
-  CreateIntTensorConst(
-      block_shape,
-      {src_op.dilation_height_factor, src_op.dilation_width_factor}, {2},
-      tensorflow_graph);
-  *stb_op->add_input() = block_shape;
-  (*stb_op->mutable_attr())["Tblock_shape"].set_type(DT_INT32);
-  string stb_paddings = src_op.outputs[0] + "/dilated_conv_paddings";
-  CreateIntTensorConst(stb_paddings, {0, pad_height, pad_width, 0}, {2, 2},
-                       tensorflow_graph);
-  *stb_op->add_input() = stb_paddings;
-  (*stb_op->mutable_attr())["Tpaddings"].set_type(DT_INT32);
-
-  // Perform a regular conv on the "collapsed" elements
-  ConvOperator conv_op;
-  string conv_output = src_op.outputs[0] + "/dilated_conv_Conv2D";
-  conv_op.inputs = src_op.inputs;
-  conv_op.inputs[0] = stb_output;
-  conv_op.outputs = {conv_output};
-  conv_op.padding.type = src_op.padding.type;
-  conv_op.stride_width = src_op.stride_width;
-  conv_op.stride_height = src_op.stride_height;
-  conv_op.dilation_width_factor = 1;
-  conv_op.dilation_height_factor = 1;
-  ConvertConvOperator(model, conv_op, tensorflow_graph);
-
-  // BatchToSpaceND op restores elements to their original layout
-  auto* bts_op = tensorflow_graph->add_node();
-  bts_op->set_op("BatchToSpaceND");
-  bts_op->set_name(src_op.outputs[0]);
-  *bts_op->add_input() = conv_output;
-  (*bts_op->mutable_attr())["T"].set_type(DT_FLOAT);
-  *bts_op->add_input() = block_shape;
-  (*bts_op->mutable_attr())["Tblock_shape"].set_type(DT_INT32);
-  *bts_op->add_input() = stb_paddings;
-  (*bts_op->mutable_attr())["Tcrops"].set_type(DT_INT32);
-}
-
 void ConvertDepthwiseConvOperator(const Model& model,
                                   const DepthwiseConvOperator& src_op,
                                   GraphDef* tensorflow_graph) {
@@ -1711,6 +1641,23 @@ void ConvertTopKV2Operator(const Model& model, const TopKV2Operator& src_op,
   (*topk_op->mutable_attr())["sorted"].set_b(true);
 }
 
+void ConvertRandomUniformOperator(const Model& model,
+                                  const RandomUniformOperator& src_op,
+                                  GraphDef* tensorflow_graph) {
+  CHECK(tensorflow_graph != nullptr);
+  auto* new_op = tensorflow_graph->add_node();
+  new_op->set_op("RandomUniform");
+  CHECK_EQ(src_op.inputs.size(), 1);
+  new_op->set_name(src_op.outputs[0]);
+  *new_op->add_input() = src_op.inputs[0];
+  const auto shape_type = GetTensorFlowDataType(model, src_op.inputs[0]);
+  (*new_op->mutable_attr())["T"].set_type(shape_type);
+  (*new_op->mutable_attr())["dtype"].set_type(
+      GetTensorFlowDataType(src_op.dtype));
+  (*new_op->mutable_attr())["seed"].set_i(src_op.seed);
+  (*new_op->mutable_attr())["seed2"].set_i(src_op.seed2);
+}
+
 void ConvertOperator(const Model& model, const Operator& src_op,
                      GraphDef* tensorflow_graph) {
   if (src_op.fused_activation_function != FusedActivationFunctionType::kNone) {
@@ -1719,13 +1666,8 @@ void ConvertOperator(const Model& model, const Operator& src_op,
   }
 
   if (src_op.type == OperatorType::kConv) {
-    const ConvOperator& conv_op = static_cast<const ConvOperator&>(src_op);
-    if ((conv_op.dilation_width_factor != 1) ||
-        (conv_op.dilation_height_factor != 1)) {
-      return ConvertDilatedConvOperator(model, conv_op, tensorflow_graph);
-    } else {
-      ConvertConvOperator(model, conv_op, tensorflow_graph);
-    }
+    ConvertConvOperator(model, static_cast<const ConvOperator&>(src_op),
+                        tensorflow_graph);
   } else if (src_op.type == OperatorType::kDepthwiseConv) {
     ConvertDepthwiseConvOperator(
         model, static_cast<const DepthwiseConvOperator&>(src_op),
@@ -1897,6 +1839,10 @@ void ConvertOperator(const Model& model, const Operator& src_op,
     ConvertTransposeConvOperator(
         model, static_cast<const TransposeConvOperator&>(src_op),
         tensorflow_graph);
+  } else if (src_op.type == OperatorType::kRandomUniform) {
+    ConvertRandomUniformOperator(
+        model, static_cast<const RandomUniformOperator&>(src_op),
+        tensorflow_graph);
   } else {
     LOG(FATAL) << "Unhandled operator type " << OperatorTypeName(src_op.type);
   }
diff --git a/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md b/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md
index 372c52558973f4aacc180ac44b9e95a5e9b199ef..495014c6fc67ab0ad7c975d0570034545e90f9bc 100644
--- a/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md
+++ b/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md
@@ -1,73 +1,72 @@
 # TensorFlow Lite Optimizing Converter command-line examples
 
-This page is a guide to using the TensorFlow Lite Optimizing Converter by
-looking at some example command lines. It is complemented by the following other
-documents:
+This page provides examples on how to use TOCO via command line. It is
+complemented by the following documents:
 
 *   [README](../README.md)
-*   [Command-line reference](cmdline_reference.md)
+*   [Command-line glossary](cmdline_reference.md)
+*   [Python API examples](python_api.md)
 
 Table of contents:
 
-[TOC]
-
-## Convert a TensorFlow GraphDef to TensorFlow Lite for float inference
-
-In this example, we look at the most common task: we have an ordinary TensorFlow
-GraphDef and want to convert it to a TensorFlow Lite flatbuffer to perform
-floating-point inference.
+*   [Convert a TensorFlow SavedModel to TensorFlow Lite](#savedmodel)
+*   [Convert a TensorFlow GraphDef to TensorFlow Lite for float
+    inference](#graphdef-float)
+*   [Quantization](#quantization)
+    *   [Convert a TensorFlow GraphDef to TensorFlow Lite for quantized
+        inference](#graphdef-quant)
+    *   [Use "dummy-quantization" to try out quantized inference on a float
+        graph](#dummy-quant)
+*   [Specifying input and output arrays](#specifying-input-and-output-arrays)
+    *   [Multiple output arrays](#multiple-output-arrays)
+    *   [Multiple input arrays](#multiple-input-arrays)
+    *   [Specifying subgraphs](#specifying-subgraphs)
+*   [Other conversions supported by TOCO](#other-conversions)
+    *   [Optimize a TensorFlow GraphDef](#optimize-graphdef)
+    *   [Convert a TensorFlow Lite FlatBuffer back into TensorFlow GraphDef
+        format](#to-graphdef)
+*   [Logging](#logging)
+    *   [Standard logging](#standard-logging)
+    *   [Verbose logging](#verbose-logging)
+    *   [Graph "video" logging](#graph-video-logging)
+*   [Graph visualizations](#graph-visualizations)
+    *   [Using --output_format=GRAPHVIZ_DOT](#using-output-formatgraphviz-dot)
+    *   [Using --dump_graphviz](#using-dump-graphviz)
+    *   [Legend for the graph visualizations](#graphviz-legend)
+
+## Convert a TensorFlow SavedModel to TensorFlow Lite <a name="savedmodel"></a>
+
+The follow example converts a basic TensorFlow SavedModel into a Tensorflow Lite
+FlatBuffer to perform floating-point inference.
 
 ```
-curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_0.50_128_frozen.tgz \
-  | tar xzv -C /tmp
 bazel run --config=opt \
-  //tensorflow/contrib/lite/toco:toco -- \
-  --input_file=/tmp/mobilenet_v1_0.50_128/frozen_graph.pb \
-  --output_file=/tmp/foo.tflite \
-  --input_format=TENSORFLOW_GRAPHDEF \
-  --output_format=TFLITE \
-  --inference_type=FLOAT \
-  --input_shape=1,128,128,3 \
-  --input_array=input \
-  --output_array=MobilenetV1/Predictions/Reshape_1
+  third_party/tensorflow/contrib/lite/toco:toco -- \
+  --savedmodel_directory=/tmp/saved_model \
+  --output_file=/tmp/foo.tflite
 ```
 
-To explain each of these flags:
-
-*   `--input_format` and `--output_format` determine the formats of the input
-    and output files: here we are converting from `TENSORFLOW_GRAPHDEF` to
-    `TFLITE`.
-*   `--input_file` specifies the path of the input file, to be converted. When
-    `--input_format=TENSORFLOW_GRAPHDEF`, this file should be a
-    *[frozen](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/freeze_graph.py)*
-    *inference* graph. Being frozen means in particular that the input file is
-    self-contained, and does not reference any external "checkpoint" file. An
-    *inference* graph is a version of a graph meant to be used for inference,
-    typically not the same graph file as was used for training a given model.
-*   `--output_file` specifies the destination to write the converted file to.
-*   `--input_array` specifies the input activations, that is, the input "tensor"
-    in the input TensorFlow GraphDef file. The array designated by
-    `--input_array` is the one that the user will have to provide the contents
-    of as input to the runtime inference code.
-*   `--output_array` specifies the output activations, that is, the output
-    "tensor" in the input TensorFlow GraphDef file. The runtime inference code
-    will store its results in the array designated by `--output_array`.
-*   `--input_shape` specifies the shape of the input array. It is currently
-    required, but the plan is for a future version to no longer require it,
-    allowing to defer the specification of the input shape until runtime. The
-    format of `input_shape` is always a comma-separated list of dimensions,
-    always in TensorFlow convention.
-*   `--inference_type` specifies what type of arithmetic the output file should
-    be relying on. It implies in particular the choice of type of the output
-    arrays in the output file.
-
-## Just optimize a TensorFlow GraphDef
+[SavedModel](https://www.tensorflow.org/programmers_guide/saved_model#using_savedmodel_with_estimators)
+has fewer required flags than frozen graphs (described [below](#graphdef-float))
+due to access to additional data contained within the SavedModel. The values for
+`--input_arrays` and `--output_arrays` are an aggregated, alphabetized list of
+the inputs and outputs in the
+[SignatureDefs](https://www.tensorflow.org/serving/signature_defs) within the
+[MetaGraphDef](https://www.tensorflow.org/programmers_guide/saved_model#apis_to_build_and_load_a_savedmodel)
+specified by `--savedmodel_tagset`. The value for `input_shapes` is
+automatically determined from the MetaGraphDef whenever possible. The default
+value for `--inference_type` for SavedModels is `FLOAT`.
 
-The converter accepts both TENSORFLOW_GRAPHDEF and TFLITE file formats as both
-`--input_format` and `--output_format`. This means that conversion from and to
-any supported format is possible, and in particular, same-format "conversions"
-are possible, and effectively ask the converter to optimize and simplify a
-graph. Example:
+There is currently no support for MetaGraphDefs without a SignatureDef or for
+MetaGraphDefs that use the [`assets/`
+directory](https://www.tensorflow.org/programmers_guide/saved_model#structure_of_a_savedmodel_directory).
+
+## Convert a TensorFlow GraphDef to TensorFlow Lite for float inference <a name="graphdef-float"></a>
+
+The follow example converts a basic TensorFlow GraphDef (frozen by
+[freeze_graph.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/freeze_graph.py))
+into a TensorFlow Lite FlatBuffer to perform floating-point inference. Frozen
+graphs contain the variables stored in Checkpoint files as Const ops.
 
 ```
 curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_0.50_128_frozen.tgz \
@@ -75,56 +74,27 @@ curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_
 bazel run --config=opt \
   //tensorflow/contrib/lite/toco:toco -- \
   --input_file=/tmp/mobilenet_v1_0.50_128/frozen_graph.pb \
-  --output_file=/tmp/foo.pb \
-  --input_format=TENSORFLOW_GRAPHDEF \
-  --output_format=TENSORFLOW_GRAPHDEF \
+  --output_file=/tmp/foo.tflite \
+  --inference_type=FLOAT \
   --input_shape=1,128,128,3 \
   --input_array=input \
   --output_array=MobilenetV1/Predictions/Reshape_1
 ```
 
-Here we did not pass `--inference_type` because it is not considered applicable
-to the TensorFlow GraphDef format (as far as we are concerned, TensorFlow
-GraphDefs are technically always float, and the only flavor of "quantized"
-GraphDef that the converter deals with is "FakeQuantized" graphs that are still
-technically float graphs).
+## Quantization
 
-Below in the section about passing arbitrary input/output arrays we give another
-example, using the converter to extract just a sub-graph from a TensorFlow
-GraphDef.
+### Convert a TensorFlow GraphDef to TensorFlow Lite for quantized inference <a name="graphdef-quant"></a>
 
-## Convert a TensorFlow Lite flatbuffer back into TensorFlow GraphDef format
+TOCO is compatible with fixed point quantization models described
+[here](https://www.tensorflow.org/performance/quantization). These are float
+models with
+[`FakeQuant*`](https://www.tensorflow.org/api_guides/python/array_ops#Fake_quantization)
+ops inserted at the boundaries of fused layers to record min-max range
+information. This generates a quantized inference workload that reproduces the
+quantization behavior that was used during training.
 
-As we mentioned that the converter supports file format conversions in any
-direction, let us just give an example of that:
-
-```
-bazel run --config=opt \
-  //tensorflow/contrib/lite/toco:toco -- \
-  --input_file=/tmp/foo.tflite \
-  --output_file=/tmp/foo.pb \
-  --input_format=TFLITE \
-  --output_format=TENSORFLOW_GRAPHDEF \
-  --input_shape=1,128,128,3 \
-  --input_array=input \
-  --output_array=MobilenetV1/Predictions/Reshape_1
-```
-
-## Convert a TensorFlow GraphDef to TensorFlow Lite for quantized inference
-
-Let us now look at a quantized model. As mentioned above, the only flavor of
-quantized TensorFlow GraphDefs that the converter is concerned with, is
-"FakeQuantized" models. These are technically float models, but with special
-`FakeQuant*` ops inserted at the boundaries of fused layers to record min-max
-range information allowing to generate a quantized inference workload that is
-able to reproduce exactly the specific quantization behavior that was used
-during training. Indeed, the whole point of quantized training is to allow for
-both training and inference to perform exactly the same arithmetic, so that the
-way that the training process about around quantization inaccuracy is
-effectively helping the quantized inference process to be more accurate.
-
-Given a quantized TensorFlow GraphDef, generating a quantized TensorFlow Lite
-flatbuffer is done like this:
+The following command generates a quantized TensorFlow Lite FlatBuffer from a
+"quantized" TensorFlow GraphDef.
 
 ```
 bazel run --config=opt \
@@ -141,36 +111,17 @@ bazel run --config=opt \
   --std_value=127
 ```
 
-Here, besides changing `--input_file` to point to a (fake-)quantized GraphDef,
-the only other changes are:
-
-*   To change `--inference_type` to `QUANTIZED_UINT8`. This effectively tells
-    the converter to generate an output file that performs quantized inference
-    on a quantized input.
-*   To pass `--mean_value` and `--std_value` flags to describe how the quantized
-    uint8 input array values are to be interpreted as the mathematical real
-    numbers that the graph is concerned with (keep in mind that even a
-    "fake-quantized" TensorFlow GraphDef is still technically a float graph).
-    The meaning of `--mean_value` and `--std_value` is explained in the
-    command-line reference; it suffices for now to say that they are a property
-    of each model.
+### Use \"dummy-quantization\" to try out quantized inference on a float graph <a name="dummy-quant"></a>
 
-## Use dummy-quantization to try out quantized inference on a float graph
+In order to evaluate the possible benefit of generating a quantized graph, TOCO
+allows "dummy-quantization" on float graphs. The flags `--default_ranges_min`
+and `--default_ranges_max` accept plausable values for the min-max ranges of the
+values in all arrays that do not have min-max information. "Dummy-quantization"
+will produce lower accuracy but will emulate the performance of a correctly
+quantized model.
 
-Sometimes, one only has a plain float graph, and one is curious as to how much
-faster inference might run if one could perform quantized inference instead of
-float inference. Rather than requiring users to first invest in quantizing their
-graphs before they can evaluate a possible benefit, the converter allows to
-simply experiment with what we call "dummy quantization": provide some vaguely
-plausible values for the min-max ranges of values in all arrays that do not have
-min-max information, so that quantization can carry on, certainly producing
-inaccurate results (do not use that in production!) but with performance
-characteristics that should be identical to those of an actually quantized
-flavor of the model.
-
-In the present example, we have a model using Relu6 activation functions almost
-everywhere, so a reasonable guess is that most activation ranges should be
-contained in [0, 6] and roughly comparable to it.
+The example below contains a model using Relu6 activation functions. Therefore,
+a reasonable guess is that most activation ranges should be contained in [0, 6].
 
 ```
 curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_0.50_128_frozen.tgz \
@@ -191,15 +142,13 @@ bazel run --config=opt \
   --std_value=127.5
 ```
 
-## Multiple output arrays
+## Specifying input and output arrays
 
-Some models have multiple outputs. Even in a model with only one output, you may
-want for the inference code to return the contents of other arrays as well, or
-to perform inference on a subgraph with multiple outputs (see the section below
-on specifying arbitrary arrays as input/output arrays).
+### Multiple output arrays
 
-Either way, using `--output_arrays` instead of `--output_array` allows to
-specify a comma-separated list of output arrays.
+The flag `output_arrays` takes in a comma-separated list of output arrays as
+seen in the example below. This is useful for models or subgraphs with multiple
+outputs.
 
 ```
 curl https://storage.googleapis.com/download.tensorflow.org/models/inception_v1_2016_08_28_frozen.pb.tar.gz \
@@ -216,18 +165,11 @@ bazel run --config=opt \
   --output_arrays=InceptionV1/InceptionV1/Mixed_3b/Branch_1/Conv2d_0a_1x1/Relu,InceptionV1/InceptionV1/Mixed_3b/Branch_2/Conv2d_0a_1x1/Relu
 ```
 
-## Multiple input arrays
-
-Some models have multiple inputs; even in a model with a single input, you may
-want for the inference code to implement only a subgraph with multiple inputs
-(see the section below on specifying arbitrary arrays as input/output arrays).
+### Multiple input arrays
 
-Either way, multiple input arrays are specified by using `--input_arrays`
-instead of `--input_array` to specify a comma-separated list of input arrays. In
-that case, one also needs to use `--input_shapes` instead of `--input_shape`.
-The syntax for `--input_shapes` is a bit trickier, since already the singular
-`--input_shape` was a comma-separated list of integers! Multiple input shapes
-are delimited by a colon (`:`) in `--input_shapes`.
+The flag `input_arrays` takes in a comma-separated list of input arrays as seen
+in the example below. This is useful for models or subgraphs with multiple
+inputs.
 
 ```
 curl https://storage.googleapis.com/download.tensorflow.org/models/inception_v1_2016_08_28_frozen.pb.tar.gz \
@@ -244,54 +186,93 @@ bazel run --config=opt \
   --output_array=InceptionV1/Logits/Predictions/Reshape_1
 ```
 
-## Specifying arbitrary arrays in a graph as input or output arrays
+Note that `input_shapes` is provided as a colon-separated list. Each input shape
+corresponds to the input array at the same position in the respective list.
 
-Any array in the input file can be specified as an input or output array. This
-allows to use the converter to extract a sub-graph out of the input graph file.
-The converter then automatically discards any part of the graph that is not
-needed for the subgraph identified by the specified input and output arrays.
-Another use case for specifying multiple output arrays is to get inference code
-to return the contents of some specified intermediate activations array, not
-just the output activations.
+### Specifying subgraphs
 
-In order to know which array you want to pass as `--input_arrays` /
-`--output_arrays`, it helps to have a visualization of the graph. See the
-section below on graph visualization. When using graph visualization for that
-purpose, make sure to use `--dump_graphviz=` to visualize exactly the graph as
-it is in the actual final form being exported to the output file.
+Any array in the input file can be specified as an input or output array in
+order to extract subgraphs out of an input graph file. TOCO discards the parts
+of the graph outside of the specific subgraph. Use [graph
+visualizations](#graph-visualizations) to identify the input and output arrays
+that make up the desired subgraph.
+
+The follow command shows how to extract a single fused layer out of a TensorFlow
+GraphDef.
+
+```
+curl https://storage.googleapis.com/download.tensorflow.org/models/inception_v1_2016_08_28_frozen.pb.tar.gz \
+  | tar xzv -C /tmp
+bazel run --config=opt \
+  //tensorflow/contrib/lite/toco:toco -- \
+  --input_file=/tmp/inception_v1_2016_08_28_frozen.pb \
+  --output_file=/tmp/foo.pb \
+  --input_format=TENSORFLOW_GRAPHDEF \
+  --output_format=TENSORFLOW_GRAPHDEF \
+  --input_shapes=1,28,28,96:1,28,28,16:1,28,28,192:1,28,28,64 \
+  --input_arrays=InceptionV1/InceptionV1/Mixed_3b/Branch_1/Conv2d_0a_1x1/Relu,InceptionV1/InceptionV1/Mixed_3b/Branch_2/Conv2d_0a_1x1/Relu,InceptionV1/InceptionV1/Mixed_3b/Branch_3/MaxPool_0a_3x3/MaxPool,InceptionV1/InceptionV1/Mixed_3b/Branch_0/Conv2d_0a_1x1/Relu \
+  --output_array=InceptionV1/InceptionV1/Mixed_3b/concat_v2
+```
 
 Note that the final representation of an on-device inference workload (say, in
-TensorFlow Lite flatbuffers format) tends to have coarser granularity than the
+TensorFlow Lite FlatBuffers format) tends to have coarser granularity than the
 very fine granularity of the TensorFlow GraphDef representation. For example,
 while a fully-connected layer is typically represented as at least four separate
 ops in TensorFlow GraphDef (Reshape, MatMul, BiasAdd, Relu...), it is typically
 represented as a single "fused" op (FullyConnected) in the converter's optimized
 representation and in the final on-device representation (e.g. in TensorFlow
-Lite flatbuffer format). As the level of granularity gets coarser, some
+Lite FlatBuffer format). As the level of granularity gets coarser, some
 intermediate arrays (say, the array between the MatMul and the BiasAdd in the
 TensorFlow GraphDef) are dropped. When specifying intermediate arrays as
-`--input_arrays` / `--output_arrays`, it is generally at least desirable (and
-often required) to specify arrays that are meant to survive in the final form of
-the graph, after fusing. These are typically the outputs of activation functions
-(since everything in each layer until the activation function tends to get
-fused).
+`--input_arrays` / `--output_arrays`, it is desirable (and often required) to
+specify arrays that are meant to survive in the final form of the graph, after
+fusing. These are typically the outputs of activation functions (since
+everything in each layer until the activation function tends to get fused).
+
+## Other conversions supported by TOCO <a name="other-conversions"></a>
+
+The converter accepts both TENSORFLOW_GRAPHDEF and TFLITE file formats as both
+`--input_format` and `--output_format`. This means that conversion to and from
+any supported format is possible.
 
-Here is an example of extracting just a sub-graph, namely just a single fused
-layer, out of a TensorFlow GraphDef, and exporting a TensorFlow GraphDef
-containing just that subgraph:
+### Optimize a TensorFlow GraphDef <a name="optimize-graphdef"></a>
+
+Same-format "conversions" can be used to optimize and simplify a graph or be
+used to [get a subgraph](#specifying-subgraphs) of a graph. The flag
+`--inference_type` is not required because TensorFlow graphs, including those
+containing the
+[`FakeQuant*`](https://www.tensorflow.org/api_guides/python/array_ops#Fake_quantization)
+ops are always float graphs.
 
 ```
-curl https://storage.googleapis.com/download.tensorflow.org/models/inception_v1_2016_08_28_frozen.pb.tar.gz \
+curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_0.50_128_frozen.tgz \
   | tar xzv -C /tmp
 bazel run --config=opt \
   //tensorflow/contrib/lite/toco:toco -- \
-  --input_file=/tmp/inception_v1_2016_08_28_frozen.pb \
+  --input_file=/tmp/mobilenet_v1_0.50_128/frozen_graph.pb \
   --output_file=/tmp/foo.pb \
   --input_format=TENSORFLOW_GRAPHDEF \
   --output_format=TENSORFLOW_GRAPHDEF \
-  --input_shapes=1,28,28,96:1,28,28,16:1,28,28,192:1,28,28,64 \
-  --input_arrays=InceptionV1/InceptionV1/Mixed_3b/Branch_1/Conv2d_0a_1x1/Relu,InceptionV1/InceptionV1/Mixed_3b/Branch_2/Conv2d_0a_1x1/Relu,InceptionV1/InceptionV1/Mixed_3b/Branch_3/MaxPool_0a_3x3/MaxPool,InceptionV1/InceptionV1/Mixed_3b/Branch_0/Conv2d_0a_1x1/Relu \
-  --output_array=InceptionV1/InceptionV1/Mixed_3b/concat_v2
+  --input_shape=1,128,128,3 \
+  --input_array=input \
+  --output_array=MobilenetV1/Predictions/Reshape_1
+```
+
+### Convert a TensorFlow Lite FlatBuffer back into TensorFlow GraphDef format <a name="to-graphdef"></a>
+
+The converter supports file format conversions from TensorFlow Lite, back into
+TensorFlow GraphDef format.
+
+```
+bazel run --config=opt \
+  //tensorflow/contrib/lite/toco:toco -- \
+  --input_file=/tmp/foo.tflite \
+  --output_file=/tmp/foo.pb \
+  --input_format=TFLITE \
+  --output_format=TENSORFLOW_GRAPHDEF \
+  --input_shape=1,128,128,3 \
+  --input_array=input \
+  --output_array=MobilenetV1/Predictions/Reshape_1
 ```
 
 ## Logging
@@ -299,8 +280,8 @@ bazel run --config=opt \
 ### Standard logging
 
 The converter generates some informative log messages during processing. The
-easiest way to view them is to add `--logtostderr` to command lines. For the
-previous example, that gives:
+easiest way to view them is to add `--logtostderr` to command lines as seen in
+the following example.
 
 ```
 curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_0.50_128_frozen.tgz \
@@ -333,42 +314,34 @@ I1101 21:51:33.309484    5339 toco_tooling.cc:249] Estimated count of arithmetic
 For debugging purposes, the converter supports two levels of verbose logging,
 which can be set by passing a `--v=` flag:
 
-*   At `--v=1`, the converter generates text dumps of the graph at various
-    points during processing, as well as log messages about every graph
-    transformation that did take place, typically answering questions of the
-    form "why was my graph transformed in this way"?
-*   At `--v=2`, the converter additionally generates log messages about graph
-    transformations that were considered but not actually performed, typically
-    answering questions of the form "why was my graph NOT transformed when I
-    expected it would be?".
+*   For `--v=1`, the converter generates text dumps of the graph at various
+    points during processing as well as log messages about every graph
+    transformation that took place.
+*   For `--v=2`, the converter additionally generates log messages about graph
+    transformations that were considered but not performed.
 
 ### Graph "video" logging
 
-When `--dump_graphviz=` is used (see the section on Graph visualizations), one
-may additionally pass `--dump_graphviz_video`, which causes a graph
-visualization to be dumped after each individual graph transformations, often
-resulting in thousands of files. Typically, one would then bisect into these
-files to understand when a given change was introduced in the graph.
+When `--dump_graphviz=` is used (see the section on [graph
+visualizations](#graph-visualizations)), one may additionally pass
+`--dump_graphviz_video`, which causes a graph visualization to be dumped after
+each individual graph transformation. This results in thousands of files.
+Typically, one would then bisect into these files to understand when a given
+change was introduced in the graph.
 
 ## Graph visualizations
 
-The converter is able to export a graph to the GraphViz Dot format, for easy
-visualization. Combined with the converter's ability to transform the graph into
-a simpler, coarser-granularity representation, that makes it a very powerful
-visualization tool.
-
-There are two ways to get the converter to export a GraphViz Dot file,
-corresponding to two separate use cases. Understanding the difference between
-them is key to getting useful graph visualizations.
+TOCO can export a graph to the GraphViz Dot format for easy visualization via
+either the `--output_format` flag or the `--dump_graphviz` flag. The subsections
+below outline the use cases for each.
 
 ### Using `--output_format=GRAPHVIZ_DOT`
 
-The first way to get a graphviz rendering is to pass
-`--output_format=GRAPHVIZ_DOT`, instead of the `--output_format` that you would
-otherwise use. This says: "I just want to get a plausible visualization of that
-graph". The upside is that it makes for very simple command lines, and makes the
-converter very lax about aspects of the graph or the command line that it would
-otherwise complain about. Example:
+The first way to get a graphviz rendering is to pass `GRAPHVIZ_DOT` into
+`--output_format`. This results in a plausable visualization of the graph. This
+reduces the requirements that normally exist during conversion between other
+input and output formats. For example, this may be useful if conversion from
+TENSORFLOW_GRAPHDEF to TFLITE is failing.
 
 ```
 curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_0.50_128_frozen.tgz \
@@ -391,7 +364,7 @@ dot -Tpdf -O /tmp/foo.dot
 ```
 
 And the resulting `.dot.pdf` can be viewed in any PDF viewer, but we suggest one
-with a good ability to pan and zoom across a very large page; Google Chrome does
+with a good ability to pan and zoom across a very large page. Google Chrome does
 well in that respect.
 
 ```
@@ -400,14 +373,14 @@ google-chrome /tmp/foo.dot.pdf
 
 Example PDF files are viewable online in the next section.
 
-### Using `--dump_graphviz=`
+### Using `--dump_graphviz`
 
-The second way to get a graphviz rendering is to pass a `--dump_graphviz=` flag
-specifying a destination directory to dump GraphViz rendering to. Unlike the
-previous approach, this one allows you to keep your real command-line (with your
-real `--output_format` and other flags) unchanged, just appending a
-`--dump_graphviz=` flag to it. This says: "I want visualizations of the actual
-graph during this specific conversion process". Example:
+The second way to get a graphviz rendering is to pass the `--dump_graphviz=`
+flag, specifying a destination directory to dump GraphViz rendering to. Unlike
+the previous approach, this one allows you to keep your real command-line (with
+your real `--output_format` and other flags) unchanged, just appending a
+`--dump_graphviz=` flag to it. This provides a visualization of the actual graph
+during a specific conversion process.
 
 ```
 curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_0.50_128_frozen.tgz \
@@ -425,8 +398,8 @@ bazel run --config=opt \
   --dump_graphviz=/tmp
 ```
 
-This generates a few files in the destination directory, here `/tmp`. Most
-important are these two files:
+This generates a few files in the destination directory, here `/tmp`. The two
+most important files are:
 
 ```
 /tmp/toco_AT_IMPORT.dot
@@ -442,8 +415,7 @@ conversion subsequently fails).
 
 `toco_AFTER_TRANSFORMATIONS.dot` represents the graph after all transformations
 were applied to it, just before it was exported to the `--output_file`.
-Typically, this is a much smaller graph, and it conveys much more information
-about each node.
+Typically, this is a much smaller graph with more information about each node.
 
 Again, these can be rendered to PDFs:
 
@@ -451,12 +423,12 @@ Again, these can be rendered to PDFs:
 dot -Tpdf -O /tmp/toco_*.dot
 ```
 
-The resulting files can be seen here:
+Sample output files can be seen here:
 
 *   [toco_AT_IMPORT.dot.pdf](https://storage.googleapis.com/download.tensorflow.org/example_images/toco_AT_IMPORT.dot.pdf)
 *   [toco_AFTER_TRANSFORMATIONS.dot.pdf](https://storage.googleapis.com/download.tensorflow.org/example_images/toco_AFTER_TRANSFORMATIONS.dot.pdf).
 
-### Legend for the graph visualizations
+### Legend for the graph visualizations <a name="graphviz-legend"></a>
 
 *   Operators are red square boxes with the following hues of red:
     *   Most operators are
diff --git a/tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md b/tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md
index 5e077952235fa1aac1e12403d3d83633a617ccb7..9e99287f828c22aa81eb216c087f3261e378fc14 100644
--- a/tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md
+++ b/tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md
@@ -1,84 +1,47 @@
-# TensorFlow Lite Optimizing Converter command-line reference
+# TensorFlow Lite Optimizing Converter command-line glossary
 
 This page is complete reference of command-line flags. It is complemented by the
 following other documents:
 
 *   [README](../README.md)
 *   [Command-line examples](cmdline_examples.md)
+*   [Python API examples](python_api.md)
 
 Table of contents:
 
-[TOC]
-
-## High-level overview
-
-A full list and detailed specification of all flags is given in the next
-section. For now we focus on a higher-level description of command lines:
-
-```
-toco \
-  --input_format=... \
-  --output_format=... \
-  --input_file=... \
-  --output_file=... \
-  [model flags...] \
-  [transformation flags...] \
-  [logging flags...]
-```
-
-In other words, the converter requires at least the following mandatory flags:
-`--input_format`, `--output_format`, `--input_file`, `--output_file`. Depending
-on the input and output formats, additional flags may be allowed or mandatory:
-
-*   *Model flags* provide additional information about the model stored in the
-    input file.
-    *   `--output_array` or `--output_arrays` specify which arrays in the input
-        file are to be considered the output activations.
-    *   `--input_array` or `--input_arrays` specify which arrays in the input
-        file are to be considered the input activations.
-    *   `--input_shape` or `--input_shapes` specify the shapes of the input
-        arrays.
-    *   `--input_data_type` or `--input_data_types` specify the data types of
-        input arrays, which can be used if the input file does not already
-        specify them.
-    *   `--mean_value` or `--mean_values`, and `--std_value` or `--std_values`,
-        give the dequantization parameters of the input arrays, for the case
-        when the output file will accept quantized input arrays.
-*   *Transformation flags* specify options of the transformations to be applied
-    to the graph, i.e. they specify requested properties that the output file
-    should have.
-    *   `--inference_type` specifies the type of real-numbers arrays in the
-        output file. This only affects arrays of real numbers and allows to
-        control their quantization or dequantization, effectively switching
-        between floating-point and quantized arithmetic for the inference
-        workload, as far as real numbers are concerned. Other data types are
-        unaffected (e.g. plain integers, and strings).
-    *   `--inference_input_type` is like `--inference_type` but specifically
-        controlling input arrays, separately from other arrays. If not
-        specified, then `--inference_type` is used. The use case for specifying
-        `--inference_input_type` is when one wants to perform floating-point
-        inference on a quantized input, as is common in image models operating
-        on bitmap image inputs.
-    *   Some transformation flags allow to carry on with quantization when the
-        input graph is not properly quantized: `--default_ranges_min`,
-        `--default_ranges_max`, `--drop_fake_quant`,
-        `--reorder_across_fake_quant`.
-*   *Logging flags* described below.
-
-## Command-line flags complete reference
-
-### Mandatory flags
-
-*   `--input_format`. Type: string. Specifies the format of the input file.
-    Allowed values:
+*   [High-level flags](#high-level-flags)
+*   [Model flags](#model-flags)
+*   [Transformation flags](#transformation-flags)
+*   [Logging flags](#logging-flags)
+
+## High-level flags
+
+The following high level flags specify the location of the input and output
+files. The flag `--output_file` is always required. Additionally, either
+`--input_file` or `--savedmodel_directory` is required.
+
+*   `--savedmodel_directory`. Type: string. Specifies the full path to the
+    directory containing the SavedModel.
+*   `--savedmodel_tagset`. Type: string. Default:
+    [kSavedModelTagServe](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/cc/saved_model/tag_constants.h).
+    Specifies a comma-separated set of tags identifying the MetaGraphDef within
+    the SavedModel to analyze. All tags in the tag set must be specified.
+*   `--input_file`. Type: string. Specifies the path of the input file. This may
+    be either an absolute or a relative path.
+*   `--output_file`. Type: string. Specifies the path of the output file.
+
+The following high level flags specify the types of the input and output files:
+
+*   `--input_format`. Type: string. Default: `TENSORFLOW_GRAPHDEF`. Specifies
+    the format of the input file. Allowed values:
     *   `TENSORFLOW_GRAPHDEF` &mdash; The TensorFlow GraphDef format. Both
         binary and text proto formats are allowed.
-    *   `TFLITE` &mdash; The TensorFlow Lite flatbuffers format.
-*   `--output_format`. Type: string. Specifies the format of the output file.
-    Allowed values:
+    *   `TFLITE` &mdash; The TensorFlow Lite FlatBuffers format.
+*   `--output_format`. Type: string. Default: `TFLITE`. Specifies the format of
+    the output file. Allowed values:
     *   `TENSORFLOW_GRAPHDEF` &mdash; The TensorFlow GraphDef format. Always
         produces a file in binary (not text) proto format.
-    *   `TFLITE` &mdash; The TensorFlow Lite flatbuffers format.
+    *   `TFLITE` &mdash; The TensorFlow Lite FlatBuffers format.
         *   Whether a float or quantized TensorFlow Lite file will be produced
             depends on the `--inference_type` flag.
     *   `GRAPHVIZ_DOT` &mdash; The GraphViz `.dot` format. This asks the
@@ -95,11 +58,11 @@ on the input and output formats, additional flags may be allowed or mandatory:
             you get in your actual output format as opposed to just a merely
             plausible visualization of a model, consider using `--dump_graphviz`
             instead and keeping your true `--output_format`.
-*   `--input_file`. Type: string. Specifies the path of the input file. This may
-    be either an absolute or a relative path.
-*   `--output_file`. Type: string. Specifies the path of the output file.
 
-### Model flags
+## Model flags
+
+*Model flags* provide additional information about the model stored in the input
+file.
 
 *   `--output_array`. Type: string. Specifies a single array as the output
     activations. Incompatible with `--output_arrays`.
@@ -111,6 +74,10 @@ on the input and output formats, additional flags may be allowed or mandatory:
 *   `--input_arrays`. Type: comma-separated list of strings. Specifies a list of
     arrays as the input activations, for models with multiple inputs.
     Incompatible with `--input_array`.
+*   `--batch_size`. Type: integer. Default: 1. Specifies the batch size for the
+    model. Replaces the first dimension of an input size array if undefined. Use
+    only with SavedModels when neither `--input_shape` nor `input_shapes` flags
+    are specified. Incompatible with GraphDefs.
 
 When `--input_array` is used, the following flags are available to provide
 additional information about the single input array:
@@ -160,7 +127,11 @@ additional information about the multiple input arrays:
     the input arrays specified in `--input_arrays`, in the same order. See
     `--mean_value`, `--std_value` for details.
 
-### Transformation flags
+## Transformation flags
+
+*Transformation flags* specify options of the transformations to be applied to
+the graph, i.e. they specify requested properties that the output file should
+have.
 
 *   `--inference_type`. Type: string. Sets the type of real-number arrays in the
     output file, that is, controls the representation (quantization) of real
@@ -232,7 +203,7 @@ additional information about the multiple input arrays:
     graph transformations on them, at the cost of no longer faithfully matching
     inference and training arithmetic.
 
-### Logging flags
+## Logging flags
 
 The following are standard Google logging flags:
 
diff --git a/tensorflow/contrib/lite/toco/g3doc/python_api.md b/tensorflow/contrib/lite/toco/g3doc/python_api.md
index 440f9c367c25726e20aa8828e3050cd1dc1b230d..f0fd638a618c75c75d336a746f9b1d8dccaea470 100644
--- a/tensorflow/contrib/lite/toco/g3doc/python_api.md
+++ b/tensorflow/contrib/lite/toco/g3doc/python_api.md
@@ -1,5 +1,12 @@
 # TensorFlow Lite Optimizing Converter (TOCO) Python API reference
 
+This page provides examples on how to use TOCO via the Python API. It is
+complemented by the following documents:
+
+*   [README](../README.md)
+*   [Command-line examples](cmdline_examples.md)
+*   [Command-line glossary](cmdline_reference.md)
+
 ## High-level overview
 
 While the TensorFlow Lite Optimizing Converter can be used from the command
@@ -28,7 +35,7 @@ val = img + tf.constant([1., 2., 3.]) + tf.constant([1., 4., 4.])
 out = tf.identity(val, name="out")
 with tf.Session() as sess:
   tflite_model = tf.contrib.lite.toco_convert(sess.graph_def, [img], [out])
-  open("test.tflite", "wb").write(tflite_modeL)
+  open("test.tflite", "wb").write(tflite_model)
 ```
 
 **NOTE** Currently, the TOCO command will cause a fatal error to the Python
diff --git a/tensorflow/contrib/lite/toco/g3doc/toco_landscape.svg b/tensorflow/contrib/lite/toco/g3doc/toco_landscape.svg
new file mode 100644
index 0000000000000000000000000000000000000000..a47c088991299159be39bc490149720dae43eb53
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/g3doc/toco_landscape.svg
@@ -0,0 +1 @@
+<svg version="1.1" viewBox="0.0 0.0 720.0 540.0" fill="none" stroke="none" stroke-linecap="square" stroke-miterlimit="10" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns="http://www.w3.org/2000/svg"><clipPath id="p.0"><path d="m0 0l720.0 0l0 540.0l-720.0 0l0 -540.0z" clip-rule="nonzero"/></clipPath><g clip-path="url(#p.0)"><path fill="#000000" fill-opacity="0.0" d="m0 0l720.0 0l0 540.0l-720.0 0z" fill-rule="evenodd"/><path fill="#f3f3f3" d="m19.375328 28.750656l361.6378 0l0 358.01575l-361.6378 0z" fill-rule="evenodd"/><path stroke="#cccccc" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m19.375328 28.750656l361.6378 0l0 358.01575l-361.6378 0z" fill-rule="evenodd"/><path fill="#434343" d="m338.49512 374.66016q-0.609375 0 -1.171875 -0.140625q-0.546875 -0.15625 -0.96875 -0.421875q-0.25 -0.15625 -0.359375 -0.296875q-0.09375 -0.140625 -0.09375 -0.34375q0 -0.171875 0.09375 -0.28125q0.109375 -0.109375 0.265625 -0.109375q0.171875 0 0.46875 0.1875q0.40625 0.25 0.796875 0.390625q0.390625 0.140625 0.984375 0.140625q0.71875 0 1.109375 -0.25q0.40625 -0.265625 0.40625 -0.734375q0 -0.296875 -0.15625 -0.46875q-0.140625 -0.1875 -0.5 -0.328125q-0.359375 -0.140625 -1.046875 -0.296875q-1.171875 -0.25 -1.6875 -0.671875q-0.5 -0.421875 -0.5 -1.15625q0 -0.578125 0.3125 -1.015625q0.328125 -0.4375 0.890625 -0.6875q0.5625 -0.265625 1.28125 -0.265625q0.53125 0 1.015625 0.140625q0.484375 0.140625 0.859375 0.390625q0.453125 0.328125 0.453125 0.671875q0 0.171875 -0.109375 0.296875q-0.109375 0.125 -0.25 0.125q-0.15625 0 -0.484375 -0.234375q-0.375 -0.234375 -0.703125 -0.359375q-0.328125 -0.140625 -0.828125 -0.140625q-0.625 0 -1.015625 0.28125q-0.375 0.265625 -0.375 0.734375q0 0.296875 0.140625 0.484375q0.140625 0.171875 0.46875 0.3125q0.328125 0.140625 0.9375 0.28125q0.90625 0.1875 1.40625 0.4375q0.5 0.234375 0.703125 0.578125q0.21875 0.34375 0.21875 0.890625q0 0.828125 -0.703125 1.34375q-0.703125 0.515625 -1.859375 0.515625zm9.241241 -1.59375q0.140625 0 0.25 0.125q0.109375 0.109375 0.109375 0.296875q0 0.328125 -0.46875 0.609375q-0.484375 0.28125 -1.015625 0.421875q-0.53125 0.140625 -1.046875 0.140625q-1.5 0 -2.375 -0.890625q-0.875 -0.890625 -0.875 -2.46875q0 -1.0 0.390625 -1.765625q0.390625 -0.765625 1.078125 -1.1875q0.703125 -0.4375 1.59375 -0.4375q1.265625 0 2.015625 0.828125q0.75 0.828125 0.75 2.25q0 0.265625 -0.109375 0.390625q-0.109375 0.109375 -0.34375 0.109375l-4.296875 0q0.125 2.296875 2.171875 2.296875q0.53125 0 0.890625 -0.140625q0.375 -0.140625 0.8125 -0.390625q0.34375 -0.1875 0.46875 -0.1875zm-2.34375 -4.3125q-0.84375 0 -1.359375 0.53125q-0.515625 0.53125 -0.609375 1.515625l3.765625 0q-0.015625 -1.0 -0.484375 -1.515625q-0.46875 -0.53125 -1.3125 -0.53125zm7.5551147 -0.8125q0.546875 -0.03125 0.546875 0.453125q0 0.21875 -0.125 0.34375q-0.109375 0.125 -0.40625 0.15625l-0.390625 0.03125q-0.890625 0.078125 -1.328125 0.640625q-0.4375 0.546875 -0.4375 1.296875l0 3.234375q0 0.265625 -0.15625 0.40625q-0.140625 0.125 -0.375 0.125q-0.234375 0 -0.390625 -0.140625q-0.15625 -0.140625 -0.15625 -0.390625l0 -5.625q0 -0.25 0.15625 -0.390625q0.15625 -0.140625 0.390625 -0.140625q0.21875 0 0.359375 0.140625q0.140625 0.140625 0.140625 0.375l0 0.75q0.28125 -0.578125 0.796875 -0.890625q0.515625 -0.3125 1.1875 -0.359375l0.1875 -0.015625zm6.157959 0.328125q0.15625 -0.3125 0.46875 -0.3125q0.203125 0 0.359375 0.140625q0.15625 0.125 0.15625 0.328125q0 0.109375 -0.046875 0.203125l-2.59375 5.609375q-0.078125 0.171875 -0.25 0.28125q-0.15625 0.09375 -0.34375 0.09375q-0.171875 0 -0.328125 -0.09375q-0.15625 -0.109375 -0.25 -0.28125l-2.59375 -5.609375q-0.046875 -0.09375 -0.046875 -0.1875q0 -0.203125 0.171875 -0.34375q0.1875 -0.15625 0.390625 -0.15625q0.140625 0 0.265625 0.078125q0.125 0.078125 0.1875 0.234375l2.234375 5.0l2.21875 -4.984375zm7.2099915 4.796875q0.140625 0 0.25 0.125q0.109375 0.109375 0.109375 0.296875q0 0.328125 -0.46875 0.609375q-0.484375 0.28125 -1.015625 0.421875q-0.53125 0.140625 -1.046875 0.140625q-1.5 0 -2.375 -0.890625q-0.875 -0.890625 -0.875 -2.46875q0 -1.0 0.390625 -1.765625q0.390625 -0.765625 1.078125 -1.1875q0.703125 -0.4375 1.59375 -0.4375q1.265625 0 2.015625 0.828125q0.75 0.828125 0.75 2.25q0 0.265625 -0.109375 0.390625q-0.109375 0.109375 -0.34375 0.109375l-4.296875 0q0.125 2.296875 2.171875 2.296875q0.53125 0 0.890625 -0.140625q0.375 -0.140625 0.8125 -0.390625q0.34375 -0.1875 0.46875 -0.1875zm-2.34375 -4.3125q-0.84375 0 -1.359375 0.53125q-0.515625 0.53125 -0.609375 1.515625l3.765625 0q-0.015625 -1.0 -0.484375 -1.515625q-0.46875 -0.53125 -1.3125 -0.53125zm7.5551453 -0.8125q0.546875 -0.03125 0.546875 0.453125q0 0.21875 -0.125 0.34375q-0.109375 0.125 -0.40625 0.15625l-0.390625 0.03125q-0.890625 0.078125 -1.328125 0.640625q-0.4375 0.546875 -0.4375 1.296875l0 3.234375q0 0.265625 -0.15625 0.40625q-0.140625 0.125 -0.375 0.125q-0.234375 0 -0.390625 -0.140625q-0.15625 -0.140625 -0.15625 -0.390625l0 -5.625q0 -0.25 0.15625 -0.390625q0.15625 -0.140625 0.390625 -0.140625q0.21875 0 0.359375 0.140625q0.140625 0.140625 0.140625 0.375l0 0.75q0.28125 -0.578125 0.796875 -0.890625q0.515625 -0.3125 1.1875 -0.359375l0.1875 -0.015625z" fill-rule="nonzero"/><path fill="#d9d9d9" d="m25.624672 36.249344l301.88977 0l0 69.98425l-301.88977 0z" fill-rule="evenodd"/><path stroke="#cccccc" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" stroke-dasharray="4.0,3.0" d="m25.624672 36.249344l301.88977 0l0 69.98425l-301.88977 0z" fill-rule="evenodd"/><path fill="#434343" d="m134.36497 56.831844q-0.234375 0 -0.375 -0.140625q-0.140625 -0.140625 -0.140625 -0.359375l0 -7.1875l-2.578125 0q-0.21875 0 -0.34375 -0.109375q-0.109375 -0.109375 -0.109375 -0.3125q0 -0.203125 0.109375 -0.296875q0.125 -0.109375 0.34375 -0.109375l6.15625 0q0.21875 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.203125 -0.125 0.3125q-0.109375 0.109375 -0.328125 0.109375l-2.578125 0l0 7.1875q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.34375 0.140625zm9.004181 -1.421875q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm6.839676 -0.75q2.09375 0 2.09375 2.3125l0 3.25q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.1875q0 -0.8125 -0.328125 -1.1875q-0.3125 -0.375 -1.0 -0.375q-0.8125 0 -1.296875 0.5q-0.46875 0.484375 -0.46875 1.328125l0 2.921875q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.328125l0 0.609375q0.28125 -0.53125 0.796875 -0.8125q0.53125 -0.28125 1.1875 -0.28125zm5.84729 6.0625q-0.56248474 0 -1.0624847 -0.125q-0.5 -0.140625 -0.875 -0.375q-0.21875 -0.140625 -0.3125 -0.265625q-0.078125 -0.125 -0.078125 -0.3125q0 -0.15625 0.078125 -0.25q0.09375 -0.109375 0.234375 -0.109375q0.15625 0 0.421875 0.1875q0.359375 0.21875 0.71875 0.34375q0.359375 0.125 0.87498474 0.125q0.65625 0 1.015625 -0.21875q0.359375 -0.234375 0.359375 -0.671875q0 -0.265625 -0.140625 -0.421875q-0.125 -0.171875 -0.453125 -0.296875q-0.3125 -0.125 -0.9375 -0.25q-1.0624847 -0.234375 -1.5156097 -0.609375q-0.453125 -0.390625 -0.453125 -1.046875q0 -0.515625 0.28125 -0.90625q0.28125 -0.40625 0.796875 -0.625q0.515625 -0.234375 1.1562347 -0.234375q0.46875 0 0.90625 0.125q0.4375 0.125 0.78125 0.34375q0.40625 0.296875 0.40625 0.609375q0 0.15625 -0.09375 0.265625q-0.09375 0.109375 -0.234375 0.109375q-0.140625 0 -0.4375 -0.203125q-0.328125 -0.21875 -0.625 -0.34375q-0.296875 -0.125 -0.75 -0.125q-0.56248474 0 -0.90623474 0.265625q-0.34375 0.25 -0.34375 0.671875q0 0.25 0.125 0.421875q0.125 0.15625 0.421875 0.28125q0.296875 0.125 0.84373474 0.25q0.828125 0.1875 1.265625 0.40625q0.453125 0.203125 0.640625 0.515625q0.203125 0.3125 0.203125 0.796875q0 0.75 -0.640625 1.21875q-0.640625 0.453125 -1.671875 0.453125zm6.2131653 0q-0.828125 0 -1.46875 -0.359375q-0.625 -0.375 -0.96875 -1.0625q-0.34375 -0.703125 -0.34375 -1.609375q0 -0.90625 0.34375 -1.59375q0.34375 -0.703125 0.96875 -1.0625q0.640625 -0.375 1.46875 -0.375q0.828125 0 1.453125 0.375q0.640625 0.359375 0.984375 1.0625q0.34375 0.6875 0.34375 1.59375q0 0.90625 -0.34375 1.609375q-0.34375 0.6875 -0.984375 1.0625q-0.625 0.359375 -1.453125 0.359375zm0 -0.796875q0.859375 0 1.3125 -0.5625q0.46875 -0.578125 0.46875 -1.671875q0 -1.0625 -0.46875 -1.640625q-0.46875 -0.59375 -1.3125 -0.59375q-0.859375 0 -1.328125 0.59375q-0.46875 0.578125 -0.46875 1.640625q0 1.078125 0.453125 1.65625q0.46875 0.578125 1.34375 0.578125zm7.1288147 -5.25q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm1.970398 6.03125q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.546875q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l4.375 0q0.203125 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.203125 -0.125 0.3125q-0.125 0.109375 -0.328125 0.109375l-3.90625 0l0 2.90625l3.65625 0q0.21875 0 0.328125 0.109375q0.125 0.109375 0.125 0.3125q0 0.1875 -0.125 0.296875q-0.109375 0.109375 -0.328125 0.109375l-3.65625 0l0 3.453125q0 0.21875 -0.125 0.359375q-0.125 0.140625 -0.359375 0.140625zm6.5434265 0q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -7.625q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.359375 -0.125q0.203125 0 0.34375 0.125q0.140625 0.125 0.140625 0.34375l0 7.625q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125zm4.721527 0.015625q-0.828125 0 -1.46875 -0.359375q-0.625 -0.375 -0.96875 -1.0625q-0.34375 -0.703125 -0.34375 -1.609375q0 -0.90625 0.34375 -1.59375q0.34375 -0.703125 0.96875 -1.0625q0.640625 -0.375 1.46875 -0.375q0.828125 0 1.453125 0.375q0.640625 0.359375 0.984375 1.0625q0.34375 0.6875 0.34375 1.59375q0 0.90625 -0.34375 1.609375q-0.34375 0.6875 -0.984375 1.0625q-0.625 0.359375 -1.453125 0.359375zm0 -0.796875q0.859375 0 1.3125 -0.5625q0.46875 -0.578125 0.46875 -1.671875q0 -1.0625 -0.46875 -1.640625q-0.46875 -0.59375 -1.3125 -0.59375q-0.859375 0 -1.328125 0.59375q-0.46875 0.578125 -0.46875 1.640625q0 1.078125 0.453125 1.65625q0.46875 0.578125 1.34375 0.578125zm12.222534 -4.9375q0.125 -0.28125 0.390625 -0.28125q0.1875 0 0.328125 0.125q0.140625 0.109375 0.140625 0.296875q0 0.078125 -0.03125 0.171875l-1.984375 5.046875q-0.078125 0.15625 -0.21875 0.25q-0.140625 0.078125 -0.296875 0.078125q-0.15625 0 -0.296875 -0.078125q-0.140625 -0.09375 -0.21875 -0.25l-1.65625 -4.21875l-1.640625 4.21875q-0.0625 0.15625 -0.203125 0.25q-0.140625 0.078125 -0.3125 0.078125q-0.15625 0 -0.296875 -0.078125q-0.140625 -0.09375 -0.21875 -0.25l-1.984375 -5.03125q-0.046875 -0.09375 -0.046875 -0.171875q0 -0.1875 0.15625 -0.3125q0.171875 -0.140625 0.359375 -0.140625q0.296875 0 0.40625 0.296875l1.65625 4.421875l1.6875 -4.390625q0.078125 -0.15625 0.203125 -0.234375q0.125 -0.09375 0.265625 -0.09375q0.15625 0 0.28125 0.09375q0.125 0.078125 0.1875 0.234375l1.6875 4.375l1.65625 -4.40625zm12.637604 5.09375q0.046875 0.09375 0.046875 0.203125q0 0.171875 -0.140625 0.296875q-0.140625 0.125 -0.328125 0.125q-0.296875 0 -0.421875 -0.296875l-0.84375 -1.9375l-4.53125 0l-0.859375 1.9375q-0.125 0.296875 -0.421875 0.296875q-0.1875 0 -0.34375 -0.125q-0.140625 -0.125 -0.140625 -0.3125q0 -0.09375 0.046875 -0.1875l3.4375 -7.640625q0.078125 -0.15625 0.21875 -0.234375q0.140625 -0.09375 0.3125 -0.09375q0.171875 0 0.3125 0.09375q0.15625 0.078125 0.21875 0.234375l3.4375 7.640625zm-5.859375 -2.421875l3.8125 0l-1.90625 -4.3125l-1.90625 4.3125zm7.78656 3.046875q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.546875q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l2.84375 0q1.328125 0 2.0625 0.65625q0.75 0.640625 0.75 1.828125q0 1.1875 -0.75 1.84375q-0.734375 0.65625 -2.0625 0.65625l-2.359375 0l0 3.03125q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.359375 0.140625zm2.765625 -4.34375q1.9375 0 1.9375 -1.6875q0 -1.671875 -1.9375 -1.671875l-2.265625 0l0 3.359375l2.265625 0zm4.9744263 4.34375q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.578125q0 -0.234375 0.125 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.140625 0.125 0.140625 0.359375l0 7.578125q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.359375 0.140625zm4.4157715 0.015625q-0.5625 0 -1.0625 -0.125q-0.5 -0.140625 -0.875 -0.375q-0.21875 -0.140625 -0.3125 -0.265625q-0.078125 -0.125 -0.078125 -0.3125q0 -0.15625 0.078125 -0.25q0.09375 -0.109375 0.234375 -0.109375q0.15625 0 0.421875 0.1875q0.359375 0.21875 0.71875 0.34375q0.359375 0.125 0.875 0.125q0.65625 0 1.015625 -0.21875q0.359375 -0.234375 0.359375 -0.671875q0 -0.265625 -0.140625 -0.421875q-0.125 -0.171875 -0.453125 -0.296875q-0.3125 -0.125 -0.9375 -0.25q-1.0625 -0.234375 -1.515625 -0.609375q-0.453125 -0.390625 -0.453125 -1.046875q0 -0.515625 0.28125 -0.90625q0.28125 -0.40625 0.796875 -0.625q0.515625 -0.234375 1.15625 -0.234375q0.46875 0 0.90625 0.125q0.4375 0.125 0.78125 0.34375q0.40625 0.296875 0.40625 0.609375q0 0.15625 -0.09375 0.265625q-0.09375 0.109375 -0.234375 0.109375q-0.140625 0 -0.4375 -0.203125q-0.328125 -0.21875 -0.625 -0.34375q-0.296875 -0.125 -0.75 -0.125q-0.5625 0 -0.90625 0.265625q-0.34375 0.25 -0.34375 0.671875q0 0.25 0.125 0.421875q0.125 0.15625 0.421875 0.28125q0.296875 0.125 0.84375 0.25q0.828125 0.1875 1.265625 0.40625q0.453125 0.203125 0.640625 0.515625q0.203125 0.3125 0.203125 0.796875q0 0.75 -0.640625 1.21875q-0.640625 0.453125 -1.671875 0.453125z" fill-rule="nonzero"/><path fill="#f3f3f3" d="m396.75067 183.75066l249.00787 0l0 203.02364l-249.00787 0z" fill-rule="evenodd"/><path stroke="#cccccc" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m396.75067 183.75066l249.00787 0l0 203.02364l-249.00787 0z" fill-rule="evenodd"/><path fill="#434343" d="m409.42255 374.66803q-0.90625 0 -1.609375 -0.40625q-0.6875 -0.421875 -1.078125 -1.171875q-0.375 -0.765625 -0.375 -1.765625q0 -1.0 0.390625 -1.765625q0.40625 -0.78125 1.109375 -1.203125q0.703125 -0.4375 1.625 -0.4375q0.5 0 1.0 0.140625q0.5 0.140625 0.875 0.40625q0.234375 0.171875 0.328125 0.328125q0.109375 0.140625 0.109375 0.328125q0 0.1875 -0.109375 0.3125q-0.09375 0.109375 -0.25 0.109375q-0.09375 0 -0.203125 -0.046875q-0.09375 -0.046875 -0.171875 -0.09375q-0.078125 -0.0625 -0.09375 -0.078125q-0.359375 -0.234375 -0.671875 -0.359375q-0.3125 -0.140625 -0.765625 -0.140625q-0.96875 0 -1.515625 0.671875q-0.53125 0.65625 -0.53125 1.828125q0 1.171875 0.53125 1.8125q0.546875 0.640625 1.515625 0.640625q0.453125 0 0.78125 -0.125q0.328125 -0.140625 0.65625 -0.375q0.15625 -0.09375 0.28125 -0.15625q0.140625 -0.0625 0.234375 -0.0625q0.140625 0 0.234375 0.125q0.109375 0.109375 0.109375 0.296875q0 0.171875 -0.09375 0.3125q-0.09375 0.140625 -0.34375 0.3125q-0.375 0.25 -0.90625 0.40625q-0.515625 0.15625 -1.0625 0.15625zm4.2591553 -0.03125q-0.234375 0 -0.390625 -0.140625q-0.15625 -0.140625 -0.15625 -0.390625l0 -8.46875q0 -0.25 0.15625 -0.390625q0.15625 -0.140625 0.390625 -0.140625q0.21875 0 0.375 0.140625q0.15625 0.140625 0.15625 0.390625l0 8.46875q0 0.25 -0.15625 0.390625q-0.15625 0.140625 -0.375 0.140625zm3.092102 0q-0.234375 0 -0.390625 -0.140625q-0.15625 -0.140625 -0.15625 -0.390625l0 -5.625q0 -0.25 0.15625 -0.390625q0.15625 -0.140625 0.390625 -0.140625q0.234375 0 0.375 0.140625q0.15625 0.140625 0.15625 0.390625l0 5.625q0 0.265625 -0.15625 0.40625q-0.140625 0.125 -0.375 0.125zm0 -8.09375q-0.3125 0 -0.515625 -0.171875q-0.203125 -0.1875 -0.203125 -0.5q0 -0.296875 0.203125 -0.484375q0.203125 -0.1875 0.515625 -0.1875q0.328125 0 0.515625 0.1875q0.203125 0.1875 0.203125 0.484375q0 0.3125 -0.203125 0.5q-0.1875 0.171875 -0.515625 0.171875zm7.5765076 6.53125q0.140625 0 0.25 0.125q0.109375 0.109375 0.109375 0.296875q0 0.328125 -0.46875 0.609375q-0.484375 0.28125 -1.015625 0.421875q-0.53125 0.140625 -1.046875 0.140625q-1.5 0 -2.375 -0.890625q-0.875 -0.890625 -0.875 -2.46875q0 -1.0 0.390625 -1.765625q0.390625 -0.765625 1.078125 -1.1875q0.703125 -0.4375 1.59375 -0.4375q1.265625 0 2.015625 0.828125q0.75 0.828125 0.75 2.25q0 0.265625 -0.109375 0.390625q-0.109375 0.109375 -0.34375 0.109375l-4.296875 0q0.125 2.296875 2.171875 2.296875q0.53125 0 0.890625 -0.140625q0.375 -0.140625 0.8125 -0.390625q0.34375 -0.1875 0.46875 -0.1875zm-2.34375 -4.3125q-0.84375 0 -1.359375 0.53125q-0.515625 0.53125 -0.609375 1.515625l3.765625 0q-0.015625 -1.0 -0.484375 -1.515625q-0.46875 -0.53125 -1.3125 -0.53125zm7.6020203 -0.84375q2.328125 0 2.328125 2.578125l0 3.609375q0 0.25 -0.140625 0.390625q-0.140625 0.140625 -0.390625 0.140625q-0.25 0 -0.40625 -0.140625q-0.140625 -0.140625 -0.140625 -0.390625l0 -3.546875q0 -0.90625 -0.359375 -1.3125q-0.34375 -0.421875 -1.125 -0.421875q-0.890625 0 -1.421875 0.546875q-0.53125 0.546875 -0.53125 1.484375l0 3.25q0 0.25 -0.140625 0.390625q-0.140625 0.140625 -0.390625 0.140625q-0.25 0 -0.40625 -0.140625q-0.140625 -0.140625 -0.140625 -0.390625l0 -5.625q0 -0.234375 0.140625 -0.375q0.15625 -0.15625 0.40625 -0.15625q0.234375 0 0.375 0.15625q0.140625 0.140625 0.140625 0.359375l0 0.6875q0.328125 -0.609375 0.890625 -0.921875q0.578125 -0.3125 1.3125 -0.3125zm7.304718 5.875q0.46875 0.03125 0.46875 0.421875q0 0.21875 -0.171875 0.34375q-0.171875 0.109375 -0.5 0.078125l-0.359375 -0.015625q-1.0625 -0.09375 -1.578125 -0.640625q-0.5 -0.5625 -0.5 -1.703125l0 -3.34375l-0.890625 0q-0.234375 0 -0.359375 -0.109375q-0.125 -0.109375 -0.125 -0.296875q0 -0.203125 0.125 -0.3125q0.125 -0.125 0.359375 -0.125l0.890625 0l0 -1.515625q0 -0.25 0.140625 -0.390625q0.15625 -0.140625 0.40625 -0.140625q0.234375 0 0.375 0.140625q0.15625 0.140625 0.15625 0.390625l0 1.515625l1.484375 0q0.203125 0 0.328125 0.125q0.140625 0.109375 0.140625 0.3125q0 0.1875 -0.140625 0.296875q-0.125 0.109375 -0.328125 0.109375l-1.484375 0l0 3.40625q0 0.734375 0.296875 1.0625q0.296875 0.3125 0.90625 0.359375l0.359375 0.03125z" fill-rule="nonzero"/><path fill="#f4cccc" d="m206.61942 201.17455l140.47244 0l0 30.992126l-140.47244 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m206.61942 201.17455l140.47244 0l0 30.992126l-140.47244 0z" fill-rule="evenodd"/><path fill="#000000" d="m237.0857 213.5031q-0.640625 0.046875 -0.96875 0.40625q-0.3125 0.34375 -0.3125 1.046875l0 0.390625l1.328125 0q0.203125 0 0.3125 0.109375q0.109375 0.109375 0.109375 0.28125q0 0.1875 -0.109375 0.28125q-0.109375 0.09375 -0.3125 0.09375l-1.328125 0l0 4.65625q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.140625 -0.125 -0.140625 -0.359375l0 -4.65625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -0.21875q0 -1.078125 0.53125 -1.6875q0.546875 -0.625 1.5625 -0.703125l0.3125 -0.015625q0.3125 -0.03125 0.453125 0.0625q0.140625 0.078125 0.140625 0.296875q0 0.34375 -0.421875 0.390625l-0.3125 0.03125zm4.248535 1.71875q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm5.861023 4.609375q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm8.417801 3.875q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm8.199051 4.46875q0.203125 0 0.296875 0.109375q0.109375 0.09375 0.109375 0.265625q0 0.1875 -0.109375 0.296875q-0.09375 0.09375 -0.296875 0.09375l-4.203125 0q-0.203125 0 -0.34375 -0.125q-0.125 -0.125 -0.125 -0.3125q0 -0.1875 0.140625 -0.359375l3.546875 -4.28125l-3.28125 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l4.0625 0q0.21875 0 0.34375 0.125q0.140625 0.125 0.140625 0.3125q0 0.1875 -0.140625 0.359375l-3.5625 4.28125l3.421875 0zm6.2547913 -0.59375q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm3.3865662 5.875q-0.171875 0 -0.28125 -0.09375q-0.109375 -0.09375 -0.109375 -0.21875q0 -0.140625 0.109375 -0.234375q0.109375 -0.09375 0.28125 -0.09375l5.21875 0q0.171875 0 0.28125 0.09375q0.109375 0.09375 0.109375 0.234375q0 0.125 -0.109375 0.21875q-0.109375 0.09375 -0.28125 0.09375l-5.21875 0zm11.2500305 -6.609375q0.234375 0 0.359375 0.140625q0.125 0.125 0.125 0.34375l0 5.09375q0 1.296875 -0.671875 1.96875q-0.671875 0.671875 -1.984375 0.671875q-1.28125 0 -2.140625 -0.515625q-0.421875 -0.234375 -0.421875 -0.546875q0 -0.171875 0.078125 -0.28125q0.09375 -0.109375 0.234375 -0.109375q0.125 0 0.4375 0.171875q0.421875 0.21875 0.828125 0.34375q0.40625 0.140625 0.96875 0.140625q0.859375 0 1.28125 -0.453125q0.4375 -0.453125 0.4375 -1.3125l0 -1.03125q-0.25 0.5625 -0.78125 0.859375q-0.515625 0.296875 -1.21875 0.296875q-0.765625 0 -1.359375 -0.359375q-0.59375 -0.359375 -0.9375 -1.015625q-0.328125 -0.65625 -0.328125 -1.515625q0 -0.875 0.328125 -1.53125q0.34375 -0.65625 0.9375 -1.015625q0.59375 -0.359375 1.359375 -0.359375q0.6875 0 1.203125 0.296875q0.515625 0.296875 0.78125 0.84375l0 -0.640625q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625zm-2.28125 4.984375q0.84375 0 1.3125 -0.546875q0.484375 -0.5625 0.484375 -1.546875q0 -0.984375 -0.46875 -1.53125q-0.46875 -0.5625 -1.328125 -0.5625q-0.84375 0 -1.34375 0.5625q-0.484375 0.546875 -0.484375 1.53125q0 0.984375 0.484375 1.546875q0.5 0.546875 1.34375 0.546875zm7.4695435 -4.984375q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm3.720398 -0.015625q2.203125 0 2.203125 2.296875l0 3.265625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.578125q-0.21875 0.515625 -0.6875 0.796875q-0.46875 0.28125 -1.078125 0.28125q-0.5625 0 -1.046875 -0.21875q-0.46875 -0.234375 -0.75 -0.640625q-0.265625 -0.40625 -0.265625 -0.90625q0 -0.65625 0.328125 -1.015625q0.34375 -0.375 1.109375 -0.53125q0.765625 -0.15625 2.125 -0.15625l0.265625 0l0 -0.40625q0 -0.71875 -0.296875 -1.046875q-0.28125 -0.34375 -0.953125 -0.34375q-0.8125 0 -1.65625 0.453125q-0.3125 0.203125 -0.453125 0.203125q-0.140625 0 -0.234375 -0.109375q-0.09375 -0.109375 -0.09375 -0.28125q0 -0.171875 0.09375 -0.296875q0.109375 -0.125 0.328125 -0.25q0.421875 -0.25 0.953125 -0.375q0.546875 -0.140625 1.0625 -0.140625zm-0.390625 5.296875q0.71875 0 1.171875 -0.484375q0.46875 -0.484375 0.46875 -1.25l0 -0.34375l-0.21875 0q-1.046875 0 -1.609375 0.09375q-0.546875 0.078125 -0.78125 0.296875q-0.234375 0.203125 -0.234375 0.609375q0 0.46875 0.34375 0.78125q0.34375 0.296875 0.859375 0.296875zm7.3131714 -5.296875q0.765625 0 1.34375 0.390625q0.59375 0.375 0.921875 1.0625q0.328125 0.6875 0.328125 1.609375q0 0.90625 -0.328125 1.59375q-0.328125 0.671875 -0.90625 1.046875q-0.578125 0.359375 -1.359375 0.359375q-0.6875 0 -1.203125 -0.296875q-0.5 -0.296875 -0.765625 -0.84375l0 2.8125q0 0.21875 -0.125 0.34375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.140625q-0.125 -0.125 -0.125 -0.328125l0 -7.234375q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.125 0.125 0.125 0.34375l0 0.640625q0.265625 -0.546875 0.765625 -0.84375q0.515625 -0.296875 1.203125 -0.296875zm-0.203125 5.265625q0.859375 0 1.328125 -0.578125q0.46875 -0.578125 0.46875 -1.625q0 -1.0625 -0.46875 -1.65625q-0.46875 -0.59375 -1.328125 -0.59375q-0.84375 0 -1.3125 0.578125q-0.453125 0.578125 -0.453125 1.640625q0 1.0625 0.453125 1.65625q0.46875 0.578125 1.3125 0.578125zm7.20282 -5.265625q1.03125 0 1.546875 0.578125q0.53125 0.578125 0.53125 1.734375l0 3.25q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.21875q0 -0.78125 -0.328125 -1.15625q-0.3125 -0.375 -1.0 -0.375q-0.8125 0 -1.296875 0.5q-0.46875 0.484375 -0.46875 1.328125l0 2.921875q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -7.625q0 -0.203125 0.125 -0.328125q0.140625 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.125q0.125 0.125 0.125 0.34375l0 3.140625q0.28125 -0.53125 0.796875 -0.796875q0.515625 -0.28125 1.1875 -0.28125zm4.331665 6.046875q-0.28125 0 -0.484375 -0.1875q-0.1875 -0.1875 -0.1875 -0.484375q0 -0.296875 0.1875 -0.484375q0.203125 -0.203125 0.484375 -0.203125q0.28125 0 0.46875 0.203125q0.1875 0.1875 0.1875 0.484375q0 0.296875 -0.1875 0.484375q-0.1875 0.1875 -0.46875 0.1875zm5.2167664 -6.046875q0.765625 0 1.34375 0.390625q0.59375 0.375 0.921875 1.0625q0.328125 0.6875 0.328125 1.609375q0 0.90625 -0.328125 1.59375q-0.328125 0.671875 -0.90625 1.046875q-0.578125 0.359375 -1.359375 0.359375q-0.6875 0 -1.203125 -0.296875q-0.5 -0.296875 -0.765625 -0.84375l0 2.8125q0 0.21875 -0.125 0.34375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.140625q-0.125 -0.125 -0.125 -0.328125l0 -7.234375q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.125 0.125 0.125 0.34375l0 0.640625q0.265625 -0.546875 0.765625 -0.84375q0.515625 -0.296875 1.203125 -0.296875zm-0.203125 5.265625q0.859375 0 1.328125 -0.578125q0.46875 -0.578125 0.46875 -1.625q0 -1.0625 -0.46875 -1.65625q-0.46875 -0.59375 -1.328125 -0.59375q-0.84375 0 -1.3125 0.578125q-0.453125 0.578125 -0.453125 1.640625q0 1.0625 0.453125 1.65625q0.46875 0.578125 1.3125 0.578125zm8.45282 -4.9375q0.140625 -0.296875 0.421875 -0.296875q0.1875 0 0.328125 0.125q0.140625 0.109375 0.140625 0.296875q0 0.109375 -0.046875 0.1875l-3.375 7.28125q-0.0625 0.125 -0.171875 0.1875q-0.109375 0.078125 -0.234375 0.078125q-0.1875 0 -0.328125 -0.109375q-0.125 -0.109375 -0.125 -0.296875q0 -0.09375 0.046875 -0.1875l0.84375 -1.8125l-2.375 -5.140625q-0.046875 -0.078125 -0.046875 -0.171875q0 -0.1875 0.15625 -0.3125q0.15625 -0.140625 0.359375 -0.140625q0.109375 0 0.21875 0.078125q0.125 0.078125 0.1875 0.203125l2.0 4.5l2.0 -4.46875z" fill-rule="nonzero"/><path fill="#f4cccc" d="m154.36745 319.3983l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m154.36745 319.3983l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m184.89111 339.47687q-0.234375 0 -0.375 -0.140625q-0.140625 -0.140625 -0.140625 -0.359375l0 -7.1875l-2.578125 0q-0.21875 0 -0.34375 -0.109375q-0.109375 -0.109375 -0.109375 -0.3125q0 -0.203125 0.109375 -0.296875q0.125 -0.109375 0.34375 -0.109375l6.15625 0q0.21875 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.203125 -0.125 0.3125q-0.109375 0.109375 -0.328125 0.109375l-2.578125 0l0 7.1875q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.34375 0.140625zm8.160431 0.03125q-1.171875 0 -2.046875 -0.515625q-0.859375 -0.53125 -1.328125 -1.5q-0.46875 -0.984375 -0.46875 -2.296875q0 -1.34375 0.453125 -2.3125q0.46875 -0.984375 1.328125 -1.5q0.875 -0.53125 2.0625 -0.53125q1.1875 0 2.0625 0.53125q0.875 0.515625 1.328125 1.5q0.46875 0.96875 0.46875 2.296875q0 1.3125 -0.46875 2.296875q-0.46875 0.984375 -1.34375 1.515625q-0.859375 0.515625 -2.046875 0.515625zm0 -0.84375q1.34375 0 2.09375 -0.90625q0.75 -0.90625 0.75 -2.578125q0 -1.6875 -0.75 -2.578125q-0.734375 -0.90625 -2.09375 -0.90625q-1.34375 0 -2.09375 0.90625q-0.75 0.90625 -0.75 2.578125q0 1.671875 0.75 2.578125q0.75 0.90625 2.09375 0.90625zm9.214935 0.84375q-1.1875 0 -2.0625 -0.515625q-0.875 -0.53125 -1.359375 -1.5q-0.46875 -0.984375 -0.46875 -2.3125q0 -1.328125 0.46875 -2.296875q0.484375 -0.984375 1.359375 -1.5q0.875 -0.53125 2.0625 -0.53125q0.8125 0 1.515625 0.265625q0.71875 0.25 1.25 0.734375q0.1875 0.1875 0.1875 0.421875q0 0.171875 -0.09375 0.296875q-0.09375 0.125 -0.21875 0.125q-0.15625 0 -0.359375 -0.140625q-0.609375 -0.46875 -1.109375 -0.65625q-0.5 -0.203125 -1.140625 -0.203125q-1.390625 0 -2.140625 0.90625q-0.75 0.90625 -0.75 2.578125q0 1.671875 0.75 2.578125q0.75 0.90625 2.140625 0.90625q0.640625 0 1.140625 -0.1875q0.5 -0.1875 1.109375 -0.671875q0.203125 -0.125 0.359375 -0.125q0.125 0 0.21875 0.125q0.09375 0.109375 0.09375 0.296875q0 0.234375 -0.1875 0.40625q-0.53125 0.484375 -1.25 0.75q-0.703125 0.25 -1.515625 0.25zm8.077179 0q-1.171875 0 -2.046875 -0.515625q-0.859375 -0.53125 -1.328125 -1.5q-0.46875 -0.984375 -0.46875 -2.296875q0 -1.34375 0.453125 -2.3125q0.46875 -0.984375 1.328125 -1.5q0.875 -0.53125 2.0625 -0.53125q1.1875 0 2.0625 0.53125q0.875 0.515625 1.328125 1.5q0.46875 0.96875 0.46875 2.296875q0 1.3125 -0.46875 2.296875q-0.46875 0.984375 -1.34375 1.515625q-0.859375 0.515625 -2.046875 0.515625zm0 -0.84375q1.34375 0 2.09375 -0.90625q0.75 -0.90625 0.75 -2.578125q0 -1.6875 -0.75 -2.578125q-0.734375 -0.90625 -2.09375 -0.90625q-1.34375 0 -2.09375 0.90625q-0.75 0.90625 -0.75 2.578125q0 1.671875 0.75 2.578125q0.75 0.90625 2.09375 0.90625z" fill-rule="nonzero"/><path fill="#d9ead3" d="m284.12296 319.3983l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m284.12296 319.3983l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m314.7006 332.47687q-0.234375 0 -0.375 -0.140625q-0.140625 -0.140625 -0.140625 -0.359375l0 -7.1875l-2.578125 0q-0.21875 0 -0.34375 -0.109375q-0.109375 -0.109375 -0.109375 -0.3125q0 -0.203125 0.109375 -0.296875q0.125 -0.109375 0.34375 -0.109375l6.15625 0q0.21875 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.203125 -0.125 0.3125q-0.109375 0.109375 -0.328125 0.109375l-2.578125 0l0 7.1875q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.34375 0.140625zm5.113556 0q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.546875q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l4.375 0q0.203125 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.203125 -0.125 0.3125q-0.125 0.109375 -0.328125 0.109375l-3.90625 0l0 2.90625l3.65625 0q0.21875 0 0.328125 0.109375q0.125 0.109375 0.125 0.3125q0 0.1875 -0.125 0.296875q-0.109375 0.109375 -0.328125 0.109375l-3.65625 0l0 3.453125q0 0.21875 -0.125 0.359375q-0.125 0.140625 -0.359375 0.140625zm6.6840515 -0.0625q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.328125l0 -7.5625q0 -0.234375 0.125 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.140625 0.125 0.140625 0.359375l0 7.171875l3.875 0q0.21875 0 0.328125 0.109375q0.125 0.109375 0.125 0.3125q0 0.203125 -0.125 0.3125q-0.109375 0.109375 -0.328125 0.109375l-4.375 0zm6.3394165 0.0625q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.125 -0.359375q0.140625 -0.125 0.359375 -0.125q0.21875 0 0.34375 0.125q0.140625 0.125 0.140625 0.359375l0 5.0625q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125zm0 -7.28125q-0.296875 0 -0.484375 -0.171875q-0.171875 -0.171875 -0.171875 -0.453125q0 -0.25 0.171875 -0.421875q0.1875 -0.171875 0.484375 -0.171875q0.28125 0 0.453125 0.171875q0.1875 0.171875 0.1875 0.421875q0 0.28125 -0.1875 0.453125q-0.171875 0.171875 -0.453125 0.171875zm4.987152 6.515625q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm5.9081726 -0.65625q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375z" fill-rule="nonzero"/><path fill="#000000" d="m303.37402 346.47687q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.546875q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l4.375 0q0.203125 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.203125 -0.125 0.3125q-0.125 0.109375 -0.328125 0.109375l-3.90625 0l0 2.90625l3.65625 0q0.21875 0 0.328125 0.109375q0.125 0.109375 0.125 0.3125q0 0.1875 -0.125 0.296875q-0.109375 0.109375 -0.328125 0.109375l-3.65625 0l0 3.453125q0 0.21875 -0.125 0.359375q-0.125 0.140625 -0.359375 0.140625zm6.5434265 0q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -7.625q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.359375 -0.125q0.203125 0 0.34375 0.125q0.140625 0.125 0.140625 0.34375l0 7.625q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125zm4.674652 -6.046875q2.203125 0 2.203125 2.296875l0 3.265625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.578125q-0.21875 0.515625 -0.6875 0.796875q-0.46875 0.28125 -1.078125 0.28125q-0.5625 0 -1.046875 -0.21875q-0.46875 -0.234375 -0.75 -0.640625q-0.265625 -0.40625 -0.265625 -0.90625q0 -0.65625 0.328125 -1.015625q0.34375 -0.375 1.109375 -0.53125q0.765625 -0.15625 2.125 -0.15625l0.265625 0l0 -0.40625q0 -0.71875 -0.296875 -1.046875q-0.28125 -0.34375 -0.953125 -0.34375q-0.8125 0 -1.65625 0.453125q-0.3125 0.203125 -0.453125 0.203125q-0.140625 0 -0.234375 -0.109375q-0.09375 -0.109375 -0.09375 -0.28125q0 -0.171875 0.09375 -0.296875q0.109375 -0.125 0.328125 -0.25q0.421875 -0.25 0.953125 -0.375q0.546875 -0.140625 1.0625 -0.140625zm-0.390625 5.296875q0.71875 0 1.171875 -0.484375q0.46875 -0.484375 0.46875 -1.25l0 -0.34375l-0.21875 0q-1.046875 0 -1.609375 0.09375q-0.546875 0.078125 -0.78125 0.296875q-0.234375 0.203125 -0.234375 0.609375q0 0.46875 0.34375 0.78125q0.34375 0.296875 0.859375 0.296875zm7.0631714 -0.015625q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm4.3300476 -5.28125q0.765625 0 1.34375 0.375q0.59375 0.359375 0.921875 1.046875q0.328125 0.6875 0.328125 1.59375q0 0.90625 -0.328125 1.59375q-0.328125 0.6875 -0.921875 1.078125q-0.578125 0.375 -1.34375 0.375q-0.6875 0 -1.203125 -0.296875q-0.5 -0.296875 -0.765625 -0.84375l0 0.640625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -7.625q0 -0.203125 0.125 -0.328125q0.125 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.125q0.125 0.125 0.125 0.34375l0 3.203125q0.265625 -0.546875 0.765625 -0.84375q0.515625 -0.296875 1.203125 -0.296875zm-0.203125 5.265625q0.859375 0 1.328125 -0.59375q0.46875 -0.59375 0.46875 -1.65625q0 -1.046875 -0.46875 -1.625q-0.46875 -0.578125 -1.328125 -0.578125q-0.84375 0 -1.3125 0.578125q-0.453125 0.578125 -0.453125 1.640625q0 1.0625 0.453125 1.65625q0.46875 0.578125 1.3125 0.578125zm8.687164 -5.25q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.34375l0 5.078125q0 0.203125 -0.125 0.34375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.34375 -0.125q-0.125 -0.125 -0.125 -0.328125l0 -0.609375q-0.28125 0.53125 -0.78125 0.8125q-0.5 0.265625 -1.125 0.265625q-1.03125 0 -1.5625 -0.578125q-0.53125 -0.578125 -0.53125 -1.71875l0 -3.265625q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.34375l0 3.234375q0 0.78125 0.3125 1.15625q0.3125 0.359375 0.984375 0.359375q0.765625 0 1.234375 -0.5q0.46875 -0.5 0.46875 -1.3125l0 -2.9375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625zm4.8726807 -1.71875q-0.640625 0.046875 -0.96875 0.40625q-0.3125 0.34375 -0.3125 1.046875l0 0.390625l1.328125 0q0.203125 0 0.3125 0.109375q0.109375 0.109375 0.109375 0.28125q0 0.1875 -0.109375 0.28125q-0.109375 0.09375 -0.3125 0.09375l-1.328125 0l0 4.65625q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.140625 -0.125 -0.140625 -0.359375l0 -4.65625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -0.21875q0 -1.078125 0.53125 -1.6875q0.546875 -0.625 1.5625 -0.703125l0.3125 -0.015625q0.3125 -0.03125 0.453125 0.0625q0.140625 0.078125 0.140625 0.296875q0 0.34375 -0.421875 0.390625l-0.3125 0.03125zm3.9360352 0q-0.640625 0.046875 -0.96875 0.40625q-0.3125 0.34375 -0.3125 1.046875l0 0.390625l1.328125 0q0.203125 0 0.3125 0.109375q0.109375 0.109375 0.109375 0.28125q0 0.1875 -0.109375 0.28125q-0.109375 0.09375 -0.3125 0.09375l-1.328125 0l0 4.65625q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.140625 -0.125 -0.140625 -0.359375l0 -4.65625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -0.21875q0 -1.078125 0.53125 -1.6875q0.546875 -0.625 1.5625 -0.703125l0.3125 -0.015625q0.3125 -0.03125 0.453125 0.0625q0.140625 0.078125 0.140625 0.296875q0 0.34375 -0.421875 0.390625l-0.3125 0.03125zm5.873535 6.328125q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm6.7927856 -0.734375q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625z" fill-rule="nonzero"/><path fill="#f4cccc" d="m413.02625 319.3983l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m413.02625 319.3983l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m443.6039 332.47687q-0.234375 0 -0.375 -0.140625q-0.140625 -0.140625 -0.140625 -0.359375l0 -7.1875l-2.578125 0q-0.21875 0 -0.34375 -0.109375q-0.109375 -0.109375 -0.109375 -0.3125q0 -0.203125 0.109375 -0.296875q0.125 -0.109375 0.34375 -0.109375l6.15625 0q0.21875 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.203125 -0.125 0.3125q-0.109375 0.109375 -0.328125 0.109375l-2.578125 0l0 7.1875q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.34375 0.140625zm5.113556 0q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.546875q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l4.375 0q0.203125 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.203125 -0.125 0.3125q-0.125 0.109375 -0.328125 0.109375l-3.90625 0l0 2.90625l3.65625 0q0.21875 0 0.328125 0.109375q0.125 0.109375 0.125 0.3125q0 0.1875 -0.125 0.296875q-0.109375 0.109375 -0.328125 0.109375l-3.65625 0l0 3.453125q0 0.21875 -0.125 0.359375q-0.125 0.140625 -0.359375 0.140625zm6.6840515 -0.0625q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.328125l0 -7.5625q0 -0.234375 0.125 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.140625 0.125 0.140625 0.359375l0 7.171875l3.875 0q0.21875 0 0.328125 0.109375q0.125 0.109375 0.125 0.3125q0 0.203125 -0.125 0.3125q-0.109375 0.109375 -0.328125 0.109375l-4.375 0zm6.3394165 0.0625q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.125 -0.359375q0.140625 -0.125 0.359375 -0.125q0.21875 0 0.34375 0.125q0.140625 0.125 0.140625 0.359375l0 5.0625q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125zm0 -7.28125q-0.296875 0 -0.484375 -0.171875q-0.171875 -0.171875 -0.171875 -0.453125q0 -0.25 0.171875 -0.421875q0.1875 -0.171875 0.484375 -0.171875q0.28125 0 0.453125 0.171875q0.1875 0.171875 0.1875 0.421875q0 0.28125 -0.1875 0.453125q-0.171875 0.171875 -0.453125 0.171875zm4.987152 6.515625q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm5.908142 -0.65625q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375z" fill-rule="nonzero"/><path fill="#000000" d="m429.9527 346.47687q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.125 -0.359375q0.140625 -0.125 0.359375 -0.125q0.21875 0 0.34375 0.125q0.140625 0.125 0.140625 0.359375l0 5.0625q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125zm0 -7.28125q-0.296875 0 -0.484375 -0.171875q-0.171875 -0.171875 -0.171875 -0.453125q0 -0.25 0.171875 -0.421875q0.1875 -0.171875 0.484375 -0.171875q0.28125 0 0.453125 0.171875q0.1875 0.171875 0.1875 0.421875q0 0.28125 -0.1875 0.453125q-0.171875 0.171875 -0.453125 0.171875zm5.237152 1.234375q2.09375 0 2.09375 2.3125l0 3.25q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.1875q0 -0.8125 -0.328125 -1.1875q-0.3125 -0.375 -1.0 -0.375q-0.8125 0 -1.296875 0.5q-0.46875 0.484375 -0.46875 1.328125l0 2.921875q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.328125l0 0.609375q0.28125 -0.53125 0.796875 -0.8125q0.53125 -0.28125 1.1875 -0.28125zm6.56604 5.28125q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm5.9081726 -0.65625q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm6.7927856 -0.734375q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm4.282898 -0.015625q0.765625 0 1.34375 0.390625q0.59375 0.375 0.921875 1.0625q0.328125 0.6875 0.328125 1.609375q0 0.90625 -0.328125 1.59375q-0.328125 0.671875 -0.90625 1.046875q-0.578125 0.359375 -1.359375 0.359375q-0.6875 0 -1.203125 -0.296875q-0.5 -0.296875 -0.765625 -0.84375l0 2.8125q0 0.21875 -0.125 0.34375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.140625q-0.125 -0.125 -0.125 -0.328125l0 -7.234375q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.125 0.125 0.125 0.34375l0 0.640625q0.265625 -0.546875 0.765625 -0.84375q0.515625 -0.296875 1.203125 -0.296875zm-0.203125 5.265625q0.859375 0 1.328125 -0.578125q0.46875 -0.578125 0.46875 -1.625q0 -1.0625 -0.46875 -1.65625q-0.46875 -0.59375 -1.328125 -0.59375q-0.84375 0 -1.3125 0.578125q-0.453125 0.578125 -0.453125 1.640625q0 1.0625 0.453125 1.65625q0.46875 0.578125 1.3125 0.578125zm7.14032 -5.25q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm5.861023 4.609375q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm6.5896606 4.53125q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm5.9081726 -0.65625q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm6.7927856 -0.734375q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m371.61902 334.89435l41.417297 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m371.61902 334.89435l37.990234 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m409.60925 334.89435l-1.1245728 1.1246033l3.0897522 -1.1246033l-3.0897522 -1.1245728z" fill-rule="evenodd"/><path fill="#c9daf8" d="m548.5407 277.52954l87.49603 0l0 30.992126l-87.49603 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m548.5407 277.52954l87.49603 0l0 30.992126l-87.49603 0z" fill-rule="evenodd"/><path fill="#000000" d="m587.0588 293.13934q0.1875 0 0.296875 0.109375q0.109375 0.109375 0.109375 0.296875l0 2.984375q0 0.296875 -0.09375 0.4375q-0.078125 0.140625 -0.328125 0.234375q-0.46875 0.203125 -1.15625 0.328125q-0.6875 0.109375 -1.375 0.109375q-1.25 0 -2.171875 -0.515625q-0.90625 -0.515625 -1.390625 -1.484375q-0.484375 -0.96875 -0.484375 -2.328125q0 -1.328125 0.46875 -2.296875q0.484375 -0.984375 1.375 -1.5q0.90625 -0.53125 2.125 -0.53125q0.84375 0 1.5625 0.265625q0.71875 0.25 1.203125 0.734375q0.21875 0.203125 0.21875 0.421875q0 0.171875 -0.109375 0.296875q-0.09375 0.125 -0.234375 0.125q-0.140625 0 -0.328125 -0.140625q-0.625 -0.484375 -1.140625 -0.671875q-0.5 -0.1875 -1.15625 -0.1875q-1.4375 0 -2.203125 0.90625q-0.75 0.890625 -0.75 2.578125q0 1.71875 0.765625 2.609375q0.78125 0.890625 2.28125 0.890625q1.109375 0 2.03125 -0.328125l0 -2.578125l-1.75 0q-0.203125 0 -0.328125 -0.109375q-0.125 -0.109375 -0.125 -0.265625q0 -0.1875 0.125 -0.28125q0.125 -0.109375 0.328125 -0.109375l2.234375 0zm2.8911743 4.46875q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.546875q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l2.84375 0q1.328125 0 2.0625 0.65625q0.75 0.640625 0.75 1.828125q0 1.1875 -0.75 1.84375q-0.734375 0.65625 -2.0625 0.65625l-2.359375 0l0 3.03125q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.359375 0.140625zm2.765625 -4.34375q1.9375 0 1.9375 -1.6875q0 -1.671875 -1.9375 -1.671875l-2.265625 0l0 3.359375l2.265625 0zm7.7869263 4.375q-1.65625 0 -2.515625 -0.859375q-0.84375 -0.859375 -0.84375 -2.546875l0 -4.703125q0 -0.234375 0.125 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.359375l0 4.78125q0 1.25 0.609375 1.875q0.609375 0.609375 1.78125 0.609375q1.171875 0 1.765625 -0.609375q0.609375 -0.625 0.609375 -1.875l0 -4.78125q0 -0.234375 0.140625 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.359375l0 4.703125q0 1.671875 -0.859375 2.546875q-0.859375 0.859375 -2.5 0.859375z" fill-rule="nonzero"/><path fill="#c9daf8" d="m548.5407 319.3983l87.49603 0l0 30.992126l-87.49603 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m548.5407 319.3983l87.49603 0l0 30.992126l-87.49603 0z" fill-rule="evenodd"/><path fill="#000000" d="m584.63763 339.50812q-1.1875 0 -2.0625 -0.515625q-0.875 -0.53125 -1.359375 -1.5q-0.46875 -0.984375 -0.46875 -2.3125q0 -1.328125 0.46875 -2.296875q0.484375 -0.984375 1.359375 -1.5q0.875 -0.53125 2.0625 -0.53125q0.8125 0 1.515625 0.265625q0.71875 0.25 1.25 0.734375q0.1875 0.1875 0.1875 0.421875q0 0.171875 -0.09375 0.296875q-0.09375 0.125 -0.21875 0.125q-0.15625 0 -0.359375 -0.140625q-0.609375 -0.46875 -1.109375 -0.65625q-0.5 -0.203125 -1.140625 -0.203125q-1.390625 0 -2.140625 0.90625q-0.75 0.90625 -0.75 2.578125q0 1.671875 0.75 2.578125q0.75 0.90625 2.140625 0.90625q0.640625 0 1.140625 -0.1875q0.5 -0.1875 1.109375 -0.671875q0.203125 -0.125 0.359375 -0.125q0.125 0 0.21875 0.125q0.09375 0.109375 0.09375 0.296875q0 0.234375 -0.1875 0.40625q-0.53125 0.484375 -1.25 0.75q-0.703125 0.25 -1.515625 0.25zm5.0302734 -0.03125q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.546875q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l2.84375 0q1.328125 0 2.0625 0.65625q0.75 0.640625 0.75 1.828125q0 1.1875 -0.75 1.84375q-0.734375 0.65625 -2.0625 0.65625l-2.359375 0l0 3.03125q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.359375 0.140625zm2.765625 -4.34375q1.9375 0 1.9375 -1.6875q0 -1.671875 -1.9375 -1.671875l-2.265625 0l0 3.359375l2.265625 0zm7.7869263 4.375q-1.65625 0 -2.515625 -0.859375q-0.84375 -0.859375 -0.84375 -2.546875l0 -4.703125q0 -0.234375 0.125 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.359375l0 4.78125q0 1.25 0.609375 1.875q0.609375 0.609375 1.78125 0.609375q1.171875 0 1.765625 -0.609375q0.609375 -0.625 0.609375 -1.875l0 -4.78125q0 -0.234375 0.140625 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.359375l0 4.703125q0 1.671875 -0.859375 2.546875q-0.859375 0.859375 -2.5 0.859375z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m241.86351 334.89435l42.267715 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m241.86351 334.89435l38.840652 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m280.70413 334.89435l-1.1245728 1.1246033l3.0897827 -1.1246033l-3.0897827 -1.1245728z" fill-rule="evenodd"/><path fill="#d9ead3" d="m413.02625 141.28871l20.53543 0l0 20.53543l-20.53543 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m413.02625 141.28871l20.53543 0l0 20.53543l-20.53543 0z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m437.52493 135.68242l73.763794 0l0 31.748032l-73.763794 0z" fill-rule="evenodd"/><path fill="#000000" d="m448.0718 156.20241q-0.234375 0 -0.375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -7.5q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l2.34375 0q2.03125 0 3.140625 1.09375q1.109375 1.09375 1.109375 3.125q0 2.03125 -1.125 3.140625q-1.109375 1.09375 -3.125 1.09375l-2.34375 0zm2.28125 -0.84375q3.28125 0 3.28125 -3.390625q0 -3.390625 -3.28125 -3.390625l-1.796875 0l0 6.78125l1.796875 0zm8.3211975 -5.140625q2.203125 0 2.203125 2.296875l0 3.265625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.578125q-0.21875 0.515625 -0.6875 0.796875q-0.46875 0.28125 -1.078125 0.28125q-0.5625 0 -1.046875 -0.21875q-0.46875 -0.234375 -0.75 -0.640625q-0.265625 -0.40625 -0.265625 -0.90625q0 -0.65625 0.328125 -1.015625q0.34375 -0.375 1.109375 -0.53125q0.765625 -0.15625 2.125 -0.15625l0.265625 0l0 -0.40625q0 -0.71875 -0.296875 -1.046875q-0.28125 -0.34375 -0.953125 -0.34375q-0.8125 0 -1.65625 0.453125q-0.3125 0.203125 -0.453125 0.203125q-0.140625 0 -0.234375 -0.109375q-0.09375 -0.109375 -0.09375 -0.28125q0 -0.171875 0.09375 -0.296875q0.109375 -0.125 0.328125 -0.25q0.421875 -0.25 0.953125 -0.375q0.546875 -0.140625 1.0625 -0.140625zm-0.390625 5.296875q0.71875 0 1.171875 -0.484375q0.46875 -0.484375 0.46875 -1.25l0 -0.34375l-0.21875 0q-1.046875 0 -1.609375 0.09375q-0.546875 0.078125 -0.78125 0.296875q-0.234375 0.203125 -0.234375 0.609375q0 0.46875 0.34375 0.78125q0.34375 0.296875 0.859375 0.296875zm7.0631714 -0.015625q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm3.767517 -5.28125q2.203125 0 2.203125 2.296875l0 3.265625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.578125q-0.21875 0.515625 -0.6875 0.796875q-0.46875 0.28125 -1.078125 0.28125q-0.5625 0 -1.046875 -0.21875q-0.46875 -0.234375 -0.75 -0.640625q-0.265625 -0.40625 -0.265625 -0.90625q0 -0.65625 0.328125 -1.015625q0.34375 -0.375 1.109375 -0.53125q0.765625 -0.15625 2.125 -0.15625l0.265625 0l0 -0.40625q0 -0.71875 -0.296875 -1.046875q-0.28125 -0.34375 -0.953125 -0.34375q-0.8125 0 -1.65625 0.453125q-0.3125 0.203125 -0.453125 0.203125q-0.140625 0 -0.234375 -0.109375q-0.09375 -0.109375 -0.09375 -0.28125q0 -0.171875 0.09375 -0.296875q0.109375 -0.125 0.328125 -0.25q0.421875 -0.25 0.953125 -0.375q0.546875 -0.140625 1.0625 -0.140625zm-0.390625 5.296875q0.71875 0 1.171875 -0.484375q0.46875 -0.484375 0.46875 -1.25l0 -0.34375l-0.21875 0q-1.046875 0 -1.609375 0.09375q-0.546875 0.078125 -0.78125 0.296875q-0.234375 0.203125 -0.234375 0.609375q0 0.46875 0.34375 0.78125q0.34375 0.296875 0.859375 0.296875zm10.15921 0.75q-0.234375 0 -0.375 -0.140625q-0.140625 -0.140625 -0.140625 -0.359375l0 -7.1875l-2.578125 0q-0.21875 0 -0.34375 -0.109375q-0.109375 -0.109375 -0.109375 -0.3125q0 -0.203125 0.109375 -0.296875q0.125 -0.109375 0.34375 -0.109375l6.15625 0q0.21875 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.203125 -0.125 0.3125q-0.109375 0.109375 -0.328125 0.109375l-2.578125 0l0 7.1875q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.34375 0.140625zm8.691681 -5.71875q0.140625 -0.296875 0.421875 -0.296875q0.1875 0 0.328125 0.125q0.140625 0.109375 0.140625 0.296875q0 0.109375 -0.046875 0.1875l-3.375 7.28125q-0.0625 0.125 -0.171875 0.1875q-0.109375 0.078125 -0.234375 0.078125q-0.1875 0 -0.328125 -0.109375q-0.125 -0.109375 -0.125 -0.296875q0 -0.09375 0.046875 -0.1875l0.84375 -1.8125l-2.375 -5.140625q-0.046875 -0.078125 -0.046875 -0.171875q0 -0.1875 0.15625 -0.3125q0.15625 -0.140625 0.359375 -0.140625q0.109375 0 0.21875 0.078125q0.125 0.078125 0.1875 0.203125l2.0 4.5l2.0 -4.46875zm4.902405 -0.328125q0.765625 0 1.34375 0.390625q0.59375 0.375 0.921875 1.0625q0.328125 0.6875 0.328125 1.609375q0 0.90625 -0.328125 1.59375q-0.328125 0.671875 -0.90625 1.046875q-0.578125 0.359375 -1.359375 0.359375q-0.6875 0 -1.203125 -0.296875q-0.5 -0.296875 -0.765625 -0.84375l0 2.8125q0 0.21875 -0.125 0.34375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.140625q-0.125 -0.125 -0.125 -0.328125l0 -7.234375q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.125 0.125 0.125 0.34375l0 0.640625q0.265625 -0.546875 0.765625 -0.84375q0.515625 -0.296875 1.203125 -0.296875zm-0.203125 5.265625q0.859375 0 1.328125 -0.578125q0.46875 -0.578125 0.46875 -1.625q0 -1.0625 -0.46875 -1.65625q-0.46875 -0.59375 -1.328125 -0.59375q-0.84375 0 -1.3125 0.578125q-0.453125 0.578125 -0.453125 1.640625q0 1.0625 0.453125 1.65625q0.46875 0.578125 1.3125 0.578125zm8.76532 -0.640625q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375z" fill-rule="nonzero"/><path fill="#f4cccc" d="m519.9029 141.28871l20.5354 0l0 20.53543l-20.5354 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m519.9029 141.28871l20.5354 0l0 20.53543l-20.5354 0z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m544.40155 135.68242l100.0 0l0 31.748032l-100.0 0z" fill-rule="evenodd"/><path fill="#000000" d="m554.9328 156.26491q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.578125q0 -0.234375 0.125 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.140625 0.125 0.140625 0.359375l0 7.578125q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.359375 0.140625zm5.3845215 -6.046875q2.09375 0 2.09375 2.3125l0 3.25q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.1875q0 -0.8125 -0.328125 -1.1875q-0.3125 -0.375 -1.0 -0.375q-0.8125 0 -1.296875 0.5q-0.46875 0.484375 -0.46875 1.328125l0 2.921875q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.328125l0 0.609375q0.28125 -0.53125 0.796875 -0.8125q0.53125 -0.28125 1.1875 -0.28125zm6.456726 -1.703125q-0.640625 0.046875 -0.96875 0.40625q-0.3125 0.34375 -0.3125 1.046875l0 0.390625l1.328125 0q0.203125 0 0.3125 0.109375q0.109375 0.109375 0.109375 0.28125q0 0.1875 -0.109375 0.28125q-0.109375 0.09375 -0.3125 0.09375l-1.328125 0l0 4.65625q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.140625 -0.125 -0.140625 -0.359375l0 -4.65625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -0.21875q0 -1.078125 0.53125 -1.6875q0.546875 -0.625 1.5625 -0.703125l0.3125 -0.015625q0.3125 -0.03125 0.453125 0.0625q0.140625 0.078125 0.140625 0.296875q0 0.34375 -0.421875 0.390625l-0.3125 0.03125zm4.248535 1.71875q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm3.720398 -0.015625q2.203125 0 2.203125 2.296875l0 3.265625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.578125q-0.21875 0.515625 -0.6875 0.796875q-0.46875 0.28125 -1.078125 0.28125q-0.5625 0 -1.046875 -0.21875q-0.46875 -0.234375 -0.75 -0.640625q-0.265625 -0.40625 -0.265625 -0.90625q0 -0.65625 0.328125 -1.015625q0.34375 -0.375 1.109375 -0.53125q0.765625 -0.15625 2.125 -0.15625l0.265625 0l0 -0.40625q0 -0.71875 -0.296875 -1.046875q-0.28125 -0.34375 -0.953125 -0.34375q-0.8125 0 -1.65625 0.453125q-0.3125 0.203125 -0.453125 0.203125q-0.140625 0 -0.234375 -0.109375q-0.09375 -0.109375 -0.09375 -0.28125q0 -0.171875 0.09375 -0.296875q0.109375 -0.125 0.328125 -0.25q0.421875 -0.25 0.953125 -0.375q0.546875 -0.140625 1.0625 -0.140625zm-0.390625 5.296875q0.71875 0 1.171875 -0.484375q0.46875 -0.484375 0.46875 -1.25l0 -0.34375l-0.21875 0q-1.046875 0 -1.609375 0.09375q-0.546875 0.078125 -0.78125 0.296875q-0.234375 0.203125 -0.234375 0.609375q0 0.46875 0.34375 0.78125q0.34375 0.296875 0.859375 0.296875zm6.3444214 0.765625q-0.5625 0 -1.0625 -0.125q-0.5 -0.140625 -0.875 -0.375q-0.21875 -0.140625 -0.3125 -0.265625q-0.078125 -0.125 -0.078125 -0.3125q0 -0.15625 0.078125 -0.25q0.09375 -0.109375 0.234375 -0.109375q0.15625 0 0.421875 0.1875q0.359375 0.21875 0.71875 0.34375q0.359375 0.125 0.875 0.125q0.65625 0 1.015625 -0.21875q0.359375 -0.234375 0.359375 -0.671875q0 -0.265625 -0.140625 -0.421875q-0.125 -0.171875 -0.453125 -0.296875q-0.3125 -0.125 -0.9375 -0.25q-1.0625 -0.234375 -1.515625 -0.609375q-0.453125 -0.390625 -0.453125 -1.046875q0 -0.515625 0.28125 -0.90625q0.28125 -0.40625 0.796875 -0.625q0.515625 -0.234375 1.15625 -0.234375q0.46875 0 0.90625 0.125q0.4375 0.125 0.78125 0.34375q0.40625 0.296875 0.40625 0.609375q0 0.15625 -0.09375 0.265625q-0.09375 0.109375 -0.234375 0.109375q-0.140625 0 -0.4375 -0.203125q-0.328125 -0.21875 -0.625 -0.34375q-0.296875 -0.125 -0.75 -0.125q-0.5625 0 -0.90625 0.265625q-0.34375 0.25 -0.34375 0.671875q0 0.25 0.125 0.421875q0.125 0.15625 0.421875 0.28125q0.296875 0.125 0.84375 0.25q0.828125 0.1875 1.265625 0.40625q0.453125 0.203125 0.640625 0.515625q0.203125 0.3125 0.203125 0.796875q0 0.75 -0.640625 1.21875q-0.640625 0.453125 -1.671875 0.453125zm6.47876 -0.78125q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm4.283142 -5.265625q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm5.782898 0q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.34375l0 5.078125q0 0.203125 -0.125 0.34375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.34375 -0.125q-0.125 -0.125 -0.125 -0.328125l0 -0.609375q-0.28125 0.53125 -0.78125 0.8125q-0.5 0.265625 -1.125 0.265625q-1.03125 0 -1.5625 -0.578125q-0.53125 -0.578125 -0.53125 -1.71875l0 -3.265625q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.34375l0 3.234375q0 0.78125 0.3125 1.15625q0.3125 0.359375 0.984375 0.359375q0.765625 0 1.234375 -0.5q0.46875 -0.5 0.46875 -1.3125l0 -2.9375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625zm4.7008057 6.046875q-0.8125 0 -1.453125 -0.359375q-0.625 -0.375 -0.96875 -1.0625q-0.34375 -0.6875 -0.34375 -1.578125q0 -0.90625 0.359375 -1.59375q0.359375 -0.703125 0.984375 -1.078125q0.640625 -0.390625 1.46875 -0.390625q0.453125 0 0.90625 0.125q0.453125 0.125 0.78125 0.359375q0.21875 0.140625 0.3125 0.28125q0.09375 0.140625 0.09375 0.3125q0 0.171875 -0.09375 0.28125q-0.09375 0.09375 -0.234375 0.09375q-0.078125 0 -0.1875 -0.046875q-0.09375 -0.046875 -0.15625 -0.09375q-0.0625 -0.046875 -0.09375 -0.0625q-0.3125 -0.203125 -0.59375 -0.3125q-0.28125 -0.125 -0.6875 -0.125q-0.875 0 -1.359375 0.59375q-0.484375 0.59375 -0.484375 1.65625q0 1.046875 0.484375 1.625q0.484375 0.578125 1.359375 0.578125q0.40625 0 0.703125 -0.109375q0.296875 -0.125 0.59375 -0.328125q0.140625 -0.09375 0.25 -0.15625q0.125 -0.0625 0.203125 -0.0625q0.140625 0 0.21875 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.15625 -0.09375 0.28125q-0.078125 0.125 -0.296875 0.28125q-0.34375 0.234375 -0.8125 0.375q-0.46875 0.125 -0.953125 0.125zm6.029297 -0.78125q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm5.830017 -5.265625q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.34375l0 5.078125q0 0.203125 -0.125 0.34375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.34375 -0.125q-0.125 -0.125 -0.125 -0.328125l0 -0.609375q-0.28125 0.53125 -0.78125 0.8125q-0.5 0.265625 -1.125 0.265625q-1.03125 0 -1.5625 -0.578125q-0.53125 -0.578125 -0.53125 -1.71875l0 -3.265625q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.34375l0 3.234375q0 0.78125 0.3125 1.15625q0.3125 0.359375 0.984375 0.359375q0.765625 0 1.234375 -0.5q0.46875 -0.5 0.46875 -1.3125l0 -2.9375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625zm5.1851807 0q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm5.861023 4.609375q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375z" fill-rule="nonzero"/><path fill="#d9ead3" d="m78.872284 252.53609l87.49606 0l0 30.992142l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m78.872284 252.53609l87.49606 0l0 30.992142l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m93.328064 272.6459q-0.90625 0 -1.734375 -0.265625q-0.8125 -0.265625 -1.3125 -0.734375q-0.171875 -0.15625 -0.171875 -0.40625q0 -0.171875 0.09375 -0.296875q0.09375 -0.125 0.234375 -0.125q0.15625 0 0.328125 0.125q1.109375 0.859375 2.546875 0.859375q1.03125 0 1.578125 -0.390625q0.5625 -0.390625 0.5625 -1.125q0 -0.421875 -0.265625 -0.671875q-0.265625 -0.265625 -0.703125 -0.421875q-0.4375 -0.15625 -1.15625 -0.328125q-0.984375 -0.21875 -1.625 -0.46875q-0.625 -0.265625 -1.015625 -0.734375q-0.390625 -0.46875 -0.390625 -1.21875q0 -0.71875 0.390625 -1.265625q0.390625 -0.5625 1.09375 -0.875q0.703125 -0.3125 1.59375 -0.3125q0.84375 0 1.5625 0.265625q0.734375 0.25 1.234375 0.734375q0.1875 0.1875 0.1875 0.421875q0 0.171875 -0.09375 0.296875q-0.09375 0.125 -0.234375 0.125q-0.125 0 -0.34375 -0.140625q-0.59375 -0.46875 -1.09375 -0.65625q-0.5 -0.203125 -1.21875 -0.203125q-0.984375 0 -1.546875 0.421875q-0.546875 0.40625 -0.546875 1.15625q0 0.625 0.484375 0.953125q0.484375 0.3125 1.5 0.5625q1.09375 0.25 1.71875 0.484375q0.625 0.21875 1.03125 0.671875q0.421875 0.4375 0.421875 1.171875q0 0.71875 -0.390625 1.265625q-0.390625 0.53125 -1.109375 0.828125q-0.703125 0.296875 -1.609375 0.296875zm6.9353027 -6.078125q2.203125 0 2.203125 2.296875l0 3.265625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.578125q-0.21875 0.515625 -0.6875 0.796875q-0.46875 0.28125 -1.078125 0.28125q-0.5625 0 -1.046875 -0.21875q-0.46875 -0.234375 -0.75 -0.640625q-0.265625 -0.40625 -0.265625 -0.90625q0 -0.65625 0.328125 -1.015625q0.34375 -0.375 1.109375 -0.53125q0.765625 -0.15625 2.125 -0.15625l0.265625 0l0 -0.40625q0 -0.71875 -0.296875 -1.046875q-0.28125 -0.34375 -0.953125 -0.34375q-0.8125 0 -1.65625 0.453125q-0.3125 0.203125 -0.453125 0.203125q-0.140625 0 -0.234375 -0.109375q-0.09375 -0.109375 -0.09375 -0.28125q0 -0.171875 0.09375 -0.296875q0.109375 -0.125 0.328125 -0.25q0.421875 -0.25 0.953125 -0.375q0.546875 -0.140625 1.0625 -0.140625zm-0.390625 5.296875q0.71875 0 1.171875 -0.484375q0.46875 -0.484375 0.46875 -1.25l0 -0.34375l-0.21875 0q-1.046875 0 -1.609375 0.09375q-0.546875 0.078125 -0.78125 0.296875q-0.234375 0.203125 -0.234375 0.609375q0 0.46875 0.34375 0.78125q0.34375 0.296875 0.859375 0.296875zm8.578796 -4.96875q0.140625 -0.296875 0.421875 -0.296875q0.1875 0 0.328125 0.125q0.140625 0.109375 0.140625 0.296875q0 0.109375 -0.046875 0.1875l-2.34375 5.046875q-0.0625 0.15625 -0.21875 0.25q-0.140625 0.078125 -0.3125 0.078125q-0.15625 0 -0.296875 -0.078125q-0.140625 -0.09375 -0.21875 -0.25l-2.328125 -5.046875q-0.046875 -0.078125 -0.046875 -0.171875q0 -0.1875 0.15625 -0.3125q0.15625 -0.140625 0.359375 -0.140625q0.109375 0 0.21875 0.078125q0.125 0.078125 0.1875 0.203125l2.0 4.5l2.0 -4.46875zm6.480545 4.296875q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm8.589676 -3.28125q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.328125l0 7.625q0 0.21875 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.640625q-0.265625 0.546875 -0.78125 0.84375q-0.5 0.296875 -1.1875 0.296875q-0.765625 0 -1.359375 -0.375q-0.578125 -0.390625 -0.90625 -1.078125q-0.328125 -0.6875 -0.328125 -1.59375q0 -0.90625 0.328125 -1.59375q0.328125 -0.6875 0.90625 -1.046875q0.59375 -0.375 1.359375 -0.375q0.6875 0 1.1875 0.296875q0.515625 0.296875 0.78125 0.84375l0 -3.203125q0 -0.21875 0.125 -0.34375q0.125 -0.125 0.359375 -0.125zm-2.25 7.796875q0.84375 0 1.296875 -0.578125q0.46875 -0.59375 0.46875 -1.65625q0 -1.0625 -0.46875 -1.640625q-0.453125 -0.578125 -1.296875 -0.578125q-0.859375 0 -1.34375 0.578125q-0.46875 0.578125 -0.46875 1.625q0 1.0625 0.46875 1.65625q0.484375 0.59375 1.34375 0.59375zm12.202805 -7.796875q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.359375l0 7.59375q0 0.21875 -0.125 0.359375q-0.109375 0.125 -0.328125 0.125q-0.21875 0 -0.328125 -0.125q-0.109375 -0.140625 -0.109375 -0.359375l0 -6.125l-2.59375 4.984375q-0.171875 0.34375 -0.5 0.34375q-0.3125 0 -0.484375 -0.34375l-2.625 -4.921875l0 6.0625q0 0.21875 -0.109375 0.359375q-0.109375 0.125 -0.328125 0.125q-0.21875 0 -0.34375 -0.125q-0.109375 -0.140625 -0.109375 -0.359375l0 -7.59375q0 -0.234375 0.125 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.3125 0 0.484375 0.34375l3.046875 5.84375l3.015625 -5.84375q0.09375 -0.1875 0.203125 -0.265625q0.125 -0.078125 0.28125 -0.078125zm4.8576965 8.59375q-0.828125 0 -1.46875 -0.359375q-0.625 -0.375 -0.96875 -1.0625q-0.34375 -0.703125 -0.34375 -1.609375q0 -0.90625 0.34375 -1.59375q0.34375 -0.703125 0.96875 -1.0625q0.640625 -0.375 1.46875 -0.375q0.828125 0 1.453125 0.375q0.640625 0.359375 0.984375 1.0625q0.34375 0.6875 0.34375 1.59375q0 0.90625 -0.34375 1.609375q-0.34375 0.6875 -0.984375 1.0625q-0.625 0.359375 -1.453125 0.359375zm0 -0.796875q0.859375 0 1.3125 -0.5625q0.46875 -0.578125 0.46875 -1.671875q0 -1.0625 -0.46875 -1.640625q-0.46875 -0.59375 -1.3125 -0.59375q-0.859375 0 -1.328125 0.59375q-0.46875 0.578125 -0.46875 1.640625q0 1.078125 0.453125 1.65625q0.46875 0.578125 1.34375 0.578125zm8.925674 -7.796875q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.328125l0 7.625q0 0.21875 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.640625q-0.265625 0.546875 -0.78125 0.84375q-0.5 0.296875 -1.1875 0.296875q-0.765625 0 -1.359375 -0.375q-0.578125 -0.390625 -0.90625 -1.078125q-0.328125 -0.6875 -0.328125 -1.59375q0 -0.90625 0.328125 -1.59375q0.328125 -0.6875 0.90625 -1.046875q0.59375 -0.375 1.359375 -0.375q0.6875 0 1.1875 0.296875q0.515625 0.296875 0.78125 0.84375l0 -3.203125q0 -0.21875 0.125 -0.34375q0.125 -0.125 0.359375 -0.125zm-2.25 7.796875q0.84375 0 1.296875 -0.578125q0.46875 -0.59375 0.46875 -1.65625q0 -1.0625 -0.46875 -1.640625q-0.453125 -0.578125 -1.296875 -0.578125q-0.859375 0 -1.34375 0.578125q-0.46875 0.578125 -0.46875 1.625q0 1.0625 0.46875 1.65625q0.484375 0.59375 1.34375 0.59375zm9.06218 -0.640625q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm4.386551 5.296875q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -7.625q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.359375 -0.125q0.203125 0 0.34375 0.125q0.140625 0.125 0.140625 0.34375l0 7.625q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125z" fill-rule="nonzero"/><path fill="#d9ead3" d="m190.14 134.76706l87.49608 0l0 30.992126l-87.49608 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m190.14 134.76706l87.49608 0l0 30.992126l-87.49608 0z" fill-rule="evenodd"/><path fill="#000000" d="m215.10997 150.37688q0.1875 0 0.296875 0.109375q0.109375 0.109375 0.109375 0.296875l0 2.984375q0 0.296875 -0.09375 0.4375q-0.078125 0.140625 -0.328125 0.234375q-0.46875 0.203125 -1.15625 0.328125q-0.6875 0.109375 -1.375 0.109375q-1.25 0 -2.171875 -0.515625q-0.90625 -0.515625 -1.390625 -1.484375q-0.484375 -0.96875 -0.484375 -2.328125q0 -1.328125 0.46875 -2.296875q0.484375 -0.984375 1.375 -1.5q0.90625 -0.53125 2.125 -0.53125q0.84375 0 1.5625 0.265625q0.71875 0.25 1.203125 0.734375q0.21875 0.203125 0.21875 0.421875q0 0.171875 -0.109375 0.296875q-0.09375 0.125 -0.234375 0.125q-0.140625 0 -0.328125 -0.140625q-0.625 -0.484375 -1.140625 -0.671875q-0.5 -0.1875 -1.15625 -0.1875q-1.4375 0 -2.203125 0.90625q-0.75 0.890625 -0.75 2.578125q0 1.71875 0.765625 2.609375q0.78125 0.890625 2.28125 0.890625q1.109375 0 2.03125 -0.328125l0 -2.578125l-1.75 0q-0.203125 0 -0.328125 -0.109375q-0.125 -0.109375 -0.125 -0.265625q0 -0.1875 0.125 -0.28125q0.125 -0.109375 0.328125 -0.109375l2.234375 0zm5.1568146 -1.5625q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm3.720398 -0.015625q2.203125 0 2.203125 2.296875l0 3.265625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.578125q-0.21875 0.515625 -0.6875 0.796875q-0.46875 0.28125 -1.078125 0.28125q-0.5625 0 -1.046875 -0.21875q-0.46875 -0.234375 -0.75 -0.640625q-0.265625 -0.40625 -0.265625 -0.90625q0 -0.65625 0.328125 -1.015625q0.34375 -0.375 1.109375 -0.53125q0.765625 -0.15625 2.125 -0.15625l0.265625 0l0 -0.40625q0 -0.71875 -0.296875 -1.046875q-0.28125 -0.34375 -0.953125 -0.34375q-0.8125 0 -1.65625 0.453125q-0.3125 0.203125 -0.453125 0.203125q-0.140625 0 -0.234375 -0.109375q-0.09375 -0.109375 -0.09375 -0.28125q0 -0.171875 0.09375 -0.296875q0.109375 -0.125 0.328125 -0.25q0.421875 -0.25 0.953125 -0.375q0.546875 -0.140625 1.0625 -0.140625zm-0.390625 5.296875q0.71875 0 1.171875 -0.484375q0.46875 -0.484375 0.46875 -1.25l0 -0.34375l-0.21875 0q-1.046875 0 -1.609375 0.09375q-0.546875 0.078125 -0.78125 0.296875q-0.234375 0.203125 -0.234375 0.609375q0 0.46875 0.34375 0.78125q0.34375 0.296875 0.859375 0.296875zm7.3131714 -5.296875q0.765625 0 1.34375 0.390625q0.59375 0.375 0.921875 1.0625q0.328125 0.6875 0.328125 1.609375q0 0.90625 -0.328125 1.59375q-0.328125 0.671875 -0.90625 1.046875q-0.578125 0.359375 -1.359375 0.359375q-0.6875 0 -1.203125 -0.296875q-0.5 -0.296875 -0.765625 -0.84375l0 2.8125q0 0.21875 -0.125 0.34375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.140625q-0.125 -0.125 -0.125 -0.328125l0 -7.234375q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.125 0.125 0.125 0.34375l0 0.640625q0.265625 -0.546875 0.765625 -0.84375q0.515625 -0.296875 1.203125 -0.296875zm-0.203125 5.265625q0.859375 0 1.328125 -0.578125q0.46875 -0.578125 0.46875 -1.625q0 -1.0625 -0.46875 -1.65625q-0.46875 -0.59375 -1.328125 -0.59375q-0.84375 0 -1.3125 0.578125q-0.453125 0.578125 -0.453125 1.640625q0 1.0625 0.453125 1.65625q0.46875 0.578125 1.3125 0.578125zm7.2028046 -5.265625q1.03125 0 1.546875 0.578125q0.53125 0.578125 0.53125 1.734375l0 3.25q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.21875q0 -0.78125 -0.328125 -1.15625q-0.3125 -0.375 -1.0 -0.375q-0.8125 0 -1.296875 0.5q-0.46875 0.484375 -0.46875 1.328125l0 2.921875q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -7.625q0 -0.203125 0.125 -0.328125q0.140625 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.125q0.125 0.125 0.125 0.34375l0 3.140625q0.28125 -0.53125 0.796875 -0.796875q0.515625 -0.28125 1.1875 -0.28125zm4.5035553 5.984375q-0.234375 0 -0.375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -7.5q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l2.34375 0q2.03125 0 3.140625 1.09375q1.109375 1.09375 1.109375 3.125q0 2.03125 -1.125 3.140625q-1.109375 1.09375 -3.125 1.09375l-2.34375 0zm2.28125 -0.84375q3.28125 0 3.28125 -3.390625q0 -3.390625 -3.28125 -3.390625l-1.796875 0l0 6.78125l1.796875 0zm10.461807 -0.515625q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm6.480301 -2.453125q-0.640625 0.046875 -0.96875 0.40625q-0.3125 0.34375 -0.3125 1.046875l0 0.390625l1.328125 0q0.203125 0 0.3125 0.109375q0.109375 0.109375 0.109375 0.28125q0 0.1875 -0.109375 0.28125q-0.109375 0.09375 -0.3125 0.09375l-1.328125 0l0 4.65625q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.140625 -0.125 -0.140625 -0.359375l0 -4.65625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -0.21875q0 -1.078125 0.53125 -1.6875q0.546875 -0.625 1.5625 -0.703125l0.3125 -0.015625q0.3125 -0.03125 0.453125 0.0625q0.140625 0.078125 0.140625 0.296875q0 0.34375 -0.421875 0.390625l-0.3125 0.03125z" fill-rule="nonzero"/><path fill="#d9ead3" d="m233.1085 252.53609l87.49608 0l0 30.992142l-87.49608 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m233.1085 252.53609l87.49608 0l0 30.992142l-87.49608 0z" fill-rule="evenodd"/><path fill="#000000" d="m260.00964 265.61465q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.546875q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l4.375 0q0.203125 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.203125 -0.125 0.3125q-0.125 0.109375 -0.328125 0.109375l-3.90625 0l0 2.90625l3.65625 0q0.21875 0 0.328125 0.109375q0.125 0.109375 0.125 0.3125q0 0.1875 -0.125 0.296875q-0.109375 0.109375 -0.328125 0.109375l-3.65625 0l0 3.453125q0 0.21875 -0.125 0.359375q-0.125 0.140625 -0.359375 0.140625zm8.9496765 -6.03125q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm3.767273 6.046875q-0.828125 0 -1.46875 -0.359375q-0.625 -0.375 -0.96875 -1.0625q-0.34375 -0.703125 -0.34375 -1.609375q0 -0.90625 0.34375 -1.59375q0.34375 -0.703125 0.96875 -1.0625q0.640625 -0.375 1.46875 -0.375q0.828125 0 1.453125 0.375q0.640625 0.359375 0.984375 1.0625q0.34375 0.6875 0.34375 1.59375q0 0.90625 -0.34375 1.609375q-0.34375 0.6875 -0.984375 1.0625q-0.625 0.359375 -1.453125 0.359375zm0 -0.796875q0.859375 0 1.3125 -0.5625q0.46875 -0.578125 0.46875 -1.671875q0 -1.0625 -0.46875 -1.640625q-0.46875 -0.59375 -1.3125 -0.59375q-0.859375 0 -1.328125 0.59375q-0.46875 0.578125 -0.46875 1.640625q0 1.078125 0.453125 1.65625q0.46875 0.578125 1.34375 0.578125zm8.535065 -0.046875q0.203125 0 0.296875 0.109375q0.109375 0.09375 0.109375 0.265625q0 0.1875 -0.109375 0.296875q-0.09375 0.09375 -0.296875 0.09375l-4.203125 0q-0.203125 0 -0.34375 -0.125q-0.125 -0.125 -0.125 -0.3125q0 -0.1875 0.140625 -0.359375l3.546875 -4.28125l-3.28125 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l4.0625 0q0.21875 0 0.34375 0.125q0.140625 0.125 0.140625 0.3125q0 0.1875 -0.140625 0.359375l-3.5625 4.28125l3.421875 0zm6.2547913 -0.59375q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm6.8396606 -0.75q2.09375 0 2.09375 2.3125l0 3.25q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.1875q0 -0.8125 -0.328125 -1.1875q-0.3125 -0.375 -1.0 -0.375q-0.8125 0 -1.296875 0.5q-0.46875 0.484375 -0.46875 1.328125l0 2.921875q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.328125l0 0.609375q0.28125 -0.53125 0.796875 -0.8125q0.53125 -0.28125 1.1875 -0.28125z" fill-rule="nonzero"/><path fill="#000000" d="m258.07846 275.1459q0.1875 0 0.296875 0.109375q0.109375 0.109375 0.109375 0.296875l0 2.984375q0 0.296875 -0.09375 0.4375q-0.078125 0.140625 -0.328125 0.234375q-0.46875 0.203125 -1.15625 0.328125q-0.6875 0.109375 -1.3749847 0.109375q-1.25 0 -2.171875 -0.515625q-0.90625 -0.515625 -1.390625 -1.484375q-0.484375 -0.96875 -0.484375 -2.328125q0 -1.328125 0.46875 -2.296875q0.484375 -0.984375 1.375 -1.5q0.90625 -0.53125 2.125 -0.53125q0.84373474 0 1.5624847 0.265625q0.71875 0.25 1.203125 0.734375q0.21875 0.203125 0.21875 0.421875q0 0.171875 -0.109375 0.296875q-0.09375 0.125 -0.234375 0.125q-0.140625 0 -0.328125 -0.140625q-0.625 -0.484375 -1.140625 -0.671875q-0.5 -0.1875 -1.1562347 -0.1875q-1.4375 0 -2.203125 0.90625q-0.75 0.890625 -0.75 2.578125q0 1.71875 0.765625 2.609375q0.78125 0.890625 2.28125 0.890625q1.1093597 0 2.0312347 -0.328125l0 -2.578125l-1.7499847 0q-0.203125 0 -0.328125 -0.109375q-0.125 -0.109375 -0.125 -0.265625q0 -0.1875 0.125 -0.28125q0.125 -0.109375 0.328125 -0.109375l2.2343597 0zm5.15683 -1.5625q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm3.720398 -0.015625q2.203125 0 2.203125 2.296875l0 3.265625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.578125q-0.21875 0.515625 -0.6875 0.796875q-0.46875 0.28125 -1.078125 0.28125q-0.5625 0 -1.046875 -0.21875q-0.46875 -0.234375 -0.75 -0.640625q-0.265625 -0.40625 -0.265625 -0.90625q0 -0.65625 0.328125 -1.015625q0.34375 -0.375 1.109375 -0.53125q0.765625 -0.15625 2.125 -0.15625l0.265625 0l0 -0.40625q0 -0.71875 -0.296875 -1.046875q-0.28125 -0.34375 -0.953125 -0.34375q-0.8125 0 -1.65625 0.453125q-0.3125 0.203125 -0.453125 0.203125q-0.140625 0 -0.234375 -0.109375q-0.09375 -0.109375 -0.09375 -0.28125q0 -0.171875 0.09375 -0.296875q0.109375 -0.125 0.328125 -0.25q0.421875 -0.25 0.953125 -0.375q0.546875 -0.140625 1.0625 -0.140625zm-0.390625 5.296875q0.71875 0 1.171875 -0.484375q0.46875 -0.484375 0.46875 -1.25l0 -0.34375l-0.21875 0q-1.046875 0 -1.609375 0.09375q-0.546875 0.078125 -0.78125 0.296875q-0.234375 0.203125 -0.234375 0.609375q0 0.46875 0.34375 0.78125q0.34375 0.296875 0.859375 0.296875zm7.3131714 -5.296875q0.765625 0 1.34375 0.390625q0.59375 0.375 0.921875 1.0625q0.328125 0.6875 0.328125 1.609375q0 0.90625 -0.328125 1.59375q-0.328125 0.671875 -0.90625 1.046875q-0.578125 0.359375 -1.359375 0.359375q-0.6875 0 -1.203125 -0.296875q-0.5 -0.296875 -0.765625 -0.84375l0 2.8125q0 0.21875 -0.125 0.34375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.140625q-0.125 -0.125 -0.125 -0.328125l0 -7.234375q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.125 0.125 0.125 0.34375l0 0.640625q0.265625 -0.546875 0.765625 -0.84375q0.515625 -0.296875 1.203125 -0.296875zm-0.203125 5.265625q0.859375 0 1.328125 -0.578125q0.46875 -0.578125 0.46875 -1.625q0 -1.0625 -0.46875 -1.65625q-0.46875 -0.59375 -1.328125 -0.59375q-0.84375 0 -1.3125 0.578125q-0.453125 0.578125 -0.453125 1.640625q0 1.0625 0.453125 1.65625q0.46875 0.578125 1.3125 0.578125zm7.2027893 -5.265625q1.03125 0 1.546875 0.578125q0.53125 0.578125 0.53125 1.734375l0 3.25q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.21875q0 -0.78125 -0.328125 -1.15625q-0.3125 -0.375 -1.0 -0.375q-0.8125 0 -1.296875 0.5q-0.46875 0.484375 -0.46875 1.328125l0 2.921875q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -7.625q0 -0.203125 0.125 -0.328125q0.140625 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.125q0.125 0.125 0.125 0.34375l0 3.140625q0.28125 -0.53125 0.796875 -0.796875q0.515625 -0.28125 1.1875 -0.28125zm4.5035706 5.984375q-0.234375 0 -0.375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -7.5q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l2.34375 0q2.03125 0 3.140625 1.09375q1.109375 1.09375 1.109375 3.125q0 2.03125 -1.125 3.140625q-1.109375 1.09375 -3.125 1.09375l-2.34375 0zm2.28125 -0.84375q3.28125 0 3.28125 -3.390625q0 -3.390625 -3.28125 -3.390625l-1.796875 0l0 6.78125l1.796875 0zm10.461792 -0.515625q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm6.480316 -2.453125q-0.640625 0.046875 -0.96875 0.40625q-0.3125 0.34375 -0.3125 1.046875l0 0.390625l1.328125 0q0.203125 0 0.3125 0.109375q0.109375 0.109375 0.109375 0.28125q0 0.1875 -0.109375 0.28125q-0.109375 0.09375 -0.3125 0.09375l-1.328125 0l0 4.65625q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.140625 -0.125 -0.140625 -0.359375l0 -4.65625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -0.21875q0 -1.078125 0.53125 -1.6875q0.546875 -0.625 1.5625 -0.703125l0.3125 -0.015625q0.3125 -0.03125 0.453125 0.0625q0.140625 0.078125 0.140625 0.296875q0 0.34375 -0.421875 0.390625l-0.3125 0.03125z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m276.85565 232.16667l0 20.377945" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m276.85565 232.16667l0 16.950867" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m276.85565 249.11754l-1.1246033 -1.124588l1.1246033 3.0897675l1.1245728 -3.0897675z" fill-rule="evenodd"/><path fill="#f4cccc" d="m31.874016 68.3563l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m31.874016 68.3563l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m67.63894 87.62236q0.171875 0.15625 0.171875 0.359375q0 0.15625 -0.140625 0.296875q-0.140625 0.140625 -0.3125 0.140625q-0.15625 0 -0.328125 -0.140625l-4.484375 -3.921875l0 3.578125q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.359375 0.140625q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.578125q0 -0.234375 0.125 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.140625 0.125 0.140625 0.359375l0 3.4375l4.28125 -3.796875q0.125 -0.140625 0.3125 -0.140625q0.171875 0 0.296875 0.140625q0.140625 0.140625 0.140625 0.3125q0 0.171875 -0.15625 0.328125l-3.875 3.421875l4.09375 3.5625zm5.8329315 -0.609375q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm6.792801 -0.734375q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm3.720398 -0.015625q2.203125 0 2.203125 2.296875l0 3.265625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.578125q-0.21875 0.515625 -0.6875 0.796875q-0.46875 0.28125 -1.078125 0.28125q-0.5625 0 -1.046875 -0.21875q-0.46875 -0.234375 -0.75 -0.640625q-0.265625 -0.40625 -0.265625 -0.90625q0 -0.65625 0.328125 -1.015625q0.34375 -0.375 1.109375 -0.53125q0.765625 -0.15625 2.125 -0.15625l0.265625 0l0 -0.40625q0 -0.71875 -0.296875 -1.046875q-0.28125 -0.34375 -0.953125 -0.34375q-0.8125 0 -1.65625 0.453125q-0.3125 0.203125 -0.453125 0.203125q-0.140625 0 -0.234375 -0.109375q-0.09375 -0.109375 -0.09375 -0.28125q0 -0.171875 0.09375 -0.296875q0.109375 -0.125 0.328125 -0.25q0.421875 -0.25 0.953125 -0.375q0.546875 -0.140625 1.0625 -0.140625zm-0.390625 5.296875q0.71875 0 1.171875 -0.484375q0.46875 -0.484375 0.46875 -1.25l0 -0.34375l-0.21875 0q-1.046875 0 -1.609375 0.09375q-0.546875 0.078125 -0.78125 0.296875q-0.234375 0.203125 -0.234375 0.609375q0 0.46875 0.34375 0.78125q0.34375 0.296875 0.859375 0.296875zm6.3444214 0.765625q-0.5625 0 -1.0625 -0.125q-0.5 -0.140625 -0.875 -0.375q-0.21875 -0.140625 -0.3125 -0.265625q-0.078125 -0.125 -0.078125 -0.3125q0 -0.15625 0.078125 -0.25q0.09375 -0.109375 0.234375 -0.109375q0.15625 0 0.421875 0.1875q0.359375 0.21875 0.71875 0.34375q0.359375 0.125 0.875 0.125q0.65625 0 1.015625 -0.21875q0.359375 -0.234375 0.359375 -0.671875q0 -0.265625 -0.140625 -0.421875q-0.125 -0.171875 -0.453125 -0.296875q-0.3125 -0.125 -0.9375 -0.25q-1.0625 -0.234375 -1.515625 -0.609375q-0.453125 -0.390625 -0.453125 -1.046875q0 -0.515625 0.28125 -0.90625q0.28125 -0.40625 0.796875 -0.625q0.515625 -0.234375 1.15625 -0.234375q0.46875 0 0.90625 0.125q0.4375 0.125 0.78125 0.34375q0.40625 0.296875 0.40625 0.609375q0 0.15625 -0.09375 0.265625q-0.09375 0.109375 -0.234375 0.109375q-0.140625 0 -0.4375 -0.203125q-0.328125 -0.21875 -0.625 -0.34375q-0.296875 -0.125 -0.75 -0.125q-0.5625 0 -0.90625 0.265625q-0.34375 0.25 -0.34375 0.671875q0 0.25 0.125 0.421875q0.125 0.15625 0.421875 0.28125q0.296875 0.125 0.84375 0.25q0.828125 0.1875 1.265625 0.40625q0.453125 0.203125 0.640625 0.515625q0.203125 0.3125 0.203125 0.796875q0 0.75 -0.640625 1.21875q-0.640625 0.453125 -1.671875 0.453125z" fill-rule="nonzero"/><path fill="#f4cccc" d="m127.74803 68.35761l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m127.74803 68.35761l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m147.45874 88.37367q-0.234375 0 -0.375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -7.5q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l4.484375 0q0.21875 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.1875 -0.125 0.296875q-0.109375 0.109375 -0.328125 0.109375l-4.015625 0l0 2.9375l3.78125 0q0.21875 0 0.328125 0.109375q0.125 0.109375 0.125 0.296875q0 0.1875 -0.125 0.296875q-0.109375 0.109375 -0.328125 0.109375l-3.78125 0l0 3.078125l4.015625 0q0.21875 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.1875 -0.125 0.296875q-0.109375 0.109375 -0.328125 0.109375l-4.484375 0zm8.31218 0.078125q-0.5625 0 -1.0625 -0.125q-0.5 -0.140625 -0.875 -0.375q-0.21875 -0.140625 -0.3125 -0.265625q-0.078125 -0.125 -0.078125 -0.3125q0 -0.15625 0.078125 -0.25q0.09375 -0.109375 0.234375 -0.109375q0.15625 0 0.421875 0.1875q0.359375 0.21875 0.71875 0.34375q0.359375 0.125 0.875 0.125q0.65625 0 1.015625 -0.21875q0.359375 -0.234375 0.359375 -0.671875q0 -0.265625 -0.140625 -0.421875q-0.125 -0.171875 -0.453125 -0.296875q-0.3125 -0.125 -0.9375 -0.25q-1.0625 -0.234375 -1.515625 -0.609375q-0.453125 -0.390625 -0.453125 -1.046875q0 -0.515625 0.28125 -0.90625q0.28125 -0.40625 0.796875 -0.625q0.515625 -0.234375 1.15625 -0.234375q0.46875 0 0.90625 0.125q0.4375 0.125 0.78125 0.34375q0.40625 0.296875 0.40625 0.609375q0 0.15625 -0.09375 0.265625q-0.09375 0.109375 -0.234375 0.109375q-0.140625 0 -0.4375 -0.203125q-0.328125 -0.21875 -0.625 -0.34375q-0.296875 -0.125 -0.75 -0.125q-0.5625 0 -0.90625 0.265625q-0.34375 0.25 -0.34375 0.671875q0 0.25 0.125 0.421875q0.125 0.15625 0.421875 0.28125q0.296875 0.125 0.84375 0.25q0.828125 0.1875 1.265625 0.40625q0.453125 0.203125 0.640625 0.515625q0.203125 0.3125 0.203125 0.796875q0 0.75 -0.640625 1.21875q-0.640625 0.453125 -1.671875 0.453125zm6.4787903 -0.78125q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm1.8769073 0.765625q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.125 -0.359375q0.140625 -0.125 0.359375 -0.125q0.21875 0 0.34375 0.125q0.140625 0.125 0.140625 0.359375l0 5.0625q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125zm0 -7.28125q-0.296875 0 -0.484375 -0.171875q-0.171875 -0.171875 -0.171875 -0.453125q0 -0.25 0.171875 -0.421875q0.1875 -0.171875 0.484375 -0.171875q0.28125 0 0.453125 0.171875q0.1875 0.171875 0.1875 0.421875q0 0.28125 -0.1875 0.453125q-0.171875 0.171875 -0.453125 0.171875zm8.799652 1.234375q1.9375 0 1.9375 2.3125l0 3.25q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.328125 0.125q-0.21875 0 -0.359375 -0.125q-0.140625 -0.125 -0.140625 -0.359375l0 -3.21875q0 -0.8125 -0.296875 -1.171875q-0.28125 -0.359375 -0.890625 -0.359375q-0.734375 0 -1.15625 0.5q-0.421875 0.484375 -0.421875 1.328125l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.21875q0 -0.8125 -0.296875 -1.171875q-0.28125 -0.359375 -0.90625 -0.359375q-0.71875 0 -1.140625 0.5q-0.421875 0.484375 -0.421875 1.328125l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.359375 -0.140625q0.203125 0 0.328125 0.125q0.140625 0.125 0.140625 0.34375l0 0.578125q0.265625 -0.515625 0.734375 -0.78125q0.46875 -0.28125 1.078125 -0.28125q1.375 0 1.78125 1.140625q0.265625 -0.515625 0.78125 -0.828125q0.515625 -0.3125 1.171875 -0.3125zm6.0990753 0q2.203125 0 2.203125 2.296875l0 3.265625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.578125q-0.21875 0.515625 -0.6875 0.796875q-0.46875 0.28125 -1.078125 0.28125q-0.5625 0 -1.046875 -0.21875q-0.46875 -0.234375 -0.75 -0.640625q-0.265625 -0.40625 -0.265625 -0.90625q0 -0.65625 0.328125 -1.015625q0.34375 -0.375 1.109375 -0.53125q0.765625 -0.15625 2.125 -0.15625l0.265625 0l0 -0.40625q0 -0.71875 -0.296875 -1.046875q-0.28125 -0.34375 -0.953125 -0.34375q-0.8125 0 -1.65625 0.453125q-0.3125 0.203125 -0.453125 0.203125q-0.140625 0 -0.234375 -0.109375q-0.09375 -0.109375 -0.09375 -0.28125q0 -0.171875 0.09375 -0.296875q0.109375 -0.125 0.328125 -0.25q0.421875 -0.25 0.953125 -0.375q0.546875 -0.140625 1.0625 -0.140625zm-0.390625 5.296875q0.71875 0 1.171875 -0.484375q0.46875 -0.484375 0.46875 -1.25l0 -0.34375l-0.21875 0q-1.046875 0 -1.609375 0.09375q-0.546875 0.078125 -0.78125 0.296875q-0.234375 0.203125 -0.234375 0.609375q0 0.46875 0.34375 0.78125q0.34375 0.296875 0.859375 0.296875zm7.0631714 -0.015625q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm3.8144073 0.78125q-0.828125 0 -1.46875 -0.359375q-0.625 -0.375 -0.96875 -1.0625q-0.34375 -0.703125 -0.34375 -1.609375q0 -0.90625 0.34375 -1.59375q0.34375 -0.703125 0.96875 -1.0625q0.640625 -0.375 1.46875 -0.375q0.828125 0 1.453125 0.375q0.640625 0.359375 0.984375 1.0625q0.34375 0.6875 0.34375 1.59375q0 0.90625 -0.34375 1.609375q-0.34375 0.6875 -0.984375 1.0625q-0.625 0.359375 -1.453125 0.359375zm0 -0.796875q0.859375 0 1.3125 -0.5625q0.46875 -0.578125 0.46875 -1.671875q0 -1.0625 -0.46875 -1.640625q-0.46875 -0.59375 -1.3125 -0.59375q-0.859375 0 -1.328125 0.59375q-0.46875 0.578125 -0.46875 1.640625q0 1.078125 0.453125 1.65625q0.46875 0.578125 1.34375 0.578125zm7.1287994 -5.25q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625z" fill-rule="nonzero"/><path fill="#f4cccc" d="m233.1076 68.35761l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m233.1076 68.35761l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m269.00754 88.46742q-0.90625 0 -1.734375 -0.265625q-0.8125 -0.265625 -1.3125 -0.734375q-0.171875 -0.15625 -0.171875 -0.40625q0 -0.171875 0.09375 -0.296875q0.09375 -0.125 0.234375 -0.125q0.15625 0 0.328125 0.125q1.109375 0.859375 2.546875 0.859375q1.03125 0 1.578125 -0.390625q0.5625 -0.390625 0.5625 -1.125q0 -0.421875 -0.265625 -0.671875q-0.265625 -0.265625 -0.703125 -0.421875q-0.4375 -0.15625 -1.15625 -0.328125q-0.984375 -0.21875 -1.625 -0.46875q-0.625 -0.265625 -1.015625 -0.734375q-0.390625 -0.46875 -0.390625 -1.21875q0 -0.71875 0.390625 -1.265625q0.390625 -0.5625 1.09375 -0.875q0.703125 -0.3125 1.59375 -0.3125q0.84375 0 1.5625 0.265625q0.734375 0.25 1.234375 0.734375q0.1875 0.1875 0.1875 0.421875q0 0.171875 -0.09375 0.296875q-0.09375 0.125 -0.234375 0.125q-0.125 0 -0.34375 -0.140625q-0.59375 -0.46875 -1.09375 -0.65625q-0.5 -0.203125 -1.21875 -0.203125q-0.984375 0 -1.546875 0.421875q-0.546875 0.40625 -0.546875 1.15625q0 0.625 0.484375 0.953125q0.484375 0.3125 1.5 0.5625q1.09375 0.25 1.71875 0.484375q0.625 0.21875 1.03125 0.671875q0.421875 0.4375 0.421875 1.171875q0 0.71875 -0.390625 1.265625q-0.390625 0.53125 -1.109375 0.828125q-0.703125 0.296875 -1.609375 0.296875zm5.0446777 -0.03125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -7.625q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.359375 -0.125q0.203125 0 0.34375 0.125q0.140625 0.125 0.140625 0.34375l0 7.625q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125zm2.784027 0q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.125 -0.359375q0.140625 -0.125 0.359375 -0.125q0.21875 0 0.34375 0.125q0.140625 0.125 0.140625 0.359375l0 5.0625q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125zm0 -7.28125q-0.296875 0 -0.484375 -0.171875q-0.171875 -0.171875 -0.171875 -0.453125q0 -0.25 0.171875 -0.421875q0.1875 -0.171875 0.484375 -0.171875q0.28125 0 0.453125 0.171875q0.1875 0.171875 0.1875 0.421875q0 0.28125 -0.1875 0.453125q-0.171875 0.171875 -0.453125 0.171875zm8.799652 1.234375q1.9375 0 1.9375 2.3125l0 3.25q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.328125 0.125q-0.21875 0 -0.359375 -0.125q-0.140625 -0.125 -0.140625 -0.359375l0 -3.21875q0 -0.8125 -0.296875 -1.171875q-0.28125 -0.359375 -0.890625 -0.359375q-0.734375 0 -1.15625 0.5q-0.421875 0.484375 -0.421875 1.328125l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.21875q0 -0.8125 -0.296875 -1.171875q-0.28125 -0.359375 -0.90625 -0.359375q-0.71875 0 -1.140625 0.5q-0.421875 0.484375 -0.421875 1.328125l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.359375 -0.140625q0.203125 0 0.328125 0.125q0.140625 0.125 0.140625 0.34375l0 0.578125q0.265625 -0.515625 0.734375 -0.78125q0.46875 -0.28125 1.078125 -0.28125q1.375 0 1.78125 1.140625q0.265625 -0.515625 0.78125 -0.828125q0.515625 -0.3125 1.171875 -0.3125z" fill-rule="nonzero"/><path fill="#d9ead3" d="m282.5035 134.76706l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m282.5035 134.76706l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m297.8283 154.87688q-1.1875 0 -2.0625 -0.515625q-0.875 -0.53125 -1.359375 -1.5q-0.46875 -0.984375 -0.46875 -2.3125q0 -1.328125 0.46875 -2.296875q0.484375 -0.984375 1.359375 -1.5q0.875 -0.53125 2.0625 -0.53125q0.8125 0 1.515625 0.265625q0.71875 0.25 1.25 0.734375q0.1875 0.1875 0.1875 0.421875q0 0.171875 -0.09375 0.296875q-0.09375 0.125 -0.21875 0.125q-0.15625 0 -0.359375 -0.140625q-0.609375 -0.46875 -1.109375 -0.65625q-0.5 -0.203125 -1.140625 -0.203125q-1.390625 0 -2.140625 0.90625q-0.75 0.90625 -0.75 2.578125q0 1.671875 0.75 2.578125q0.75 0.90625 2.140625 0.90625q0.640625 0 1.140625 -0.1875q0.5 -0.1875 1.109375 -0.671875q0.203125 -0.125 0.359375 -0.125q0.125 0 0.21875 0.125q0.09375 0.109375 0.09375 0.296875q0 0.234375 -0.1875 0.40625q-0.53125 0.484375 -1.25 0.75q-0.703125 0.25 -1.515625 0.25zm7.358429 -6.078125q1.03125 0 1.546875 0.578125q0.53125 0.578125 0.53125 1.734375l0 3.25q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.21875q0 -0.78125 -0.328125 -1.15625q-0.3125 -0.375 -1.0 -0.375q-0.8125 0 -1.296875 0.5q-0.46875 0.484375 -0.46875 1.328125l0 2.921875q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -7.625q0 -0.203125 0.125 -0.328125q0.140625 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.125q0.125 0.125 0.125 0.34375l0 3.140625q0.28125 -0.53125 0.796875 -0.796875q0.515625 -0.28125 1.1875 -0.28125zm8.37854 4.625q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm6.308441 5.3125q-0.8125 0 -1.453125 -0.359375q-0.625 -0.375 -0.96875 -1.0625q-0.34375 -0.6875 -0.34375 -1.578125q0 -0.90625 0.359375 -1.59375q0.359375 -0.703125 0.984375 -1.078125q0.640625 -0.390625 1.46875 -0.390625q0.453125 0 0.90625 0.125q0.453125 0.125 0.78125 0.359375q0.21875 0.140625 0.3125 0.28125q0.09375 0.140625 0.09375 0.3125q0 0.171875 -0.09375 0.28125q-0.09375 0.09375 -0.234375 0.09375q-0.078125 0 -0.1875 -0.046875q-0.09375 -0.046875 -0.15625 -0.09375q-0.0625 -0.046875 -0.09375 -0.0625q-0.3125 -0.203125 -0.59375 -0.3125q-0.28125 -0.125 -0.6875 -0.125q-0.875 0 -1.359375 0.59375q-0.484375 0.59375 -0.484375 1.65625q0 1.046875 0.484375 1.625q0.484375 0.578125 1.359375 0.578125q0.40625 0 0.703125 -0.109375q0.296875 -0.125 0.59375 -0.328125q0.140625 -0.09375 0.25 -0.15625q0.125 -0.0625 0.203125 -0.0625q0.140625 0 0.21875 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.15625 -0.09375 0.28125q-0.078125 0.125 -0.296875 0.28125q-0.34375 0.234375 -0.8125 0.375q-0.46875 0.125 -0.953125 0.125zm7.998047 -0.84375q0.203125 0.171875 0.203125 0.375q0 0.1875 -0.125 0.328125q-0.125 0.125 -0.3125 0.125q-0.15625 0 -0.328125 -0.140625l-3.125 -2.703125l0 2.359375q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -7.625q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.359375 -0.125q0.203125 0 0.34375 0.125q0.140625 0.125 0.140625 0.34375l0 4.875l2.859375 -2.625q0.15625 -0.140625 0.328125 -0.140625q0.1875 0 0.3125 0.140625q0.140625 0.125 0.140625 0.296875q0 0.203125 -0.171875 0.359375l-2.375 2.109375l2.59375 2.265625zm4.2812805 -5.21875q0.765625 0 1.34375 0.390625q0.59375 0.375 0.921875 1.0625q0.328125 0.6875 0.328125 1.609375q0 0.90625 -0.328125 1.59375q-0.328125 0.671875 -0.90625 1.046875q-0.578125 0.359375 -1.359375 0.359375q-0.6875 0 -1.203125 -0.296875q-0.5 -0.296875 -0.765625 -0.84375l0 2.8125q0 0.21875 -0.125 0.34375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.140625q-0.125 -0.125 -0.125 -0.328125l0 -7.234375q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.125 0.125 0.125 0.34375l0 0.640625q0.265625 -0.546875 0.765625 -0.84375q0.515625 -0.296875 1.203125 -0.296875zm-0.203125 5.265625q0.859375 0 1.328125 -0.578125q0.46875 -0.578125 0.46875 -1.625q0 -1.0625 -0.46875 -1.65625q-0.46875 -0.59375 -1.328125 -0.59375q-0.84375 0 -1.3125 0.578125q-0.453125 0.578125 -0.453125 1.640625q0 1.0625 0.453125 1.65625q0.46875 0.578125 1.3125 0.578125zm6.67157 0.796875q-0.828125 0 -1.46875 -0.359375q-0.625 -0.375 -0.96875 -1.0625q-0.34375 -0.703125 -0.34375 -1.609375q0 -0.90625 0.34375 -1.59375q0.34375 -0.703125 0.96875 -1.0625q0.640625 -0.375 1.46875 -0.375q0.828125 0 1.453125 0.375q0.640625 0.359375 0.984375 1.0625q0.34375 0.6875 0.34375 1.59375q0 0.90625 -0.34375 1.609375q-0.34375 0.6875 -0.984375 1.0625q-0.625 0.359375 -1.453125 0.359375zm0 -0.796875q0.859375 0 1.3125 -0.5625q0.46875 -0.578125 0.46875 -1.671875q0 -1.0625 -0.46875 -1.640625q-0.46875 -0.59375 -1.3125 -0.59375q-0.859375 0 -1.328125 0.59375q-0.46875 0.578125 -0.46875 1.640625q0 1.078125 0.453125 1.65625q0.46875 0.578125 1.34375 0.578125zm4.722534 0.78125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.125 -0.359375q0.140625 -0.125 0.359375 -0.125q0.21875 0 0.34375 0.125q0.140625 0.125 0.140625 0.359375l0 5.0625q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125zm0 -7.28125q-0.296875 0 -0.484375 -0.171875q-0.171875 -0.171875 -0.171875 -0.453125q0 -0.25 0.171875 -0.421875q0.1875 -0.171875 0.484375 -0.171875q0.28125 0 0.453125 0.171875q0.1875 0.171875 0.1875 0.421875q0 0.28125 -0.1875 0.453125q-0.171875 0.171875 -0.453125 0.171875zm5.237152 1.234375q2.09375 0 2.09375 2.3125l0 3.25q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.1875q0 -0.8125 -0.328125 -1.1875q-0.3125 -0.375 -1.0 -0.375q-0.8125 0 -1.296875 0.5q-0.46875 0.484375 -0.46875 1.328125l0 2.921875q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.328125l0 0.609375q0.28125 -0.53125 0.796875 -0.8125q0.53125 -0.28125 1.1875 -0.28125zm6.5660706 5.28125q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm3.361267 0.78125q-0.5625 0 -1.0625 -0.125q-0.5 -0.140625 -0.875 -0.375q-0.21875 -0.140625 -0.3125 -0.265625q-0.078125 -0.125 -0.078125 -0.3125q0 -0.15625 0.078125 -0.25q0.09375 -0.109375 0.234375 -0.109375q0.15625 0 0.421875 0.1875q0.359375 0.21875 0.71875 0.34375q0.359375 0.125 0.875 0.125q0.65625 0 1.015625 -0.21875q0.359375 -0.234375 0.359375 -0.671875q0 -0.265625 -0.140625 -0.421875q-0.125 -0.171875 -0.453125 -0.296875q-0.3125 -0.125 -0.9375 -0.25q-1.0625 -0.234375 -1.515625 -0.609375q-0.453125 -0.390625 -0.453125 -1.046875q0 -0.515625 0.28125 -0.90625q0.28125 -0.40625 0.796875 -0.625q0.515625 -0.234375 1.15625 -0.234375q0.46875 0 0.90625 0.125q0.4375 0.125 0.78125 0.34375q0.40625 0.296875 0.40625 0.609375q0 0.15625 -0.09375 0.265625q-0.09375 0.109375 -0.234375 0.109375q-0.140625 0 -0.4375 -0.203125q-0.328125 -0.21875 -0.625 -0.34375q-0.296875 -0.125 -0.75 -0.125q-0.5625 0 -0.90625 0.265625q-0.34375 0.25 -0.34375 0.671875q0 0.25 0.125 0.421875q0.125 0.15625 0.421875 0.28125q0.296875 0.125 0.84375 0.25q0.828125 0.1875 1.265625 0.40625q0.453125 0.203125 0.640625 0.515625q0.203125 0.3125 0.203125 0.796875q0 0.75 -0.640625 1.21875q-0.640625 0.453125 -1.671875 0.453125z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m233.1085 268.03217l-66.74016 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" stroke-dasharray="4.0,3.0" d="m233.10852 268.03217l-63.313095 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m169.79543 268.03217l1.124588 -1.1246033l-3.0897675 1.1246033l3.0897675 1.1245728z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m75.62205 99.34843l0 19.652092l46.992126 0l0 133.54475" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m75.62205 99.34843l0 19.652084l46.992126 0l0 130.11768" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m122.614174 249.1182l-1.124588 -1.124588l1.124588 3.0897675l1.1245804 -3.0897675z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m171.49606 99.34974l0 19.650558l-48.88189 0l0 133.5463" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m171.49606 99.34974l0 19.650558l-48.88189 0l0 130.1192" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m122.614174 249.1195l-1.124588 -1.124588l1.124588 3.0897675l1.1245804 -3.0897675z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m276.85565 99.34974l0 17.70874l-42.960632 0l0 17.724327" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m276.85565 99.34974l0 17.70874l-42.960632 0l0 14.297249" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m233.89502 131.35573l-1.124588 -1.124588l1.124588 3.0897675l1.1245728 -3.0897675z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m276.85565 99.34974l0 17.70874l49.385803 0l0 17.724327" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m276.85565 99.34974l0 17.70874l49.385803 0l0 14.297249" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m326.24146 131.35573l-1.1245728 -1.124588l1.1245728 3.0897675l1.1246033 -3.0897675z" fill-rule="evenodd"/><path fill="#c9daf8" d="m548.5407 235.66077l87.49603 0l0 30.992126l-87.49603 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m548.5407 235.66077l87.49603 0l0 30.992126l-87.49603 0z" fill-rule="evenodd"/><path fill="#000000" d="m579.47955 247.1612q0.203125 0 0.328125 0.140625q0.125 0.125 0.125 0.359375l0 7.578125q0 0.21875 -0.125 0.359375q-0.125 0.140625 -0.359375 0.140625q-0.234375 0 -0.390625 -0.203125l-4.984375 -6.65625l0 6.359375q0 0.21875 -0.125 0.359375q-0.125 0.140625 -0.34375 0.140625q-0.21875 0 -0.34375 -0.140625q-0.109375 -0.140625 -0.109375 -0.359375l0 -7.578125q0 -0.234375 0.125 -0.359375q0.125 -0.140625 0.359375 -0.140625q0.234375 0 0.40625 0.203125l4.96875 6.65625l0 -6.359375q0 -0.234375 0.125 -0.359375q0.125 -0.140625 0.34375 -0.140625zm8.868103 0q0.203125 0 0.328125 0.140625q0.125 0.125 0.125 0.359375l0 7.578125q0 0.21875 -0.125 0.359375q-0.125 0.140625 -0.359375 0.140625q-0.234375 0 -0.390625 -0.203125l-4.984375 -6.65625l0 6.359375q0 0.21875 -0.125 0.359375q-0.125 0.140625 -0.34375 0.140625q-0.21875 0 -0.34375 -0.140625q-0.109375 -0.140625 -0.109375 -0.359375l0 -7.578125q0 -0.234375 0.125 -0.359375q0.125 -0.140625 0.359375 -0.140625q0.234375 0 0.40625 0.203125l4.96875 6.65625l0 -6.359375q0 -0.234375 0.125 -0.359375q0.125 -0.140625 0.34375 -0.140625zm12.917175 7.953125q0.046875 0.09375 0.046875 0.203125q0 0.171875 -0.140625 0.296875q-0.140625 0.125 -0.328125 0.125q-0.296875 0 -0.421875 -0.296875l-0.84375 -1.9375l-4.53125 0l-0.859375 1.9375q-0.125 0.296875 -0.421875 0.296875q-0.1875 0 -0.34375 -0.125q-0.140625 -0.125 -0.140625 -0.3125q0 -0.09375 0.046875 -0.1875l3.4375 -7.640625q0.078125 -0.15625 0.21875 -0.234375q0.140625 -0.09375 0.3125 -0.09375q0.171875 0 0.3125 0.09375q0.15625 0.078125 0.21875 0.234375l3.4375 7.640625zm-5.859375 -2.421875l3.8125 0l-1.90625 -4.3125l-1.90625 4.3125zm7.78656 3.046875q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.546875q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l2.84375 0q1.328125 0 2.0625 0.65625q0.75 0.640625 0.75 1.828125q0 1.1875 -0.75 1.84375q-0.734375 0.65625 -2.0625 0.65625l-2.359375 0l0 3.03125q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.359375 0.140625zm2.765625 -4.34375q1.9375 0 1.9375 -1.6875q0 -1.671875 -1.9375 -1.671875l-2.265625 0l0 3.359375l2.265625 0zm4.9744263 4.34375q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.578125q0 -0.234375 0.125 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.140625 0.125 0.140625 0.359375l0 7.578125q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.359375 0.140625z" fill-rule="nonzero"/><path fill="#c9daf8" d="m548.5407 193.79199l87.49603 0l0 30.992126l-87.49603 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m548.5407 193.79199l87.49603 0l0 30.992126l-87.49603 0z" fill-rule="evenodd"/><path fill="#000000" d="m589.5417 213.87056q-0.28125 0 -0.484375 -0.1875q-0.1875 -0.1875 -0.1875 -0.484375q0 -0.296875 0.1875 -0.484375q0.203125 -0.203125 0.484375 -0.203125q0.28125 0 0.46875 0.203125q0.1875 0.1875 0.1875 0.484375q0 0.296875 -0.1875 0.484375q-0.1875 0.1875 -0.46875 0.1875zm2.7480469 0q-0.28125 0 -0.484375 -0.1875q-0.1875 -0.1875 -0.1875 -0.484375q0 -0.296875 0.1875 -0.484375q0.203125 -0.203125 0.484375 -0.203125q0.28125 0 0.46875 0.203125q0.1875 0.1875 0.1875 0.484375q0 0.296875 -0.1875 0.484375q-0.1875 0.1875 -0.46875 0.1875zm2.7479858 0q-0.28125 0 -0.484375 -0.1875q-0.1875 -0.1875 -0.1875 -0.484375q0 -0.296875 0.1875 -0.484375q0.203125 -0.203125 0.484375 -0.203125q0.28125 0 0.46875 0.203125q0.1875 0.1875 0.1875 0.484375q0 0.296875 -0.1875 0.484375q-0.1875 0.1875 -0.46875 0.1875z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m122.620316 283.52823l0 14.9730835l75.49606 0l0 20.90091" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m122.620316 283.52823l0 14.9730835l75.49608 0l0 17.473846" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m198.1164 315.97516l-1.124588 -1.1246033l1.124588 3.0897827l1.1245728 -3.0897827z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m276.85654 283.52823l0 14.9730835l-78.74016 0l0 20.90091" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m276.85654 283.52823l0 14.9730835l-78.74014 0l0 17.473846" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m198.1164 315.97516l-1.124588 -1.1246033l1.124588 3.0897827l1.1245728 -3.0897827z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m500.5223 334.89435l24.009003 0l0 0.06298828l24.022522 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m500.5223 334.89435l24.009003 0l0 0.06298828l20.595398 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m545.1267 334.95734l-1.1245728 1.1246033l3.0897827 -1.1246033l-3.0897827 -1.1245728z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m500.5223 334.89435l24.009003 0l0 -41.858246l24.022522 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m500.5223 334.89435l24.009003 0l0 -41.858246l20.595398 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m545.1267 293.0361l-1.1245728 1.1245728l3.0897827 -1.1245728l-3.0897827 -1.1246033z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m500.5223 334.89435l24.009003 0l0 -83.74802l24.022522 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m500.5223 334.89435l24.009003 0l0 -83.74802l20.595398 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m545.1267 251.14633l-1.1245728 1.1245728l3.0897827 -1.1245728l-3.0897827 -1.124588z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m500.5223 334.89435l24.009003 0l0 -125.60629l24.022522 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m500.5223 334.89435l24.009003 0l0 -125.60629l20.595398 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m545.1267 209.28806l-1.1245728 1.124588l3.0897827 -1.124588l-3.0897827 -1.124588z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m233.88803 165.75919l0 17.70752l42.960632 0l0 17.694061" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m233.88805 165.75919l0 17.70752l42.960617 0l0 14.266968" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m276.84866 197.73367l-1.1245728 -1.124588l1.1245728 3.0897675l1.1246033 -3.0897675z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m326.25156 165.75919l0 17.70752l-49.385834 0l0 17.694061" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m326.25156 165.75919l0 17.70752l-49.385834 0l0 14.266968" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m276.86572 197.73367l-1.1245728 -1.124588l1.1245728 3.0897675l1.1246033 -3.0897675z" fill-rule="evenodd"/></g></svg>
\ No newline at end of file
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/convert_pure_conv_to_depthwise.cc b/tensorflow/contrib/lite/toco/graph_transformations/convert_pure_conv_to_depthwise.cc
index d38db85280d7bd935a47cda70227d383a513fbac..0fffab574ddd8ad75ec07ae4442f363a36ed289e 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/convert_pure_conv_to_depthwise.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/convert_pure_conv_to_depthwise.cc
@@ -33,6 +33,11 @@ bool ConvertPureConvToDepthwise::Run(Model* model, std::size_t op_index) {
   if (conv_op->stride_width != conv_op->stride_height) {
     return false;
   }
+  if ((conv_op->dilation_width_factor != 1) ||
+      (conv_op->dilation_height_factor != 1)) {
+    // Depthwise conv does not support dilation
+    return false;
+  }
   auto& weights_array = model->GetArray(conv_op->inputs[1]);
   if (!weights_array.buffer) {
     // Yield until the weights are resolved as a constant array.
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/fuse_activation_functions.cc b/tensorflow/contrib/lite/toco/graph_transformations/fuse_activation_functions.cc
index ab943f72d1dd87ae9ff4bd53a807cd4923a88c38..c5ce3fcd95eb0aaf63dcc7f43b96d8a13ed93929 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/fuse_activation_functions.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/fuse_activation_functions.cc
@@ -42,9 +42,9 @@ bool FuseActivationFunctions::Run(Model* model, std::size_t op_index) {
 
   if (CountTrueOutputs(*model, *op) > 1) {
     AddMessageF(
-        "Not fusing activation function into %s because it has more than one "
-        " consumed output",
-        LogName(*op));
+        "Not fusing activation function %s into %s because it has more than "
+        "one  consumed output",
+        LogName(*ac_op), LogName(*op));
     return false;
   }
 
@@ -56,22 +56,31 @@ bool FuseActivationFunctions::Run(Model* model, std::size_t op_index) {
     AddMessageF(
         "Not fusing activation function into %s because it is consumed by more "
         "than 1 other operator",
-        LogName(*op));
+        LogName(*ac_op), LogName(*op));
+    return false;
+  }
+
+  if (!IsDiscardableArray(*model, op->outputs[0])) {
+    AddMessageF(
+        "Not fusing activation function %s into %s because output %s it is not "
+        "discardable",
+        LogName(*ac_op), LogName(*op), op->outputs[0]);
     return false;
   }
 
   if (op->fused_activation_function != FusedActivationFunctionType::kNone) {
     AddMessageF(
-        "Not fusing activation function into %s because it already has a fused "
-        "activation function",
-        LogName(*op));
+        "Not fusing activation function %s into %s because it already has a "
+        "fused activation function",
+        LogName(*ac_op), LogName(*op));
     return false;
   }
 
   if (!OperatorSupportsFusedActivation(op->type)) {
     AddMessageF(
-        "Not fusing activation function because the %s op doesn't support it",
-        LogName(*op));
+        "Not fusing activation function %s because the %s op doesn't support "
+        "it",
+        LogName(*ac_op), LogName(*op));
     return false;
   }
 
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_preceding_affine.cc b/tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_preceding_affine.cc
index 5b57178b18d2d60e1f301a1a8b257d8057618550..76c6be00d407ca30b898d088c9fa34cd7f76f656 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_preceding_affine.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_preceding_affine.cc
@@ -50,7 +50,17 @@ void FuseAddOrSubParamsIntoPrecedingAffine(Model* model, Operator* preceding_op,
 
   // TODO(b/62904716): Bias array should become 1-D when padding removed.
   const int depth = bias_shape.dims(bias_shape.dimensions_count() - 1);
-  CHECK_EQ(depth, operand_shape.dims(operand_shape.dimensions_count() - 1));
+  int operand_channel_increment = 0;
+  if (operand_shape.dimensions_count() >= 1 &&
+      operand_shape.dims(operand_shape.dimensions_count() - 1) ==
+          bias_shape.dims(bias_shape.dimensions_count() - 1)) {
+    operand_channel_increment = 1;
+  } else if (operand_shape.dimensions_count() == 0 ||
+             operand_shape.dims(operand_shape.dimensions_count() - 1) == 1) {
+    operand_channel_increment = 0;
+  } else {
+    LOG(FATAL) << "Operand shape mismatch.";
+  }
 
   enum class OpType { BiasPlusOperand, BiasMinusOperand, OperandMinusBias };
 
@@ -60,9 +70,10 @@ void FuseAddOrSubParamsIntoPrecedingAffine(Model* model, Operator* preceding_op,
                                   ? OpType::BiasMinusOperand
                                   : OpType::OperandMinusBias;
 
+  int operand_channel = 0;
   for (int i = 0; i < depth; i++) {
     float& bias_val = bias_data[i];
-    const float operand_val = operand_data[i];
+    const float operand_val = operand_data[operand_channel];
     if (optype == OpType::BiasPlusOperand) {
       bias_val += operand_val;
     } else if (optype == OpType::BiasMinusOperand) {
@@ -72,6 +83,7 @@ void FuseAddOrSubParamsIntoPrecedingAffine(Model* model, Operator* preceding_op,
     } else {
       LOG(FATAL) << "Should not get here.";
     }
+    operand_channel += operand_channel_increment;
   }
 }
 
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
index 11e5e19f508ecfb2131b391af9b4e51bb9cf0fb4..27c5044bb3e06e4a052ff0c4984226fb9d113f95 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
+++ b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
@@ -128,7 +128,9 @@ DECLARE_GRAPH_TRANSFORMATION(IdentifyL2Pool)
 DECLARE_GRAPH_TRANSFORMATION(IdentifyLstmCell)
 DECLARE_GRAPH_TRANSFORMATION(SplitLstmCellInputs)
 DECLARE_GRAPH_TRANSFORMATION(MergeLstmCellInputs)
+DECLARE_GRAPH_TRANSFORMATION(MergeReshapeIntoPrecedingTranspose)
 DECLARE_GRAPH_TRANSFORMATION(IdentifyRelu1)
+DECLARE_GRAPH_TRANSFORMATION(IdentifyPRelu)
 DECLARE_GRAPH_TRANSFORMATION(IdentifyDilatedConv)
 DECLARE_GRAPH_TRANSFORMATION(MakeInitialDequantizeOperator)
 DECLARE_GRAPH_TRANSFORMATION(PropagateActivationFunctionIntoConstants)
@@ -151,7 +153,8 @@ DECLARE_GRAPH_TRANSFORMATION(ResolveConstantUnaryOperator)
 DECLARE_GRAPH_TRANSFORMATION(CreateIm2colArrays)
 DECLARE_GRAPH_TRANSFORMATION(DropIm2colArrays)
 DECLARE_GRAPH_TRANSFORMATION(ReadFakeQuantMinMax)
-DECLARE_GRAPH_TRANSFORMATION(ReorderActivationFunctions)
+DECLARE_GRAPH_TRANSFORMATION(ReorderElementwiseUnary)
+DECLARE_GRAPH_TRANSFORMATION(ReorderReshapeTranspose)
 DECLARE_GRAPH_TRANSFORMATION(ResolveReorderAxes)
 DECLARE_GRAPH_TRANSFORMATION(ResolveTensorFlowConcat)
 DECLARE_GRAPH_TRANSFORMATION(ResolveTensorFlowMatMul)
@@ -172,6 +175,7 @@ DECLARE_GRAPH_TRANSFORMATION(ResolveStridedSliceAttributes)
 DECLARE_GRAPH_TRANSFORMATION(ResolveSliceAttributes)
 DECLARE_GRAPH_TRANSFORMATION(ResolveMeanAttributes)
 DECLARE_GRAPH_TRANSFORMATION(ResolveTransposeAttributes)
+DECLARE_GRAPH_TRANSFORMATION(ResolveConstantRandomUniform)
 DECLARE_GRAPH_TRANSFORMATION(ResolveConstantRange)
 DECLARE_GRAPH_TRANSFORMATION(ResolveConstantShapeOrRank)
 DECLARE_GRAPH_TRANSFORMATION(ResolveConstantStack)
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc b/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
index 5cc82da5d544846cc095046ceccf0664525aae41..23c9e3246bc3db019cc0f5c3d1f1870ad3b65250 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
@@ -223,8 +223,11 @@ bool PropagateMinMaxAmongArrays(Model* model,
     if (array.minmax) {
       CHECK(*array.minmax == *reference_minmax)
           << "Both the following arrays have minmax, and they disagree: "
-          << reference_array_name << " and " << array_name
-          << ". Expected that either only one of them would have minmax, or at "
+          << reference_array_name << " (" << reference_minmax->min << ","
+          << reference_minmax->max << ") and " << array_name << " ("
+          << array.minmax->min << "," << array.minmax->max
+          << "). Expected that either only one of them would have minmax, or "
+             "at "
              "least that they would agree.";
     } else {
       array.GetOrCreateMinMax() = *reference_minmax;
@@ -332,6 +335,7 @@ bool HardcodeMinMax::Run(Model* model, std::size_t op_index) {
     case OperatorType::kPad:
     case OperatorType::kGather:
     case OperatorType::kTranspose:
+    case OperatorType::kMean:
       changed = HardcodeMinMaxFromFirstInput(model, op);
       break;
 
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/identify_prelu.cc b/tensorflow/contrib/lite/toco/graph_transformations/identify_prelu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..30be4ac0aa5e9f639bbf0630e142c2806faa3260
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/identify_prelu.cc
@@ -0,0 +1,119 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+// This transformation rule tries to identify the PRelu structure generated by
+// Keras, and convert it to a single op.
+//
+// The formula of PReLU is:
+// f(x) = alpha * x for x < 0, f(x) = x for x >= 0.
+//
+// `x` is the input, and `alpha` is a trainable tensor which can be broadcasted
+// to the shape of `x`.
+//
+// There's no native PRelu op in TensorFlow, so Keras generates the following
+// structure which does the equivalent calculation:
+// f(x) = Relu(x) + (-alpha * Relu(-x))
+//
+// Practically, alpha is always a constant in the inference graph, and Toco have
+// other graph transformations which fold the activation functions to other ops.
+// Therefore, we're looking for the structure:
+//
+// f(x) = Relu(x) + (negative_alpha * Neg(x, activation=Relu))
+
+namespace toco {
+
+bool IdentifyPRelu::Run(Model* model, std::size_t op_index) {
+  const auto add_op_it = model->operators.begin() + op_index;
+  const auto* add_op = add_op_it->get();
+  if (add_op == nullptr || add_op->type != OperatorType::kAdd ||
+      add_op->inputs.size() != 2 ||
+      add_op->fused_activation_function != FusedActivationFunctionType::kNone) {
+    return false;
+  }
+
+  const auto* relu_input_op = GetOpWithOutput(*model, add_op->inputs[0]);
+  if (relu_input_op == nullptr || relu_input_op->type != OperatorType::kRelu ||
+      relu_input_op->inputs.size() != 1 ||
+      relu_input_op->fused_activation_function !=
+          FusedActivationFunctionType::kNone) {
+    return false;
+  }
+
+  // TODO(ycling): Both Add and Mul are commutative. Support the case where
+  // the position of operands are exchanged.
+  const auto* mul_op = GetOpWithOutput(*model, add_op->inputs[1]);
+  if (mul_op == nullptr || mul_op->type != OperatorType::kMul ||
+      mul_op->inputs.size() != 2 ||
+      mul_op->fused_activation_function != FusedActivationFunctionType::kNone) {
+    return false;
+  }
+
+  const auto neg_alpha_tensor_name = mul_op->inputs[0];
+
+  const auto* relu_neg_input_op = GetOpWithOutput(*model, mul_op->inputs[1]);
+
+  if (relu_neg_input_op == nullptr ||
+      relu_neg_input_op->type != OperatorType::kNeg ||
+      relu_neg_input_op->fused_activation_function !=
+          FusedActivationFunctionType::kRelu ||
+      relu_neg_input_op->inputs.size() != 1) {
+    return false;
+  }
+
+  if (relu_input_op->inputs[0] != relu_neg_input_op->inputs[0]) {
+    return false;
+  }
+
+  const auto input_tensor_name = relu_input_op->inputs[0];
+  const auto output_tensor_name = add_op->outputs[0];
+
+  // Construct a tensor for positive alpha (double negative).
+  const auto alpha_tensor_name =
+      AvailableArrayName(*model, neg_alpha_tensor_name + "_neg");
+  model->GetOrCreateArray(alpha_tensor_name);
+
+  auto* neg_neg_alpha_op = new NegOperator;
+  neg_neg_alpha_op->inputs = {neg_alpha_tensor_name};
+  neg_neg_alpha_op->outputs = {alpha_tensor_name};
+  model->operators.emplace(add_op_it, neg_neg_alpha_op);
+
+  auto* prelu_op = new PReluOperator;
+  prelu_op->inputs = {input_tensor_name, alpha_tensor_name};
+  prelu_op->outputs = {output_tensor_name};
+  model->operators.emplace(add_op_it, prelu_op);
+  AddMessageF("Creating %s replacing equivalent subgraph", LogName(*prelu_op));
+
+  DeleteArrayIfUsedOnce(neg_alpha_tensor_name, model);
+  DeleteArrayIfUsedOnce(add_op->inputs[0], model);
+  DeleteArrayIfUsedOnce(add_op->inputs[1], model);
+  DeleteArrayIfUsedOnce(mul_op->inputs[1], model);
+  // Remove the existing Add op that outputs the final result. If the other
+  // intermediate tensors aren't used by other ops, those will be removed by
+  // other graph transformation rules.
+  model->operators.erase(FindOp(*model, add_op));
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/lstm_utils.h b/tensorflow/contrib/lite/toco/graph_transformations/lstm_utils.h
index 881c2d4dc892625d4640cac867a2f49c24b638f5..4a9974ed4e0ebec4381b86798156f4f51bb154a0 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/lstm_utils.h
+++ b/tensorflow/contrib/lite/toco/graph_transformations/lstm_utils.h
@@ -12,6 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_LSTM_UTILS_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_LSTM_UTILS_H_
+
 #include <iostream>
 #include <string>
 #include <vector>
@@ -100,3 +103,5 @@ bool GetMatchingRnnArray(Model* model, const string& back_edge_source_array,
                          string* rnn_array);
 
 }  // namespace toco
+
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_LSTM_UTILS_H_
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/merge_reshape_into_preceding_transpose.cc b/tensorflow/contrib/lite/toco/graph_transformations/merge_reshape_into_preceding_transpose.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5065004093434475172a39efdcfd26c10c49148b
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/merge_reshape_into_preceding_transpose.cc
@@ -0,0 +1,190 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/runtime/types.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+bool OperatorReady(const Model& model, const Operator* op) {
+  if (!model.HasArray(op->inputs[0]) || !model.HasArray(op->inputs[1]) ||
+      !model.HasArray(op->outputs[0])) {
+    // Arrays are missing.
+    return false;
+  }
+
+  if (!model.GetArray(op->inputs[0]).has_shape() ||
+      !model.GetArray(op->outputs[0]).has_shape()) {
+    // Input and output needs the shape.
+    return false;
+  }
+
+  if (!model.GetArray(op->inputs[1]).buffer) {
+    // Buffer needs to be a constant.
+    return false;
+  }
+
+  return true;
+}
+
+// Returns whether the reshape could be a transpose.
+std::vector<int32> ReshapeToTranspose(const Model& model,
+                                      const TensorFlowReshapeOperator* op) {
+  CHECK(!op->shape.empty());
+  CHECK(model.HasArray(op->inputs[0]));
+  CHECK(model.HasArray(op->outputs[0]));
+
+  const auto& input_array = model.GetArray(op->inputs[0]);
+  const auto& output_array = model.GetArray(op->outputs[0]);
+
+  CHECK(input_array.has_shape());
+  CHECK(output_array.has_shape());
+
+  std::vector<int> in_shape = input_array.shape().dims();
+  std::vector<int> out_shape = output_array.shape().dims();
+
+  std::vector<int> one_indices;
+  std::vector<int> not_one_indices;
+
+  // Separate into one indices and not one indices.
+  for (int i = 0; i < in_shape.size(); i++) {
+    if (in_shape[i] == 1) {
+      one_indices.push_back(i);
+    } else {
+      not_one_indices.push_back(i);
+    }
+  }
+
+  // Reorder the vertices.
+  std::vector<int> perm;
+  perm.reserve(in_shape.size());
+  int one_index = 0;
+  int not_one_index = 0;
+  for (const auto val : out_shape) {
+    if (val == 1) {
+      perm.push_back(one_indices[one_index]);
+      one_index++;
+    } else {
+      perm.push_back(not_one_indices[not_one_index]);
+      not_one_index++;
+    }
+  }
+
+  return perm;
+}
+
+}  // namespace
+
+// When a transpose is fed into a reshape, it is possible for the two operators
+// to be merged if the reshape does not affect memory ordering and does not
+// affects the number of dimensions. This only occurs when only unary dimensions
+// are shifting position.
+bool MergeReshapeIntoPrecedingTranspose::Run(Model* model,
+                                             std::size_t op_index) {
+  auto it = model->operators.begin() + op_index;
+  auto* reshape_op = ConvertOperator<TensorFlowReshapeOperator*>(
+      it->get(), OperatorType::kTensorFlowReshape);
+
+  if (reshape_op == nullptr) {
+    return false;
+  }
+
+  if (!OperatorReady(*model, reshape_op) || reshape_op->shape.empty()) {
+    return false;
+  }
+
+  const string intermediate_name = reshape_op->inputs[0];
+  const string output_name = reshape_op->outputs[0];
+
+  // Guarantee the input is only consume by the reshape.
+  if (CountOpsWithInput(*model, intermediate_name) != 1) {
+    return false;
+  }
+
+  // Check for the parent operator.
+  const auto& transpose_it = FindOpWithOutput(*model, intermediate_name);
+  if (transpose_it == model->operators.end()) {
+    return false;
+  }
+
+  // Find the parent operator and guarantee it is a transpose.
+  TransposeOperator* transpose_op = ConvertOperator<TransposeOperator*>(
+      transpose_it->get(), OperatorType::kTranspose);
+
+  if (transpose_op == nullptr) {
+    return false;
+  }
+
+  if (!OperatorReady(*model, transpose_op) || transpose_op->perm.empty()) {
+    return false;
+  }
+
+  if (!ReshapeIsEquivalentToTranspose(*model, reshape_op,
+                                      false /*allow_extra_unary_dimensions*/)) {
+    return false;
+  }
+
+  // Check that the intermediate is not an output array.
+  if (!IsDiscardableArray(*model, intermediate_name)) {
+    AddMessageF(
+        "Cannot fuse %s and %s as it would invalidate the transpose "
+        "output array.",
+        LogName(*transpose_op), LogName(*reshape_op));
+    return false;
+  }
+
+  AddMessageF("Merging operations %s and %s", LogName(*transpose_op),
+              LogName(*reshape_op));
+
+  // const auto& intermediate_array = model->GetArray(intermediate_name);
+  // const auto& output_array = model->GetArray(output_name);
+
+  auto merged_perm = ReshapeToTranspose(*model, reshape_op);
+
+  // Combine the permutations.
+  const auto& transpose_perm = transpose_op->perm;
+  for (int i = 0; i < merged_perm.size(); i++) {
+    merged_perm[i] = transpose_perm[merged_perm[i]];
+  }
+
+  // Remove the reshape as passthrough operation.
+  if (!RemoveTrivialPassthroughOp(this, model, op_index)) {
+    return false;
+  }
+
+  // Update transpose_op's constant buffer to contain the new permutation.
+  model->GetArray(transpose_op->inputs[1])
+      .GetMutableBuffer<ArrayDataType::kInt32>()
+      .data = merged_perm;
+  transpose_op->perm = merged_perm;
+
+  // transpose_ops's shape will likely has changed.
+  model->GetArray(transpose_op->outputs[0]).clear_shape();
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
index 778da39bf13563cbbdbe54f1140595b057253ae3..89ad58f887f3644125b64e9341e572c6b435edab 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
@@ -50,78 +50,108 @@ bool PropagateArrayDataTypes::Run(Model* model, std::size_t op_index) {
     old_output_data_types[output] = model->GetArray(output).data_type;
   }
   // Do the actual output data types propagation.
-  if (op->type == OperatorType::kDequantize ||
-      op->type == OperatorType::kResizeBilinear) {
-    // These operators unconditionally produce float outputs
-    SetDataTypeForAllOutputs(model, op, ArrayDataType::kFloat);
-  } else if (op->type == OperatorType::kTensorFlowLess ||
-             op->type == OperatorType::kTensorFlowLessEqual ||
-             op->type == OperatorType::kTensorFlowGreater ||
-             op->type == OperatorType::kTensorFlowGreaterEqual) {
-    // These operators unconditionally produce bool outputs
-    SetDataTypeForAllOutputs(model, op, ArrayDataType::kBool);
-  } else if (op->type == OperatorType::kRank ||
-             op->type == OperatorType::kTensorFlowShape) {
-    // These operators only produce int32 outputs.
-    SetDataTypeForAllOutputs(model, op, ArrayDataType::kInt32);
-  } else if (op->type == OperatorType::kTensorFlowSplit ||
-             op->type == OperatorType::kTensorFlowConcat ||
-             op->type == OperatorType::kFill) {
-    // These operators produce an output with the same type as their 2nd input
-    CHECK_GE(op->inputs.size(), 2);
-    const ArrayDataType data_type = model->GetArray(op->inputs[1]).data_type;
-    SetDataTypeForAllOutputs(model, op, data_type);
-  } else if (op->type == OperatorType::kTransposeConv) {
-    // These operators produce an output with the same type as their 3rd input
-    CHECK_GE(op->inputs.size(), 3);
-    const ArrayDataType data_type = model->GetArray(op->inputs[2]).data_type;
-    SetDataTypeForAllOutputs(model, op, data_type);
-  } else if (op->type == OperatorType::kCast) {
-    // Data type of the Cast op is specified.
-    CHECK_EQ(op->outputs.size(), 1);
-    auto* cast_op = static_cast<CastOperator*>(op);
-    model->GetArray(op->outputs[0]).data_type = cast_op->dst_data_type;
-  } else if (op->type == OperatorType::kArgMax) {
-    // Data type of the ArgMax op is specified.
-    CHECK_EQ(op->outputs.size(), 1);
-    auto* argmax_op = static_cast<ArgMaxOperator*>(op);
-    model->GetArray(op->outputs[0]).data_type = argmax_op->output_data_type;
-  } else if (op->type == OperatorType::kRange) {
-    auto* range_op = static_cast<RangeOperator*>(op);
-    // Output type of the Range op can be set via an attribute
-    ArrayDataType data_type;
-    if (range_op->dtype != ArrayDataType::kNone) {
-      // Use the type if specified
-      data_type = range_op->dtype;
-    } else {
-      // Otherwise use the first input
-      CHECK_GE(op->inputs.size(), 1);
-      data_type = model->GetArray(op->inputs[0]).data_type;
+  switch (op->type) {
+    case OperatorType::kDequantize:
+    case OperatorType::kResizeBilinear:
+      // These operators unconditionally produce float outputs
+      SetDataTypeForAllOutputs(model, op, ArrayDataType::kFloat);
+      break;
+    case OperatorType::kTensorFlowLess:
+    case OperatorType::kTensorFlowLessEqual:
+    case OperatorType::kTensorFlowGreater:
+    case OperatorType::kTensorFlowGreaterEqual:
+      // These operators unconditionally produce bool outputs
+      SetDataTypeForAllOutputs(model, op, ArrayDataType::kBool);
+      break;
+    case OperatorType::kRank:
+    case OperatorType::kTensorFlowShape:
+      // These operators only produce int32 outputs.
+      SetDataTypeForAllOutputs(model, op, ArrayDataType::kInt32);
+      break;
+    case OperatorType::kTensorFlowSplit:
+    case OperatorType::kTensorFlowConcat:
+    case OperatorType::kFill: {
+      // These operators produce an output with the same type as their 2nd input
+      CHECK_GE(op->inputs.size(), 2);
+      const ArrayDataType data_type = model->GetArray(op->inputs[1]).data_type;
+      SetDataTypeForAllOutputs(model, op, data_type);
+      break;
     }
-    CHECK_EQ(op->outputs.size(), 1);
-    SetDataTypeForAllOutputs(model, op, data_type);
-  } else if (op->type == OperatorType::kTensorFlowUnsupported) {
-    auto* unsupported_op = static_cast<TensorFlowUnsupportedOperator*>(op);
-    // Some output tensors from the op could be eliminated by optimization.
-    // This can make unsupported_op->output_data_types have more elements than
-    // op->outputs.
-    if (unsupported_op->output_data_types.size() < op->outputs.size()) {
+    case OperatorType::kTransposeConv: {
+      // These operators produce an output with the same type as their 3rd input
+      CHECK_GE(op->inputs.size(), 3);
+      const ArrayDataType data_type = model->GetArray(op->inputs[2]).data_type;
+      SetDataTypeForAllOutputs(model, op, data_type);
+      break;
+    }
+    case OperatorType::kCast: {
+      // Data type of the Cast op is specified.
+      CHECK_EQ(op->outputs.size(), 1);
+      auto* cast_op = static_cast<CastOperator*>(op);
+      model->GetArray(op->outputs[0]).data_type = cast_op->dst_data_type;
+      break;
+    }
+    case OperatorType::kArgMax: {
+      // Data type of the ArgMax op is specified.
+      CHECK_EQ(op->outputs.size(), 1);
+      auto* argmax_op = static_cast<ArgMaxOperator*>(op);
+      model->GetArray(op->outputs[0]).data_type = argmax_op->output_data_type;
+      break;
+    }
+    case OperatorType::kRange: {
+      auto* range_op = static_cast<RangeOperator*>(op);
+      // Output type of the Range op can be set via an attribute
+      ArrayDataType data_type;
+      if (range_op->dtype != ArrayDataType::kNone) {
+        // Use the type if specified
+        data_type = range_op->dtype;
+      } else {
+        // Otherwise use the first input
+        CHECK_GE(op->inputs.size(), 1);
+        data_type = model->GetArray(op->inputs[0]).data_type;
+      }
+      CHECK_EQ(op->outputs.size(), 1);
+      SetDataTypeForAllOutputs(model, op, data_type);
+      break;
+    }
+    case OperatorType::kRandomUniform: {
+      auto* rand_op = static_cast<RandomUniformOperator*>(op);
+      // The output type of RandomUniform is specified with an attribute
+      if (rand_op->dtype == ArrayDataType::kNone) {
+        return false;
+      }
+      CHECK_EQ(op->outputs.size(), 1);
+      SetDataTypeForAllOutputs(model, op, rand_op->dtype);
+      break;
+    }
+    case OperatorType::kTensorFlowUnsupported: {
+      auto* unsupported_op = static_cast<TensorFlowUnsupportedOperator*>(op);
+      // Some output tensors from the op could be eliminated by optimization.
+      // This can make unsupported_op->output_data_types have more elements than
+      // op->outputs.
+      if (unsupported_op->output_data_types.size() < op->outputs.size()) {
+        return false;
+      }
+      for (int i = 0; i < op->outputs.size(); ++i) {
+        auto output = op->outputs[i];
+        auto data_type = unsupported_op->output_data_types[i];
+        model->GetArray(output).data_type = data_type;
+      }
+      break;
+    }
+    case OperatorType::kExpandDims: {
+      // Yield on ExpandDim until it is converted to Reshape
       return false;
     }
-    for (int i = 0; i < op->outputs.size(); ++i) {
-      auto output = op->outputs[i];
-      auto data_type = unsupported_op->output_data_types[i];
-      model->GetArray(output).data_type = data_type;
+    default: {
+      // These operators produce outputs with the same type as their 1st input
+      CHECK_GT(op->inputs.size(), 0);
+      const ArrayDataType data_type = model->GetArray(op->inputs[0]).data_type;
+      SetDataTypeForAllOutputs(model, op, data_type);
+      break;
     }
-  } else if (op->type == OperatorType::kExpandDims) {
-    // Yield on ExpandDim until it is converted to Reshape
-    return false;
-  } else {
-    // These operators produce outputs with the same type as their 1st input
-    CHECK_GT(op->inputs.size(), 0);
-    const ArrayDataType data_type = model->GetArray(op->inputs[0]).data_type;
-    SetDataTypeForAllOutputs(model, op, data_type);
   }
+
   // Return true if any output data type changed, false if none changed.
   for (const auto& output : op->outputs) {
     if (old_output_data_types[output] != model->GetArray(output).data_type) {
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index 375848a7d408b58eab8692a15376b5b2e04b34af..68d6f21cf847bfb112213cabd326854bb174cfc1 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -38,6 +38,16 @@ void ComputeConvSizes(const Shape& input_shape, int output_depth, int kwidth,
   const int input_height = input_shape.dims(1);
   const int batch = input_shape.dims(0);
 
+  CHECK_GE(input_width, 1);
+  CHECK_GE(input_height, 1);
+  CHECK_GE(batch, 1);
+  CHECK_GE(kwidth, 1);
+  CHECK_GE(kheight, 1);
+  CHECK_GE(stride_width, 1);
+  CHECK_GE(stride_height, 1);
+  CHECK_GE(dilation_width_factor, 1);
+  CHECK_GE(dilation_height_factor, 1);
+
   int dilated_kwidth = dilation_width_factor * (kwidth - 1) + 1;
   int dilated_kheight = dilation_height_factor * (kheight - 1) + 1;
 
@@ -392,8 +402,7 @@ void ProcessSpaceToDepthOperator(Model* model, SpaceToDepthOperator* op) {
                          depth * block_size * block_size}));
 }
 
-void ProcessFillOperator(Model* model, FillOperator* op) {
-  CHECK_EQ(op->inputs.size(), 2);
+void ProcessOpWithShapeInput(Model* model, Operator* op) {
   CHECK_EQ(op->outputs.size(), 1);
   auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.has_shape()) {
@@ -1467,6 +1476,7 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
     case OperatorType::kRelu:
     case OperatorType::kRelu1:
     case OperatorType::kRelu6:
+    case OperatorType::kPRelu:
     case OperatorType::kSoftmax:
     case OperatorType::kLogSoftmax:
     case OperatorType::kLogistic:
@@ -1528,7 +1538,8 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
                                   static_cast<SpaceToDepthOperator*>(op));
       break;
     case OperatorType::kFill:
-      ProcessFillOperator(model, static_cast<FillOperator*>(op));
+      CHECK_EQ(op->inputs.size(), 2);
+      ProcessOpWithShapeInput(model, op);
       break;
     case OperatorType::kFullyConnected:
       ProcessFullyConnectedOperator(model,
@@ -1658,6 +1669,10 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
       // transforms that remove them, so we avoid propagating shapes through
       // them and let things settle once they've been removed.
       break;
+    case OperatorType::kRandomUniform:
+      CHECK_EQ(op->inputs.size(), 1);
+      ProcessOpWithShapeInput(model, op);
+      break;
     default:
       // Unimplemented, another graph transformation should drop it.
       LOG(FATAL) << "Unhandled operator type " << OperatorTypeName(op->type);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
index ad3f05274b59e3019726f1b7a7080e74d1934c89..7784558b2275f19a34089011bd70cc04b45819ae 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
@@ -52,7 +52,7 @@ bool SupportsQuantization(const Operator& op) {
          type == OperatorType::kStridedSlice ||
          type == OperatorType::kDepthToSpace ||
          type == OperatorType::kLstmCell || type == OperatorType::kGather ||
-         type == OperatorType::kTranspose;
+         type == OperatorType::kTranspose || type == OperatorType::kMean;
 }
 
 template <ArrayDataType A>
@@ -65,8 +65,6 @@ std::unique_ptr<GenericBuffer> QuantizeBuffer(
       static_cast<const Buffer<ArrayDataType::kFloat>&>(buffer);
   auto* quantized_buffer = new Buffer<A>;
   quantized_buffer->data.resize(float_buffer.data.size());
-  const auto qmin = static_cast<int32>(std::numeric_limits<DataType<A>>::min());
-  const auto qmax = static_cast<int32>(std::numeric_limits<DataType<A>>::max());
   for (std::size_t i = 0; i < float_buffer.data.size(); i++) {
     const float src_val = float_buffer.data[i];
     double scaled_val;  // Astonishingly, using 'float' degrades accuracy just
@@ -78,9 +76,8 @@ std::unique_ptr<GenericBuffer> QuantizeBuffer(
     } else {
       scaled_val = quantization_params.zero_point + inverse_scale * src_val;
     }
-    const auto rounded_val = static_cast<int32>(std::round(scaled_val));
-    const auto clamped_val = std::min(qmax, std::max(qmin, rounded_val));
-    quantized_buffer->data[i] = static_cast<DataType<A>>(clamped_val);
+    quantized_buffer->data[i] =
+        tflite::SafeCast<DataType<A>>(std::round(scaled_val));
   }
   return std::unique_ptr<GenericBuffer>(quantized_buffer);
 }
@@ -475,6 +472,44 @@ bool ChooseQuantizationForOperatorOutput(
 
   return true;
 }
+
+// Fixes array minmax info to match the quantization parameters.
+// This is required for when quantization parameters change for an array during
+// quantization (such as ChooseQuantizationForOperatorOutput).
+void FixMinMaxPostQuantization(ArrayDataType quantized_data_type,
+                               const QuantizationParams& quantization_params,
+                               MinMax* minmax) {
+  double qmin, qmax;
+  switch (quantized_data_type) {
+    case ArrayDataType::kUint8:
+      qmin = 0;
+      qmax = 255;
+      break;
+    case ArrayDataType::kInt16:
+      qmin = -32768;
+      qmax = 32767;
+      break;
+    default:
+      // No update required.
+      return;
+  }
+
+  // Compute new minmax values.
+  double min =
+      (qmin - quantization_params.zero_point) * quantization_params.scale;
+  double max =
+      (qmax - quantization_params.zero_point) * quantization_params.scale;
+
+  // If we are close to the existing minmax values don't bother changing them.
+  // This prevents propagating small floating point precision errors.
+  constexpr double kMinMaxThreshold = 1e-5;
+  const double width = max - min;
+  if (std::abs(min - minmax->min) > kMinMaxThreshold * width ||
+      std::abs(max - minmax->max) > kMinMaxThreshold * width) {
+    minmax->min = min;
+    minmax->max = max;
+  }
+}
 }  // namespace
 
 bool Quantize::Run(Model* model, std::size_t op_index) {
@@ -621,12 +656,19 @@ bool Quantize::Run(Model* model, std::size_t op_index) {
                                             &quantization_params)) {
       changed = true;
       const auto& output = op.outputs[output_index];
+      auto& output_array = model->GetArray(output);
+
+      // Fix up the min/max information on the output array to match the chosen
+      // quantization parameters.
+      auto& output_minmax = output_array.GetMinMax();
+      FixMinMaxPostQuantization(quantized_data_type, quantization_params,
+                                &output_minmax);
+
       QuantizeArray(this, model, output, quantized_data_type,
                     quantization_params);
+
       const auto& dequantized_output =
           AvailableArrayName(*model, output + "_dequantized");
-      const auto& output_array = model->GetArray(output);
-      const auto& output_minmax = output_array.GetMinMax();
       auto& dequantized_output_array =
           model->GetOrCreateArray(dequantized_output);
       dequantized_output_array.data_type = ArrayDataType::kFloat;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/read_fake_quant_min_max.cc b/tensorflow/contrib/lite/toco/graph_transformations/read_fake_quant_min_max.cc
index 11f8d4b6eea836c5fe4efcbd5136e6183a59dc62..bdcca5b7caf61a62203debaa32c4d7a9b2eb43fa 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/read_fake_quant_min_max.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/read_fake_quant_min_max.cc
@@ -72,6 +72,13 @@ bool ReadFakeQuantMinMax::Run(Model* model, std::size_t op_index) {
     minmax.min = min_array.GetBuffer<ArrayDataType::kFloat>().data[0];
     minmax.max = max_array.GetBuffer<ArrayDataType::kFloat>().data[0];
     // We always want [min, max] to contain 0.
+    if (minmax.min > 0 || minmax.max < 0) {
+      LOG(ERROR) << "For " << LogName(*fakequant_op) << " the MinMax range "
+                 << "[" << minmax.min << ", " << minmax.max
+                 << "] does not contain 0. "
+                 << "Proceeding by tweaking it to contain 0, which will result "
+                    "in poor accuracy.";
+    }
     minmax.min = std::min(minmax.min, 0.);
     minmax.max = std::max(minmax.max, 0.);
 
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/reorder_activation_functions.cc b/tensorflow/contrib/lite/toco/graph_transformations/reorder_activation_functions.cc
deleted file mode 100644
index 9852c86c21b9a0714bc728e60b5d9dfe61ff52d1..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/graph_transformations/reorder_activation_functions.cc
+++ /dev/null
@@ -1,137 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/runtime/types.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace toco {
-
-bool ReorderActivationFunctions::Run(Model* model, std::size_t op_index) {
-  const auto ac_it = model->operators.begin() + op_index;
-  std::unique_ptr<Operator>& ac_op = *ac_it;
-  DCHECK(ac_op);
-
-  if (ac_op->type != OperatorType::kRelu6 &&
-      ac_op->type != OperatorType::kRelu1 &&
-      ac_op->type != OperatorType::kRelu) {
-    return false;
-  }
-
-  auto exchange_it = FindOpWithOutput(*model, ac_op->inputs[0]);
-  if (exchange_it == model->operators.end()) return false;
-  // Find the op producing the array passed to this activation function
-  std::unique_ptr<Operator>& exchange_op = *exchange_it;
-  DCHECK(exchange_op);
-
-  // Allow activation functions to move up over any operator that does not
-  // change the values.
-  switch (exchange_op->type) {
-    case OperatorType::kExpandDims:
-    case OperatorType::kSqueeze:
-    case OperatorType::kTensorFlowReshape:
-    case OperatorType::kTranspose:
-      break;
-    default:
-      return false;
-  }
-
-  DCHECK_EQ(exchange_op->outputs[0], ac_op->inputs[0]);
-  const auto exchange_op_input = exchange_op->inputs[0];
-  const auto intermediate_array = exchange_op->outputs[0];
-  const auto ac_op_output = ac_op->outputs[0];
-
-  int count_ops_consuming_output =
-      CountOpsWithInput(*model, intermediate_array);
-  DCHECK_GE(count_ops_consuming_output, 1);
-  if (count_ops_consuming_output > 1) {
-    AddMessageF(
-        "Not exchanging activation function with %s because it is consumed by "
-        "more than 1 other operator",
-        LogName(*exchange_op));
-    return false;
-  }
-
-  // If the ac_op was originally producing an output_array we can't trivially
-  // reorder as otherwise the output array name would change and break
-  // downstream assumptions. To work around that we perform some renaming below
-  // in that case at the cost of a bit more confusing array names in this rare
-  // case.
-  bool is_ac_op_output =
-      std::find(model->flags.output_arrays().begin(),
-                model->flags.output_arrays().end(),
-                ac_op_output) != model->flags.output_arrays().end();
-  if (is_ac_op_output) {
-    // To preserve the output array name of the activation function we need to
-    // create a temporary to use to pass between ac->ex.
-    //
-    // Original:
-    //  (a) -> EX -> (b) -> AC -> (c)
-    // Now:
-    //  (a) -> AC -> (c') -> EX -> (c)
-    AddMessageF(
-        "Exchanging activation function %s with %s but renaming to preserve "
-        "output array %s",
-        LogName(*ac_op), LogName(*exchange_op), ac_op->outputs[0]);
-
-    auto renamed_ac_op_output =
-        AvailableArrayName(*model, ac_op_output + "_exchange");
-    ac_op->inputs[0] = exchange_op_input;
-    ac_op->outputs[0] = renamed_ac_op_output;
-    model->EraseArray(exchange_op->outputs[0]);
-    exchange_op->inputs[0] = renamed_ac_op_output;
-    exchange_op->outputs[0] = ac_op_output;
-  } else {
-    // Simply swap the order and update consumers to use the exchange_op output
-    // array (b).
-    //
-    // Original:
-    //  (a) -> EX -> (b) -> AC -> (c)
-    // Now:
-    //  (a) -> AC -> (c) -> EX -> (b)
-    AddMessageF("Exchanging activation function %s with %s", LogName(*ac_op),
-                LogName(*exchange_op));
-
-    Operator* consumer = GetFirstOpWithInput(*model, ac_op_output);
-    while (consumer) {
-      for (int i = 0; i < consumer->inputs.size(); ++i) {
-        if (consumer->inputs[i] == ac_op_output) {
-          consumer->inputs[i] = intermediate_array;
-        }
-      }
-      consumer = GetFirstOpWithInput(*model, ac_op_output);
-    }
-    ac_op->inputs[0] = exchange_op_input;
-    exchange_op->inputs[0] = ac_op_output;
-  }
-
-  // Clear shapes; this will allow shape propagation to fix the sizes for us.
-  model->GetOrCreateArray(ac_op->outputs[0]).clear_shape();
-  model->GetOrCreateArray(exchange_op->outputs[0]).clear_shape();
-
-  // Finally, reorder operators.  Note that this only works when there are no
-  // other direct descendents of the exchange_op.
-  ac_op.swap(exchange_op);
-
-  return true;
-}
-
-}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/reorder_elementwise_unary.cc b/tensorflow/contrib/lite/toco/graph_transformations/reorder_elementwise_unary.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9f5b7920cb937b021eb23fc1d5fdc3c1ff18a72d
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/reorder_elementwise_unary.cc
@@ -0,0 +1,153 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <iterator>
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+bool IsElementwiseOperator(OperatorType optype) {
+  switch (optype) {
+    case OperatorType::kCast:
+    case OperatorType::kExp:
+    case OperatorType::kFloor:
+    case OperatorType::kNeg:
+    case OperatorType::kRelu:
+    case OperatorType::kRelu1:
+    case OperatorType::kRelu6:
+    case OperatorType::kTanh:
+    case OperatorType::kTensorFlowSqrt:
+    case OperatorType::kTensorFlowSquare:
+      return true;
+    default:
+      return false;
+  }
+}
+
+bool IsMoveOperator(OperatorType optype) {
+  switch (optype) {
+    case OperatorType::kDepthToSpace:
+    case OperatorType::kExpandDims:
+    case OperatorType::kSpaceToDepth:
+    case OperatorType::kSqueeze:
+    case OperatorType::kTensorFlowReshape:
+    case OperatorType::kTranspose:
+      return true;
+    default:
+      return false;
+  }
+}
+
+}  // namespace
+
+// Swap elementwise operators such that all value operators occur before all
+// element move operators, e.g. negation then transpose.
+bool ReorderElementwiseUnary::Run(Model* model, std::size_t op_index) {
+  const auto element_op_it = model->operators.begin() + op_index;
+  std::unique_ptr<Operator>& element_op = *element_op_it;
+  if (!IsElementwiseOperator(element_op->type)) {
+    return false;
+  }
+
+  const string intermediate_name = element_op->inputs[0];
+  auto it = FindOpWithOutput(*model, intermediate_name);
+  if (it == model->operators.end()) {
+    AddMessageF("No preceding operator");
+    return false;
+  }
+
+  std::unique_ptr<Operator>& move_op = *it;
+  if (!IsMoveOperator(move_op->type)) {
+    AddMessageF("Preceding operator is not a move operator");
+    return false;
+  }
+
+  if (CountOpsWithInput(*model, intermediate_name) != 1) {
+    AddMessageF("Input %s used elsewhere", intermediate_name);
+    return false;
+  }
+
+  // Check that the intermediate is discardable.
+  if (!IsDiscardableArray(*model, intermediate_name)) {
+    AddMessageF(
+        "Cannot swap elementwise as it would invalidate %s which is "
+        "an output array.",
+        intermediate_name);
+    return false;
+  }
+
+  // op->inputs may change so we need to keep a value by copy.
+  const string input_name = move_op->inputs[0];
+  const string output_name = element_op->outputs[0];
+
+  AddMessageF("Swapping around operators with %s and %s", LogName(*element_op),
+              LogName(*move_op));
+
+  // If the output array is an exit node for the graph then we need to retain
+  // the name as an output node. This makes the naming scheme a little confusing
+  // but is required in this rare case.
+  if (!IsDiscardableArray(*model, output_name)) {
+    // The output name of the sequence needs to stay static, so create a new
+    // array new use for the intermediate.
+    const auto new_intermediate_name =
+        AvailableArrayName(*model, element_op->outputs[0] + "_reorder");
+    AddMessageF("Adding new array %s to preserve output array name %s",
+                new_intermediate_name, output_name);
+
+    element_op->inputs[0] = input_name;
+    element_op->outputs[0] = new_intermediate_name;
+    model->EraseArray(intermediate_name);
+    move_op->inputs[0] = new_intermediate_name;
+    move_op->outputs[0] = output_name;
+  } else {
+    // The intermediate array is now the output array.
+    for (int i = 0; i < model->operators.size(); i++) {
+      Operator* consumer = model->operators[i].get();
+      for (int j = 0; j < consumer->inputs.size(); j++) {
+        if (consumer->inputs[j] == output_name) {
+          consumer->inputs[j] = intermediate_name;
+        }
+      }
+    }
+
+    element_op->inputs[0] = input_name;
+    move_op->inputs[0] = output_name;
+  }
+
+  // Reset both arrays as shape, type, min/max, etc can all change because of
+  // the position swap.
+  model->EraseArray(element_op->outputs[0]);
+  model->EraseArray(move_op->outputs[0]);
+
+  // Reconstruct.
+  model->GetOrCreateArray(element_op->outputs[0]);
+  model->GetOrCreateArray(move_op->outputs[0]);
+
+  // Swap the order of the operators.
+  element_op.swap(move_op);
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/reorder_reshape_transpose.cc b/tensorflow/contrib/lite/toco/graph_transformations/reorder_reshape_transpose.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9e7fe1b1ccd851dd998e59e75ff798f52f7c6e5a
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/reorder_reshape_transpose.cc
@@ -0,0 +1,248 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <iterator>
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+bool OperatorReady(const Model& model, const Operator* op) {
+  if (!model.HasArray(op->inputs[0]) || !model.HasArray(op->inputs[1]) ||
+      !model.HasArray(op->outputs[0])) {
+    return false;
+  }
+
+  if (!model.GetArray(op->inputs[0]).has_shape() ||
+      !model.GetArray(op->outputs[0]).has_shape()) {
+    // Input and output needs the shape.
+    return false;
+  }
+
+  if (!model.GetArray(op->inputs[1]).buffer) {
+    // Buffer needs to be a constant.
+    return false;
+  }
+
+  return true;
+}
+
+// Utility function to filter out a value.
+void Filter(std::vector<int>* vec, int value) {
+  vec->erase(std::remove(vec->begin(), vec->end(), value), vec->end());
+}
+
+// Computes a new permutation used to swap a reshape-transpose to a
+// transpose-reshape. In this case the permutation operates on the intermediate
+// shape.
+std::vector<int> ComputeNewPerm(std::vector<int> input_dims,
+                                std::vector<int> intermediate_dims,
+                                std::vector<int> perm) {
+  // These are the major axis of the input.
+  std::vector<int> input_indices;
+  for (int i = 0; i < input_dims.size(); i++) {
+    if (input_dims[i] != 1) {
+      input_indices.push_back(i);
+    }
+  }
+
+  // This maps which indices of the input produced the intermediate indices for
+  // non-unary dimensions.
+  std::unordered_map<int, int> intermediate_to_input_indices_map;
+  for (int i = 0; i < intermediate_dims.size(); i++) {
+    if (intermediate_dims[i] != 1) {
+      intermediate_to_input_indices_map[i] =
+          input_indices[intermediate_to_input_indices_map.size()];
+    }
+  }
+
+  // Translate the transpose permutation to a new permutation starting with the
+  // major indices.
+  std::vector<int> new_perm;
+  new_perm.reserve(input_dims.size());
+  for (int i = 0; i < perm.size(); i++) {
+    if (intermediate_dims[perm[i]] == 1) continue;
+
+    new_perm.push_back(intermediate_to_input_indices_map[perm[i]]);
+  }
+
+  // Fill the rest of the transpose in with the ones.
+  for (int index = 0; index < input_dims.size(); index++) {
+    if (input_dims[index] == 1) {
+      new_perm.push_back(index);
+    }
+  }
+
+  CHECK_EQ(new_perm.size(), input_dims.size());
+  return new_perm;
+}
+
+}  // namespace
+
+// Swaps reshape-transpose to transpose-reshape whenever possible. This is
+// possible when the reshape does not affect memory ordering.
+bool ReorderReshapeTranspose::Run(Model* model, std::size_t op_index) {
+  auto transpose_it = model->operators.begin() + op_index;
+
+  TransposeOperator* transpose_op = ConvertOperator<TransposeOperator*>(
+      transpose_it->get(), OperatorType::kTranspose);
+
+  if (transpose_op == nullptr) {
+    return false;
+  }
+
+  if (!OperatorReady(*model, transpose_op) || transpose_op->perm.empty()) {
+    // Wait for values to propagate.
+    return false;
+  }
+
+  // Find the operator that produces the transpose op.
+  auto reshape_it = FindOpWithOutput(*model, transpose_op->inputs[0]);
+  if (reshape_it == model->operators.end()) {
+    return false;
+  }
+
+  TensorFlowReshapeOperator* reshape_op =
+      ConvertOperator<TensorFlowReshapeOperator*>(
+          reshape_it->get(), OperatorType::kTensorFlowReshape);
+  if (reshape_op == nullptr) {
+    return false;
+  }
+
+  // Ignore if the reshape is uninitialized.
+  if (!OperatorReady(*model, reshape_op) || reshape_op->shape.empty()) {
+    return false;
+  }
+
+  // Need to copy to keep static if permutated.
+  const string input_name = reshape_op->inputs[0];
+  const string intermediate_name = reshape_op->outputs[0];
+  const string output_name = transpose_op->outputs[0];
+
+  // Intermediate should not be consumed by any other operators.
+  if (CountOpsWithInput(*model, intermediate_name) != 1) {
+    AddMessageF("Input %s used elsewhere", intermediate_name);
+    return false;
+  }
+
+  // Check that the intermediate is not an output array.
+  if (!IsDiscardableArray(*model, intermediate_name)) {
+    AddMessageF(
+        "Cannot reorder reshape-transpose as it would invalidate %s which is "
+        "an output array.",
+        intermediate_name);
+    return false;
+  }
+
+  // Get the arrays.
+  const auto& input_array = model->GetArray(input_name);
+  const auto& intermediate_array = model->GetArray(intermediate_name);
+  const auto& output_array = model->GetArray(output_name);
+
+  // Get the shapes of each array.
+  Shape input_shape = input_array.shape();
+  Shape intermediate_shape = intermediate_array.shape();
+  Shape output_shape = output_array.shape();
+
+  // Assign ids to non-unary indices.
+  std::vector<int> input_dims = input_shape.dims();
+  std::vector<int> intermediate_dims = intermediate_shape.dims();
+  std::vector<int> output_dims = output_shape.dims();
+
+  // If the reshape is equivalent to a transpose with fewer/more unary
+  // dimensions then it can be moved between the transpose.
+  if (!ReshapeIsEquivalentToTranspose(*model, reshape_op,
+                                      true /*allow_extra_unary_dims*/)) {
+    return false;
+  }
+
+  if (!IsDiscardableArray(*model, output_name)) {
+    // The output name of the sequence needs to stay static, so create a new
+    // array new use for the intermediate.
+    const auto new_intermediate_name =
+        AvailableArrayName(*model, transpose_op->outputs[0] + "_exchange");
+    AddMessageF("Adding new array %s to preserve output array name %s",
+                new_intermediate_name, transpose_op->outputs[0]);
+    transpose_op->inputs[0] = input_name;
+    transpose_op->outputs[0] = new_intermediate_name;
+    reshape_op->inputs[0] = new_intermediate_name;
+    reshape_op->outputs[0] = output_name;
+    model->EraseArray(intermediate_name);
+  } else {
+    // The intermediate array is now the output array.
+    for (int i = 0; i < model->operators.size(); i++) {
+      Operator* consumer = model->operators[i].get();
+      for (int j = 0; j < consumer->inputs.size(); j++) {
+        if (consumer->inputs[j] == output_name) {
+          consumer->inputs[j] = intermediate_name;
+        }
+      }
+    }
+
+    transpose_op->inputs[0] = input_name;
+    reshape_op->inputs[0] = output_name;
+  }
+
+  // If transposes constant buffer is used elsewhere, make a new copy.
+  if (CountOpsWithInput(*model, transpose_op->inputs[1]) != 1) {
+    transpose_op->inputs[1] =
+        AvailableArrayName(*model, transpose_op->inputs[1] + "_copy");
+  }
+
+  // Make the new transpose permutation.
+  const std::vector<int> new_perm =
+      ComputeNewPerm(input_dims, intermediate_dims, transpose_op->perm);
+  CHECK_EQ(input_dims.size(), new_perm.size());
+
+  auto& transpose_array = model->GetOrCreateArray(transpose_op->inputs[1]);
+  transpose_array.GetMutableBuffer<ArrayDataType::kInt32>().data = new_perm;
+  *(transpose_array.mutable_shape()->mutable_dims()) = {
+      static_cast<int>(new_perm.size())};
+  transpose_op->perm = new_perm;
+
+  // If the reshape's constant buffer is reused, create a new one.
+  if (CountOpsWithInput(*model, reshape_op->inputs[1]) != 1) {
+    reshape_op->inputs[1] =
+        AvailableArrayName(*model, reshape_op->inputs[1] + "_copy");
+  }
+
+  // We need to modify the reshape input array to target the new output size.
+  auto& reshape_array = model->GetOrCreateArray(reshape_op->inputs[1]);
+  reshape_array.GetMutableBuffer<ArrayDataType::kInt32>().data = output_dims;
+  *(reshape_array.mutable_shape()->mutable_dims()) = {
+      static_cast<int>(output_shape.dimensions_count())};
+  reshape_op->shape.clear();
+
+  AddMessageF("Swapping around operators between %s and %s", input_name,
+              output_name);
+
+  model->GetOrCreateArray(transpose_op->outputs[0]).clear_shape();
+  model->GetOrCreateArray(reshape_op->outputs[0]).clear_shape();
+
+  // Swap the order of the operators.
+  transpose_it->swap(*reshape_it);
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_random_uniform.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_random_uniform.cc
new file mode 100644
index 0000000000000000000000000000000000000000..88d06d7dc75005c89a69b881aa0064d1162227d5
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_random_uniform.cc
@@ -0,0 +1,116 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <algorithm>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+#include "tensorflow/core/lib/random/philox_random.h"
+#include "tensorflow/core/lib/random/random_distributions.h"
+
+namespace toco {
+
+template <ArrayDataType Type>
+bool ComputeRandomUniformArray(Model* model, RandomUniformOperator* op) {
+  typedef tensorflow::random::UniformDistribution<
+      tensorflow::random::PhiloxRandom, DataType<Type>>
+      Distribution;
+
+  // Allocate output
+  auto& output_array = model->GetArray(op->outputs[0]);
+  CHECK(output_array.data_type == Type);
+  std::vector<DataType<Type>>& data =
+      output_array.GetMutableBuffer<Type>().data;
+  data.resize(RequiredBufferSizeForShape(output_array.shape()));
+
+  // We use the same random number generator and distribution as TensorFlow to
+  // produce the exact same values given the same seeds. See
+  // tensorflow::functor::FillPhiloxRandomTask<Distribution, false> in
+  // //third_party/tensorflow/core/kernels/random_op.cc for the implementation.
+  tensorflow::random::PhiloxRandom generator(op->seed, op->seed2);
+  Distribution dist;
+
+  // The generator creates Distribution::kResultElementCount samples at a time.
+  size_t offset = 0;
+  size_t num_samples = Distribution::kResultElementCount;
+  while (offset < data.size()) {
+    const typename Distribution::ResultType samples = dist(&generator);
+    std::copy(&samples[0],
+              &samples[0] + std::min(num_samples, data.size() - offset),
+              &data[0] + offset);
+    offset += num_samples;
+  }
+
+  return true;
+}
+
+bool ResolveConstantRandomUniform::Run(Model* model, std::size_t op_index) {
+  const auto it = model->operators.begin() + op_index;
+  auto* base_op = it->get();
+  if (base_op->type != OperatorType::kRandomUniform) {
+    return false;
+  }
+  auto* op = static_cast<RandomUniformOperator*>(base_op);
+
+  CHECK_EQ(op->inputs.size(), 1);
+  CHECK_EQ(op->outputs.size(), 1);
+
+  auto& output_array = model->GetArray(op->outputs[0]);
+  if (output_array.data_type == ArrayDataType::kNone) {
+    // Yield until the output type has been set by PropagateArrayDataTypes
+    return false;
+  }
+
+  if (!output_array.has_shape()) {
+    // Yield until the output shape has been set by PropagateFixedShapes
+    return false;
+  }
+
+  if ((op->seed == 0) && (op->seed2 == 0)) {
+    LOG(WARNING) << "RandomUniform op outputting \"" << op->outputs[0]
+                 << "\" is truly random (using /dev/random system entropy). "
+                    "Therefore, cannot resolve as constant. Set \"seed\" or "
+                    "\"seed2\" attr non-zero to fix this";
+    return false;
+  }
+
+  switch (output_array.data_type) {
+    case ArrayDataType::kFloat:
+      if (!ComputeRandomUniformArray<ArrayDataType::kFloat>(model, op)) {
+        return false;
+      }
+      break;
+    // For future support of double or half.
+    // case ArrayDataType::kDouble...
+    default:
+      LOG(FATAL)
+          << "Unsupported data type given to RandomUniform op with output \""
+          << op->outputs[0] << "\"";
+      break;
+  }
+
+  // Erase input arrays if no longer used
+  toco::DeleteArrayIfUsedOnce(op->inputs[0], model);
+
+  // Erase the operator
+  model->operators.erase(it);
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
index 6d5636d744bed193c212227e69f87ae2caf84273..d4db6f1c009cd19515655fb31974a2e97cfa42e8 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
@@ -28,24 +28,45 @@ limitations under the License.
 
 namespace toco {
 
+bool CopyMinMaxFromFirstInput(const Operator& op, Model* model) {
+  auto& output_array = model->GetArray(op.outputs[0]);
+  if (output_array.minmax) {
+    return false;
+  }
+  const auto& input_array = model->GetArray(op.inputs[0]);
+  if (!input_array.minmax) {
+    return false;
+  }
+  const auto& input_minmax = input_array.GetMinMax();
+  CHECK(!output_array.minmax);
+  auto& output_minmax = output_array.GetOrCreateMinMax();
+  output_minmax.min = input_minmax.min;
+  output_minmax.max = input_minmax.max;
+  return true;
+}
+
 bool ResolveConstantUnaryOperator::Run(Model* model, std::size_t op_index) {
   const auto unary_it = model->operators.begin() + op_index;
   const auto* unary_op = unary_it->get();
-  // Test for unary ops of types that we know how to resolve
-  if (unary_op->type != OperatorType::kCast &&
-      unary_op->type != OperatorType::kNeg &&
-      unary_op->type != OperatorType::kTensorFlowRsqrt &&
-      unary_op->type != OperatorType::kTensorFlowSqrt &&
-      unary_op->type != OperatorType::kTensorFlowSquare &&
-      unary_op->type != OperatorType::kTensorFlowSum &&
-      unary_op->type != OperatorType::kTensorFlowMin &&
-      unary_op->type != OperatorType::kTensorFlowMax &&
-      unary_op->type != OperatorType::kTensorFlowReshape &&
-      unary_op->type != OperatorType::kRelu6 &&
-      unary_op->type != OperatorType::kRelu1 &&
-      unary_op->type != OperatorType::kRelu) {
-    return false;
+  // Test for unary ops of types that we know how to resolve.
+  switch (unary_op->type) {
+    case OperatorType::kCast:
+    case OperatorType::kNeg:
+    case OperatorType::kTensorFlowRsqrt:
+    case OperatorType::kTensorFlowSqrt:
+    case OperatorType::kTensorFlowSquare:
+    case OperatorType::kTensorFlowSum:
+    case OperatorType::kTensorFlowMin:
+    case OperatorType::kTensorFlowMax:
+    case OperatorType::kTensorFlowReshape:
+    case OperatorType::kRelu6:
+    case OperatorType::kRelu1:
+    case OperatorType::kRelu:
+      break;
+    default:
+      return false;
   }
+
   // Check if the input is a constant parameter.
   if (!IsConstantParameterArray(*model, unary_op->inputs[0])) {
     return false;
@@ -79,6 +100,12 @@ bool ResolveConstantUnaryOperator::Run(Model* model, std::size_t op_index) {
     return false;
   }
 
+  // The min-max is only copied for ops that copy data without arithmetic.
+  // In future trivial transpose, etc, can be handled here.
+  if (unary_op->type == OperatorType::kTensorFlowReshape) {
+    CopyMinMaxFromFirstInput(*unary_op, model);
+  }
+
   const auto& input_array = model->GetArray(unary_op->inputs[0]);
   // We have already tested above for existence of buffers (synonymous to being
   // a constant param).
@@ -138,8 +165,7 @@ bool ResolveConstantUnaryOperator::Run(Model* model, std::size_t op_index) {
     }
   } else if (unary_op->type == OperatorType::kTensorFlowReshape) {
     CHECK(input_buffer_size == output_buffer_size);
-    memcpy(output_float_data.data(), (*input_float_data).data(),
-           output_buffer_size * sizeof(output_float_data[0]));
+    output_float_data = *input_float_data;
   } else if (unary_op->type == OperatorType::kTensorFlowSum) {
     CHECK_EQ(unary_op->inputs.size(), 2) << "Sum needs 2 inputs";
     if (!IsConstantParameterArray(*model, unary_op->inputs[1])) {
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_matmul.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_matmul.cc
index f38203c80fcb7ab8bc1639129fd98e4e342e5cb7..2a236d3f98784e8244942f94d5a250b5bc00a8ad 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_matmul.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_matmul.cc
@@ -60,6 +60,13 @@ bool ResolveTensorFlowMatMul::Run(Model* model, std::size_t op_index) {
   string input_lhs = matmul_op->inputs[0];
   string input_rhs = transpose_op->outputs[0];
 
+  // Construct the new FullyConnectedOperator.
+  auto* fc_op = new FullyConnectedOperator;
+  fc_op->outputs = matmul_op->outputs;
+
+  // Insert the newly constructed FullyConnectedOperator.
+  model->operators.emplace(matmul_it, fc_op) + 1;
+
   // Find the op producing the array passed to this MatMul
   auto previous_op_it = model->operators.begin();
   bool found = false;
@@ -76,13 +83,6 @@ bool ResolveTensorFlowMatMul::Run(Model* model, std::size_t op_index) {
   }
   Operator* previous_op = (found) ? previous_op_it->get() : nullptr;
 
-  // Construct the new FullyConnectedOperator.
-  auto* fc_op = new FullyConnectedOperator;
-  fc_op->outputs = matmul_op->outputs;
-
-  // Insert the newly constructed FullyConnectedOperator.
-  model->operators.emplace(matmul_it, fc_op) + 1;
-
   // Refresh iterator.
   matmul_it = model->operators.begin();
   for (; matmul_it != model->operators.end(); ++matmul_it) {
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/tests/BUILD b/tensorflow/contrib/lite/toco/graph_transformations/tests/BUILD
index 2f94f9cd8a9ab24809fb3d137b5d05ab12f43003..8dcd4adc90b188c745cadb9815c3c46383705833 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/tests/BUILD
+++ b/tensorflow/contrib/lite/toco/graph_transformations/tests/BUILD
@@ -28,15 +28,3 @@ tf_cc_test(
         "@com_google_googletest//:gtest_main",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/unpartition_embedding_lookup.cc b/tensorflow/contrib/lite/toco/graph_transformations/unpartition_embedding_lookup.cc
index 48c326651f3201b4f7a31ac2440b171841e8ed7b..cbea39bcc09ea6787c055d5aaca7f291c2b47a7f 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/unpartition_embedding_lookup.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/unpartition_embedding_lookup.cc
@@ -165,7 +165,7 @@ bool UnpartitionEmbeddingLookup::Run(Model* model, std::size_t op_index) {
   CHECK(mod_op && mod_op->type == OperatorType::kFloorMod)
       << "Unsupported partition strategy";
   CHECK_EQ(mod_op, GetOpWithOutput(*model, indices_partition_op->inputs[1]))
-      << "Indices and data parition ops require the same partition strategy "
+      << "Indices and data partition ops require the same partition strategy "
          "and inputs";
 
   // Glob together all of the gather data. This is not yet in the correct order.
diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc
index a7a50e6fc9326338d69cd0334c83b60e2fa50402..876479079b5168b09a4748a3db2077345d363678 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc
@@ -74,7 +74,7 @@ const string& GetStringAttr(const NodeDef& node, const string& attr_name) {
   return attr.s();
 }
 
-int GetIntAttr(const NodeDef& node, const string& attr_name) {
+int64 GetIntAttr(const NodeDef& node, const string& attr_name) {
   CHECK(HasAttr(node, attr_name)) << attr_name << " not found in:\n"
                                   << node.DebugString();
   const auto& attr = node.attr().at(attr_name);
@@ -569,6 +569,23 @@ void ConvertBiasAddOperator(const NodeDef& node,
   model->operators.emplace_back(biasadd);
 }
 
+void ConvertRandomUniform(const NodeDef& node,
+                          const TensorFlowImportFlags& tf_import_flags,
+                          Model* model) {
+  CHECK_EQ(node.op(), "RandomUniform");
+  CheckInputsCount(node, tf_import_flags, 1);
+
+  CHECK_EQ(GetDataTypeAttr(node, "T"), DT_INT32);
+  auto op = absl::make_unique<RandomUniformOperator>();
+  op->inputs.push_back(node.input(0));
+  op->outputs.push_back(node.name());
+  op->dtype = ConvertDataType(GetDataTypeAttr(node, "dtype"));
+  op->seed = GetIntAttr(node, "seed");
+  op->seed2 = GetIntAttr(node, "seed2");
+  CHECK(model != nullptr);
+  model->operators.emplace_back(std::move(op));
+}
+
 void ConvertReluOperator(const NodeDef& node,
                          const TensorFlowImportFlags& tf_import_flags,
                          Model* model) {
@@ -1343,13 +1360,16 @@ void ConvertFloorOperator(const NodeDef& node,
 void ConvertGatherOperator(const NodeDef& node,
                            const TensorFlowImportFlags& tf_import_flags,
                            Model* model) {
-  CHECK_EQ(node.op(), "Gather");
-  CheckInputsCount(node, tf_import_flags, 2);
+  CHECK(node.op() == "Gather" || node.op() == "GatherV2");
+  if (node.op() == "Gather") CheckInputsCount(node, tf_import_flags, 2);
+  if (node.op() == "GatherV2") CheckInputsCount(node, tf_import_flags, 3);
   const auto indices_data_type = GetDataTypeAttr(node, "Tindices");
   CHECK(indices_data_type == DT_INT32 || indices_data_type == DT_INT64);
   auto* op = new GatherOperator;
   op->inputs.push_back(node.input(0));
   op->inputs.push_back(node.input(1));
+  // TODO(ahentz): we currently ignore the third tensor in GatherV2 but we
+  // should read it an pass it on to the TF Lite Interpreter.
   op->outputs.push_back(node.name());
   model->operators.emplace_back(op);
 }
@@ -1541,7 +1561,9 @@ void ConvertMeanOperator(const NodeDef& node,
   op->inputs.push_back(node.input(1));
   op->outputs.push_back(node.name());
   model->operators.emplace_back(op);
-  if (HasAttr(node, "keep_dims")) {
+  if (HasAttr(node, "keepdims")) {
+    op->keep_dims = GetBoolAttr(node, "keepdims");
+  } else if (HasAttr(node, "keep_dims")) {
     op->keep_dims = GetBoolAttr(node, "keep_dims");
   }
 }
@@ -1926,7 +1948,7 @@ void ConvertTopKV2Operator(const NodeDef& node,
   // K can be encoded as attr (TopK) convert it to a const.
   if (HasAttr(node, "k")) {
     string k_array = CreateConstArray<ArrayDataType::kInt32>(
-        model, node.name() + "k", {GetIntAttr(node, "k")});
+        model, node.name() + "k", {static_cast<int32>(GetIntAttr(node, "k"))});
     op->inputs.push_back(k_array);
   } else {
     CheckInputsCount(node, tf_import_flags, 2);
@@ -2117,7 +2139,7 @@ std::unique_ptr<Model> ImportTensorFlowGraphDef(
       ConvertCastOperator(node, tf_import_flags, model);
     } else if (node.op() == "Floor") {
       ConvertFloorOperator(node, tf_import_flags, model);
-    } else if (node.op() == "Gather") {
+    } else if (node.op() == "Gather" || node.op() == "GatherV2") {
       ConvertGatherOperator(node, tf_import_flags, model);
     } else if (node.op() == "ResizeBilinear") {
       ConvertResizeBilinearOperator(node, tf_import_flags, model);
@@ -2163,6 +2185,8 @@ std::unique_ptr<Model> ImportTensorFlowGraphDef(
     } else if (node.op() == "DynamicStitch" ||
                node.op() == "ParallelDynamicStitch") {
       ConvertDynamicStitchOperator(node, tf_import_flags, model);
+    } else if (node.op() == "RandomUniform") {
+      ConvertRandomUniform(node, tf_import_flags, model);
     } else {
       ConvertUnsupportedOperator(node, tf_import_flags, model);
     }
diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
index 3fa0089cba15fbfed869b5c33883741c35493cda..9bd72e7de19b75c14b4a383942ae744e3ca0900d 100644
--- a/tensorflow/contrib/lite/toco/model.h
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -60,11 +60,13 @@ enum class OperatorType {
   kMaxPool,
   kFakeQuant,
   kMul,
+  kRandomUniform,
   kRange,
   kRank,
   kRelu,
   kRelu1,
   kRelu6,
+  kPRelu,
   kSoftmax,
   kLogSoftmax,
   kSub,
@@ -566,6 +568,18 @@ struct Relu6Operator : Operator {
   Relu6Operator() : Operator(OperatorType::kRelu6) {}
 };
 
+// PRelu
+//   f(x) = alpha * x for x < 0, f(x) = x for x >= 0.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//   inputs[1]: required: the alpha array
+//
+// Equivalent to keras.layers.PReLU.
+struct PReluOperator : Operator {
+  PReluOperator() : Operator(OperatorType::kPRelu) {}
+};
+
 // Element-wise Logistic operator:
 //   x -> Logistic(x) = 1 / (1 + exp(-x))
 //
@@ -933,6 +947,13 @@ struct FloorModOperator : Operator {
   FloorModOperator() : Operator(OperatorType::kFloorMod) {}
 };
 
+struct RandomUniformOperator : Operator {
+  RandomUniformOperator() : Operator(OperatorType::kRandomUniform) {}
+  ArrayDataType dtype = ArrayDataType::kNone;
+  int64 seed;
+  int64 seed2;
+};
+
 // Creates a sequence of numbers that begins at start and extends by increments
 // of delta up to but not including limit.
 //
@@ -1486,7 +1507,14 @@ class Shape {
 
   // We still have that one convenience accessor to avoid
   // the awkward double bracket issue:  shape.dims()[i].
-  int dims(int i) const { return dims_[i]; }
+  int dims(int i) const {
+    // Always check for out-of-bounds accesses, even in optimized builds where
+    // standard assertions are disabled. Out-of-bounds access here is a common
+    // occurence.
+    CHECK_GE(i, 0);
+    CHECK_GT(dims_.size(), i);
+    return dims_[i];
+  }
 
   bool operator==(const Shape& comp) const {
     return (this->dims_ == comp.dims());
diff --git a/tensorflow/contrib/lite/toco/model_cmdline_flags.cc b/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
index 4e2dec15a534607ef9207149a2e6061069eabcb1..245eb524443d64e25a9f75c1fc729ec2f6b9d253 100644
--- a/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
+++ b/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
@@ -72,6 +72,12 @@ bool ParseModelFlagsFromCommandLineFlags(
            "Shapes corresponding to --input_arrays, colon-separated. For "
            "many models each shape takes the form batch size, input array "
            "height, input array width, input array depth."),
+      Flag("batch_size", parsed_flags.batch_size.bind(),
+           parsed_flags.batch_size.default_value(),
+           "Batch size for the model. Replaces the first dimension of an "
+           "input size array if undefined. Use only with SavedModels when "
+           "--input_shapes flag is not specified. Always use --input_shapes "
+           "flag with frozen graphs."),
       Flag("input_data_type", parsed_flags.input_data_type.bind(),
            parsed_flags.input_data_type.default_value(),
            "Deprecated: use --input_data_types instead. Input array type, if "
@@ -154,6 +160,11 @@ bool ParseModelFlagsFromCommandLineFlags(
           "Path to an optional file containing a serialized ArraysExtraInfo "
           "proto allowing to pass extra information about arrays not specified "
           "in the input model file, such as extra MinMax information."),
+      Flag("model_flags_file", parsed_flags.model_flags_file.bind(),
+           parsed_flags.model_flags_file.default_value(),
+           "Path to an optional file containing a serialized ModelFlags proto. "
+           "Options specified on the command line will override the values in "
+           "the proto."),
   };
   bool asked_for_help =
       *argc == 2 && (!strcmp(argv[1], "--help") || !strcmp(argv[1], "-help"));
@@ -176,7 +187,24 @@ void ReadModelFlagsFromCommandLineFlags(
     const ParsedModelFlags& parsed_model_flags, ModelFlags* model_flags) {
   toco::port::CheckInitGoogleIsDone("InitGoogle is not done yet");
 
-// "batch" flag only exists internally
+  // Load proto containing the initial model flags.
+  // Additional flags specified on the command line will overwrite the values.
+  if (parsed_model_flags.model_flags_file.specified()) {
+    string model_flags_file_contents;
+    QCHECK(port::file::GetContents(parsed_model_flags.model_flags_file.value(),
+                                   &model_flags_file_contents,
+                                   port::file::Defaults())
+               .ok())
+        << "Specified --model_flags_file="
+        << parsed_model_flags.model_flags_file.value()
+        << " was not found or could not be read";
+    QCHECK(ParseFromStringEitherTextOrBinary(model_flags_file_contents,
+                                             model_flags))
+        << "Specified --model_flags_file="
+        << parsed_model_flags.model_flags_file.value()
+        << " could not be parsed";
+  }
+
 #ifdef PLATFORM_GOOGLE
   CHECK(!((base::SpecifiedOnCommandLine("batch") &&
            parsed_model_flags.variable_batch.specified())))
diff --git a/tensorflow/contrib/lite/toco/model_flags.proto b/tensorflow/contrib/lite/toco/model_flags.proto
index 867b86f31d16b502a7aeb92cb3d8c96117630cd2..835dea49ebc7d8f1c620b80b60b1dcdb578e83ef 100644
--- a/tensorflow/contrib/lite/toco/model_flags.proto
+++ b/tensorflow/contrib/lite/toco/model_flags.proto
@@ -96,11 +96,13 @@ message RnnState {
 // model that does not already contain such MinMax information.
 message ArraysExtraInfo {
   message Entry {
-    // Next ID to use: 5.
+    // Next ID to use: 7.
     optional string name = 1;
-    optional float min = 2;
-    optional float max = 3;
+    optional double min = 2;
+    optional double max = 3;
     optional IODataType data_type = 4;
+    optional InputArrayShape shape = 5;
+    optional float constant_float_value = 6;
   }
   repeated Entry entries = 1;
 }
diff --git a/tensorflow/contrib/lite/toco/python/BUILD b/tensorflow/contrib/lite/toco/python/BUILD
index 17115047d2ef93cce7004926c2b1a4bfa58f6243..6c4f8e12cdd5b3222997c4a2b0ac243cc74324e0 100644
--- a/tensorflow/contrib/lite/toco/python/BUILD
+++ b/tensorflow/contrib/lite/toco/python/BUILD
@@ -45,9 +45,6 @@ py_binary(
     name = "toco_wrapper",
     srcs = ["toco_wrapper.py"],
     srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py",
-    ],
 )
 
 tf_py_test(
@@ -63,15 +60,3 @@ tf_py_test(
     ],
     tags = ["no_pip"],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/toco/python/toco_wrapper.py b/tensorflow/contrib/lite/toco/python/toco_wrapper.py
index e39b5f22c7c8ffafaf72129be6f54090e6761dc3..6d6b500d7eccd353f566a4bad76df35e0e849d95 100644
--- a/tensorflow/contrib/lite/toco/python/toco_wrapper.py
+++ b/tensorflow/contrib/lite/toco/python/toco_wrapper.py
@@ -22,14 +22,19 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
 import sys
-import tensorflow as tf
 
 
 def main():
   # Pip installs the binary in aux-bin off of main site-package install.
   # Just find it and exec, passing all arguments in the process.
   # TODO(aselle): it is unfortunate to use all of tensorflow to lookup binary.
-  binary = os.path.join(tf.__path__[0], 'aux-bin/toco')
-  os.execvp(binary, sys.argv)
+  print("""TOCO from pip install is currently not working on command line.
+Please use the python TOCO API or use
+bazel run tensorflow/contrib/lite:toco -- <args> from a TensorFlow source dir.
+""")
+  sys.exit(1)
+  # TODO(aselle): Replace this when we find a way to run toco without
+  # blowing up executable size.
+  # binary = os.path.join(tf.__path__[0], 'aux-bin/toco')
+  # os.execvp(binary, sys.argv)
diff --git a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/BUILD b/tensorflow/contrib/lite/toco/tensorflow_graph_matching/BUILD
index 0c1a1141fca91e7d27fe48ffae4f834ae92a1e08..336e94de1ed3238d64f521cf1347acc8f0737de7 100644
--- a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/BUILD
+++ b/tensorflow/contrib/lite/toco/tensorflow_graph_matching/BUILD
@@ -88,15 +88,3 @@ cc_library(
         "//tensorflow/core:protos_all_cc",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/toco/tflite/BUILD b/tensorflow/contrib/lite/toco/tflite/BUILD
index a2b8145a67278c3ac0065f9551da6ffd1de60772..e0191801a0f0076565c51085ec293524d63cbe88 100644
--- a/tensorflow/contrib/lite/toco/tflite/BUILD
+++ b/tensorflow/contrib/lite/toco/tflite/BUILD
@@ -115,9 +115,11 @@ cc_library(
     deps = [
         ":operator",
         ":types",
+        "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite/schema:schema_fbs",
         "//tensorflow/contrib/lite/toco:model",
         "//tensorflow/contrib/lite/toco:tooling_util",
+        "//tensorflow/contrib/lite/tools:verifier",
         "@flatbuffers",
     ],
 )
@@ -135,15 +137,3 @@ tf_cc_test(
         "@flatbuffers",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/toco/tflite/export.cc b/tensorflow/contrib/lite/toco/tflite/export.cc
index 27719599708a7eb14f72a82f8e5d76b3b8af9dc4..335b496dccdbdb7e342515868e1d7195c98f0351 100644
--- a/tensorflow/contrib/lite/toco/tflite/export.cc
+++ b/tensorflow/contrib/lite/toco/tflite/export.cc
@@ -300,6 +300,17 @@ void Export(const Model& model, bool allow_custom_ops,
   std::set<string> error_summary;
   auto op_codes = ExportOperatorCodes(model, ops_by_type, operators_map,
                                       &builder, &error_summary);
+  const string fake_quant_operation_name = "FAKE_QUANT";
+  if (error_summary.count(fake_quant_operation_name) != 0) {
+    LOG(ERROR)
+        << fake_quant_operation_name
+        << " operation was not converted. If running quantized make sure you "
+           "are passing --inference_type=QUANTIZED_UINT8 and values for "
+           "--std_values and --mean_values.";
+    // Remove the fake quant operation from the errors, since it shouldn't
+    // be provided a custom implementation.
+    error_summary.erase(fake_quant_operation_name);
+  }
   if (!allow_custom_ops && !error_summary.empty()) {
     LOG(QFATAL) << "Some of the operators in the model are not supported by "
                    "the standard TensorFlow Lite runtime. If you have a custom "
diff --git a/tensorflow/contrib/lite/toco/tflite/import.cc b/tensorflow/contrib/lite/toco/tflite/import.cc
index e16784fd21dd5e6eeac5f03ba6250341dcdfbdb4..c0e7ab2ef57ed8edf1b7cda08c64f6ae66172af3 100644
--- a/tensorflow/contrib/lite/toco/tflite/import.cc
+++ b/tensorflow/contrib/lite/toco/tflite/import.cc
@@ -15,10 +15,12 @@ limitations under the License.
 #include "tensorflow/contrib/lite/toco/tflite/import.h"
 
 #include "flatbuffers/flexbuffers.h"
+#include "tensorflow/contrib/lite/model.h"
 #include "tensorflow/contrib/lite/schema/schema_generated.h"
 #include "tensorflow/contrib/lite/toco/tflite/operator.h"
 #include "tensorflow/contrib/lite/toco/tflite/types.h"
 #include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/contrib/lite/tools/verifier.h"
 
 namespace toco {
 
@@ -162,8 +164,20 @@ void ImportIOTensors(const ::tflite::Model& input_model,
   }
 }
 
+namespace {
+bool Verify(const void* buf, size_t len) {
+  ::flatbuffers::Verifier verifier(static_cast<const uint8_t*>(buf), len);
+  return ::tflite::VerifyModelBuffer(verifier);
+}
+}  // namespace
+
 std::unique_ptr<Model> Import(const ModelFlags& model_flags,
                               const string& input_file_contents) {
+  ::tflite::AlwaysTrueResolver r;
+  if (!::tflite::Verify(input_file_contents.data(), input_file_contents.size(),
+                        r, ::tflite::DefaultErrorReporter())) {
+    LOG(FATAL) << "Invalid flatbuffer.";
+  }
   const ::tflite::Model* input_model =
       ::tflite::GetModel(input_file_contents.data());
 
diff --git a/tensorflow/contrib/lite/toco/tflite/import_test.cc b/tensorflow/contrib/lite/toco/tflite/import_test.cc
index f25b17087639d1b04de4aaf371726fa594427ff1..edd22f783f03b1fbd34039cd7b00f08d34ca9fc6 100644
--- a/tensorflow/contrib/lite/toco/tflite/import_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/import_test.cc
@@ -36,12 +36,13 @@ class ImportTest : public ::testing::Test {
     return builder_.CreateVector(reinterpret_cast<const uint8_t*>(data.data()),
                                  sizeof(T) * data.size());
   }
+
   Offset<Vector<Offset<::tflite::Buffer>>> BuildBuffers() {
     auto buf0 = ::tflite::CreateBuffer(builder_, CreateDataVector<float>({}));
-    auto buf1 =
-        ::tflite::CreateBuffer(builder_, CreateDataVector<float>({1.0f, 2.0f}));
+    auto buf1 = ::tflite::CreateBuffer(
+        builder_, CreateDataVector<float>({1.0f, 2.0f, 3.0f, 4.0f}));
     auto buf2 =
-        ::tflite::CreateBuffer(builder_, CreateDataVector<float>({3.0f}));
+        ::tflite::CreateBuffer(builder_, CreateDataVector<float>({3.0f, 4.0f}));
     return builder_.CreateVector(
         std::vector<Offset<::tflite::Buffer>>({buf0, buf1, buf2}));
   }
@@ -53,10 +54,10 @@ class ImportTest : public ::testing::Test {
         /*max=*/builder_.CreateVector<float>({0.2f}),
         /*scale=*/builder_.CreateVector<float>({0.3f}),
         /*zero_point=*/builder_.CreateVector<int64_t>({100ll}));
-    auto t1 = ::tflite::CreateTensor(builder_,
-                                     builder_.CreateVector<int>({1, 2, 3, 4}),
-                                     ::tflite::TensorType_FLOAT32, 1,
-                                     builder_.CreateString("tensor_one"), q);
+    auto t1 =
+        ::tflite::CreateTensor(builder_, builder_.CreateVector<int>({1, 2, 2}),
+                               ::tflite::TensorType_FLOAT32, 1,
+                               builder_.CreateString("tensor_one"), q);
     auto t2 =
         ::tflite::CreateTensor(builder_, builder_.CreateVector<int>({2, 1}),
                                ::tflite::TensorType_FLOAT32, 2,
@@ -65,14 +66,54 @@ class ImportTest : public ::testing::Test {
         std::vector<Offset<::tflite::Tensor>>({t1, t2}));
   }
 
+  Offset<Vector<Offset<::tflite::OperatorCode>>> BuildOpCodes(
+      std::initializer_list<::tflite::BuiltinOperator> op_codes) {
+    std::vector<Offset<::tflite::OperatorCode>> op_codes_vector;
+    for (auto op : op_codes) {
+      op_codes_vector.push_back(::tflite::CreateOperatorCode(builder_, op, 0));
+    }
+    return builder_.CreateVector(op_codes_vector);
+  }
+
   Offset<Vector<Offset<::tflite::OperatorCode>>> BuildOpCodes() {
-    auto c1 =
-        ::tflite::CreateOperatorCode(builder_, ::tflite::BuiltinOperator_CUSTOM,
-                                     builder_.CreateString("custom_op_one"));
-    auto c2 = ::tflite::CreateOperatorCode(
-        builder_, ::tflite::BuiltinOperator_CONV_2D, 0);
-    return builder_.CreateVector(
-        std::vector<Offset<::tflite::OperatorCode>>({c1, c2}));
+    return BuildOpCodes({::tflite::BuiltinOperator_MAX_POOL_2D,
+                         ::tflite::BuiltinOperator_CONV_2D});
+  }
+
+  Offset<Vector<Offset<::tflite::Operator>>> BuildOperators(
+      std::initializer_list<int> inputs, std::initializer_list<int> outputs) {
+    auto is = builder_.CreateVector<int>(inputs);
+    if (inputs.size() == 0) is = 0;
+    auto os = builder_.CreateVector<int>(outputs);
+    if (outputs.size() == 0) os = 0;
+    auto op = ::tflite::CreateOperator(
+        builder_, 0, is, os, ::tflite::BuiltinOptions_Conv2DOptions,
+        ::tflite::CreateConv2DOptions(builder_, ::tflite::Padding_VALID, 1, 1,
+                                      ::tflite::ActivationFunctionType_NONE)
+            .Union(),
+        /*custom_options=*/0, ::tflite::CustomOptionsFormat_FLEXBUFFERS);
+
+    return builder_.CreateVector(std::vector<Offset<::tflite::Operator>>({op}));
+  }
+
+  Offset<Vector<Offset<::tflite::Operator>>> BuildOperators() {
+    return BuildOperators({0}, {1});
+  }
+
+  Offset<Vector<Offset<::tflite::SubGraph>>> BuildSubGraphs(
+      Offset<Vector<Offset<::tflite::Tensor>>> tensors,
+      Offset<Vector<Offset<::tflite::Operator>>> operators,
+      int num_sub_graphs = 1) {
+    std::vector<int32_t> inputs = {0};
+    std::vector<int32_t> outputs = {1};
+    std::vector<Offset<::tflite::SubGraph>> v;
+    for (int i = 0; i < num_sub_graphs; ++i) {
+      v.push_back(::tflite::CreateSubGraph(
+          builder_, tensors, builder_.CreateVector(inputs),
+          builder_.CreateVector(outputs), operators,
+          builder_.CreateString("subgraph")));
+    }
+    return builder_.CreateVector(v);
   }
 
   // This is a very simplistic model. We are not interested in testing all the
@@ -83,14 +124,13 @@ class ImportTest : public ::testing::Test {
     auto buffers = BuildBuffers();
     auto tensors = BuildTensors();
     auto opcodes = BuildOpCodes();
-
-    auto subgraph = ::tflite::CreateSubGraph(builder_, tensors, 0, 0, 0);
-    std::vector<flatbuffers::Offset<::tflite::SubGraph>> subgraph_vector(
-        {subgraph});
-    auto subgraphs = builder_.CreateVector(subgraph_vector);
+    auto operators = BuildOperators();
+    auto subgraphs = BuildSubGraphs(tensors, operators);
     auto s = builder_.CreateString("");
-    builder_.Finish(::tflite::CreateModel(builder_, TFLITE_SCHEMA_VERSION,
-                                          opcodes, subgraphs, s, buffers));
+
+    ::tflite::FinishModelBuffer(
+        builder_, ::tflite::CreateModel(builder_, TFLITE_SCHEMA_VERSION,
+                                        opcodes, subgraphs, s, buffers));
 
     input_model_ = ::tflite::GetModel(builder_.GetBufferPointer());
   }
@@ -99,7 +139,6 @@ class ImportTest : public ::testing::Test {
                   builder_.GetSize());
   }
   flatbuffers::FlatBufferBuilder builder_;
-  // const uint8_t* buffer_ = nullptr;
   const ::tflite::Model* input_model_ = nullptr;
 };
 
@@ -116,7 +155,7 @@ TEST_F(ImportTest, LoadOperatorsTable) {
 
   details::OperatorsTable operators;
   details::LoadOperatorsTable(*input_model_, &operators);
-  EXPECT_THAT(operators, ElementsAre("custom_op_one", "CONV_2D"));
+  EXPECT_THAT(operators, ElementsAre("MAX_POOL_2D", "CONV_2D"));
 }
 
 TEST_F(ImportTest, Tensors) {
@@ -128,9 +167,9 @@ TEST_F(ImportTest, Tensors) {
   Array& a1 = model->GetArray("tensor_one");
   EXPECT_EQ(ArrayDataType::kFloat, a1.data_type);
   EXPECT_THAT(a1.GetBuffer<ArrayDataType::kFloat>().data,
-              ElementsAre(1.0f, 2.0f));
+              ElementsAre(1.0f, 2.0f, 3.0f, 4.0f));
   ASSERT_TRUE(a1.has_shape());
-  EXPECT_THAT(a1.shape().dims(), ElementsAre(1, 2, 3, 4));
+  EXPECT_THAT(a1.shape().dims(), ElementsAre(1, 2, 2));
 
   const auto& mm = a1.minmax;
   ASSERT_TRUE(mm.get());
@@ -143,13 +182,74 @@ TEST_F(ImportTest, Tensors) {
   EXPECT_EQ(100, q->zero_point);
 }
 
-TEST_F(ImportTest, NoSubGraphs) {
+TEST_F(ImportTest, NoBuffers) {
+  auto buffers = 0;
+  auto tensors = BuildTensors();
+  auto opcodes = BuildOpCodes();
+  auto operators = BuildOperators();
+  auto subgraphs = BuildSubGraphs(tensors, operators);
+  auto comment = builder_.CreateString("");
+  ::tflite::FinishModelBuffer(
+      builder_, ::tflite::CreateModel(builder_, TFLITE_SCHEMA_VERSION, opcodes,
+                                      subgraphs, comment, buffers));
+  EXPECT_DEATH(Import(ModelFlags(), InputModelAsString()),
+               "Missing 'buffers' section.");
+}
+
+TEST_F(ImportTest, NoInputs) {
   auto buffers = BuildBuffers();
+  auto tensors = BuildTensors();
   auto opcodes = BuildOpCodes();
-  auto subgraphs = 0;  // no subgraphs in this model
+  auto operators = BuildOperators({}, {1});
+  auto subgraphs = BuildSubGraphs(tensors, operators);
   auto comment = builder_.CreateString("");
-  builder_.Finish(::tflite::CreateModel(builder_, TFLITE_SCHEMA_VERSION,
-                                        opcodes, subgraphs, comment, buffers));
+  ::tflite::FinishModelBuffer(
+      builder_, ::tflite::CreateModel(builder_, TFLITE_SCHEMA_VERSION, opcodes,
+                                      subgraphs, comment, buffers));
+  EXPECT_DEATH(Import(ModelFlags(), InputModelAsString()),
+               "Missing 'inputs' for operator.");
+}
+
+TEST_F(ImportTest, NoOutputs) {
+  auto buffers = BuildBuffers();
+  auto tensors = BuildTensors();
+  auto opcodes = BuildOpCodes();
+  auto operators = BuildOperators({0}, {});
+  auto subgraphs = BuildSubGraphs(tensors, operators);
+  auto comment = builder_.CreateString("");
+  ::tflite::FinishModelBuffer(
+      builder_, ::tflite::CreateModel(builder_, TFLITE_SCHEMA_VERSION, opcodes,
+                                      subgraphs, comment, buffers));
+  EXPECT_DEATH(Import(ModelFlags(), InputModelAsString()),
+               "Missing 'outputs' for operator.");
+}
+
+TEST_F(ImportTest, InvalidOpCode) {
+  auto buffers = BuildBuffers();
+  auto tensors = BuildTensors();
+  auto opcodes = BuildOpCodes({static_cast<::tflite::BuiltinOperator>(-1),
+                               ::tflite::BuiltinOperator_CONV_2D});
+  auto operators = BuildOperators();
+  auto subgraphs = BuildSubGraphs(tensors, operators);
+  auto comment = builder_.CreateString("");
+  ::tflite::FinishModelBuffer(
+      builder_, ::tflite::CreateModel(builder_, TFLITE_SCHEMA_VERSION, opcodes,
+                                      subgraphs, comment, buffers));
+  EXPECT_DEATH(Import(ModelFlags(), InputModelAsString()),
+               "Operator id '-1' is out of range.");
+}
+
+TEST_F(ImportTest, MultipleSubGraphs) {
+  auto buffers = BuildBuffers();
+  auto tensors = BuildTensors();
+  auto opcodes = BuildOpCodes();
+  auto operators = BuildOperators();
+  auto subgraphs = BuildSubGraphs(tensors, operators, 2);
+  auto comment = builder_.CreateString("");
+  ::tflite::FinishModelBuffer(
+      builder_, ::tflite::CreateModel(builder_, TFLITE_SCHEMA_VERSION, opcodes,
+                                      subgraphs, comment, buffers));
+
   input_model_ = ::tflite::GetModel(builder_.GetBufferPointer());
 
   EXPECT_DEATH(Import(ModelFlags(), InputModelAsString()),
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.cc b/tensorflow/contrib/lite/toco/tflite/operator.cc
index f2cc4ef71f71902e363ac4cddd3695446af30c7d..f991529569d9ab56103bf7e5f91b2d2b7f2d23fe 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator.cc
@@ -204,17 +204,22 @@ class BatchToSpaceND
                    TocoOperator* op) const override {}
 };
 
-class Cast : public CustomOperator<CastOperator> {
+class Cast : public BuiltinOperator<CastOperator, ::tflite::CastOptions,
+                                    ::tflite::BuiltinOptions_CastOptions> {
  public:
-  using CustomOperator::CustomOperator;
-  void WriteOptions(const TocoOperator& op,
-                    flexbuffers::Builder* fbb) const override {
-    fbb->Int("src_data_type", DataType::Serialize(op.src_data_type));
-    fbb->Int("dst_data_type", DataType::Serialize(op.dst_data_type));
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateCastOptions(*builder,
+                                       DataType::Serialize(op.src_data_type),
+                                       DataType::Serialize(op.dst_data_type));
   }
-  void ReadOptions(const flexbuffers::Map& m, TocoOperator* op) const override {
-    op->src_data_type = DataType::Deserialize(m["src_data_type"].AsInt64());
-    op->dst_data_type = DataType::Deserialize(m["dst_data_type"].AsInt64());
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->src_data_type = DataType::Deserialize(options.in_data_type());
+    op->dst_data_type = DataType::Deserialize(options.out_data_type());
   }
 };
 
@@ -827,9 +832,10 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
       new TopK_V2(::tflite::BuiltinOperator_TOPK_V2, OperatorType::kTopK_V2));
   ops.emplace_back(
       new Lstm(::tflite::BuiltinOperator_LSTM, OperatorType::kLstmCell));
+  ops.emplace_back(
+      new Cast(::tflite::BuiltinOperator_CAST, OperatorType::kCast));
 
   // Custom Operators.
-  ops.emplace_back(new Cast("CAST", OperatorType::kCast));
   ops.emplace_back(
       new DepthToSpace("DEPTH_TO_SPACE", OperatorType::kDepthToSpace));
   ops.emplace_back(new FakeQuant("FAKE_QUANT", OperatorType::kFakeQuant));
@@ -854,6 +860,8 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
       new SimpleOperator<Relu1Operator>("RELU_N1_TO_1", OperatorType::kRelu1));
   ops.emplace_back(
       new SimpleOperator<Relu6Operator>("RELU6", OperatorType::kRelu6));
+  ops.emplace_back(
+      new SimpleOperator<PReluOperator>("PRELU", OperatorType::kPRelu));
   ops.emplace_back(new SimpleOperator<LogisticOperator>(
       "LOGISTIC", OperatorType::kLogistic));
   ops.emplace_back(
@@ -861,6 +869,8 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
   ops.emplace_back(new SimpleOperator<ExpOperator>("EXP", OperatorType::kExp));
   ops.emplace_back(new SimpleOperator<LogSoftmaxOperator>(
       "LOG_SOFTMAX", OperatorType::kLogSoftmax));
+  ops.emplace_back(new SimpleOperator<TensorFlowMaximumOperator>(
+      "MAXIMUM", OperatorType::kTensorFlowMaximum));
 
   return ops;
 }
diff --git a/tensorflow/contrib/lite/toco/tflite/operator_test.cc b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
index 9c19f8d4649acf40fdd85b78874f7b18798533f2..4783843b7fa1273201e0c31816b3e1be8e98f5d5 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
@@ -109,6 +109,8 @@ TEST_F(OperatorTest, SimpleOperators) {
   CheckSimpleOperator<ExpOperator>("EXP", OperatorType::kExp);
   CheckSimpleOperator<LogSoftmaxOperator>("LOG_SOFTMAX",
                                           OperatorType::kLogSoftmax);
+  CheckSimpleOperator<TensorFlowMaximumOperator>(
+      "MAXIMUM", OperatorType::kTensorFlowMaximum);
 }
 
 TEST_F(OperatorTest, BuiltinAdd) {
@@ -129,7 +131,7 @@ TEST_F(OperatorTest, BuiltinMean) {
   EXPECT_EQ(op.keep_dims, output_toco_op->keep_dims);
 }
 
-TEST_F(OperatorTest, CustomCast) {
+TEST_F(OperatorTest, BuiltinCast) {
   CastOperator op;
   op.src_data_type = ArrayDataType::kFloat;
   op.dst_data_type = ArrayDataType::kUint8;
diff --git a/tensorflow/contrib/lite/toco/toco.cc b/tensorflow/contrib/lite/toco/toco.cc
index f01ec0ec6102494f36cca0265b79e90355661271..8041aa9e7fbfdaf44134395fee4b2bb01633893a 100644
--- a/tensorflow/contrib/lite/toco/toco.cc
+++ b/tensorflow/contrib/lite/toco/toco.cc
@@ -23,40 +23,70 @@ limitations under the License.
 #include "tensorflow/contrib/lite/toco/toco_cmdline_flags.h"
 #include "tensorflow/contrib/lite/toco/toco_flags.pb.h"
 #include "tensorflow/contrib/lite/toco/toco_port.h"
+#include "tensorflow/contrib/lite/toco/toco_saved_model.h"
 #include "tensorflow/contrib/lite/toco/toco_tooling.h"
 #include "tensorflow/contrib/lite/toco/toco_types.h"
 #include "tensorflow/core/platform/logging.h"
 
-#ifndef CHECK_OK
-#define CHECK_OK(val) CHECK_EQ((val).ok(), true)
-#define QCHECK_OK(val) QCHECK_EQ((val).ok(), true)
-#endif
-
 namespace toco {
 namespace {
 
-#define QCHECK_REQUIRE_TOCO_FLAG(arg) \
-  QCHECK(parsed_toco_flags.arg.specified()) << "Missing required flag: " #arg;
-
-void CheckFilePermissions(const ParsedTocoFlags& parsed_toco_flags,
-                          const ParsedModelFlags& parsed_model_flags,
-                          const TocoFlags& toco_flags) {
-  port::CheckInitGoogleIsDone("InitGoogle is not done yet");
-
-  QCHECK_REQUIRE_TOCO_FLAG(input_file)
-  QCHECK_OK(port::file::Exists(parsed_toco_flags.input_file.value(),
-                               port::file::Defaults()))
-      << "Specified input_file does not exist: "
-      << parsed_toco_flags.input_file.value();
-  QCHECK_OK(port::file::Readable(parsed_toco_flags.input_file.value(),
-                                 port::file::Defaults()))
+// Checks the permissions of the output file to ensure it is writeable.
+void CheckOutputFilePermissions(const Arg<string>& output_file) {
+  QCHECK(output_file.specified()) << "Missing required flag --output_file.\n";
+  QCHECK(port::file::Writable(output_file.value()).ok())
+      << "Specified output_file is not writable: " << output_file.value()
+      << ".\n";
+}
+
+// Checks the permissions of the frozen model file.
+void CheckFrozenModelPermissions(const Arg<string>& input_file) {
+  QCHECK(input_file.specified()) << "Missing required flag --input_file.\n";
+  QCHECK(port::file::Exists(input_file.value(), port::file::Defaults()).ok())
+      << "Specified input_file does not exist: " << input_file.value() << ".\n";
+  QCHECK(port::file::Readable(input_file.value(), port::file::Defaults()).ok())
       << "Specified input_file exists, but is not readable: "
-      << parsed_toco_flags.input_file.value();
+      << input_file.value() << ".\n";
+}
 
-  QCHECK_REQUIRE_TOCO_FLAG(output_file);
-  QCHECK_OK(port::file::Writable(parsed_toco_flags.output_file.value()))
-      << "parsed_toco_flags.input_file.value() output_file is not writable: "
-      << parsed_toco_flags.output_file.value();
+// Checks the permissions of the SavedModel directory.
+void CheckSavedModelPermissions(const Arg<string>& savedmodel_directory) {
+  QCHECK(savedmodel_directory.specified())
+      << "Missing required flag --savedmodel_directory.\n";
+  QCHECK(
+      port::file::Exists(savedmodel_directory.value(), port::file::Defaults())
+          .ok())
+      << "Specified savedmodel_directory does not exist: "
+      << savedmodel_directory.value() << ".\n";
+}
+
+// Reads the contents of the GraphDef from either the frozen graph file or the
+// SavedModel directory. If it reads the SavedModel directory, it updates the
+// ModelFlags and TocoFlags accordingly.
+void ReadInputData(const ParsedTocoFlags& parsed_toco_flags,
+                   const ParsedModelFlags& parsed_model_flags,
+                   TocoFlags* toco_flags, ModelFlags* model_flags,
+                   string* graph_def_contents) {
+  port::CheckInitGoogleIsDone("InitGoogle is not done yet.\n");
+
+  bool has_input_file = parsed_toco_flags.input_file.specified();
+  bool has_savedmodel_dir = parsed_toco_flags.savedmodel_directory.specified();
+
+  // Ensure either input_file or savedmodel_directory flag has been set.
+  QCHECK_NE(has_input_file, has_savedmodel_dir)
+      << "Specify either input_file or savedmodel_directory flag.\n";
+
+  // Checks the input file permissions and reads the contents.
+  if (has_input_file) {
+    CheckFrozenModelPermissions(parsed_toco_flags.input_file);
+    CHECK(port::file::GetContents(parsed_toco_flags.input_file.value(),
+                                  graph_def_contents, port::file::Defaults())
+              .ok());
+  } else {
+    CheckSavedModelPermissions(parsed_toco_flags.savedmodel_directory);
+    GetSavedModelContents(parsed_toco_flags, parsed_model_flags, toco_flags,
+                          model_flags, graph_def_contents);
+  }
 }
 
 void ToolMain(const ParsedTocoFlags& parsed_toco_flags,
@@ -67,21 +97,20 @@ void ToolMain(const ParsedTocoFlags& parsed_toco_flags,
   TocoFlags toco_flags;
   ReadTocoFlagsFromCommandLineFlags(parsed_toco_flags, &toco_flags);
 
-  CheckFilePermissions(parsed_toco_flags, parsed_model_flags, toco_flags);
+  string graph_def_contents;
+  ReadInputData(parsed_toco_flags, parsed_model_flags, &toco_flags,
+                &model_flags, &graph_def_contents);
+  CheckOutputFilePermissions(parsed_toco_flags.output_file);
 
-  string input_file_contents;
-  CHECK_OK(port::file::GetContents(parsed_toco_flags.input_file.value(),
-                                   &input_file_contents,
-                                   port::file::Defaults()));
   std::unique_ptr<Model> model =
-      Import(toco_flags, model_flags, input_file_contents);
+      Import(toco_flags, model_flags, graph_def_contents);
   Transform(toco_flags, model.get());
   string output_file_contents;
   Export(toco_flags, *model, toco_flags.allow_custom_ops(),
          &output_file_contents);
-  CHECK_OK(port::file::SetContents(parsed_toco_flags.output_file.value(),
-                                   output_file_contents,
-                                   port::file::Defaults()));
+  CHECK(port::file::SetContents(parsed_toco_flags.output_file.value(),
+                                output_file_contents, port::file::Defaults())
+            .ok());
 }
 
 }  // namespace
diff --git a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc b/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
index 0f67c2de728532b5b8101b3514811a78a3b3bc38..cc7803dd866f0282f67d1d6f227cce0fdd8c7fd6 100644
--- a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
+++ b/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "absl/strings/str_join.h"
 #include "absl/strings/str_split.h"
 #include "absl/strings/strip.h"
+#include "absl/types/optional.h"
 #include "tensorflow/contrib/lite/toco/toco_cmdline_flags.h"
 #include "tensorflow/contrib/lite/toco/toco_port.h"
 #include "tensorflow/core/platform/logging.h"
@@ -38,6 +39,9 @@ bool ParseTocoFlagsFromCommandLineFlags(
            "Input file (model of any supported format). For Protobuf "
            "formats, both text and binary are supported regardless of file "
            "extension."),
+      Flag("savedmodel_directory", parsed_flags.savedmodel_directory.bind(),
+           parsed_flags.savedmodel_directory.default_value(),
+           "Full path to the directory containing the SavedModel."),
       Flag("output_file", parsed_flags.output_file.bind(),
            parsed_flags.output_file.default_value(),
            "Output file. "
@@ -49,6 +53,11 @@ bool ParseTocoFlagsFromCommandLineFlags(
            parsed_flags.output_format.default_value(),
            "Output file format. "
            "One of TENSORFLOW_GRAPHDEF, TFLITE, GRAPHVIZ_DOT."),
+      Flag("savedmodel_tagset", parsed_flags.savedmodel_tagset.bind(),
+           parsed_flags.savedmodel_tagset.default_value(),
+           "Comma-separated set of tags identifying the MetaGraphDef within "
+           "the SavedModel to analyze. All tags in the tag set must be "
+           "specified."),
       Flag("default_ranges_min", parsed_flags.default_ranges_min.bind(),
            parsed_flags.default_ranges_min.default_value(),
            "If defined, will be used as the default value for the min bound "
@@ -128,47 +137,72 @@ bool ParseTocoFlagsFromCommandLineFlags(
   }
 }
 
+namespace {
+
+// Defines the requirements for a given flag. kUseDefault means the default
+// should be used in cases where the value isn't specified by the user.
+enum class FlagRequirement {
+  kNone,
+  kMustBeSpecified,
+  kMustNotBeSpecified,
+  kUseDefault,
+};
+
+// Enforces the FlagRequirements are met for a given flag.
+template <typename T>
+void EnforceFlagRequirement(const T& flag, const string& flag_name,
+                            FlagRequirement requirement) {
+  if (requirement == FlagRequirement::kMustBeSpecified) {
+    QCHECK(flag.specified()) << "Missing required flag " << flag_name;
+  }
+  if (requirement == FlagRequirement::kMustNotBeSpecified) {
+    QCHECK(!flag.specified())
+        << "Given other flags, this flag should not have been specified: "
+        << flag_name;
+  }
+}
+
+// Gets the value from the flag if specified. Returns default if the
+// FlagRequirement is kUseDefault.
+template <typename T>
+absl::optional<T> GetFlagValue(const Arg<T>& flag,
+                               FlagRequirement requirement) {
+  if (flag.specified()) return flag.value();
+  if (requirement == FlagRequirement::kUseDefault) return flag.default_value();
+  return absl::optional<T>();
+}
+
+}  // namespace
+
 void ReadTocoFlagsFromCommandLineFlags(const ParsedTocoFlags& parsed_toco_flags,
                                        TocoFlags* toco_flags) {
   namespace port = toco::port;
   port::CheckInitGoogleIsDone("InitGoogle is not done yet");
 
-  enum class FlagRequirement { kNone, kMustBeSpecified, kMustNotBeSpecified };
-
-#define ENFORCE_FLAG_REQUIREMENT(name, requirement)                          \
-  do {                                                                       \
-    if (requirement == FlagRequirement::kMustBeSpecified) {                  \
-      QCHECK(parsed_toco_flags.name.specified())                             \
-          << "Missing required flag: " << #name;                             \
-    }                                                                        \
-    if (requirement == FlagRequirement::kMustNotBeSpecified) {               \
-      QCHECK(!parsed_toco_flags.name.specified())                            \
-          << "Given other flags, this flag should not have been specified: " \
-          << #name;                                                          \
-    }                                                                        \
-  } while (false)
-#define READ_TOCO_FLAG(name, requirement)                     \
-  ENFORCE_FLAG_REQUIREMENT(name, requirement);                \
-  do {                                                        \
-    if (parsed_toco_flags.name.specified()) {                 \
-      toco_flags->set_##name(parsed_toco_flags.name.value()); \
-    }                                                         \
+#define READ_TOCO_FLAG(name, requirement)                                \
+  do {                                                                   \
+    EnforceFlagRequirement(parsed_toco_flags.name, #name, requirement);  \
+    auto flag_value = GetFlagValue(parsed_toco_flags.name, requirement); \
+    if (flag_value.has_value()) {                                        \
+      toco_flags->set_##name(flag_value.value());                        \
+    }                                                                    \
   } while (false)
 
-#define PARSE_TOCO_FLAG(Type, name, requirement)               \
-  ENFORCE_FLAG_REQUIREMENT(name, requirement);                 \
-  do {                                                         \
-    if (parsed_toco_flags.name.specified()) {                  \
-      Type x;                                                  \
-      QCHECK(Type##_Parse(parsed_toco_flags.name.value(), &x)) \
-          << "Unrecognized " << #Type << " value "             \
-          << parsed_toco_flags.name.value();                   \
-      toco_flags->set_##name(x);                               \
-    }                                                          \
+#define PARSE_TOCO_FLAG(Type, name, requirement)                         \
+  do {                                                                   \
+    EnforceFlagRequirement(parsed_toco_flags.name, #name, requirement);  \
+    auto flag_value = GetFlagValue(parsed_toco_flags.name, requirement); \
+    if (flag_value.has_value()) {                                        \
+      Type x;                                                            \
+      QCHECK(Type##_Parse(flag_value.value(), &x))                       \
+          << "Unrecognized " << #Type << " value "                       \
+          << parsed_toco_flags.name.value();                             \
+      toco_flags->set_##name(x);                                         \
+    }                                                                    \
   } while (false)
 
-  PARSE_TOCO_FLAG(FileFormat, input_format, FlagRequirement::kMustBeSpecified);
-  PARSE_TOCO_FLAG(FileFormat, output_format, FlagRequirement::kMustBeSpecified);
+  PARSE_TOCO_FLAG(FileFormat, input_format, FlagRequirement::kUseDefault);
+  PARSE_TOCO_FLAG(FileFormat, output_format, FlagRequirement::kUseDefault);
   PARSE_TOCO_FLAG(IODataType, inference_type, FlagRequirement::kNone);
   PARSE_TOCO_FLAG(IODataType, inference_input_type, FlagRequirement::kNone);
   READ_TOCO_FLAG(default_ranges_min, FlagRequirement::kNone);
diff --git a/tensorflow/contrib/lite/toco/toco_saved_model.cc b/tensorflow/contrib/lite/toco/toco_saved_model.cc
new file mode 100644
index 0000000000000000000000000000000000000000..26f55a66c729894a990258080e397bb42ea98a13
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/toco_saved_model.cc
@@ -0,0 +1,189 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <string>
+#include <vector>
+
+#include "absl/strings/numbers.h"
+#include "tensorflow/contrib/lite/toco/model_cmdline_flags.h"
+#include "tensorflow/contrib/lite/toco/toco_saved_model.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
+
+namespace toco {
+namespace {
+
+// Loads a SavedModel from the directory specified in parsed_toco_flags.
+// Returns a SavedModelBundle with the requested MetaGraphDef.
+const tensorflow::SavedModelBundle* LoadSavedModel(
+    const ParsedTocoFlags& parsed_toco_flags) {
+  const string model_path = parsed_toco_flags.savedmodel_directory.value();
+  QCHECK(tensorflow::MaybeSavedModelDirectory(model_path))
+      << "Model is not saved in the supported SavedModel format.\n";
+
+  // Gets the tags identifying the MetaGraphDef from the command line arguments.
+  string tags_str;
+  if (parsed_toco_flags.savedmodel_tagset.specified()) {
+    tags_str = parsed_toco_flags.savedmodel_tagset.value();
+  } else {
+    tags_str = parsed_toco_flags.savedmodel_tagset.default_value();
+  }
+  auto tags = absl::StrSplit(tags_str, ',');
+
+  // Loads MetaGraphDef.
+  auto* bundle = new tensorflow::SavedModelBundle;
+  TF_CHECK_OK(tensorflow::LoadSavedModel(tensorflow::SessionOptions(),
+                                         tensorflow::RunOptions(), model_path,
+                                         tags, bundle))
+      << "Failed to load exported model from " << model_path
+      << ". Ensure the model contains the required tags '" << tags_str
+      << "'.\n";
+  return bundle;
+}
+
+// Returns the array name without the postfix.
+//
+// e.g. reduces "input:0" to "input".
+string GetArrayName(const string& name) {
+  const std::vector<string>& names = absl::StrSplit(name, ':');
+  return names[0];
+}
+
+// Returns the list of array names without the postfix sorted alphabetically.
+std::set<string> GetSortedNames(const std::unordered_set<string>& names) {
+  std::vector<string> final_names;
+  final_names.reserve(names.size());
+  for (const auto& name : names) {
+    final_names.push_back(GetArrayName(name));
+  }
+  return std::set<string>(final_names.begin(), final_names.end());
+}
+
+// Gets the final shape after replacing the first dimension with batch size, if
+// it is undefined (containing the value -1). Returns whether the shape is
+// valid.
+bool ReplaceShapeBatchSize(const tensorflow::TensorShapeProto& shape,
+                           int batch_size,
+                           tensorflow::TensorShapeProto* final_shape) {
+  for (int idx = 0; idx < shape.dim().size(); ++idx) {
+    int64 final_dim = shape.dim()[idx].size();
+    if (final_dim == -1) {
+      if (idx > 0) return false;
+      final_dim = batch_size;
+    }
+    final_shape->add_dim()->set_size(final_dim);
+  }
+  return true;
+}
+
+// Updates the input arrays in ModelFlags to contain the shape of the array.
+void ProcessInputShapes(const tensorflow::GraphDef& graph_def, int batch_size,
+                        ModelFlags* model_flags) {
+  // Build map of input array names to input arrays.
+  std::unordered_map<string, InputArray*> input_data_map;
+  for (auto& input : *model_flags->mutable_input_arrays()) {
+    input_data_map[input.name()] = &input;
+  }
+
+  // Adds shapes to the input arrays if the shape is valid.
+  for (const tensorflow::NodeDef& node_def : graph_def.node()) {
+    if (input_data_map.find(node_def.name()) != input_data_map.end()) {
+      const auto shape_it = node_def.attr().find("shape");
+      if (shape_it != node_def.attr().end()) {
+        tensorflow::TensorShapeProto final_shape;
+        bool is_valid = ReplaceShapeBatchSize(shape_it->second.shape(),
+                                              batch_size, &final_shape);
+
+        if (is_valid) {
+          auto* shape = input_data_map.at(node_def.name())->mutable_shape();
+          QCHECK_EQ(shape->dims_size(), 0)
+              << "The shape for the input '" << node_def.name()
+              << "' was previously defined. For clarity please define inputs "
+              << "via --input_arrays and input_shapes flags.\n";
+          for (const auto& dim : final_shape.dim()) {
+            shape->add_dims(dim.size());
+          }
+        }
+      }
+    }
+  }
+
+  // Checks all input arrays have a shape.
+  for (auto const& input : model_flags->input_arrays()) {
+    QCHECK(input.shape().dims_size() > 0)
+        << "A valid input shape was not found for input '" << input.name()
+        << "'. Please define via --input_arrays and --input_shapes flags.\n";
+  }
+}
+
+}  // namespace
+
+void ParseMetaData(const tensorflow::GraphDef& graph_def,
+                   const std::unordered_set<string>& inputs,
+                   const std::unordered_set<string>& outputs,
+                   const ParsedTocoFlags& parsed_toco_flags,
+                   const ParsedModelFlags& parsed_model_flags,
+                   TocoFlags* toco_flags, ModelFlags* model_flags) {
+  if (!parsed_model_flags.input_arrays.specified()) {
+    const std::set<string> sorted_inputs = GetSortedNames(inputs);
+    for (const auto& input_name : sorted_inputs) {
+      model_flags->add_input_arrays()->set_name(input_name);
+    }
+  }
+
+  if (!parsed_model_flags.output_arrays.specified()) {
+    const std::set<string> sorted_outputs = GetSortedNames(outputs);
+    for (const auto& output_name : sorted_outputs) {
+      model_flags->add_output_arrays(GetArrayName(output_name));
+    }
+  }
+
+  if (!parsed_model_flags.input_shapes.specified()) {
+    int batch_size = parsed_model_flags.batch_size.value();
+    ProcessInputShapes(graph_def, batch_size, model_flags);
+  }
+
+  if (!parsed_toco_flags.inference_type.specified()) {
+    toco_flags->set_inference_type(IODataType::FLOAT);
+  }
+}
+
+// TODO(nupurgarg): Add top level tests.
+void GetSavedModelContents(const ParsedTocoFlags& parsed_toco_flags,
+                           const ParsedModelFlags& parsed_model_flags,
+                           TocoFlags* toco_flags, ModelFlags* model_flags,
+                           string* graph_def_contents) {
+  // Loads the MetaGraphDef within a SavedModelBundle.
+  auto bundle = LoadSavedModel(parsed_toco_flags);
+
+  // Converts the MetaGraphDef to frozen GraphDef.
+  tensorflow::GraphDef frozen_graph_def;
+  std::unordered_set<string> inputs;
+  std::unordered_set<string> outputs;
+  TF_CHECK_OK(tensorflow::FreezeSavedModel(*bundle, &frozen_graph_def, &inputs,
+                                           &outputs));
+
+  // Reads the frozen GraphDef into a string.
+  QCHECK(frozen_graph_def.SerializeToString(graph_def_contents))
+      << "Unable to generate serialized GraphDef.\n";
+
+  // Process inputs and outputs and metadata within GraphDef.
+  const tensorflow::GraphDef graph_def = bundle->meta_graph_def.graph_def();
+  ParseMetaData(graph_def, inputs, outputs, parsed_toco_flags,
+                parsed_model_flags, toco_flags, model_flags);
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/toco_saved_model.h b/tensorflow/contrib/lite/toco/toco_saved_model.h
new file mode 100644
index 0000000000000000000000000000000000000000..7a0fabd82d90131a3b2d28c757c08dcb0f9e3988
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/toco_saved_model.h
@@ -0,0 +1,53 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_SAVED_MODEL_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_SAVED_MODEL_H_
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/cc/tools/freeze_saved_model.h"
+#include "tensorflow/contrib/lite/toco/args.h"
+#include "tensorflow/contrib/lite/toco/model_flags.pb.h"
+#include "tensorflow/contrib/lite/toco/toco_flags.pb.h"
+#include "tensorflow/contrib/lite/toco/types.pb.h"
+
+namespace toco {
+
+// Parses metadata into `toco_flags` and `model_flags`.
+//
+// Stores `inputs` as input_arrays and `outputs` as output_arrays in
+// `model_flags`. Infers input_shapes from the GraphDef and stores it in
+// `model_flags` as part of the input_arrays. Assumes inference_type is FLOAT
+// and stores it in `toco_flags`.
+void ParseMetaData(const tensorflow::GraphDef& graph_def,
+                   const std::unordered_set<string>& inputs,
+                   const std::unordered_set<string>& outputs,
+                   const ParsedTocoFlags& parsed_toco_flags,
+                   const ParsedModelFlags& parsed_model_flags,
+                   TocoFlags* toco_flags, ModelFlags* model_flags);
+
+// Generates a frozen graph from the SavedModel in the directory specified in
+// `toco_flags`. Reads frozen graph contents into `graph_def_contents`. Parses
+// metadata relating to the GraphDef into `toco_flags` and `model_flags`.
+void GetSavedModelContents(const ParsedTocoFlags& parsed_toco_flags,
+                           const ParsedModelFlags& parsed_model_flags,
+                           TocoFlags* toco_flags, ModelFlags* model_flags,
+                           string* graph_def_contents);
+
+}  // namespace toco
+
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_SAVED_MODEL_H_
diff --git a/tensorflow/contrib/lite/toco/toco_saved_model_test.cc b/tensorflow/contrib/lite/toco/toco_saved_model_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5e122afe65dc29abc85f142f4019aae5058ace51
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/toco_saved_model_test.cc
@@ -0,0 +1,274 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/toco/toco_saved_model.h"
+#include "absl/strings/str_join.h"
+#include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/contrib/lite/toco/model_cmdline_flags.h"
+#include "tensorflow/contrib/lite/toco/toco_cmdline_flags.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+namespace toco {
+namespace {
+
+using tensorflow::ops::Add;
+using tensorflow::ops::Const;
+using tensorflow::ops::FakeQuantWithMinMaxArgs;
+using tensorflow::ops::Placeholder;
+
+class TocoSavedModelTest : public ::testing::Test {
+ protected:
+  // Calls functions to process cmdline arguments and calls ParseMetaData.
+  // ParseMetaData parses input_arrays, output_arrays, and gets metadata from
+  // SavedModel it is not defined in the cmdline arguments.
+  void ProcessGraphDefMetadata(const std::unordered_set<string>& inputs,
+                               const std::unordered_set<string>& outputs,
+                               const tensorflow::GraphDef& graph_def) {
+    ReadTocoFlagsFromCommandLineFlags(parsed_toco_flags_, &toco_flags_);
+    ReadModelFlagsFromCommandLineFlags(parsed_model_flags_, &model_flags_);
+    ParseMetaData(graph_def, inputs, outputs, parsed_toco_flags_,
+                  parsed_model_flags_, &toco_flags_, &model_flags_);
+  }
+
+  // Gets the GraphDef from the SavedModelBundle and processes metadata.
+  void ProcessSavedModelMetadata(const std::unordered_set<string>& inputs,
+                                 const std::unordered_set<string>& outputs) {
+    const tensorflow::GraphDef graph_def = bundle_.meta_graph_def.graph_def();
+    ProcessGraphDefMetadata(inputs, outputs, graph_def);
+  }
+
+  // Returns a GraphDef representing a simple float model with a single input.
+  tensorflow::GraphDef GetFloatGraphDef(const std::vector<int64>& shape) {
+    tensorflow::GraphDef graph_def;
+    tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+
+    tensorflow::Output input =
+        Placeholder(scope.WithOpName("input"), tensorflow::DT_FLOAT,
+                    Placeholder::Shape(tensorflow::PartialTensorShape(shape)));
+    tensorflow::Output zero = Const(scope.WithOpName("zero"), 0.0f, {});
+    tensorflow::Output add = Add(scope.WithOpName("add"), input, zero);
+
+    TF_EXPECT_OK(scope.ToGraphDef(&graph_def));
+    return graph_def;
+  }
+
+  // Returns a GraphDef representing a simple float model with two inputs.
+  tensorflow::GraphDef GetComplexFloatGraphDef() {
+    tensorflow::GraphDef graph_def;
+    tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+
+    tensorflow::Output inputA =
+        Placeholder(scope.WithOpName("inputA"), tensorflow::DT_FLOAT,
+                    Placeholder::Shape(tensorflow::TensorShape({1, 3, 3, 1})));
+    tensorflow::Output inputB =
+        Placeholder(scope.WithOpName("inputB"), tensorflow::DT_FLOAT,
+                    Placeholder::Shape(tensorflow::TensorShape({1, 3, 3, 1})));
+    tensorflow::Output add = Add(scope.WithOpName("add"), inputB, inputA);
+
+    TF_EXPECT_OK(scope.ToGraphDef(&graph_def));
+    return graph_def;
+  }
+
+  // Returns a GraphDef representing a simple quantized model.
+  tensorflow::GraphDef GetQuantizedGraphDef() {
+    tensorflow::GraphDef graph_def;
+    tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+
+    tensorflow::Output input =
+        Placeholder(scope.WithOpName("input"), tensorflow::DT_FLOAT,
+                    Placeholder::Shape(tensorflow::TensorShape({1, 3, 3, 1})));
+    tensorflow::Output zero = Const(scope.WithOpName("zero"), 0.0f, {});
+    tensorflow::Output fake_quant =
+        FakeQuantWithMinMaxArgs(scope.WithOpName("quant"), zero);
+    tensorflow::Output add = Add(scope.WithOpName("add"), input, fake_quant);
+
+    TF_EXPECT_OK(scope.ToGraphDef(&graph_def));
+    return graph_def;
+  }
+
+  // Gets the values in the input_arrays flag.
+  std::vector<string> GetInputArrays() {
+    std::vector<string> actual;
+    for (const auto& input : model_flags_.input_arrays()) {
+      actual.push_back(input.name());
+    }
+    return actual;
+  }
+
+  // Gets the values in the output_arrays flag.
+  std::vector<string> GetOutputArrays() {
+    std::vector<string> actual(model_flags_.output_arrays().begin(),
+                               model_flags_.output_arrays().end());
+    return actual;
+  }
+
+  // Gets the shape of the given input array.
+  string GetInputShape(const string& input_array) {
+    for (const auto& input : model_flags_.input_arrays()) {
+      if (input.name() == input_array) {
+        std::vector<string> dims;
+        for (int idx = 0; idx < input.shape().dims_size(); ++idx) {
+          dims.push_back(std::to_string(input.shape().dims(idx)));
+        }
+        return absl::StrJoin(dims, ",");
+      }
+    }
+    return "";
+  }
+
+  tensorflow::SavedModelBundle bundle_;
+  ParsedTocoFlags parsed_toco_flags_;
+  ParsedModelFlags parsed_model_flags_;
+  TocoFlags toco_flags_;
+  ModelFlags model_flags_;
+};
+
+// Tests if input_arrays, output_arrays, inference_type, and output_arrays are
+// added to ModelFlags if they are not specified in cmdline arguments.
+// Tests if the default batch size replaces a -1 in the first dimension.
+TEST_F(TocoSavedModelTest, NoCmdLine) {
+  tensorflow::GraphDef graph_def = GetFloatGraphDef({-1, 3, 3, 1});
+
+  ProcessGraphDefMetadata({"input"}, {"add"}, graph_def);
+  EXPECT_EQ(GetInputArrays(), std::vector<string>({"input"}));
+  EXPECT_EQ(GetOutputArrays(), std::vector<string>({"add"}));
+  EXPECT_EQ(GetInputShape("input"), "1,3,3,1");
+  EXPECT_EQ(toco_flags_.inference_type(), IODataType::FLOAT);
+}
+
+// Tests if the order of input_arrays and output_arrays is deterministic when
+// they are taken from the SavedModel.
+TEST_F(TocoSavedModelTest, NoCmdLineMultipleArrays) {
+  tensorflow::GraphDef graph_def = GetComplexFloatGraphDef();
+
+  // Note: The model does not have two outputs. However, the function does not
+  // need an accurate output_array list. This is only meant to test order.
+  ProcessGraphDefMetadata({"inputB", "inputA"}, {"add", "invalid"}, graph_def);
+  EXPECT_EQ(GetInputArrays(), std::vector<string>({"inputA", "inputB"}));
+  EXPECT_EQ(GetOutputArrays(), std::vector<string>({"add", "invalid"}));
+  EXPECT_EQ(GetInputShape("inputA"), "1,3,3,1");
+  EXPECT_EQ(GetInputShape("inputB"), "1,3,3,1");
+  EXPECT_EQ(toco_flags_.inference_type(), IODataType::FLOAT);
+}
+
+// Tests if input_shapes is inferred when input_arrays is passed in via cmdline
+// arguments.
+TEST_F(TocoSavedModelTest, InputNameWithoutInputShape) {
+  parsed_model_flags_.input_arrays.bind()("input");
+  tensorflow::GraphDef graph_def = GetFloatGraphDef({2, 3, 3, 1});
+
+  ProcessGraphDefMetadata({"not_used_input"}, {"add"}, graph_def);
+  EXPECT_EQ(GetInputArrays(), std::vector<string>({"input"}));
+  EXPECT_EQ(GetOutputArrays(), std::vector<string>({"add"}));
+  EXPECT_EQ(GetInputShape("input"), "2,3,3,1");
+  EXPECT_EQ(toco_flags_.inference_type(), IODataType::FLOAT);
+}
+
+// Ensures a failure occurs when input_shapes is defined without input_arrays.
+TEST_F(TocoSavedModelTest, InputShapeWithoutInputName) {
+  parsed_model_flags_.input_shapes.bind()("1,224,224,1:9,12");
+  tensorflow::GraphDef graph_def = GetFloatGraphDef({1, 3, 3, 1});
+
+  EXPECT_DEATH(ProcessGraphDefMetadata({"input"}, {"add"}, graph_def),
+               "failed: input_shapes.size\\(\\) == "
+               "model_flags->input_arrays_size\\(\\)");
+}
+
+// Tests if the cmdline values of input_arrays, input_shapes are used when
+// specified with an empty GraphDef.
+TEST_F(TocoSavedModelTest, InputArraysCmdLine) {
+  parsed_model_flags_.input_arrays.bind()("inputA,inputB");
+  parsed_model_flags_.input_shapes.bind()("1,224,224,1:9,12");
+
+  ProcessSavedModelMetadata({"input0", "input1"}, {"output0", "output1"});
+  EXPECT_EQ(GetInputArrays(), std::vector<string>({"inputA", "inputB"}));
+  EXPECT_EQ(GetOutputArrays(), std::vector<string>({"output0", "output1"}));
+  EXPECT_EQ(GetInputShape("inputA"), "1,224,224,1");
+  EXPECT_EQ(GetInputShape("inputB"), "9,12");
+  EXPECT_EQ(toco_flags_.inference_type(), IODataType::FLOAT);
+}
+
+// Tests if the cmdline values of input_arrays, input_shapes are used when
+// specified even if values exist within the GraphDef.
+TEST_F(TocoSavedModelTest, InputArraysCmdLineWithGraphDef) {
+  parsed_model_flags_.input_arrays.bind()("inputA");
+  parsed_model_flags_.input_shapes.bind()("1,224,224,1");
+  tensorflow::GraphDef graph_def = GetFloatGraphDef({1, 3, 3, 1});
+
+  ProcessGraphDefMetadata({"inputA"}, {"add"}, graph_def);
+  EXPECT_EQ(GetInputArrays(), std::vector<string>({"inputA"}));
+  EXPECT_EQ(GetOutputArrays(), std::vector<string>({"add"}));
+  EXPECT_EQ(GetInputShape("inputA"), "1,224,224,1");
+  EXPECT_EQ(toco_flags_.inference_type(), IODataType::FLOAT);
+}
+
+// Tests if the cmdline values of input_arrays, input_shapes, inference_type,
+// and output_arrays are used when specified with an empty GraphDef.
+TEST_F(TocoSavedModelTest, AllParamsCmdLine) {
+  parsed_model_flags_.input_arrays.bind()("inputA,inputB");
+  parsed_model_flags_.output_arrays.bind()("outputA,outputB");
+  parsed_model_flags_.input_shapes.bind()("1,224,224,1:9,12");
+  parsed_toco_flags_.inference_type.bind()("FLOAT");
+
+  ProcessSavedModelMetadata({"input0", "input1"}, {"output0", "output1"});
+  EXPECT_EQ(GetInputArrays(), std::vector<string>({"inputA", "inputB"}));
+  EXPECT_EQ(GetOutputArrays(), std::vector<string>({"outputA", "outputB"}));
+  EXPECT_EQ(GetInputShape("inputA"), "1,224,224,1");
+  EXPECT_EQ(GetInputShape("inputB"), "9,12");
+  EXPECT_EQ(toco_flags_.inference_type(), IODataType::FLOAT);
+}
+
+// Tests if a quantized graph gives the correct values assuming type is passed
+// in via command line.
+TEST_F(TocoSavedModelTest, QuantizedNoCmdLine) {
+  parsed_toco_flags_.inference_type.bind()("QUANTIZED_UINT8");
+  tensorflow::GraphDef graph_def = GetQuantizedGraphDef();
+
+  ProcessGraphDefMetadata({"input"}, {"add"}, graph_def);
+  EXPECT_EQ(GetInputArrays(), std::vector<string>({"input"}));
+  EXPECT_EQ(GetOutputArrays(), std::vector<string>({"add"}));
+  EXPECT_EQ(GetInputShape("input"), "1,3,3,1");
+  EXPECT_EQ(toco_flags_.inference_type(), IODataType::QUANTIZED_UINT8);
+}
+
+// Tests if the provided batch size replaces a -1 in the first dimension of
+// input shape.
+TEST_F(TocoSavedModelTest, MissingShapeParameterValid) {
+  parsed_model_flags_.batch_size.bind()(3);
+  tensorflow::GraphDef graph_def = GetFloatGraphDef({-1, 3, 3, 1});
+
+  ProcessGraphDefMetadata({"input"}, {"add"}, graph_def);
+  EXPECT_EQ(GetInputArrays(), std::vector<string>({"input"}));
+  EXPECT_EQ(GetOutputArrays(), std::vector<string>({"add"}));
+  EXPECT_EQ(GetInputShape("input"), "3,3,3,1");
+  EXPECT_EQ(toco_flags_.inference_type(), IODataType::FLOAT);
+}
+
+// Ensures a failure occurs if there is a -1 in a dimension aside from the first
+// position of input shape.
+TEST_F(TocoSavedModelTest, MissingShapeParameterInvalid) {
+  parsed_model_flags_.batch_size.bind()(3);
+  tensorflow::GraphDef graph_def = GetFloatGraphDef({1, -1, 3, 1});
+
+  EXPECT_DEATH(ProcessGraphDefMetadata({"input"}, {"add"}, graph_def),
+               "A valid input shape was not found for input 'input'.");
+}
+
+}  // namespace
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/toco_tooling.cc b/tensorflow/contrib/lite/toco/toco_tooling.cc
index 024335b5e4ca4547eba24b0944dc539eddc6bdf7..76e9a27aefc559f5985cbf5739f326b5f3511231 100644
--- a/tensorflow/contrib/lite/toco/toco_tooling.cc
+++ b/tensorflow/contrib/lite/toco/toco_tooling.cc
@@ -74,11 +74,14 @@ void MakeGeneralGraphTransformationsSet(
   transformations->Add(new ResolveTensorFlowMatMul);
   transformations->Add(new FuseBinaryIntoPrecedingAffine);
   transformations->Add(new FuseBinaryIntoFollowingAffine);
-  transformations->Add(new ReorderActivationFunctions);
+  transformations->Add(new MergeReshapeIntoPrecedingTranspose);
+  transformations->Add(new ReorderElementwiseUnary);
+  transformations->Add(new ReorderReshapeTranspose);
   transformations->Add(new ResolveBatchNormalization);
   transformations->Add(new ResolveConstantBinaryOperator);
   transformations->Add(new ResolveConstantFill);
   transformations->Add(new ResolveConstantGather);
+  transformations->Add(new ResolveConstantRandomUniform);
   transformations->Add(new ResolveConstantRange);
   transformations->Add(new ResolveConstantStack);
   transformations->Add(new ResolveConstantStridedSlice);
@@ -94,6 +97,7 @@ void MakeGeneralGraphTransformationsSet(
   transformations->Add(new IdentifyL2Normalization);
   transformations->Add(new IdentifyL2Pool);
   transformations->Add(new IdentifyRelu1);
+  transformations->Add(new IdentifyPRelu);
   transformations->Add(new RemoveTrivialBinaryOperator);
   transformations->Add(new ReadFakeQuantMinMax);
   transformations->Add(new ResolveSpaceToBatchNDAttributes);
@@ -289,6 +293,10 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
     EncodeConstantArraysMinMaxByWrappingThemInFakeQuantNodes(model);
   }
 
+  // Fix any issues with IO edges. This must happen after any transform that
+  // may modify the structure of the edges.
+  FixEdgeArrays(model);
+
   LogDump(kLogLevelModelChanged, "AFTER TRANSFORMATIONS", *model);
 
   if (output_format != GRAPHVIZ_DOT && output_format != TFLITE) {
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index e70291ad0ef4bb4c1969cfc3c87bf0f8e50ecef5..56fa8f4b695534772513976f1a80b77e53f6ff0b 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -297,9 +297,11 @@ const char* OperatorTypeName(OperatorType type) {
     HANDLE_OPERATORTYPENAME_CASE(L2Pool)
     HANDLE_OPERATORTYPENAME_CASE(FakeQuant)
     HANDLE_OPERATORTYPENAME_CASE(Mul)
+    HANDLE_OPERATORTYPENAME_CASE(RandomUniform)
     HANDLE_OPERATORTYPENAME_CASE(Relu)
     HANDLE_OPERATORTYPENAME_CASE(Relu1)
     HANDLE_OPERATORTYPENAME_CASE(Relu6)
+    HANDLE_OPERATORTYPENAME_CASE(PRelu)
     HANDLE_OPERATORTYPENAME_CASE(ReorderAxes)
     HANDLE_OPERATORTYPENAME_CASE(Softmax)
     HANDLE_OPERATORTYPENAME_CASE(LogSoftmax)
@@ -1047,6 +1049,117 @@ void CheckModelCounts(const Model& model) {
   }
 }
 
+void FixEdgeArrays(Model* model) {
+  for (const string& output_array_name : model->flags.output_arrays()) {
+    if (!GetOpWithOutput(*model, output_array_name)) {
+      // Output has no operator producing it. Change that by inserting a copy.
+      LOG(WARNING) << "Fixing constant output array " << output_array_name
+                   << " by inserting a copy. This is not optimal.";
+      string intermediate_array_name =
+          AvailableArrayName(*model, output_array_name + "_copy");
+      CloneArray(model, output_array_name, intermediate_array_name);
+      InsertCopyOperator(model, intermediate_array_name, output_array_name);
+    }
+  }
+}
+
+void InsertCopyOperator(Model* model, const string& source_array_name,
+                        const string& target_array_name) {
+  // Drop constant data from the target array as the copy will be done at
+  // runtime.
+  Array& target_array = model->GetOrCreateArray(target_array_name);
+  target_array.buffer.reset();
+
+  // Reshape to the same size. This should be a no-op.
+  const Array& source_array = model->GetArray(source_array_name);
+  std::vector<int> shape = source_array.shape().dims();
+
+  // Insert copy operator.
+  auto* copy_op = new TensorFlowReshapeOperator;
+  copy_op->inputs = {
+      source_array_name,
+      CreateInt32Array(model, target_array_name + "_copy_shape", shape)};
+  copy_op->outputs = {target_array_name};
+  model->operators.emplace_back(copy_op);
+}
+
+namespace {
+template <ArrayDataType A>
+void CopyArrayBuffer(const Array& source_array, Array* target_array) {
+  if (source_array.buffer) {
+    const auto& source_buffer = source_array.GetBuffer<A>();
+    auto& target_buffer = target_array->GetMutableBuffer<A>();
+    target_buffer.data = source_buffer.data;
+  }
+}
+}  // namespace
+
+void CloneArray(Model* model, const string& source_array_name,
+                const string& target_array_name) {
+  CHECK(!model->HasArray(target_array_name));
+  const Array& source_array = model->GetArray(source_array_name);
+  Array& target_array = model->GetOrCreateArray(target_array_name);
+
+  switch (source_array.data_type) {
+    case ArrayDataType::kBool:
+      CopyArrayBuffer<ArrayDataType::kBool>(source_array, &target_array);
+      break;
+    case ArrayDataType::kFloat:
+      CopyArrayBuffer<ArrayDataType::kFloat>(source_array, &target_array);
+      break;
+    case ArrayDataType::kInt8:
+      CopyArrayBuffer<ArrayDataType::kInt8>(source_array, &target_array);
+      break;
+    case ArrayDataType::kUint8:
+      CopyArrayBuffer<ArrayDataType::kUint8>(source_array, &target_array);
+      break;
+    case ArrayDataType::kInt16:
+      CopyArrayBuffer<ArrayDataType::kInt16>(source_array, &target_array);
+      break;
+    case ArrayDataType::kUint16:
+      CopyArrayBuffer<ArrayDataType::kUint16>(source_array, &target_array);
+      break;
+    case ArrayDataType::kInt32:
+      CopyArrayBuffer<ArrayDataType::kInt32>(source_array, &target_array);
+      break;
+    case ArrayDataType::kUint32:
+      CopyArrayBuffer<ArrayDataType::kUint32>(source_array, &target_array);
+      break;
+    case ArrayDataType::kInt64:
+      CopyArrayBuffer<ArrayDataType::kInt64>(source_array, &target_array);
+      break;
+    case ArrayDataType::kUint64:
+      CopyArrayBuffer<ArrayDataType::kUint64>(source_array, &target_array);
+      break;
+    case ArrayDataType::kString:
+      CopyArrayBuffer<ArrayDataType::kString>(source_array, &target_array);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported data type: "
+                 << ArrayDataTypeName(source_array.data_type);
+      return;
+  }
+
+  if (source_array.minmax) {
+    const auto& smm = source_array.GetMinMax();
+    auto& tmm = target_array.GetOrCreateMinMax();
+    tmm.min = smm.min;
+    tmm.max = smm.max;
+  }
+
+  if (source_array.quantization_params) {
+    const auto& sqp = source_array.GetQuantizationParams();
+    auto& tqp = target_array.GetOrCreateQuantizationParams();
+    tqp.zero_point = sqp.zero_point;
+    tqp.scale = sqp.scale;
+  }
+
+  target_array.data_type = source_array.data_type;
+  target_array.final_data_type = source_array.final_data_type;
+
+  target_array.copy_shape(source_array.shape());
+}
+
 void MakeArrayDims(int num_dims, int batch, int height, int width, int depth,
                    std::vector<int>* out_dims) {
   CHECK(out_dims->empty());
@@ -1808,6 +1921,35 @@ bool IsDiscardableArray(const Model& model, const string& array_name) {
   return true;
 }
 
+bool ReshapeIsEquivalentToTranspose(const Model& model,
+                                    const TensorFlowReshapeOperator* op,
+                                    bool allow_extra_unary_dims) {
+  CHECK(!op->shape.empty());
+  CHECK(model.HasArray(op->inputs[0]));
+  CHECK(model.HasArray(op->outputs[0]));
+
+  const auto& input_array = model.GetArray(op->inputs[0]);
+  const auto& output_array = model.GetArray(op->outputs[0]);
+
+  CHECK(input_array.has_shape());
+  CHECK(output_array.has_shape());
+
+  std::vector<int> in_shape = input_array.shape().dims();
+  std::vector<int> out_shape = output_array.shape().dims();
+
+  // If the reshape changes the number of dimensions so it cannot be interpreted
+  // as a transpose.
+  if (!allow_extra_unary_dims && in_shape.size() != out_shape.size()) {
+    return false;
+  }
+
+  in_shape.erase(std::remove(in_shape.begin(), in_shape.end(), 1),
+                 in_shape.end());
+  out_shape.erase(std::remove(out_shape.begin(), out_shape.end(), 1),
+                  out_shape.end());
+  return in_shape == out_shape;
+}
+
 void CheckFinalDataTypesSatisfied(const Model& model) {
   for (const auto& array_entry : model.GetArrayMap()) {
     const auto& array = *array_entry.second;
@@ -1860,13 +2002,13 @@ void FinishBuildingRNNStates(Model* model) {
 
 void UseArraysExtraInfo(Model* model) {
   for (const auto& entry : model->flags.arrays_extra_info().entries()) {
-    QCHECK(model->HasArray(entry.name()))
-        << "ArraysExtraInfo refers to non-existent array name: "
-        << entry.name();
+    if (!model->HasArray(entry.name())) {
+      continue;
+    }
     auto& array = model->GetArray(entry.name());
-    auto& minmax = array.GetOrCreateMinMax();
     if (entry.has_min() || entry.has_max()) {
       CHECK_EQ(entry.has_min(), entry.has_max());
+      auto& minmax = array.GetOrCreateMinMax();
       minmax.min = entry.min();
       minmax.max = entry.max();
     }
@@ -1874,6 +2016,25 @@ void UseArraysExtraInfo(Model* model) {
       array.final_data_type =
           ConvertIODataTypeToArrayDataType(entry.data_type());
     }
+    if (entry.has_shape()) {
+      array.clear_shape();
+      // Make sure to create the shape even if there are no dims, to
+      // correctly record 0-D shapes.
+      array.mutable_shape();
+      for (int dim : entry.shape().dims()) {
+        array.mutable_shape()->mutable_dims()->push_back(dim);
+      }
+    }
+    if (entry.has_constant_float_value()) {
+      CHECK(array.has_shape());
+      if (array.data_type == ArrayDataType::kFloat) {
+        auto& data = array.GetMutableBuffer<ArrayDataType::kFloat>().data;
+        data.resize(RequiredBufferSizeForShape(array.shape()));
+        for (float& f : data) {
+          f = entry.constant_float_value();
+        }
+      }
+    }
   }
 }
 
diff --git a/tensorflow/contrib/lite/toco/tooling_util.h b/tensorflow/contrib/lite/toco/tooling_util.h
index 05360e3b0a8510f57724d98dab0e2097b1062ced..259ee7fbd06db529e3cb413fd059edf299a3db3c 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.h
+++ b/tensorflow/contrib/lite/toco/tooling_util.h
@@ -144,6 +144,18 @@ void FixOperatorOrdering(Model* model);
 void FixNoMissingArray(Model* model);
 void FixNoOrphanedArray(Model* model);
 
+// Fixes input/output arrays that may have issues during export or inference.
+void FixEdgeArrays(Model* model);
+
+// Inserts a no-op reshape operator between the source array and the target
+// array. This effectively just copies the data.
+void InsertCopyOperator(Model* model, const string& source_array_name,
+                        const string& target_array_name);
+
+// Clones an array with all data and parameters.
+void CloneArray(Model* model, const string& source_array_name,
+                const string& target_array_name);
+
 void ResolveModelFlags(const ModelFlags& model_flags, Model* model);
 
 template <ArrayDataType A>
@@ -157,10 +169,23 @@ void GetQuantizationParamsFromMinMax(const MinMax& minmax,
       ::tflite::ChooseQuantizationParams<Integer>(rmin, rmax);
 }
 
+template <typename T>
+T ConvertOperator(Operator* o, OperatorType type) {
+  if (o != nullptr && o->type == type) {
+    return static_cast<T>(o);
+  }
+
+  return nullptr;
+}
+
 void CheckIsReadyForQuantization(const Model& model);
 void UseDefaultMinMaxRangeValues(Model* model, double default_ranges_min,
                                  double default_ranges_max);
 
+bool ReshapeIsEquivalentToTranspose(const Model& model,
+                                    const TensorFlowReshapeOperator* op,
+                                    bool allow_extra_unary_dims);
+
 inline int Offset(const Shape& shape, const std::vector<int>& indices) {
   DCHECK_EQ(shape.dimensions_count(), indices.size());
   const int dims_count = shape.dimensions_count();
diff --git a/tensorflow/contrib/lite/tools/BUILD b/tensorflow/contrib/lite/tools/BUILD
index b5abbc0712599814e078d19bc015bc7bf1812f95..44fde69a1e1536b8d2ecff16876248cfe66a9b8a 100644
--- a/tensorflow/contrib/lite/tools/BUILD
+++ b/tensorflow/contrib/lite/tools/BUILD
@@ -91,18 +91,6 @@ cc_library(
     deps = ["//tensorflow/contrib/lite:framework"],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 cc_library(
     name = "verifier",
     srcs = ["verifier.cc"],
diff --git a/tensorflow/contrib/lite/tools/verifier.cc b/tensorflow/contrib/lite/tools/verifier.cc
index 59c74205f0a311ec12ff87f46622041605fb493b..8818a7dc85d9ffdc1da450fb389d5ed11139bc31 100644
--- a/tensorflow/contrib/lite/tools/verifier.cc
+++ b/tensorflow/contrib/lite/tools/verifier.cc
@@ -148,11 +148,52 @@ bool VerifyNumericTensorBuffer(const Tensor& tensor, const Buffer& buffer,
   // TODO(yichengfan): verify quantized tensors.
 }
 
+using flatbuffers::Offset;
+using flatbuffers::Vector;
+
+bool VerifyOperators(const Vector<Offset<Operator>>& operators,
+                     ErrorReporter* error_reporter) {
+  for (const auto& op : operators) {
+    if (!op->inputs()) {
+      ReportError(error_reporter, "Missing 'inputs' for operator.");
+      return false;
+    }
+    if (!op->outputs()) {
+      ReportError(error_reporter, "Missing 'outputs' for operator.");
+      return false;
+    }
+  }
+  return true;
+}
+
+bool VerifySubGraphs(const Model& model, ErrorReporter* error_reporter) {
+  if (!model.subgraphs()) {
+    ReportError(error_reporter, "Missing 'subgraphs' section.");
+    return false;
+  }
+  for (const auto& subgraph : *model.subgraphs()) {
+    if (!subgraph->operators()) {
+      ReportError(error_reporter, "Missing 'operators' section in subgraph.");
+      return false;
+    }
+
+    if (!VerifyOperators(*subgraph->operators(), error_reporter)) {
+      return false;
+    }
+  }
+  return true;
+}
+
 // Verifies tensors have valid properties and legit buffer if set.
 bool VerifyTensors(const Model& model, ErrorReporter* error_reporter) {
   if (!model.subgraphs()) {
     return true;
   }
+  if (!model.buffers()) {
+    ReportError(error_reporter, "Missing 'buffers' section.");
+    return false;
+  }
+
   for (const auto& subgraph : *model.subgraphs()) {
     if (!subgraph->tensors()) {
       continue;
@@ -167,19 +208,23 @@ bool VerifyTensors(const Model& model, ErrorReporter* error_reporter) {
         return false;
       }
       auto* buffer = model.buffers()->Get(tensor->buffer());
-      if (!buffer || !buffer->data()) {
+      if (!buffer) {
         ReportError(error_reporter, "Tensor buffer %d not set",
                     tensor->buffer());
         return false;
       }
 
-      if (tensor->type() == TensorType_STRING) {
-        if (!VerifyStringTensorBuffer(*buffer, error_reporter)) {
-          return false;
-        }
-      } else {
-        if (!VerifyNumericTensorBuffer(*tensor, *buffer, error_reporter)) {
-          return false;
+      // Many transient tensors don't have data in the flatbuffer. Their
+      // buffers will be allocated by the interpreter at run-time.
+      if (buffer->data()) {
+        if (tensor->type() == TensorType_STRING) {
+          if (!VerifyStringTensorBuffer(*buffer, error_reporter)) {
+            return false;
+          }
+        } else {
+          if (!VerifyNumericTensorBuffer(*tensor, *buffer, error_reporter)) {
+            return false;
+          }
         }
       }
     }
@@ -193,6 +238,13 @@ bool VerifyOps(const Model& model, const OpResolver& resolver,
     return true;
   }
   for (const auto& opcode : *model.operator_codes()) {
+    if (opcode->builtin_code() < BuiltinOperator_MIN ||
+        opcode->builtin_code() > BuiltinOperator_MAX) {
+      ReportError(error_reporter, "Operator id '%d' is out of range.",
+                  opcode->builtin_code());
+      return false;
+    }
+
     if (opcode->builtin_code() == BuiltinOperator_CUSTOM) {
       if (!resolver.FindOp(opcode->custom_code()->c_str())) {
         ReportError(error_reporter, "Unsupported custom op: %s",
@@ -223,6 +275,9 @@ bool Verify(const void* buf, size_t len, const OpResolver& resolver,
     ReportError(error_reporter, "Invalid model version %d", model->version());
     return false;
   }
+  if (!VerifySubGraphs(*model, error_reporter)) {
+    return false;
+  }
   if (!VerifyTensors(*model, error_reporter)) {
     return false;
   }
diff --git a/tensorflow/contrib/lite/tools/verifier.h b/tensorflow/contrib/lite/tools/verifier.h
index c2ee11215c861ed7b27696a8d786bb6e2a48e930..b7ce4e830576af14002d6bd9080af1da5764b1c9 100644
--- a/tensorflow/contrib/lite/tools/verifier.h
+++ b/tensorflow/contrib/lite/tools/verifier.h
@@ -23,6 +23,21 @@ limitations under the License.
 
 namespace tflite {
 
+class AlwaysTrueResolver : public OpResolver {
+ public:
+  AlwaysTrueResolver() {}
+  TfLiteRegistration* FindOp(tflite::BuiltinOperator op) const override {
+    static TfLiteRegistration null_registration = {nullptr, nullptr, nullptr,
+                                                   nullptr};
+    return &null_registration;
+  }
+  TfLiteRegistration* FindOp(const char* op) const override {
+    static TfLiteRegistration null_registration = {nullptr, nullptr, nullptr,
+                                                   nullptr};
+    return &null_registration;
+  }
+};
+
 // Verifies the integrity of a Tensorflow Lite flatbuffer model file.
 // Currently, it verifies:
 // * The file is following a legit flatbuffer schema.
diff --git a/tensorflow/contrib/lite/tools/verifier_test.cc b/tensorflow/contrib/lite/tools/verifier_test.cc
index b3e611f999b2837efbf8876bd989db44c408b8c7..03b93afe3ed04b4bff13bc01d7c7c8e9fae9bdf3 100644
--- a/tensorflow/contrib/lite/tools/verifier_test.cc
+++ b/tensorflow/contrib/lite/tools/verifier_test.cc
@@ -113,8 +113,8 @@ TEST(VerifyModel, TestEmptyModel) {
                            /*description=*/0, /*buffers=*/0);
   ::tflite::FinishModelBuffer(builder, model);
 
-  ASSERT_TRUE(Verify(builder.GetBufferPointer(), builder.GetSize(),
-                     MutableOpResolver{}, DefaultErrorReporter()));
+  ASSERT_FALSE(Verify(builder.GetBufferPointer(), builder.GetSize(),
+                      MutableOpResolver{}, DefaultErrorReporter()));
 }
 
 TEST(VerifyModel, TestSimpleModel) {
diff --git a/tensorflow/contrib/lookup/BUILD b/tensorflow/contrib/lookup/BUILD
index 0a6edc33c57435e9466b27cb1d33570d7a98cb12..f616207d462954341dd0c4b2722471b50c06c917 100644
--- a/tensorflow/contrib/lookup/BUILD
+++ b/tensorflow/contrib/lookup/BUILD
@@ -48,15 +48,3 @@ tf_py_test(
     grpc_enabled = True,
     tags = ["no_windows"],  # TODO: needs investigation on Windows
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lookup/lookup_ops.py b/tensorflow/contrib/lookup/lookup_ops.py
index 62f1c810fc72ba7e27c4553006c947f8fa0ef629..4942d941765951ed2ee5555138e91a202b96bf7c 100644
--- a/tensorflow/contrib/lookup/lookup_ops.py
+++ b/tensorflow/contrib/lookup/lookup_ops.py
@@ -105,7 +105,7 @@ def index_table_from_tensor(mapping,
   ...
   tf.tables_initializer().run()
 
-  ids.eval()  ==> [0, 1, 4, 2]
+  ids.eval()  ==> [0, 1, 3, 2]
   ```
 
   Args:
@@ -298,7 +298,7 @@ class MutableHashTable(LookupInterface):
   table = tf.contrib.lookup.MutableHashTable(key_dtype=tf.string,
                                              value_dtype=tf.int64,
                                              default_value=-1)
-  table.insert(keys, values)
+  sess.run(table.insert(keys, values))
   out = table.lookup(query_keys)
   print(out.eval())
   ```
@@ -494,7 +494,7 @@ class MutableDenseHashTable(LookupInterface):
                                                   value_dtype=tf.int64,
                                                   default_value=-1,
                                                   empty_key=0)
-  table.insert(keys, values)
+  sess.run(table.insert(keys, values))
   out = table.lookup(query_keys)
   print(out.eval())
   ```
diff --git a/tensorflow/contrib/losses/BUILD b/tensorflow/contrib/losses/BUILD
index 56942115213a762e532971a81da768b53b8537d8..728f75f8ef1eb3b107dbd0ab4ffbecd63787bf3e 100644
--- a/tensorflow/contrib/losses/BUILD
+++ b/tensorflow/contrib/losses/BUILD
@@ -97,15 +97,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/makefile/BUILD b/tensorflow/contrib/makefile/BUILD
index 701eeb44fe3f814cb3fb1cedd8618753946cc3e5..1abb46f4d41d2a9cc60d0cd9de865070689ddbfc 100644
--- a/tensorflow/contrib/makefile/BUILD
+++ b/tensorflow/contrib/makefile/BUILD
@@ -3,12 +3,3 @@
 licenses(["notice"])  # Apache 2.0
 
 package(default_visibility = ["//visibility:private"])
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = ["**/OWNERS"],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/makefile/compile_nsync.sh b/tensorflow/contrib/makefile/compile_nsync.sh
index 7927997678f077a716d81749561068f259d9744f..e8c6edd7ba9aa6a45d956d1d5655b2809d8d2309 100755
--- a/tensorflow/contrib/makefile/compile_nsync.sh
+++ b/tensorflow/contrib/makefile/compile_nsync.sh
@@ -109,17 +109,18 @@ for arch in $archs; do
         linux)  makefile='
                         CC=${CC_PREFIX} g++
                         PLATFORM_CPPFLAGS=-DNSYNC_USE_CPP11_TIMEPOINT -DNSYNC_ATOMIC_CPP11 \
+                                          -I../../platform/c++11.futex \
                                           -I../../platform/c++11 -I../../platform/gcc \
                                           -I../../platform/posix -pthread
                         PLATFORM_CFLAGS=-std=c++11 -Werror -Wall -Wextra -pedantic
                         PLATFORM_LDFLAGS=-pthread
                         MKDEP=${CC} -M -std=c++11
-                        PLATFORM_C=../../platform/c++11/src/nsync_semaphore_mutex.cc \
+                        PLATFORM_C=../../platform/linux/src/nsync_semaphore_futex.c \
                                    ../../platform/c++11/src/per_thread_waiter.cc \
                                    ../../platform/c++11/src/yield.cc \
                                    ../../platform/c++11/src/time_rep_timespec.cc \
                                    ../../platform/c++11/src/nsync_panic.cc
-                        PLATFORM_OBJS=nsync_semaphore_mutex.o per_thread_waiter.o yield.o \
+                        PLATFORM_OBJS=nsync_semaphore_futex.o per_thread_waiter.o yield.o \
                                       time_rep_timespec.o nsync_panic.o
                         TEST_PLATFORM_C=../../platform/c++11/src/start_thread.cc
                         TEST_PLATFORM_OBJS=start_thread.o
diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh
index 4ae18b2cef28335a90bbc967529c0cf76b0a5da2..8b415e6527f85a5a7844b9d4156fd39ecb1b637a 100755
--- a/tensorflow/contrib/makefile/download_dependencies.sh
+++ b/tensorflow/contrib/makefile/download_dependencies.sh
@@ -34,7 +34,7 @@ PROTOBUF_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/protobuf/.
 RE2_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/re2/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
 FFT2D_URL="$(grep -o 'http.*fft\.tgz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)"
 ABSL_URL="$(grep -o 'https://github.com/abseil/abseil-cpp/.*tar.gz' "${BZL_FILE_PATH}" | head -n1)"
-CUB_URL="$(grep -o 'https.*cub/archive.*zip' "${BZL_FILE_PATH}" | grep -v bazel-mirror | head -n1)"
+CUB_URL="$(grep -o 'https.*cub/archive.*zip' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)"
 
 # TODO(petewarden): Some new code in Eigen triggers a clang bug with iOS arm64,
 #                   so work around it by patching the source.
diff --git a/tensorflow/contrib/makefile/proto_text_cc_files.txt b/tensorflow/contrib/makefile/proto_text_cc_files.txt
index d56e388477db6239cfb577f7e2754321ff33bd82..76428bc1d4e682e000998a6e28fc290e218c2341 100644
--- a/tensorflow/contrib/makefile/proto_text_cc_files.txt
+++ b/tensorflow/contrib/makefile/proto_text_cc_files.txt
@@ -12,11 +12,13 @@ tensorflow/core/platform/posix/env.cc
 tensorflow/core/platform/posix/load_library.cc
 tensorflow/core/platform/posix/env_time.cc
 tensorflow/core/platform/file_system.cc
+tensorflow/core/platform/file_system_helper.cc
 tensorflow/core/platform/env.cc
 tensorflow/core/platform/env_time.cc
 tensorflow/core/platform/setround.cc
 tensorflow/core/platform/denormal.cc
 tensorflow/core/platform/default/tracing.cc
+tensorflow/core/platform/default/mutex.cc
 tensorflow/core/platform/default/logging.cc
 tensorflow/core/platform/cpu_info.cc
 tensorflow/core/lib/wav/wav_io.cc
diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt
index 5a812af4e95fe7a05b9c2634b0cc1d860fb7f619..b6acf71b9d446de6f57a7a7f077cc07276db2b17 100644
--- a/tensorflow/contrib/makefile/tf_op_files.txt
+++ b/tensorflow/contrib/makefile/tf_op_files.txt
@@ -228,6 +228,11 @@ tensorflow/core/kernels/cast_op_impl_int64.cc
 tensorflow/core/kernels/cast_op_impl_int8.cc
 tensorflow/core/kernels/cast_op_impl_uint16.cc
 tensorflow/core/kernels/cast_op_impl_uint8.cc
+tensorflow/core/kernels/boosted_trees/prediction_ops.cc
+tensorflow/core/kernels/boosted_trees/resource_ops.cc
+tensorflow/core/kernels/boosted_trees/resources.cc
+tensorflow/core/kernels/boosted_trees/stats_ops.cc
+tensorflow/core/kernels/boosted_trees/training_ops.cc
 tensorflow/core/kernels/bias_op.cc
 tensorflow/core/kernels/bcast_ops.cc
 tensorflow/core/kernels/batch_norm_op.cc
@@ -258,6 +263,7 @@ tensorflow/core/kernels/requantize.cc
 tensorflow/core/kernels/remote_fused_graph_execute_op.cc
 tensorflow/core/kernels/remote_fused_graph_execute_utils.cc
 tensorflow/core/kernels/batch_matmul_op_real.cc
+tensorflow/core/kernels/random_op.cc
 tensorflow/core/ops/training_ops.cc
 tensorflow/core/ops/string_ops.cc
 tensorflow/core/ops/state_ops.cc
@@ -285,6 +291,7 @@ tensorflow/core/ops/data_flow_ops.cc
 tensorflow/core/ops/ctc_ops.cc
 tensorflow/core/ops/control_flow_ops.cc
 tensorflow/core/ops/candidate_sampling_ops.cc
+tensorflow/core/ops/boosted_trees_ops.cc
 tensorflow/core/ops/array_ops.cc
 tensorflow/core/ops/array_grad.cc
 tensorflow/core/kernels/spacetobatch_functor.cc
diff --git a/tensorflow/contrib/makefile/tf_proto_files.txt b/tensorflow/contrib/makefile/tf_proto_files.txt
index d569bde637b20e0ca55c48c616855332abd9fb13..1f254692d7a8fb7af3ce795428464c48f5997a54 100644
--- a/tensorflow/contrib/makefile/tf_proto_files.txt
+++ b/tensorflow/contrib/makefile/tf_proto_files.txt
@@ -18,6 +18,7 @@ tensorflow/core/protobuf/device_properties.proto
 tensorflow/core/protobuf/rewriter_config.proto
 tensorflow/core/protobuf/tensor_bundle.proto
 tensorflow/core/lib/core/error_codes.proto
+tensorflow/core/kernels/boosted_trees/boosted_trees.proto
 tensorflow/core/framework/versions.proto
 tensorflow/core/framework/variable.proto
 tensorflow/core/framework/types.proto
diff --git a/tensorflow/contrib/memory_stats/BUILD b/tensorflow/contrib/memory_stats/BUILD
index 72424c32e7b756e6c50965f38135869e03ba730f..63843b993c16363a80b64622af665aaa64e05830 100644
--- a/tensorflow/contrib/memory_stats/BUILD
+++ b/tensorflow/contrib/memory_stats/BUILD
@@ -79,15 +79,3 @@ cuda_py_test(
         "//tensorflow/python:random_ops",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/meta_graph_transform/BUILD b/tensorflow/contrib/meta_graph_transform/BUILD
index 4b5b1c3e15d36b7602791856416ece54d24798b2..24400789f8a937c88b86141704f7977494c1495e 100644
--- a/tensorflow/contrib/meta_graph_transform/BUILD
+++ b/tensorflow/contrib/meta_graph_transform/BUILD
@@ -59,15 +59,3 @@ filegroup(
         "**/*.py",
     ]),
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/metrics/BUILD b/tensorflow/contrib/metrics/BUILD
index e90c525113348532a3ebdadde7e712bf2d98cee9..5ca42f41c1c5055bf1917ad175b7b30666b18d4b 100644
--- a/tensorflow/contrib/metrics/BUILD
+++ b/tensorflow/contrib/metrics/BUILD
@@ -97,14 +97,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py
index 81f05e7ce587ed1da67a17efbbeb809dbe7fc0b3..088319a5572f346ebb3409f2176e8b3589791f5d 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@@ -62,7 +62,8 @@ def _safe_div(numerator, denominator, name):
       0,
       name=name)
 
-
+@deprecated(None, 'Please switch to tf.metrics.true_positives. Note that the '
+            'order of the labels and predictions arguments has been switched.')
 def streaming_true_positives(predictions,
                              labels,
                              weights=None,
@@ -106,7 +107,8 @@ def streaming_true_positives(predictions,
       updates_collections=updates_collections,
       name=name)
 
-
+@deprecated(None, 'Please switch to tf.metrics.true_negatives. Note that the '
+            'order of the labels and predictions arguments has been switched.')
 def streaming_true_negatives(predictions,
                              labels,
                              weights=None,
@@ -150,7 +152,8 @@ def streaming_true_negatives(predictions,
       updates_collections=updates_collections,
       name=name)
 
-
+@deprecated(None, 'Please switch to tf.metrics.false_positives. Note that the '
+            'order of the labels and predictions arguments has been switched.')
 def streaming_false_positives(predictions,
                               labels,
                               weights=None,
@@ -194,7 +197,8 @@ def streaming_false_positives(predictions,
       updates_collections=updates_collections,
       name=name)
 
-
+@deprecated(None, 'Please switch to tf.metrics.false_negatives. Note that the '
+            'order of the labels and predictions arguments has been switched.')
 def streaming_false_negatives(predictions,
                               labels,
                               weights=None,
@@ -237,7 +241,7 @@ def streaming_false_negatives(predictions,
       updates_collections=updates_collections,
       name=name)
 
-
+@deprecated(None, 'Please switch to tf.metrics.mean')
 def streaming_mean(values,
                    weights=None,
                    metrics_collections=None,
@@ -286,7 +290,7 @@ def streaming_mean(values,
       updates_collections=updates_collections,
       name=name)
 
-
+@deprecated(None, 'Please switch to tf.metrics.mean_tensor')
 def streaming_mean_tensor(values,
                           weights=None,
                           metrics_collections=None,
@@ -340,9 +344,8 @@ def streaming_mean_tensor(values,
       name=name)
 
 
-@deprecated(None,
-            'Please switch to tf.metrics.accuracy. Note that the order of the '
-            'labels and predictions arguments has been switched.')
+@deprecated(None, 'Please switch to tf.metrics.accuracy. Note that the order '
+                  'of the labels and predictions arguments has been switched.')
 def streaming_accuracy(predictions,
                        labels,
                        weights=None,
@@ -399,7 +402,8 @@ def streaming_accuracy(predictions,
       updates_collections=updates_collections,
       name=name)
 
-
+@deprecated(None, 'Please switch to tf.metrics.precision. Note that the order '
+                  'of the labels and predictions arguments has been switched.')
 def streaming_precision(predictions,
                         labels,
                         weights=None,
@@ -455,7 +459,8 @@ def streaming_precision(predictions,
       updates_collections=updates_collections,
       name=name)
 
-
+@deprecated(None, 'Please switch to tf.metrics.recall. Note that the order '
+                  'of the labels and predictions arguments has been switched.')
 def streaming_recall(predictions,
                      labels,
                      weights=None,
@@ -975,8 +980,8 @@ def streaming_curve_points(labels=None,
     return points, update_op
 
 
-@deprecated(None, 'Please switch to tf.metrics.auc. Note that the order of the '
-            'labels and predictions arguments has been switched.')
+@deprecated(None, 'Please switch to tf.metrics.auc. Note that the order of '
+                  'the labels and predictions arguments has been switched.')
 def streaming_auc(predictions,
                   labels,
                   weights=None,
@@ -1797,9 +1802,9 @@ def streaming_sensitivity_at_specificity(predictions,
       name=name)
 
 
-@deprecated(
-    None, 'Please switch to tf.metrics.precision_at_thresholds. Note that the '
-    'order of the labels and predictions arguments has been switched.')
+@deprecated(None,
+            'Please switch to tf.metrics.precision_at_thresholds. Note that '
+            'the order of the labels and predictions arguments are switched.')
 def streaming_precision_at_thresholds(predictions,
                                       labels,
                                       thresholds,
diff --git a/tensorflow/contrib/model_pruning/BUILD b/tensorflow/contrib/model_pruning/BUILD
index ca3f13479ed32e9ab3d43dfe9a392ef8466ce5f2..f50575b2cf311e33f7b7c77488bc94b8d24c70ec 100644
--- a/tensorflow/contrib/model_pruning/BUILD
+++ b/tensorflow/contrib/model_pruning/BUILD
@@ -125,15 +125,3 @@ py_library(
         ":rnn_cells",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/model_pruning/examples/cifar10/BUILD b/tensorflow/contrib/model_pruning/examples/cifar10/BUILD
index e7848adcc5ac126a2b85ef6dcb0ffa355b8b0628..30ea9122229c72950bee280f7a6c5eda4ac2fdbf 100644
--- a/tensorflow/contrib/model_pruning/examples/cifar10/BUILD
+++ b/tensorflow/contrib/model_pruning/examples/cifar10/BUILD
@@ -68,15 +68,3 @@ py_binary(
         "//tensorflow/contrib/model_pruning:pruning",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/model_pruning/python/layers/layers.py b/tensorflow/contrib/model_pruning/python/layers/layers.py
index 988748ad75bdf72f1da3f4e1c6e85aabb04a5954..466daf204a1ae86a7f37107342046305ea7249fc 100644
--- a/tensorflow/contrib/model_pruning/python/layers/layers.py
+++ b/tensorflow/contrib/model_pruning/python/layers/layers.py
@@ -214,7 +214,7 @@ def masked_convolution(inputs,
     elif data_format == 'NCHW':
       df = 'channels_first'
     else:
-      raise ValueError('Unsupported data fromat', data_format)
+      raise ValueError('Unsupported data format', data_format)
 
     layer = layer_class(
         filters=num_outputs,
diff --git a/tensorflow/contrib/model_pruning/python/pruning.py b/tensorflow/contrib/model_pruning/python/pruning.py
index 86963be4b8aee396704752bab87e0a6b49ab1a49..5146a4a2de7806041991c04958de378b2d3dc810 100644
--- a/tensorflow/contrib/model_pruning/python/pruning.py
+++ b/tensorflow/contrib/model_pruning/python/pruning.py
@@ -216,7 +216,7 @@ def _partitioned_variable_assign(partitioned_var, new_value):
   """Assign op for partitioned variables.
 
   Args:
-    partitioned_var: A partitioned tensotflow variable
+    partitioned_var: A partitioned tensorflow variable
     new_value: Value to be assigned to the variable var
 
   Returns:
diff --git a/tensorflow/contrib/mpi_collectives/BUILD b/tensorflow/contrib/mpi_collectives/BUILD
index 9f9802b8fe12356c0da82ebb2b48b565cf3f7319..a7be92a35e0d62a61f7923ac61bb2c1267d039c6 100644
--- a/tensorflow/contrib/mpi_collectives/BUILD
+++ b/tensorflow/contrib/mpi_collectives/BUILD
@@ -126,15 +126,3 @@ tf_py_test(
     ],
     tags = ["manual"],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/nccl/BUILD b/tensorflow/contrib/nccl/BUILD
index 94d01efee1546feca89a7e88acedf915b1dfb3a4..334e70318dd88185cecd93ebeb2587861b7999b9 100644
--- a/tensorflow/contrib/nccl/BUILD
+++ b/tensorflow/contrib/nccl/BUILD
@@ -31,7 +31,7 @@ tf_custom_op_library(
         "kernels/nccl_ops.cc",
     ],
     deps = if_cuda([
-        "@nccl_archive//:nccl",
+        "@local_config_nccl//:nccl",
         "//tensorflow/core:gpu_headers_lib",
     ]),
 )
@@ -61,7 +61,7 @@ tf_cuda_cc_test(
             "//tensorflow/core:test",
             "//tensorflow/core:test_main",
             "//tensorflow/core:testlib",
-            "@nccl_archive//:nccl",
+            "@local_config_nccl//:nccl",
         ],
 )
 
@@ -80,7 +80,7 @@ tf_kernel_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:proto_text",
         "//tensorflow/core:stream_executor",
-        "@nccl_archive//:nccl",
+        "@local_config_nccl//:nccl",
     ],
     alwayslink = 1,
 )
@@ -141,15 +141,3 @@ cuda_py_test(
         "notap",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager.h b/tensorflow/contrib/nccl/kernels/nccl_manager.h
index bb219e0edc8a2c4ba0ce0583cbe4018a4fa3a1d1..6ff8cea84eb912d5e5c891c40efc617661725a63 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_manager.h
+++ b/tensorflow/contrib/nccl/kernels/nccl_manager.h
@@ -20,7 +20,7 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "src/nccl.h"
+#include "third_party/nccl/nccl.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/platform/mutex.h"
diff --git a/tensorflow/contrib/nccl/kernels/nccl_ops.cc b/tensorflow/contrib/nccl/kernels/nccl_ops.cc
index 266d4f6f0de0274dca2bfc9022503f09b0ca7d42..c2b76caef38a4af248387b65701b8f8936e8431f 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_ops.cc
+++ b/tensorflow/contrib/nccl/kernels/nccl_ops.cc
@@ -17,7 +17,7 @@ limitations under the License.
 
 #include <vector>
 
-#include "src/nccl.h"
+#include "third_party/nccl/nccl.h"
 #include "tensorflow/contrib/nccl/kernels/nccl_manager.h"
 #include "tensorflow/core/framework/op_kernel.h"
 
diff --git a/tensorflow/contrib/nccl/kernels/nccl_rewrite.cc b/tensorflow/contrib/nccl/kernels/nccl_rewrite.cc
index a4de46a93fab1dfe93b47f2789cc533bc447e43a..4676e937e56e35cdec5d2ac57fa07b7bda5fe291 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_rewrite.cc
+++ b/tensorflow/contrib/nccl/kernels/nccl_rewrite.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/lib/strings/str_util.h"
 #if GOOGLE_CUDA
 
 #include <forward_list>
@@ -254,7 +255,7 @@ class NcclReplacePass : public GraphOptimizationPass {
     // Find reduction and broadcast ops and replace them with Send/Recv ops.
     for (Node* node : graph->op_nodes()) {
       StringPiece type = node->type_string();
-      if (!type.starts_with("Nccl")) {
+      if (!str_util::StartsWith(type, "Nccl")) {
         continue;
       }
       if (type == "NcclReduce") {
diff --git a/tensorflow/contrib/nearest_neighbor/BUILD b/tensorflow/contrib/nearest_neighbor/BUILD
index 9500c18b1df9d772dfb827bc2b3d33e0a65974f6..6fa762446705310a60cbdd9302c1a5083b69f065 100644
--- a/tensorflow/contrib/nearest_neighbor/BUILD
+++ b/tensorflow/contrib/nearest_neighbor/BUILD
@@ -111,15 +111,3 @@ tf_py_test(
         "//tensorflow/python:client_testlib",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/nn/BUILD b/tensorflow/contrib/nn/BUILD
index 5543eb6c6e3785978e9c878f309b9bd0863b0b0a..ef7ab2264655ca0148a9c045bba04018d9599dfc 100644
--- a/tensorflow/contrib/nn/BUILD
+++ b/tensorflow/contrib/nn/BUILD
@@ -98,14 +98,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/nn/python/ops/scaled_softplus.py b/tensorflow/contrib/nn/python/ops/scaled_softplus.py
index fcbfbc239ca5b8a1d4b17b403f99b7eb05db47b0..7184ef2b66ec4662af3a37def070ab151d6e7c15 100644
--- a/tensorflow/contrib/nn/python/ops/scaled_softplus.py
+++ b/tensorflow/contrib/nn/python/ops/scaled_softplus.py
@@ -30,9 +30,7 @@ def _reduce_and_reshape_grad(g, t):
   """Returns the gradient, sum-reduced and reshaped to `t`'s shape."""
   shape = array_ops.shape(t)
   g_shape = array_ops.shape(g)
-  # pylint: disable=protected-access
-  bcast_dims, _ = gen_array_ops._broadcast_gradient_args(shape, g_shape)
-  # pylint: enable=protected-access
+  bcast_dims, _ = gen_array_ops.broadcast_gradient_args(shape, g_shape)
   return array_ops.reshape(math_ops.reduce_sum(g, bcast_dims), shape)
 
 
diff --git a/tensorflow/contrib/opt/BUILD b/tensorflow/contrib/opt/BUILD
index 827279bd476f9666a972f43ad557fde6d0b6c59a..c57c5e3f29f1e36fa2f36f8113cb208be6c6be3e 100644
--- a/tensorflow/contrib/opt/BUILD
+++ b/tensorflow/contrib/opt/BUILD
@@ -52,6 +52,9 @@ py_test(
     name = "external_optimizer_test",
     srcs = ["python/training/external_optimizer_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        "no-internal-py3",
+    ],
     deps = [
         ":opt_py",
         "//tensorflow/python:array_ops",
@@ -262,14 +265,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/opt/python/training/multitask_optimizer_wrapper.py b/tensorflow/contrib/opt/python/training/multitask_optimizer_wrapper.py
index cb6c77a86feedde3285d75092511c8eb1e63b2a5..9076cc9d128552e37c09852ab2f24aa0c9977892 100644
--- a/tensorflow/contrib/opt/python/training/multitask_optimizer_wrapper.py
+++ b/tensorflow/contrib/opt/python/training/multitask_optimizer_wrapper.py
@@ -22,6 +22,7 @@ import types
 import six
 
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
@@ -40,8 +41,10 @@ def _get_wrapper(fn, opt):
 
   def wrapper(self, grad, *args, **kwargs):  # pylint: disable=unused-argument
     all_zeros = _is_all_zeros(grad)
-    return control_flow_ops.cond(all_zeros, control_flow_ops.no_op,
-                                 lambda: fn(grad, *args, **kwargs))
+    def call_fn():
+      with ops.control_dependencies([fn(grad, *args, **kwargs)]):
+        return control_flow_ops.no_op()
+    return control_flow_ops.cond(all_zeros, control_flow_ops.no_op, call_fn)
 
   wrapper = types.MethodType(wrapper, opt)
   return wrapper
diff --git a/tensorflow/contrib/optimizer_v2/BUILD b/tensorflow/contrib/optimizer_v2/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..26ea9135f57fb9fe95e61023bccb97d1d4f5ea1c
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/BUILD
@@ -0,0 +1,205 @@
+# Prototype of OptimizerV2.
+
+package(
+    default_visibility = ["//tensorflow:internal"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+py_library(
+    name = "optimizer_v2_py",
+    srcs = ["optimizer_v2_symbols.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":training",
+        "//tensorflow/python:util",
+    ],
+)
+
+py_library(
+    name = "training",
+    srcs = [
+        "adadelta.py",
+        "adagrad.py",
+        "adam.py",
+        "gradient_descent.py",
+        "momentum.py",
+        "optimizer_v2.py",
+        "rmsprop.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+    ],
+)
+
+cuda_py_test(
+    name = "adadelta_test",
+    size = "medium",
+    srcs = ["adadelta_test.py"],
+    additional_deps = [
+        ":training",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:variables",
+        "//third_party/py/numpy",
+    ],
+)
+
+cuda_py_test(
+    name = "adagrad_test",
+    size = "small",
+    srcs = ["adagrad_test.py"],
+    additional_deps = [
+        ":training",
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+    ],
+)
+
+cuda_py_test(
+    name = "adam_test",
+    size = "small",
+    srcs = ["adam_test.py"],
+    additional_deps = [
+        ":training",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+    ],
+)
+
+cuda_py_test(
+    name = "checkpointable_utils_test",
+    srcs = ["checkpointable_utils_test.py"],
+    additional_deps = [
+        ":training",
+        "@six_archive//:six",
+        "//tensorflow/contrib/eager/python:checkpointable_utils",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:layers",
+        "//tensorflow/python:layers_base",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python/keras",
+    ],
+    tags = ["notsan"],
+)
+
+cuda_py_test(
+    name = "gradient_descent_test",
+    size = "medium",
+    srcs = ["gradient_descent_test.py"],
+    additional_deps = [
+        ":training",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:resources",
+        "//tensorflow/python:variables",
+    ],
+)
+
+cuda_py_test(
+    name = "momentum_test",
+    size = "medium",
+    srcs = ["momentum_test.py"],
+    additional_deps = [
+        ":training",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:resources",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
+cuda_py_test(
+    name = "optimizer_v2_test",
+    size = "medium",
+    srcs = ["optimizer_v2_test.py"],
+    additional_deps = [
+        ":training",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:clip_ops",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:variables",
+    ],
+)
+
+cuda_py_test(
+    name = "rmsprop_test",
+    size = "small",
+    srcs = ["rmsprop_test.py"],
+    additional_deps = [
+        ":training",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+    ],
+)
diff --git a/tensorflow/contrib/optimizer_v2/adadelta.py b/tensorflow/contrib/optimizer_v2/adadelta.py
new file mode 100644
index 0000000000000000000000000000000000000000..b206f9f61bd56581e5105b2bc635c69abbc9af4c
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/adadelta.py
@@ -0,0 +1,113 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Adadelta for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.optimizer_v2 import optimizer_v2
+from tensorflow.python.training import training_ops
+
+
+class AdadeltaOptimizer(optimizer_v2.OptimizerV2):
+  """Optimizer that implements the Adadelta algorithm.
+
+  See [M. D. Zeiler](http://arxiv.org/abs/1212.5701)
+  ([pdf](http://arxiv.org/pdf/1212.5701v1.pdf))
+  """
+
+  def __init__(self, learning_rate=0.001, rho=0.95, epsilon=1e-8,
+               use_locking=False, name="Adadelta"):
+    """Construct a new Adadelta optimizer.
+
+    Some of the args below are hyperparameters, where a hyperparameter is
+    defined as a scalar Tensor, a regular Python value or a callable (which
+    will be evaluated when `apply_gradients` is called) returning a scalar
+    Tensor or a Python value.
+
+    Args:
+      learning_rate: A float hyperparameter. The learning rate.
+        To match the exact form in the original paper use 1.0.
+      rho: A float hyperparameter. The decay rate.
+      epsilon: A float hyperparameter. A constant epsilon used to better
+        condition the grad update.
+      use_locking: If `True` use locks for update operations.
+      name: Optional name prefix for the operations created when applying
+        gradients.  Defaults to "Adadelta".
+    """
+    super(AdadeltaOptimizer, self).__init__(use_locking, name)
+    self._set_hyper("learning_rate", learning_rate)
+    self._set_hyper("rho", rho)
+    self._set_hyper("epsilon", epsilon)
+
+  def _create_vars(self, var_list, state):
+    for v in var_list:
+      state.zeros_slot(v, "accum")
+      state.zeros_slot(v, "accum_update")
+
+  def _apply_dense(self, grad, var, state):
+    accum = state.get_slot(var, "accum")
+    accum_update = state.get_slot(var, "accum_update")
+    return training_ops.apply_adadelta(
+        var,
+        accum,
+        accum_update,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        state.get_hyper("rho", var.dtype.base_dtype),
+        state.get_hyper("epsilon", var.dtype.base_dtype),
+        grad,
+        use_locking=self._use_locking)
+
+  def _resource_apply_dense(self, grad, var, state):
+    accum = state.get_slot(var, "accum")
+    accum_update = state.get_slot(var, "accum_update")
+    return training_ops.resource_apply_adadelta(
+        var.handle,
+        accum.handle,
+        accum_update.handle,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        state.get_hyper("rho", var.dtype.base_dtype),
+        state.get_hyper("epsilon", var.dtype.base_dtype),
+        grad,
+        use_locking=self._use_locking)
+
+  def _apply_sparse(self, grad, var, state):
+    accum = state.get_slot(var, "accum")
+    accum_update = state.get_slot(var, "accum_update")
+    return training_ops.sparse_apply_adadelta(
+        var,
+        accum,
+        accum_update,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        state.get_hyper("rho", var.dtype.base_dtype),
+        state.get_hyper("epsilon", var.dtype.base_dtype),
+        grad.values,
+        grad.indices,
+        use_locking=self._use_locking)
+
+  def _resource_apply_sparse(self, grad, var, indices, state):
+    accum = state.get_slot(var, "accum")
+    accum_update = state.get_slot(var, "accum_update")
+    return training_ops.resource_sparse_apply_adadelta(
+        var.handle,
+        accum.handle,
+        accum_update.handle,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        state.get_hyper("rho", var.dtype.base_dtype),
+        state.get_hyper("epsilon", var.dtype.base_dtype),
+        grad,
+        indices,
+        use_locking=self._use_locking)
diff --git a/tensorflow/contrib/optimizer_v2/adadelta_test.py b/tensorflow/contrib/optimizer_v2/adadelta_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..31cfec0d50d691cb9e618400fa4b37708a8a3ba2
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/adadelta_test.py
@@ -0,0 +1,167 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Adadelta Optimizer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.optimizer_v2 import adadelta
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class AdadeltaOptimizerTest(test.TestCase):
+
+  def doTestBasic(self, use_resource=False):
+    num_updates = 4  # number of ADADELTA steps to perform
+    for dtype in [dtypes.half, dtypes.float32]:
+      for grad in [0.2, 0.1, 0.01]:
+        for lr in [1.0, 0.5, 0.1]:
+          with self.test_session():
+            var0_init = [1.0, 2.0]
+            var1_init = [3.0, 4.0]
+            if use_resource:
+              var0 = resource_variable_ops.ResourceVariable(
+                  var0_init, dtype=dtype)
+              var1 = resource_variable_ops.ResourceVariable(
+                  var1_init, dtype=dtype)
+            else:
+              var0 = variables.Variable(var0_init, dtype=dtype)
+              var1 = variables.Variable(var1_init, dtype=dtype)
+
+            grads = constant_op.constant([grad, grad], dtype=dtype)
+
+            accum = 0.0
+            accum_update = 0.0
+
+            # ADADELTA gradient optimizer
+            rho = 0.95
+            epsilon = 1e-8
+            adadelta_opt = adadelta.AdadeltaOptimizer(lr, rho, epsilon)
+            adadelta_update = adadelta_opt.apply_gradients(
+                zip([grads, grads], [var0, var1]))
+
+            opt_vars = adadelta_opt.variables()
+            self.assertStartsWith(opt_vars[0].name, var0._shared_name)
+            self.assertStartsWith(opt_vars[1].name, var0._shared_name)
+            self.assertStartsWith(opt_vars[2].name, var1._shared_name)
+            self.assertStartsWith(opt_vars[3].name, var1._shared_name)
+            self.assertEqual(4, len(opt_vars))
+
+            variables.global_variables_initializer().run()
+
+            # Assign slots
+            slot = [None] * 2
+            slot_update = [None] * 2
+            self.assertEqual(["accum", "accum_update"],
+                             adadelta_opt.get_slot_names())
+            slot[0] = adadelta_opt.get_slot(var0, "accum")
+            self.assertEquals(slot[0].get_shape(), var0.get_shape())
+            self.assertFalse(slot[0] in variables.trainable_variables())
+
+            slot_update[0] = adadelta_opt.get_slot(var0, "accum_update")
+            self.assertEquals(slot_update[0].get_shape(), var0.get_shape())
+            self.assertFalse(slot_update[0] in variables.trainable_variables())
+
+            slot[1] = adadelta_opt.get_slot(var1, "accum")
+            self.assertEquals(slot[1].get_shape(), var1.get_shape())
+            self.assertFalse(slot[1] in variables.trainable_variables())
+
+            slot_update[1] = adadelta_opt.get_slot(var1, "accum_update")
+            self.assertEquals(slot_update[1].get_shape(), var1.get_shape())
+            self.assertFalse(slot_update[1] in variables.trainable_variables())
+
+            # Fetch params to validate initial values
+            self.assertAllClose(var0_init, var0.eval())
+            self.assertAllClose(var1_init, var1.eval())
+
+            update = [None] * num_updates
+            tot_update = 0
+            for step in range(num_updates):
+              # Run adadelta update for comparison
+              adadelta_update.run()
+
+              # Perform initial update without previous accum values
+              accum = accum * rho + (grad**2) * (1 - rho)
+              update[step] = (np.sqrt(accum_update + epsilon) *
+                              (1. / np.sqrt(accum + epsilon)) * grad)
+              accum_update = (accum_update * rho + (update[step]**2) *
+                              (1.0 - rho))
+              tot_update += update[step] * lr
+
+              # Check that the accumulators have been updated
+              for slot_idx in range(2):
+                self.assertAllCloseAccordingToType(
+                    np.array([accum, accum], dtype=dtype.as_numpy_dtype()),
+                    slot[slot_idx].eval(),
+                    rtol=1e-5)
+
+                self.assertAllCloseAccordingToType(
+                    np.array(
+                        [accum_update, accum_update],
+                        dtype=dtype.as_numpy_dtype()),
+                    slot_update[slot_idx].eval(),
+                    rtol=1e-5)
+
+              # Check that the parameters have been updated
+              self.assertAllCloseAccordingToType(
+                  np.array(
+                      [var0_init[0] - tot_update, var0_init[1] - tot_update],
+                      dtype=dtype.as_numpy_dtype()),
+                  var0.eval(),
+                  rtol=1e-5)
+
+              self.assertAllCloseAccordingToType(
+                  np.array(
+                      [var1_init[0] - tot_update, var1_init[1] - tot_update],
+                      dtype=dtype.as_numpy_dtype()),
+                  var1.eval(),
+                  rtol=1e-5)
+
+  def testBasic(self):
+    self.doTestBasic(use_resource=False)
+
+  def testResourceBasic(self):
+    self.doTestBasic(use_resource=True)
+
+  def testMinimizeSparseResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+        loss = pred * pred
+        sgd_op = adadelta.AdadeltaOptimizer(
+            1.0, 1.0, 1.0).minimize(loss)
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType(
+            [[-111, -138]], var0.eval())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/optimizer_v2/adagrad.py b/tensorflow/contrib/optimizer_v2/adagrad.py
new file mode 100644
index 0000000000000000000000000000000000000000..c333d1e089047e707c3f13acddae268d935b2b3e
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/adagrad.py
@@ -0,0 +1,118 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Adagrad optimizer for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.optimizer_v2 import optimizer_v2
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.training import training_ops
+
+
+class AdagradOptimizer(optimizer_v2.OptimizerV2):
+  """Optimizer that implements the Adagrad algorithm.
+
+  See this [paper](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
+  or this
+  [intro](http://cs.stanford.edu/~ppasupat/a9online/uploads/proximal_notes.pdf).
+  """
+
+  def __init__(self, learning_rate, initial_accumulator_value=0.1,
+               use_locking=False, name="Adagrad"):
+    """Construct a new Adagrad optimizer.
+
+    The learning_rate arg below is a hyperparameter, where a hyperparameter is
+    defined as a scalar Tensor, a regular Python value or a callable (which
+    will be evaluated when `apply_gradients` is called) returning a scalar
+    Tensor or a Python value.
+
+    Args:
+      learning_rate: A float hyperparameter. The learning rate.
+      initial_accumulator_value: A floating point value.
+        Starting value for the accumulators, must be positive.
+      use_locking: If `True` use locks for update operations.
+      name: Optional name prefix for the operations created when applying
+        gradients.  Defaults to "Adagrad".
+
+    Raises:
+      ValueError: If the `initial_accumulator_value` is invalid.
+    """
+    if initial_accumulator_value <= 0.0:
+      raise ValueError("initial_accumulator_value must be positive: %s" %
+                       initial_accumulator_value)
+    super(AdagradOptimizer, self).__init__(use_locking, name)
+    self._set_hyper("learning_rate", learning_rate)
+
+    self._initial_accumulator_value = initial_accumulator_value
+
+  def _create_vars(self, var_list, state):
+    for v in var_list:
+      # TODO(isaprykin): Delete colocate_with(v) from other optimizers and
+      # confirm that colocation will happen anyway.
+      dtype = v.dtype.base_dtype
+      if v.get_shape().is_fully_defined():
+        init = init_ops.constant_initializer(self._initial_accumulator_value,
+                                             dtype=dtype)
+      else:
+        # Use a Tensor instead of initializer if variable does not have static
+        # shape.
+        init_constant = gen_array_ops.fill(
+            array_ops.shape(v), self._initial_accumulator_value)
+        init = math_ops.cast(init_constant, dtype)
+      state.create_slot_with_initializer(v, init, v.get_shape(), dtype,
+                                         "accumulator")
+
+  def _apply_dense(self, grad, var, state):
+    acc = state.get_slot(var, "accumulator")
+    return training_ops.apply_adagrad(
+        var,
+        acc,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        grad,
+        use_locking=self._use_locking)
+
+  def _resource_apply_dense(self, grad, var, state):
+    acc = state.get_slot(var, "accumulator")
+    return training_ops.resource_apply_adagrad(
+        var.handle,
+        acc.handle,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        grad,
+        use_locking=self._use_locking)
+
+  def _apply_sparse(self, grad, var, state):
+    acc = state.get_slot(var, "accumulator")
+    return training_ops.sparse_apply_adagrad(
+        var,
+        acc,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        grad.values,
+        grad.indices,
+        use_locking=self._use_locking)
+
+  def _resource_apply_sparse(self, grad, var, indices, state):
+    acc = state.get_slot(var, "accumulator")
+    return training_ops.resource_sparse_apply_adagrad(
+        var.handle,
+        acc.handle,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        grad,
+        indices,
+        use_locking=self._use_locking)
diff --git a/tensorflow/contrib/optimizer_v2/adagrad_test.py b/tensorflow/contrib/optimizer_v2/adagrad_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..18191c3ef2cb78f63b6558c289b36b6107b6c171
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/adagrad_test.py
@@ -0,0 +1,282 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional tests for aggregate operations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.optimizer_v2 import adagrad
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class AdagradOptimizerTest(test.TestCase):
+
+  def doTestBasic(self, use_locking=False, use_resource=False):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+          var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+        else:
+          var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+          var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        ada_opt = adagrad.AdagradOptimizer(
+            3.0, initial_accumulator_value=0.1, use_locking=use_locking)
+        ada_update = ada_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Run 3 steps of adagrad
+        for _ in range(3):
+          ada_update.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType(
+            np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([2.715679168701172, 3.715679168701172]), var1.eval())
+
+  def testBasic(self):
+    self.doTestBasic(use_locking=False)
+
+  def testBasicResource(self):
+    self.doTestBasic(use_locking=False, use_resource=True)
+
+  def testBasicLocked(self):
+    self.doTestBasic(use_locking=True)
+
+  def testMinimizeSparseResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = resource_variable_ops.ResourceVariable(
+            [[1.0, 2.0], [3.0, 4.0]], dtype=dtype)
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+        loss = pred * pred
+        sgd_op = adagrad.AdagradOptimizer(1.0).minimize(loss)
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType(
+            [[1.0, 2.0], [3.0, 4.0]], var0.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType(
+            [[0, 1], [3, 4]], var0.eval(), atol=0.01)
+
+  def testTensorLearningRate(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        ada_opt = adagrad.AdagradOptimizer(
+            constant_op.constant(3.0), initial_accumulator_value=0.1)
+        ada_update = ada_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Run 3 steps of adagrad
+        for _ in range(3):
+          ada_update.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType(
+            np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([2.715679168701172, 3.715679168701172]), var1.eval())
+
+  def testSparseBasic(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([[1.0], [2.0]], dtype=dtype)
+        var1 = variables.Variable([[3.0], [4.0]], dtype=dtype)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(
+                [0.1], shape=[1, 1], dtype=dtype),
+            constant_op.constant([0]),
+            constant_op.constant([2, 1]))
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(
+                [0.01], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]),
+            constant_op.constant([2, 1]))
+        ada_opt = adagrad.AdagradOptimizer(3.0, initial_accumulator_value=0.1)
+        ada_update = ada_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllClose([[1.0], [2.0]], var0.eval())
+        self.assertAllClose([[3.0], [4.0]], var1.eval())
+        # Run 3 step of sgd
+        for _ in range(3):
+          ada_update.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType(
+            np.array([[-1.6026098728179932], [2.0]]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([[3.0], [3.715679168701172]]), var1.eval())
+
+  def testSparseRepeatedIndices(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        repeated_index_update_var = variables.Variable(
+            [[1.0], [2.0]], dtype=dtype)
+        aggregated_update_var = variables.Variable(
+            [[1.0], [2.0]], dtype=dtype)
+        grad_repeated_index = ops.IndexedSlices(
+            constant_op.constant(
+                [0.1, 0.1], shape=[2, 1], dtype=dtype),
+            constant_op.constant([1, 1]),
+            constant_op.constant([2, 1]))
+        grad_aggregated = ops.IndexedSlices(
+            constant_op.constant(
+                [0.2], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]),
+            constant_op.constant([2, 1]))
+        repeated_update = adagrad.AdagradOptimizer(3.0).apply_gradients(
+            [(grad_repeated_index, repeated_index_update_var)])
+        aggregated_update = adagrad.AdagradOptimizer(3.0).apply_gradients(
+            [(grad_aggregated, aggregated_update_var)])
+        variables.global_variables_initializer().run()
+        self.assertAllClose(aggregated_update_var.eval(),
+                            repeated_index_update_var.eval())
+        for _ in range(3):
+          repeated_update.run()
+          aggregated_update.run()
+          self.assertAllClose(aggregated_update_var.eval(),
+                              repeated_index_update_var.eval())
+
+  def testSparseRepeatedIndicesResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var_repeated = resource_variable_ops.ResourceVariable(
+            [1.0, 2.0], dtype=dtype)
+        loss_repeated = math_ops.reduce_sum(
+            embedding_ops.embedding_lookup(var_repeated, [0, 0]))
+        var_aggregated = resource_variable_ops.ResourceVariable(
+            [1.0, 2.0], dtype=dtype)
+        loss_aggregated = 2 * math_ops.reduce_sum(
+            embedding_ops.embedding_lookup(var_aggregated, [0]))
+        update_op_repeated = adagrad.AdagradOptimizer(
+            2.0).minimize(loss_repeated)
+        update_op_aggregated = adagrad.AdagradOptimizer(
+            2.0).minimize(loss_aggregated)
+        variables.global_variables_initializer().run()
+        self.assertAllCloseAccordingToType(
+            var_repeated.eval(), var_aggregated.eval())
+        for _ in range(3):
+          update_op_repeated.run()
+          update_op_aggregated.run()
+          self.assertAllCloseAccordingToType(
+              var_repeated.eval(), var_aggregated.eval())
+
+  def testSparseStability(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        shape = [1, 6]
+        var0 = variables.Variable(
+            [[
+                0.00872496, -0.106952, 0.110467, 0.226505, -0.0147257,
+                -0.0105945
+            ]],
+            dtype=dtype)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(
+                [[
+                    -5.91278e-05, 5.31673e-05, -2.5779e-06, 4.29153e-05,
+                    -8.4877e-05, -9.48906e-05
+                ]],
+                shape=shape,
+                dtype=dtype),
+            constant_op.constant([0]),
+            constant_op.constant(shape))
+        ada_opt = adagrad.AdagradOptimizer(1.0, initial_accumulator_value=0.1)
+        ada_update = ada_opt.apply_gradients(zip([grads0], [var0]))
+        self.assertEqual(["accumulator"], ada_opt.get_slot_names())
+        slot0 = ada_opt.get_slot(var0, "accumulator")
+        init = variables.global_variables_initializer()
+        for _ in range(100):
+          init.run()
+          ada_update.run()
+          self.assertAllCloseAccordingToType(
+              np.array([[0.1, 0.1, 0.1, 0.1, 0.1, 0.1]]), slot0.eval())
+          self.assertAllCloseAccordingToType(
+              np.array([[
+                  0.00891194, -0.10712013, 0.11047515, 0.22636929, -0.0144573,
+                  -0.01029443
+              ]]), var0.eval())
+
+  def testSharing(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        ada_opt = adagrad.AdagradOptimizer(3.0)
+        # Apply the optimizer twice.  Both applications will use
+        # the same accums.
+        ada_update1 = ada_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        ada_update2 = ada_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        self.assertEqual(["accumulator"], ada_opt.get_slot_names())
+        slot0 = ada_opt.get_slot(var0, "accumulator")
+        self.assertEquals(slot0.get_shape(), var0.get_shape())
+        slot1 = ada_opt.get_slot(var1, "accumulator")
+        self.assertEquals(slot1.get_shape(), var1.get_shape())
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values.
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Mix the first and the second adagrad for 3 steps.
+        ada_update1.run()
+        ada_update2.run()
+        ada_update1.run()
+        # Validate updated params (the same as with only 1 Adagrad).
+        self.assertAllCloseAccordingToType(
+            np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([2.715679168701172, 3.715679168701172]), var1.eval())
+
+  def testDynamicShapeVariable_Ok(self):
+    with self.test_session():
+      v = variable_scope.get_variable("v", initializer=constant_op.constant(1.),
+                                      validate_shape=False)
+      self.assertFalse(v.shape.is_fully_defined())
+      # Creating optimizer should cause no exception.
+      adagrad.AdagradOptimizer(3.0, initial_accumulator_value=0.1)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/optimizer_v2/adam.py b/tensorflow/contrib/optimizer_v2/adam.py
new file mode 100644
index 0000000000000000000000000000000000000000..42b7f92a76c1971e2a63722d769ee006c3f3210b
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/adam.py
@@ -0,0 +1,202 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Adam optimizer for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.optimizer_v2 import optimizer_v2
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.training import training_ops
+
+
+class AdamOptimizer(optimizer_v2.OptimizerV2):
+  """Optimizer that implements the Adam algorithm.
+
+  See [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
+  ([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
+  """
+
+  def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
+               use_locking=False, name="Adam"):
+    """Construct a new Adam optimizer.
+
+    Initialization:
+
+    ```
+    m_0 <- 0 (Initialize initial 1st moment vector)
+    v_0 <- 0 (Initialize initial 2nd moment vector)
+    t <- 0 (Initialize timestep)
+    ```
+
+    The update rule for `variable` with gradient `g` uses an optimization
+    described at the end of section2 of the paper:
+
+    ```
+    t <- t + 1
+    lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
+
+    m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+    v_t <- beta2 * v_{t-1} + (1 - beta2) * g * g
+    variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)
+    ```
+
+    The default value of 1e-8 for epsilon might not be a good default in
+    general. For example, when training an Inception network on ImageNet a
+    current good choice is 1.0 or 0.1. Note that since AdamOptimizer uses the
+    formulation just before Section 2.1 of the Kingma and Ba paper rather than
+    the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
+    hat" in the paper.
+
+    The sparse implementation of this algorithm (used when the gradient is an
+    IndexedSlices object, typically because of `tf.gather` or an embedding
+    lookup in the forward pass) does apply momentum to variable slices even if
+    they were not used in the forward pass (meaning they have a gradient equal
+    to zero). Momentum decay (beta1) is also applied to the entire momentum
+    accumulator. This means that the sparse behavior is equivalent to the dense
+    behavior (in contrast to some momentum implementations which ignore momentum
+    unless a variable slice was actually used).
+
+    Some of the args below are hyperparameters where a hyperparameter is
+    defined as a scalar Tensor, a regular Python value or a callable (which
+    will be evaluated when `apply_gradients` is called) returning a scalar
+    Tensor or a Python value.
+
+    Args:
+      learning_rate: A float hyperparameter. The learning rate.
+      beta1: A float hyperparameter. The exponential decay rate for the 1st
+        moment estimates.
+      beta2: A float hyperparameter. The exponential decay rate for the 2nd
+        moment estimates.
+      epsilon: A float hyperparameter. This epsilon is "epsilon hat" in the
+        Kingma and Ba paper (in the formula just before Section 2.1), not the
+        epsilon in Algorithm 1 of the paper.
+      use_locking: If True use locks for update operations.
+      name: Optional name for the operations created when applying gradients.
+        Defaults to "Adam".
+    """
+    super(AdamOptimizer, self).__init__(use_locking, name)
+
+    self._set_hyper("learning_rate", learning_rate)
+    self._set_hyper("beta1", beta1)
+    self._set_hyper("beta2", beta2)
+    self._set_hyper("epsilon", epsilon)
+
+  def _get_beta_accumulators(self, state=None):
+    if state is None:
+      state = self._get_per_graph_state()
+    return (state.get_non_slot("beta1_power"),
+            state.get_non_slot("beta2_power"))
+
+  def _create_vars(self, var_list, state):
+    # Non-slot variables end up on the same device(s).
+    state.create_non_slot(initial_value=state.get_hyper("beta1"),
+                          name="beta1_power")
+    state.create_non_slot(initial_value=state.get_hyper("beta2"),
+                          name="beta2_power")
+
+    # Create slots for the first and second moments.
+    for v in var_list:
+      state.zeros_slot(v, "m")
+      state.zeros_slot(v, "v")
+
+  def _apply_dense(self, grad, var, state):
+    m = state.get_slot(var, "m")
+    v = state.get_slot(var, "v")
+    beta1_power, beta2_power = self._get_beta_accumulators(state)
+    return training_ops.apply_adam(
+        var, m, v,
+        math_ops.cast(beta1_power, var.dtype.base_dtype),
+        math_ops.cast(beta2_power, var.dtype.base_dtype),
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        state.get_hyper("beta1", var.dtype.base_dtype),
+        state.get_hyper("beta2", var.dtype.base_dtype),
+        state.get_hyper("epsilon", var.dtype.base_dtype),
+        grad, use_locking=self._use_locking).op
+
+  def _resource_apply_dense(self, grad, var, state):
+    m = state.get_slot(var, "m")
+    v = state.get_slot(var, "v")
+    beta1_power, beta2_power = self._get_beta_accumulators(state)
+    return training_ops.resource_apply_adam(
+        var.handle, m.handle, v.handle,
+        math_ops.cast(beta1_power, grad.dtype.base_dtype),
+        math_ops.cast(beta2_power, grad.dtype.base_dtype),
+        state.get_hyper("learning_rate", grad.dtype.base_dtype),
+        state.get_hyper("beta1", grad.dtype.base_dtype),
+        state.get_hyper("beta2", grad.dtype.base_dtype),
+        state.get_hyper("epsilon", grad.dtype.base_dtype),
+        grad, use_locking=self._use_locking)
+
+  def _apply_sparse_shared(self, grad, var, indices, scatter_add, state):
+    beta1_power, beta2_power = self._get_beta_accumulators(state)
+    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
+    beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
+    lr_t = state.get_hyper("learning_rate", var.dtype.base_dtype)
+    beta1_t = state.get_hyper("beta1", var.dtype.base_dtype)
+    beta2_t = state.get_hyper("beta2", var.dtype.base_dtype)
+    epsilon_t = state.get_hyper("epsilon", var.dtype.base_dtype)
+    lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))
+    # m_t = beta1 * m + (1 - beta1) * g_t
+    m = state.get_slot(var, "m")
+    m_scaled_g_values = grad * (1 - beta1_t)
+    m_t = state_ops.assign(m, m * beta1_t,
+                           use_locking=self._use_locking)
+    with ops.control_dependencies([m_t]):
+      m_t = scatter_add(m, indices, m_scaled_g_values)
+    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
+    v = state.get_slot(var, "v")
+    v_scaled_g_values = (grad * grad) * (1 - beta2_t)
+    v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
+    with ops.control_dependencies([v_t]):
+      v_t = scatter_add(v, indices, v_scaled_g_values)
+    v_sqrt = math_ops.sqrt(v_t)
+    var_update = state_ops.assign_sub(var,
+                                      lr * m_t / (v_sqrt + epsilon_t),
+                                      use_locking=self._use_locking)
+    return control_flow_ops.group(*[var_update, m_t, v_t])
+
+  def _apply_sparse(self, grad, var, state):
+    return self._apply_sparse_shared(
+        grad.values, var, grad.indices,
+        lambda x, i, v: state_ops.scatter_add(  # pylint: disable=g-long-lambda
+            x, i, v, use_locking=self._use_locking),
+        state)
+
+  def _resource_scatter_add(self, x, i, v):
+    with ops.control_dependencies(
+        [resource_variable_ops.resource_scatter_add(
+            x.handle, i, v)]):
+      return x.value()
+
+  def _resource_apply_sparse(self, grad, var, indices, state):
+    return self._apply_sparse_shared(
+        grad, var, indices, self._resource_scatter_add, state)
+
+  def _finish(self, state):
+    # Update the power accumulators.
+    beta1_power, beta2_power = self._get_beta_accumulators(state)
+    update_beta1 = beta1_power.assign(
+        beta1_power * state.get_hyper("beta1"),
+        use_locking=self._use_locking)
+    update_beta2 = beta2_power.assign(
+        beta2_power * state.get_hyper("beta2"),
+        use_locking=self._use_locking)
+    return control_flow_ops.group(update_beta1, update_beta2)
diff --git a/tensorflow/contrib/optimizer_v2/adam_test.py b/tensorflow/contrib/optimizer_v2/adam_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9ad58b0a607ecef1df097c8858b074361e7892b
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/adam_test.py
@@ -0,0 +1,333 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Adam optimizer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.optimizer_v2 import adam
+from tensorflow.python.client import session
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+def adam_update_numpy(param,
+                      g_t,
+                      t,
+                      m,
+                      v,
+                      alpha=0.001,
+                      beta1=0.9,
+                      beta2=0.999,
+                      epsilon=1e-8):
+  alpha_t = alpha * np.sqrt(1 - beta2**t) / (1 - beta1**t)
+
+  m_t = beta1 * m + (1 - beta1) * g_t
+  v_t = beta2 * v + (1 - beta2) * g_t * g_t
+
+  param_t = param - alpha_t * m_t / (np.sqrt(v_t) + epsilon)
+  return param_t, m_t, v_t
+
+
+class AdamOptimizerTest(test.TestCase):
+
+  def doTestSparse(self, use_resource=False):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable(var0_np)
+          var1 = resource_variable_ops.ResourceVariable(var1_np)
+        else:
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+        grads0_np_indices = np.array([0, 1], dtype=np.int32)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(grads0_np),
+            constant_op.constant(grads0_np_indices), constant_op.constant([2]))
+        grads1_np_indices = np.array([0, 1], dtype=np.int32)
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(grads1_np),
+            constant_op.constant(grads1_np_indices), constant_op.constant([2]))
+        opt = adam.AdamOptimizer()
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+
+        # Run 3 steps of Adam
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          update.run()
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testSparse(self):
+    self.doTestSparse(use_resource=False)
+
+  def testResourceSparse(self):
+    self.doTestSparse(use_resource=True)
+
+  def testSparseDevicePlacement(self):
+    for index_dtype in [dtypes.int32, dtypes.int64]:
+      with self.test_session(force_gpu=test.is_gpu_available()):
+        # If a GPU is available, tests that all optimizer ops can be placed on
+        # it (i.e. they have GPU kernels).
+        var = variables.Variable([[1.0], [2.0]])
+        indices = constant_op.constant([0, 1], dtype=index_dtype)
+        gathered_sum = math_ops.reduce_sum(array_ops.gather(var, indices))
+        optimizer = adam.AdamOptimizer(3.0)
+        minimize_op = optimizer.minimize(gathered_sum)
+        variables.global_variables_initializer().run()
+        minimize_op.run()
+
+  def testSparseRepeatedIndices(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        repeated_index_update_var = variables.Variable(
+            [[1.0], [2.0]], dtype=dtype)
+        aggregated_update_var = variables.Variable(
+            [[1.0], [2.0]], dtype=dtype)
+        grad_repeated_index = ops.IndexedSlices(
+            constant_op.constant(
+                [0.1, 0.1], shape=[2, 1], dtype=dtype),
+            constant_op.constant([1, 1]),
+            constant_op.constant([2, 1]))
+        grad_aggregated = ops.IndexedSlices(
+            constant_op.constant(
+                [0.2], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]),
+            constant_op.constant([2, 1]))
+        repeated_update = adam.AdamOptimizer().apply_gradients(
+            [(grad_repeated_index, repeated_index_update_var)])
+        aggregated_update = adam.AdamOptimizer().apply_gradients(
+            [(grad_aggregated, aggregated_update_var)])
+        variables.global_variables_initializer().run()
+        self.assertAllClose(aggregated_update_var.eval(),
+                            repeated_index_update_var.eval())
+        for _ in range(3):
+          repeated_update.run()
+          aggregated_update.run()
+          self.assertAllClose(aggregated_update_var.eval(),
+                              repeated_index_update_var.eval())
+
+  def doTestBasic(self, use_resource=False):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      with self.test_session(graph=ops.Graph()):
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable(
+              var0_np, name="var0_%d" % i)
+          var1 = resource_variable_ops.ResourceVariable(
+              var1_np, name="var1_%d" % i)
+        else:
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        opt = adam.AdamOptimizer()
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        opt_variables = opt.variables()
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+        self.assertTrue(beta1_power is not None)
+        self.assertTrue(beta2_power is not None)
+        self.assertIn(beta1_power, opt_variables)
+        self.assertIn(beta2_power, opt_variables)
+
+        with ops.Graph().as_default():
+          # Shouldn't return non-slot variables from other graphs.
+          self.assertEqual(0, len(opt.variables()))
+
+        if not context.executing_eagerly():
+          self.evaluate(variables.global_variables_initializer())
+          # Fetch params to validate initial values
+          self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+          self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+
+        # Run 3 steps of Adam
+        for t in range(1, 4):
+          if not context.executing_eagerly():
+            self.evaluate(update)
+          elif t > 1:
+            opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+
+          self.assertAllCloseAccordingToType(0.9**(t + 1),
+                                             self.evaluate(beta1_power))
+          self.assertAllCloseAccordingToType(0.999**(t + 1),
+                                             self.evaluate(beta2_power))
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+          if use_resource:
+            self.assertEqual("var0_%d/Adam:0" % (i,),
+                             opt.get_slot(var=var0, name="m").name)
+
+  def testBasic(self):
+    with self.test_session():
+      self.doTestBasic(use_resource=False)
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testResourceBasic(self):
+    self.doTestBasic(use_resource=True)
+
+  def testTensorLearningRate(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = adam.AdamOptimizer(constant_op.constant(0.001))
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+
+        # Run 3 steps of Adam
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          update.run()
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testSharing(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = adam.AdamOptimizer()
+        update1 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        update2 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        # Run 3 steps of intertwined Adam1 and Adam2.
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          if t % 2 == 0:
+            update1.run()
+          else:
+            update2.run()
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testTwoSessions(self):
+    optimizer = adam.AdamOptimizer()
+    g = ops.Graph()
+    with g.as_default():
+      with session.Session():
+        var0 = variables.Variable(np.array([1.0, 2.0]), name="v0")
+        grads0 = constant_op.constant(np.array([0.1, 0.1]))
+        optimizer.apply_gradients([(grads0, var0)])
+
+    gg = ops.Graph()
+    with gg.as_default():
+      with session.Session():
+        var0 = variables.Variable(np.array([1.0, 2.0]), name="v0")
+        grads0 = constant_op.constant(np.array([0.1, 0.1]))
+
+        # If the optimizer saves any state not keyed by graph the following line
+        # fails.
+        optimizer.apply_gradients([(grads0, var0)])
+
+  def testSlotsUniqueEager(self):
+    with context.eager_mode():
+      v1 = resource_variable_ops.ResourceVariable(1.)
+      v2 = resource_variable_ops.ResourceVariable(1.)
+      opt = adam.AdamOptimizer(1.)
+      opt.minimize(lambda: v1 + v2)
+      # There should be two non-slot variables, and two unique slot variables
+      # for v1 and v2 respectively.
+      self.assertEqual(6, len(set(opt.variables())))
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..08f9699e850a6519dbb5de3bbf0d8b8de01c61b2
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
@@ -0,0 +1,686 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# TODO(josh11b): Forked from contrib/eager/python to test OptimizerV2 the same way
+# OptimizerV1 is tested. This file should be removed once the fork is resolved.
+
+import functools
+import os
+
+import six
+
+from tensorflow.contrib.eager.python import checkpointable_utils
+from tensorflow.contrib.optimizer_v2 import adam
+from tensorflow.python.client import session as session_lib
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras._impl.keras.engine import training
+from tensorflow.python.layers import core
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.training import checkpointable
+from tensorflow.python.training import saver as core_saver
+from tensorflow.python.training import training_util
+
+
+class NonLayerCheckpointable(checkpointable.Checkpointable):
+
+  def __init__(self):
+    super(NonLayerCheckpointable, self).__init__()
+    self.a_variable = checkpointable_utils.add_variable(
+        self, name="a_variable", shape=[])
+
+
+# pylint: disable=not-callable
+class MyModel(training.Model):
+  """A concrete Model for testing."""
+
+  def __init__(self):
+    super(MyModel, self).__init__()
+    self._named_dense = core.Dense(1, use_bias=True)
+    self._second = core.Dense(1, use_bias=False)
+    # We can still track Checkpointables which aren't Layers.
+    self._non_layer = NonLayerCheckpointable()
+
+  def call(self, values):
+    ret = self._second(self._named_dense(values))
+    return ret
+
+
+class _MirroringSaveable(
+    core_saver.BaseSaverBuilder.ResourceVariableSaveable):
+
+  def __init__(self, primary_variable, mirrored_variable, name):
+    self._primary_variable = primary_variable
+    self._mirrored_variable = mirrored_variable
+    super(_MirroringSaveable, self).__init__(
+        self._primary_variable, "", name)
+
+  def restore(self, restored_tensors, restored_shapes):
+    """Restore the same value into both variables."""
+    tensor, = restored_tensors
+    return control_flow_ops.group(
+        self._primary_variable.assign(tensor),
+        self._mirrored_variable.assign(tensor))
+
+
+class _OwnsMirroredVariables(checkpointable.CheckpointableBase):
+  """A Checkpointable object which returns a more complex SaveableObject."""
+
+  def __init__(self):
+    self.non_dep_variable = variable_scope.get_variable(
+        name="non_dep_variable", initializer=6., use_resource=True)
+    self.mirrored = variable_scope.get_variable(
+        name="mirrored", initializer=15., use_resource=True)
+
+  def _gather_saveables_for_checkpoint(self):
+    def _saveable_factory(name=self.non_dep_variable.name):
+      return _MirroringSaveable(
+          primary_variable=self.non_dep_variable,
+          mirrored_variable=self.mirrored,
+          name=name)
+    return {checkpointable.VARIABLE_VALUE_KEY: _saveable_factory}
+
+  # The Saver sorts by name before parsing, so we need a name property.
+  @property
+  def name(self):
+    return self.non_dep_variable.name
+
+
+class CheckpointingTests(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  def testNamingWithOptimizer(self):
+    input_value = constant_op.constant([[3.]])
+    model = MyModel()
+    # A nuisance Model using the same optimizer. Its slot variables should not
+    # go in the checkpoint, since it is never depended on.
+    other_model = MyModel()
+    optimizer = adam.AdamOptimizer(0.001)
+    optimizer_step = training_util.get_or_create_global_step()
+    root_checkpointable = checkpointable_utils.Checkpoint(
+        optimizer=optimizer, model=model, optimizer_step=optimizer_step)
+    if context.executing_eagerly():
+      optimizer.minimize(
+          lambda: model(input_value),
+          global_step=optimizer_step)
+      optimizer.minimize(
+          lambda: other_model(input_value),
+          global_step=optimizer_step)
+    else:
+      train_op = optimizer.minimize(
+          model(input_value), global_step=optimizer_step)
+      optimizer.minimize(
+          other_model(input_value),
+          global_step=optimizer_step)
+      self.evaluate(checkpointable_utils.gather_initializers(
+          root_checkpointable))
+      self.evaluate(train_op)
+    named_variables, serialized_graph = (
+        checkpointable_utils._serialize_object_graph(root_checkpointable))
+    expected_checkpoint_names = (
+        # Created in the root node, so no prefix.
+        "optimizer_step",
+        "model/_second/kernel",
+        "model/_named_dense/kernel",
+        "model/_named_dense/bias",
+        # non-Layer dependency of the model
+        "model/_non_layer/a_variable",
+        # The optimizer creates two non-slot variables
+        "optimizer/beta1_power",
+        "optimizer/beta2_power",
+        # Slot variables
+        "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/m",
+        "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/v",
+        "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/m",
+        "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/v",
+        "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/m",
+        "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/v",
+    )
+    suffix = "/.ATTRIBUTES/VARIABLE_VALUE"
+    expected_checkpoint_names = [
+        name + suffix for name in expected_checkpoint_names]
+    six.assertCountEqual(self, expected_checkpoint_names,
+                         named_variables.keys())
+    # Check that we've mapped to the right variable objects (not exhaustive)
+    self.assertEqual(
+        "global_step:0",
+        named_variables["optimizer_step" + suffix].name)
+    self.assertEqual(
+        "my_model/dense_1/kernel:0",
+        named_variables["model/_second/kernel" + suffix].name)
+    self.assertEqual(
+        "my_model/dense/kernel:0",
+        named_variables["model/_named_dense/kernel" + suffix].name)
+    self.assertEqual(
+        "beta1_power:0",
+        named_variables["optimizer/beta1_power" + suffix].name)
+    self.assertEqual(
+        "beta2_power:0",
+        named_variables["optimizer/beta2_power" + suffix].name)
+    # Spot check the generated protocol buffers.
+    self.assertEqual("optimizer",
+                     serialized_graph.nodes[0].children[1].local_name)
+    optimizer_node = serialized_graph.nodes[serialized_graph.nodes[0].children[
+        1].node_id]
+    self.assertEqual("beta1_power",
+                     optimizer_node.children[0].local_name)
+    self.assertEqual("beta1_power",
+                     serialized_graph.nodes[optimizer_node.children[0].node_id]
+                     .attributes[0].full_name)
+    self.assertEqual(
+        "my_model/dense/kernel",
+        serialized_graph.nodes[optimizer_node.slot_variables[0]
+                               .original_variable_node_id]
+        .attributes[0].full_name)
+    # We strip off the :0 suffix, as variable.name-based saving does.
+    self.assertEqual(
+        "my_model/dense/kernel/Adam",
+        serialized_graph.nodes[optimizer_node.slot_variables[0]
+                               .slot_variable_node_id]
+        .attributes[0].full_name)
+    self.assertEqual(
+        "my_model/dense/kernel/Adam:0",
+        optimizer.get_slot(
+            var=named_variables["model/_named_dense/kernel" + suffix],
+            name="m").name)
+    self.assertEqual(
+        "model/_named_dense/kernel" + suffix,
+        serialized_graph.nodes[
+            optimizer_node.slot_variables[0]
+            .original_variable_node_id].attributes[0].checkpoint_key)
+    self.assertEqual("m", optimizer_node.slot_variables[0].slot_name)
+    self.assertEqual(
+        "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/m" + suffix,
+        serialized_graph.nodes[
+            optimizer_node.slot_variables[0]
+            .slot_variable_node_id].attributes[0].checkpoint_key)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testSaveRestore(self):
+    model = MyModel()
+    optimizer = adam.AdamOptimizer(0.001)
+    root_checkpointable = checkpointable_utils.Checkpoint(
+        optimizer=optimizer, model=model)
+    input_value = constant_op.constant([[3.]])
+    if context.executing_eagerly():
+      optimizer.minimize(
+          lambda: model(input_value))
+    else:
+      train_op = optimizer.minimize(model(input_value))
+      # TODO(allenl): Make initialization more pleasant when graph building.
+      root_checkpointable.save_counter  # pylint: disable=pointless-statement
+      self.evaluate(checkpointable_utils.gather_initializers(
+          root_checkpointable))
+      self.evaluate(train_op)
+    prefix = os.path.join(self.get_temp_dir(), "ckpt")
+    self.evaluate(state_ops.assign(model._named_dense.variables[1], [42.]))
+    m_bias_slot = optimizer.get_slot(model._named_dense.variables[1], "m")
+    self.evaluate(state_ops.assign(m_bias_slot, [1.5]))
+    save_path = root_checkpointable.save(file_prefix=prefix)
+    self.evaluate(state_ops.assign(model._named_dense.variables[1], [43.]))
+    self.evaluate(state_ops.assign(root_checkpointable.save_counter, 3))
+    optimizer_variables = self.evaluate(optimizer.variables())
+    self.evaluate(state_ops.assign(m_bias_slot, [-2.]))
+    # Immediate restoration
+    status = root_checkpointable.restore(save_path=save_path).assert_consumed()
+    status.run_restore_ops()
+    self.assertAllEqual([42.], self.evaluate(model._named_dense.variables[1]))
+    self.assertAllEqual(1, self.evaluate(root_checkpointable.save_counter))
+    self.assertAllEqual([1.5], self.evaluate(m_bias_slot))
+    if not context.executing_eagerly():
+      return  # Restore-on-create is only supported when executing eagerly
+    on_create_model = MyModel()
+    on_create_optimizer = adam.AdamOptimizer(
+        0.001,
+        # Preserve beta1_power and beta2_power when appying gradients so we can
+        # test that they've been restored correctly.
+        beta1=1.0, beta2=1.0)
+    on_create_root = checkpointable_utils.Checkpoint(
+        optimizer=on_create_optimizer, model=on_create_model)
+    # Deferred restoration
+    status = on_create_root.restore(save_path=save_path)
+    on_create_model(constant_op.constant([[3.]]))  # create variables
+    self.assertAllEqual(1, self.evaluate(on_create_root.save_counter))
+    self.assertAllEqual([42.],
+                        self.evaluate(
+                            on_create_model._named_dense.variables[1]))
+    on_create_m_bias_slot = on_create_optimizer.get_slot(
+        on_create_model._named_dense.variables[1], "m")
+    # Optimizer slot variables are created when the original variable is
+    # restored.
+    self.assertAllEqual([1.5], self.evaluate(on_create_m_bias_slot))
+    self.assertAllEqual(optimizer_variables[2:],
+                        self.evaluate(on_create_optimizer.variables()))
+    dummy_var = resource_variable_ops.ResourceVariable([1.])
+    on_create_optimizer.minimize(loss=dummy_var.read_value)
+    status.assert_consumed()
+    beta1_power, beta2_power = on_create_optimizer._get_beta_accumulators()
+    self.assertAllEqual(optimizer_variables[0], self.evaluate(beta1_power))
+    self.assertAllEqual(optimizer_variables[1], self.evaluate(beta2_power))
+
+  # TODO(allenl): Debug garbage created by this test in python3.
+  def testDeferredRestorationUsageEager(self):
+    """An idiomatic eager execution example."""
+    num_training_steps = 10
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    for training_continuation in range(3):
+      model = MyModel()
+      optimizer = adam.AdamOptimizer(0.001)
+      root = checkpointable_utils.Checkpoint(
+          optimizer=optimizer, model=model,
+          optimizer_step=training_util.get_or_create_global_step())
+      root.restore(core_saver.latest_checkpoint(checkpoint_directory))
+      for _ in range(num_training_steps):
+        # TODO(allenl): Use a Dataset and serialize/checkpoint it.
+        input_value = constant_op.constant([[3.]])
+        optimizer.minimize(
+            lambda: model(input_value),  # pylint: disable=cell-var-from-loop
+            global_step=root.optimizer_step)
+      root.save(file_prefix=checkpoint_prefix)
+      self.assertEqual((training_continuation + 1) * num_training_steps,
+                       root.optimizer_step.numpy())
+
+  def testUsageGraph(self):
+    """Expected usage when graph building."""
+    with context.graph_mode():
+      num_training_steps = 10
+      checkpoint_directory = self.get_temp_dir()
+      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+      for training_continuation in range(3):
+        with ops.Graph().as_default():
+          model = MyModel()
+          optimizer = adam.AdamOptimizer(0.001)
+          root = checkpointable_utils.Checkpoint(
+              optimizer=optimizer, model=model,
+              global_step=training_util.get_or_create_global_step())
+          input_value = constant_op.constant([[3.]])
+          train_op = optimizer.minimize(
+              model(input_value),
+              global_step=root.global_step)
+          checkpoint_path = core_saver.latest_checkpoint(checkpoint_directory)
+          with self.test_session(graph=ops.get_default_graph()) as session:
+            status = root.restore(save_path=checkpoint_path)
+            status.initialize_or_restore(session=session)
+            if checkpoint_path is None:
+              self.assertEqual(0, training_continuation)
+              with self.assertRaises(AssertionError):
+                status.assert_consumed()
+            else:
+              status.assert_consumed()
+            for _ in range(num_training_steps):
+              session.run(train_op)
+            root.save(file_prefix=checkpoint_prefix, session=session)
+            self.assertEqual((training_continuation + 1) * num_training_steps,
+                             session.run(root.global_step))
+            self.assertEqual(training_continuation + 1,
+                             session.run(root.save_counter))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testAgnosticUsage(self):
+    """Graph/eager agnostic usage."""
+    # Does create garbage when executing eagerly due to ops.Graph() creation.
+    num_training_steps = 10
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    for training_continuation in range(3):
+      with ops.Graph().as_default(), self.test_session(
+          graph=ops.get_default_graph()), test_util.device(use_gpu=True):
+        model = MyModel()
+        optimizer = adam.AdamOptimizer(0.001)
+        root = checkpointable_utils.Checkpoint(
+            optimizer=optimizer, model=model,
+            global_step=training_util.get_or_create_global_step())
+        checkpoint_path = core_saver.latest_checkpoint(checkpoint_directory)
+        status = root.restore(save_path=checkpoint_path)
+        input_value = constant_op.constant([[3.]])
+        train_fn = functools.partial(
+            optimizer.minimize,
+            functools.partial(model, input_value),
+            global_step=root.global_step)
+        if not context.executing_eagerly():
+          train_fn = functools.partial(self.evaluate, train_fn())
+        status.initialize_or_restore()
+        for _ in range(num_training_steps):
+          train_fn()
+        root.save(file_prefix=checkpoint_prefix)
+        self.assertEqual((training_continuation + 1) * num_training_steps,
+                         self.evaluate(root.global_step))
+        self.assertEqual(training_continuation + 1,
+                         self.evaluate(root.save_counter))
+
+  def _get_checkpoint_name(self, name):
+    root = checkpointable.Checkpointable()
+    checkpointable_utils.add_variable(
+        root, name=name, shape=[1, 2], dtype=dtypes.float64)
+    named_variables, _ = checkpointable_utils._serialize_object_graph(root)
+    checkpoint_name, = named_variables.keys()
+    with ops.name_scope("root/" + checkpoint_name):
+      pass  # Make sure we can use this as an op name if we prefix it.
+    return checkpoint_name
+
+  def testAnonymousVarsInInit(self):
+
+    class Model(training.Model):
+
+      def __init__(self):
+        super(Model, self).__init__()
+        self.w = resource_variable_ops.ResourceVariable(0.0)
+        self.b = resource_variable_ops.ResourceVariable(0.0)
+        self.vars = [self.w, self.b]
+
+      def call(self, x):
+        return x * self.w + self.b
+
+    with context.eager_mode():
+      model = Model()
+      optimizer = adam.AdamOptimizer(learning_rate=0.05)
+      checkpoint_directory = self.get_temp_dir()
+      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+      checkpoint = checkpointable_utils.Checkpoint(
+          model=model, optimizer=optimizer)
+      for _ in range(2):
+        checkpoint.save(checkpoint_prefix)
+        with backprop.GradientTape() as tape:
+          loss = (constant_op.constant(1.)
+                  - model(constant_op.constant(1.))) ** 2
+        grad = tape.gradient(loss, model.vars)
+        optimizer.apply_gradients(
+            [(g, v) for g, v in zip(grad, model.vars)])
+
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  def testDeferredSlotRestoration(self):
+    checkpoint_directory = self.get_temp_dir()
+
+    root = checkpointable.Checkpointable()
+    root.var = checkpointable_utils.add_variable(
+        root, name="var", initializer=0.)
+    optimizer = adam.AdamOptimizer(0.1)
+    if context.executing_eagerly():
+      optimizer.minimize(root.var.read_value)
+    else:
+      train_op = optimizer.minimize(root.var)
+      # Note that `optimizer` has not been added as a dependency of
+      # `root`. Create a one-off grouping so that slot variables for `root.var`
+      # get initialized too.
+      self.evaluate(checkpointable_utils.gather_initializers(
+          checkpointable_utils.Checkpoint(root=root, optimizer=optimizer)))
+      self.evaluate(train_op)
+    self.evaluate(state_ops.assign(root.var, 12.))
+    no_slots_path = checkpointable_utils.CheckpointableSaver(root).save(
+        os.path.join(checkpoint_directory, "no_slots"))
+    root.optimizer = optimizer
+    self.evaluate(state_ops.assign(root.var, 13.))
+    self.evaluate(state_ops.assign(optimizer.get_slot(name="m", var=root.var),
+                                   14.))
+    slots_path = checkpointable_utils.CheckpointableSaver(root).save(
+        os.path.join(checkpoint_directory, "with_slots"))
+    new_root = checkpointable.Checkpointable()
+    # Load the slot-containing checkpoint (deferred), then immediately overwrite
+    # the non-slot variable (also deferred).
+    slot_status = checkpointable_utils.CheckpointableSaver(
+        new_root).restore(slots_path)
+    no_slot_status = checkpointable_utils.CheckpointableSaver(
+        new_root).restore(no_slots_path)
+    with self.assertRaises(AssertionError):
+      no_slot_status.assert_consumed()
+    new_root.var = checkpointable_utils.add_variable(
+        new_root, name="var", shape=[])
+    no_slot_status.assert_consumed()
+    no_slot_status.run_restore_ops()
+    self.assertEqual(12., self.evaluate(new_root.var))
+    new_root.optimizer = adam.AdamOptimizer(0.1)
+    with self.assertRaisesRegexp(AssertionError, "beta1_power"):
+      slot_status.assert_consumed()
+    self.assertEqual(12., self.evaluate(new_root.var))
+    if context.executing_eagerly():
+      # Slot variables are only created with restoring initializers when
+      # executing eagerly.
+      self.assertEqual(14., self.evaluate(
+          new_root.optimizer.get_slot(name="m", var=new_root.var)))
+    else:
+      self.assertIs(new_root.optimizer.get_slot(name="m", var=new_root.var),
+                    None)
+    if context.executing_eagerly():
+      new_root.optimizer.minimize(new_root.var.read_value)
+    else:
+      train_op = new_root.optimizer.minimize(new_root.var)
+      # The slot variable now exists; restore() didn't create it, but we should
+      # now have a restore op for it.
+      slot_status.run_restore_ops()
+      self.assertEqual(14., self.evaluate(
+          new_root.optimizer.get_slot(name="m", var=new_root.var)))
+      self.evaluate(train_op)
+    slot_status.assert_consumed()
+
+  def testManySavesGraph(self):
+    """Saves after the first should not modify the graph."""
+    with context.graph_mode():
+      graph = ops.Graph()
+      with graph.as_default(), self.test_session(graph):
+        checkpoint_directory = self.get_temp_dir()
+        checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+        obj = checkpointable.Checkpointable()
+        obj.var = variable_scope.get_variable(name="v", initializer=0.)
+        obj.opt = adam.AdamOptimizer(0.1)
+        obj.opt.minimize(obj.var.read_value())
+        self.evaluate(checkpointable_utils.gather_initializers(obj))
+        saver = checkpointable_utils.CheckpointableSaver(obj)
+        saver.save(checkpoint_prefix)
+        before_ops = graph.get_operations()
+        saver.save(checkpoint_prefix)
+        self.assertEqual(before_ops, graph.get_operations())
+
+  def testManyRestoresGraph(self):
+    """Restores after the first should not modify the graph."""
+    with context.graph_mode():
+      graph = ops.Graph()
+      with graph.as_default(), self.test_session(graph):
+        checkpoint_directory = self.get_temp_dir()
+        checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+        obj = checkpointable.Checkpointable()
+        obj.var = variable_scope.get_variable(name="v", initializer=0.)
+        obj.opt = adam.AdamOptimizer(0.1)
+        obj.opt.minimize(obj.var.read_value())
+        self.evaluate(checkpointable_utils.gather_initializers(obj))
+        saver = checkpointable_utils.CheckpointableSaver(obj)
+        save_path = saver.save(checkpoint_prefix)
+        saver.restore(save_path)
+        before_ops = graph.get_operations()
+        saver.restore(save_path)
+        self.assertEqual(before_ops, graph.get_operations())
+
+  def testMultipleGraphsNonSlotVariables(self):
+    with context.graph_mode():
+      checkpoint_directory = self.get_temp_dir()
+      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+      optimizer = adam.AdamOptimizer(0.001)
+      # Construct a model in one graph
+      first_graph = ops.Graph()
+      first_session = session_lib.Session(graph=first_graph)
+      with first_graph.as_default(), first_session.as_default():
+        first_variable = resource_variable_ops.ResourceVariable([1.])
+        first_root_checkpointable = checkpointable_utils.Checkpoint(
+            optimizer=optimizer, variable=first_variable)
+        train_op = optimizer.minimize(first_variable.read_value)
+        self.evaluate(checkpointable_utils.gather_initializers(
+            first_root_checkpointable))
+        self.evaluate(train_op)
+        self.evaluate(first_variable.assign([1.]))
+        self.evaluate(optimizer.get_slot(
+            var=first_variable, name="m").assign([2.]))
+        beta1_power, _ = optimizer._get_beta_accumulators()
+        self.evaluate(beta1_power.assign(3.))
+
+      # Save and load in a second graph
+      second_graph = ops.Graph()
+      with second_graph.as_default(), session_lib.Session(graph=second_graph):
+        second_variable = resource_variable_ops.ResourceVariable([1.])
+        second_root_checkpointable = checkpointable_utils.Checkpoint(
+            optimizer=optimizer, variable=second_variable)
+        train_op = optimizer.minimize(second_variable.read_value)
+        second_root_checkpointable.restore(None).initialize_or_restore()
+        self.evaluate(train_op)
+        self.evaluate(second_variable.assign([4.]))
+        self.evaluate(optimizer.get_slot(
+            var=second_variable, name="m").assign([5.]))
+        beta1_power, _ = optimizer._get_beta_accumulators()
+        self.evaluate(beta1_power.assign(6.))
+        save_path = second_root_checkpointable.save(checkpoint_prefix)
+        self.evaluate(second_variable.assign([7.]))
+        self.evaluate(optimizer.get_slot(
+            var=second_variable, name="m").assign([8.]))
+        beta1_power, _ = optimizer._get_beta_accumulators()
+        self.assertAllEqual(6., self.evaluate(beta1_power))
+        status = second_root_checkpointable.restore(save_path)
+        status.assert_consumed().run_restore_ops()
+        self.assertAllEqual([4.], self.evaluate(second_variable))
+        self.assertAllEqual([5.], self.evaluate(optimizer.get_slot(
+            var=second_variable, name="m")))
+        beta1_power, _ = optimizer._get_beta_accumulators()
+        self.assertAllEqual(6., self.evaluate(beta1_power))
+
+      # Check that the first graph is unmolested
+      with first_graph.as_default(), first_session.as_default():
+        self.assertAllEqual([1.], self.evaluate(first_variable))
+        self.assertAllEqual([2.], self.evaluate(optimizer.get_slot(
+            var=first_variable, name="m")))
+        beta1_power, _ = optimizer._get_beta_accumulators()
+        self.assertAllEqual(3., self.evaluate(beta1_power))
+
+
+class CheckpointCompatibilityTests(test.TestCase):
+
+  def _initialized_model(self):
+    input_value = constant_op.constant([[3.]])
+    model = MyModel()
+    optimizer = adam.AdamOptimizer(0.001)
+    optimizer_step = training_util.get_or_create_global_step()
+    root_checkpointable = checkpointable_utils.Checkpoint(
+        optimizer=optimizer, model=model, optimizer_step=optimizer_step)
+    train_op = optimizer.minimize(
+        functools.partial(model, input_value),
+        global_step=optimizer_step)
+    self.evaluate(checkpointable_utils.gather_initializers(
+        root_checkpointable))
+    self.evaluate(train_op)
+    # A regular variable, a slot variable, and a non-slot Optimizer variable
+    # with known values to check when loading.
+    self.evaluate(model._named_dense.bias.assign([1.]))
+    self.evaluate(optimizer.get_slot(
+        var=model._named_dense.bias, name="m").assign([2.]))
+    beta1_power, _ = optimizer._get_beta_accumulators()
+    self.evaluate(beta1_power.assign(3.))
+    return root_checkpointable
+
+  def _set_sentinels(self, root_checkpointable):
+    self.evaluate(root_checkpointable.model._named_dense.bias.assign([101.]))
+    self.evaluate(
+        root_checkpointable.optimizer.get_slot(
+            var=root_checkpointable.model._named_dense.bias, name="m")
+        .assign([102.]))
+    beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
+    self.evaluate(beta1_power.assign(103.))
+
+  def _check_sentinels(self, root_checkpointable):
+    self.assertAllEqual(
+        [1.], self.evaluate(root_checkpointable.model._named_dense.bias))
+    self.assertAllEqual([2.], self.evaluate(
+        root_checkpointable.optimizer.get_slot(
+            var=root_checkpointable.model._named_dense.bias, name="m")))
+    beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
+    self.assertAllEqual(3., self.evaluate(beta1_power))
+
+  def _write_name_based_checkpoint(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    with context.graph_mode():
+      save_graph = ops.Graph()
+      with save_graph.as_default(), self.test_session(
+          graph=save_graph) as session:
+        root = self._initialized_model()
+        name_saver = core_saver.Saver()
+        return name_saver.save(
+            sess=session, save_path=checkpoint_prefix,
+            global_step=root.optimizer_step)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testLoadFromNameBasedSaver(self):
+    """Save a name-based checkpoint, load it using the object-based API."""
+    with test_util.device(use_gpu=True):
+      save_path = self._write_name_based_checkpoint()
+      root = self._initialized_model()
+      self._set_sentinels(root)
+      with self.assertRaises(AssertionError):
+        self._check_sentinels(root)
+      object_saver = checkpointable_utils.CheckpointableSaver(root)
+      status = object_saver.restore(save_path)
+      with self.assertRaises(AssertionError):
+        status.assert_consumed()
+      status.run_restore_ops()
+      self._check_sentinels(root)
+      self._set_sentinels(root)
+      status.initialize_or_restore()
+      self._check_sentinels(root)
+
+  # TODO(allenl): Test for the core name-based saver loading object-based
+  # checkpoints once object-based checkpointing is in core.
+
+  def testSaveGraphLoadEager(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    with context.graph_mode():
+      save_graph = ops.Graph()
+      with save_graph.as_default(), self.test_session(
+          graph=save_graph) as session:
+        root = self._initialized_model()
+        object_saver = checkpointable_utils.CheckpointableSaver(root)
+        save_path = object_saver.save(
+            session=session, file_prefix=checkpoint_prefix)
+    with context.eager_mode():
+      root = self._initialized_model()
+      self._set_sentinels(root)
+      root.restore(save_path).assert_consumed()
+      self._check_sentinels(root)
+
+  def testSaveEagerLoadGraph(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    with context.eager_mode():
+      root = self._initialized_model()
+      object_saver = checkpointable_utils.CheckpointableSaver(root)
+      save_path = object_saver.save(file_prefix=checkpoint_prefix)
+    with context.graph_mode():
+      save_graph = ops.Graph()
+      with save_graph.as_default(), self.test_session(
+          graph=save_graph):
+        root = self._initialized_model()
+        self._set_sentinels(root)
+        root.restore(save_path).assert_consumed().run_restore_ops()
+        self._check_sentinels(root)
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/optimizer_v2/gradient_descent.py b/tensorflow/contrib/optimizer_v2/gradient_descent.py
new file mode 100644
index 0000000000000000000000000000000000000000..945c8de5595394341077ae13cae3161c71ad4f98
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/gradient_descent.py
@@ -0,0 +1,69 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""GradientDescent optimizer for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.optimizer_v2 import optimizer_v2
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.training import training_ops
+
+
+class GradientDescentOptimizer(optimizer_v2.OptimizerV2):
+  """Optimizer that implements the gradient descent algorithm."""
+
+  def __init__(self, learning_rate, use_locking=False, name="GradientDescent"):
+    """Construct a new gradient descent optimizer.
+
+    The learning rate arg below is a hyperparameter where a hyperparameter is
+    defined as a scalar Tensor, a regular Python value or a callable (which
+    will be evaluated when `apply_gradients` is called) returning a scalar
+    Tensor or a Python value.
+
+    Args:
+      learning_rate: A float hyperparameter. The learning rate to use.
+      use_locking: If True use locks for update operations.
+      name: Optional name prefix for the operations created when applying
+        gradients. Defaults to "GradientDescent".
+    """
+    super(GradientDescentOptimizer, self).__init__(use_locking, name)
+    self._set_hyper("learning_rate", learning_rate)
+
+  def _apply_dense(self, grad, var, state):
+    return training_ops.apply_gradient_descent(
+        var,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        grad,
+        use_locking=self._use_locking).op
+
+  def _resource_apply_dense(self, grad, handle, state):
+    lr = state.get_hyper("learning_rate", grad.dtype.base_dtype)
+    return training_ops.resource_apply_gradient_descent(
+        handle.handle, lr, grad, use_locking=self._use_locking)
+
+  def _resource_apply_sparse_duplicate_indices(
+      self, grad, handle, indices, state):
+    lr = state.get_hyper("learning_rate", grad.dtype.base_dtype)
+    return resource_variable_ops.resource_scatter_add(
+        handle.handle, indices, -grad * lr)
+
+  def _apply_sparse_duplicate_indices(self, grad, var, state):
+    delta = ops.IndexedSlices(
+        grad.values * state.get_hyper("learning_rate", var.dtype.base_dtype),
+        grad.indices, grad.dense_shape)
+    return var.scatter_sub(delta, use_locking=self._use_locking)
diff --git a/tensorflow/contrib/optimizer_v2/gradient_descent_test.py b/tensorflow/contrib/optimizer_v2/gradient_descent_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad9aef804fb250395d0c42fcd145f8a1707237d0
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/gradient_descent_test.py
@@ -0,0 +1,223 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional test for GradientDescent optimizer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.optimizer_v2 import gradient_descent
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import resources
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class GradientDescentOptimizerTest(test.TestCase):
+
+  def testBasic(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        optimizer = gradient_descent.GradientDescentOptimizer(3.0)
+        sgd_op = optimizer.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
+        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
+                                           var0.eval())
+        self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
+                                           var1.eval())
+        self.assertEqual(0, len(optimizer.variables()))
+
+  def testBasicResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        sgd_op = gradient_descent.GradientDescentOptimizer(3.0).apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        # TODO(apassos) calling initialize_resources on all resources here
+        # doesn't work because the sessions and graph are reused across unit
+        # tests and this would mean trying to reinitialize variables. Figure out
+        # a long-term solution for this.
+        resources.initialize_resources([var0, var1]).run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
+        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
+                                           var0.eval())
+        self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
+                                           var1.eval())
+
+  def testMinimizeResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable([3.0], dtype=dtype)
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(var0, x) + var1
+        loss = pred * pred
+        sgd_op = gradient_descent.GradientDescentOptimizer(1.0).minimize(loss)
+        # TODO(apassos) calling initialize_resources on all resources here
+        # doesn't work because the sessions and graph are reused across unit
+        # tests and this would mean trying to reinitialize variables. Figure out
+        # a long-term solution for this.
+        resources.initialize_resources([var0, var1]).run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+        self.assertAllCloseAccordingToType([3.0], var1.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        np_pred = 1.0 * 4.0 + 2.0 * 5.0 + 3.0
+        np_grad = 2 * np_pred
+        self.assertAllCloseAccordingToType(
+            [[1.0 - np_grad * 4.0, 2.0 - np_grad * 5.0]], var0.eval())
+        self.assertAllCloseAccordingToType([3.0 - np_grad], var1.eval())
+
+  def testMinimizeSparseResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable([3.0], dtype=dtype)
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+        pred += var1
+        loss = pred * pred
+        sgd_op = gradient_descent.GradientDescentOptimizer(1.0).minimize(loss)
+        # TODO(apassos) calling initialize_resources on all resources here
+        # doesn't work because the sessions and graph are reused across unit
+        # tests and this would mean trying to reinitialize variables. Figure out
+        # a long-term solution for this.
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+        self.assertAllCloseAccordingToType([3.0], var1.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        np_pred = 1.0 * 4.0 + 2.0 * 5.0 + 3.0
+        np_grad = 2 * np_pred
+        self.assertAllCloseAccordingToType(
+            [[1.0 - np_grad * 4.0, 2.0 - np_grad * 5.0]], var0.eval())
+        self.assertAllCloseAccordingToType([3.0 - np_grad], var1.eval())
+
+  def testTensorLearningRate(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        lrate = constant_op.constant(3.0)
+        sgd_op = gradient_descent.GradientDescentOptimizer(
+            lrate).apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
+        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
+                                           var0.eval())
+        self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
+                                           var1.eval())
+
+  def testGradWrtRef(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        opt = gradient_descent.GradientDescentOptimizer(3.0)
+        values = [1.0, 3.0]
+        vars_ = [variables.Variable([v], dtype=dtype) for v in values]
+        grads_and_vars = opt.compute_gradients(vars_[0] + vars_[1], vars_)
+        variables.global_variables_initializer().run()
+        for grad, _ in grads_and_vars:
+          self.assertAllCloseAccordingToType([1.0], grad.eval())
+
+  def testWithGlobalStep(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        global_step = variables.Variable(0, trainable=False)
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        sgd_op = gradient_descent.GradientDescentOptimizer(3.0).apply_gradients(
+            zip([grads0, grads1], [var0, var1]), global_step=global_step)
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
+        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params and global_step
+        self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
+                                           var0.eval())
+        self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
+                                           var1.eval())
+        self.assertAllCloseAccordingToType(1, global_step.eval())
+
+  def testSparseBasic(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([[1.0], [2.0]], dtype=dtype)
+        var1 = variables.Variable([[3.0], [4.0]], dtype=dtype)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(
+                [0.1], shape=[1, 1], dtype=dtype),
+            constant_op.constant([0]),
+            constant_op.constant([2, 1]))
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(
+                [0.01], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]),
+            constant_op.constant([2, 1]))
+        sgd_op = gradient_descent.GradientDescentOptimizer(3.0).apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([[1.0], [2.0]], var0.eval())
+        self.assertAllCloseAccordingToType([[3.0], [4.0]], var1.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType([[1.0 - 3.0 * 0.1], [2.0]],
+                                           var0.eval())
+        self.assertAllCloseAccordingToType([[3.0], [4.0 - 3.0 * 0.01]],
+                                           var1.eval())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/optimizer_v2/momentum.py b/tensorflow/contrib/optimizer_v2/momentum.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a5aadc2d13074cec440a7b508be56bd195d7517
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/momentum.py
@@ -0,0 +1,124 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Momentum for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.optimizer_v2 import optimizer_v2
+from tensorflow.python.training import training_ops
+
+
+class MomentumOptimizer(optimizer_v2.OptimizerV2):
+  """Optimizer that implements the Momentum algorithm.
+
+  Computes (if `use_nesterov = False`):
+
+  ```
+  accumulation = momentum * accumulation + gradient
+  variable -= learning_rate * accumulation
+  ```
+
+  Note that in the dense version of this algorithm, `accumulation` is updated
+  and applied regardless of a gradient's value, whereas the sparse version (when
+  the gradient is an `IndexedSlices`, typically because of `tf.gather` or an
+  embedding) only updates variable slices and corresponding `accumulation` terms
+  when that part of the variable was used in the forward pass.
+  """
+
+  def __init__(self, learning_rate, momentum,
+               use_locking=False, name="Momentum", use_nesterov=False):
+    """Construct a new Momentum optimizer.
+
+    Some of the args below are hyperparameters, where a hyperparameter is
+    defined as a scalar Tensor, a regular Python value or a callable (which
+    will be evaluated when `apply_gradients` is called) returning a scalar
+    Tensor or a Python value.
+
+    Args:
+      learning_rate: A float hyperparameter. The learning rate.
+      momentum: A float hyperparameter. The momentum.
+      use_locking: If `True` use locks for update operations.
+      name: Optional name prefix for the operations created when applying
+        gradients.  Defaults to "Momentum".
+      use_nesterov: If `True` use Nesterov Momentum.
+        See [Sutskever et al., 2013](
+        http://jmlr.org/proceedings/papers/v28/sutskever13.pdf).
+        This implementation always computes gradients at the value of the
+        variable(s) passed to the optimizer. Using Nesterov Momentum makes the
+        variable(s) track the values called `theta_t + mu*v_t` in the paper.
+
+    @compatibility(eager)
+    When eager execution is enabled, learning_rate and momentum can each be a
+    callable that takes no arguments and returns the actual value to use. This
+    can be useful for changing these values across different invocations of
+    optimizer functions.
+    @end_compatibility
+    """
+    super(MomentumOptimizer, self).__init__(use_locking, name)
+    self._set_hyper("learning_rate", learning_rate)
+    self._set_hyper("momentum", momentum)
+    self._use_nesterov = use_nesterov
+
+  def _create_vars(self, var_list, state):
+    for v in var_list:
+      state.zeros_slot(v, "momentum")
+
+  def _apply_dense(self, grad, var, state):
+    mom = state.get_slot(var, "momentum")
+    return training_ops.apply_momentum(
+        var,
+        mom,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        grad,
+        state.get_hyper("momentum", var.dtype.base_dtype),
+        use_locking=self._use_locking,
+        use_nesterov=self._use_nesterov).op
+
+  def _resource_apply_dense(self, grad, var, state):
+    mom = state.get_slot(var, "momentum")
+    return training_ops.resource_apply_momentum(
+        var.handle,
+        mom.handle,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        grad,
+        state.get_hyper("momentum", var.dtype.base_dtype),
+        use_locking=self._use_locking,
+        use_nesterov=self._use_nesterov)
+
+  def _apply_sparse(self, grad, var, state):
+    mom = state.get_slot(var, "momentum")
+    return training_ops.sparse_apply_momentum(
+        var,
+        mom,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        grad.values,
+        grad.indices,
+        state.get_hyper("momentum", var.dtype.base_dtype),
+        use_locking=self._use_locking,
+        use_nesterov=self._use_nesterov).op
+
+  def _resource_apply_sparse(self, grad, var, indices, state):
+    mom = state.get_slot(var, "momentum")
+    return training_ops.resource_sparse_apply_momentum(
+        var.handle,
+        mom.handle,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        grad,
+        indices,
+        state.get_hyper("momentum", var.dtype.base_dtype),
+        use_locking=self._use_locking,
+        use_nesterov=self._use_nesterov)
diff --git a/tensorflow/contrib/optimizer_v2/momentum_test.py b/tensorflow/contrib/optimizer_v2/momentum_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..f37eb48181d6bef195215b86f14f69d3df65a8ac
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/momentum_test.py
@@ -0,0 +1,562 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Momentum."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.contrib.optimizer_v2 import momentum as momentum_lib
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class MomentumOptimizerTest(test.TestCase):
+
+  def _update_nesterov_momentum_numpy(self, var, accum, g, lr, momentum):
+    var = var + accum * lr * momentum
+    accum = accum * momentum + g
+    var = var - lr * accum
+    var = var - accum * lr * momentum
+    return var, accum
+
+  def doTestBasic(self, use_resource=False, use_callable_params=False):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      if use_resource:
+        var0 = resource_variable_ops.ResourceVariable(
+            [1.0, 2.0], dtype=dtype, name="var0_%d" % i)
+        var1 = resource_variable_ops.ResourceVariable(
+            [3.0, 4.0], dtype=dtype, name="var1_%d" % i)
+      else:
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+      grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+      grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+      learning_rate = lambda: 2.0
+      momentum = lambda: 0.9
+      if not use_callable_params:
+        learning_rate = learning_rate()
+        momentum = momentum()
+      mom_opt = momentum_lib.MomentumOptimizer(
+          learning_rate=learning_rate, momentum=momentum)
+      mom_update = mom_opt.apply_gradients(
+          zip([grads0, grads1], [var0, var1]))
+
+      if not context.executing_eagerly():
+        self.evaluate(variables.global_variables_initializer())
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+      # Check we have slots
+      self.assertEqual(["momentum"], mom_opt.get_slot_names())
+      slot0 = mom_opt.get_slot(var0, "momentum")
+      self.assertEquals(slot0.get_shape(), var0.get_shape())
+      slot1 = mom_opt.get_slot(var1, "momentum")
+      self.assertEquals(slot1.get_shape(), var1.get_shape())
+      if not context.executing_eagerly():
+        self.assertFalse(slot0 in variables.trainable_variables())
+        self.assertFalse(slot1 in variables.trainable_variables())
+
+      # Step 1: the momentum accumulators where 0. So we should see a normal
+      # update: v -= grad * learning_rate
+      if not context.executing_eagerly():
+        self.evaluate(mom_update)
+      # Check that the momentum accumulators have been updated.
+      self.assertAllCloseAccordingToType(np.array([0.1, 0.1]),
+                                         self.evaluate(slot0))
+      self.assertAllCloseAccordingToType(np.array([0.01, 0.01]),
+                                         self.evaluate(slot1))
+      # Check that the parameters have been updated.
+      self.assertAllCloseAccordingToType(
+          np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]),
+          self.evaluate(var0))
+      self.assertAllCloseAccordingToType(
+          np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]),
+          self.evaluate(var1))
+      # Step 2: the momentum accumulators contain the previous update.
+      if context.executing_eagerly():
+        mom_opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+      else:
+        self.evaluate(mom_update)
+      # Check that the momentum accumulators have been updated.
+      self.assertAllCloseAccordingToType(
+          np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]),
+          self.evaluate(slot0))
+      self.assertAllCloseAccordingToType(
+          np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
+          self.evaluate(slot1))
+      # Check that the parameters have been updated.
+      self.assertAllCloseAccordingToType(
+          np.array([
+              1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
+              2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
+          ]), self.evaluate(var0))
+      self.assertAllCloseAccordingToType(
+          np.array([
+              2.98 - ((0.9 * 0.01 + 0.01) * 2.0), 3.98 - (
+                  (0.9 * 0.01 + 0.01) * 2.0)
+          ]), self.evaluate(var1))
+
+  def testBasic(self):
+    with self.test_session():
+      self.doTestBasic(use_resource=False)
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testResourceBasic(self):
+    self.doTestBasic(use_resource=True)
+
+  def testBasicCallableParams(self):
+    with context.eager_mode():
+      self.doTestBasic(use_resource=True, use_callable_params=True)
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testVariablesAcrossGraphs(self):
+    optimizer = momentum_lib.MomentumOptimizer(0.01, 0.5)
+    with ops.Graph().as_default():
+      var0 = resource_variable_ops.ResourceVariable(
+          [1.0, 2.0], dtype=dtypes.float32, name="var0")
+      var1 = resource_variable_ops.ResourceVariable(
+          [3.0, 4.0], dtype=dtypes.float32, name="var1")
+      if context.executing_eagerly():
+        loss = lambda: math_ops.reduce_sum(var0 + var1)
+      else:
+        loss = math_ops.reduce_sum(var0 + var1)
+      optimizer.minimize(loss)
+      optimizer_variables = optimizer.variables()
+      self.assertStartsWith(optimizer_variables[0].name, "var0")
+      self.assertStartsWith(optimizer_variables[1].name, "var1")
+      self.assertEquals(2, len(optimizer_variables))
+
+    with ops.Graph().as_default():
+      var2 = resource_variable_ops.ResourceVariable(
+          [1.0, 2.0], dtype=dtypes.float32, name="var2")
+      var3 = resource_variable_ops.ResourceVariable(
+          [3.0, 4.0], dtype=dtypes.float32, name="var3")
+      if context.executing_eagerly():
+        loss = lambda: math_ops.reduce_sum(var2 + var3)
+      else:
+        loss = math_ops.reduce_sum(var2 + var3)
+      optimizer.minimize(loss)
+      optimizer_variables = optimizer.variables()
+      self.assertStartsWith(optimizer_variables[0].name, "var2")
+      self.assertStartsWith(optimizer_variables[1].name, "var3")
+      self.assertEquals(2, len(optimizer_variables))
+
+  def testNesterovMomentum(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        accum0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        accum1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        cost = 5 * var0 * var0 + 3 * var1
+        global_step = variables.Variable(
+            array_ops.zeros([], dtypes.int64), name="global_step")
+        mom_op = momentum_lib.MomentumOptimizer(
+            learning_rate=2.0, momentum=0.9, use_nesterov=True)
+        opt_op = mom_op.minimize(cost, global_step, [var0, var1])
+        variables.global_variables_initializer().run()
+        for t in range(1, 5):
+          opt_op.run()
+          var0_np, accum0_np = self._update_nesterov_momentum_numpy(
+              var0_np, accum0_np, var0_np * 10, 2.0, 0.9)
+          var1_np, accum1_np = self._update_nesterov_momentum_numpy(var1_np,
+                                                                    accum1_np,
+                                                                    3, 2.0, 0.9)
+          self.assertAllClose(var0_np, var0.eval())
+          self.assertAllClose(var1_np, var1.eval())
+
+  def testSparseNesterovMomentum(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        accum0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        accum1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        grads = []
+        for t in range(1, 5):
+          grads.append(var0_np * 10)
+          var0_np, accum0_np = self._update_nesterov_momentum_numpy(
+              var0_np, accum0_np, var0_np * 10, 2.0, 0.9)
+          var1_np, accum1_np = self._update_nesterov_momentum_numpy(var1_np,
+                                                                    accum1_np,
+                                                                    3, 2.0, 0.9)
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        accum0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        accum1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        loss = 5 * var0 * var0 + 3 * var1
+        mom_op = momentum_lib.MomentumOptimizer(
+            learning_rate=2.0, momentum=0.9, use_nesterov=True)
+        x_feed = array_ops.placeholder(dtype)
+        y_feed = ops.IndexedSlices(
+            x_feed, constant_op.constant([0, 1]), constant_op.constant([2]))
+        grads_and_vars = [(y_feed, var0), (constant_op.constant(
+            [3.0, 3.0], dtype=dtype), var1)]
+        opt_update = mom_op.apply_gradients(grads_and_vars)
+        variables.global_variables_initializer().run()
+        for t in range(1, 5):
+          opt_update.run(feed_dict={x_feed: grads[t - 1]})
+          var0_np, accum0_np = self._update_nesterov_momentum_numpy(
+              var0_np, accum0_np, var0_np * 10, 2.0, 0.9)
+          var1_np, accum1_np = self._update_nesterov_momentum_numpy(var1_np,
+                                                                    accum1_np,
+                                                                    3, 2.0, 0.9)
+          self.assertAllClose(var0_np, var0.eval())
+          self.assertAllClose(var1_np, var1.eval())
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testMinimizeSparseResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+
+      # pylint: disable=cell-var-from-loop
+      def loss():
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+        return pred * pred
+      # pylint: enable=cell-var-from-loop
+
+      opt = momentum_lib.MomentumOptimizer(learning_rate=1.0, momentum=0.0)
+      sgd_op = opt.minimize(loss)
+      self.evaluate(variables.global_variables_initializer())
+      # Run 1 step of sgd
+      self.evaluate(sgd_op)
+      # Validate updated params
+      self.assertAllCloseAccordingToType([[-111, -138]], self.evaluate(var0))
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testMinimizeWith2DIndiciesForEmbeddingLookup(self):
+    var0 = resource_variable_ops.ResourceVariable(array_ops.ones([2, 2]))
+
+    def loss():
+      return math_ops.reduce_sum(embedding_ops.embedding_lookup(var0, [[1]]))
+
+    opt = momentum_lib.MomentumOptimizer(learning_rate=1.0, momentum=0.0)
+    sgd_op = opt.minimize(loss)
+    self.evaluate(variables.global_variables_initializer())
+    self.evaluate(sgd_op)
+    self.assertAllCloseAccordingToType([[1, 1], [0, 0]], self.evaluate(var0))
+
+  def testTensorLearningRateAndMomentum(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        mom_opt = momentum_lib.MomentumOptimizer(
+            learning_rate=constant_op.constant(2.0),
+            momentum=constant_op.constant(0.9))
+        mom_update = mom_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Check we have slots
+        self.assertEqual(["momentum"], mom_opt.get_slot_names())
+        slot0 = mom_opt.get_slot(var0, "momentum")
+        self.assertEquals(slot0.get_shape(), var0.get_shape())
+        self.assertFalse(slot0 in variables.trainable_variables())
+        slot1 = mom_opt.get_slot(var1, "momentum")
+        self.assertEquals(slot1.get_shape(), var1.get_shape())
+        self.assertFalse(slot1 in variables.trainable_variables())
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Step 1: the momentum accumulators where 0. So we should see a normal
+        # update: v -= grad * learning_rate
+        mom_update.run()
+        # Check that the momentum accumulators have been updated.
+        self.assertAllCloseAccordingToType(np.array([0.1, 0.1]), slot0.eval())
+        self.assertAllCloseAccordingToType(np.array([0.01, 0.01]), slot1.eval())
+        # Check that the parameters have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]), var1.eval())
+        # Step 2: the momentum accumulators contain the previous update.
+        mom_update.run()
+        # Check that the momentum accumulators have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]), slot0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]), slot1.eval())
+        # Check that the parameters have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([
+                1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
+                2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
+            ]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([
+                2.98 - ((0.9 * 0.01 + 0.01) * 2.0), 3.98 - (
+                    (0.9 * 0.01 + 0.01) * 2.0)
+            ]), var1.eval())
+
+  def _dbParamsMom01(self):
+    """Return dist-belief momentum values.
+
+    Return values been generated from the dist-belief momentum unittest,
+    running with a learning rate of 0.1 and a momentum of 0.1.
+
+    These values record how a parameter vector of size 10, initialized with 0.0,
+    gets updated with 10 consecutive momentum steps.  It uses random gradients.
+
+    Returns:
+      db_grad: The gradients to apply
+      db_out: The parameters after the momentum update.
+    """
+    db_grad = [[]] * 10
+    db_out = [[]] * 10
+    # pylint: disable=line-too-long
+    db_grad[0] = [
+        0.00096264342, 0.17914793, 0.93945462, 0.41396621, 0.53037018,
+        0.93197989, 0.78648776, 0.50036013, 0.55345792, 0.96722615
+    ]
+    db_out[0] = [
+        -9.6264346e-05, -0.017914793, -0.093945466, -0.041396622, -0.053037018,
+        -0.093197994, -0.078648776, -0.050036013, -0.055345792, -0.096722618
+    ]
+    db_grad[1] = [
+        0.17075552, 0.88821375, 0.20873757, 0.25236958, 0.57578111, 0.15312378,
+        0.5513742, 0.94687688, 0.16012503, 0.22159521
+    ]
+    db_out[1] = [
+        -0.017181443, -0.10852765, -0.12421377, -0.070773244, -0.11591884,
+        -0.11783017, -0.14165108, -0.14972731, -0.076892875, -0.1285544
+    ]
+    db_grad[2] = [
+        0.35077485, 0.47304362, 0.44412705, 0.44368884, 0.078527533, 0.81223965,
+        0.31168157, 0.43203235, 0.16792089, 0.24644311
+    ]
+    db_out[2] = [
+        -0.053967446, -0.1648933, -0.1716533, -0.1180798, -0.13005978,
+        -0.20151734, -0.17911947, -0.20289968, -0.095839672, -0.15638189
+    ]
+    db_grad[3] = [
+        0.9694621, 0.75035888, 0.28171822, 0.83813518, 0.53807181, 0.3728098,
+        0.81454384, 0.03848977, 0.89759839, 0.93665648
+    ]
+    db_out[3] = [
+        -0.15459226, -0.24556576, -0.20456907, -0.20662397, -0.18528105,
+        -0.24716705, -0.2643207, -0.21206589, -0.18749419, -0.2528303
+    ]
+    db_grad[4] = [
+        0.38578293, 0.8536852, 0.88722926, 0.66276771, 0.13678469, 0.94036359,
+        0.69107032, 0.81897682, 0.5433259, 0.67860287
+    ]
+    db_out[4] = [
+        -0.20323303, -0.33900154, -0.29658359, -0.28175515, -0.20448165,
+        -0.34576839, -0.34194785, -0.29488021, -0.25099224, -0.33033544
+    ]
+    db_grad[5] = [
+        0.27885768, 0.76100707, 0.24625534, 0.81354135, 0.18959245, 0.48038563,
+        0.84163809, 0.41172323, 0.83259648, 0.44941229
+    ]
+    db_out[5] = [
+        -0.23598288, -0.42444581, -0.33041057, -0.3706224, -0.22536094,
+        -0.40366709, -0.43387437, -0.34433398, -0.34060168, -0.38302717
+    ]
+    db_grad[6] = [
+        0.27233034, 0.056316052, 0.5039115, 0.24105175, 0.35697976, 0.75913221,
+        0.73577434, 0.16014607, 0.57500273, 0.071136251
+    ]
+    db_out[6] = [
+        -0.26649091, -0.43862185, -0.38418442, -0.40361428, -0.26314685,
+        -0.48537019, -0.51664448, -0.36529395, -0.40706289, -0.39540997
+    ]
+    db_grad[7] = [
+        0.58697265, 0.2494842, 0.08106143, 0.39954534, 0.15892942, 0.12683646,
+        0.74053431, 0.16033, 0.66625422, 0.73515922
+    ]
+    db_out[7] = [
+        -0.32823896, -0.46498787, -0.39766794, -0.446868, -0.28281838,
+        -0.50622416, -0.59897494, -0.38342294, -0.48033443, -0.47016418
+    ]
+    db_grad[8] = [
+        0.8215279, 0.41994119, 0.95172721, 0.68000203, 0.79439718, 0.43384039,
+        0.55561525, 0.22567581, 0.93331909, 0.29438227
+    ]
+    db_out[8] = [
+        -0.41656655, -0.50961858, -0.49418902, -0.51919359, -0.36422527,
+        -0.55169362, -0.6627695, -0.40780342, -0.58099347, -0.50707781
+    ]
+    db_grad[9] = [
+        0.68297005, 0.67758518, 0.1748755, 0.13266537, 0.70697063, 0.055731893,
+        0.68593478, 0.50580865, 0.12602448, 0.093537711
+    ]
+    db_out[9] = [
+        -0.49369633, -0.58184016, -0.52132869, -0.5396927, -0.44306302,
+        -0.56181377, -0.73774242, -0.46082234, -0.60366184, -0.52012295
+    ]
+    # pylint: enable=line-too-long
+    return db_grad, db_out
+
+  def testLikeDistBeliefMom01(self):
+    with self.test_session():
+      db_grad, db_out = self._dbParamsMom01()
+      num_samples = len(db_grad)
+      var0 = variables.Variable([0.0] * num_samples)
+      grads0 = constant_op.constant([0.0] * num_samples)
+      mom_opt = momentum_lib.MomentumOptimizer(learning_rate=0.1, momentum=0.1)
+      mom_update = mom_opt.apply_gradients(zip([grads0], [var0]))
+      variables.global_variables_initializer().run()
+      for i in xrange(num_samples):
+        mom_update.run(feed_dict={grads0: db_grad[i]})
+        self.assertAllClose(np.array(db_out[i]), var0.eval())
+
+  def testSparse(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable(array_ops.zeros([4, 2], dtype=dtype))
+        var1 = variables.Variable(constant_op.constant(1.0, dtype, [4, 2]))
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(
+                [[.1, .1]], dtype=dtype),
+            constant_op.constant([1]),
+            constant_op.constant([4, 2]))
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(
+                [[.01, .01], [.01, .01]], dtype=dtype),
+            constant_op.constant([2, 3]),
+            constant_op.constant([4, 2]))
+        mom_opt = momentum_lib.MomentumOptimizer(
+            learning_rate=2.0, momentum=0.9)
+        mom_update = mom_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Check we have slots
+        self.assertEqual(["momentum"], mom_opt.get_slot_names())
+        slot0 = mom_opt.get_slot(var0, "momentum")
+        self.assertEquals(slot0.get_shape(), var0.get_shape())
+        slot1 = mom_opt.get_slot(var1, "momentum")
+        self.assertEquals(slot1.get_shape(), var1.get_shape())
+
+        # Fetch params to validate initial values
+        self.assertAllClose([0, 0], var0.eval()[0])
+        self.assertAllClose([0, 0], var0.eval()[1])
+        self.assertAllClose([1, 1], var1.eval()[2])
+
+        # Step 1: the momentum accumulators are 0. So we should see a normal
+        # update: v -= grad * learning_rate
+        mom_update.run()
+        # Check that the momentum accumulators have been updated.
+        self.assertAllCloseAccordingToType(np.array([0, 0]), slot0.eval()[0])
+        self.assertAllCloseAccordingToType(np.array([.1, .1]), slot0.eval()[1])
+        self.assertAllCloseAccordingToType(
+            np.array([.01, .01]), slot1.eval()[2])
+        # Check that the parameters have been updated.
+        self.assertAllCloseAccordingToType(np.array([0, 0]), var0.eval()[0])
+        self.assertAllCloseAccordingToType(
+            np.array([-(0.1 * 2.0), -(0.1 * 2.0)]), var0.eval()[1])
+        self.assertAllCloseAccordingToType(
+            np.array([1.0 - (0.01 * 2.0), 1.0 - (0.01 * 2.0)]), var1.eval()[2])
+        # Step 2: the momentum accumulators contain the previous update.
+        mom_update.run()
+        # Check that the momentum accumulators have been updated.
+        self.assertAllClose(np.array([0, 0]), slot0.eval()[0])
+        self.assertAllCloseAccordingToType(
+            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]), slot0.eval()[1])
+        self.assertAllCloseAccordingToType(
+            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
+            slot1.eval()[2])
+        # Check that the parameters have been updated.
+        self.assertAllClose(np.array([0, 0]), var0.eval()[0])
+        self.assertAllCloseAccordingToType(
+            np.array([
+                -(0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0), -(0.1 * 2.0) - (
+                    (0.9 * 0.1 + 0.1) * 2.0)
+            ]), var0.eval()[1])
+        self.assertAllCloseAccordingToType(
+            np.array([
+                0.98 - ((0.9 * 0.01 + 0.01) * 2.0), 0.98 - (
+                    (0.9 * 0.01 + 0.01) * 2.0)
+            ]), var1.eval()[2])
+
+  def testSharing(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        mom_opt = momentum_lib.MomentumOptimizer(
+            learning_rate=2.0, momentum=0.9)
+        mom_update1 = mom_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        mom_update2 = mom_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        self.assertEqual(["momentum"], mom_opt.get_slot_names())
+        slot0 = mom_opt.get_slot(var0, "momentum")
+        self.assertEquals(slot0.get_shape(), var0.get_shape())
+        slot1 = mom_opt.get_slot(var1, "momentum")
+        self.assertEquals(slot1.get_shape(), var1.get_shape())
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Step 1: the momentum accumulators where 0. So we should see a normal
+        # update: v -= grad * learning_rate
+        mom_update1.run()
+        # Check that the momentum accumulators have been updated.
+        self.assertAllCloseAccordingToType(np.array([0.1, 0.1]), slot0.eval())
+        self.assertAllCloseAccordingToType(np.array([0.01, 0.01]), slot1.eval())
+        # Check that the parameters have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]), var1.eval())
+        # Step 2: the second momentum accumulators contain the previous update.
+        mom_update2.run()
+        # Check that the momentum accumulators have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]), slot0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]), slot1.eval())
+        # Check that the parameters have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([
+                1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
+                2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
+            ]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([
+                2.98 - ((0.9 * 0.01 + 0.01) * 2.0), 3.98 - (
+                    (0.9 * 0.01 + 0.01) * 2.0)
+            ]), var1.eval())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/optimizer_v2/optimizer_v2.py b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..25d19578ea8c4f53019657ab85950a814d1a47b8
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
@@ -0,0 +1,1352 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Version 2 of class Optimizer."""
+# pylint: disable=g-bad-name
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gradients
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.training import checkpointable
+from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import optimizer as optimizer_v1
+from tensorflow.python.training import slot_creator
+from tensorflow.python.util import nest
+
+
+class _OptimizableVariable(object):
+  """Interface for abstracting over variables in the optimizers."""
+
+  @abc.abstractmethod
+  def target(self):
+    """Returns the optimization target for this variable."""
+    raise NotImplementedError("Calling an abstract method.")
+
+  @abc.abstractmethod
+  def update_op(self, optimizer, g, *args):
+    """Returns the update ops for updating the variable."""
+    raise NotImplementedError("Calling an abstract method.")
+
+
+class _RefVariableProcessor(_OptimizableVariable):
+  """Processor for Variable."""
+
+  def __init__(self, v):
+    self._v = v
+
+  def target(self):
+    return self._v._ref()  # pylint: disable=protected-access
+
+  def update_op(self, optimizer, g, *args):
+    if isinstance(g, ops.Tensor):
+      update_op = optimizer._apply_dense(g, self._v, *args)  # pylint: disable=protected-access
+      if self._v.constraint is not None:
+        with ops.control_dependencies([update_op]):
+          return self._v.assign(self._v.constraint(self._v))
+      else:
+        return update_op
+    else:
+      assert isinstance(g, ops.IndexedSlices), ("Gradient ", g, " is neither a "
+                                                "tensor nor IndexedSlices.")
+      if self._v.constraint is not None:
+        raise RuntimeError(
+            "Cannot use a constraint function on a sparse variable.")
+      # pylint: disable=protected-access
+      return optimizer._apply_sparse_duplicate_indices(g, self._v, *args)
+
+
+class _DenseReadResourceVariableProcessor(_OptimizableVariable):
+  """Processor for dense ResourceVariables."""
+
+  def __init__(self, v):
+    self._v = v
+
+  def target(self):
+    return self._v
+
+  def update_op(self, optimizer, g, *args):
+    # pylint: disable=protected-access
+    update_op = optimizer._resource_apply_dense(g, self._v.op.inputs[0], *args)
+    if self._v.constraint is not None:
+      with ops.control_dependencies([update_op]):
+        return self._v.assign(self._v.constraint(self._v))
+    else:
+      return update_op
+
+
+class _DenseResourceVariableProcessor(_OptimizableVariable):
+  """Processor for dense ResourceVariables."""
+
+  def __init__(self, v):
+    self._v = v
+
+  def target(self):
+    return self._v
+
+  def update_op(self, optimizer, g, *args):
+    # pylint: disable=protected-access
+    if isinstance(g, ops.IndexedSlices):
+      if self._v.constraint is not None:
+        raise RuntimeError(
+            "Cannot use a constraint function on a sparse variable.")
+      return optimizer._resource_apply_sparse_duplicate_indices(
+          g.values, self._v, g.indices, *args)
+    update_op = optimizer._resource_apply_dense(g, self._v, *args)
+    if self._v.constraint is not None:
+      with ops.control_dependencies([update_op]):
+        return self._v.assign(self._v.constraint(self._v))
+    else:
+      return update_op
+
+
+class _StreamingModelPortProcessor(_OptimizableVariable):
+  """Processor for streaming ModelPorts."""
+
+  def __init__(self, v):
+    self._v = v
+
+  def target(self):
+    return self._v
+
+  def update_op(self, optimizer, g, *args):
+    return g
+
+
+class _TensorProcessor(_OptimizableVariable):
+  """Processor for ordinary Tensors.
+
+  Even though a Tensor can't really be updated, sometimes it is useful to
+  compute the gradients with respect to a Tensor using the optimizer. Updating
+  the Tensor is, of course, unsupported.
+  """
+
+  def __init__(self, v):
+    self._v = v
+
+  def target(self):
+    return self._v
+
+  def update_op(self, optimizer, g, *args):
+    raise NotImplementedError("Trying to update a Tensor ", self._v)
+
+
+def _get_processor(v):
+  """The processor of v."""
+  if context.executing_eagerly():
+    if isinstance(v, ops.Tensor):
+      return _TensorProcessor(v)
+    else:
+      return _DenseResourceVariableProcessor(v)
+  if v.op.type == "VarHandleOp":
+    return _DenseResourceVariableProcessor(v)
+  if isinstance(v, variables.Variable):
+    return _RefVariableProcessor(v)
+  if v.op.type == "SubmodelPort":
+    return _StreamingModelPortProcessor(v)
+  if isinstance(v, ops.Tensor):
+    return _TensorProcessor(v)
+  raise NotImplementedError("Trying to optimize unsupported type ", v)
+
+
+def _var_key_v2(var):
+  """Key for representing a primary variable, for looking up slots."""
+  # pylint: disable=protected-access
+  if hasattr(var, "_mirrored_container"):
+    mirrored_container = var._mirrored_container()
+    assert mirrored_container is not None
+    if context.executing_eagerly():
+      return mirrored_container._unique_id
+    return mirrored_container._shared_name
+  if context.executing_eagerly():
+    return var._unique_id
+  return var.op.name
+
+
+def _resolve(value, name):
+  if callable(value):
+    value = value()
+  return ops.convert_to_tensor(value, name=name)
+
+
+def _is_dynamic(value):
+  """Returns true if __init__ arg `value` should be re-evaluated each step."""
+  if callable(value): return True
+  # Don't need to do anything special in graph mode, since dynamic values
+  # will propagate correctly automatically.
+  # TODO(josh11b): Add per-device caching across steps using variables for
+  # truly static values once we add distributed support.
+  if context.executing_eagerly() and isinstance(
+      value, resource_variable_ops.ResourceVariable):
+    return True
+  return False
+
+
+class _OptimizerV2State(object):
+  """Holds per-graph and per-step optimizer state.
+
+  Use _init_with_static_hyper() to create the state for a graph, and then
+  _copy_with_dynamic_hyper() to convert that to state for a particular step.
+  The difference between the two is that the former only has hyper
+  parameter values that are static and the latter also has values that
+  can change every step (according to _is_dynamic()).
+  """
+
+  def __init__(self, op_name):
+    self._op_name = op_name
+
+  def _init_with_static_hyper(self, hyper):
+    """Initialize a fresh state object from hyper dict."""
+    # self._hyper contains a dict from name to a dict with the Tensor values.
+    # This dict starts with a single item with key "None" with the hyper
+    # parameter value converted to a Tensor. Other items have dtype keys
+    # with that Tensor cast to that dtype.
+    self._hyper = {name: {None: ops.convert_to_tensor(value, name=name)}
+                   for name, (dynamic, value) in hyper.items() if not dynamic}
+    self._slots = {}
+    self._non_slot_dict = {}
+    # Extra state to help Optimizers implement Checkpointable. Holds information
+    # about variables which will be restored as soon as they're created.
+    self._deferred_dependencies = {}  # Non-slot variables
+    self._deferred_slot_restorations = {}  # Slot variables
+
+  def _copy_with_dynamic_hyper(self, hyper, distribution, non_slot_devices):
+    """Create a new state object for a particular step."""
+    ret = _OptimizerV2State(self._op_name)
+    # pylint: disable=protected-access
+    ret._slots = self._slots
+    ret._non_slot_dict = self._non_slot_dict
+    ret._deferred_dependencies = self._deferred_dependencies
+    ret._deferred_slot_restorations = self._deferred_slot_restorations
+    ret._hyper = {name: {None: _resolve(value, name)}
+                  for name, (dynamic, value) in hyper.items() if dynamic}
+    ret._hyper.update(self._hyper)
+    ret._non_slot_devices = non_slot_devices
+    ret._distribution = distribution
+    return ret
+
+  def _variables(self):
+    """Returns a list of all variables held by self."""
+    optimizer_variables = list(self._non_slot_dict.values())
+    for variable_dict in self._slots.values():
+      for slot_for_variable in variable_dict.values():
+        optimizer_variables.append(slot_for_variable)
+    # Sort variables by name so that the return is deterministic.
+    return sorted(optimizer_variables, key=lambda v: v.name)
+
+  def _slot_dict(self, slot_name):
+    """Returns a dict for caching slots created under the given name.
+
+    Args:
+      slot_name: Name for the slot.
+
+    Returns:
+      A dict that maps primary `Variable` objects to the slot created
+      for that variable, under the given slot name.
+    """
+    named_slots = self._slots.get(slot_name, None)
+    if named_slots is None:
+      named_slots = {}
+      self._slots[slot_name] = named_slots
+    return named_slots
+
+  def create_slot(self, var, val, slot_name, optional_op_name=None):
+    """Find or create a slot for a variable.
+
+    Args:
+      var: A `Variable` object.
+      val: A `Tensor`.  The initial value of the slot.
+      slot_name: Name for the slot.
+      optional_op_name: Name to use when scoping the Variable that
+        needs to be created for the slot.
+
+    Returns:
+      A `Variable` object.
+    """
+    named_slots = self._slot_dict(slot_name)
+    var_key = _var_key_v2(var)
+    if var_key not in named_slots:
+      new_slot_variable = slot_creator.create_slot(
+          var, val, optional_op_name or self._op_name)
+      self._restore_slot_variable(
+          slot_name=slot_name, variable=var,
+          slot_variable=new_slot_variable)
+      named_slots[var_key] = new_slot_variable
+    return named_slots[var_key]
+
+  def create_slot_with_initializer(self, var, initializer, shape, dtype,
+                                   slot_name, optional_op_name=None):
+    """Find or create a slot for a variable, using an Initializer.
+
+    Args:
+      var: A `Variable` object.
+      initializer: An `Initializer`.  The initial value of the slot.
+      shape: Shape of the initial value of the slot.
+      dtype: Type of the value of the slot.
+      slot_name: Name for the slot.
+      optional_op_name: Name to use when scoping the Variable that
+        needs to be created for the slot.
+
+    Returns:
+      A `Variable` object.
+    """
+    named_slots = self._slot_dict(slot_name)
+    var_key = _var_key_v2(var)
+    if var_key not in named_slots:
+      new_slot_variable = slot_creator.create_slot_with_initializer(
+          var, initializer, shape, dtype, optional_op_name or self._op_name)
+      self._restore_slot_variable(
+          slot_name=slot_name, variable=var,
+          slot_variable=new_slot_variable)
+      named_slots[var_key] = new_slot_variable
+    return named_slots[var_key]
+
+  def zeros_slot(self, var, slot_name, optional_op_name=None):
+    """Find or create a slot initialized with 0.0.
+
+    Args:
+      var: A `Variable` object.
+      slot_name: Name for the slot.
+      optional_op_name: Name to use when scoping the Variable that
+        needs to be created for the slot.
+
+    Returns:
+      A `Variable` object.
+    """
+    named_slots = self._slot_dict(slot_name)
+    var_key = _var_key_v2(var)
+    if var_key not in named_slots:
+      new_slot_variable = slot_creator.create_zeros_slot(
+          var, optional_op_name or self._op_name)
+      self._restore_slot_variable(
+          slot_name=slot_name, variable=var,
+          slot_variable=new_slot_variable)
+      named_slots[var_key] = new_slot_variable
+    return named_slots[var_key]
+
+  def _create_or_restore_slot_variable(
+      self, slot_variable_position, slot_name, variable,
+      optional_op_name=None):
+    """Restore a slot variable's value, possibly creating it.
+
+    Called when a variable which has an associated slot variable is created or
+    restored. When executing eagerly, we create the slot variable with a
+    restoring initializer.
+
+    No new variables are created when graph building. Instead,
+    _restore_slot_variable catches these after normal creation and adds restore
+    ops to the graph. This method is nonetheless important when graph building
+    for the case when a slot variable has already been created but `variable`
+    has just been added to a dependency graph (causing us to realize that the
+    slot variable needs to be restored).
+
+    Args:
+      slot_variable_position: A `checkpointable._CheckpointPosition` object
+        indicating the slot variable `Checkpointable` object to be restored.
+      slot_name: The name of this `Optimizer`'s slot to restore into.
+      variable: The variable object this slot is being created for.
+      optional_op_name: Name to use when scoping the Variable that
+        needs to be created for the slot.
+    """
+    slot_variable = self.get_slot(var=variable, name=slot_name)
+    if (slot_variable is None and context.executing_eagerly() and
+        slot_variable_position.is_simple_variable()):
+      initializer = checkpointable.CheckpointInitialValue(
+          checkpoint_position=slot_variable_position)
+      slot_variable = self.create_slot(
+          var=variable,
+          val=initializer,
+          slot_name=slot_name,
+          optional_op_name=optional_op_name)
+      # Optimizers do not have unconditional dependencies on their slot
+      # variables (nor do any other objects). They are only saved if the
+      # variables they were created for are also saved.
+    if slot_variable is not None:
+      # If we've either made this slot variable, or if we've pulled out an
+      # existing slot variable, we should restore it.
+      slot_variable_position.restore(slot_variable)
+    else:
+      # We didn't make the slot variable. Defer restoring until it gets created
+      # normally. We keep a list rather than the one with the highest restore
+      # UID in case slot variables have their own dependencies, in which case
+      # those could differ between restores.
+      variable_key = _var_key_v2(variable)
+      self._deferred_slot_restorations.setdefault(
+          slot_name, {}).setdefault(variable_key, []).append(
+              slot_variable_position)
+
+  def get_slot(self, var, name):
+    """Return a slot named `name` created for `var` by the Optimizer.
+
+    Some `Optimizer` subclasses use additional variables.  For example
+    `Momentum` and `Adagrad` use variables to accumulate updates.  This method
+    gives access to these `Variable` objects if for some reason you need them.
+
+    Use `get_slot_names()` to get the list of slot names created by the
+    `Optimizer`.
+
+    Args:
+      var: A variable passed to `minimize()` or `apply_gradients()`.
+      name: A string.
+
+    Returns:
+      The `Variable` for the slot if it was created, `None` otherwise.
+    """
+    named_slots = self._slots.get(name, None)
+    if not named_slots:
+      return None
+    return named_slots.get(_var_key_v2(var), None)
+
+  def get_slot_names(self):
+    """Return a list of the names of slots created by the `Optimizer`.
+
+    See `get_slot()`.
+
+    Returns:
+      A list of strings.
+    """
+    return sorted(self._slots.keys())
+
+  def create_non_slot(self, initial_value, name, colocate_with=None):
+    """Add an extra variable, not associated with a slot."""
+    v = self._non_slot_dict.get(name, None)
+    if v is None:
+      if colocate_with is None: colocate_with = self._non_slot_devices
+      with self._distribution.colocate_vars_with(colocate_with):
+        # TODO(josh11b): Use get_variable() except for the legacy Adam use case.
+        v = variable_scope.variable(initial_value, name=name, trainable=False)
+      self._non_slot_dict[name] = v
+      deferred_dependencies_list = self._deferred_dependencies.pop(name, ())
+      for checkpoint_position in sorted(
+          deferred_dependencies_list,
+          key=lambda restore: restore.checkpoint.restore_uid,
+          reverse=True):
+        checkpoint_position.restore(v)
+    return v
+
+  def _restore_slot_variable(self, slot_name, variable, slot_variable):
+    """Restore a newly created slot variable's value."""
+    variable_key = _var_key_v2(variable)
+    deferred_restorations = self._deferred_slot_restorations.get(
+        slot_name, {}).pop(variable_key, [])
+    # Iterate over restores, highest restore UID first to minimize the number
+    # of assignments.
+    deferred_restorations.sort(key=lambda position: position.restore_uid,
+                               reverse=True)
+    for checkpoint_position in deferred_restorations:
+      checkpoint_position.restore(slot_variable)
+
+  def get_non_slot(self, name):
+    """Returns the non-slot variable identified by `name`."""
+    return self._non_slot_dict.get(name, None)
+
+  def get_hyper(self, name, dtype=None):
+    """Returns the `name` hyper parameter, optionally cast to `dtype`."""
+    dtype_dict = self._hyper[name]
+    # Do we have the value cast to dtype already cached? This should always
+    # succeed when dtype is None.
+    if dtype in dtype_dict:
+      return dtype_dict[dtype]
+    # Not cached, cast to dtype and save the result in the cache.
+    result = math_ops.cast(dtype_dict[None], dtype)
+    dtype_dict[dtype] = result
+    return result
+
+
+class OptimizerV2(optimizer_v1.Optimizer):
+  """Updated base class for optimizers.
+
+  This class defines the API to add Ops to train a model.  You never use this
+  class directly, but instead instantiate one of its subclasses such as
+  `GradientDescentOptimizer`, `AdagradOptimizer`, or `MomentumOptimizer`.
+
+  ### Usage
+
+  ```python
+  # Create an optimizer with the desired parameters.
+  opt = GradientDescentOptimizer(learning_rate=0.1)
+  # Add Ops to the graph to minimize a cost by updating a list of variables.
+  # "cost" is a Tensor, and the list of variables contains tf.Variable
+  # objects.
+  opt_op = opt.minimize(cost, var_list=<list of variables>)
+  ```
+
+  In the training program you will just have to run the returned Op.
+
+  ```python
+  # Execute opt_op to do one step of training:
+  opt_op.run()
+  ```
+
+  ### Processing gradients before applying them.
+
+  Calling `minimize()` takes care of both computing the gradients and
+  applying them to the variables.  If you want to process the gradients
+  before applying them you can instead use the optimizer in three steps:
+
+  1.  Compute the gradients with `compute_gradients()`.
+  2.  Process the gradients as you wish.
+  3.  Apply the processed gradients with `apply_gradients()`.
+
+  Example:
+
+  ```python
+  # Create an optimizer.
+  opt = GradientDescentOptimizer(learning_rate=0.1)
+
+  # Compute the gradients for a list of variables.
+  grads_and_vars = opt.compute_gradients(loss, <list of variables>)
+
+  # grads_and_vars is a list of tuples (gradient, variable).  Do whatever you
+  # need to the 'gradient' part, for example cap them, etc.
+  capped_grads_and_vars = [(MyCapper(gv[0]), gv[1]) for gv in grads_and_vars]
+
+  # Ask the optimizer to apply the capped gradients.
+  opt.apply_gradients(capped_grads_and_vars)
+  ```
+
+  ### Gating Gradients
+
+  Both `minimize()` and `compute_gradients()` accept a `gate_gradients`
+  argument that controls the degree of parallelism during the application of
+  the gradients.
+
+  The possible values are: `GATE_NONE`, `GATE_OP`, and `GATE_GRAPH`.
+
+  <b>`GATE_NONE`</b>: Compute and apply gradients in parallel.  This provides
+  the maximum parallelism in execution, at the cost of some non-reproducibility
+  in the results.  For example the two gradients of `matmul` depend on the input
+  values: With `GATE_NONE` one of the gradients could be applied to one of the
+  inputs _before_ the other gradient is computed resulting in non-reproducible
+  results.
+
+  <b>`GATE_OP`</b>: For each Op, make sure all gradients are computed before
+  they are used.  This prevents race conditions for Ops that generate gradients
+  for multiple inputs where the gradients depend on the inputs.
+
+  <b>`GATE_GRAPH`</b>: Make sure all gradients for all variables are computed
+  before any one of them is used.  This provides the least parallelism but can
+  be useful if you want to process all gradients before applying any of them.
+
+  ### Slots
+
+  Some optimizer subclasses, such as `MomentumOptimizer` and `AdagradOptimizer`
+  allocate and manage additional variables associated with the variables to
+  train.  These are called <i>Slots</i>.  Slots have names and you can ask the
+  optimizer for the names of the slots that it uses.  Once you have a slot name
+  you can ask the optimizer for the variable it created to hold the slot value.
+
+  This can be useful if you want to log debug a training algorithm, report stats
+  about the slots, etc.
+
+  ### Non-slot variables
+
+  Some optimizer subclasses, such as `AdamOptimizer` have variables that
+  are not associated with the variables to train, just the step itself.
+
+  ### Hyper parameters
+
+  These are arguments passed to the optimizer subclass constructor
+  (the `__init__` method), and then passed to `self._set_hyper()`.
+  They can be either regular Python values (like 1.0), tensors, or
+  callables. If they are callable, the callable will be called during
+  `apply_gradients()` to get the value for the hyper parameter.
+
+  ### State
+
+  Internal methods apre passed a `state` argument with the correct
+  values to use for the slot and non-slot variables, and the hyper
+  parameters.
+  """
+
+  # Values for gate_gradients.
+  GATE_NONE = 0
+  GATE_OP = 1
+  GATE_GRAPH = 2
+
+  def __init__(self, use_locking, name):
+    """Create a new Optimizer.
+
+    This must be called by the constructors of subclasses.
+    Note that Optimizer instances should not bind to a single graph,
+    and so shouldn't keep Tensors as member variables. Generally
+    you should be able to use the _set_hyper()/state.get_hyper()
+    facility instead.
+
+    Args:
+      use_locking: Bool. If True apply use locks to prevent concurrent updates
+        to variables.
+      name: A non-empty string.  The name to use for accumulators created
+        for the optimizer.
+
+    Raises:
+      ValueError: If name is malformed.
+      RuntimeError: If _create_slots has been overridden instead of
+          _create_vars.
+    """
+    # Note: We intentionally don't call parent __init__.
+
+    # Optimizer._create_slots was replaced by _create_vars in OptimizerV2.
+    if (self.__class__._create_slots.__code__ is not  # pylint: disable=protected-access
+        OptimizerV2._create_slots.__code__):
+      raise RuntimeError("Override _create_vars instead of _create_slots when "
+                         "descending from OptimizerV2 (class %s)" %
+                         self.__class__.__name__)
+    if not name:
+      raise ValueError("Must specify the optimizer name")
+
+    self._use_locking = use_locking
+    self._name = name
+    # Map from graph_key to state for that graph. We use the graph_key
+    # since it works in both eager and graph mode, and gives the outer
+    # graph inside functions.
+    tower_context = distribute_lib.get_tower_context()
+    if tower_context is None:
+      # In a cross-tower context for a DistributionStrategy, which means
+      # only one Optimizer will be created, not one per tower.
+      self._per_graph_state = {}
+    else:
+      # We use get_tower_context().merge_call() to get a single dict
+      # shared across all model replicas when running with a
+      # DistributionStrategy.
+      self._per_graph_state = tower_context.merge_call(lambda _: {})
+
+    # Hyper parameters, and whether they should be re-evaluated every step.
+    self._hyper = {}
+
+  def _set_hyper(self, name, value):
+    self._hyper[name] = (_is_dynamic(value), value)
+
+  def minimize(self, loss, global_step=None, var_list=None,
+               gate_gradients=GATE_OP, aggregation_method=None,
+               colocate_gradients_with_ops=False, name=None,
+               grad_loss=None, stop_gradients=None,
+               scale_loss_by_num_towers=None):
+    """Add operations to minimize `loss` by updating `var_list`.
+
+    This method simply combines calls `compute_gradients()` and
+    `apply_gradients()`. If you want to process the gradient before applying
+    them call `compute_gradients()` and `apply_gradients()` explicitly instead
+    of using this function.
+
+    Args:
+      loss: A `Tensor` containing the value to minimize.
+      global_step: Optional `Variable` to increment by one after the
+        variables have been updated.
+      var_list: Optional list or tuple of `Variable` objects to update to
+        minimize `loss`.  Defaults to the list of variables collected in
+        the graph under the key `GraphKeys.TRAINABLE_VARIABLES`.
+      gate_gradients: How to gate the computation of gradients.  Can be
+        `GATE_NONE`, `GATE_OP`, or  `GATE_GRAPH`.
+      aggregation_method: Specifies the method used to combine gradient terms.
+        Valid values are defined in the class `AggregationMethod`.
+      colocate_gradients_with_ops: If True, try colocating gradients with
+        the corresponding op.
+      name: Optional name for the returned operation.
+      grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`.
+      stop_gradients: Optional. A Tensor or list of tensors not to differentiate
+        through.
+      scale_loss_by_num_towers: Optional boolean. If true, scale the loss
+        down by the number of towers. By default, auto-detects whether this
+        is needed.
+
+    Returns:
+      An Operation that updates the variables in `var_list`.  If `global_step`
+      was not `None`, that operation also increments `global_step`.
+
+    Raises:
+      ValueError: If some of the variables are not `Variable` objects.
+
+    @compatibility(eager)
+    When eager execution is enabled, `loss` should be a Python function that
+    takes elements of `var_list` as arguments and computes the value to be
+    minimized. If `var_list` is None, `loss` should take no arguments.
+    Minimization (and gradient computation) is done with respect to the
+    elements of `var_list` if not None, else with respect to any trainable
+    variables created during the execution of the `loss` function.
+    `gate_gradients`, `aggregation_method`, `colocate_gradients_with_ops` and
+    `grad_loss` are ignored when eager execution is enabled.
+    @end_compatibility
+    """
+    grads_and_vars = self.compute_gradients(
+        loss, var_list=var_list, gate_gradients=gate_gradients,
+        aggregation_method=aggregation_method,
+        colocate_gradients_with_ops=colocate_gradients_with_ops,
+        grad_loss=grad_loss, stop_gradients=stop_gradients,
+        scale_loss_by_num_towers=scale_loss_by_num_towers)
+
+    vars_with_grad = [v for g, v in grads_and_vars if g is not None]
+    if not vars_with_grad:
+      raise ValueError(
+          "No gradients provided for any variable, check your graph for ops"
+          " that do not support gradients, between variables %s and loss %s." %
+          ([str(v) for _, v in grads_and_vars], loss))
+
+    return self.apply_gradients(grads_and_vars, global_step=global_step,
+                                name=name)
+
+  def compute_gradients(self, loss, var_list=None,
+                        gate_gradients=GATE_OP,
+                        aggregation_method=None,
+                        colocate_gradients_with_ops=False,
+                        grad_loss=None, stop_gradients=None,
+                        scale_loss_by_num_towers=None):
+    """Compute gradients of `loss` for the variables in `var_list`.
+
+    This is the first part of `minimize()`.  It returns a list
+    of (gradient, variable) pairs where "gradient" is the gradient
+    for "variable".  Note that "gradient" can be a `Tensor`, an
+    `IndexedSlices`, or `None` if there is no gradient for the
+    given variable.
+
+    Args:
+      loss: A Tensor containing the value to minimize or a callable taking
+        no arguments which returns the value to minimize. When eager execution
+        is enabled it must be a callable.
+      var_list: Optional list or tuple of `tf.Variable` to update to minimize
+        `loss`.  Defaults to the list of variables collected in the graph
+        under the key `GraphKeys.TRAINABLE_VARIABLES`.
+      gate_gradients: How to gate the computation of gradients.  Can be
+        `GATE_NONE`, `GATE_OP`, or `GATE_GRAPH`.
+      aggregation_method: Specifies the method used to combine gradient terms.
+        Valid values are defined in the class `AggregationMethod`.
+      colocate_gradients_with_ops: If True, try colocating gradients with
+        the corresponding op.
+      grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`.
+      stop_gradients: Optional. A Tensor or list of tensors not to differentiate
+        through.
+      scale_loss_by_num_towers: Optional boolean. If true, scale the loss
+        down by the number of towers. By default, auto-detects whether this
+        is needed.
+
+    Returns:
+      A list of (gradient, variable) pairs. Variable is always present, but
+      gradient can be `None`.
+
+    Raises:
+      TypeError: If `var_list` contains anything else than `Variable` objects.
+      ValueError: If some arguments are invalid.
+      RuntimeError: If called with eager execution enabled and `loss` is
+        not callable.
+
+    @compatibility(eager)
+    When eager execution is enabled, `gate_gradients`, `aggregation_method`,
+    and `colocate_gradients_with_ops` are ignored.
+    @end_compatibility
+    """
+    # TODO(josh11b): Test that we handle weight decay in a reasonable way.
+    if callable(loss):
+      with backprop.GradientTape() as tape:
+        if var_list is not None:
+          tape.watch(var_list)
+        loss_value = loss()
+
+        # Scale loss for number of towers (callable-loss case). In this case,
+        # we have to be careful to call distribute_lib.get_loss_reduction()
+        # *after* loss() is evaluated, so we know what loss reduction it uses.
+        if scale_loss_by_num_towers is None:
+          scale_loss_by_num_towers = (
+              distribute_lib.get_loss_reduction() == "mean")
+        if scale_loss_by_num_towers:
+          num_towers = distribute_lib.get_distribution_strategy().num_towers
+          if num_towers > 1:
+            loss_value *= 1. / num_towers
+
+      if var_list is None:
+        var_list = tape.watched_variables()
+      grads = tape.gradient(loss_value, var_list, grad_loss)
+      return list(zip(grads, var_list))
+    if context.executing_eagerly():
+      raise RuntimeError(
+          "`loss` passed to Optimizer.compute_gradients should "
+          "be a function when eager execution is enabled.")
+
+    # Scale loss for number of towers (non-callable-loss case).
+    if scale_loss_by_num_towers is None:
+      scale_loss_by_num_towers = (
+          distribute_lib.get_loss_reduction() == "mean")
+    if scale_loss_by_num_towers:
+      num_towers = distribute_lib.get_distribution_strategy().num_towers
+      if num_towers > 1:
+        loss *= 1. / num_towers
+
+    if gate_gradients not in [optimizer_v1.Optimizer.GATE_NONE,
+                              optimizer_v1.Optimizer.GATE_OP,
+                              optimizer_v1.Optimizer.GATE_GRAPH]:
+      raise ValueError("gate_gradients must be one of: Optimizer.GATE_NONE, "
+                       "Optimizer.GATE_OP, Optimizer.GATE_GRAPH.  Not %s" %
+                       gate_gradients)
+    self._assert_valid_dtypes([loss])
+    if grad_loss is not None:
+      self._assert_valid_dtypes([grad_loss])
+    if var_list is None:
+      var_list = (
+          variables.trainable_variables() +
+          ops.get_collection(ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))
+    else:
+      var_list = nest.flatten(var_list)
+    # pylint: disable=protected-access
+    var_list += ops.get_collection(ops.GraphKeys._STREAMING_MODEL_PORTS)
+    # pylint: enable=protected-access
+    processors = [_get_processor(v) for v in var_list]
+    if not var_list:
+      raise ValueError("No variables to optimize.")
+    var_refs = [p.target() for p in processors]
+    grads = gradients.gradients(
+        loss, var_refs, grad_ys=grad_loss,
+        gate_gradients=(gate_gradients == optimizer_v1.Optimizer.GATE_OP),
+        aggregation_method=aggregation_method,
+        colocate_gradients_with_ops=colocate_gradients_with_ops,
+        stop_gradients=stop_gradients)
+    if gate_gradients == optimizer_v1.Optimizer.GATE_GRAPH:
+      grads = control_flow_ops.tuple(grads)
+    grads_and_vars = list(zip(grads, var_list))
+    self._assert_valid_dtypes(
+        [v for g, v in grads_and_vars
+         if g is not None and v.dtype != dtypes.resource])
+    return grads_and_vars
+
+  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
+    """Apply gradients to variables.
+
+    This is the second part of `minimize()`. It returns an `Operation` that
+    applies gradients.
+
+    Args:
+      grads_and_vars: List of (gradient, variable) pairs as returned by
+        `compute_gradients()`.
+      global_step: Optional `Variable` to increment by one after the
+        variables have been updated.
+      name: Optional name for the returned operation.  Default to the
+        name passed to the `Optimizer` constructor.
+
+    Returns:
+      An `Operation` that applies the specified gradients. If `global_step`
+      was not None, that operation also increments `global_step`.
+
+    Raises:
+      TypeError: If `grads_and_vars` is malformed.
+      ValueError: If none of the variables have gradients.
+    """
+    # This is a default implementation of apply_gradients() that can be shared
+    # by most optimizers.  It relies on the subclass implementing the following
+    # methods: _create_vars(), _prepare(), _apply_dense(), and _apply_sparse().
+
+    # Filter out variables with gradients of `None`.
+    grads_and_vars = tuple(grads_and_vars)  # Make sure repeat iteration works.
+    if not grads_and_vars:
+      raise ValueError("No variables provided.")
+    filtered = tuple((g, v) for (g, v) in grads_and_vars if g is not None)
+    if not filtered:
+      raise ValueError("No gradients provided for any variable: %s." %
+                       ([str(v) for _, v in grads_and_vars],))
+    return distribute_lib.get_tower_context().merge_call(
+        self._distributed_apply, filtered, global_step=global_step, name=name)
+
+  def _get_or_create_state(self, var_list=None):
+    """Either looks up or creates `_OptimizerV2State`.
+
+    If any variables are available, they should be passed via the `var_list`
+    argument, and these will be used to determine the graph to create/retrieve
+    state for. Otherwise the returned state is for the current default graph.
+
+    Args:
+      var_list: A list of variables to extract a graph from.
+
+    Returns:
+      An `_OptimizerV2State` object.
+    """
+    # Determine the graph_key from the current graph.
+    eager_execution = context.executing_eagerly()
+    if eager_execution or var_list is None:
+      graph = ops.get_default_graph()
+    else:
+      graph = ops._get_graph_from_inputs(var_list)  # pylint: disable=protected-access
+    assert graph is not None
+    graph_key = graph._graph_key  # pylint: disable=protected-access
+
+    # Get the per graph state by looking up the graph_key.
+    if graph_key in self._per_graph_state:
+      per_graph_state = self._per_graph_state[graph_key]
+    else:
+      per_graph_state = _OptimizerV2State(self._name)
+      per_graph_state._init_with_static_hyper(self._hyper)  # pylint: disable=protected-access
+      self._per_graph_state[graph_key] = per_graph_state
+    return per_graph_state
+
+  def _distributed_apply(self, distribution, grads_and_vars, global_step, name):
+    """`apply_gradients` for use with a `DistributionStrategy`."""
+    reduced_grads = distribution.batch_reduce("sum", grads_and_vars)
+    var_list = [v for _, v in grads_and_vars]
+    grads_and_vars = zip(reduced_grads, var_list)
+
+    unwrapped_var_list = [x for v in var_list for x in distribution.unwrap(v)]
+    eager_execution = context.executing_eagerly()
+    if eager_execution:
+      # Give a clear error in this case instead of "name not supported
+      # for Eager Tensors" when we compute non_slot_devices.
+      for v in unwrapped_var_list:
+        if isinstance(v, ops.Tensor):
+          raise NotImplementedError("Trying to update a Tensor ", v)
+
+    with ops.name_scope(name, self._name) as name:
+      per_graph_state = self._get_or_create_state(var_list=unwrapped_var_list)
+      # Include the current value of any dynamic hyper parameters in `state`.
+      non_slot_devices = distribution.non_slot_devices(var_list)
+      state = per_graph_state._copy_with_dynamic_hyper(  # pylint: disable=protected-access
+          self._hyper, distribution, non_slot_devices)
+
+    # Create any slot and non-slot variables we need in `state`.
+    with ops.init_scope():
+      self._create_vars(var_list, state)
+
+    with ops.name_scope(name):  # Re-enter name_scope created above
+      # Give the child class a chance to do something before we start
+      # applying gradients.
+      self._prepare(state)
+
+      def update(v, g):
+        """Update variable `v` using gradient `g`."""
+        assert v is not None
+
+        # Convert the grad to Tensor or IndexedSlices if necessary, and
+        # look up a processor for each variable's type.
+        try:
+          g = ops.convert_to_tensor_or_indexed_slices(g)
+        except TypeError:
+          raise TypeError(
+              "Gradient must be convertible to a Tensor"
+              " or IndexedSlices, or None: %s" % g)
+        if not isinstance(g, (ops.Tensor, ops.IndexedSlices)):
+          raise TypeError(
+              "Gradient must be a Tensor, IndexedSlices, or None: %s" % g)
+        processor = _get_processor(v)
+
+        # We colocate all ops created in _apply_dense or _apply_sparse
+        # on the same device as the variable.
+        # TODO(apassos): figure out how to get the variable name here.
+        scope_name = "" if eager_execution else v.op.name
+        # device_policy is set because non-mirrored tensors will be read in
+        # `update_op`.
+        # TODO(josh11b): Make different state objects for each device to
+        # avoid needing to set the device_policy.
+        with ops.name_scope("update_" + scope_name), \
+            context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
+          return processor.update_op(self, g, state)
+
+      # Use the processors to update the variables.
+      update_ops = []
+      for grad, var in grads_and_vars:
+        update_ops.extend(distribution.unwrap(distribution.update(
+            var, update, grad)))
+
+      # Give the child class a chance to do something after applying
+      # gradients
+      def finish():
+        # TODO(josh11b): Make different state objects for each device to
+        # avoid needing to set the device_policy.
+        with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
+          return self._finish(state)
+
+      update_ops = control_flow_ops.group(update_ops)
+      with ops.control_dependencies([update_ops]):
+        finish_updates = distribution.update_non_slot(non_slot_devices, finish)
+      if finish_updates is None:
+        finish_updates = update_ops
+
+      # Update `global_step` (if any).
+      if global_step is None:
+        apply_updates = distribution.group(finish_updates, name=name)
+      else:
+        with ops.control_dependencies(distribution.unwrap(finish_updates)):
+
+          def update_global_step(global_step):
+            if isinstance(global_step, resource_variable_ops.ResourceVariable):
+              return global_step.assign_add(
+                  ops.convert_to_tensor(1, dtype=global_step.dtype),
+                  read_value=False)
+            else:
+              return state_ops.assign_add(global_step, 1)
+
+          apply_updates = distribution.group(
+              distribution.update(global_step, update_global_step), name=name)
+
+      # Add the training op to the TRAIN_OP graph collection in graph mode.
+      if not eager_execution:
+        if isinstance(apply_updates, ops.Tensor):
+          apply_updates = apply_updates.op
+        train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
+        if apply_updates not in train_op:
+          train_op.append(apply_updates)
+
+      return apply_updates
+
+  def get_slot(self, var, name):
+    """Return a slot named `name` created for `var` by the Optimizer.
+
+    Some `Optimizer` subclasses use additional variables.  For example
+    `Momentum` and `Adagrad` use variables to accumulate updates.  This method
+    gives access to these `Variable` objects if for some reason you need them.
+
+    Use `get_slot_names()` to get the list of slot names created by the
+    `Optimizer`.
+
+    Args:
+      var: A variable passed to `minimize()` or `apply_gradients()`.
+      name: A string.
+
+    Returns:
+      The `Variable` for the slot if it was created, `None` otherwise.
+    """
+    state = self._get_state_for_var(var)
+    return state.get_slot(var, name) if state is not None else None
+
+  def get_slot_names(self):
+    """Return a list of the names of slots created by the `Optimizer`.
+
+    See `get_slot()`.
+
+    Returns:
+      A list of strings.
+    """
+    state = self._get_per_graph_state()
+    return state.get_slot_names() if state is not None else []
+
+  def variables(self):
+    """A list of variables which encode the current state of `Optimizer`.
+
+    Includes slot variables and additional global variables created by the
+    optimizer in the current default graph.
+
+    Returns:
+      A list of variables.
+    """
+    state = self._get_per_graph_state()
+    return state._variables() if state is not None else []  # pylint: disable=protected-access
+
+  # --------------
+  # Methods to be implemented by subclasses if they want to use the
+  # inherited implementation of apply_gradients() or compute_gradients().
+  # --------------
+  def _create_vars(self, var_list, state):
+    """Create all slots needed by the variables and any non-slot variables.
+
+    Args:
+      var_list: A list of `Variable` objects.
+      state: An object with these methods:
+        `create_slot(var, val, slot_name, optional_op_name)`,
+        `create_slot_with_initializer(`
+            `var, initializer, shape, dtype, slot_name, optional_op_name)`,
+        `zeros_slot(var, slot_name, optional_op_name)`,
+        `create_non_slot_variable(initial_value, name, colocate_with)`,
+        `get_hyper(name)`
+    """
+    # No slots needed by default
+    pass
+
+  def _prepare(self, state):
+    """Code to execute before applying gradients.
+
+    Note that most uses of _prepare() in Optimizer have been subsumed
+    by explicit support for hyper parameters in OptimizerV2
+
+    Args:
+      state: An object with a `get_hyper(name)` method.
+
+    Returns:
+      Return value will be ignored.
+    """
+    pass
+
+  def _apply_dense(self, grad, var, state):
+    """Add ops to apply dense gradients to `var`.
+
+    Args:
+      grad: A `Tensor`.
+      var: A `Variable` object.
+      state: An object with `get_slot(var, name)`, `get_non_slot(self, name)`,
+        and `get_hyper(name)` methods.
+
+    Returns:
+      An `Operation`.
+    """
+    raise NotImplementedError()
+
+  def _resource_apply_dense(self, grad, handle, state):
+    """Add ops to apply dense gradients to the variable `handle`.
+
+    Args:
+      grad: a `Tensor` representing the gradient.
+      handle: a `Tensor` of dtype `resource` which points to the variable
+       to be updated.
+      state: An object with `get_slot(var, name)`, `get_non_slot(self, name)`,
+        and `get_hyper(name)` methods.
+
+    Returns:
+      An `Operation` which updates the value of the variable.
+    """
+    raise NotImplementedError()
+
+  def _resource_apply_sparse_duplicate_indices(
+      self, grad, handle, indices, state):
+    """Add ops to apply sparse gradients to `handle`, with repeated indices.
+
+    Optimizers which override this method must deal with repeated indices. See
+    the docstring of `_apply_sparse_duplicate_indices` for details. By default
+    the correct behavior, to sum non-unique indices and their associated
+    gradients, is enforced by first pre-processing `grad` and `indices` and
+    passing them on to `_resource_apply_sparse`. Optimizers which deal correctly
+    with duplicate indices may instead override this method to avoid the
+    overhead of summing.
+
+    Args:
+      grad: a `Tensor` representing the gradient for the affected indices.
+      handle: a `Tensor` of dtype `resource` which points to the variable
+       to be updated.
+      indices: a `Tensor` of integral type representing the indices for
+       which the gradient is nonzero. Indices may be repeated.
+      state: An object with `get_slot(var, name)`, `get_non_slot(self, name)`,
+        and `get_hyper(name)` methods.
+
+    Returns:
+      An `Operation` which updates the value of the variable.
+    """
+    # pylint: disable=protected-access
+    summed_grad, unique_indices = optimizer_v1._deduplicate_indexed_slices(
+        values=grad, indices=indices)
+    # pylint: enable=protected-access
+    return self._resource_apply_sparse(
+        summed_grad, handle, unique_indices, state)
+
+  def _resource_apply_sparse(self, grad, handle, indices, state):
+    """Add ops to apply sparse gradients to the variable `handle`.
+
+    Similar to `_apply_sparse`, the `indices` argument to this method has been
+    de-duplicated. Optimizers which deal correctly with non-unique indices may
+    instead override `_resource_apply_sparse_duplicate_indices` to avoid this
+    overhead.
+
+    Args:
+      grad: a `Tensor` representing the gradient for the affected indices.
+      handle: a `Tensor` of dtype `resource` which points to the variable
+       to be updated.
+      indices: a `Tensor` of integral type representing the indices for
+       which the gradient is nonzero. Indices are unique.
+      state: An object with `get_slot(var, name)`, `get_non_slot(self, name)`,
+        and `get_hyper(name)` methods.
+
+    Returns:
+      An `Operation` which updates the value of the variable.
+    """
+    raise NotImplementedError()
+
+  def _apply_sparse_duplicate_indices(self, grad, var, state):
+    """Add ops to apply sparse gradients to `var`, with repeated sparse indices.
+
+    Optimizers which override this method must deal with IndexedSlices objects
+    such as the following:
+
+      IndexedSlicesValue(values=[1, 1], indices=[0, 0], dense_shape=[1])
+
+    The correct interpretation is:
+
+      IndexedSlicesValue(values=[2], indices=[0], dense_shape=[1])
+
+    Many optimizers deal incorrectly with repeated indices when updating based
+    on sparse gradients (e.g. summing squares rather than squaring the sum, or
+    applying momentum terms multiple times). Adding first is always the correct
+    behavior, so this is enforced here by reconstructing the IndexedSlices to
+    have only unique indices, then calling _apply_sparse.
+
+    Optimizers which deal correctly with repeated indices may instead override
+    this method to avoid the overhead of summing indices.
+
+    Args:
+      grad: `IndexedSlices`.
+      var: A `Variable` object.
+      state: An object with `get_slot(var, name)`, `get_non_slot(self, name)`,
+        and `get_hyper(name)` methods.
+
+    Returns:
+      An `Operation`.
+    """
+    # pylint: disable=protected-access
+    summed_values, unique_indices = optimizer_v1._deduplicate_indexed_slices(
+        values=grad.values, indices=grad.indices)
+    # pylint: enable=protected-access
+    gradient_no_duplicate_indices = ops.IndexedSlices(
+        indices=unique_indices,
+        values=summed_values,
+        dense_shape=grad.dense_shape)
+    return self._apply_sparse(gradient_no_duplicate_indices, var, state)
+
+  def _apply_sparse(self, grad, var, state):
+    """Add ops to apply sparse gradients to `var`.
+
+    The IndexedSlices object passed to `grad` in this function is by default
+    pre-processed in `_apply_sparse_duplicate_indices` to remove duplicate
+    indices (see its docstring for details). Optimizers which can tolerate or
+    have correct special cases for duplicate sparse indices may override
+    `_apply_sparse_duplicate_indices` instead of this function, avoiding that
+    overhead.
+
+    Args:
+      grad: `IndexedSlices`, with no repeated indices.
+      var: A `Variable` object.
+      state: An object with `get_slot(var, name)`, `get_non_slot(self, name)`,
+        and `get_hyper(name)` methods.
+
+    Returns:
+      An `Operation`.
+    """
+    raise NotImplementedError()
+
+  def _finish(self, state):
+    """Do what is needed to finish the update.
+
+    This is called inside a scope colocated with any non-slot variables.
+
+    Args:
+      state: An object with `get_slot(var, name)`, `get_non_slot(self, name)`,
+        and `get_hyper(name)` methods.
+
+    Returns:
+      The operation to apply updates, or None if no updates.
+    """
+    return None
+
+  # --------------
+  # Utility methods for subclasses.
+  # --------------
+  def _get_per_graph_state(self):
+    # pylint: disable=protected-access
+    return self._per_graph_state.get(ops.get_default_graph()._graph_key, None)
+
+  def _get_state_for_var(self, var):
+    # pylint: disable=protected-access
+    return self._per_graph_state.get(var._graph_key, None)
+
+  # --------------
+  # Overridden methods from Checkpointable.
+  # --------------
+
+  def _track_checkpointable(self, *args, **kwargs):
+    """Optimizers may not track dependencies. Raises an error."""
+    raise NotImplementedError(
+        "Optimizers may not have dependencies. File a feature request if this "
+        "limitation bothers you.")
+
+  @property
+  def _checkpoint_dependencies(self):
+    """From Checkpointable. Gather graph-specific non-slot variables to save."""
+    current_graph_non_slot_variables = []
+    state = self._get_per_graph_state()
+    if state is not None:
+      for name, variable_object in sorted(
+          state._non_slot_dict.items(),  # pylint: disable=protected-access
+          # Avoid comparing variables
+          key=lambda item: item[0]):
+        current_graph_non_slot_variables.append(
+            checkpointable.CheckpointableReference(
+                name=name, ref=variable_object))
+    # Note: ignores super(); Optimizers may not have any dependencies outside of
+    # state objects.
+    return current_graph_non_slot_variables
+
+  def _lookup_dependency(self, name):
+    """From Checkpointable. Find a non-slot variable in the current graph."""
+    state = self._get_per_graph_state()
+    if state is None:
+      return None
+    else:
+      return state.get_non_slot(name)
+
+  @property
+  def _deferred_dependencies(self):
+    """Lets Checkpointable know where non-slot variables are created.
+
+    If necessary, creates a new state object for the current default graph.
+    Checkpointable will then add entries to that state's deferred dependency
+    dictionary. The state object will check that dictionary when creating
+    non-slot variables, restoring their value if an entry is found.
+
+    Returns:
+      A dictionary which holds deferred dependencies for the current default
+      graph.
+    """
+    state = self._get_or_create_state()
+    return state._deferred_dependencies  # pylint: disable=protected-access
+
+  def _create_or_restore_slot_variable(
+      self, slot_variable_position, slot_name, variable):
+    """Checkpointable: Restore a slot variable's value, possibly creating it.
+
+    Called when a variable which has an associated slot variable is created or
+    restored.
+
+    Args:
+      slot_variable_position: A `checkpointable._CheckpointPosition` object
+        indicating the slot variable `Checkpointable` object to be restored.
+      slot_name: The name of this `Optimizer`'s slot to restore into.
+      variable: The variable object this slot is being created for.
+    """
+    state = self._get_or_create_state(var_list=[variable])
+    state._create_or_restore_slot_variable(  # pylint: disable=protected-access
+        slot_variable_position=slot_variable_position,
+        slot_name=slot_name,
+        variable=variable,
+        optional_op_name=self._name)
+
+  # --------------
+  # Unsupported parent methods
+  # --------------
+  def _slot_dict(self, slot_name):
+    raise NotImplementedError(
+        "_slot_dict() method unsupported in OptimizerV2")
+
+  def _get_or_make_slot(self, var, val, slot_name, op_name):
+    raise NotImplementedError(
+        "_get_or_make_slot() method unsupported in OptimizerV2")
+
+  def _get_or_make_slot_with_initializer(self, var, initializer, shape, dtype,
+                                         slot_name, op_name):
+    raise NotImplementedError(
+        "_get_or_make_slot_with_initializer() method unsupported in "
+        "OptimizerV2")
+
+  def _create_non_slot_variable(self, initial_value, name, colocate_with):
+    raise NotImplementedError(
+        "_create_non_slot_variable() method unsupported in OptimizerV2")
+
+  def _get_non_slot_variable(self, name, graph=None):
+    raise NotImplementedError(
+        "_get_non_slot_variable() method unsupported in OptimizerV2")
+
+  def _non_slot_variables(self):
+    raise NotImplementedError(
+        "_non_slot_variables() method unsupported in OptimizerV2")
diff --git a/tensorflow/contrib/optimizer_v2/optimizer_v2_symbols.py b/tensorflow/contrib/optimizer_v2/optimizer_v2_symbols.py
new file mode 100644
index 0000000000000000000000000000000000000000..24eada06ccdd68090f44c62646040fcd7d659727
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/optimizer_v2_symbols.py
@@ -0,0 +1,42 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Distribution-aware version of Optimizer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=unused-import
+from tensorflow.contrib.optimizer_v2.adadelta import AdadeltaOptimizer
+from tensorflow.contrib.optimizer_v2.adagrad import AdagradOptimizer
+from tensorflow.contrib.optimizer_v2.adam import AdamOptimizer
+from tensorflow.contrib.optimizer_v2.gradient_descent import GradientDescentOptimizer
+from tensorflow.contrib.optimizer_v2.momentum import MomentumOptimizer
+from tensorflow.contrib.optimizer_v2.optimizer_v2 import OptimizerV2
+from tensorflow.contrib.optimizer_v2.rmsprop import RMSPropOptimizer
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = [
+    'AdadeltaOptimizer',
+    'AdagradOptimizer',
+    'AdamOptimizer',
+    'GradientDescentOptimizer',
+    'MomentumOptimizer',
+    'OptimizerV2',
+    'RMSPropOptimizer',
+]
+
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/optimizer_v2/optimizer_v2_test.py b/tensorflow/contrib/optimizer_v2/optimizer_v2_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..8599af32f6f4cc5529cd812e83c02ef3812cb71e
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/optimizer_v2_test.py
@@ -0,0 +1,294 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional test for OptimizerV2."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.optimizer_v2 import gradient_descent
+from tensorflow.contrib.optimizer_v2 import optimizer_v2
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class OptimizerTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testBasic(self):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      # Note that we name the variables uniquely here since the variables don't
+      # seem to be getting deleted at the end of the loop.
+      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype,
+                                                    name='a_%d' % i)
+      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype,
+                                                    name='b_%d' % i)
+      def loss():
+        return 5 * var0 + 3 * var1  # pylint: disable=cell-var-from-loop
+      # Note that for eager execution, minimize expects a function instead of a
+      # Tensor.
+      global_step = resource_variable_ops.ResourceVariable(
+          array_ops.zeros([], dtypes.int64), name='global_step_%d' % i)
+      sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
+
+      self.evaluate(variables.global_variables_initializer())
+      # Fetch params to validate initial values
+      self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+      self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+      # Run 1 step of sgd through optimizer
+      opt_op = sgd_op.minimize(loss, global_step, [var0, var1])
+      self.evaluate(opt_op)
+      # Validate updated params
+      self.assertAllClose([-14., -13.], self.evaluate(var0))
+      self.assertAllClose([-6., -5.], self.evaluate(var1))
+
+  def testAggregationMethod(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        cost = 5 * var0 + 3 * var1
+        global_step = variables.Variable(
+            array_ops.zeros([], dtypes.int64), name='global_step')
+        sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
+        opt_op = sgd_op.minimize(
+            cost,
+            global_step, [var0, var1],
+            aggregation_method=gradients_impl.AggregationMethod.
+            EXPERIMENTAL_ACCUMULATE_N)
+
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Run 1 step of sgd through optimizer
+        opt_op.run()
+        # Validate updated params
+        self.assertAllClose([-14., -13.], var0.eval())
+        self.assertAllClose([-6., -5.], var1.eval())
+
+  def testPrecomputedGradient(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        cost = 5 * var0 + 3 * var1
+        grad_loss = constant_op.constant([42, -42], dtype=dtype)
+        global_step = variables.Variable(
+            array_ops.zeros([], dtypes.int64), name='global_step')
+        sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
+        opt_op = sgd_op.minimize(
+            cost, global_step, [var0, var1], grad_loss=grad_loss)
+
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Run 1 step of sgd through optimizer
+        opt_op.run()
+        # Validate updated params
+        self.assertAllClose([1.0 - 3 * 5 * 42.0, 2.0 - 3 * 5 * (-42.0)],
+                            var0.eval())
+        self.assertAllClose([3.0 - 3 * 3 * 42.0, 4.0 - 3 * 3 * (-42.0)],
+                            var1.eval())
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testNoVariables(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      # pylint: disable=cell-var-from-loop
+      def loss():
+        var0 = resource_variable_ops.ResourceVariable(
+            [1.0, 2.0], dtype=dtype, trainable=False, name='a')
+        var1 = resource_variable_ops.ResourceVariable(
+            [3.0, 4.0], dtype=dtype, trainable=False, name='b')
+        return 5 * var0 + var1
+      # pylint: enable=cell-var-from-loop
+      sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
+      with self.assertRaisesRegexp(ValueError, 'No.*variables'):
+        sgd_op.minimize(loss)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testNoGradients(self):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      # Note that we name the variables uniquely here since the variables don't
+      # seem to be getting deleted at the end of the loop.
+      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype,
+                                                    name='a%d' % i)
+      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype,
+                                                    name='b%d' % i)
+      # pylint: disable=cell-var-from-loop
+      def loss():
+        return 5 * var0
+      # pylint: enable=cell-var-from-loop
+      sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
+      with self.assertRaisesRegexp(ValueError, 'No gradients'):
+        # var1 has no gradient
+        sgd_op.minimize(loss, var_list=[var1])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testNoGradientsForAnyVariables_Minimize(self):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      # Note that we name the variables uniquely here since the variables don't
+      # seem to be getting deleted at the end of the loop.
+      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype,
+                                                    name='a_%d' % i)
+      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype,
+                                                    name='b_%d' % i)
+      def loss():
+        return constant_op.constant(5.0)
+      sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
+      with self.assertRaisesRegexp(ValueError,
+                                   'No gradients provided for any variable'):
+        sgd_op.minimize(loss, var_list=[var0, var1])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testNoGradientsForAnyVariables_ApplyGradients(self):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      # Note that we name the variables uniquely here since the variables don't
+      # seem to be getting deleted at the end of the loop.
+      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype,
+                                                    name='a_%d' % i)
+      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype,
+                                                    name='b_%d' % i)
+      sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
+      with self.assertRaisesRegexp(ValueError,
+                                   'No gradients provided for any variable'):
+        sgd_op.apply_gradients([(None, var0), (None, var1)])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testGradientsAsVariables(self):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      # Note that we name the variables uniquely here since the variables don't
+      # seem to be getting deleted at the end of the loop.
+      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype,
+                                                    name='a%d' % i)
+      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype,
+                                                    name='b%d' % i)
+      def loss():
+        return 5 * var0 + 3 * var1  # pylint: disable=cell-var-from-loop
+      sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
+      grads_and_vars = sgd_op.compute_gradients(loss, [var0, var1])
+      # Convert gradients to tf.Variables
+      converted_grads = [
+          resource_variable_ops.ResourceVariable(array_ops.zeros([2], dtype),
+                                                 name='c_%d_%d' % (i, j))
+          for j, gv in enumerate(grads_and_vars)
+      ]
+      convert_ops = [
+          state_ops.assign(converted_grads[j], gv[0])
+          for j, gv in enumerate(grads_and_vars)
+      ]
+
+      self.evaluate(variables.global_variables_initializer())
+      # Run convert_ops to achieve the gradietns converting
+      self.evaluate(convert_ops)
+      # Fetch params to validate initial values
+      self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+      self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+      # Run 1 step of sgd through optimizer
+      converted_grads_and_vars = list(zip(converted_grads, [var0, var1]))
+      opt_op = sgd_op.apply_gradients(converted_grads_and_vars)
+      self.evaluate(opt_op)
+
+      # Validate updated params
+      self.assertAllClose([-14., -13.], self.evaluate(var0))
+      self.assertAllClose([-6., -5.], self.evaluate(var1))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testComputeGradientsWithTensors(self):
+    x = ops.convert_to_tensor(1.0)
+    def f():
+      return x * x
+    sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
+    grads_and_vars = sgd_op.compute_gradients(f, [x])
+    self.assertEqual(1, len(grads_and_vars))
+    grad, x_as_var = grads_and_vars[0]
+    self.assertIs(x, x_as_var)
+    self.assertEqual(2.0, self.evaluate(grad))
+
+    with self.assertRaises(NotImplementedError):
+      sgd_op.apply_gradients(grads_and_vars)
+
+  def testTrainOp(self):
+    with self.test_session():
+      var0 = variables.Variable([1.0, 2.0])
+      var1 = variables.Variable([3.0, 4.0])
+      cost = 5 * var0 + 3 * var1
+      global_step = variables.Variable(
+          array_ops.zeros([], dtypes.int64), name='global_step')
+      sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
+      opt_op = sgd_op.minimize(cost, global_step, [var0, var1])
+      self.assertTrue(opt_op in ops.get_collection(ops.GraphKeys.TRAIN_OP))
+
+  def testConstraint(self):
+    constraint_01 = lambda x: clip_ops.clip_by_value(x, -0.1, 0.)
+    constraint_0 = lambda x: clip_ops.clip_by_value(x, 0., 1.)
+    with self.test_session():
+      var0 = variables.Variable([1.0, 2.0],
+                                constraint=constraint_01)
+      var1 = variables.Variable([3.0, 4.0],
+                                constraint=constraint_0)
+      cost = 5 * var0 + 3 * var1
+      global_step = variables.Variable(
+          array_ops.zeros([], dtypes.int64), name='global_step')
+      sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
+      opt_op = sgd_op.minimize(cost, global_step, [var0, var1])
+
+      variables.global_variables_initializer().run()
+      # Fetch params to validate initial values
+      self.assertAllClose([1.0, 2.0], var0.eval())
+      self.assertAllClose([3.0, 4.0], var1.eval())
+      # Run 1 step of sgd through optimizer
+      opt_op.run()
+      # Validate updated params
+      self.assertAllClose([-0.1, -0.1], var0.eval())
+      self.assertAllClose([0., 0.], var1.eval())
+
+  def testStopGradients(self):
+    with self.test_session():
+      var0 = variables.Variable([1.0, 2.0], name='var0')
+      var1 = variables.Variable([3.0, 4.0], name='var1')
+      var0_id = array_ops.identity(var0)
+      cost = 5 * var0_id + 3 * var1
+      sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
+      grads_and_vars = sgd_op.compute_gradients(cost, [var0, var1],
+                                                stop_gradients=[var0_id])
+      grad_dict = {var.op.name: grad for grad, var in grads_and_vars}
+      self.assertIsNone(grad_dict['var0'])
+      self.assertIsNotNone(grad_dict['var1'])
+
+  def testDoNotOverrideCreateSlots(self):
+    class ShouldNotOverrideCreateSlots(optimizer_v2.OptimizerV2):
+
+      def _create_slots(self, var_list):
+        """In OptimizerV2 _create_slots was renamed _create_vars."""
+        return var_list
+
+    with self.assertRaises(RuntimeError):
+      ShouldNotOverrideCreateSlots(True, 'name')
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/optimizer_v2/rmsprop.py b/tensorflow/contrib/optimizer_v2/rmsprop.py
new file mode 100644
index 0000000000000000000000000000000000000000..164ff0ea0670bd07d19fa642e2e3cde1ab84612a
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/rmsprop.py
@@ -0,0 +1,233 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""RMSprop optimizer for Tensorflow.
+
+rmsprop algorithm [tieleman2012rmsprop]
+
+A detailed description of rmsprop.
+
+- maintain a moving (discounted) average of the square of gradients
+- divide gradient by the root of this average
+
+mean_square = decay * mean_square{t-1} + (1-decay) * gradient ** 2
+mom = momentum * mom{t-1} + learning_rate * g_t / sqrt(mean_square + epsilon)
+delta = - mom
+
+This implementation of RMSProp uses plain momentum, not Nesterov momentum.
+
+The centered version additionally maintains a moving (discounted) average of the
+gradients, and uses that average to estimate the variance:
+
+mean_grad = decay * mean_square{t-1} + (1-decay) * gradient
+mean_square = decay * mean_square{t-1} + (1-decay) * gradient ** 2
+mom = momentum * mom{t-1} + learning_rate * g_t /
+    sqrt(mean_square - mean_grad**2 + epsilon)
+delta = - mom
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.optimizer_v2 import optimizer_v2
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
+
+from tensorflow.python.training import training_ops
+
+
+class RMSPropOptimizer(optimizer_v2.OptimizerV2):
+  """Optimizer that implements the RMSProp algorithm.
+
+  See the
+  [paper](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf).
+  """
+
+  def __init__(self,
+               learning_rate,
+               decay=0.9,
+               momentum=0.0,
+               epsilon=1e-10,
+               use_locking=False,
+               centered=False,
+               name="RMSProp"):
+    """Construct a new RMSProp optimizer.
+
+    Note that in the dense implementation of this algorithm, variables and their
+    corresponding accumulators (momentum, gradient moving average, square
+    gradient moving average) will be updated even if the gradient is zero
+    (i.e. accumulators will decay, momentum will be applied). The sparse
+    implementation (used when the gradient is an `IndexedSlices` object,
+    typically because of `tf.gather` or an embedding lookup in the forward pass)
+    will not update variable slices or their accumulators unless those slices
+    were used in the forward pass (nor is there an "eventual" correction to
+    account for these omitted updates). This leads to more efficient updates for
+    large embedding lookup tables (where most of the slices are not accessed in
+    a particular graph execution), but differs from the published algorithm.
+
+    Some of the args below are hyperparameters, where a hyperparameter is
+    defined as a scalar Tensor, a regular Python value or a callable (which
+    will be evaluated when `apply_gradients` is called) returning a scalar
+    Tensor or a Python value.
+
+    Args:
+      learning_rate: A float hyperparameter. The learning rate.
+      decay: A float hyperparameter. Discounting factor for the history/coming
+        gradient.
+      momentum: A float hyperparameter.
+      epsilon: A float hyperparameter. Small value to avoid zero denominator.
+      use_locking: If True use locks for update operation.
+      centered: If True, gradients are normalized by the estimated variance of
+        the gradient; if False, by the uncentered second moment. Setting this to
+        True may help with training, but is slightly more expensive in terms of
+        computation and memory. Defaults to False.
+      name: Optional name prefix for the operations created when applying
+        gradients. Defaults to "RMSProp".
+    """
+    super(RMSPropOptimizer, self).__init__(use_locking, name)
+    self._set_hyper("learning_rate", learning_rate)
+    self._set_hyper("decay", decay)
+    self._set_hyper("momentum", momentum)
+    self._set_hyper("epsilon", epsilon)
+
+    self._centered = centered
+
+  def _create_vars(self, var_list, state):
+    for v in var_list:
+      if v.get_shape().is_fully_defined():
+        init_rms = init_ops.ones_initializer(dtype=v.dtype.base_dtype)
+      else:
+        init_rms = array_ops.ones_like(v)
+      state.create_slot_with_initializer(v, init_rms, v.get_shape(),
+                                         v.dtype.base_dtype, "rms")
+      if self._centered:
+        state.zeros_slot(v, "mg")
+      state.zeros_slot(v, "momentum")
+
+  def _apply_dense(self, grad, var, state):
+    rms = state.get_slot(var, "rms")
+    mom = state.get_slot(var, "momentum")
+    if self._centered:
+      mg = state.get_slot(var, "mg")
+      return training_ops.apply_centered_rms_prop(
+          var,
+          mg,
+          rms,
+          mom,
+          state.get_hyper("learning_rate", var.dtype.base_dtype),
+          state.get_hyper("decay", var.dtype.base_dtype),
+          state.get_hyper("momentum", var.dtype.base_dtype),
+          state.get_hyper("epsilon", var.dtype.base_dtype),
+          grad,
+          use_locking=self._use_locking).op
+    else:
+      return training_ops.apply_rms_prop(
+          var,
+          rms,
+          mom,
+          state.get_hyper("learning_rate", var.dtype.base_dtype),
+          state.get_hyper("decay", var.dtype.base_dtype),
+          state.get_hyper("momentum", var.dtype.base_dtype),
+          state.get_hyper("epsilon", var.dtype.base_dtype),
+          grad,
+          use_locking=self._use_locking).op
+
+  def _resource_apply_dense(self, grad, var, state):
+    rms = state.get_slot(var, "rms")
+    mom = state.get_slot(var, "momentum")
+    if self._centered:
+      mg = state.get_slot(var, "mg")
+      return training_ops.resource_apply_centered_rms_prop(
+          var.handle,
+          mg.handle,
+          rms.handle,
+          mom.handle,
+          state.get_hyper("learning_rate", var.dtype.base_dtype),
+          state.get_hyper("decay", var.dtype.base_dtype),
+          state.get_hyper("momentum", var.dtype.base_dtype),
+          state.get_hyper("epsilon", var.dtype.base_dtype),
+          grad,
+          use_locking=self._use_locking)
+    else:
+      return training_ops.resource_apply_rms_prop(
+          var.handle,
+          rms.handle,
+          mom.handle,
+          state.get_hyper("learning_rate", var.dtype.base_dtype),
+          state.get_hyper("decay", var.dtype.base_dtype),
+          state.get_hyper("momentum", var.dtype.base_dtype),
+          state.get_hyper("epsilon", var.dtype.base_dtype),
+          grad,
+          use_locking=self._use_locking)
+
+  def _apply_sparse(self, grad, var, state):
+    rms = state.get_slot(var, "rms")
+    mom = state.get_slot(var, "momentum")
+    if self._centered:
+      mg = state.get_slot(var, "mg")
+      return training_ops.sparse_apply_centered_rms_prop(
+          var,
+          mg,
+          rms,
+          mom,
+          state.get_hyper("learning_rate", var.dtype.base_dtype),
+          state.get_hyper("decay", var.dtype.base_dtype),
+          state.get_hyper("momentum", var.dtype.base_dtype),
+          state.get_hyper("epsilon", var.dtype.base_dtype),
+          grad.values,
+          grad.indices,
+          use_locking=self._use_locking)
+    else:
+      return training_ops.sparse_apply_rms_prop(
+          var,
+          rms,
+          mom,
+          state.get_hyper("learning_rate", var.dtype.base_dtype),
+          state.get_hyper("decay", var.dtype.base_dtype),
+          state.get_hyper("momentum", var.dtype.base_dtype),
+          state.get_hyper("epsilon", var.dtype.base_dtype),
+          grad.values,
+          grad.indices,
+          use_locking=self._use_locking)
+
+  def _resource_apply_sparse(self, grad, var, indices, state):
+    rms = state.get_slot(var, "rms")
+    mom = state.get_slot(var, "momentum")
+    if self._centered:
+      mg = self.get_slot(var, "mg")
+      return training_ops.resource_sparse_apply_centered_rms_prop(
+          var.handle,
+          mg.handle,
+          rms.handle,
+          mom.handle,
+          state.get_hyper("learning_rate", var.dtype.base_dtype),
+          state.get_hyper("decay", var.dtype.base_dtype),
+          state.get_hyper("momentum", var.dtype.base_dtype),
+          state.get_hyper("epsilon", var.dtype.base_dtype),
+          grad,
+          indices,
+          use_locking=self._use_locking)
+    else:
+      return training_ops.resource_sparse_apply_rms_prop(
+          var.handle,
+          rms.handle,
+          mom.handle,
+          state.get_hyper("learning_rate", var.dtype.base_dtype),
+          state.get_hyper("decay", var.dtype.base_dtype),
+          state.get_hyper("momentum", var.dtype.base_dtype),
+          state.get_hyper("epsilon", var.dtype.base_dtype),
+          grad,
+          indices,
+          use_locking=self._use_locking)
diff --git a/tensorflow/contrib/optimizer_v2/rmsprop_test.py b/tensorflow/contrib/optimizer_v2/rmsprop_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed68f6afbf8bf9678649c1ce6fc59c3b91026dc0
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/rmsprop_test.py
@@ -0,0 +1,449 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for rmsprop optimizer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+import itertools
+import math
+
+import numpy as np
+
+from tensorflow.contrib.optimizer_v2 import rmsprop
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+_DATA_TYPES = [dtypes.half, dtypes.float32]
+
+_TEST_PARAM_VALUES = [
+    # learning_rate, decay, momentum, epsilon, centered, use_resource
+    [0.5, 0.9, 0.0, 1e-3, True, False],
+    [0.5, 0.9, 0.0, 1e-3, False, False],
+    [0.5, 0.9, 0.0, 1e-3, True, True],
+    [0.5, 0.9, 0.0, 1e-3, False, True],
+    [0.1, 0.9, 0.0, 1e-3, True, False],
+    [0.5, 0.95, 0.0, 1e-3, False, False],
+    [0.5, 0.95, 0.0, 1e-5, True, False],
+    [0.5, 0.95, 0.9, 1e-5, True, False],
+]
+
+_TESTPARAMS = [
+    [data_type] + values
+    for data_type, values in itertools.product(_DATA_TYPES, _TEST_PARAM_VALUES)
+]
+
+
+class RMSPropOptimizerTest(test.TestCase):
+
+  def _rmsprop_update_numpy(self, var, g, mg, rms, mom, lr, decay, momentum,
+                            epsilon, centered):
+    rms_t = rms * decay + (1 - decay) * g * g
+    denom_t = rms_t + epsilon
+    if centered:
+      mg_t = mg * decay + (1 - decay) * g
+      denom_t -= mg_t * mg_t
+    else:
+      mg_t = mg
+    mom_t = momentum * mom + lr * g / np.sqrt(denom_t, dtype=denom_t.dtype)
+    var_t = var - mom_t
+    return var_t, mg_t, rms_t, mom_t
+
+  def _sparse_rmsprop_update_numpy(self, var, gindexs, gvalues, mg, rms, mom,
+                                   lr, decay, momentum, epsilon, centered):
+    mg_t = copy.deepcopy(mg)
+    rms_t = copy.deepcopy(rms)
+    mom_t = copy.deepcopy(mom)
+    var_t = copy.deepcopy(var)
+    for i in range(len(gindexs)):
+      gindex = gindexs[i]
+      gvalue = gvalues[i]
+      rms_t[gindex] = rms[gindex] * decay + (1 - decay) * gvalue * gvalue
+      denom_t = rms_t[gindex] + epsilon
+      if centered:
+        mg_t[gindex] = mg_t[gindex] * decay + (1 - decay) * gvalue
+        denom_t -= mg_t[gindex] * mg_t[gindex]
+      mom_t[gindex] = momentum * mom[gindex] + lr * gvalue / np.sqrt(denom_t)
+      var_t[gindex] = var[gindex] - mom_t[gindex]
+    return var_t, mg_t, rms_t, mom_t
+
+  def testDense(self):
+    # TODO(yori): Use ParameterizedTest when available
+    for (dtype, learning_rate, decay, momentum,
+         epsilon, centered, use_resource) in _TESTPARAMS:
+      with self.test_session(use_gpu=True):
+        # Initialize variables for numpy implementation.
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.2], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.2], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable(var0_np)
+          var1 = resource_variable_ops.ResourceVariable(var1_np)
+        else:
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = rmsprop.RMSPropOptimizer(
+            learning_rate=learning_rate,
+            decay=decay,
+            momentum=momentum,
+            epsilon=epsilon,
+            centered=centered)
+
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        mg0 = opt.get_slot(var0, "mg")
+        self.assertEqual(mg0 is not None, centered)
+        mg1 = opt.get_slot(var1, "mg")
+        self.assertEqual(mg1 is not None, centered)
+        rms0 = opt.get_slot(var0, "rms")
+        self.assertTrue(rms0 is not None)
+        rms1 = opt.get_slot(var1, "rms")
+        self.assertTrue(rms1 is not None)
+        mom0 = opt.get_slot(var0, "momentum")
+        self.assertTrue(mom0 is not None)
+        mom1 = opt.get_slot(var1, "momentum")
+        self.assertTrue(mom1 is not None)
+
+        mg0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        mg1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        rms0_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
+        rms1_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
+        mom0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        mom1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        # Run 4 steps of RMSProp
+        for _ in range(1, 5):
+          update.run()
+
+          var0_np, mg0_np, rms0_np, mom0_np = self._rmsprop_update_numpy(
+              var0_np, grads0_np, mg0_np, rms0_np, mom0_np, learning_rate,
+              decay, momentum, epsilon, centered)
+          var1_np, mg1_np, rms1_np, mom1_np = self._rmsprop_update_numpy(
+              var1_np, grads1_np, mg1_np, rms1_np, mom1_np, learning_rate,
+              decay, momentum, epsilon, centered)
+
+          # Validate updated params
+          if centered:
+            self.assertAllCloseAccordingToType(mg0_np, mg0.eval())
+            self.assertAllCloseAccordingToType(mg1_np, mg1.eval())
+          self.assertAllCloseAccordingToType(rms0_np, rms0.eval())
+          self.assertAllCloseAccordingToType(rms1_np, rms1.eval())
+          self.assertAllCloseAccordingToType(mom0_np, mom0.eval())
+          self.assertAllCloseAccordingToType(mom1_np, mom1.eval())
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testMinimizeSparseResourceVariable(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+        loss = pred * pred
+        sgd_op = rmsprop.RMSPropOptimizer(
+            learning_rate=1.0,
+            decay=0.0,
+            momentum=0.0,
+            epsilon=0.0,
+            centered=False).minimize(loss)
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType(
+            [[0., 1.]], var0.eval(), atol=0.01)
+
+  def testMinimizeSparseResourceVariableCentered(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+        loss = pred * pred
+        sgd_op = rmsprop.RMSPropOptimizer(
+            learning_rate=1.0,
+            decay=0.0,
+            momentum=0.0,
+            epsilon=1.0,
+            centered=True).minimize(loss)
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType(
+            [[-111, -138]], var0.eval(), atol=0.01)
+
+  def testSparse(self):
+    # TODO(yori): Use ParameterizedTest when available
+    for (dtype, learning_rate, decay,
+         momentum, epsilon, centered, _) in _TESTPARAMS:
+      with self.test_session(use_gpu=True):
+        # Initialize variables for numpy implementation.
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        grads0_np_indices = np.array([0], dtype=np.int32)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(grads0_np),
+            constant_op.constant(grads0_np_indices), constant_op.constant([1]))
+        grads1_np_indices = np.array([1], dtype=np.int32)
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(grads1_np),
+            constant_op.constant(grads1_np_indices), constant_op.constant([1]))
+        opt = rmsprop.RMSPropOptimizer(
+            learning_rate=learning_rate,
+            decay=decay,
+            momentum=momentum,
+            epsilon=epsilon,
+            centered=centered)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        mg0 = opt.get_slot(var0, "mg")
+        self.assertEqual(mg0 is not None, centered)
+        mg1 = opt.get_slot(var1, "mg")
+        self.assertEqual(mg1 is not None, centered)
+        rms0 = opt.get_slot(var0, "rms")
+        self.assertTrue(rms0 is not None)
+        rms1 = opt.get_slot(var1, "rms")
+        self.assertTrue(rms1 is not None)
+        mom0 = opt.get_slot(var0, "momentum")
+        self.assertTrue(mom0 is not None)
+        mom1 = opt.get_slot(var1, "momentum")
+        self.assertTrue(mom1 is not None)
+
+        mg0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        mg1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        rms0_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
+        rms1_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
+        mom0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        mom1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        # Run 4 steps of RMSProp
+        for _ in range(1, 5):
+          update.run()
+
+          var0_np, mg0_np, rms0_np, mom0_np = self._sparse_rmsprop_update_numpy(
+              var0_np, grads0_np_indices, grads0_np, mg0_np, rms0_np, mom0_np,
+              learning_rate, decay, momentum, epsilon, centered)
+          var1_np, mg1_np, rms1_np, mom1_np = self._sparse_rmsprop_update_numpy(
+              var1_np, grads1_np_indices, grads1_np, mg1_np, rms1_np, mom1_np,
+              learning_rate, decay, momentum, epsilon, centered)
+
+          # Validate updated params
+          if centered:
+            self.assertAllCloseAccordingToType(mg0_np, mg0.eval())
+            self.assertAllCloseAccordingToType(mg1_np, mg1.eval())
+          self.assertAllCloseAccordingToType(rms0_np, rms0.eval())
+          self.assertAllCloseAccordingToType(rms1_np, rms1.eval())
+          self.assertAllCloseAccordingToType(mom0_np, mom0.eval())
+          self.assertAllCloseAccordingToType(mom1_np, mom1.eval())
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testWithoutMomentum(self):
+    for dtype in [dtypes.half, dtypes.float32]:
+      with self.test_session(use_gpu=True):
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        opt = rmsprop.RMSPropOptimizer(
+            learning_rate=2.0, decay=0.9, momentum=0.0, epsilon=1.0)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        rms0 = opt.get_slot(var0, "rms")
+        self.assertTrue(rms0 is not None)
+        rms1 = opt.get_slot(var1, "rms")
+        self.assertTrue(rms1 is not None)
+        mom0 = opt.get_slot(var0, "momentum")
+        self.assertTrue(mom0 is not None)
+        mom1 = opt.get_slot(var1, "momentum")
+        self.assertTrue(mom1 is not None)
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Step 1: the rms accumulators where 1. So we should see a normal
+        # update: v -= grad * learning_rate
+        update.run()
+        # Check the root mean square accumulators.
+        self.assertAllCloseAccordingToType(
+            np.array([0.901, 0.901]), rms0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([0.90001, 0.90001]), rms1.eval())
+        # Check the parameters.
+        self.assertAllCloseAccordingToType(
+            np.array([
+                1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0)),
+                2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0))
+            ]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([
+                3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)),
+                4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0))
+            ]), var1.eval())
+        # Step 2: the root mean square accumulators contain the previous update.
+        update.run()
+        # Check the rms accumulators.
+        self.assertAllCloseAccordingToType(
+            np.array([0.901 * 0.9 + 0.001, 0.901 * 0.9 + 0.001]), rms0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([0.90001 * 0.9 + 1e-5, 0.90001 * 0.9 + 1e-5]), rms1.eval())
+        # Check the parameters.
+        self.assertAllCloseAccordingToType(
+            np.array([
+                1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0)) -
+                (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1.0)),
+                2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0)) -
+                (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1.0))
+            ]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([
+                3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)) -
+                (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5 + 1.0)),
+                4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)) -
+                (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5 + 1.0))
+            ]), var1.eval())
+
+  def testWithMomentum(self):
+    for dtype in [dtypes.half, dtypes.float32]:
+      with self.test_session(use_gpu=True):
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+
+        opt = rmsprop.RMSPropOptimizer(
+            learning_rate=2.0, decay=0.9, momentum=0.5, epsilon=1e-5)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        rms0 = opt.get_slot(var0, "rms")
+        self.assertTrue(rms0 is not None)
+        rms1 = opt.get_slot(var1, "rms")
+        self.assertTrue(rms1 is not None)
+        mom0 = opt.get_slot(var0, "momentum")
+        self.assertTrue(mom0 is not None)
+        mom1 = opt.get_slot(var1, "momentum")
+        self.assertTrue(mom1 is not None)
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Step 1: rms = 1, mom = 0. So we should see a normal
+        # update: v -= grad * learning_rate
+        update.run()
+        # Check the root mean square accumulators.
+        self.assertAllCloseAccordingToType(
+            np.array([0.901, 0.901]), rms0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([0.90001, 0.90001]), rms1.eval())
+        # Check the momentum accumulators
+        self.assertAllCloseAccordingToType(
+            np.array([(0.1 * 2.0 / math.sqrt(0.901 + 1e-5)),
+                      (0.1 * 2.0 / math.sqrt(0.901 + 1e-5))]), mom0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([(0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)),
+                      (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5))]), mom1.eval())
+
+        # Check that the parameters.
+        self.assertAllCloseAccordingToType(
+            np.array([
+                1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)),
+                2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5))
+            ]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([
+                3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)),
+                4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5))
+            ]), var1.eval())
+
+        # Step 2: the root mean square accumulators contain the previous update.
+        update.run()
+        # Check the rms accumulators.
+        self.assertAllCloseAccordingToType(
+            np.array([0.901 * 0.9 + 0.001, 0.901 * 0.9 + 0.001]), rms0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([0.90001 * 0.9 + 1e-5, 0.90001 * 0.9 + 1e-5]), rms1.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([
+                0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
+                (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5)),
+                0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
+                (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5))
+            ]), mom0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([
+                0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
+                (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5)),
+                0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
+                (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5))
+            ]), mom1.eval())
+
+        # Check the parameters.
+        self.assertAllCloseAccordingToType(
+            np.array([
+                1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) -
+                (0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
+                 (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5))),
+                2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) -
+                (0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
+                 (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5)))
+            ]), var0.eval())
+
+        self.assertAllCloseAccordingToType(
+            np.array([
+                3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) -
+                (0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
+                 (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5))),
+                4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) -
+                (0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
+                 (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5)))
+            ]), var1.eval())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/periodic_resample/BUILD b/tensorflow/contrib/periodic_resample/BUILD
index bd9078ae76ee27ec26c09d1aa2012f871cbdf5e9..6ca7fe8b6e59b0dc24be76262d4f54f387e53e48 100644
--- a/tensorflow/contrib/periodic_resample/BUILD
+++ b/tensorflow/contrib/periodic_resample/BUILD
@@ -94,18 +94,6 @@ py_test(
 #     srcs_version = "PY2AND3",
 # )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 filegroup(
     name = "custom_op_sources",
     srcs = glob(
diff --git a/tensorflow/contrib/predictor/BUILD b/tensorflow/contrib/predictor/BUILD
index a80f060b91df3b6d5e2ca9ff63c721382f0cbb0a..36e21af618f5af744ce793509813eaf36e1b8479 100644
--- a/tensorflow/contrib/predictor/BUILD
+++ b/tensorflow/contrib/predictor/BUILD
@@ -8,18 +8,6 @@ exports_files(["LICENSE"])
 
 load("//tensorflow:tensorflow.bzl", "py_test")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 py_library(
     name = "predictor",
     srcs = ["__init__.py"],
diff --git a/tensorflow/contrib/py2tf/utils/__init__.py b/tensorflow/contrib/py2tf/utils/__init__.py
deleted file mode 100644
index d9d8e3468966bc9da31c3fc756a9660f5ff7d115..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/py2tf/utils/__init__.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Utility module that contains APIs usable in the generated code."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.py2tf.utils.builtins import dynamic_builtin
-from tensorflow.contrib.py2tf.utils.builtins import dynamic_print
-from tensorflow.contrib.py2tf.utils.builtins import dynamic_range
-from tensorflow.contrib.py2tf.utils.context_managers import control_dependency_on_returns
-from tensorflow.contrib.py2tf.utils.misc import alias_tensors
-from tensorflow.contrib.py2tf.utils.multiple_dispatch import dynamic_is
-from tensorflow.contrib.py2tf.utils.multiple_dispatch import dynamic_is_not
-from tensorflow.contrib.py2tf.utils.multiple_dispatch import run_cond
-from tensorflow.contrib.py2tf.utils.multiple_dispatch import run_while
-from tensorflow.contrib.py2tf.utils.py_func import wrap_py_func
-from tensorflow.contrib.py2tf.utils.tensor_list import dynamic_list_append
-from tensorflow.contrib.py2tf.utils.testing import fake_tf
-from tensorflow.contrib.py2tf.utils.type_check import is_tensor
-from tensorflow.contrib.py2tf.utils.type_hints import set_element_type
diff --git a/tensorflow/contrib/py2tf/utils/py_func.py b/tensorflow/contrib/py2tf/utils/py_func.py
deleted file mode 100644
index 838872d092a3ab07e965180eff4fec7ff6c4ccf9..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/py2tf/utils/py_func.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Pyfunc creation utilities."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import script_ops
-
-
-def wrap_py_func(f, return_dtypes, arguments, use_dummy_return=False):
-  """Helper that wraps a callable to py_func.
-
-  The helper passes tensor arguments through the py_func interface. Non-tensor
-  arguments are allowed, and will be passed to f directly. Note that non-tensor
-  arguments are captured by f will not update every time the wrapper is
-  called (this is consistent with its argument list, which only includes
-  the tensor arguments). In general, it's safest not to reuse this wrapper.
-
-  Args:
-    f: Callable
-    return_dtypes: DType, tuple, list or None, the data type for each of f's
-        return value. None if f has no return values or use_dummy_return is
-        True.
-    arguments: Arguments for f
-    use_dummy_return: If True, the function will return a dummy value of 1
-        and discard its actual return value.
-  Returns:
-    The return values of f converted to tensor.
-  Raises:
-    ValueError: if the arguments are incorrect.
-  """
-
-  if return_dtypes and use_dummy_return:
-    raise ValueError('if use_dummy_return is True, return_dtypes must be empty')
-
-  n = len(arguments)
-  arg_is_tensor = tuple(map(tensor_util.is_tensor, arguments))
-  index_in_tensor_list = [0] * n
-  i = 0
-  for j in range(n):
-    index_in_tensor_list[j] = i
-    if arg_is_tensor[j]:
-      i += 1
-
-  def f_wrapper(*tensor_args):
-    f_args = tuple(tensor_args[index_in_tensor_list[i]]
-                   if arg_is_tensor[i] else arguments[i] for i in range(n))
-    retval = f(*f_args)
-    return 1 if use_dummy_return else retval
-
-  return script_ops.py_func(
-      f_wrapper, tuple(arguments[i] for i in range(n) if arg_is_tensor[i]),
-      dtypes.int64 if use_dummy_return else return_dtypes)
diff --git a/tensorflow/contrib/py2tf/utils/py_func_test.py b/tensorflow/contrib/py2tf/utils/py_func_test.py
deleted file mode 100644
index 776b5309c6f027bb2008aa83d48e4155e817ed97..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/py2tf/utils/py_func_test.py
+++ /dev/null
@@ -1,91 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for wrap_py_func module."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.py2tf.utils import py_func
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.platform import test
-
-
-class PyFuncTest(test.TestCase):
-
-  def test_wrap_py_func_simple(self):
-
-    def test_fn(a, b, c):
-      return a + b + c
-
-    with self.test_session() as sess:
-      tensor_1 = constant_op.constant(1)
-      self.assertEqual(3,
-                       sess.run(
-                           py_func.wrap_py_func(test_fn, dtypes.int64,
-                                                (1, tensor_1, 1))))
-      self.assertEqual(3,
-                       sess.run(
-                           py_func.wrap_py_func(test_fn, dtypes.int64,
-                                                (1, 1, 1))))
-      self.assertEqual(3,
-                       sess.run(
-                           py_func.wrap_py_func(test_fn, dtypes.int64,
-                                                (tensor_1, 1, tensor_1))))
-
-  def test_wrap_py_func_complex_args(self):
-
-    class TestClass(object):
-
-      def __init__(self):
-        self.foo = 5
-
-    def test_fn(a, b):
-      return a * b.foo
-
-    with self.test_session() as sess:
-      self.assertEqual(35,
-                       sess.run(
-                           py_func.wrap_py_func(test_fn, dtypes.int64,
-                                                (7, TestClass()))))
-      self.assertEqual(
-          35,
-          sess.run(
-              py_func.wrap_py_func(test_fn, dtypes.int64,
-                                   (constant_op.constant(7), TestClass()))))
-
-  def test_wrap_py_func_dummy_return(self):
-
-    side_counter = [0]
-
-    def test_fn(_):
-      side_counter[0] += 1
-
-    with self.test_session() as sess:
-      self.assertEqual(1,
-                       sess.run(
-                           py_func.wrap_py_func(test_fn, None, (5,), True)))
-      self.assertEqual([1], side_counter)
-      self.assertEqual(1,
-                       sess.run(
-                           py_func.wrap_py_func(test_fn, None,
-                                                (constant_op.constant(5),),
-                                                True)))
-      self.assertEqual([2], side_counter)
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/quantization/BUILD b/tensorflow/contrib/quantization/BUILD
index c19a31afb2a1a86159eae5c94bbd83daa28caaeb..2de10e8faefa80d609e490f26ef97f6bf513debd 100644
--- a/tensorflow/contrib/quantization/BUILD
+++ b/tensorflow/contrib/quantization/BUILD
@@ -49,15 +49,3 @@ filegroup(
         "**/*.py",
     ]),
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/quantize/BUILD b/tensorflow/contrib/quantize/BUILD
index 0b7629620418340d803753be0df1f04c342dc490..b9918fdee1ece2bae1ab1459985066a35b6431be 100644
--- a/tensorflow/contrib/quantize/BUILD
+++ b/tensorflow/contrib/quantize/BUILD
@@ -246,15 +246,3 @@ py_test(
         "//tensorflow/python:platform_test",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/quantize/python/common.py b/tensorflow/contrib/quantize/python/common.py
index 3138149468c87b649f120a81b28c60947dcf2c4e..bf648e158ec15e1bfa962ba7dbe0567263c89c9b 100644
--- a/tensorflow/contrib/quantize/python/common.py
+++ b/tensorflow/contrib/quantize/python/common.py
@@ -123,3 +123,11 @@ def CreateOrGetQuantizationStep():
         # normal variables to return a tensor of the same name.
         return array_ops.identity(
             state_ops.assign_add(quantization_step_tensor, 1))
+
+
+def DropStringPrefix(s, prefix):
+  """If the string starts with this prefix, drops it."""
+  if s.startswith(prefix):
+    return s[len(prefix):]
+  else:
+    return s
diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms.py b/tensorflow/contrib/quantize/python/fold_batch_norms.py
index 1afcbb850496fa1f654b7340946e087d403d3549..4a8f8a04cc521d9ee7885b4318814a6f15008eef 100644
--- a/tensorflow/contrib/quantize/python/fold_batch_norms.py
+++ b/tensorflow/contrib/quantize/python/fold_batch_norms.py
@@ -134,9 +134,9 @@ def _FoldFusedBatchNorms(graph, is_training, freeze_batch_norm_delay):
 
       nodes_modified_count = graph_editor.reroute_ts(bias_add_tensor,
                                                      match.output_tensor)
-      if nodes_modified_count != 1:
-        raise ValueError(
-            'Unexpected inputs to op: %s' % match.output_tensor.name)
+      if nodes_modified_count == 0:
+        raise ValueError('Folding batch norms failed, %s had no outputs.' %
+                         match.output_tensor.name)
 
 
 def _FindFusedBatchNorms(graph):
@@ -317,7 +317,8 @@ def _ComputeBatchNormCorrections(context, match, freeze_batch_norm_delay,
   """
 
   g = ops.get_default_graph()
-  with g.name_scope(context + '/batch_norm_correction'):
+  prefix = '' if not context else context + '/'
+  with g.name_scope(prefix + 'batch_norm_correction'):
     recip_sigma_mv = math_ops.rsqrt(
         match.moving_variance_tensor + match.batch_epsilon)
     recip_sigma = math_ops.rsqrt(match.variance_tensor + match.batch_epsilon)
diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms_test.py b/tensorflow/contrib/quantize/python/fold_batch_norms_test.py
index c90a18ab0357f1bcbc5d8ccd48edf894d7baf5f9..af31467476b1536adef2bb74308fd1093f7bea7a 100644
--- a/tensorflow/contrib/quantize/python/fold_batch_norms_test.py
+++ b/tensorflow/contrib/quantize/python/fold_batch_norms_test.py
@@ -128,6 +128,9 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
     output_op_names = ['test/Add' if with_bypass else 'test/' + relu_op_name]
     self._AssertOutputGoesToOps(folded_add, g, output_op_names)
 
+    for op in g.get_operations():
+      self.assertFalse('//' in op.name, 'Double slash in op %s' % op.name)
+
   def testFoldConv2d(self):
     self._RunTestOverParameters(self._TestFoldConv2d)
 
@@ -196,6 +199,9 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
     output_op_names = ['test/Add' if with_bypass else 'test/' + relu_op_name]
     self._AssertOutputGoesToOps(folded_add, g, output_op_names)
 
+    for op in g.get_operations():
+      self.assertFalse('//' in op.name, 'Double slash in op %s' % op.name)
+
   def testFoldConv2dUnknownShape(self):
     self._RunTestOverParameters(self._TestFoldConv2dUnknownShape)
 
@@ -260,6 +266,9 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
     output_op_names = ['test/Add' if with_bypass else 'test/' + relu_op_name]
     self._AssertOutputGoesToOps(folded_add, g, output_op_names)
 
+    for op in g.get_operations():
+      self.assertFalse('//' in op.name, 'Double slash in op %s' % op.name)
+
   def testFoldFullyConnectedLayer(self):
     self._RunTestOverParameters(self._TestFoldFullyConnectedLayer)
 
@@ -337,6 +346,9 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
     output_op_names = ['test/Add' if with_bypass else 'test/' + relu_op_name]
     self._AssertOutputGoesToOps(folded_add, g, output_op_names)
 
+    for op in g.get_operations():
+      self.assertFalse('//' in op.name, 'Double slash in op %s' % op.name)
+
   def testFoldDepthwiseConv2d(self):
     self._RunTestOverParameters(self._TestFoldDepthwiseConv2d)
 
diff --git a/tensorflow/contrib/quantize/python/graph_matcher.py b/tensorflow/contrib/quantize/python/graph_matcher.py
index b458f039df0523b5b8b07cff7d14643154124b95..bacc707a3abb5539b3b119c1ebc17bd7b30efc5b 100644
--- a/tensorflow/contrib/quantize/python/graph_matcher.py
+++ b/tensorflow/contrib/quantize/python/graph_matcher.py
@@ -103,7 +103,7 @@ class OneofPattern(Pattern):
 class MatchResult(object):
   r"""Encapsulates the result of a match done by GraphMatcher.
 
-  MatchResult contains a map from OpTypePattern to the matching op and tensor.
+  MatchResult contains a map from Pattern to the matching op and tensor.
   When the matching op has multiple output tensors, the matching tensor is the
   output tensor used by the matching op of the parent pattern. E.g., when we
   match graph
@@ -138,7 +138,7 @@ class MatchResult(object):
       self._name_to_pattern[pattern.name] = pattern
 
   def _to_pattern(self, pattern_or_name):
-    if isinstance(pattern_or_name, OpTypePattern):
+    if isinstance(pattern_or_name, Pattern):
       return pattern_or_name
 
     if isinstance(pattern_or_name, str):
@@ -146,8 +146,8 @@ class MatchResult(object):
         return None
       return self._name_to_pattern[pattern_or_name]
 
-    raise ValueError('pattern_or_name has type %s. Expect OpTypePattern or str.'
-                     % type(pattern_or_name))
+    raise ValueError('pattern_or_name has type %s. Expect Pattern or str.' %
+                     type(pattern_or_name))
 
   def _get_op_tensor(self, pattern_or_name):
     pattern = self._to_pattern(pattern_or_name)
diff --git a/tensorflow/contrib/quantize/python/quantize.py b/tensorflow/contrib/quantize/python/quantize.py
index ec721afbc8d1bedd9214ceeb38286af62aad5d65..d53d4d7b108e38bec3e2fa4727e85b5ed88f3a9e 100644
--- a/tensorflow/contrib/quantize/python/quantize.py
+++ b/tensorflow/contrib/quantize/python/quantize.py
@@ -34,9 +34,6 @@ _QUANTIZABLE_TYPES = {'Conv2D', 'MatMul', 'DepthwiseConv2dNative'}
 # Activations that are supported by the quantization rewrite.
 _ACTIVATION_TYPES = {'Relu', 'Relu6', 'Identity'}
 
-# Weight types that are supported by the quantization rewrite.
-_WEIGHT_TYPES = {'Variable', 'VariableV2', 'VarHandleOp'}
-
 
 def Quantize(graph,
              is_training,
@@ -44,7 +41,7 @@ def Quantize(graph,
              activation_bits=8,
              ema_decay=0.999,
              quant_delay=None,
-             vars_collection=ops.GraphKeys.MOVING_AVERAGE_VARIABLES):
+             vars_collection=ops.GraphKeys.GLOBAL_VARIABLES):
   """Updates graph with quantization operations.
 
   Args:
@@ -123,21 +120,61 @@ def Quantize(graph,
           vars_collection=vars_collection,
           bits=activation_bits)
 
+    # Quantize bypass ops that occur after the activation.
+    if layer_match.post_activation_bypass_op is not None:
+      post_activation_bypass_context = re.search(
+          r'^(.*)/([^/]+)', layer_match.post_activation_bypass_op.name).group(1)
+      _InsertQuantOp(
+          post_activation_bypass_context,
+          'post_activation_bypass_quant',
+          layer_match.post_activation_bypass_op,
+          input_to_ops_map.ConsumerOperations(
+              layer_match.post_activation_bypass_op),
+          is_training,
+          moving_avg=True,
+          ema_decay=ema_decay,
+          quant_delay=quant_delay,
+          vars_collection=vars_collection,
+          bits=activation_bits)
+
 
 def _FindLayersToQuantize(graph):
   """Matches layers in graph to quantize.
 
+  The following patterns get matched. Nodes surrounded by [] will be
+  optionally matched:
+
+          weight|folded_weight
+                /
+         conv|fc
+            |
+    [post_conv_correction]
+            |
+     biasadd|folded_bias
+            |
+         [bypass]
+            |
+        activation
+            |
+   [post_activation_bypass]
+
+  Match replacements:
+    If weight|folded_weight is found, FakeQuant is added afterwards.
+    If bypass is found, FakeQuant is added before and after.
+    If activation is found, FakeQuant is added afterwards.
+    If post_activation_bypass is found, FakeQuant is added afterwards.
+
   Args:
     graph: Graph to perform match on.
 
-  Yields:
-    _LayerMatches.
+  Returns:
+    list of _LayerMatches.
   """
   input_pattern = graph_matcher.OpTypePattern('*')
-  weight_var_pattern = graph_matcher.OpTypePattern('|'.join(_WEIGHT_TYPES))
-  weight_pattern = graph_matcher.OpTypePattern(
-      'Identity|ReadVariableOp', inputs=[weight_var_pattern])
-
+  weight_var_pattern = graph_matcher.OpTypePattern('Variable|VariableV2')
+  weight_identity_pattern = graph_matcher.OpTypePattern(
+      'Identity', inputs=[weight_var_pattern])
+  weight_resource_var_pattern = graph_matcher.OpTypePattern('ReadVariableOp')
   folded_weight_pattern = graph_matcher.OpTypePattern('Mul')
 
   # The weights inputs to the layer operation can either be from the Variable or
@@ -146,7 +183,10 @@ def _FindLayersToQuantize(graph):
       '|'.join(_QUANTIZABLE_TYPES),
       inputs=[
           input_pattern,
-          graph_matcher.OneofPattern([weight_pattern, folded_weight_pattern])
+          graph_matcher.OneofPattern([
+              weight_identity_pattern, weight_resource_var_pattern,
+              folded_weight_pattern
+          ])
       ])
 
   folded_bias_mul_pattern = graph_matcher.OpTypePattern(
@@ -179,7 +219,7 @@ def _FindLayersToQuantize(graph):
               [bias_add_pattern, folded_bias_add_pattern])
       ])
 
-  # The input to the activation can come from bias add, fold bias add or the
+  # The input to the activation can come from bias add, fold bias add, the
   # bypasses.
   activation_pattern = graph_matcher.OpTypePattern(
       '|'.join(_ACTIVATION_TYPES),
@@ -190,10 +230,62 @@ def _FindLayersToQuantize(graph):
           ])
       ])
 
+  post_activation_bypass_pattern_a = graph_matcher.OpTypePattern(
+      'Add', inputs=['*', activation_pattern])
+  post_activation_bypass_pattern_b = graph_matcher.OpTypePattern(
+      'Add', inputs=[activation_pattern, '*'])
+
+  # The order of the following matching blocks is very important. Since matches
+  # aren't guaranteed to be disjoint, we structure matches from largest to
+  # smallest to guarantee that the largest match always wins. Additionally, we
+  # ensure that we don't match layers multiple times.
+
+  layer_matches = []
+  # We use matched_layer_set to ensure that layers aren't matched multiple
+  # times.
+  matched_layer_set = set()
+
+  # First, we match layers that have a post activation bypass. We do this first
+  # to ensure we don't match only the first part of this layer, missing the
+  # post activation bypass node.
+  post_activation_bypass_layer_matcher = graph_matcher.GraphMatcher(
+      graph_matcher.OneofPattern([
+          post_activation_bypass_pattern_a,
+          post_activation_bypass_pattern_b,
+      ]))
+  for match_result in post_activation_bypass_layer_matcher.match_graph(graph):
+    layer_op = match_result.get_op(layer_pattern)
+    weight_tensor = match_result.get_tensor(weight_identity_pattern)
+    if weight_tensor is None:
+      weight_tensor = match_result.get_tensor(weight_resource_var_pattern)
+    if weight_tensor is None:
+      weight_tensor = match_result.get_tensor(folded_weight_pattern)
+    activation_op = match_result.get_op(activation_pattern)
+    bias_add_op = match_result.get_op(bias_add_pattern)
+    if bias_add_op is None:
+      bias_add_op = match_result.get_op(folded_bias_add_pattern)
+    bypass_op = match_result.get_op(bypass_pattern_a)
+    if bypass_op is None:
+      bypass_op = match_result.get_op(bypass_pattern_b)
+    post_activation_bypass_op = match_result.get_op(
+        post_activation_bypass_pattern_a)
+    if post_activation_bypass_op is None:
+      post_activation_bypass_op = match_result.get_op(
+          post_activation_bypass_pattern_b)
+    if layer_op not in matched_layer_set:
+      matched_layer_set.add(layer_op)
+      layer_matches.append(
+          _LayerMatch(layer_op, weight_tensor, activation_op, bypass_op,
+                      post_activation_bypass_op, bias_add_op))
+
+  # Now, we match the basic layer ending at an activation. We may get duplicate
+  # matches from above, but we don't add them to layer_matches.
   layer_matcher = graph_matcher.GraphMatcher(activation_pattern)
   for match_result in layer_matcher.match_graph(graph):
     layer_op = match_result.get_op(layer_pattern)
-    weight_tensor = match_result.get_tensor(weight_pattern)
+    weight_tensor = match_result.get_tensor(weight_identity_pattern)
+    if weight_tensor is None:
+      weight_tensor = match_result.get_tensor(weight_resource_var_pattern)
     if weight_tensor is None:
       weight_tensor = match_result.get_tensor(folded_weight_pattern)
     activation_op = match_result.get_op(activation_pattern)
@@ -203,31 +295,54 @@ def _FindLayersToQuantize(graph):
     bypass_op = match_result.get_op(bypass_pattern_a)
     if bypass_op is None:
       bypass_op = match_result.get_op(bypass_pattern_b)
-    yield _LayerMatch(layer_op, weight_tensor, activation_op, bypass_op,
-                      bias_add_op)
-
-  # Match the final layer, where there will not be an activation and instead
-  # the output of the final BiasAdd must be quantized, so we treat it as the
-  # 'activation_op' in the _LayerMatch.
-  # TODO(suharshs): Figure out how to quantize this final layer across many
-  # models.
-  final_layer_matcher = graph_matcher.GraphMatcher(bias_add_pattern)
+    if layer_op not in matched_layer_set:
+      matched_layer_set.add(layer_op)
+      layer_matches.append(
+          _LayerMatch(layer_op, weight_tensor, activation_op, bypass_op, None,
+                      bias_add_op))
+
+  # Match the final layer, where there may not be an activation and instead
+  # the output of the final BiasAdd must be quantized. So we treat the BiasAdd
+  # as the 'activation_op' in the _LayerMatch, to ensure that it's output is
+  # quantized.
+  final_layer_matcher = graph_matcher.GraphMatcher(
+      graph_matcher.OneofPattern([bias_add_pattern, folded_bias_add_pattern]))
   for match_result in final_layer_matcher.match_graph(graph):
     layer_op = match_result.get_op(layer_pattern)
-    weight_tensor = match_result.get_tensor(weight_pattern)
+    weight_tensor = match_result.get_tensor(weight_identity_pattern)
+    if weight_tensor is None:
+      weight_tensor = match_result.get_tensor(weight_resource_var_pattern)
+    if weight_tensor is None:
+      weight_tensor = match_result.get_tensor(folded_weight_pattern)
     activation_op = match_result.get_op(bias_add_pattern)
-    yield _LayerMatch(layer_op, weight_tensor, activation_op, None, None)
+    if activation_op is None:
+      activation_op = match_result.get_op(folded_bias_add_pattern)
+    if layer_op not in matched_layer_set:
+      matched_layer_set.add(layer_op)
+      layer_matches.append(
+          _LayerMatch(layer_op, weight_tensor, activation_op, None, None, None))
+
+  return layer_matches
+
+
+def _HasPostActivationBypass(activation_op):
+  for activation_tensor in activation_op.outputs:
+    for output_op in activation_tensor.consumers():
+      if output_op.type == 'Add':
+        return True
+  return False
 
 
 class _LayerMatch(object):
   """Contains all information related to a matched Layer."""
 
   def __init__(self, layer_op, weight_tensor, activation_op, bypass_op,
-               bias_add_op):
+               post_activation_bypass_op, bias_add_op):
     self._layer_op = layer_op
     self._weight_tensor = weight_tensor
     self._activation_op = activation_op
     self._bypass_op = bypass_op
+    self._post_activation_bypass_op = post_activation_bypass_op
     self._bias_add_op = bias_add_op
 
   @property
@@ -246,6 +361,10 @@ class _LayerMatch(object):
   def bypass_op(self):
     return self._bypass_op
 
+  @property
+  def post_activation_bypass_op(self):
+    return self._post_activation_bypass_op
+
   @property
   def bias_add_op(self):
     return self._bias_add_op
@@ -262,7 +381,7 @@ def _InsertQuantOp(context,
                    bits=8,
                    ema_decay=0.999,
                    quant_delay=None,
-                   vars_collection=ops.GraphKeys.MOVING_AVERAGE_VARIABLES,
+                   vars_collection=ops.GraphKeys.GLOBAL_VARIABLES,
                    narrow_range=False):
   """Inserts a quant op between a producer op and (multiple) consumer ops.
 
@@ -293,7 +412,25 @@ def _InsertQuantOp(context,
       consumer operation.
   """
   name_prefix = _AddContextToName(context, name)
+  # This is needed on TPU where name_scope == 'TPUReplicate/loop', and
+  # name_prefix starts with 'TPUReplicate/loop/'; without dropping it
+  # variables are created as TPUReplicate/loop/TPUReplicate/loop/..., which
+  # breaks things later.
+  name_scope = ops.get_name_scope()
+  if name_scope:
+    name_prefix = common.DropStringPrefix(name_prefix, name_scope + '/')
+
   inputs = producer.outputs[0]
+  # Prevent ops from being quantized multiple times. Bypass ops can sometimes
+  # overlap between multiple matches, so we need to ensure that we don't
+  # add duplicate FakeQuant operations.
+  fake_quant_ops = set([
+      'FakeQuantWithMinMaxVars',
+      'FakeQuantWithMinMaxArgs'
+  ])
+  if fake_quant_ops.intersection(set([c.type for c in inputs.consumers()])):
+    return
+
   if moving_avg:
     quant = (
         quant_ops.MovingAvgQuantize(
@@ -329,11 +466,16 @@ def _InsertQuantOp(context,
         lambda: inputs,
         name=name_prefix + '/delayed_quant')
 
-  nodes_modified_count = graph_editor.reroute_ts(
-      [quant], [inputs], can_modify=consumers)
-  if nodes_modified_count != len(consumers):
-    raise ValueError('Some inputs not quantized for ops: [%s]' % ', '.join(
-        [consumer.name for consumer in consumers]))
+  if consumers:
+    tensors_modified_count = graph_editor.reroute_ts(
+        [quant], [inputs], can_modify=consumers)
+    # Some operations can have multiple output tensors going to the same
+    # consumer. Since consumers is a set, we need to ensure that
+    # tensors_modified_count is greater than or equal to the length of the set
+    # of consumers.
+    if tensors_modified_count < len(consumers):
+      raise ValueError('No inputs quantized for ops: [%s]' % ', '.join(
+          [consumer.name for consumer in consumers]))
 
 
 def _GetContextFromOp(op):
diff --git a/tensorflow/contrib/quantize/python/quantize_graph.py b/tensorflow/contrib/quantize/python/quantize_graph.py
index 5abdcd2475ce481f5394e9d6e664f0aa2a928d4f..0b74b438ac317967bbe10ad936b451de6f69d62c 100644
--- a/tensorflow/contrib/quantize/python/quantize_graph.py
+++ b/tensorflow/contrib/quantize/python/quantize_graph.py
@@ -72,6 +72,8 @@ def _create_graph(input_graph=None,
 def create_training_graph(input_graph=None, quant_delay=0):
   """Rewrites a training input_graph in place for simulated quantization.
 
+  Variables added by the rewrite get added to the global variables collection.
+
   The graph has fake quantization ops inserted to simulate the error
   introduced by quantization. Since the graph is transformed in place,
   the expected behavior of previously held references to nodes and tensors may
@@ -97,16 +99,7 @@ def create_training_graph(input_graph=None, quant_delay=0):
   # TODO(raghuramank) Need to have freeze_bn_delay be a function of batch size
   # Currently the values below are hardcoded for mobilenetV1 on imagenet
   # Please use the experimental API if you need to tune these values.
-  if quant_delay == 0:
-    # Corresponds to case of restoring from a floating point checkpoint
-    # In this case, we can freeze the moving mean and variance early on and
-    # switch to using them during training. Therefore, freeze_bn_delay is set to
-    # 2e5.
-    freeze_bn_delay = int(2e5)
-  else:
-    # If training from scratch, set freeze_bn_delay to 100 epochs after quant
-    # delay. With a batch size of 64, this corresponds to 20000*100=2M steps.
-    freeze_bn_delay = quant_delay + int(2e6)
+  freeze_bn_delay = None
 
   _create_graph(
       input_graph=input_graph,
@@ -118,6 +111,8 @@ def create_training_graph(input_graph=None, quant_delay=0):
 def create_eval_graph(input_graph=None):
   """Rewrites an eval input_graph in place for simulated quantization.
 
+  Variables added by the rewrite get added to the global variables collection.
+
   The graph has fake quantization ops inserted to simulate the error
   introduced by quantization. Since the graph is transformed in place,
   the expected behavior of previously held references to nodes and tensors may
@@ -138,9 +133,11 @@ def experimental_create_training_graph(input_graph=None,
                                        weight_bits=8,
                                        activation_bits=8,
                                        quant_delay=0,
-                                       freeze_bn_delay=int(2e5)):
+                                       freeze_bn_delay=None):
   """Rewrites a training input_graph in place for simulated quantization.
 
+  Variables added by the rewrite get added to the global variables collection.
+
   This function has additional experimental options not (yet) available to
   create_training_graph. The resulting behavior may be undefined.
 
@@ -188,6 +185,8 @@ def experimental_create_eval_graph(input_graph=None,
                                    activation_bits=8):
   """Rewrites an eval input_graph in place for simulated quantization.
 
+  Variables added by the rewrite get added to the global variables collection.
+
   This function has additional experimental options not (yet) available to
   create_eval_graph. The resulting behavior may be undefined.
 
diff --git a/tensorflow/contrib/quantize/python/quantize_test.py b/tensorflow/contrib/quantize/python/quantize_test.py
index b2e5707a6d5bccd90ae9663ad7e8bf1a0482a297..8d057d3710579ef3be93ad58a602892a7aa07edf 100644
--- a/tensorflow/contrib/quantize/python/quantize_test.py
+++ b/tensorflow/contrib/quantize/python/quantize_test.py
@@ -135,6 +135,139 @@ class QuantizeTest(test_util.TensorFlowTestCase):
       self.assertTrue('FakeQuantWithMinMaxVars' in
                       [op.type for op in bias_add_op.outputs[0].consumers()])
 
+  def testPostActivationBypassQuantized(self):
+    self._RunTestOverParameters(self._TestPostActivationBypassQuantized)
+
+  def _TestPostActivationBypassQuantized(self, is_training):
+    graph = ops.Graph()
+    with graph.as_default():
+      batch_size, height, width, depth = 5, 128, 128, 3
+      input1 = array_ops.zeros((batch_size, height, width, depth))
+      input2 = array_ops.zeros((batch_size, height / 2, width / 2, 32))
+      conv = conv2d(
+          input1,
+          32, [5, 5],
+          stride=2,
+          padding='SAME',
+          weights_initializer=self._WeightInit(0.09),
+          activation_fn=array_ops.identity,
+          scope='test/test')
+      bypass_tensor = math_ops.add(conv, input2, name='test/add')
+      _ = array_ops.identity(bypass_tensor, name='test/output')
+
+      quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8)
+
+      # Ensure that the bypass node is preceded and followed by
+      # FakeQuantWithMinMaxVars operations.
+      self.assertTrue('FakeQuantWithMinMaxVars' in
+                      [c.type for c in bypass_tensor.consumers()])
+      self.assertTrue('FakeQuantWithMinMaxVars' in
+                      [i.op.type for i in bypass_tensor.op.inputs])
+
+  def testOverlappingPostActivationBypassQuantized(self):
+    self._RunTestOverParameters(
+        self._TestOverlappingPostActivationBypassQuantized)
+
+  def _TestOverlappingPostActivationBypassQuantized(self, is_training):
+    graph = ops.Graph()
+    with graph.as_default():
+      batch_size, height, width, depth = 5, 128, 128, 3
+      conv_input = array_ops.zeros((batch_size, height, width, depth))
+      conv1 = conv2d(
+          conv_input,
+          32, [5, 5],
+          stride=2,
+          padding='SAME',
+          weights_initializer=self._WeightInit(0.09),
+          activation_fn=array_ops.identity,
+          scope='test/test1')
+
+      # The bypass of this conv is the post activation bypass of the previous
+      # conv.
+      conv2 = conv2d(
+          conv_input,
+          32, [5, 5],
+          stride=2,
+          padding='SAME',
+          weights_initializer=self._WeightInit(0.09),
+          activation_fn=None,
+          scope='test/test2')
+
+      bypass_tensor = math_ops.add(conv1, conv2, name='test/add')
+      _ = array_ops.identity(bypass_tensor, name='test/output')
+
+      quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8)
+
+      # Ensure that the bypass node is preceded and followed by
+      # FakeQuantWithMinMaxVars operations.
+      self.assertTrue('FakeQuantWithMinMaxVars' in
+                      [c.type for c in bypass_tensor.consumers()])
+      self.assertTrue('FakeQuantWithMinMaxVars' in
+                      [i.op.type for i in bypass_tensor.op.inputs])
+
+      # Ensure that all the convs and activations are quantized.
+      op_names = [op.name for op in graph.get_operations()]
+      self.assertTrue(
+          'test/test1/weights_quant/FakeQuantWithMinMaxVars' in op_names)
+      self.assertTrue(
+          'test/test2/weights_quant/FakeQuantWithMinMaxVars' in op_names)
+      self.assertTrue(
+          'test/test1/act_quant/FakeQuantWithMinMaxVars' in op_names)
+      self.assertTrue('test/act_quant/FakeQuantWithMinMaxVars' in op_names)
+      self.assertEqual(
+          'Identity',
+          graph.get_operation_by_name(
+              'test/test1/act_quant/FakeQuantWithMinMaxVars').inputs[0].op.type)
+      self.assertEqual(
+          'Identity',
+          graph.get_operation_by_name(
+              'test/act_quant/FakeQuantWithMinMaxVars').inputs[0].op.type)
+
+  def testWithNameScope(self):
+    self._RunTestOverParameters(self._TestWithNameScope)
+
+  def _TestWithNameScope(self, is_training):
+    graph = ops.Graph()
+    with graph.as_default():
+      with graph.name_scope('name_scope'):
+        batch_size, height, width, depth = 5, 128, 128, 3
+        input1 = array_ops.zeros((batch_size, height, width, depth))
+        _ = conv2d(
+            input1,
+            32, [5, 5],
+            stride=2,
+            padding='SAME',
+            weights_initializer=self._WeightInit(0.09),
+            activation_fn=None,
+            scope='test')
+
+        quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8)
+
+    for op in graph.get_operations():
+      self.assertTrue(not op.name.startswith('name_scope/name_scope/'),
+                      'Broken op: %s' % op.name)
+
+  def testWithNullNameScope(self):
+    self._RunTestOverParameters(self._TestWithNullNameScope)
+
+  def _TestWithNullNameScope(self, is_training):
+    graph = ops.Graph()
+    with graph.as_default():
+      with graph.name_scope(None):
+        batch_size, height, width, depth = 5, 128, 128, 3
+        input1 = array_ops.zeros((batch_size, height, width, depth))
+        _ = conv2d(
+            input1,
+            32, [5, 5],
+            stride=2,
+            padding='SAME',
+            weights_initializer=self._WeightInit(0.09),
+            activation_fn=None,
+            scope='test')
+
+        quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8)
+        # Passes if Quantize() does not crash.
+
   def _WeightInit(self, stddev):
     """Returns truncated normal variable initializer.
 
diff --git a/tensorflow/contrib/receptive_field/BUILD b/tensorflow/contrib/receptive_field/BUILD
index e975aeaea7ee78f8e912be8ab1be61b9acc7b418..9325a14745c1db2f8c311602143175e736fc3c5f 100644
--- a/tensorflow/contrib/receptive_field/BUILD
+++ b/tensorflow/contrib/receptive_field/BUILD
@@ -106,15 +106,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/reduce_slice_ops/BUILD b/tensorflow/contrib/reduce_slice_ops/BUILD
index b31f4488f5882a0bc4e419668dba5da72d69b7fe..02b3d66e4612d0f7eb29959d6c9f8472379fe16c 100644
--- a/tensorflow/contrib/reduce_slice_ops/BUILD
+++ b/tensorflow/contrib/reduce_slice_ops/BUILD
@@ -101,15 +101,3 @@ tf_cc_test(
         "//tensorflow/core:testlib",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/remote_fused_graph/pylib/BUILD b/tensorflow/contrib/remote_fused_graph/pylib/BUILD
index 54c66271cd43c85bc7be139dbdf70b9178ad1622..3aa8a14f44f38de51ed61f0b894cfd77ea9329f8 100644
--- a/tensorflow/contrib/remote_fused_graph/pylib/BUILD
+++ b/tensorflow/contrib/remote_fused_graph/pylib/BUILD
@@ -47,15 +47,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/resampler/BUILD b/tensorflow/contrib/resampler/BUILD
index f0ecc8b85a5db93075d3cf0b55e7df95732bcf94..48345d7030bea431152bbed934af9f500f2c15c5 100644
--- a/tensorflow/contrib/resampler/BUILD
+++ b/tensorflow/contrib/resampler/BUILD
@@ -85,14 +85,3 @@ cuda_py_test(
         "//tensorflow/python:array_ops",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/rnn/BUILD b/tensorflow/contrib/rnn/BUILD
index 7e5e35d0b55c97946c022e55180765d982eaa87a..43c0f7595590802aa80e1012967d377a6ab83d29 100644
--- a/tensorflow/contrib/rnn/BUILD
+++ b/tensorflow/contrib/rnn/BUILD
@@ -321,19 +321,6 @@ tf_cc_test(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "tools/**",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tf_gen_op_libs(
     op_lib_names = [
         "lstm_ops",
diff --git a/tensorflow/contrib/rnn/ops/gru_ops.cc b/tensorflow/contrib/rnn/ops/gru_ops.cc
index e91d1e8a80ed252e5f89e116fb0a325be67e3941..9c8e40851a0cc5bd7f37f94a62ecdef7248660c1 100644
--- a/tensorflow/contrib/rnn/ops/gru_ops.cc
+++ b/tensorflow/contrib/rnn/ops/gru_ops.cc
@@ -69,7 +69,7 @@ Element-wise dot product of a and b is represented by ab
 Element-wise dot product is represented by \circ
 Matrix multiplication is represented by *
 
-Baises are initialized with :
+Biases are initialized with :
 `b_ru` - constant_initializer(1.0)
 `b_c` - constant_initializer(0.0)
 
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/lstm_ops_test.py b/tensorflow/contrib/rnn/python/kernel_tests/lstm_ops_test.py
index 7957edf68cc8a1461fccfc2de93ad5250dc9fdb5..ffd24218944e150a32b1b915288ab1df90afb45c 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/lstm_ops_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/lstm_ops_test.py
@@ -54,7 +54,7 @@ def blocks_match(sess, use_peephole):
   initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=19890212)
 
   with variable_scope.variable_scope("test", initializer=initializer):
-    # magic naming so that the cells pick up these variables and resuse them
+    # magic naming so that the cells pick up these variables and reuse them
     if use_peephole:
       wci = variable_scope.get_variable(
           "rnn/lstm_cell/w_i_diag", shape=[cell_size], dtype=dtypes.float32)
diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
index 358b2eb02be81def1cc76c89e564700bbae15833..b12e2cd5eddc3f8abdba62781692673a40e41d9b 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
@@ -534,7 +534,7 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
       initializer: (optional) The initializer to use for the weight and
         projection matrices, default None.
       num_unit_shards: (optional) int, default 1, How to split the weight
-        matrix. If > 1,the weight matrix is stored across num_unit_shards.
+        matrix. If > 1, the weight matrix is stored across num_unit_shards.
       forget_bias: (optional) float, default 1.0, The initial bias of the
         forget gates, used to reduce the scale of forgetting at the beginning
         of the training.
@@ -993,7 +993,7 @@ class BidirectionalGridLSTMCell(GridLSTMCell):
       initializer: (optional) The initializer to use for the weight and
         projection matrices, default None.
       num_unit_shards: (optional) int, default 1, How to split the weight
-        matrix. If > 1,the weight matrix is stored across num_unit_shards.
+        matrix. If > 1, the weight matrix is stored across num_unit_shards.
       forget_bias: (optional) float, default 1.0, The initial bias of the
         forget gates, used to reduce the scale of forgetting at the beginning
         of the training.
@@ -2891,7 +2891,7 @@ class WeightNormLSTMCell(rnn_cell_impl.RNNCell):
 
     output_size = weight.get_shape().as_list()[1]
     g = vs.get_variable(name, [output_size], dtype=weight.dtype)
-    return nn_impl.l2_normalize(weight, dim=0) * g
+    return nn_impl.l2_normalize(weight, axis=0) * g
 
   def _linear(self,
               args,
diff --git a/tensorflow/contrib/saved_model/BUILD b/tensorflow/contrib/saved_model/BUILD
index b10757df47a4c38b98692c3d1995b3f040f0b478..e431c464ef14e86faf30221ed06061f41da528fb 100644
--- a/tensorflow/contrib/saved_model/BUILD
+++ b/tensorflow/contrib/saved_model/BUILD
@@ -82,15 +82,3 @@ py_test(
         "//tensorflow/python/saved_model:utils",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/saved_model/cc/saved_model/BUILD b/tensorflow/contrib/saved_model/cc/saved_model/BUILD
index ea4da80ba38389fb63a5dfe3cf608b959939c7ca..3c616c555b88cf5ec948bef3df5c2fef5caed0d4 100644
--- a/tensorflow/contrib/saved_model/cc/saved_model/BUILD
+++ b/tensorflow/contrib/saved_model/cc/saved_model/BUILD
@@ -49,9 +49,3 @@ tf_cc_test(
         "//tensorflow/core:test_main",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["*"]),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/seq2seq/BUILD b/tensorflow/contrib/seq2seq/BUILD
index ab80c68b1a8e4ff151494e393b68c460846fa8fe..a62069a252155a8bd1c6251d9dd3a4564a81c295 100644
--- a/tensorflow/contrib/seq2seq/BUILD
+++ b/tensorflow/contrib/seq2seq/BUILD
@@ -211,15 +211,3 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/seq2seq/kernels/beam_search_ops.cc b/tensorflow/contrib/seq2seq/kernels/beam_search_ops.cc
index dfa12e873a6aca806031c48d6f92e0432d0ea6e0..a9a32b7b25d6767cc1f944640722e128a9d728b5 100644
--- a/tensorflow/contrib/seq2seq/kernels/beam_search_ops.cc
+++ b/tensorflow/contrib/seq2seq/kernels/beam_search_ops.cc
@@ -74,7 +74,7 @@ class GatherTreeOp : public OpKernel {
         ctx,
         step_ids_shape.dim_size(1) == max_sequence_lengths.shape().dim_size(0),
         errors::InvalidArgument("batch size dimensions step_ids.shape[1] and "
-                                "max_seqeuence_lengths.shape[0] must match.  "
+                                "max_sequence_lengths.shape[0] must match.  "
                                 "but shapes are: ",
                                 step_ids_shape.DebugString(), " and ",
                                 max_sequence_lengths.shape().DebugString()));
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
index b427dff88b2d586ccf8c512bb498cdaf879ac781..d508cf3f9db81aa7c3a1174ed13f2310b0595b04 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
@@ -222,6 +222,9 @@ class AttentionWrapperTest(test.TestCase):
           self.assertEqual(
               (None, batch_size, None),
               tuple(state_alignment_history.get_shape().as_list()))
+        nest.assert_same_structure(
+            cell.state_size,
+            cell.zero_state(batch_size, dtypes.float32))
         # Remove the history from final_state for purposes of the
         # remainder of the tests.
         final_state = final_state._replace(alignment_history=())  # pylint: disable=protected-access
@@ -350,6 +353,42 @@ class AttentionWrapperTest(test.TestCase):
         attention_mechanism_depth=9,
         name='testLuongNotNormalized')
 
+  def testLuongScaledDType(self):
+    # Test case for GitHub issue 18099
+    for dtype in [np.float16, np.float32, np.float64]:
+      num_units = 128
+      encoder_outputs = array_ops.placeholder(dtype, shape=[64, None, 256])
+      encoder_sequence_length = array_ops.placeholder(dtypes.int32, shape=[64])
+      decoder_inputs = array_ops.placeholder(dtype, shape=[64, None, 128])
+      decoder_sequence_length = array_ops.placeholder(dtypes.int32, shape=[64])
+      batch_size = 64
+      attention_mechanism = wrapper.LuongAttention(
+          num_units=num_units,
+          memory=encoder_outputs,
+          memory_sequence_length=encoder_sequence_length,
+          scale=True,
+          dtype=dtype,
+      )
+      cell = rnn_cell.LSTMCell(num_units)
+      cell = wrapper.AttentionWrapper(cell, attention_mechanism)
+
+      helper = helper_py.TrainingHelper(decoder_inputs,
+                                        decoder_sequence_length)
+      my_decoder = basic_decoder.BasicDecoder(
+          cell=cell,
+          helper=helper,
+          initial_state=cell.zero_state(
+              dtype=dtype, batch_size=batch_size))
+
+      final_outputs, final_state, _ = decoder.dynamic_decode(my_decoder)
+      self.assertTrue(
+          isinstance(final_outputs, basic_decoder.BasicDecoderOutput))
+      self.assertEqual(final_outputs.rnn_output.dtype, dtype)
+      self.assertTrue(
+          isinstance(final_state, wrapper.AttentionWrapperState))
+      self.assertTrue(
+          isinstance(final_state.cell_state, rnn_cell.LSTMStateTuple))
+
   def testLuongScaled(self):
     create_attention_mechanism = functools.partial(
         wrapper.LuongAttention, scale=True)
@@ -782,26 +821,31 @@ class AttentionWrapperTest(test.TestCase):
         wrapper.BahdanauAttention, wrapper.LuongAttention)
 
     expected_final_output = BasicDecoderOutput(
-        rnn_output=ResultSummary(
-            shape=(5, 3, 20), dtype=dtype('float32'), mean=0.11798714846372604),
-        sample_id=ResultSummary(
-            shape=(5, 3), dtype=dtype('int32'), mean=7.933333333333334))
+        rnn_output=ResultSummary(shape=(5, 3, 20),
+                                 dtype=dtype('float32'),
+                                 mean=0.11723966),
+        sample_id=ResultSummary(shape=(5, 3),
+                                dtype=dtype('int32'),
+                                mean=9.2666666666666675))
     expected_final_state = AttentionWrapperState(
         cell_state=LSTMStateTuple(
-            c=ResultSummary(
-                shape=(5, 9), dtype=dtype('float32'), mean=-0.0036486709),
-            h=ResultSummary(
-                shape=(5, 9), dtype=dtype('float32'), mean=-0.0018835809)),
-        attention=ResultSummary(
-            shape=(5, 20), dtype=dtype('float32'), mean=0.11798714846372604),
+            c=ResultSummary(shape=(5, 9),
+                            dtype=dtype('float32'),
+                            mean=-0.003545674),
+            h=ResultSummary(shape=(5, 9),
+                            dtype=dtype('float32'),
+                            mean=-0.0018327223)),
+        attention=ResultSummary(shape=(5, 20),
+                                dtype=dtype('float32'),
+                                mean=0.11728073),
         time=3,
         alignments=(
             ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125),
             ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125)),
+        alignment_history=(),
         attention_state=(
             ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125),
-            ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125)),
-        alignment_history=())
+            ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125)))
     expected_final_alignment_history = (
         ResultSummary(shape=(3, 5, 8), dtype=dtype('float32'), mean=0.125),
         ResultSummary(shape=(3, 5, 8), dtype=dtype('float32'), mean=0.125))
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py
index 926554031775202d7f7d9018cf6ae4efb34fe96b..178328619f087789df040489cd150ba018cc8d14 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py
@@ -27,6 +27,7 @@ from tensorflow.contrib.seq2seq.python.ops import beam_search_ops
 from tensorflow.contrib.seq2seq.python.ops import decoder
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.layers import core as layers_core
 from tensorflow.python.ops import array_ops
@@ -70,6 +71,98 @@ class TestGatherTree(test.TestCase):
 
     self.assertAllEqual(expected_result, res_)
 
+  def _test_gather_tree_from_array(self,
+                                   depth_ndims=0,
+                                   merged_batch_beam=False):
+    array = np.array(
+        [[[1, 2, 3], [4, 5, 6], [7, 8, 9], [0, 0, 0]],
+         [[2, 3, 4], [5, 6, 7], [8, 9, 10], [11, 12, 0]]]).transpose([1, 0, 2])
+    parent_ids = np.array(
+        [[[0, 0, 0], [0, 1, 1], [2, 1, 2], [-1, -1, -1]],
+         [[0, 0, 0], [1, 1, 0], [2, 0, 1], [0, 1, 0]]]).transpose([1, 0, 2])
+    expected_array = np.array(
+        [[[2, 2, 2], [6, 5, 6], [7, 8, 9], [0, 0, 0]],
+         [[2, 3, 2], [7, 5, 7], [8, 9, 8], [11, 12, 0]]]).transpose([1, 0, 2])
+    sequence_length = [[3, 3, 3], [4, 4, 3]]
+
+    array = ops.convert_to_tensor(
+        array, dtype=dtypes.float32)
+    parent_ids = ops.convert_to_tensor(
+        parent_ids, dtype=dtypes.int32)
+    expected_array = ops.convert_to_tensor(
+        expected_array, dtype=dtypes.float32)
+
+    max_time = array_ops.shape(array)[0]
+    batch_size = array_ops.shape(array)[1]
+    beam_width = array_ops.shape(array)[2]
+
+    def _tile_in_depth(tensor):
+      # Generate higher rank tensors by concatenating tensor and tensor + 1.
+      for _ in range(depth_ndims):
+        tensor = array_ops.stack([tensor, tensor + 1], -1)
+      return tensor
+
+    if merged_batch_beam:
+      array = array_ops.reshape(
+          array, [max_time, batch_size * beam_width])
+      expected_array = array_ops.reshape(
+          expected_array, [max_time, batch_size * beam_width])
+
+    if depth_ndims > 0:
+      array = _tile_in_depth(array)
+      expected_array = _tile_in_depth(expected_array)
+
+    sorted_array = beam_search_decoder.gather_tree_from_array(
+        array, parent_ids, sequence_length)
+
+    with self.test_session() as sess:
+      sorted_array = sess.run(sorted_array)
+      expected_array = sess.run(expected_array)
+      self.assertAllEqual(expected_array, sorted_array)
+
+  def test_gather_tree_from_array_scalar(self):
+    self._test_gather_tree_from_array()
+
+  def test_gather_tree_from_array_1d(self):
+    self._test_gather_tree_from_array(depth_ndims=1)
+
+  def test_gather_tree_from_array_1d_with_merged_batch_beam(self):
+    self._test_gather_tree_from_array(depth_ndims=1, merged_batch_beam=True)
+
+  def test_gather_tree_from_array_2d(self):
+    self._test_gather_tree_from_array(depth_ndims=2)
+
+
+class TestArrayShapeChecks(test.TestCase):
+
+  def _test_array_shape_dynamic_checks(self, static_shape, dynamic_shape,
+                                       batch_size, beam_width, is_valid=True):
+    t = array_ops.placeholder_with_default(
+        np.random.randn(*static_shape).astype(np.float32),
+        shape=dynamic_shape)
+
+    batch_size = array_ops.constant(batch_size)
+    check_op = beam_search_decoder._check_batch_beam(t, batch_size, beam_width)  # pylint: disable=protected-access
+
+    with self.test_session() as sess:
+      if is_valid:
+        sess.run(check_op)
+      else:
+        with self.assertRaises(errors.InvalidArgumentError):
+          sess.run(check_op)
+
+  def test_array_shape_dynamic_checks(self):
+    self._test_array_shape_dynamic_checks(
+        (8, 4, 5, 10), (None, None, 5, 10), 4, 5, is_valid=True)
+    self._test_array_shape_dynamic_checks(
+        (8, 20, 10), (None, None, 10), 4, 5, is_valid=True)
+    self._test_array_shape_dynamic_checks(
+        (8, 21, 10), (None, None, 10), 4, 5, is_valid=False)
+    self._test_array_shape_dynamic_checks(
+        (8, 4, 6, 10), (None, None, None, 10), 4, 5, is_valid=False)
+    self._test_array_shape_dynamic_checks(
+        (8, 4), (None, None), 4, 5, is_valid=False)
+
 
 class TestEosMasking(test.TestCase):
   """Tests EOS masking used in beam search."""
@@ -319,7 +412,8 @@ class TestLargeBeamStep(test.TestCase):
 
 class BeamSearchDecoderTest(test.TestCase):
 
-  def _testDynamicDecodeRNN(self, time_major, has_attention):
+  def _testDynamicDecodeRNN(self, time_major, has_attention,
+                            with_alignment_history=False):
     encoder_sequence_length = np.array([3, 2, 3, 1, 1])
     decoder_sequence_length = np.array([2, 0, 1, 2, 3])
     batch_size = 5
@@ -359,7 +453,7 @@ class BeamSearchDecoderTest(test.TestCase):
             cell=cell,
             attention_mechanism=attention_mechanism,
             attention_layer_size=attention_depth,
-            alignment_history=False)
+            alignment_history=with_alignment_history)
       cell_state = cell.zero_state(
           dtype=dtypes.float32, batch_size=batch_size_tensor * beam_width)
       if has_attention:
@@ -420,6 +514,12 @@ class BeamSearchDecoderTest(test.TestCase):
   def testDynamicDecodeRNNBatchMajorYesAttention(self):
     self._testDynamicDecodeRNN(time_major=False, has_attention=True)
 
+  def testDynamicDecodeRNNBatchMajorYesAttentionWithAlignmentHistory(self):
+    self._testDynamicDecodeRNN(
+        time_major=False,
+        has_attention=True,
+        with_alignment_history=True)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
index f8da5a3e1729a300a7bef3547d1819d7363a7194..f0f143ddfcf17c0e471add804ac4920b02da68e0 100644
--- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
@@ -339,7 +339,8 @@ def _luong_score(query, keys, scale):
   if scale:
     # Scalar used in weight scaling
     g = variable_scope.get_variable(
-        "attention_g", dtype=dtype, initializer=1.)
+        "attention_g", dtype=dtype,
+        initializer=init_ops.ones_initializer, shape=())
     score = g * score
   return score
 
@@ -609,8 +610,8 @@ def monotonic_attention(p_choose_i, previous_attention, mode):
   addition, once an input sequence element is attended to at a given output
   timestep, elements occurring before it cannot be attended to at subsequent
   output timesteps.  This function generates attention distributions according
-  to these assumptions.  For more information, see ``Online and Linear-Time
-  Attention by Enforcing Monotonic Alignments''.
+  to these assumptions.  For more information, see `Online and Linear-Time
+  Attention by Enforcing Monotonic Alignments`.
 
   Args:
     p_choose_i: Probability of choosing input sequence/memory element i.  Should
@@ -736,7 +737,7 @@ class _BaseMonotonicAttentionMechanism(_BaseAttentionMechanism):
   """Base attention mechanism for monotonic attention.
 
   Simply overrides the initial_alignments function to provide a dirac
-  distribution,which is needed in order for the monotonic attention
+  distribution, which is needed in order for the monotonic attention
   distributions to have the correct behavior.
   """
 
@@ -763,7 +764,7 @@ class _BaseMonotonicAttentionMechanism(_BaseAttentionMechanism):
 class BahdanauMonotonicAttention(_BaseMonotonicAttentionMechanism):
   """Monotonic attention mechanism with Bahadanau-style energy function.
 
-  This type of attention encorces a monotonic constraint on the attention
+  This type of attention enforces a monotonic constraint on the attention
   distributions; that is once the model attends to a given point in the memory
   it can't attend to any prior points at subsequence output timesteps.  It
   achieves this by using the _monotonic_probability_fn instead of softmax to
@@ -867,7 +868,7 @@ class BahdanauMonotonicAttention(_BaseMonotonicAttentionMechanism):
 class LuongMonotonicAttention(_BaseMonotonicAttentionMechanism):
   """Monotonic attention mechanism with Luong-style energy function.
 
-  This type of attention encorces a monotonic constraint on the attention
+  This type of attention enforces a monotonic constraint on the attention
   distributions; that is once the model attends to a given point in the memory
   it can't attend to any prior points at subsequence output timesteps.  It
   achieves this by using the _monotonic_probability_fn instead of softmax to
@@ -1133,7 +1134,7 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
       output_attention: Python bool.  If `True` (default), the output at each
         time step is the attention value.  This is the behavior of Luong-style
         attention mechanisms.  If `False`, the output at each time step is
-        the output of `cell`.  This is the beahvior of Bhadanau-style
+        the output of `cell`.  This is the behavior of Bhadanau-style
         attention mechanisms.  In both cases, the `attention` tensor is
         propagated to the next time step via the state and is used there.
         This flag only controls whether the attention mechanism is propagated
@@ -1278,7 +1279,8 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
         attention_state=self._item_or_tuple(
             a.state_size for a in self._attention_mechanisms),
         alignment_history=self._item_or_tuple(
-            () for _ in self._attention_mechanisms))  # sometimes a TensorArray
+            a.alignments_size if self._alignment_history else ()
+            for a in self._attention_mechanisms))  # sometimes a TensorArray
 
   def zero_state(self, batch_size, dtype):
     """Return an initial (zero) state tuple for this `AttentionWrapper`.
@@ -1318,22 +1320,26 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
         cell_state = nest.map_structure(
             lambda s: array_ops.identity(s, name="checked_cell_state"),
             cell_state)
+      initial_alignments = [
+          attention_mechanism.initial_alignments(batch_size, dtype)
+          for attention_mechanism in self._attention_mechanisms]
       return AttentionWrapperState(
           cell_state=cell_state,
           time=array_ops.zeros([], dtype=dtypes.int32),
           attention=_zero_state_tensors(self._attention_layer_size, batch_size,
                                         dtype),
-          alignments=self._item_or_tuple(
-              attention_mechanism.initial_alignments(batch_size, dtype)
-              for attention_mechanism in self._attention_mechanisms),
+          alignments=self._item_or_tuple(initial_alignments),
           attention_state=self._item_or_tuple(
               attention_mechanism.initial_state(batch_size, dtype)
               for attention_mechanism in self._attention_mechanisms),
           alignment_history=self._item_or_tuple(
-              tensor_array_ops.TensorArray(dtype=dtype, size=0,
-                                           dynamic_size=True)
+              tensor_array_ops.TensorArray(
+                  dtype,
+                  size=0,
+                  dynamic_size=True,
+                  element_shape=alignment.shape)
               if self._alignment_history else ()
-              for _ in self._attention_mechanisms))
+              for alignment in initial_alignments))
 
   def call(self, inputs, state):
     """Perform a step of attention-wrapped RNN.
diff --git a/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
index 6adbb8be40fb183fadd15c3155aca287ac6563b7..184144f64a56358206014a0f75473b4a9b16617a 100644
--- a/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
+++ b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
@@ -35,6 +35,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import tensor_array_ops
+from tensorflow.python.platform import tf_logging
 from tensorflow.python.util import nest
 
 __all__ = [
@@ -121,14 +122,114 @@ def tile_batch(t, multiplier, name=None):
     return nest.map_structure(lambda t_: _tile_batch(t_, multiplier), t)
 
 
+def gather_tree_from_array(t, parent_ids, sequence_length):
+  """Calculates the full beams for `TensorArray`s.
+
+  Args:
+    t: A stacked `TensorArray` of size `max_time` that contains `Tensor`s of
+      shape `[batch_size, beam_width, s]` or `[batch_size * beam_width, s]`
+      where `s` is the depth shape.
+    parent_ids: The parent ids of shape `[max_time, batch_size, beam_width]`.
+    sequence_length: The sequence length of shape `[batch_size, beam_width]`.
+
+  Returns:
+    A `Tensor` which is a stacked `TensorArray` of the same size and type as
+    `t` and where beams are sorted in each `Tensor` according to `parent_ids`.
+  """
+  max_time = parent_ids.shape[0].value or array_ops.shape(parent_ids)[0]
+  batch_size = parent_ids.shape[1].value or array_ops.shape(parent_ids)[1]
+  beam_width = parent_ids.shape[2].value or array_ops.shape(parent_ids)[2]
+
+  # Generate beam ids that will be reordered by gather_tree.
+  beam_ids = array_ops.expand_dims(
+      array_ops.expand_dims(math_ops.range(beam_width), 0), 0)
+  beam_ids = array_ops.tile(beam_ids, [max_time, batch_size, 1])
+
+  mask = array_ops.sequence_mask(
+      sequence_length, maxlen=max_time, dtype=dtypes.int32)
+  mask = array_ops.transpose(mask, perm=[2, 0, 1])
+
+  # Use beam_width + 1 to mark the end of beam.
+  masked_beam_ids = (beam_ids * mask) + (1 - mask) * (beam_width + 1)
+
+  max_sequence_lengths = math_ops.to_int32(
+      math_ops.reduce_max(sequence_length, axis=1))
+  sorted_beam_ids = beam_search_ops.gather_tree(
+      step_ids=masked_beam_ids,
+      parent_ids=parent_ids,
+      max_sequence_lengths=max_sequence_lengths,
+      end_token=beam_width + 1)
+
+  # For out of range steps, simply copy the same beam.
+  sorted_beam_ids = array_ops.where(
+      math_ops.cast(mask, dtypes.bool), x=sorted_beam_ids, y=beam_ids)
+
+  # Generate indices for gather_nd.
+  time_ind = array_ops.tile(array_ops.reshape(
+      math_ops.range(max_time), [-1, 1, 1]), [1, batch_size, beam_width])
+  batch_ind = array_ops.tile(array_ops.reshape(
+      math_ops.range(batch_size), [-1, 1, 1]), [1, max_time, beam_width])
+  batch_ind = array_ops.transpose(batch_ind, perm=[1, 0, 2])
+  indices = array_ops.stack([time_ind, batch_ind, sorted_beam_ids], -1)
+
+  # Gather from a tensor with collapsed additional dimensions.
+  gather_from = t
+  final_shape = array_ops.shape(gather_from)
+  gather_from = array_ops.reshape(
+      gather_from, [max_time, batch_size, beam_width, -1])
+  ordered = array_ops.gather_nd(gather_from, indices)
+  ordered = array_ops.reshape(ordered, final_shape)
+
+  return ordered
+
+
 def _check_maybe(t):
-  if isinstance(t, tensor_array_ops.TensorArray):
-    raise TypeError(
-        "TensorArray state is not supported by BeamSearchDecoder: %s" % t.name)
   if t.shape.ndims is None:
     raise ValueError(
         "Expected tensor (%s) to have known rank, but ndims == None." % t)
 
+def _check_static_batch_beam_maybe(shape, batch_size, beam_width):
+  """Raises an exception if dimensions are known statically and can not be
+  reshaped to [batch_size, beam_size, -1].
+  """
+  reshaped_shape = tensor_shape.TensorShape([batch_size, beam_width, None])
+  if (batch_size is not None and shape[0].value is not None
+      and (shape[0] != batch_size * beam_width
+           or (shape.ndims >= 2 and shape[1].value is not None
+               and (shape[0] != batch_size or shape[1] != beam_width)))):
+    tf_logging.warn("TensorArray reordering expects elements to be "
+                    "reshapable to %s which is incompatible with the "
+                    "current shape %s. Consider setting "
+                    "reorder_tensor_arrays to False to disable TensorArray "
+                    "reordering during the beam search."
+                    % (reshaped_shape, shape))
+    return False
+  return True
+
+def _check_batch_beam(t, batch_size, beam_width):
+  """Returns an Assert operation checking that the elements of the stacked
+  TensorArray can be reshaped to [batch_size, beam_size, -1]. At this point,
+  the TensorArray elements have a known rank of at least 1.
+  """
+  error_message = ("TensorArray reordering expects elements to be "
+                   "reshapable to [batch_size, beam_size, -1] which is "
+                   "incompatible with the dynamic shape of %s elements. "
+                   "Consider setting reorder_tensor_arrays to False to disable "
+                   "TensorArray reordering during the beam search."
+                   % (t.name))
+  rank = t.shape.ndims
+  shape = array_ops.shape(t)
+  if rank == 2:
+    condition = math_ops.equal(shape[1], batch_size * beam_width)
+  else:
+    condition = math_ops.logical_or(
+        math_ops.equal(shape[1], batch_size * beam_width),
+        math_ops.logical_and(
+            math_ops.equal(shape[1], batch_size),
+            math_ops.equal(shape[2], beam_width)))
+  return control_flow_ops.Assert(condition, [error_message])
+
+
 
 class BeamSearchDecoder(decoder.Decoder):
   """BeamSearch sampling decoder.
@@ -173,7 +274,8 @@ class BeamSearchDecoder(decoder.Decoder):
                initial_state,
                beam_width,
                output_layer=None,
-               length_penalty_weight=0.0):
+               length_penalty_weight=0.0,
+               reorder_tensor_arrays=True):
     """Initialize the BeamSearchDecoder.
 
     Args:
@@ -188,6 +290,12 @@ class BeamSearchDecoder(decoder.Decoder):
         `tf.layers.Dense`.  Optional layer to apply to the RNN output prior
         to storing the result or sampling.
       length_penalty_weight: Float weight to penalize length. Disabled with 0.0.
+      reorder_tensor_arrays: If `True`, `TensorArray`s' elements within the cell
+        state will be reordered according to the beam search path. If the
+        `TensorArray` can be reordered, the stacked form will be returned.
+        Otherwise, the `TensorArray` will be returned as is. Set this flag to
+        `False` if the cell state contains `TensorArray`s that are not amenable
+        to reordering.
 
     Raises:
       TypeError: if `cell` is not an instance of `RNNCell`,
@@ -202,6 +310,7 @@ class BeamSearchDecoder(decoder.Decoder):
           "output_layer must be a Layer, received: %s" % type(output_layer))
     self._cell = cell
     self._output_layer = output_layer
+    self._reorder_tensor_arrays = reorder_tensor_arrays
 
     if callable(embedding):
       self._embedding_fn = embedding
@@ -342,6 +451,11 @@ class BeamSearchDecoder(decoder.Decoder):
         outputs.parent_ids,
         max_sequence_lengths=max_sequence_lengths,
         end_token=self._end_token)
+    if self._reorder_tensor_arrays:
+      final_state = final_state._replace(cell_state=nest.map_structure(
+          lambda t: self._maybe_sort_array_beams(
+              t, outputs.parent_ids, final_state.lengths),
+          final_state.cell_state))
     outputs = FinalBeamSearchDecoderOutput(
         beam_search_decoder_output=outputs, predicted_ids=predicted_ids)
     return outputs, final_state
@@ -432,9 +546,10 @@ class BeamSearchDecoder(decoder.Decoder):
       returned unchanged.
 
     Raises:
-      TypeError: If `t` is an instance of `TensorArray`.
       ValueError: If the rank of `t` is not statically known.
     """
+    if isinstance(t, tensor_array_ops.TensorArray):
+      return t
     _check_maybe(t)
     if t.shape.ndims >= 1:
       return self._split_batch_beams(t, s)
@@ -455,15 +570,55 @@ class BeamSearchDecoder(decoder.Decoder):
       A reshaped version of t with shape `[batch_size, beam_width] + s`.
 
     Raises:
-      TypeError: If `t` is an instance of `TensorArray`.
       ValueError:  If the rank of `t` is not statically known.
     """
+    if isinstance(t, tensor_array_ops.TensorArray):
+      return t
     _check_maybe(t)
     if t.shape.ndims >= 2:
       return self._merge_batch_beams(t, s)
     else:
       return t
 
+  def _maybe_sort_array_beams(self, t, parent_ids, sequence_length):
+    """Maybe sorts beams within a `TensorArray`.
+
+    Args:
+      t: A `TensorArray` of size `max_time` that contains `Tensor`s of shape
+        `[batch_size, beam_width, s]` or `[batch_size * beam_width, s]` where
+        `s` is the depth shape.
+      parent_ids: The parent ids of shape `[max_time, batch_size, beam_width]`.
+      sequence_length: The sequence length of shape `[batch_size, beam_width]`.
+
+    Returns:
+      A `TensorArray` where beams are sorted in each `Tensor` or `t` itself if
+      it is not a `TensorArray` or does not meet shape requirements.
+    """
+    if not isinstance(t, tensor_array_ops.TensorArray):
+      return t
+    # pylint: disable=protected-access
+    if (not t._infer_shape or not t._element_shape
+        or t._element_shape[0].ndims is None
+        or t._element_shape[0].ndims < 1):
+      shape = (
+          t._element_shape[0] if t._infer_shape and t._element_shape
+          else tensor_shape.TensorShape(None))
+      tf_logging.warn("The TensorArray %s in the cell state is not amenable to "
+                      "sorting based on the beam search result. For a "
+                      "TensorArray to be sorted, its elements shape must be "
+                      "defined and have at least a rank of 1, but saw shape: %s"
+                      % (t.handle.name, shape))
+      return t
+    shape = t._element_shape[0]
+    # pylint: enable=protected-access
+    if not _check_static_batch_beam_maybe(
+        shape, tensor_util.constant_value(self._batch_size), self._beam_width):
+      return t
+    t = t.stack()
+    with ops.control_dependencies(
+        [_check_batch_beam(t, self._batch_size, self._beam_width)]):
+      return gather_tree_from_array(t, parent_ids, sequence_length)
+
   def step(self, time, inputs, state, name=None):
     """Perform a decoding step.
 
@@ -666,9 +821,9 @@ def _get_scores(log_probs, sequence_lengths, length_penalty_weight):
   Returns:
     The scores normalized by the length_penalty.
   """
-  length_penality_ = _length_penalty(
+  length_penalty_ = _length_penalty(
       sequence_lengths=sequence_lengths, penalty_factor=length_penalty_weight)
-  return log_probs / length_penality_
+  return log_probs / length_penalty_
 
 
 def _length_penalty(sequence_lengths, penalty_factor):
@@ -705,7 +860,7 @@ def _mask_probs(probs, eos_token, finished):
   unfinished beams remain unchanged.
 
   Args:
-    probs: Log probabiltiies of shape `[batch_size, beam_width, vocab_size]`
+    probs: Log probabilities of shape `[batch_size, beam_width, vocab_size]`
     eos_token: An int32 id corresponding to the EOS token to allocate
       probability to.
     finished: A boolean tensor of shape `[batch_size, beam_width]` that
@@ -758,6 +913,8 @@ def _maybe_tensor_gather_helper(gather_indices, gather_from, batch_size,
     output: Gathered tensor of shape tf.shape(gather_from)[:1+len(gather_shape)]
       or the original tensor if its dimensions are too small.
   """
+  if isinstance(gather_from, tensor_array_ops.TensorArray):
+    return gather_from
   _check_maybe(gather_from)
   if gather_from.shape.ndims >= len(gather_shape):
     return _tensor_gather_helper(
diff --git a/tensorflow/contrib/seq2seq/python/ops/decoder.py b/tensorflow/contrib/seq2seq/python/ops/decoder.py
index f14974b9d5ca8cbcfd9f91086ca0a90ceff48f43..898493662d7594f9996400a9636378db3c6b4cd1 100644
--- a/tensorflow/contrib/seq2seq/python/ops/decoder.py
+++ b/tensorflow/contrib/seq2seq/python/ops/decoder.py
@@ -30,6 +30,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import rnn
+from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.util import nest
@@ -39,6 +40,7 @@ __all__ = ["Decoder", "dynamic_decode"]
 
 
 _transpose_batch_time = rnn._transpose_batch_time  # pylint: disable=protected-access
+_zero_state_tensors = rnn_cell_impl._zero_state_tensors  # pylint: disable=protected-access
 
 
 @six.add_metaclass(abc.ABCMeta)
@@ -133,16 +135,8 @@ class Decoder(object):
 
 def _create_zero_outputs(size, dtype, batch_size):
   """Create a zero outputs Tensor structure."""
-  def _t(s):
-    return (s if isinstance(s, ops.Tensor) else constant_op.constant(
-        tensor_shape.TensorShape(s).as_list(),
-        dtype=dtypes.int32,
-        name="zero_suffix_shape"))
-
   def _create(s, d):
-    return array_ops.zeros(
-        array_ops.concat(
-            ([batch_size], _t(s)), axis=0), dtype=d)
+    return _zero_state_tensors(s, batch_size, d)
 
   return nest.map_structure(_create, size, dtype)
 
@@ -212,7 +206,8 @@ def dynamic_decode(decoder,
     initial_time = constant_op.constant(0, dtype=dtypes.int32)
 
     def _shape(batch_size, from_shape):
-      if not isinstance(from_shape, tensor_shape.TensorShape):
+      if (not isinstance(from_shape, tensor_shape.TensorShape) or
+          from_shape.ndims == 0):
         return tensor_shape.TensorShape(None)
       else:
         batch_size = tensor_util.constant_value(
diff --git a/tensorflow/contrib/session_bundle/BUILD b/tensorflow/contrib/session_bundle/BUILD
index 3ad88a8a22966a072e0c43d6c41369333aba5cbe..9c0885918071c25ab65cb4044bc19ea22c55442a 100644
--- a/tensorflow/contrib/session_bundle/BUILD
+++ b/tensorflow/contrib/session_bundle/BUILD
@@ -1,9 +1,7 @@
 # Description:
 #   TensorFlow Serving session bundle.
 
-package(
-    default_visibility = ["//visibility:public"],
-)
+package(default_visibility = ["//visibility:public"])
 
 licenses(["notice"])  # Apache 2.0
 
@@ -19,18 +17,6 @@ load(
     "tf_cc_test",
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "g3doc/sitemap.md",
-        ],
-    ),
-)
-
 # TODO(b/32673259): add a test to continuously validate these files.
 filegroup(
     name = "session_bundle_half_plus_two",
diff --git a/tensorflow/contrib/session_bundle/example/BUILD b/tensorflow/contrib/session_bundle/example/BUILD
index dbbae01f3661b81f35350470c08ec65b3488b7fc..9a56eab431d66c53c1c51341d48bf47eb8926829 100644
--- a/tensorflow/contrib/session_bundle/example/BUILD
+++ b/tensorflow/contrib/session_bundle/example/BUILD
@@ -10,19 +10,6 @@ exports_files(["LICENSE"])
 
 # vardef("PYTHON_BIN_PATH", "/usr/bin/python")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "g3doc/sitemap.md",
-        ],
-    ),
-    visibility = ["//visibility:public"],
-)
-
 py_binary(
     name = "export_half_plus_two",
     srcs = [
diff --git a/tensorflow/contrib/session_bundle/session_bundle_test.cc b/tensorflow/contrib/session_bundle/session_bundle_test.cc
index 6d997bac9ee8e0fe242455686cc00a016d9bd768..612623ae309f6393beb258138b7b795c2a25d4e1 100644
--- a/tensorflow/contrib/session_bundle/session_bundle_test.cc
+++ b/tensorflow/contrib/session_bundle/session_bundle_test.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/public/session.h"
@@ -239,8 +240,8 @@ TEST(LoadSessionBundleFromPath, BasicTestRunOptionsThreadPoolInvalid) {
 
   // Expect failed session run calls with invalid run-options.
   EXPECT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("Invalid inter_op_thread_pool: 2"))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(),
+                                    "Invalid inter_op_thread_pool: 2"))
       << status.error_message();
 }
 
@@ -314,8 +315,8 @@ TEST_F(SessionBundleTest, ServingGraphEmpty) {
   });
   status_ = LoadSessionBundleFromPath(options_, path, &bundle_);
   EXPECT_FALSE(status_.ok());
-  EXPECT_TRUE(StringPiece(status_.error_message())
-                  .contains("Expected exactly one serving GraphDef"))
+  EXPECT_TRUE(str_util::StrContains(status_.error_message(),
+                                    "Expected exactly one serving GraphDef"))
       << status_.error_message();
 }
 
@@ -330,8 +331,9 @@ TEST_F(SessionBundleTest, ServingGraphAnyIncorrectType) {
   });
   status_ = LoadSessionBundleFromPath(options_, path, &bundle_);
   EXPECT_FALSE(status_.ok());
-  EXPECT_TRUE(StringPiece(status_.error_message())
-                  .contains("Expected Any type_url for: tensorflow.GraphDef"))
+  EXPECT_TRUE(
+      str_util::StrContains(status_.error_message(),
+                            "Expected Any type_url for: tensorflow.GraphDef"))
       << status_.error_message();
 }
 
@@ -347,7 +349,8 @@ TEST_F(SessionBundleTest, ServingGraphAnyValueCorrupted) {
   });
   status_ = LoadSessionBundleFromPath(options_, path, &bundle_);
   EXPECT_FALSE(status_.ok());
-  EXPECT_TRUE(StringPiece(status_.error_message()).contains("Failed to unpack"))
+  EXPECT_TRUE(
+      str_util::StrContains(status_.error_message(), "Failed to unpack"))
       << status_.error_message();
 }
 
@@ -362,9 +365,9 @@ TEST_F(SessionBundleTest, AssetFileAnyIncorrectType) {
   });
   status_ = LoadSessionBundleFromPath(options_, path, &bundle_);
   EXPECT_FALSE(status_.ok());
-  EXPECT_TRUE(
-      StringPiece(status_.error_message())
-          .contains("Expected Any type_url for: tensorflow.serving.AssetFile"))
+  EXPECT_TRUE(str_util::StrContains(
+      status_.error_message(),
+      "Expected Any type_url for: tensorflow.serving.AssetFile"))
       << status_.error_message();
 }
 
@@ -380,7 +383,8 @@ TEST_F(SessionBundleTest, AssetFileAnyValueCorrupted) {
   });
   status_ = LoadSessionBundleFromPath(options_, path, &bundle_);
   EXPECT_FALSE(status_.ok());
-  EXPECT_TRUE(StringPiece(status_.error_message()).contains("Failed to unpack"))
+  EXPECT_TRUE(
+      str_util::StrContains(status_.error_message(), "Failed to unpack"))
       << status_.error_message();
 }
 
@@ -395,8 +399,8 @@ TEST_F(SessionBundleTest, InitOpTooManyValues) {
   });
   status_ = LoadSessionBundleFromPath(options_, path, &bundle_);
   EXPECT_FALSE(status_.ok());
-  EXPECT_TRUE(StringPiece(status_.error_message())
-                  .contains("Expected exactly one serving init op"))
+  EXPECT_TRUE(str_util::StrContains(status_.error_message(),
+                                    "Expected exactly one serving init op"))
       << status_.error_message();
 }
 
diff --git a/tensorflow/contrib/session_bundle/signature_test.cc b/tensorflow/contrib/session_bundle/signature_test.cc
index 741b7fde9bdb40e8d0d7e4396676dfff036970d6..b1ff55552e0932ddc100adc4a257016fa3923120 100644
--- a/tensorflow/contrib/session_bundle/signature_test.cc
+++ b/tensorflow/contrib/session_bundle/signature_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session.h"
 
@@ -33,8 +34,8 @@ namespace tensorflow {
 namespace serving {
 namespace {
 
-static bool HasSubstr(const string& base, const string& substr) {
-  bool ok = StringPiece(base).contains(substr);
+static bool HasSubstr(StringPiece base, StringPiece substr) {
+  bool ok = str_util::StrContains(base, substr);
   EXPECT_TRUE(ok) << base << ", expected substring " << substr;
   return ok;
 }
@@ -69,8 +70,8 @@ TEST(GetClassificationSignature, MissingSignature) {
   ClassificationSignature signature;
   const Status status = GetClassificationSignature(meta_graph_def, &signature);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("Expected a classification signature"))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(),
+                                    "Expected a classification signature"))
       << status.error_message();
 }
 
@@ -86,8 +87,8 @@ TEST(GetClassificationSignature, WrongSignatureType) {
   ClassificationSignature signature;
   const Status status = GetClassificationSignature(meta_graph_def, &signature);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("Expected a classification signature"))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(),
+                                    "Expected a classification signature"))
       << status.error_message();
 }
 
@@ -122,8 +123,8 @@ TEST(GetNamedClassificationSignature, MissingSignature) {
   const Status status =
       GetNamedClassificationSignature("foo", meta_graph_def, &signature);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("Missing signature named \"foo\""))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(),
+                                    "Missing signature named \"foo\""))
       << status.error_message();
 }
 
@@ -141,9 +142,9 @@ TEST(GetNamedClassificationSignature, WrongSignatureType) {
   const Status status =
       GetNamedClassificationSignature("foo", meta_graph_def, &signature);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(
-      StringPiece(status.error_message())
-          .contains("Expected a classification signature for name \"foo\""))
+  EXPECT_TRUE(str_util::StrContains(
+      status.error_message(),
+      "Expected a classification signature for name \"foo\""))
       << status.error_message();
 }
 
@@ -176,8 +177,8 @@ TEST(GetRegressionSignature, MissingSignature) {
   RegressionSignature signature;
   const Status status = GetRegressionSignature(meta_graph_def, &signature);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("Expected a regression signature"))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(),
+                                    "Expected a regression signature"))
       << status.error_message();
 }
 
@@ -193,8 +194,8 @@ TEST(GetRegressionSignature, WrongSignatureType) {
   RegressionSignature signature;
   const Status status = GetRegressionSignature(meta_graph_def, &signature);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("Expected a regression signature"))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(),
+                                    "Expected a regression signature"))
       << status.error_message();
 }
 
@@ -227,8 +228,8 @@ TEST(GetNamedSignature, MissingSignature) {
   Signature signature;
   const Status status = GetNamedSignature("foo", meta_graph_def, &signature);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("Missing signature named \"foo\""))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(),
+                                    "Missing signature named \"foo\""))
       << status.error_message();
 }
 
@@ -370,7 +371,7 @@ TEST(RunClassification, RunNotOk) {
   const Status status = RunClassification(signature, input_tensor, &session,
                                           &classes_tensor, nullptr);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message()).contains("Data is gone"))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(), "Data is gone"))
       << status.error_message();
 }
 
@@ -386,7 +387,8 @@ TEST(RunClassification, TooManyOutputs) {
   const Status status = RunClassification(signature, input_tensor, &session,
                                           &classes_tensor, nullptr);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message()).contains("Expected 1 output"))
+  EXPECT_TRUE(
+      str_util::StrContains(status.error_message(), "Expected 1 output"))
       << status.error_message();
 }
 
@@ -402,8 +404,9 @@ TEST(RunClassification, WrongBatchOutputs) {
   const Status status = RunClassification(signature, input_tensor, &session,
                                           &classes_tensor, nullptr);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("Input batch size did not match output batch size"))
+  EXPECT_TRUE(
+      str_util::StrContains(status.error_message(),
+                            "Input batch size did not match output batch size"))
       << status.error_message();
 }
 
@@ -449,7 +452,7 @@ TEST_F(RunRegressionTest, RunNotOk) {
   const Status status =
       RunRegression(signature_, input_tensor_, &session_, &output_tensor_);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message()).contains("Data is gone"))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(), "Data is gone"))
       << status.error_message();
 }
 
@@ -460,8 +463,9 @@ TEST_F(RunRegressionTest, MismatchedSizeForBatchInputAndOutput) {
   const Status status =
       RunRegression(signature_, input_tensor_, &session_, &output_tensor_);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("Input batch size did not match output batch size"))
+  EXPECT_TRUE(
+      str_util::StrContains(status.error_message(),
+                            "Input batch size did not match output batch size"))
       << status.error_message();
 }
 
@@ -488,7 +492,7 @@ TEST(GetSignatures, MissingSignature) {
   const auto status = GetSignatures(meta_graph_def, &read_signatures);
   EXPECT_EQ(tensorflow::error::FAILED_PRECONDITION, status.code());
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Expected exactly one"))
+      str_util::StrContains(status.error_message(), "Expected exactly one"))
       << status.error_message();
 }
 
@@ -502,9 +506,9 @@ TEST(GetSignatures, WrongProtoInAny) {
   Signatures read_signatures;
   const auto status = GetSignatures(meta_graph_def, &read_signatures);
   EXPECT_EQ(tensorflow::error::FAILED_PRECONDITION, status.code());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("Expected Any type_url for: "
-                            "tensorflow.serving.Signatures"))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(),
+                                    "Expected Any type_url for: "
+                                    "tensorflow.serving.Signatures"))
       << status.error_message();
 }
 
@@ -519,7 +523,7 @@ TEST(GetSignatures, JunkInAny) {
   Signatures read_signatures;
   const auto status = GetSignatures(meta_graph_def, &read_signatures);
   EXPECT_EQ(tensorflow::error::FAILED_PRECONDITION, status.code());
-  EXPECT_TRUE(StringPiece(status.error_message()).contains("Failed to unpack"))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(), "Failed to unpack"))
       << status.error_message();
 }
 
@@ -567,7 +571,7 @@ TEST(GetSignatures, MultipleSignaturesNotOK) {
   const auto status = GetSignatures(meta_graph_def, &read_signatures);
   EXPECT_EQ(tensorflow::error::FAILED_PRECONDITION, status.code());
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Expected exactly one"))
+      str_util::StrContains(status.error_message(), "Expected exactly one"))
       << status.error_message();
 }
 
@@ -641,8 +645,8 @@ TEST(GetGenericSignature, WrongSignatureType) {
   const Status status =
       GetGenericSignature("generic_bindings", meta_graph_def, &signature);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("Expected a generic signature:"))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(),
+                                    "Expected a generic signature:"))
       << status.error_message();
 }
 
diff --git a/tensorflow/contrib/signal/BUILD b/tensorflow/contrib/signal/BUILD
index a83fc20596c8ad7e1cf94ede8b10d82e25f47b17..fdecceff526a860a274354e53e824b98d11418a6 100644
--- a/tensorflow/contrib/signal/BUILD
+++ b/tensorflow/contrib/signal/BUILD
@@ -130,15 +130,3 @@ cuda_py_tests(
         "//tensorflow/python:platform_test",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/signal/python/kernel_tests/shape_ops_test.py b/tensorflow/contrib/signal/python/kernel_tests/shape_ops_test.py
index 1c052354b8afcc5fd8a53b783cc5c676588cf48c..64cc8c7ea54673ac748be73e677575331d8e1cc9 100644
--- a/tensorflow/contrib/signal/python/kernel_tests/shape_ops_test.py
+++ b/tensorflow/contrib/signal/python/kernel_tests/shape_ops_test.py
@@ -338,7 +338,7 @@ class FrameTest(test.TestCase):
 
   def test_constant_folding(self):
     """frame should be constant foldable for constant inputs."""
-    for pad_end in [False, True]:
+    for pad_end in [True, False]:
       g = ops.Graph()
       with g.as_default():
         frame_length, frame_step = 32, 16
diff --git a/tensorflow/contrib/slim/BUILD b/tensorflow/contrib/slim/BUILD
index c2f106c2b28029f05648716bb08cd2531729fb36..516e3ea073268e9b113a1e13577551ccacbf4206 100644
--- a/tensorflow/contrib/slim/BUILD
+++ b/tensorflow/contrib/slim/BUILD
@@ -178,15 +178,3 @@ py_test(
         "//tensorflow/python:summary",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/slim/python/slim/data/BUILD b/tensorflow/contrib/slim/python/slim/data/BUILD
index 7aa1684839184900547e3304358315246ac2b140..eef043e83276dcdffe491ee9b981c8de0894f592 100644
--- a/tensorflow/contrib/slim/python/slim/data/BUILD
+++ b/tensorflow/contrib/slim/python/slim/data/BUILD
@@ -194,15 +194,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/slim/python/slim/data/parallel_reader.py b/tensorflow/contrib/slim/python/slim/data/parallel_reader.py
index b3343aef47d9f352c3bcbef4afbe8f9bf2560e6d..99ad48763031cc2f98009449cea050fd90d01eb5 100644
--- a/tensorflow/contrib/slim/python/slim/data/parallel_reader.py
+++ b/tensorflow/contrib/slim/python/slim/data/parallel_reader.py
@@ -115,8 +115,8 @@ class ParallelReader(io_ops.ReaderBase):
     reader needs to start reading from a new file since it has finished with
     the previous file).
 
-    A queue runner for enqueing in the `common_queue` is automatically added to
-    the TF QueueRunners collection.
+    A queue runner for enqueuing in the `common_queue` is automatically added
+    to the TF QueueRunners collection.
 
     Args:
       queue: A Queue or a mutable string Tensor representing a handle
diff --git a/tensorflow/contrib/slim/python/slim/data/prefetch_queue.py b/tensorflow/contrib/slim/python/slim/data/prefetch_queue.py
index 37e9c4754ca62fc02f9146632943a50c33f9423d..62bd20036126b41040ca4329c7f13ea7671a8045 100644
--- a/tensorflow/contrib/slim/python/slim/data/prefetch_queue.py
+++ b/tensorflow/contrib/slim/python/slim/data/prefetch_queue.py
@@ -36,9 +36,9 @@ def prefetch_queue(tensors,
                    dynamic_pad=False,
                    shared_name=None,
                    name=None):
-  """Creates a queue to prefetech tensors from `tensors`.
+  """Creates a queue to prefetch tensors from `tensors`.
 
-  A queue runner for enqueing tensors into the prefetch_queue is automatically
+  A queue runner for enqueuing tensors into the prefetch_queue is automatically
   added to the TF QueueRunners collection.
 
   Example:
diff --git a/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py b/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py
index b3b61e1dfe5671a7fbbee20b0c577ee5fad0fb9b..f2d31dc8db5688dc9a3308267109214277436040 100644
--- a/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py
+++ b/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py
@@ -124,7 +124,7 @@ class BoundingBox(ItemHandler):
     super(BoundingBox, self).__init__(self._full_keys)
 
   def tensors_to_item(self, keys_to_tensors):
-    """Maps the given dictionary of tensors to a contatenated list of bboxes.
+    """Maps the given dictionary of tensors to a concatenated list of bboxes.
 
     Args:
       keys_to_tensors: a mapping of TF-Example keys to parsed tensors.
diff --git a/tensorflow/contrib/slim/python/slim/nets/BUILD b/tensorflow/contrib/slim/python/slim/nets/BUILD
index 7f03aaf085cf26e3f5f940f4388828006a02ef42..8bbdf96384683c68648367c6433eeb89c64c22bf 100644
--- a/tensorflow/contrib/slim/python/slim/nets/BUILD
+++ b/tensorflow/contrib/slim/python/slim/nets/BUILD
@@ -317,15 +317,3 @@ py_test(
         "//tensorflow/python:variables",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/solvers/BUILD b/tensorflow/contrib/solvers/BUILD
index 87b67486ad413ad537aa8cb68f9f7bef729dd488..5247288d54aaf4e3020d38618b74f1118a69a105 100644
--- a/tensorflow/contrib/solvers/BUILD
+++ b/tensorflow/contrib/solvers/BUILD
@@ -93,16 +93,3 @@ cuda_py_test(
         "//tensorflow/python:platform_test",
     ],
 )
-
-# All files
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/solvers/python/ops/least_squares.py b/tensorflow/contrib/solvers/python/ops/least_squares.py
index fb7c0eb649c5216736b239d1a423cdaf7079f582..6e164f53420675d149ded6c1f42ca87bd89b158c 100644
--- a/tensorflow/contrib/solvers/python/ops/least_squares.py
+++ b/tensorflow/contrib/solvers/python/ops/least_squares.py
@@ -33,7 +33,7 @@ def cgls(operator, rhs, tol=1e-6, max_iter=20, name="cgls"):
   r"""Conjugate gradient least squares solver.
 
   Solves a linear least squares problem \\(||A x - rhs||_2\\) for a single
-  righ-hand side, using an iterative, matrix-free algorithm where the action of
+  right-hand side, using an iterative, matrix-free algorithm where the action of
   the matrix A is represented by `operator`. The CGLS algorithm implicitly
   applies the symmetric conjugate gradient algorithm to the normal equations
   \\(A^* A x = A^* rhs\\). The iteration terminates when either
diff --git a/tensorflow/contrib/solvers/python/ops/linear_equations.py b/tensorflow/contrib/solvers/python/ops/linear_equations.py
index d791d467639b572e7831c1d1a582aa15585649b6..9305c6a11c4ec898c82553773e8e7277a54ab82e 100644
--- a/tensorflow/contrib/solvers/python/ops/linear_equations.py
+++ b/tensorflow/contrib/solvers/python/ops/linear_equations.py
@@ -41,7 +41,7 @@ def conjugate_gradient(operator,
   r"""Conjugate gradient solver.
 
   Solves a linear system of equations `A*x = rhs` for selfadjoint, positive
-  definite matrix `A` and righ-hand side vector `rhs`, using an iterative,
+  definite matrix `A` and right-hand side vector `rhs`, using an iterative,
   matrix-free algorithm where the action of the matrix A is represented by
   `operator`. The iteration terminates when either the number of iterations
   exceeds `max_iter` or when the residual norm has been reduced to `tol`
diff --git a/tensorflow/contrib/sparsemax/BUILD b/tensorflow/contrib/sparsemax/BUILD
index fcfaa2aba4e8ab086a9eac053188f8fbd4f6f39a..b729fff261192be22c6a56fa9ca0a641f302c570 100644
--- a/tensorflow/contrib/sparsemax/BUILD
+++ b/tensorflow/contrib/sparsemax/BUILD
@@ -65,15 +65,3 @@ cuda_py_tests(
         "//tensorflow/python:platform_test",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/sparsemax/__init__.py b/tensorflow/contrib/sparsemax/__init__.py
index 19d213fb3e8f615190d67862b1928205f31146b4..7bc726f4a84d683517b73814193429220f864735 100644
--- a/tensorflow/contrib/sparsemax/__init__.py
+++ b/tensorflow/contrib/sparsemax/__init__.py
@@ -14,7 +14,7 @@
 # ==============================================================================
 """Module that implements sparsemax and sparsemax loss, see [1].
 
-[1] https://arxiv.org/abs/1602.02068
+[1]: https://arxiv.org/abs/1602.02068
 
 ## Sparsemax
 
diff --git a/tensorflow/contrib/sparsemax/python/ops/sparsemax.py b/tensorflow/contrib/sparsemax/python/ops/sparsemax.py
index 890ca20f4cabd65146e803e54e554a5c97e72427..e617af2ff1b731eddb5b72469a1cd67e7cfd163f 100644
--- a/tensorflow/contrib/sparsemax/python/ops/sparsemax.py
+++ b/tensorflow/contrib/sparsemax/python/ops/sparsemax.py
@@ -31,7 +31,7 @@ def sparsemax(logits, name=None):
   """Computes sparsemax activations [1].
 
   For each batch `i` and class `j` we have
-    sparsemax[i, j] = max(logits[i, j] - tau(logits[i, :]), 0)
+    $$sparsemax[i, j] = max(logits[i, j] - tau(logits[i, :]), 0)$$
 
   [1]: https://arxiv.org/abs/1602.02068
 
diff --git a/tensorflow/contrib/specs/BUILD b/tensorflow/contrib/specs/BUILD
index 084953a0a226cde46ebd9d2031d20cb839180ca8..055b04db8a5654ebf6fee45547d58f0375f9a554 100644
--- a/tensorflow/contrib/specs/BUILD
+++ b/tensorflow/contrib/specs/BUILD
@@ -60,15 +60,3 @@ tf_py_test(
         "//tensorflow/python:variables",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/staging/BUILD b/tensorflow/contrib/staging/BUILD
index bc4a289468c257e7e5e2bd437b8d6d1235980495..0c86f3db1d5bc262f27440754c86f8c63e16b690 100644
--- a/tensorflow/contrib/staging/BUILD
+++ b/tensorflow/contrib/staging/BUILD
@@ -6,18 +6,6 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 py_library(
     name = "staging",
     srcs = ["__init__.py"],
diff --git a/tensorflow/contrib/stat_summarizer/BUILD b/tensorflow/contrib/stat_summarizer/BUILD
index 5fd02efbf6327b20eade6785007930eed3fd4e03..30be14c10cd8576ded75b8489cc89d439a9cc282 100644
--- a/tensorflow/contrib/stat_summarizer/BUILD
+++ b/tensorflow/contrib/stat_summarizer/BUILD
@@ -31,16 +31,5 @@ tf_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:variables",
     ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
+    tags = ["no_windows"],
 )
diff --git a/tensorflow/contrib/stateless/BUILD b/tensorflow/contrib/stateless/BUILD
index 6e259e1d32be64f3b593faf73e8af4f704d72349..dcbef2881df7b5543d664c4b385927f52ae2cbaa 100644
--- a/tensorflow/contrib/stateless/BUILD
+++ b/tensorflow/contrib/stateless/BUILD
@@ -38,15 +38,3 @@ cuda_py_test(
         "//tensorflow/python:random_ops",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/summary/BUILD b/tensorflow/contrib/summary/BUILD
index b58c83fdaf574fb349fac57c922f1178b7d13b66..fda1367b156c86f385f31cc41c5fca747cf8668d 100644
--- a/tensorflow/contrib/summary/BUILD
+++ b/tensorflow/contrib/summary/BUILD
@@ -10,12 +10,6 @@ load(
     "tf_gen_op_wrapper_py",
 )
 
-tf_gen_op_wrapper_py(
-    name = "gen_summary_ops",
-    out = "gen_summary_ops.py",
-    deps = ["//tensorflow/core:summary_ops_op_lib"],
-)
-
 py_test(
     name = "summary_ops_test",
     srcs = ["summary_ops_test.py"],
@@ -61,7 +55,6 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:internal"],
     deps = [
-        ":gen_summary_ops",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
@@ -72,6 +65,7 @@ py_library(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:summary_op_util",
+        "//tensorflow/python:summary_ops_gen",
         "//tensorflow/python:training",
         "//tensorflow/python:util",
         "//tensorflow/python/eager:context",
@@ -89,18 +83,6 @@ py_library(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 # NOTE: target cannot be testonly because it needs to be in the pip
 # package. Sigh.
 py_library(
diff --git a/tensorflow/contrib/summary/summary_ops.py b/tensorflow/contrib/summary/summary_ops.py
index c1724c6e43c0365ceac18bb1d9dfea7351cca060..bc763fe655edc455e2538e536d6efab314c8228c 100644
--- a/tensorflow/contrib/summary/summary_ops.py
+++ b/tensorflow/contrib/summary/summary_ops.py
@@ -26,7 +26,6 @@ import time
 
 import six
 
-from tensorflow.contrib.summary import gen_summary_ops
 from tensorflow.core.framework import graph_pb2
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
@@ -35,6 +34,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.layers import utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_summary_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import summary_op_util
diff --git a/tensorflow/contrib/tensor_forest/BUILD b/tensorflow/contrib/tensor_forest/BUILD
index 07b6b1f142ba4d113c0d45db5a8a2ce3f66f4154..136856c0156c41046f9af61cdd6e3d5f8213309e 100644
--- a/tensorflow/contrib/tensor_forest/BUILD
+++ b/tensorflow/contrib/tensor_forest/BUILD
@@ -16,20 +16,6 @@ package(default_visibility = ["//visibility:public"])
 
 exports_files(["LICENSE"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "kernels/v4/*",
-            "proto/*",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 # ---------------------------------- V2 ops ------------------------------------------#
 filegroup(
     name = "v2_op_sources",
diff --git a/tensorflow/contrib/tensor_forest/README.md b/tensorflow/contrib/tensor_forest/README.md
index 8b24430c71c16c2ed6b2e1a530e19fbc9ebb1698..9e1491ea666b51ba0d367610778c659c543dacf6 100644
--- a/tensorflow/contrib/tensor_forest/README.md
+++ b/tensorflow/contrib/tensor_forest/README.md
@@ -116,7 +116,7 @@ a different `feature_bagging_fraction * num_features` sized subset of the
 input features.  Defaults to 1.0 (no feature bagging).
 
 * `base_random_seed`.  By default (`base_random_seed = 0`), the random number
-generator for each tree is seeded by the current time (in microseconds) when
+generator for each tree is seeded by a 64-bit random value when
 each tree is first created.  Using a non-zero value causes tree training to
 be deterministic, in that the i-th tree's random number generator is seeded
 with the value `base_random_seed + i`.
diff --git a/tensorflow/contrib/tensor_forest/hybrid/BUILD b/tensorflow/contrib/tensor_forest/hybrid/BUILD
index a2a3b485f6aa0ae827bbaa7812823730bd8db3b8..b7185e09c70fbeb33ed559cde1dfeaf348a7e126 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/BUILD
+++ b/tensorflow/contrib/tensor_forest/hybrid/BUILD
@@ -11,18 +11,6 @@ package(default_visibility = ["//visibility:public"])
 
 exports_files(["LICENSE"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 filegroup(
     name = "custom_op_sources",
     srcs = glob(
diff --git a/tensorflow/contrib/tensor_forest/kernels/data_spec.h b/tensorflow/contrib/tensor_forest/kernels/data_spec.h
index 0a3abe56dfc4f611ac8ed0815e4c74a639d2477e..bb33400214e5ef37be73b538455eecf5ae481db4 100644
--- a/tensorflow/contrib/tensor_forest/kernels/data_spec.h
+++ b/tensorflow/contrib/tensor_forest/kernels/data_spec.h
@@ -21,6 +21,7 @@
 
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
 namespace tensorforest {
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/BUILD b/tensorflow/contrib/tensor_forest/kernels/v4/BUILD
index 794b76d8583c3608d540d34a5aaf1d1a799f35e3..b1b1559383a1d26a80d4974e2773f5b27ce1f2be 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/BUILD
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/BUILD
@@ -11,11 +11,6 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(["**/*"]),
-)
-
 DECISION_TREE_RESOURCE_DEPS = [
     ":decision_node_evaluator",
     ":input_data",
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.cc b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.cc
index da600d34eacdf27514709240723e5bb730cfe7f0..63d4d9ba50603f65cc822ea74c97b923c29fea35 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.cc
@@ -19,6 +19,7 @@
 #include "tensorflow/contrib/tensor_forest/kernels/tree_utils.h"
 #include "tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.h"
 #include "tensorflow/core/lib/random/distribution_sampler.h"
+#include "tensorflow/core/lib/random/random.h"
 
 namespace tensorflow {
 namespace tensorforest {
@@ -122,9 +123,8 @@ ClassificationStats::ClassificationStats(const TensorForestParams& params,
     right_gini_.reset(new RunningGiniScores());
   }
 
-  uint64 time_seed = static_cast<uint64>(std::clock());
   single_rand_ = std::unique_ptr<random::PhiloxRandom>(
-      new random::PhiloxRandom(time_seed));
+      new random::PhiloxRandom(random::New64()));
   rng_ = std::unique_ptr<random::SimplePhilox>(
       new random::SimplePhilox(single_rand_.get()));
 }
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/input_data.h b/tensorflow/contrib/tensor_forest/kernels/v4/input_data.h
index c544a8c75e9bfe8fe6bbea8913e7be17d868bfef..95f75b4d7e6a961edf6b3da1dc1712e7ddaacf31 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/input_data.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/input_data.h
@@ -23,6 +23,7 @@
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/lib/random/philox_random.h"
+#include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
 
 namespace tensorflow {
@@ -44,18 +45,20 @@ class TensorDataSet {
     int column_count = 0;
     for (int i = 0; i < input_spec_.dense_size(); ++i) {
       for (int j = 0; j < input_spec_.dense(i).size(); ++j) {
-        decision_trees::FeatureId id;
-        id.mutable_id()->set_value(strings::StrCat(column_count));
-        available_features_.push_back(id);
         ++column_count;
       }
     }
+    available_features_.reserve(column_count);
+    decision_trees::FeatureId id;
+    for (int i = 0; i < column_count; i++) {
+      id.mutable_id()->set_value(strings::StrCat(i));
+      available_features_.emplace_back(id);
+    }
 
     // Set up the random number generator.
     if (split_sampling_random_seed_ == 0) {
-      uint64 time_seed = static_cast<uint64>(std::clock());
       single_rand_ = std::unique_ptr<random::PhiloxRandom>(
-          new random::PhiloxRandom(time_seed));
+          new random::PhiloxRandom(random::New64()));
     } else {
       single_rand_ = std::unique_ptr<random::PhiloxRandom>(
           new random::PhiloxRandom(split_sampling_random_seed_));
diff --git a/tensorflow/contrib/tensor_forest/proto/BUILD b/tensorflow/contrib/tensor_forest/proto/BUILD
index 1cfef44af1aaee3c105664398200524f2770f7d7..04fd6a9839509d2d02b7cf947acc4505c28cbdcd 100644
--- a/tensorflow/contrib/tensor_forest/proto/BUILD
+++ b/tensorflow/contrib/tensor_forest/proto/BUILD
@@ -6,14 +6,6 @@ load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
 
 package(default_visibility = ["//visibility:public"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tf_proto_library(
     name = "fertile_stats_proto",
     srcs = ["fertile_stats.proto"],
diff --git a/tensorflow/contrib/tensorboard/BUILD b/tensorflow/contrib/tensorboard/BUILD
index db2e000ef875c354edcdf0fa50fd3ac6d4907c17..2b6a2b2f3c711f48812063e98e05735e2d9b4141 100644
--- a/tensorflow/contrib/tensorboard/BUILD
+++ b/tensorflow/contrib/tensorboard/BUILD
@@ -82,6 +82,7 @@ py_test(
     size = "small",
     srcs = ["plugins/trace/trace_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":trace",
         "//tensorflow/python:client_testlib",
@@ -89,15 +90,3 @@ py_test(
         "//tensorflow/python:platform",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/tensorboard/db/BUILD b/tensorflow/contrib/tensorboard/db/BUILD
index 4175d8adb58a85728519042a9870e8c4590232ba..3f6b4cdc9ad10f5089f28af35a8be408918c7f90 100644
--- a/tensorflow/contrib/tensorboard/db/BUILD
+++ b/tensorflow/contrib/tensorboard/db/BUILD
@@ -135,9 +135,3 @@ tf_cc_binary(
         "//tensorflow/core/lib/db:sqlite",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["*"]),
-    visibility = ["//tensorflow:__pkg__"],
-)
diff --git a/tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc b/tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc
index c61b4655961664a6c9c22a5f6d6f26a55c34bfcd..cd3f712256f2293ed725745f8cbe48109856ef86 100644
--- a/tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc
+++ b/tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/refcount.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/io/record_reader.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/util/event.pb.h"
@@ -58,7 +59,7 @@ class SummaryFileWriterTest : public ::testing::Test {
     TF_CHECK_OK(env_.GetChildren(testing::TmpDir(), &files));
     bool found = false;
     for (const string& f : files) {
-      if (StringPiece(f).contains(test_name)) {
+      if (str_util::StrContains(f, test_name)) {
         if (found) {
           return errors::Unknown("Found more than one file for ", test_name);
         }
diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index 906cc3f0344e7cb641589bd522e33d658150d3b5..2f316767b35e190c7e438a253a7395b0c5c2ee16 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -272,15 +272,3 @@ tf_cc_test(
         "//tensorflow/core:test_main",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/tensorrt/README.md b/tensorflow/contrib/tensorrt/README.md
index 461e627e99c38fe492dc8a8a1c9fb06e6cfda3a2..6eafc1754ca5102c8adf04f00e33dc2f8ff970f6 100644
--- a/tensorflow/contrib/tensorrt/README.md
+++ b/tensorflow/contrib/tensorrt/README.md
@@ -1,15 +1,15 @@
-Using TensorRT in TensorFlow
-============================
+# Using TensorRT in TensorFlow
+
 
 This module provides necessary bindings and introduces TRT_engine_op
 operator that wraps a subgraph in TensorRT. This is still a work in progress
 but should be useable with most common graphs.
 
-Compilation
------------
+## Compilation
+
 
 In order to compile the module, you need to have a local TensorRT
-installation (libnvinfer.so and respective include files). During the
+installation ( libnvinfer.so and respective include files ). During the
 configuration step, TensorRT should be enabled and installation path
 should be set. If installed through package managers (deb,rpm),
 configure script should find the necessary components from the system
@@ -22,4 +22,38 @@ bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/
 ```
 
 After the installation of tensorflow package, TensorRT transformation
-will be available. An example use can be found in test/test_tftrt.py directory
+will be available. An example use can be found in test/test_tftrt.py script
+
+## Installing TensorRT 3.0.4
+
+In order to make use of TensorRT integration, you will need a local installation of TensorRT 3.0.4 from the [NVIDIA Developer website](https://developer.nvidia.com/tensorrt). Due to compiler compatibility, you will need to download and install the TensorRT 3.0.4 tarball for _Ubuntu 14.04_, i.e., **_TensorRT-3.0.4.Ubuntu-14.04.5.x86_64.cuda-9.0.cudnn7.0-tar.gz_**, even if you are using Ubuntu 16.04 or later.
+
+### Preparing TensorRT installation
+
+Once you have downloaded TensorRT-3.0.4.Ubuntu-14.04.5.x86_64.cuda-9.0.cudnn7.0-tar.gz, you will need to unpack it to an installation directory, which will be referred to as <install_dir>. Please replace <install_dir> with the full path of actual installation directory you choose in commands below.
+
+```shell
+cd <install_dir> && tar -zxf /path/to/TensorRT-3.0.4.Ubuntu-14.04.5.x86_64.cuda-9.0.cudnn7.0-tar.gz
+```
+
+After unpacking the binaries, you have several options to use them:
+
+#### To run TensorFlow as a user without superuser privileges
+
+For a regular user without any sudo rights, you should add TensorRT to your `$LD_LIBRARY_PATH`:
+
+  ```shell
+   export LD_LIBRARY_PATH=<install_dir>/TensorRT-3.0.4/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
+  ```
+
+Then you are ready to use TensorFlow-TensorRT integration. `$LD_LIBRARY_PATH` must contain the path to TensorRT installation for TensorFlow-TensorRT integration to work. If you are using a VirtualEnv-like setup, you can add the command above to your `bin/activate` script or to your `.bashrc` script.
+
+#### To run TensorFlow as a superuser
+
+ When running as a superuser, such as in a container or via sudo, the `$LD_LIBRARY_PATH` approach above may not work. The following is preferred when the user has superuser privileges:
+
+  ```shell
+  echo "<install_dir>/TensorRT-3.0.4/lib" | sudo tee /etc/ld.so.conf.d/tensorrt304.conf && sudo ldconfig
+  ```
+
+  Please ensure that any existing deb package installation of TensorRT is removed before following these instructions to avoid package conflicts.
\ No newline at end of file
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index eea8c8efa28c1b7f9a1979311e920ca5c89f830a..ff8cc6374d40dc0b49721a784e25015c76541d03 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -49,12 +49,13 @@ namespace tensorrt {
 namespace convert {
 namespace {
 
-bool IsTensorRTCandidate(const tensorflow::NodeDef& node_def) {
+bool IsTensorRTCandidate(const tensorflow::Node* node) {
   // LINT.IfChange
   // TODO(jie): Segmentation shouldn't associated with op name.
   //            Split it into a registration for each kernel.
   static const std::set<string> candidate_ops = {
       "Identity",
+      "Snapshot",
       "Const",
       "Conv2D",
       "MaxPool",
@@ -74,7 +75,7 @@ bool IsTensorRTCandidate(const tensorflow::NodeDef& node_def) {
       // TODO(ben,jie): ...
   };
   // LINT.ThenChange(//tensorflow/contrib/tensorrt/convert/convert_nodes.h)
-  return candidate_ops.count(node_def.op());
+  return candidate_ops.count(node->type_string());
 }
 
 void GetSubGraphIncomingEdges(const tensorflow::Graph& graph,
@@ -84,10 +85,10 @@ void GetSubGraphIncomingEdges(const tensorflow::Graph& graph,
     const tensorflow::Node* node = graph.FindNodeId(node_id);
     for (const tensorflow::Edge* edge : node->in_edges()) {
       if (!subgraph_node_ids.count(edge->src()->id()) &&
-          !edge->src()->IsSource()) {
+          !edge->src()->IsSource() && !edge->IsControlEdge()) {
         incoming_edges->insert(edge);
       } else {
-        VLOG(2) << edge->src()->name() << " N, ";
+        VLOG(2) << node->name() << " -> " << edge->src()->name() << " N, ";
       }
     }
   }
@@ -100,11 +101,11 @@ void GetSubGraphOutgoingEdges(const tensorflow::Graph& graph,
     const tensorflow::Node* node = graph.FindNodeId(node_id);
     for (const tensorflow::Edge* edge : node->out_edges()) {
       if (!subgraph_node_ids.count(edge->dst()->id()) &&
-          !edge->dst()->IsSink()) {
-        VLOG(2) << edge->dst()->name() << " Y, ";
+          !edge->dst()->IsSink() && !edge->IsControlEdge()) {
+        VLOG(2) << node->name() << " -> " << edge->dst()->name() << " Y, ";
         outgoing_edges->insert(edge);
       } else {
-        VLOG(2) << edge->dst()->name() << " N, ";
+        VLOG(2) << node->name() << " -> " << edge->dst()->name() << " N, ";
       }
     }
   }
@@ -409,8 +410,9 @@ tensorflow::Status ConvertGraphDefToTensorRT(
       tensorflow::Status status = ConvertSubGraphToTensorRT(&p);
       if (status != tensorflow::Status::OK()) {
         LOG(WARNING) << "subgraph conversion error for subgraph_index:" << count
-                     << " due to: \n"
-                     << status.ToString() << " SKIPPING......";
+                     << " due to: \"" << status.ToString()
+                     << "\" SKIPPING......( " << subgraph_node_names.size()
+                     << " nodes)";
       }
       count++;
     }
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.h b/tensorflow/contrib/tensorrt/convert/convert_graph.h
index e1596e89e229813245b35e568cd349b738c2d879..e01e4a5328061ad527b2dac6e2e4ef1559bd914d 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.h
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.h
@@ -35,7 +35,7 @@ tensorflow::Status ConvertCalibGraphToInferGraph(
 
 // max_batch_size: maximum batch size which can be used for inference for
 //                 optimization targets inference run with max batch size.
-// max_workspace_size_bytes: The upper bound of memory allowence for
+// max_workspace_size_bytes: The upper bound of memory allowance for
 //                 engine building.
 tensorflow::Status ConvertGraphDefToTensorRT(
     const tensorflow::GraphDef& graph_def,
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index 75a3c3d034dff3e9d044d31c1693395deb951582..e920a797fe428620ef62a2b67c07f35d85ef5211 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -53,8 +53,8 @@ limitations under the License.
 namespace tensorflow {
 namespace tensorrt {
 namespace convert {
+using ::tensorflow::strings::StrAppend;
 using ::tensorflow::strings::StrCat;
-
 namespace {
 
 inline tensorflow::Status ConvertDType(tensorflow::DataType tf_dtype,
@@ -346,11 +346,10 @@ void ReorderCKtoKC(const TRT_ShapedWeights& iweights,
       break;
     }
     case tensorflow::DataType::DT_HALF: {
-      Reorder2(
-          {k, c}, static_cast<Eigen::half const*>(iweights.GetValues()),
-          istrides,
-          static_cast<Eigen::half*>(const_cast<void*>(oweights->GetValues())),
-          ostrides);
+      Reorder2({k, c}, static_cast<Eigen::half const*>(iweights.GetValues()),
+               istrides, static_cast<Eigen::half*>(
+                             const_cast<void*>(oweights->GetValues())),
+               ostrides);
       break;
     }
     default:
@@ -430,9 +429,8 @@ class Converter {
   tensorflow::tensorrt::TRTWeightStore* weight_store_;
   bool fp16_;
   void register_op_converters();
-  std::vector<TRT_TensorOrWeights> get_inputs(
-      const tensorflow::NodeDef& node_def) {
-    std::vector<TRT_TensorOrWeights> inputs;
+  tensorflow::Status get_inputs(const tensorflow::NodeDef& node_def,
+                                std::vector<TRT_TensorOrWeights>* inputs) {
     for (auto const& input_name : node_def.input()) {
       /*************************************************************************
        * TODO(jie) handle case 1) here
@@ -453,13 +451,17 @@ class Converter {
 
       VLOG(2) << "retrieve input: " << name;
       if (trt_tensors_.count(name)) {
-        inputs.push_back(trt_tensors_.at(name));
+        inputs->push_back(trt_tensors_.at(name));
       } else {
-        LOG(FATAL) << "input: " << name << " not availabled for node at, "
-                   << node_def.name();
+        string str("Node ");
+        StrAppend(&str, node_def.name(), " should have an input named '", name,
+                  "' but it is not available");
+        LOG(WARNING) << "input: " << name << " not available for node at "
+                     << node_def.name();
+        return tensorflow::errors::InvalidArgument(str);
       }
     }
-    return inputs;
+    return tensorflow::Status::OK();
   }
 
  public:
@@ -483,7 +485,8 @@ class Converter {
   }
 
   tensorflow::Status convert_node(const tensorflow::NodeDef& node_def) {
-    std::vector<TRT_TensorOrWeights> inputs = this->get_inputs(node_def);
+    std::vector<TRT_TensorOrWeights> inputs;
+    TF_RETURN_IF_ERROR(this->get_inputs(node_def, &inputs));
     string op = node_def.op();
     if (!op_registry_.count(op)) {
       return tensorflow::errors::Unimplemented(
@@ -548,6 +551,19 @@ class Converter {
   }
 };
 
+TRT_ShapedWeights ConvertFP32ToFP16(Converter& ctx,
+                                    const TRT_ShapedWeights& weights_src) {
+  auto dtype_new = tensorflow::DataType::DT_HALF;
+  TRT_ShapedWeights weights =
+      ctx.get_temp_weights(dtype_new, weights_src.shape_);
+  const float* src = static_cast<const float*>(weights_src.GetValues());
+  Eigen::half* dst = const_cast<Eigen::half*>(
+      static_cast<Eigen::half const*>(weights.GetValues()));
+  for (int64_t i = 0; i < weights_src.count(); i++) {
+    dst[i] = Eigen::half_impl::float_to_half_rtne(src[i]);
+  }
+  return weights;
+}
 // ****************************************************************************
 // Constant folding functions
 // TODO(jie): once optimizer kicks in, we should have done constant folding
@@ -875,7 +891,7 @@ tensorflow::Status BinaryTensorOpWeight(
 
   // Check type consistency
   nvinfer1::DataType ttype;
-  TF_CHECK_OK(ConvertDType(weights.type_, &ttype));
+  TF_RETURN_IF_ERROR(ConvertDType(weights.type_, &ttype));
 
   // Check scale mode
   auto dims_w = weights.shape_;
@@ -884,7 +900,7 @@ tensorflow::Status BinaryTensorOpWeight(
   // default to element-wise
   auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
 
-  // TODO(jie): maybe use a permuatation instead to support more cases;
+  // TODO(jie): maybe use a permutation instead to support more cases;
   bool permutation_flag = false;
 
   if (weights.count() == 1) {
@@ -957,6 +973,10 @@ tensorflow::Status BinaryTensorOpWeight(
     }
   }
 
+  if (ctx.isFP16()) {
+    weights = ConvertFP32ToFP16(ctx, weights);
+  }
+
   // prepare weights
   TRT_ShapedWeights shift_weights(weights.type_);
   TRT_ShapedWeights scale_weights(weights.type_);
@@ -998,9 +1018,7 @@ enum class ConvolutionType { DEFAULT, DEPTHWISE_CONV };
 tensorflow::Status ConvertConv2DHelper(
     Converter& ctx, const tensorflow::NodeDef& node_def,
     const std::vector<TRT_TensorOrWeights>& inputs,
-    std::vector<TRT_TensorOrWeights>* outputs,
-    int group  // group ==0 specifies depthwise conv
-) {
+    std::vector<TRT_TensorOrWeights>* outputs, int group) {
   const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
 
   TFAttrs attrs(node_def);
@@ -1025,6 +1043,10 @@ tensorflow::Status ConvertConv2DHelper(
   VLOG(2) << "groups count: " << num_groups;
 
   TRT_ShapedWeights weights_rsck = inputs.at(1).weights();
+  if (ctx.isFP16()) {
+    weights_rsck = ConvertFP32ToFP16(ctx, inputs.at(1).weights());
+  }
+
   TRT_ShapedWeights weights = ctx.get_temp_weights_like(weights_rsck);
   ReorderRSCKToKCRS(weights_rsck, &weights, num_groups);
   TRT_ShapedWeights biases(weights.type_);
@@ -1134,9 +1156,9 @@ tensorflow::Status BinaryTensorOpTensor(
   CHECK_EQ_TYPE(tensor_r->getType(), dtype);
   auto op_pair = ops.find(node_def.op());
   if (op_pair == ops.end())
-    return tensorflow::errors::Unimplemented("binary op: " + node_def.op() +
-                                             " not supported at: " +
-                                             node_def.name());
+    return tensorflow::errors::Unimplemented(
+        "binary op: " + node_def.op() +
+        " not supported at: " + node_def.name());
 
   nvinfer1::IElementWiseLayer* layer = ctx.network()->addElementWise(
       *const_cast<nvinfer1::ITensor*>(tensor_l),
@@ -1295,8 +1317,11 @@ tensorflow::Status ConvertScale(Converter& ctx,
   // Implement tensor binaryOp weight [channel wise] for now;
   const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
 
-  // TODO(jie): handle NHWC/NCHW transpose;
   TRT_ShapedWeights weights = inputs.at(1).weights();
+  if (ctx.isFP16()) {
+    weights = ConvertFP32ToFP16(ctx, inputs.at(1).weights());
+  }
+
   TRT_ShapedWeights empty_weights(weights.type_);
 
   TFAttrs attrs(node_def);
@@ -1376,8 +1401,11 @@ tensorflow::Status ConvertConst(Converter& ctx,
           scalar_shape.d[0] = weights_tensor.float_val_size();
           scalar_shape.type[0] = nvinfer1::DimensionType::kSPATIAL;
         } else {
-          LOG(FATAL) << "Broadcast on weights only supports kCHANNEL and"
-                     << " kUNIFORM, at: " << node_def.name();
+          LOG(WARNING) << "Broadcast on weights only supports kCHANNEL and"
+                       << " kUNIFORM, at: " << node_def.name();
+          string err_str("Broadcast method is not supported for '");
+          StrAppend(&err_str, node_def.name(), "' of type ", node_def.op());
+          return tensorflow::errors::InvalidArgument(err_str);
         }
       }
     } else {
@@ -1391,33 +1419,16 @@ tensorflow::Status ConvertConst(Converter& ctx,
         scalar_shape.type[i] = nvinfer1::DimensionType::kSPATIAL;
       }
     }
-    if (ctx.isFP16()) {
-      auto dtype_new = tensorflow::DataType::DT_HALF;
-      size_t len_data = tensorflow::DataTypeSize(dtype_new);
-      for (int i = 0; i < scalar_shape.nbDims; i++)
-        len_data *= scalar_shape.d[i];
-      ctx.weight_store()->store_.push_back(std::vector<uint8_t>(len_data));
-      void* dst = static_cast<void*>(&(ctx.weight_store()->store_.back()[0]));
-      tensorflow::Tensor temp_tensor(tensorflow::DT_HALF, tensor.shape());
-      auto half_tensor = temp_tensor.flat<Eigen::half>();
-      Eigen::DefaultDevice defd;
-      half_tensor.device(defd) =
-          tensor.flat<float>().template cast<Eigen::half>();
-      memcpy(dst, half_tensor.data(), len_data);  // store into weight store
-      weights = TRT_ShapedWeights(dtype_new, dst, scalar_shape);
-    } else {
-      size_t len_data = tensorflow::DataTypeSize(dtype);
-      for (int i = 0; i < scalar_shape.nbDims; i++)
-        len_data *= scalar_shape.d[i];
-      ctx.weight_store()->store_.push_back(std::vector<uint8_t>(len_data));
-      void* dst = static_cast<void*>(&(ctx.weight_store()->store_.back()[0]));
-      std::vector<float> tensor_data(
-          weights_tensor.float_val().begin(),
-          weights_tensor.float_val()
-              .end());  //  make a local copy first to flatten
-      memcpy(dst, tensor_data.data(), len_data);  // store into weight store
-      weights = TRT_ShapedWeights(dtype, dst, scalar_shape);
-    }
+    size_t len_data = tensorflow::DataTypeSize(dtype);
+    for (int i = 0; i < scalar_shape.nbDims; i++) len_data *= scalar_shape.d[i];
+    ctx.weight_store()->store_.push_back(std::vector<uint8_t>(len_data));
+    void* dst = static_cast<void*>(&(ctx.weight_store()->store_.back()[0]));
+    std::vector<float> tensor_data(
+        weights_tensor.float_val().begin(),
+        weights_tensor.float_val()
+            .end());  //  make a local copy first to flatten
+    memcpy(dst, tensor_data.data(), len_data);  // store into weight store
+    weights = TRT_ShapedWeights(dtype, dst, scalar_shape);
   } else if (!weights_tensor.int_val().empty()) {
     VLOG(2) << "int!!!" << node_def.name();
     nvinfer1::Dims scalar_shape;
@@ -1432,8 +1443,11 @@ tensorflow::Status ConvertConst(Converter& ctx,
           scalar_shape.d[0] = weights_tensor.int_val_size();
           scalar_shape.type[0] = nvinfer1::DimensionType::kSPATIAL;
         } else {
-          LOG(FATAL) << "Broadcast on weights only supports kCHANNEL and"
-                     << " kUNIFORM, at: " << node_def.name();
+          LOG(WARNING) << "Broadcast on weights only supports kCHANNEL and"
+                       << " kUNIFORM, at: " << node_def.name();
+          string err_str("Broadcast method is not supported for '");
+          StrAppend(&err_str, node_def.name(), "' of type ", node_def.op());
+          return tensorflow::errors::InvalidArgument(err_str);
         }
       }
     } else {
@@ -1447,62 +1461,23 @@ tensorflow::Status ConvertConst(Converter& ctx,
         scalar_shape.type[i] = nvinfer1::DimensionType::kSPATIAL;
       }
     }
-    if (ctx.isFP16()) {
-      auto dtype_new = tensorflow::DataType::DT_HALF;
-      size_t len_data = tensorflow::DataTypeSize(dtype_new);
-      for (int i = 0; i < scalar_shape.nbDims; i++)
-        len_data *= scalar_shape.d[i];
-      ctx.weight_store()->store_.push_back(std::vector<uint8_t>(len_data));
-      void* dst = static_cast<void*>(&(ctx.weight_store()->store_.back()[0]));
-      tensorflow::Tensor temp_tensor(tensorflow::DT_HALF, tensor.shape());
-      TTypes<Eigen::half>::Flat half_tensor = temp_tensor.flat<Eigen::half>();
-      Eigen::DefaultDevice defd;
-      switch (dtype) {
-        case (tensorflow::DT_INT32): {
-          half_tensor.device(defd) =
-              tensor.flat<int32>().template cast<Eigen::half>();
-          break;
-        }
-        case (tensorflow::DT_INT16): {
-          half_tensor.device(defd) =
-              tensor.flat<int16>().template cast<Eigen::half>();
-          break;
-        }
-        case (tensorflow::DT_INT8): {
-          half_tensor.device(defd) =
-              tensor.flat<int8>().template cast<Eigen::half>();
-          break;
-        }
-        case (tensorflow::DT_UINT8): {
-          half_tensor.device(defd) =
-              tensor.flat<uint8>().template cast<Eigen::half>();
-          break;
-        }
-        default:
-          return tensorflow::errors::InvalidArgument(
-              "Datatype " + tensorflow::DataTypeString(dtype) +
-              " for FP16 conversion");
-          break;
-      };
-      memcpy(dst, half_tensor.data(), len_data);  // store into weight store
-      weights = TRT_ShapedWeights(dtype_new, dst, scalar_shape);
-    } else {
-      size_t len_data = tensorflow::DataTypeSize(dtype);
-      for (int i = 0; i < scalar_shape.nbDims; i++)
-        len_data *= scalar_shape.d[i];
-      size_t len_tensor = weights_tensor.int_val_size() * sizeof(int32);
-      len_data = std::max(len_data, len_tensor);
-      ctx.weight_store()->store_.push_back(std::vector<uint8_t>(len_data));
-      void* dst = static_cast<void*>(&(ctx.weight_store()->store_.back()[0]));
-      std::vector<int32> tensor_data(
-          weights_tensor.int_val().begin(),
-          weights_tensor.int_val()
-              .end());  //  make a local copy first to flatten
-                        //  doesn't have to be contigous
-      memcpy(dst, tensor_data.data(), len_tensor);  // store into weight store
-      weights = TRT_ShapedWeights(dtype, dst, scalar_shape);
-    }
+    //  we should not have converted //if (ctx.isFP16()) {
+    size_t len_data = tensorflow::DataTypeSize(dtype);
+    for (int i = 0; i < scalar_shape.nbDims; i++) len_data *= scalar_shape.d[i];
+    size_t len_tensor = weights_tensor.int_val_size() * sizeof(int32);
+    len_data = std::max(len_data, len_tensor);
+    ctx.weight_store()->store_.push_back(std::vector<uint8_t>(len_data));
+    void* dst = static_cast<void*>(&(ctx.weight_store()->store_.back()[0]));
+    std::vector<int32> tensor_data(
+        weights_tensor.int_val().begin(),
+        weights_tensor.int_val().end());  //  make a local copy first to flatten
+                                          //  doesn't have to be contigous
+    memcpy(dst, tensor_data.data(), len_tensor);  // store into weight store
+    weights = TRT_ShapedWeights(dtype, dst, scalar_shape);
   } else if (!weights_tensor.tensor_content().empty()) {
+    //  obsolete method.
+    //  After optimization path, we do not see weights in this format.
+    //  fp16 conversion technically should be needed here.
     VLOG(2) << "TENSOR!!!" << node_def.name();
     const auto& content = weights_tensor.tensor_content();
 
@@ -1784,8 +1759,6 @@ tensorflow::Status ConvertConcat(Converter& ctx,
   TRT_ShapedWeights axis = inputs.at(input_size).weights();
 
   TFAttrs attrs(node_def);
-  // auto attr_size = attrs.at("N")->i();
-  // auto data_type = attrs.get<nvinfer1::DataType>("T");
   auto index_type = attrs.get<tensorflow::DataType>("Tidx");
 
   // TODO(jie): handle data type
@@ -1875,71 +1848,103 @@ tensorflow::Status ConvertFusedBatchNorm(
         "only is_training=false is supported, at " + node_def.name());
   }
   nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
-  TRT_ShapedWeights scale_weights = inputs.at(1).weights();
-  TRT_ShapedWeights offset_weights = inputs.at(2).weights();
-  TRT_ShapedWeights mean_weights = inputs.at(3).weights();
-  TRT_ShapedWeights variance_weights = inputs.at(4).weights();
-  TRT_ShapedWeights dummy_power_weights(scale_weights.type_);
-  TRT_ShapedWeights combined_scale_weights =
-      ctx.get_temp_weights_like(scale_weights);
-  TRT_ShapedWeights combined_offset_weights =
-      ctx.get_temp_weights_like(offset_weights);
-  size_t nweight = scale_weights.count();
-  if ((scale_weights.type_ == offset_weights.type_) &&
-      (mean_weights.type_ == variance_weights.type_) &&
-      (scale_weights.type_ == variance_weights.type_)) {
-    if ((scale_weights.type_ != tensorflow::DataType::DT_FLOAT) &&
-        (scale_weights.type_ != tensorflow::DataType::DT_HALF)) {
+
+  //  Check parameter types
+  auto parameter_type = inputs.at(1).weights().type_;
+  if ((parameter_type != tensorflow::DataType::DT_FLOAT) &&
+      (parameter_type != tensorflow::DataType::DT_HALF)) {
+    return tensorflow::errors::Unimplemented(
+        "only float32 or float16 weight data type is supported, for node " +
+        node_def.name() + " got " + tensorflow::DataTypeString(parameter_type));
+  }
+  for (int i = 1; i < 5; i++) {
+    if (inputs.at(i).weights().type_ != parameter_type) {
       return tensorflow::errors::Unimplemented(
-          "only float32 or float16 weight data type is supported, for node " +
-          node_def.name() + " got " +
-          tensorflow::DataTypeString(scale_weights.type_));
+          "Inconsistent parameter type for batchnormis not supported, at: " +
+          node_def.name());
     }
-    if (scale_weights.type_ == tensorflow::DT_FLOAT) {
-      for (size_t i = 0; i < nweight; ++i) {
-        float scale = (static_cast<float const*>(scale_weights.GetValues()))[i];
-        float offset =
-            (static_cast<float const*>(offset_weights.GetValues()))[i];
-        float mean = (static_cast<float const*>(mean_weights.GetValues()))[i];
-        float variance =
-            (static_cast<float const*>(variance_weights.GetValues()))[i];
-        float& combined_scale_ref = const_cast<float*>(
-            static_cast<float const*>(combined_scale_weights.GetValues()))[i];
-        float& combined_offset_ref = const_cast<float*>(
-            static_cast<float const*>(combined_offset_weights.GetValues()))[i];
-        combined_scale_ref = scale / sqrtf(variance + epsilon);
-        combined_offset_ref = offset - mean * combined_scale_ref;
-      }
-    } else {
-      const Eigen::half* scale_vals =
-          (static_cast<Eigen::half const*>(scale_weights.GetValues()));
-      const Eigen::half* off_vals =
-          (static_cast<Eigen::half const*>(offset_weights.GetValues()));
-      const Eigen::half* mean_vals =
-          (static_cast<Eigen::half const*>(mean_weights.GetValues()));
-      const Eigen::half* variance_vals =
-          (static_cast<Eigen::half const*>(variance_weights.GetValues()));
-      Eigen::half* comb_scale_vals = const_cast<Eigen::half*>(
-          static_cast<Eigen::half const*>(combined_scale_weights.GetValues()));
-      Eigen::half* comb_off_vals = const_cast<Eigen::half*>(
-          static_cast<Eigen::half const*>(combined_offset_weights.GetValues()));
-      for (size_t i = 0; i < nweight; ++i) {
-        float scale(scale_vals[i]);
-        float offset(off_vals[i]);
-        float mean(mean_vals[i]);
-        float variance(variance_vals[i]);
-        float combined_scale_ref = scale / sqrtf(variance + epsilon);
-        comb_scale_vals[i] = Eigen::half(combined_scale_ref);
-        float combined_offset_ref = offset - mean * combined_scale_ref;
-        comb_off_vals[i] = Eigen::half(combined_offset_ref);
+  }
+
+  TRT_ShapedWeights dummy_power_weights(parameter_type);
+  size_t nweight = 0;
+  for (int i = 1; i < 5; i++) {
+    nweight = std::max(nweight, (size_t)inputs.at(i).weights().count());
+  }
+  TRT_ShapedWeights* ptr_shape_weights = nullptr;
+  for (int i = 1; i < 5; i++) {
+    if (inputs.at(i).weights().count() == nweight) {
+      ptr_shape_weights =
+          const_cast<TRT_ShapedWeights*>(&(inputs.at(i).weights()));
+    } else if (inputs.at(i).weights().count() != 1) {
+      return tensorflow::errors::InvalidArgument(
+          "Inconsistent batchnorm parameter count, at: " + node_def.name());
+    }
+  }
+  //  We could technically have two weights with different shape.
+  //  that requires two addScale op, arguably less performant
+  TRT_ShapedWeights combined_scale_weights =
+      ctx.get_temp_weights_like(*ptr_shape_weights);
+  TRT_ShapedWeights combined_offset_weights =
+      ctx.get_temp_weights_like(*ptr_shape_weights);
+
+  const Eigen::half* cast_vals_array[4];
+  const float* vals_array[4];
+  for (int j = 0; j < 4; j++) {
+    cast_vals_array[j] =
+        static_cast<Eigen::half const*>(inputs.at(j + 1).weights().GetValues());
+    vals_array[j] =
+        static_cast<float const*>(inputs.at(j + 1).weights().GetValues());
+  }
+  Eigen::half* cast_combined_scale_vals = const_cast<Eigen::half*>(
+      static_cast<Eigen::half const*>(combined_scale_weights.GetValues()));
+  Eigen::half* cast_combined_offset_vals = const_cast<Eigen::half*>(
+      static_cast<Eigen::half const*>(combined_offset_weights.GetValues()));
+  float* combined_scale_vals = const_cast<float*>(
+      static_cast<float const*>(combined_scale_weights.GetValues()));
+  float* combined_offset_vals = const_cast<float*>(
+      static_cast<float const*>(combined_offset_weights.GetValues()));
+
+  for (size_t i = 0; i < nweight; ++i) {
+    float batchnorm_data[4];
+    for (int j = 0; j < 4; j++) {
+      if (inputs.at(j + 1).weights().count() != 1) {
+        if (parameter_type == tensorflow::DT_FLOAT) {
+          batchnorm_data[j] = vals_array[j][i];
+        } else if (parameter_type == tensorflow::DT_HALF) {
+          batchnorm_data[j] =
+              Eigen::half_impl::half_to_float(cast_vals_array[j][i]);
+        }
+      } else {
+        if (parameter_type == tensorflow::DT_FLOAT) {
+          batchnorm_data[j] = vals_array[j][0];
+        } else if (parameter_type == tensorflow::DT_HALF) {
+          batchnorm_data[j] =
+              Eigen::half_impl::half_to_float(cast_vals_array[j][0]);
+        }
       }
     }
+    float scale = batchnorm_data[0];
+    float offset = batchnorm_data[1];
+    float mean = batchnorm_data[2];
+    float variance = batchnorm_data[3];
+    float combined_scale_val = scale / sqrtf(variance + epsilon);
+    float combined_offset_val = offset - mean * combined_scale_val;
+    if (parameter_type == tensorflow::DT_FLOAT) {
+      combined_scale_vals[i] = combined_scale_val;
+      combined_offset_vals[i] = combined_offset_val;
+    } else if (parameter_type == tensorflow::DT_HALF) {
+      cast_combined_scale_vals[i] = Eigen::half(combined_scale_val);
+      cast_combined_offset_vals[i] = Eigen::half(combined_offset_val);
+    }
   }
-  nvinfer1::IScaleLayer* layer = ctx.network()->addScale(
-      *const_cast<nvinfer1::ITensor*>(tensor), nvinfer1::ScaleMode::kCHANNEL,
-      combined_offset_weights.GetWeightsForTRT(),
-      combined_scale_weights.GetWeightsForTRT(),
-      dummy_power_weights.GetWeightsForTRT());
+
+  nvinfer1::ScaleMode mode = nweight == 1 ? nvinfer1::ScaleMode::kUNIFORM
+                                          : nvinfer1::ScaleMode::kCHANNEL;
+  nvinfer1::IScaleLayer* layer =
+      ctx.network()->addScale(*const_cast<nvinfer1::ITensor*>(tensor), mode,
+                              combined_offset_weights.GetWeightsForTRT(),
+                              combined_scale_weights.GetWeightsForTRT(),
+                              dummy_power_weights.GetWeightsForTRT());
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
   outputs->push_back(TRT_TensorOrWeights(output_tensor));
   return tensorflow::Status::OK();
@@ -2050,6 +2055,7 @@ void Converter::register_op_converters() {
   op_registry_["Const"] = ConvertConst;
   // TODO(ben,jie): this is a temp hack.
   op_registry_["Identity"] = ConvertIdentity;  // Identity should be removed
+  op_registry_["Snapshot"] = ConvertIdentity;  // Snapshot should be removed
 
   // resnet_50_v1 slim implementation
   op_registry_["Add"] = ConvertBinary;
@@ -2143,8 +2149,11 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode(
   calib_res->thr_->join();
   delete calib_res->thr_;
   if (!calib_res->engine_) {
-    LOG(FATAL) << "Calibration failed!, engine is nullptr. Did you run "
+    LOG(ERROR) << "Calibration failed!, engine does not exist. Did you run "
                   "calibration graph?";
+    return tensorflow::errors::FailedPrecondition(
+        "Calibration graph needs to be executed on"
+        " calibration data before convertsion to inference graph");
   }
   auto weight_rmgr = trt_rm->getManager("WeightStore");
   TF_CHECK_OK(weight_rmgr->Delete<tensorflow::tensorrt::TRTWeightStore>(
@@ -2181,7 +2190,7 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode(
     return status;
   }
   auto trt_engine_node = graph.AddNode(engine_node, &status);
-  TF_CHECK_OK(status);
+  TF_RETURN_IF_ERROR(status);
   for (size_t i = 0; i < out_edges.size(); i++) {
     VLOG(1) << "Connecting trt_engine_node output " << i << " with "
             << out_edges.at(i)->dst()->name() << " port "
@@ -2212,7 +2221,7 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
   std::list<tensorflow::Node*> order;
   for (tensorflow::Node* node : order_vec) {
     if (s.subgraph_node_ids.count(node->id())) {
-      order.push_front(node);  // we want topological order to contstruct the
+      order.push_front(node);  // we want topological order to construct the
       // network layer by layer
     }
   }
@@ -2279,6 +2288,12 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
     input_dtypes.push_back(tf_dtype);
 
     nvinfer1::DataType dtype(nvinfer1::DataType::kFLOAT);
+    auto type_status = ConvertDType(tf_dtype, &dtype);
+    if (type_status != tensorflow::Status::OK()) {
+      LOG(WARNING) << "Data type conversion for input '" << node_name
+                   << "' failed";
+      return type_status;
+    }
     TF_CHECK_OK(ConvertDType(tf_dtype, &dtype));
 
     VLOG(2) << "accessing output index of: " << output_idx
@@ -2346,8 +2361,8 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
     output_names.push_back(tensor_name);
     auto tensor_or_weights = converter.get_tensor(tensor_name);
     if (!tensor_or_weights.is_tensor()) {
-      return tensorflow::errors::InvalidArgument(
-          "Output node is weights not tensor");
+      return tensorflow::errors::InvalidArgument("Output node'" + tensor_name +
+                                                 "' is weights not tensor");
     }
     nvinfer1::ITensor* tensor = tensor_or_weights.tensor();
     if (!tensor) {
@@ -2504,7 +2519,11 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
     input_dtypes.push_back(tf_dtype);
 
     nvinfer1::DataType dtype(nvinfer1::DataType::kFLOAT);
-    TF_CHECK_OK(ConvertDType(tf_dtype, &dtype));
+    auto type_status = ConvertDType(tf_dtype, &dtype);
+    if (type_status != tensorflow::Status::OK()) {
+      LOG(WARNING) << "Type conversion failed for " << node_name;
+      return type_status;
+    }
 
     VLOG(2) << "Accessing output index of: " << output_idx
             << ", at node: " << node_name
@@ -2515,8 +2534,12 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
 
     // TODO(jie): TRT 3.x only support 4 dimensional input tensor.
     //            update the code once TRT 4.0 comes out.
-    if (op_info.shape().dim_size() != 4)
-      return tensorflow::errors::Unimplemented("require 4 dimensional input");
+    if (op_info.shape().dim_size() != 4) {
+      string err_str = "Require 4 dimensional input.";
+      StrAppend(&err_str, " Got ", op_info.shape().dim_size(), " ",
+                shape_inference_node_name);
+      return tensorflow::errors::Unimplemented(err_str);
+    }
 
     for (int i = 1; i < op_info.shape().dim_size(); i++) {
       VLOG(2) << "dimension: " << i
@@ -2577,8 +2600,8 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
     output_names.push_back(tensor_name);
     auto tensor_or_weights = converter.get_tensor(tensor_name);
     if (!tensor_or_weights.is_tensor()) {
-      return tensorflow::errors::InvalidArgument(
-          "Output node is weights not tensor");
+      return tensorflow::errors::InvalidArgument("Output node '" + tensor_name +
+                                                 "' is weights not tensor");
     }
     nvinfer1::ITensor* tensor = tensor_or_weights.tensor();
     if (!tensor) {
@@ -2622,7 +2645,8 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
   }
   TF_RETURN_IF_ERROR(weight_rmgr->Delete<tensorflow::tensorrt::TRTWeightStore>(
       engine_name, engine_name));
-  LOG(INFO) << "finished engine " << engine_name;
+  LOG(INFO) << "finished engine " << engine_name << " containing "
+            << s.subgraph_node_ids.size() << " nodes";
 
   // Build the TRT op
   tensorflow::NodeDefBuilder op_builder(engine_name, "TRTEngineOp");
diff --git a/tensorflow/contrib/tensorrt/python/trt_convert.py b/tensorflow/contrib/tensorrt/python/trt_convert.py
index 666220d78c76960b2a3217486b308c17b912b467..338475d90ea55ab2c1bb8df77f27a71a4a36a5dd 100644
--- a/tensorflow/contrib/tensorrt/python/trt_convert.py
+++ b/tensorflow/contrib/tensorrt/python/trt_convert.py
@@ -41,7 +41,7 @@ def create_inference_graph(input_graph_def,
                            max_workspace_size_bytes=2 << 20,
                            precision_mode="FP32",
                            minimum_segment_size=3):
-  """Python wrapper for the TRT transormation.
+  """Python wrapper for the TRT transformation.
 
   Args:
     input_graph_def: GraphDef object containing a model to be transformed.
diff --git a/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.cc b/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.cc
index 74df75902ed4ba12d34ed89c04666c38411d6d7d..dc7c93f869f5ef7c8eaa2a87eed26cfe69597fdb 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.cc
+++ b/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.cc
@@ -61,7 +61,7 @@ bool TRTInt8Calibrator::setBatch(const std::unordered_map<string, void*>& data,
 
     // TODO(aaroey): we should not use sync copy on default stream. Make sure
     // stream->ThenMemcpy() is used in future PRs.
-    // TODO(sami,aaroey): Need to figureout a way to ensure synchronization
+    // TODO(sami,aaroey): Need to figure out a way to ensure synchronization
     // between stream, perhaps using a tensor?
     auto status = cudaMemcpyAsync(d.first, it.second, d.second,
                                   cudaMemcpyDeviceToDevice, stream);
diff --git a/tensorflow/contrib/tensorrt/segment/segment.cc b/tensorflow/contrib/tensorrt/segment/segment.cc
index 6193f0b0a13f6985d5fc8dd4c6fc09b15f72f139..8fc4697c513057c668d31a341cb13f60dc107e81 100644
--- a/tensorflow/contrib/tensorrt/segment/segment.cc
+++ b/tensorflow/contrib/tensorrt/segment/segment.cc
@@ -80,13 +80,20 @@ void ContractEdge(tensorflow::Edge* edge, tensorflow::Graph* graph,
   std::vector<const tensorflow::Edge*> in_edges(dst->in_edges().begin(),
                                                 dst->in_edges().end());
   for (const tensorflow::Edge* in_edge : in_edges) {
-    if (in_edge->src() != src) {
-      tensorflow::Edge* e = const_cast<tensorflow::Edge*>(in_edge);
-      if (e->src() == graph->source_node()) {
-        graph->AddEdge(e->src(), e->src_output(), src,
-                       tensorflow::Graph::kControlSlot);
-      } else {
-        graph->AddEdge(e->src(), e->src_output(), src, 0 /* input index */);
+    if (in_edge->IsControlEdge()) {
+      if (in_edge->src() != src) {
+        tensorflow::Edge* e = const_cast<tensorflow::Edge*>(in_edge);
+        graph->AddControlEdge(e->src(), src);
+      }
+    } else {
+      if (in_edge->src() != src) {
+        tensorflow::Edge* e = const_cast<tensorflow::Edge*>(in_edge);
+        if (e->src() == graph->source_node()) {
+          graph->AddEdge(e->src(), e->src_output(), src,
+                         tensorflow::Graph::kControlSlot);
+        } else {
+          graph->AddEdge(e->src(), e->src_output(), src, 0 /* input index */);
+        }
       }
     }
   }
@@ -94,12 +101,19 @@ void ContractEdge(tensorflow::Edge* edge, tensorflow::Graph* graph,
   std::vector<const tensorflow::Edge*> out_edges(dst->out_edges().begin(),
                                                  dst->out_edges().end());
   for (const tensorflow::Edge* out_edge : out_edges) {
-    tensorflow::Edge* e = const_cast<tensorflow::Edge*>(out_edge);
-    if (e->dst() == graph->sink_node()) {
-      graph->AddEdge(src, tensorflow::Graph::kControlSlot, e->dst(),
-                     e->dst_input());
+    if (out_edge->IsControlEdge()) {
+      tensorflow::Edge* e = const_cast<tensorflow::Edge*>(out_edge);
+      graph->AddControlEdge(src, e->dst());
     } else {
-      graph->AddEdge(src, 0 /* output index */, e->dst(), e->dst_input());
+      tensorflow::Edge* e = const_cast<tensorflow::Edge*>(out_edge);
+      if (e->dst() == graph->sink_node()) {
+        VLOG(1) << " edge to sink node " << src->name() << " -> "
+                << e->dst()->name();
+        graph->AddEdge(src, tensorflow::Graph::kControlSlot, e->dst(),
+                       e->dst_input());
+      } else {
+        graph->AddEdge(src, 0 /* output index */, e->dst(), e->dst_input());
+      }
     }
   }
 
@@ -118,7 +132,7 @@ void ContractEdge(tensorflow::Edge* edge, tensorflow::Graph* graph,
 
 tensorflow::Status SegmentGraph(
     const tensorflow::GraphDef& gdef,
-    const std::function<bool(const tensorflow::NodeDef&)>& candidate_fn,
+    const std::function<bool(const tensorflow::Node*)>& candidate_fn,
     const SegmentOptions& options, SegmentNodesVector* segments) {
   // Create a Graph representation of the GraphDef.
   tensorflow::FunctionLibraryDefinition flib(tensorflow::OpRegistry::Global(),
@@ -136,7 +150,7 @@ tensorflow::Status SegmentGraph(
   for (int i = 0; i < graph.num_node_ids(); ++i) {
     tensorflow::Node* node = graph.FindNodeId(i);
     if (options.exclude_node_list.count(node->name()) != 0 ||
-        !candidate_fn(node->def())) {
+        !candidate_fn(node)) {
       node = nullptr;
     }
     node_segments.emplace_back(node);
@@ -155,7 +169,7 @@ tensorflow::Status SegmentGraph(
 
   for (const tensorflow::Node* node : order) {
     // All output nodes of 'node' have been visited...
-    VLOG(2) << "Trying node " << node->name();
+    VLOG(2) << "Trying node " << node->name() << " id=" << node->id();
 
     // 'node' must be a TRT candidate...
     if (node_segments[node->id()].Value() == nullptr) {
@@ -169,8 +183,12 @@ tensorflow::Status SegmentGraph(
     while (true) {
       std::set<const tensorflow::Edge*> contract_edges;
       for (const tensorflow::Edge* out_edge : node->out_edges()) {
-        VLOG(2) << "... out node " << out_edge->dst()->name();
-
+        VLOG(2) << "... out node " << out_edge->dst()->name() << " ( "
+                << out_edge->dst()->id() << " <- " << node->id() << " )";
+        if (out_edge->IsControlEdge()) {
+          VLOG(2) << "... ... Control Edge, Skipping";
+          continue;
+        }
         // Out node must be TRT candidate...
         if (node_segments[out_edge->dst()->id()].Value() == nullptr) {
           VLOG(2) << "... ... not a TRT candidate";
@@ -196,7 +214,8 @@ tensorflow::Status SegmentGraph(
         const tensorflow::Node* src = contract_edge->src();
         const tensorflow::Node* dst = contract_edge->dst();
 
-        VLOG(2) << "Merge " << src->name() << " <- " << dst->name();
+        VLOG(2) << "Merge " << src->name() << " <- " << dst->name() << " ("
+                << src->id() << " <- " << dst->id();
         node_segments[src->id()].Merge(&node_segments[dst->id()]);
 
         // Contracting the edge leaves disconnected graph edges.
diff --git a/tensorflow/contrib/tensorrt/segment/segment.h b/tensorflow/contrib/tensorrt/segment/segment.h
index ee6e2b3ed26cd1fabc0e952d882d549046cd9a30..7e8685f44a8c8a20fd7159ee40a8835531e78e9f 100644
--- a/tensorflow/contrib/tensorrt/segment/segment.h
+++ b/tensorflow/contrib/tensorrt/segment/segment.h
@@ -20,10 +20,12 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
+
 namespace tensorrt {
 namespace segment {
 
@@ -46,7 +48,7 @@ struct SegmentOptions {
 // @return the status.
 tensorflow::Status SegmentGraph(
     const tensorflow::GraphDef& gdef,
-    const std::function<bool(const tensorflow::NodeDef&)>& candidate_fn,
+    const std::function<bool(const tensorflow::Node*)>& candidate_fn,
     const SegmentOptions& options, SegmentNodesVector* segments);
 
 }  // namespace segment
diff --git a/tensorflow/contrib/tensorrt/segment/segment_test.cc b/tensorflow/contrib/tensorrt/segment/segment_test.cc
index 74cbc5f2b376b76324eed06d251767da6f928e3e..7ddabec268d4ef7b5c679001e5fb99aa7d83aec0 100644
--- a/tensorflow/contrib/tensorrt/segment/segment_test.cc
+++ b/tensorflow/contrib/tensorrt/segment/segment_test.cc
@@ -35,7 +35,7 @@ class SegmentTest : public ::testing::Test {
   TF_Operation* Add(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
                     TF_Status* s, const char* name);
 
-  std::function<bool(const NodeDef&)> MakeCandidateFn(
+  std::function<bool(const Node*)> MakeCandidateFn(
       const std::set<string>& node_names);
 
  protected:
@@ -60,10 +60,10 @@ bool SegmentTest::GetGraphDef(TF_Graph* graph,
   return ret;
 }
 
-std::function<bool(const NodeDef&)> SegmentTest::MakeCandidateFn(
+std::function<bool(const Node*)> SegmentTest::MakeCandidateFn(
     const std::set<string>& node_names) {
-  return [node_names](const NodeDef& node) -> bool {
-    return node_names.find(node.name()) != node_names.end();
+  return [node_names](const Node* node) -> bool {
+    return node_names.find(node->name()) != node_names.end();
   };
 }
 
diff --git a/tensorflow/contrib/tensorrt/test/test_tftrt.py b/tensorflow/contrib/tensorrt/test/test_tftrt.py
index 0b661bd536c7c7f4566c89176420c3b40007bb74..ad01bedd8fa066e914b05b20dbc47d9aabe790d9 100644
--- a/tensorflow/contrib/tensorrt/test/test_tftrt.py
+++ b/tensorflow/contrib/tensorrt/test/test_tftrt.py
@@ -75,7 +75,7 @@ def run_graph(gdef, dumm_inp):
   return val
 
 
-# Use real data that is representatitive of the inference dataset
+# Use real data that is representative of the inference dataset
 # for calibration. For this test script it is random data.
 def run_calibration(gdef, dumm_inp):
   """Run given calibration graph multiple times."""
diff --git a/tensorflow/contrib/testing/BUILD b/tensorflow/contrib/testing/BUILD
index 0be6aa755bee50451f6717139fd8e1315789b389..8a40e111d7723b0d1c332b9d2381169c8bed510f 100644
--- a/tensorflow/contrib/testing/BUILD
+++ b/tensorflow/contrib/testing/BUILD
@@ -22,15 +22,3 @@ py_library(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/text/BUILD b/tensorflow/contrib/text/BUILD
index 698fdd830f57eb64c3c4119371f545908bf726e5..38d91f7e496d47ac74415da3bae91bad7f431dce 100644
--- a/tensorflow/contrib/text/BUILD
+++ b/tensorflow/contrib/text/BUILD
@@ -111,14 +111,3 @@ py_test(
         "//tensorflow/python:training",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/tfprof/BUILD b/tensorflow/contrib/tfprof/BUILD
index 28adce71d414d267bd53109751689c6f4d5d7b3b..e7f4ebdd36aa9d21ec1dc71ed200001eb0331704 100644
--- a/tensorflow/contrib/tfprof/BUILD
+++ b/tensorflow/contrib/tfprof/BUILD
@@ -20,15 +20,3 @@ py_library(
         "//tensorflow/python/profiler:tfprof_logger",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/timeseries/BUILD b/tensorflow/contrib/timeseries/BUILD
index 6ba069778ccf5bfba94921ac47db9233c63c0cfe..f2b8786a527289fe20de86447355fbf552cd265e 100644
--- a/tensorflow/contrib/timeseries/BUILD
+++ b/tensorflow/contrib/timeseries/BUILD
@@ -31,15 +31,3 @@ py_library(
         "//tensorflow/contrib/timeseries/python/timeseries/state_space_models:test_utils",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/timeseries/examples/BUILD b/tensorflow/contrib/timeseries/examples/BUILD
index 70bf67c7793ba96a8be0ced58421f778a5776d6c..32e948a009741b126e21a64473ac2d020a25a7af 100644
--- a/tensorflow/contrib/timeseries/examples/BUILD
+++ b/tensorflow/contrib/timeseries/examples/BUILD
@@ -109,15 +109,3 @@ py_test(
         "//tensorflow/python/estimator:estimator_py",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/timeseries/examples/known_anomaly.py b/tensorflow/contrib/timeseries/examples/known_anomaly.py
index c08c0b0acb917f527d7efa91874d6405b9220083..e77628ddd390374d6336e3583e07ce03cdec7aea 100644
--- a/tensorflow/contrib/timeseries/examples/known_anomaly.py
+++ b/tensorflow/contrib/timeseries/examples/known_anomaly.py
@@ -53,6 +53,15 @@ def train_and_evaluate_exogenous(csv_file_name=_DATA_FILE, train_steps=300):
   one_hot_feature = tf.feature_column.indicator_column(
       categorical_column=string_feature)
 
+  def _exogenous_update_condition(times, features):
+    del times  # unused
+    # Make exogenous updates sparse by setting an update condition. This in
+    # effect allows missing exogenous features: if the condition evaluates to
+    # False, no update is performed. Otherwise we sometimes end up with "leaky"
+    # updates which add unnecessary uncertainty to the model even when there is
+    # no changepoint.
+    return tf.equal(tf.squeeze(features["is_changepoint"], axis=-1), "yes")
+
   estimator = tf.contrib.timeseries.StructuralEnsembleRegressor(
       periodicities=12,
       # Extract a smooth period by constraining the number of latent values
@@ -60,13 +69,7 @@ def train_and_evaluate_exogenous(csv_file_name=_DATA_FILE, train_steps=300):
       cycle_num_latent_values=3,
       num_features=1,
       exogenous_feature_columns=[one_hot_feature],
-      # Make exogenous updates sparse by setting an update condition. This in
-      # effect allows missing exogenous features: if the condition evaluates to
-      # False, no update is performed. Otherwise we sometimes end up with
-      # "leaky" updates which add unnecessary uncertainty to the model even when
-      # there is no changepoint.
-      exogenous_update_condition=
-      lambda times, features: tf.equal(features["is_changepoint"], "yes"))
+      exogenous_update_condition=_exogenous_update_condition)
   reader = tf.contrib.timeseries.CSVReader(
       csv_file_name,
       # Indicate the format of our CSV file. First we have two standard columns,
diff --git a/tensorflow/contrib/timeseries/examples/lstm.py b/tensorflow/contrib/timeseries/examples/lstm.py
index 2eee878196bb64b523c491ca808ca8d6ff5dd36c..b1c7475442c58b9a190c818b752760a4fb4fe6f0 100644
--- a/tensorflow/contrib/timeseries/examples/lstm.py
+++ b/tensorflow/contrib/timeseries/examples/lstm.py
@@ -236,20 +236,36 @@ def train_and_predict(
       [evaluation["mean"][0], predictions["mean"]], axis=0))
   all_times = numpy.concatenate([times, predictions["times"]], axis=0)
 
-  # Export the model in SavedModel format.
+  # Export the model in SavedModel format. We include a bit of extra boilerplate
+  # for "cold starting" as if we didn't have any state from the Estimator, which
+  # is the case when serving from a SavedModel. If Estimator output is
+  # available, the result of "Estimator.evaluate" can be passed directly to
+  # `tf.contrib.timeseries.saved_model_utils.predict_continuation` as the
+  # `continue_from` argument.
+  with tf.Graph().as_default():
+    filter_feature_tensors, _ = evaluation_input_fn()
+    with tf.train.MonitoredSession() as session:
+      # Fetch the series to "warm up" our state, which will allow us to make
+      # predictions for its future values. This is just a dictionary of times,
+      # values, and exogenous features mapping to numpy arrays. The use of an
+      # input_fn is just a convenience for the example; they can also be
+      # specified manually.
+      filter_features = session.run(filter_feature_tensors)
   if export_directory is None:
     export_directory = tempfile.mkdtemp()
   input_receiver_fn = estimator.build_raw_serving_input_receiver_fn()
   export_location = estimator.export_savedmodel(
       export_directory, input_receiver_fn)
-  # Predict using the SavedModel
+  # Warm up and predict using the SavedModel
   with tf.Graph().as_default():
     with tf.Session() as session:
       signatures = tf.saved_model.loader.load(
           session, [tf.saved_model.tag_constants.SERVING], export_location)
+      state = tf.contrib.timeseries.saved_model_utils.cold_start_filter(
+          signatures=signatures, session=session, features=filter_features)
       saved_model_output = (
           tf.contrib.timeseries.saved_model_utils.predict_continuation(
-              continue_from=evaluation, signatures=signatures,
+              continue_from=state, signatures=signatures,
               session=session, steps=100,
               exogenous_features=predict_exogenous_features))
       # The exported model gives the same results as the Estimator.predict()
diff --git a/tensorflow/contrib/timeseries/python/timeseries/BUILD b/tensorflow/contrib/timeseries/python/timeseries/BUILD
index 64f5cd83575e1eeaa35360c4dc8c5a0d1f384066..d2746032a04946cdfab4b5ac968ea3add5f6b51d 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/BUILD
+++ b/tensorflow/contrib/timeseries/python/timeseries/BUILD
@@ -88,10 +88,14 @@ py_library(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:parsing_ops",
         "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:tensor_util",
         "//tensorflow/python:training",
+        "//tensorflow/python:util",
         "//tensorflow/python/estimator:estimator_py",
         "//tensorflow/python/estimator:export",
+        "//tensorflow/python/feature_column",
     ],
 )
 
@@ -132,7 +136,6 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":feature_keys",
-        "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
@@ -141,6 +144,7 @@ py_library(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:summary",
+        "//tensorflow/python:training",
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/estimator:estimator_py",
@@ -158,19 +162,28 @@ py_test(
     srcs_version = "PY2AND3",
     tags = ["no_pip_gpu"],  # b/63391119
     deps = [
+        ":estimators",
         ":feature_keys",
         ":head",
+        ":input_pipeline",
         ":model",
         ":state_management",
+        "//tensorflow/contrib/timeseries/examples:lstm",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:metrics",
+        "//tensorflow/python:session",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
         "//tensorflow/python/estimator:estimator_py",
+        "//tensorflow/python/feature_column",
+        "//tensorflow/python/saved_model:loader",
+        "//tensorflow/python/saved_model:tag_constants",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
     ],
 )
 
@@ -233,6 +246,7 @@ py_test(
     ],
     srcs_version = "PY2AND3",
     tags = [
+        "no_oss",
         "no_pip",  # b/64527635
         "no_pip_gpu",  # b/63391119
     ],
@@ -441,15 +455,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/timeseries/python/timeseries/ar_model.py b/tensorflow/contrib/timeseries/python/timeseries/ar_model.py
index ff140efd48104e386826eab7abbc94bec220f9df..4f6527a5465ca01ed34150a26ba26d73a858cd74 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/ar_model.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/ar_model.py
@@ -70,7 +70,7 @@ class ARModel(model.TimeSeriesModel):
       input_window_size: Number of past time steps of data to look at when doing
         the regression.
       output_window_size: Number of future time steps to predict. Note that
-        setting it to > 1 empiricaly seems to give a better fit.
+        setting it to > 1 empirically seems to give a better fit.
       num_features: number of input features per time step.
       num_time_buckets: Number of buckets into which to divide (time %
         periodicity) for generating time based features.
diff --git a/tensorflow/contrib/timeseries/python/timeseries/estimators.py b/tensorflow/contrib/timeseries/python/timeseries/estimators.py
index 8d13343e82340dae11b0be54e3bc3152060dca36..886e1846e2a4f75503a47a3ff92adf97f814053f 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/estimators.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/estimators.py
@@ -33,16 +33,18 @@ from tensorflow.python.feature_column import feature_column
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.training import training as train
+from tensorflow.python.util import nest
 
 
 class TimeSeriesRegressor(estimator_lib.Estimator):
   """An Estimator to fit and evaluate a time series model."""
 
   def __init__(self, model, state_manager=None, optimizer=None, model_dir=None,
-               config=None):
+               config=None, head_type=ts_head_lib.TimeSeriesRegressionHead):
     """Initialize the Estimator.
 
     Args:
@@ -53,6 +55,8 @@ class TimeSeriesRegressor(estimator_lib.Estimator):
           from tf.train.Optimizer. Defaults to Adam with step size 0.02.
       model_dir: See `Estimator`.
       config: See `Estimator`.
+      head_type: The kind of head to use for the model (inheriting from
+          `TimeSeriesRegressionHead`).
     """
     input_statistics_generator = math_utils.InputStatisticsFromMiniBatch(
         dtype=model.dtype, num_features=model.num_features)
@@ -61,8 +65,8 @@ class TimeSeriesRegressor(estimator_lib.Estimator):
     if optimizer is None:
       optimizer = train.AdamOptimizer(0.02)
     self._model = model
-    ts_regression_head = ts_head_lib.time_series_regression_head(
-        model, state_manager, optimizer,
+    ts_regression_head = head_type(
+        model=model, state_manager=state_manager, optimizer=optimizer,
         input_statistics_generator=input_statistics_generator)
     model_fn = ts_regression_head.create_estimator_spec
     super(TimeSeriesRegressor, self).__init__(
@@ -98,11 +102,11 @@ class TimeSeriesRegressor(estimator_lib.Estimator):
     def _serving_input_receiver_fn():
       """A receiver function to be passed to export_savedmodel."""
       placeholders = {}
-      placeholders[feature_keys.TrainEvalFeatures.TIMES] = (
-          array_ops.placeholder(
-              name=feature_keys.TrainEvalFeatures.TIMES,
-              dtype=dtypes.int64,
-              shape=[default_batch_size, default_series_length]))
+      time_placeholder = array_ops.placeholder(
+          name=feature_keys.TrainEvalFeatures.TIMES,
+          dtype=dtypes.int64,
+          shape=[default_batch_size, default_series_length])
+      placeholders[feature_keys.TrainEvalFeatures.TIMES] = time_placeholder
       # Values are only necessary when filtering. For prediction the default
       # value will be ignored.
       placeholders[feature_keys.TrainEvalFeatures.VALUES] = (
@@ -145,15 +149,29 @@ class TimeSeriesRegressor(estimator_lib.Estimator):
       # use only static metadata from the returned Tensors.
       with ops.Graph().as_default():
         self._model.initialize_graph()
-        model_start_state = self._model.get_start_state()
-      for prefixed_state_name, state_tensor in ts_head_lib.state_to_dictionary(
-          model_start_state).items():
+        # Evaluate the initial state as same-dtype "zero" values. These zero
+        # constants aren't used, but are necessary for feeding to
+        # placeholder_with_default for the "cold start" case where state is not
+        # fed to the model.
+        def _zeros_like_constant(tensor):
+          return tensor_util.constant_value(array_ops.zeros_like(tensor))
+        start_state = nest.map_structure(
+            _zeros_like_constant, self._model.get_start_state())
+      batch_size_tensor = array_ops.shape(time_placeholder)[0]
+      for prefixed_state_name, state in ts_head_lib.state_to_dictionary(
+          start_state).items():
         state_shape_with_batch = tensor_shape.TensorShape(
-            (default_batch_size,)).concatenate(state_tensor.get_shape())
-        placeholders[prefixed_state_name] = array_ops.placeholder(
+            (default_batch_size,)).concatenate(state.shape)
+        default_state_broadcast = array_ops.tile(
+            state[None, ...],
+            multiples=array_ops.concat(
+                [batch_size_tensor[None],
+                 array_ops.ones(len(state.shape), dtype=dtypes.int32)],
+                axis=0))
+        placeholders[prefixed_state_name] = array_ops.placeholder_with_default(
+            input=default_state_broadcast,
             name=prefixed_state_name,
-            shape=state_shape_with_batch,
-            dtype=state_tensor.dtype)
+            shape=state_shape_with_batch)
       return export_lib.ServingInputReceiver(placeholders, placeholders)
 
     return _serving_input_receiver_fn
diff --git a/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py b/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py
index f4304f2560a82b666f87f302a821a39b0e9e140e..9f161c1695f415ad28c41ad0c00bc0b056399b96 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 import tempfile
 
 import numpy
+import six
 
 from tensorflow.contrib.timeseries.python.timeseries import ar_model
 from tensorflow.contrib.timeseries.python.timeseries import estimators
@@ -126,6 +127,33 @@ class TimeSeriesRegressorTest(test.TestCase):
             signatures=signatures,
             session=sess)
 
+        # Test cold starting
+        six.assertCountEqual(
+            self,
+            [feature_keys.FilteringFeatures.TIMES,
+             feature_keys.FilteringFeatures.VALUES],
+            signatures.signature_def[
+                feature_keys.SavedModelLabels.COLD_START_FILTER].inputs.keys())
+        batch_numpy_times = numpy.tile(
+            numpy.arange(30, dtype=numpy.int64)[None, :], (10, 1))
+        batch_numpy_values = numpy.ones([10, 30, 1])
+        state = saved_model_utils.cold_start_filter(
+            signatures=signatures,
+            session=sess,
+            features={
+                feature_keys.FilteringFeatures.TIMES: batch_numpy_times,
+                feature_keys.FilteringFeatures.VALUES: batch_numpy_values
+            }
+        )
+        predict_times = numpy.tile(
+            numpy.arange(30, 45, dtype=numpy.int64)[None, :], (10, 1))
+        predictions = saved_model_utils.predict_continuation(
+            continue_from=state,
+            times=predict_times,
+            signatures=signatures,
+            session=sess)
+        self.assertAllEqual([10, 15, 1], predictions["mean"].shape)
+
   def test_fit_restore_fit_ar_regressor(self):
     def _estimator_fn(model_dir):
       return estimators.ARRegressor(
diff --git a/tensorflow/contrib/timeseries/python/timeseries/feature_keys.py b/tensorflow/contrib/timeseries/python/timeseries/feature_keys.py
index 970b9aa8acd6f55db843a4e023052b122992baf4..56566ee2e3207abd81ef665da10f851c9dc98ccb 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/feature_keys.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/feature_keys.py
@@ -72,3 +72,4 @@ class SavedModelLabels(object):
   """Names of signatures exported with export_savedmodel."""
   PREDICT = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
   FILTER = "filter"
+  COLD_START_FILTER = "cold_start_filter"
diff --git a/tensorflow/contrib/timeseries/python/timeseries/head.py b/tensorflow/contrib/timeseries/python/timeseries/head.py
index f4d9351432ef32f22334667766528907eac77f19..a28a5872b850b51630240bdeb3ff22f372613523 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/head.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/head.py
@@ -39,27 +39,18 @@ from tensorflow.python.util import nest
 from tensorflow.python.summary import summary
 
 
-def time_series_regression_head(model,
-                                state_manager,
-                                optimizer,
-                                input_statistics_generator=None):
-  """Creates a `_Head` for time series regression.
+class _NoStatePredictOutput(export_lib.PredictOutput):
 
-  Args:
-    model: A model for time series regression.
-    state_manager: A state manager.
-    optimizer: An optimizer.
-    input_statistics_generator: A input statistics generator.
-
-  Returns:
-    An instance of `_Head` for time series regression.
-  """
-  return _TimeSeriesRegressionHead(model, state_manager, optimizer,
-                                   input_statistics_generator)
+  def as_signature_def(self, receiver_tensors):
+    no_state_receiver_tensors = {
+        key: value for key, value in receiver_tensors.items()
+        if not key.startswith(feature_keys.State.STATE_PREFIX)}
+    return super(_NoStatePredictOutput, self).as_signature_def(
+        receiver_tensors=no_state_receiver_tensors)
 
 
-class _TimeSeriesRegressionHead(head_lib._Head):  # pylint:disable=protected-access
-  """See `time_series_regression_head`."""
+class TimeSeriesRegressionHead(head_lib._Head):  # pylint:disable=protected-access
+  """Determines input and output signatures for a time series model."""
 
   def __init__(self,
                model,
@@ -67,6 +58,15 @@ class _TimeSeriesRegressionHead(head_lib._Head):  # pylint:disable=protected-acc
                optimizer,
                input_statistics_generator=None,
                name=None):
+    """Creates a `_Head` for time series regression.
+
+    Args:
+      model: A model for time series regression.
+      state_manager: A state manager.
+      optimizer: An optimizer.
+      input_statistics_generator: A input statistics generator.
+      name: An optional name for the model.
+    """
     self.model = model
     self.state_manager = state_manager
     self.optimizer = optimizer
@@ -150,6 +150,14 @@ class _TimeSeriesRegressionHead(head_lib._Head):  # pylint:disable=protected-acc
     with variable_scope.variable_scope("model", reuse=True):
       filtering_outputs = self.create_loss(
           features, estimator_lib.ModeKeys.EVAL)
+    with variable_scope.variable_scope("model", reuse=True):
+      no_state_features = {
+          k: v for k, v in features.items()
+          if not k.startswith(feature_keys.State.STATE_PREFIX)}
+      # Ignore any state management when cold-starting. The model's default
+      # start state is replicated across the batch.
+      cold_filtering_outputs = self.model.define_loss(
+          features=no_state_features, mode=estimator_lib.ModeKeys.EVAL)
     return estimator_lib.EstimatorSpec(
         mode=estimator_lib.ModeKeys.PREDICT,
         export_outputs={
@@ -157,7 +165,10 @@ class _TimeSeriesRegressionHead(head_lib._Head):  # pylint:disable=protected-acc
                 export_lib.PredictOutput(prediction_outputs),
             feature_keys.SavedModelLabels.FILTER:
                 export_lib.PredictOutput(
-                    state_to_dictionary(filtering_outputs.end_state))
+                    state_to_dictionary(filtering_outputs.end_state)),
+            feature_keys.SavedModelLabels.COLD_START_FILTER:
+                _NoStatePredictOutput(
+                    state_to_dictionary(cold_filtering_outputs.end_state))
         },
         # Likely unused, but it is necessary to return `predictions` to satisfy
         # the Estimator's error checking.
@@ -244,6 +255,58 @@ class _TimeSeriesRegressionHead(head_lib._Head):  # pylint:disable=protected-acc
         return self._serving_ops(features)
 
 
+class OneShotPredictionHead(TimeSeriesRegressionHead):
+  """A time series head which exports a single stateless serving signature.
+
+  The serving default signature exported by this head expects `times`, `values`,
+  and any exogenous features, but no state. `values` has shape `[batch_size,
+  filter_length, num_features]` and `times` has shape `[batch_size,
+  total_length]`, where `total_length > filter_length`. Any exogenous features
+  must have their shapes prefixed by the shape of the `times` feature.
+
+  When serving, first performs filtering on the series up to `filter_length`
+  starting from the default start state for the model, then computes predictions
+  on the remainder of the series, returning them.
+
+  Model state is neither accepted nor returned, so filtering must be performed
+  each time predictions are requested when using this head.
+  """
+
+  def _serving_ops(self, features):
+    """Add ops for serving to the graph."""
+    with variable_scope.variable_scope("model", use_resource=True):
+      filtering_features = {}
+      prediction_features = {}
+      values_length = array_ops.shape(
+          features[feature_keys.FilteringFeatures.VALUES])[1]
+      for key, value in features.items():
+        if key == feature_keys.State.STATE_TUPLE:
+          # Ignore state input. The model's default start state is replicated
+          # across the batch.
+          continue
+        if key == feature_keys.FilteringFeatures.VALUES:
+          filtering_features[key] = value
+        else:
+          filtering_features[key] = value[:, :values_length]
+          prediction_features[key] = value[:, values_length:]
+      cold_filtering_outputs = self.model.define_loss(
+          features=filtering_features, mode=estimator_lib.ModeKeys.EVAL)
+      prediction_features[feature_keys.State.STATE_TUPLE] = (
+          cold_filtering_outputs.end_state)
+    with variable_scope.variable_scope("model", reuse=True):
+      prediction_outputs = self.model.predict(
+          features=prediction_features)
+    return estimator_lib.EstimatorSpec(
+        mode=estimator_lib.ModeKeys.PREDICT,
+        export_outputs={
+            feature_keys.SavedModelLabels.PREDICT:
+                _NoStatePredictOutput(prediction_outputs),
+        },
+        # Likely unused, but it is necessary to return `predictions` to satisfy
+        # the Estimator's error checking.
+        predictions={})
+
+
 def _check_feature_shapes_compatible_with(features,
                                           compatible_with_name,
                                           compatible_with_value,
diff --git a/tensorflow/contrib/timeseries/python/timeseries/head_test.py b/tensorflow/contrib/timeseries/python/timeseries/head_test.py
index 3415061cfd87358cccaf36dcb301fb36986bbde6..c606db76a668235ab6a837159b9dec072b5fd801 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/head_test.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/head_test.py
@@ -18,12 +18,20 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy
+import six
+
+from tensorflow.contrib.timeseries.examples import lstm as lstm_example
+from tensorflow.contrib.timeseries.python.timeseries import estimators as ts_estimators
 from tensorflow.contrib.timeseries.python.timeseries import feature_keys
 from tensorflow.contrib.timeseries.python.timeseries import head as ts_head_lib
+from tensorflow.contrib.timeseries.python.timeseries import input_pipeline
 from tensorflow.contrib.timeseries.python.timeseries import model
 from tensorflow.contrib.timeseries.python.timeseries import state_management
 
+from tensorflow.python.client import session as session_lib
 from tensorflow.python.estimator import estimator_lib
+from tensorflow.python.feature_column import feature_column
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -31,6 +39,9 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import metrics
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
+from tensorflow.python.saved_model import loader
+from tensorflow.python.saved_model import tag_constants
+from tensorflow.python.training import adam
 from tensorflow.python.training import coordinator as coordinator_lib
 from tensorflow.python.training import queue_runner_impl
 from tensorflow.python.training import training as train
@@ -90,7 +101,7 @@ class EvaluationMetricsTests(test.TestCase):
                       .count_up_to(10),
                       dtype=dtypes.float32), (1, 1, 1))
       }
-      model_fn = ts_head_lib.time_series_regression_head(
+      model_fn = ts_head_lib.TimeSeriesRegressionHead(
           model=_TickerModel(),
           state_manager=state_management.PassthroughStateManager(),
           optimizer=train.GradientDescentOptimizer(0.001)).create_estimator_spec
@@ -127,7 +138,7 @@ class _StubModel(object):
 
 
 def _stub_model_fn():
-  return ts_head_lib.time_series_regression_head(
+  return ts_head_lib.TimeSeriesRegressionHead(
       model=_StubModel(),
       state_manager=state_management.PassthroughStateManager(),
       optimizer=train.AdamOptimizer(0.001)).create_estimator_spec
@@ -263,5 +274,76 @@ class PredictFeatureCheckingTests(test.TestCase):
           mode=estimator_lib.ModeKeys.PREDICT)
 
 
+class OneShotTests(test.TestCase):
+
+  def test_one_shot_prediction_head_export(self):
+    model_dir = self.get_temp_dir()
+    categorical_column = feature_column.categorical_column_with_hash_bucket(
+        key="categorical_exogenous_feature", hash_bucket_size=16)
+    exogenous_feature_columns = [
+        feature_column.numeric_column(
+            "2d_exogenous_feature", shape=(2,)),
+        feature_column.embedding_column(
+            categorical_column=categorical_column, dimension=10)]
+    estimator = ts_estimators.TimeSeriesRegressor(
+        model=lstm_example._LSTMModel(
+            num_features=5, num_units=128,
+            exogenous_feature_columns=exogenous_feature_columns),
+        optimizer=adam.AdamOptimizer(0.001),
+        config=estimator_lib.RunConfig(tf_random_seed=4),
+        state_manager=state_management.ChainingStateManager(),
+        head_type=ts_head_lib.OneShotPredictionHead,
+        model_dir=model_dir)
+    train_features = {
+        feature_keys.TrainEvalFeatures.TIMES: numpy.arange(
+            20, dtype=numpy.int64),
+        feature_keys.TrainEvalFeatures.VALUES: numpy.tile(numpy.arange(
+            20, dtype=numpy.float32)[:, None], [1, 5]),
+        "2d_exogenous_feature": numpy.ones([20, 2]),
+        "categorical_exogenous_feature": numpy.array(
+            ["strkey"] * 20)[:, None]
+    }
+    train_input_fn = input_pipeline.RandomWindowInputFn(
+        input_pipeline.NumpyReader(train_features), shuffle_seed=2,
+        num_threads=1, batch_size=16, window_size=16)
+    estimator.train(input_fn=train_input_fn, steps=5)
+    input_receiver_fn = estimator.build_raw_serving_input_receiver_fn()
+    export_location = estimator.export_savedmodel(self.get_temp_dir(),
+                                                  input_receiver_fn)
+    graph = ops.Graph()
+    with graph.as_default():
+      with session_lib.Session() as session:
+        signatures = loader.load(
+            session, [tag_constants.SERVING], export_location)
+        self.assertEqual([feature_keys.SavedModelLabels.PREDICT],
+                         list(signatures.signature_def.keys()))
+        predict_signature = signatures.signature_def[
+            feature_keys.SavedModelLabels.PREDICT]
+        six.assertCountEqual(
+            self,
+            [feature_keys.FilteringFeatures.TIMES,
+             feature_keys.FilteringFeatures.VALUES,
+             "2d_exogenous_feature",
+             "categorical_exogenous_feature"],
+            predict_signature.inputs.keys())
+        features = {
+            feature_keys.TrainEvalFeatures.TIMES: numpy.tile(
+                numpy.arange(35, dtype=numpy.int64)[None, :], [2, 1]),
+            feature_keys.TrainEvalFeatures.VALUES: numpy.tile(numpy.arange(
+                20, dtype=numpy.float32)[None, :, None], [2, 1, 5]),
+            "2d_exogenous_feature": numpy.ones([2, 35, 2]),
+            "categorical_exogenous_feature": numpy.tile(numpy.array(
+                ["strkey"] * 35)[None, :, None], [2, 1, 1])
+        }
+        feeds = {
+            graph.as_graph_element(input_value.name): features[input_key]
+            for input_key, input_value in predict_signature.inputs.items()}
+        fetches = {output_key: graph.as_graph_element(output_value.name)
+                   for output_key, output_value
+                   in predict_signature.outputs.items()}
+        output = session.run(fetches, feed_dict=feeds)
+        self.assertAllEqual((2, 15, 5), output["mean"].shape)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/timeseries/python/timeseries/input_pipeline.py b/tensorflow/contrib/timeseries/python/timeseries/input_pipeline.py
index 04225333b9377447f46d32663df76aece97a51e7..403c6e2cb4aeb665fb112b6322109a6a90f7a261 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/input_pipeline.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/input_pipeline.py
@@ -492,8 +492,7 @@ class CSVReader(ReaderBaseTimeSeriesParser):
       features_lists.setdefault(column_name, []).append(value)
     features = {}
     for column_name, values in features_lists.items():
-      if (len(values) == 1 and
-          column_name != feature_keys.TrainEvalFeatures.VALUES):
+      if column_name == feature_keys.TrainEvalFeatures.TIMES:
         features[column_name] = values[0]
       else:
         features[column_name] = array_ops.stack(values, axis=1)
diff --git a/tensorflow/contrib/timeseries/python/timeseries/math_utils.py b/tensorflow/contrib/timeseries/python/timeseries/math_utils.py
index 23452a81c397da3516016d72b7bc9b80f7d6447f..26793c80bfbb3c9394e81a5bbfae360deb95ca58 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/math_utils.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/math_utils.py
@@ -185,7 +185,7 @@ def batch_matrix_pow(matrices, powers):
                     { matmul(A, power(matmul(A, A), (p - 1) / 2)) for odd p
       power(A, 0) = I
 
-    The power(A, 0) = I case is handeled by starting with accumulator set to the
+    The power(A, 0) = I case is handled by starting with accumulator set to the
     identity matrix; matrices with zero residual powers are passed through
     unchanged.
 
diff --git a/tensorflow/contrib/timeseries/python/timeseries/saved_model_utils.py b/tensorflow/contrib/timeseries/python/timeseries/saved_model_utils.py
index 97f6d36a879532c12684ffdd700ef40b72750567..0461abdc19c08767114e3d26d1134ea4bc5481f8 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/saved_model_utils.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/saved_model_utils.py
@@ -15,6 +15,7 @@
 """Convenience functions for working with time series saved_models.
 
 @@predict_continuation
+@@cold_start_filter
 @@filter_continuation
 """
 
@@ -30,10 +31,12 @@ from tensorflow.contrib.timeseries.python.timeseries import model_utils as _mode
 from tensorflow.python.util.all_util import remove_undocumented
 
 
-def _colate_features_to_feeds_and_fetches(continue_from, signature, features,
-                                          graph):
+def _colate_features_to_feeds_and_fetches(signature, features, graph,
+                                          continue_from=None):
   """Uses a saved model signature to construct feed and fetch dictionaries."""
-  if _feature_keys.FilteringResults.STATE_TUPLE in continue_from:
+  if continue_from is None:
+    state_values = {}
+  elif _feature_keys.FilteringResults.STATE_TUPLE in continue_from:
     # We're continuing from an evaluation, so we need to unpack/flatten state.
     state_values = _head.state_to_dictionary(
         continue_from[_feature_keys.FilteringResults.STATE_TUPLE])
@@ -115,6 +118,55 @@ def predict_continuation(continue_from,
   return output
 
 
+def cold_start_filter(signatures, session, features):
+  """Perform filtering using an exported saved model.
+
+  Filtering refers to updating model state based on new observations.
+  Predictions based on the returned model state will be conditioned on these
+  observations.
+
+  Starts from the model's default/uninformed state.
+
+  Args:
+    signatures: The `MetaGraphDef` protocol buffer returned from
+      `tf.saved_model.loader.load`. Used to determine the names of Tensors to
+      feed and fetch. Must be from the same model as `continue_from`.
+    session: The session to use. The session's graph must be the one into which
+      `tf.saved_model.loader.load` loaded the model.
+    features: A dictionary mapping keys to Numpy arrays, with several possible
+      shapes (requires keys `FilteringFeatures.TIMES` and
+      `FilteringFeatures.VALUES`):
+        Single example; `TIMES` is a scalar and `VALUES` is either a scalar or a
+          vector of length [number of features].
+        Sequence; `TIMES` is a vector of shape [series length], `VALUES` either
+          has shape [series length] (univariate) or [series length x number of
+          features] (multivariate).
+        Batch of sequences; `TIMES` is a vector of shape [batch size x series
+          length], `VALUES` has shape [batch size x series length] or [batch
+          size x series length x number of features].
+      In any case, `VALUES` and any exogenous features must have their shapes
+      prefixed by the shape of the value corresponding to the `TIMES` key.
+  Returns:
+    A dictionary containing model state updated to account for the observations
+    in `features`.
+  """
+  filter_signature = signatures.signature_def[
+      _feature_keys.SavedModelLabels.COLD_START_FILTER]
+  features = _input_pipeline._canonicalize_numpy_data(  # pylint: disable=protected-access
+      data=features,
+      require_single_batch=False)
+  output_tensors_by_name, feed_dict = _colate_features_to_feeds_and_fetches(
+      signature=filter_signature,
+      features=features,
+      graph=session.graph)
+  output = session.run(output_tensors_by_name, feed_dict=feed_dict)
+  # Make it easier to chain filter -> predict by keeping track of the current
+  # time.
+  output[_feature_keys.FilteringResults.TIMES] = features[
+      _feature_keys.FilteringFeatures.TIMES]
+  return output
+
+
 def filter_continuation(continue_from, signatures, session, features):
   """Perform filtering using an exported saved model.
 
@@ -124,8 +176,8 @@ def filter_continuation(continue_from, signatures, session, features):
 
   Args:
     continue_from: A dictionary containing the results of either an Estimator's
-      evaluate method or a previous filter_continuation. Used to determine the
-      model state to start filtering from.
+      evaluate method or a previous filter step (cold start or
+      continuation). Used to determine the model state to start filtering from.
     signatures: The `MetaGraphDef` protocol buffer returned from
       `tf.saved_model.loader.load`. Used to determine the names of Tensors to
       feed and fetch. Must be from the same model as `continue_from`.
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD
index 07df7bc9a5cd45e0b54e78aaa6ea02fedc584cc3..5d33e23a427bd54fd02b0eb7489f84d189e05e35 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD
@@ -269,15 +269,3 @@ py_library(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/varma.py b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/varma.py
index 1afc58cfb240c52a9f001da787addfb7fbb46789..6746dd7b433466c473402e0e8374377093a73492 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/varma.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/varma.py
@@ -107,7 +107,7 @@ class VARMA(state_space_model.StateSpaceModel):
 
     Returns:
       the state transition matrix. It has shape
-        [self.state_dimendion, self.state_dimension].
+        [self.state_dimension, self.state_dimension].
     """
     # Pad any unused AR blocks with zeros. The extra state is necessary if
     # ma_order >= ar_order.
@@ -127,7 +127,7 @@ class VARMA(state_space_model.StateSpaceModel):
 
     Returns:
       the state noise transform matrix. It has shape
-        [self.state_dimendion, self.num_features].
+        [self.state_dimension, self.num_features].
     """
     # Noise is broadcast, through the moving average coefficients, to
     # un-observed parts of the latent state.
diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD
index f9d433a45b12630ef46048dae4be4c922e84053e..4de09dd9881474e1c84f19acd9598237b58f5eed 100644
--- a/tensorflow/contrib/tpu/BUILD
+++ b/tensorflow/contrib/tpu/BUILD
@@ -119,6 +119,8 @@ py_library(
     srcs = ["python/profiler/__init__.py"],
     srcs_version = "PY2AND3",
     deps = [
+        "//tensorflow/contrib/tpu/profiler:tpu_profiler_analysis_pb2_grpc",
+        "//tensorflow/contrib/tpu/profiler:tpu_profiler_analysis_proto_py",
         "//tensorflow/contrib/tpu/profiler:trace_events_proto_py",
         "//tensorflow/python:util",
     ],
@@ -157,6 +159,7 @@ py_library(
     name = "tpu_lib",
     srcs = [
         "python/tpu/__init__.py",
+        "python/tpu/bfloat16.py",
         "python/tpu/device_assignment.py",
         "python/tpu/topology.py",
         "python/tpu/tpu.py",
@@ -212,6 +215,7 @@ tf_py_test(
         ":datasets",
     ],
     grpc_enabled = True,
+    tags = ["no_windows"],
 )
 
 tf_py_test(
@@ -239,6 +243,17 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "bfloat16_test",
+    size = "small",
+    srcs = ["python/tpu/bfloat16_test.py"],
+    additional_deps = [
+        ":tpu",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+    ],
+)
+
 tf_py_test(
     name = "tpu_infeed_test",
     size = "small",
@@ -272,15 +287,13 @@ tf_py_test(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        include = [
-            "**/*",
-        ],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
+tf_py_test(
+    name = "tpu_estimator_signals_test",
+    size = "small",
+    srcs = ["python/tpu/tpu_estimator_signals_test.py"],
+    additional_deps = [
+        ":tpu_estimator",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+    ],
 )
diff --git a/tensorflow/contrib/tpu/__init__.py b/tensorflow/contrib/tpu/__init__.py
index ea6e874f2d952b03e8cdabeee00ccfe1b076a0d0..bb60f3e2d771b351058322b908dfe68df4abed30 100644
--- a/tensorflow/contrib/tpu/__init__.py
+++ b/tensorflow/contrib/tpu/__init__.py
@@ -53,6 +53,7 @@ from __future__ import print_function
 # pylint: disable=wildcard-import,unused-import
 from tensorflow.contrib.tpu.python import profiler
 from tensorflow.contrib.tpu.python.ops.tpu_ops import *
+from tensorflow.contrib.tpu.python.tpu.bfloat16 import *
 from tensorflow.contrib.tpu.python.tpu.device_assignment import *
 from tensorflow.contrib.tpu.python.tpu.topology import *
 from tensorflow.contrib.tpu.python.tpu.tpu import *
diff --git a/tensorflow/contrib/tpu/ops/replication_ops.cc b/tensorflow/contrib/tpu/ops/replication_ops.cc
index cba71c6b98e1079de6c6c4c32fa2ffc44a9ce71e..3bdf7c2f83b037984a45cea99910df87c967aa40 100644
--- a/tensorflow/contrib/tpu/ops/replication_ops.cc
+++ b/tensorflow/contrib/tpu/ops/replication_ops.cc
@@ -27,6 +27,7 @@ REGISTER_OP("TPUReplicateMetadata")
     .Attr("topology: string = \"\"")
     .Attr("device_assignment: list(int) = []")
     .Attr("computation_shape: list(int) = []")
+    .Attr("host_compute_core: list(string) = []")
     .SetShapeFn(shape_inference::UnknownShape);
 
 REGISTER_OP("TPUReplicatedInput")
@@ -68,6 +69,7 @@ REGISTER_OP("TPUReplicate")
     .Attr("num_replicas: int >= 1")
     .Attr("topology: string = \"\"")
     .Attr("device_assignment: list(int) = []")
+    .Attr("host_compute_core: list(string) = []")
     .Attr("computation_shape: list(int) = []")
     .Attr("Tinputs: list(type) >= 0")
     .Attr("Tbroadcast_inputs: list(type) >= 0")
diff --git a/tensorflow/contrib/tpu/profiler/BUILD b/tensorflow/contrib/tpu/profiler/BUILD
index 198da0203a7d17249c4f50110713121b74d5ca4f..1c32993e8e546a17b8b3c289a306ad8f8388c345 100644
--- a/tensorflow/contrib/tpu/profiler/BUILD
+++ b/tensorflow/contrib/tpu/profiler/BUILD
@@ -6,19 +6,7 @@ load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library_cc")
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_additional_all_protos")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
-tf_proto_library_cc(
+tf_proto_library(
     name = "tpu_profiler_proto",
     srcs = ["tpu_profiler.proto"],
     has_services = 1,
@@ -98,16 +86,34 @@ tf_cc_test(
     ],
 )
 
-tf_proto_library_cc(
+tf_proto_library(
     name = "op_profile_proto",
     srcs = ["op_profile.proto"],
     cc_api_version = 2,
     visibility = ["//visibility:public"],
 )
 
-tf_proto_library_cc(
+tf_proto_library(
     name = "tf_op_stats_proto",
     srcs = ["tf_op_stats.proto"],
     cc_api_version = 2,
     visibility = ["//visibility:public"],
 )
+
+tf_proto_library(
+    name = "tpu_profiler_analysis_proto",
+    srcs = ["tpu_profiler_analysis.proto"],
+    has_services = 1,
+    cc_api_version = 2,
+    cc_grpc_version = 1,
+    protodeps = [":tpu_profiler_proto"] + tf_additional_all_protos(),
+    visibility = ["//visibility:public"],
+)
+
+py_library(
+    name = "tpu_profiler_analysis_pb2_grpc",
+    srcs = ["tpu_profiler_analysis_pb2_grpc.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [":tpu_profiler_analysis_proto_py"],
+)
diff --git a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
index b1ef9fde37fe0647965f0818895be37d2d56d207..f2003e04dd061a57353951fb601ebca2f5bccb74 100644
--- a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
@@ -29,6 +29,9 @@ limitations under the License.
 #include "tensorflow/contrib/tpu/profiler/version.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/util/command_line_flags.h"
@@ -62,10 +65,17 @@ Status ValidateHostPortPair(const string& host_port) {
 }
 
 ProfileResponse Profile(const string& service_addr, int duration_ms,
+                        const string& repository_root, const string& session_id,
                         const ProfileOptions& opts) {
   ProfileRequest request;
   request.set_duration_ms(duration_ms);
   request.set_max_events(kMaxEvents);
+  if (tensorflow::str_util::StartsWith(repository_root, "gs://")) {
+    // For backward compatibilities, only generate tracetable etc when the
+    // user provide a GCS path for model directory.
+    request.set_repository_root(repository_root);
+    request.set_session_id(session_id);
+  }
   request.add_tools("input_pipeline");
   request.add_tools("overview_page");
   *request.mutable_opts() = opts;
@@ -137,10 +147,17 @@ int main(int argc, char** argv) {
   opts.set_include_dataset_ops(FLAGS_include_dataset_ops);
   tensorflow::ProfileResponse response;
 
+  // Use the current timestamp as the run name.
+  tensorflow::string session_id =
+      tensorflow::tpu::GetCurrentTimeStampAsString();
+  constexpr char kProfilePluginDirectory[] = "plugins/profile/";
+  tensorflow::string repository_root =
+      ::tensorflow::io::JoinPath(FLAGS_logdir, kProfilePluginDirectory);
   while (true) {
     std::cout << "Starting to profile TPU traces for " << duration_ms << " ms. "
               << "Remaining attempt(s): " << remaining_attempts-- << std::endl;
-    response = tensorflow::tpu::Profile(FLAGS_service_addr, duration_ms, opts);
+    response = tensorflow::tpu::Profile(FLAGS_service_addr, duration_ms,
+                                        repository_root, session_id, opts);
     if (remaining_attempts <= 0 || !response.encoded_trace().empty()) break;
     std::cout << "No trace event is collected. Automatically retrying."
               << std::endl
@@ -158,10 +175,8 @@ int main(int argc, char** argv) {
     return 0;
   }
 
-  // Use the current timestamp as the run name.
-  tensorflow::string run = tensorflow::tpu::GetCurrentTimeStampAsString();
   TF_CHECK_OK(tensorflow::tpu::WriteTensorboardTPUProfile(
-      FLAGS_logdir, run, response, &std::cout));
+      FLAGS_logdir, session_id, response, &std::cout));
   // Print this at the end so that it's not buried in irrelevant LOG messages.
   std::cout
       << "NOTE: using the trace duration " << duration_ms << "ms." << std::endl
diff --git a/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py b/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
index a730d6142d890cc41f72176cf617ac0b0434192c..0b78cf8695091daf797bcb80586397e7ab1c6284 100644
--- a/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
+++ b/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
@@ -76,7 +76,7 @@ def main(unused_argv=None):
   else:
     tpu_cluster_resolver = (
         tf.contrib.cluster_resolver.TPUClusterResolver(
-            tpu_names=[FLAGS.tpu_name],
+            [FLAGS.tpu_name],
             zone=FLAGS.tpu_zone,
             project=FLAGS.gcp_project))
     service_addr = tpu_cluster_resolver.get_master()
diff --git a/tensorflow/contrib/tpu/profiler/tf_op_stats.proto b/tensorflow/contrib/tpu/profiler/tf_op_stats.proto
index e5c798aa2f463a1a2d7cb041ba9b51569958f4fd..2a158756279b5be7f818950b7f7b70571b5bc38e 100644
--- a/tensorflow/contrib/tpu/profiler/tf_op_stats.proto
+++ b/tensorflow/contrib/tpu/profiler/tf_op_stats.proto
@@ -79,6 +79,10 @@ message StepInfoResult {
   optional uint64 infeed_duration_ps = 3;
   // The start time of this step in picoseconds.
   optional uint64 begin_ps = 4;
+  // The waiting time within this step in picoseconds.
+  optional uint64 wait_duration_ps = 5;
+  // The time spent on cross-replica-sum in picoseconds.
+  optional uint64 crs_duration_ps = 6;
 }
 
 // Result proto for a sequence of steps.
@@ -199,10 +203,22 @@ message HostOpsPerTpuStep {
   map<int32, int32> step_diffs = 5;
 }
 
+message HostOpsDetailsPerCore {
+  // Map from core id to HostOpsPerTpuStep.
+  map<int32, HostOpsPerTpuStep> core_map = 1;
+}
+
+message HostOpsDetailsPerHost {
+  // Map from hostname to a map from core id to HostOpsPerTpuStep.
+  map<string, HostOpsDetailsPerCore> host_map = 1;
+}
+
 // Result proto for the host ops for all TPU steps.
 message HostOpsResult {
-  // A sequence of HostOpsPerTpuStep (one for each TPU step)
-  repeated HostOpsPerTpuStep host_op_sequence = 1;
+  reserved 1;  // (was repeated HostOpsPerTpuStep host_op_sequence)
+  // A sequence of records with one for each TPU step. Each record
+  // is a map from hostname to a map from core id to HostOpsPerTpuStep.
+  repeated HostOpsDetailsPerHost hostops_details = 2;
 }
 
 // Result proto for TfStatsHelper.
@@ -223,4 +239,6 @@ message TfOpStats {
   optional RunEnvironmentResult run_environment = 7;
   // The result for the host operations.
   optional HostOpsResult host_ops = 8;
+  // A map from core ID to name.
+  map<uint32, string> core_id_to_name_map = 9;
 }
diff --git a/tensorflow/contrib/tpu/profiler/tpu_profiler.proto b/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
index f3f3302ceb3d27dbb21bdce753aeb2d7fcd77448..cddc3cd1b41d6e00409222170e69c429fe6f91f8 100644
--- a/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
+++ b/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
@@ -36,10 +36,17 @@ message ProfileRequest {
   // Optional profiling options that control how a TF session will be profiled.
   ProfileOptions opts = 4;
 
+  // The place where we will dump profile data. We will normally use
+  // MODEL_DIR/plugin/profile/ as our repository root.
+  string repository_root = 5;
+
+  // The user provided profile session identifier.
+  string session_id = 6;
+
   // In future, the caller will indicate which TF session is being profiled, and
   // only data relating to that program will be returned. For now, we assume
   // all activity during the profiling period is relevant.
-  // next-field: 5
+  // next-field: 7
 }
 
 message ProfileToolData {
diff --git a/tensorflow/contrib/tpu/profiler/tpu_profiler_analysis.proto b/tensorflow/contrib/tpu/profiler/tpu_profiler_analysis.proto
new file mode 100644
index 0000000000000000000000000000000000000000..a4fc8d4e879eb85522f35663c9c628ecd5ef562c
--- /dev/null
+++ b/tensorflow/contrib/tpu/profiler/tpu_profiler_analysis.proto
@@ -0,0 +1,73 @@
+syntax = "proto3";
+package tensorflow;
+
+import "tensorflow/contrib/tpu/profiler/tpu_profiler.proto";
+
+message NewProfileSessionRequest {
+  ProfileRequest request = 1;
+  string repository_root = 2;
+  repeated string hosts = 3;
+}
+
+message NewProfileSessionResponse {
+  // Auxiliary error_message.
+  string error_message = 1;
+  // If success, return session identifier for future reference.
+  string session_id = 2;
+}
+
+message EnumProfileSessionsAndToolsRequest {
+  string repository_root = 1;
+}
+
+message ProfileSessionInfo {
+  string session_id = 1;
+  // Which tool data is available for consumption.
+  repeated string available_tools = 2;
+}
+
+message EnumProfileSessionsAndToolsResponse {
+  // Auxiliary error_message.
+  string error_message = 1;
+  // If success, the returned sessions information are stored here.
+  repeated ProfileSessionInfo sessions = 2;
+}
+
+message ProfileSessionDataRequest {
+  string repository_root = 1;
+  string session_id = 2;
+  // Which tool
+  string tool_name = 3;
+  // Tool's specific parameters. e.g. TraceViewer's viewport etc
+  map<string, string> parameters = 4;
+}
+
+message ProfileSessionDataResponse {
+  // Auxiliary error_message.
+  string error_message = 1;
+
+  // Output format. e.g. "json" or "proto" or "blob"
+  string output_format = 2;
+
+  // TODO(jiesun): figure out whether to put bytes or oneof tool specific proto.
+  bytes output = 3;
+}
+////////////////////////////////////////////////////////////////////////////////
+// TPUProfileAnalysis service provide entry point for profiling TPU and for
+// serving profiled data to Tensorboard through GRPC
+////////////////////////////////////////////////////////////////////////////////
+service TPUProfileAnalysis {
+  // Starts a profiling session, blocks until it completes.
+  // TPUProfileAnalysis service delegate this to TPUProfiler service.
+  // Populate the profiled data in repository, then return status to caller.
+  rpc NewSession(NewProfileSessionRequest) returns (NewProfileSessionResponse) {
+  }
+  // Enumerate existing sessions and return available profile tools.
+  rpc EnumSessions(EnumProfileSessionsAndToolsRequest)
+      returns (EnumProfileSessionsAndToolsResponse) {
+  }
+  // Retrieve specific tool's data for specific session.
+  rpc GetSessionToolData(ProfileSessionDataRequest)
+      returns (ProfileSessionDataResponse) {
+  }
+}
diff --git a/tensorflow/contrib/tpu/profiler/tpu_profiler_analysis_pb2_grpc.py b/tensorflow/contrib/tpu/profiler/tpu_profiler_analysis_pb2_grpc.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f5148828878b1c03bf35d1d11dc11942128b20c
--- /dev/null
+++ b/tensorflow/contrib/tpu/profiler/tpu_profiler_analysis_pb2_grpc.py
@@ -0,0 +1,138 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
+#
+# Do not use pylint on generated code.
+# pylint: disable=missing-docstring,g-short-docstring-punctuation,g-no-space-after-docstring-summary,invalid-name,line-too-long,unused-argument,g-doc-args
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import grpc
+
+from tensorflow.contrib.tpu.profiler import tpu_profiler_analysis_pb2 as third__party_dot_tensorflow_dot_contrib_dot_tpu_dot_profiler_dot_tpu__profiler__analysis__pb2
+
+
+class TPUProfileAnalysisStub(object):
+  """//////////////////////////////////////////////////////////////////////////////
+
+  TPUProfileAnalysis service provide entry point for profiling TPU and for
+  serving profiled data to Tensorboard through GRPC
+  //////////////////////////////////////////////////////////////////////////////
+  """
+
+  def __init__(self, channel):
+    """Constructor.
+
+    Args:
+      channel: A grpc.Channel.
+    """
+    self.NewSession = channel.unary_unary(
+        '/tensorflow.TPUProfileAnalysis/NewSession',
+        request_serializer=
+        third__party_dot_tensorflow_dot_contrib_dot_tpu_dot_profiler_dot_tpu__profiler__analysis__pb2.
+        NewProfileSessionRequest.SerializeToString,
+        response_deserializer=
+        third__party_dot_tensorflow_dot_contrib_dot_tpu_dot_profiler_dot_tpu__profiler__analysis__pb2.
+        NewProfileSessionResponse.FromString,
+    )
+    self.EnumSessions = channel.unary_unary(
+        '/tensorflow.TPUProfileAnalysis/EnumSessions',
+        request_serializer=
+        third__party_dot_tensorflow_dot_contrib_dot_tpu_dot_profiler_dot_tpu__profiler__analysis__pb2.
+        EnumProfileSessionsAndToolsRequest.SerializeToString,
+        response_deserializer=
+        third__party_dot_tensorflow_dot_contrib_dot_tpu_dot_profiler_dot_tpu__profiler__analysis__pb2.
+        EnumProfileSessionsAndToolsResponse.FromString,
+    )
+    self.GetSessionToolData = channel.unary_unary(
+        '/tensorflow.TPUProfileAnalysis/GetSessionToolData',
+        request_serializer=
+        third__party_dot_tensorflow_dot_contrib_dot_tpu_dot_profiler_dot_tpu__profiler__analysis__pb2.
+        ProfileSessionDataRequest.SerializeToString,
+        response_deserializer=
+        third__party_dot_tensorflow_dot_contrib_dot_tpu_dot_profiler_dot_tpu__profiler__analysis__pb2.
+        ProfileSessionDataResponse.FromString,
+    )
+
+
+class TPUProfileAnalysisServicer(object):
+  """//////////////////////////////////////////////////////////////////////////////
+
+  TPUProfileAnalysis service provide entry point for profiling TPU and for
+  serving profiled data to Tensorboard through GRPC
+  //////////////////////////////////////////////////////////////////////////////
+  """
+
+  def NewSession(self, request, context):
+    """Starts a profiling session, blocks until it completes.
+    TPUProfileAnalysis service delegate this to TPUProfiler service.
+    Populate the profiled data in repository, then return status to caller.
+    """
+    context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+    context.set_details('Method not implemented!')
+    raise NotImplementedError('Method not implemented!')
+
+  def EnumSessions(self, request, context):
+    """Enumerate existing sessions and return available profile tools.
+    """
+    context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+    context.set_details('Method not implemented!')
+    raise NotImplementedError('Method not implemented!')
+
+  def GetSessionToolData(self, request, context):
+    """Retrieve specific tool's data for specific session.
+    """
+    context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+    context.set_details('Method not implemented!')
+    raise NotImplementedError('Method not implemented!')
+
+
+def add_TPUProfileAnalysisServicer_to_server(servicer, server):
+  rpc_method_handlers = {
+      'NewSession':
+          grpc.unary_unary_rpc_method_handler(
+              servicer.NewSession,
+              request_deserializer=
+              third__party_dot_tensorflow_dot_contrib_dot_tpu_dot_profiler_dot_tpu__profiler__analysis__pb2.
+              NewProfileSessionRequest.FromString,
+              response_serializer=
+              third__party_dot_tensorflow_dot_contrib_dot_tpu_dot_profiler_dot_tpu__profiler__analysis__pb2.
+              NewProfileSessionResponse.SerializeToString,
+          ),
+      'EnumSessions':
+          grpc.unary_unary_rpc_method_handler(
+              servicer.EnumSessions,
+              request_deserializer=
+              third__party_dot_tensorflow_dot_contrib_dot_tpu_dot_profiler_dot_tpu__profiler__analysis__pb2.
+              EnumProfileSessionsAndToolsRequest.FromString,
+              response_serializer=
+              third__party_dot_tensorflow_dot_contrib_dot_tpu_dot_profiler_dot_tpu__profiler__analysis__pb2.
+              EnumProfileSessionsAndToolsResponse.SerializeToString,
+          ),
+      'GetSessionToolData':
+          grpc.unary_unary_rpc_method_handler(
+              servicer.GetSessionToolData,
+              request_deserializer=
+              third__party_dot_tensorflow_dot_contrib_dot_tpu_dot_profiler_dot_tpu__profiler__analysis__pb2.
+              ProfileSessionDataRequest.FromString,
+              response_serializer=
+              third__party_dot_tensorflow_dot_contrib_dot_tpu_dot_profiler_dot_tpu__profiler__analysis__pb2.
+              ProfileSessionDataResponse.SerializeToString,
+          ),
+  }
+  generic_handler = grpc.method_handlers_generic_handler(
+      'tensorflow.TPUProfileAnalysis', rpc_method_handlers)
+  server.add_generic_rpc_handlers((generic_handler,))
diff --git a/tensorflow/contrib/tpu/proto/BUILD b/tensorflow/contrib/tpu/proto/BUILD
index e1660985676e8c2efe3b01e32b48b211391885b7..fcfbbe1a213b6959b82c20beff02df48517b5e98 100644
--- a/tensorflow/contrib/tpu/proto/BUILD
+++ b/tensorflow/contrib/tpu/proto/BUILD
@@ -4,17 +4,6 @@ exports_files(["LICENSE"])
 
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tf_proto_library(
     name = "tpu_embedding_config_proto",
     srcs = [
diff --git a/tensorflow/contrib/tpu/python/profiler/__init__.py b/tensorflow/contrib/tpu/python/profiler/__init__.py
index bde13f0527a1d8c5f71dd9684b93144ae07d60e4..15ce6aceec299adacd7025f0021cf8b6f6ef765b 100644
--- a/tensorflow/contrib/tpu/python/profiler/__init__.py
+++ b/tensorflow/contrib/tpu/python/profiler/__init__.py
@@ -20,6 +20,7 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=wildcard-import,unused-import
+from tensorflow.contrib.tpu.profiler.tpu_profiler_analysis_pb2 import *
 from tensorflow.contrib.tpu.profiler.trace_events_pb2 import *
 # pylint: enable=wildcard-import,unused-import
 
diff --git a/tensorflow/contrib/tpu/python/tpu/bfloat16.py b/tensorflow/contrib/tpu/python/tpu/bfloat16.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e49af6408e8aaf2d6bd56335a60724853ac14c2
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/bfloat16.py
@@ -0,0 +1,77 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Helper context for running models with bfloat16."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.util import tf_contextlib
+
+
+def _get_custom_getter():
+  """Returns a custom getter that this class's methods must be called under.
+
+  All methods of this class must be called under a variable scope that was
+  passed this custom getter. Example:
+
+  ```python
+  network = ConvNetBuilder(...)
+  with tf.variable_scope('cg', custom_getter=network.get_custom_getter()):
+    network.conv(...)
+    # Call more methods of network here
+  ```
+
+  Currently, this custom getter only does anything if self.use_tf_layers is
+  True. In that case, it causes variables to be stored as dtype
+  self.variable_type, then casted to the requested dtype, instead of directly
+  storing the variable as the requested dtype.
+  """
+
+  def inner_custom_getter(getter, *args, **kwargs):
+    """Custom getter that forces variables to have type self.variable_type."""
+    cast_to_bfloat16 = False
+    requested_dtype = kwargs['dtype']
+    if requested_dtype == dtypes.bfloat16:
+      # Only change the variable dtype if doing so does not decrease variable
+      # precision.
+      kwargs['dtype'] = dtypes.float32
+      cast_to_bfloat16 = True
+    var = getter(*args, **kwargs)
+    # This if statement is needed to guard the cast, because batch norm
+    # assigns directly to the return value of this custom getter. The cast
+    # makes the return value not a variable so it cannot be assigned. Batch
+    # norm variables are always in fp32 so this if statement is never
+    # triggered for them.
+    if cast_to_bfloat16:
+      var = math_ops.cast(var, dtypes.bfloat16)
+    return var
+
+  return inner_custom_getter
+
+
+@tf_contextlib.contextmanager
+def bfloat16_scope():
+  """Scope class for bfloat16 variables so that the model uses custom getter.
+
+  This enables variables to be read as bfloat16 type when using get_variable.
+  """
+  with variable_scope.variable_scope(
+      'bfloat16', custom_getter=_get_custom_getter()) as varscope:
+    yield varscope
diff --git a/tensorflow/contrib/tpu/python/tpu/bfloat16_test.py b/tensorflow/contrib/tpu/python/tpu/bfloat16_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..48a01c7308fbf14d2fb3bd29382d98a6ade1d810
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/bfloat16_test.py
@@ -0,0 +1,50 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Tests for bfloat16 helper."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.tpu.python.tpu import bfloat16
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import variable_scope
+
+from tensorflow.python.platform import test
+
+
+class BFloat16ScopeTest(test.TestCase):
+
+  def testScopeName(self):
+    """Test if name for the variable scope is propogated correctly.
+    """
+    with bfloat16.bfloat16_scope() as bf:
+      self.assertEqual(bf.name, "bfloat16")
+
+  def testRequestedDType(self):
+    """Test if requested dtype is honored in the getter.
+    """
+    with bfloat16.bfloat16_scope() as scope:
+      v1 = variable_scope.get_variable("v1", [])
+      self.assertEqual(v1.dtype.base_dtype, dtypes.float32)
+      v2 = variable_scope.get_variable("v2", [], dtype=dtypes.bfloat16)
+      self.assertEqual(v2.dtype.base_dtype, dtypes.bfloat16)
+      self.assertEqual([dtypes.float32, dtypes.float32],
+                       [v.dtype.base_dtype for v in scope.global_variables()])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py
index d5f54ff4fd278f0c84f79e0079bfb7a409dfba8d..3f2db548ace9e10df7844d8fb461670d27234670 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu.py
@@ -201,7 +201,7 @@ def replicate(computation,
       `DeviceAssignment` may be omitted if each replica of the computation uses
       only one core, and there is either only one replica, or the number of
       replicas is equal to the number of cores in the TPU system.
-    name: The name of the operator.
+    name: (Deprecated) Does nothing.
   Returns:
     A list of lists of output tensors, indexed by `[replica_num][output_num]`.
   Raises:
@@ -209,8 +209,7 @@ def replicate(computation,
     ValueError: If the number of inputs per replica does not match
       the number of formal parameters to `computation`.
   """
-  if name is None:
-    name = "TPUReplicate"
+  del name
   inputs = [[]] if inputs is None else inputs
 
   metadata_kwargs = {}
@@ -274,118 +273,117 @@ def replicate(computation,
 
   graph = ops.get_default_graph()
 
-  with ops.name_scope(name, "replicate"):
-    # Fan-in: Builds a TPUReplicatedInput node for each input.
-    computation_inputs = []
-    for i in range(0, input_arity):
-      replicas = [inputs[replica][i] for replica in xrange(num_replicas)]
-      computation_inputs.append(
-          tpu_ops.tpu_replicated_input(replicas, name="input{}".format(i)))
+  # Fan-in: Builds a TPUReplicatedInput node for each input.
+  computation_inputs = []
+  for i in range(0, input_arity):
+    replicas = [inputs[replica][i] for replica in xrange(num_replicas)]
+    computation_inputs.append(
+        tpu_ops.tpu_replicated_input(replicas, name="input{}".format(i)))
+
+  context = TPUReplicateContext(name=graph.unique_name("cluster"))
+  try:
+    context.Enter()
+
+    metadata = tpu_ops.tpu_replicate_metadata(
+        num_replicas=num_replicas, **metadata_kwargs)
+
+    with tpu_function.tpu_shard_context(
+        num_replicas), ops.control_dependencies([metadata]):
+
+      # The EncapsulateTPUComputations rewrite needs to identify the
+      # replicated arguments inside each computation. Adds identity operators
+      # tagged with an attribute _tpu_replicated_input to identify the
+      # replicated inputs.
+      # pylint: disable=protected-access
+      with graph._attr_scope({"_tpu_replicated_input":
+                              attr_value_pb2.AttrValue(b=True)}):
+        computation_inputs = [
+            array_ops.identity(x, name="replicated_input_{}".format(i))
+            for i, x in enumerate(computation_inputs)]
+      # pylint: enable=protected-access
+
+      # If there is an infeed queue, adds the dequeued values to the
+      # computation's inputs.
+      if infeed_queue is not None:
+        infeed_queue.set_number_of_shards(num_replicas)
+        for t in infeed_queue.generate_dequeue_op():
+          computation_inputs.append(t)
+
+      # Only resource variables work inside a TPU computation, so turn on
+      # resource variables for the computation.
+      # TODO(phawkins): consider removing this code. It will
+      # be less confusing to clients if they knowingly choose to use resource
+      # variables.
+      vscope = variable_scope.get_variable_scope()
+      saved_use_resource = vscope.use_resource
+      vscope.set_use_resource(True)
+
+      outputs = computation(*computation_inputs)
+
+      vscope.set_use_resource(saved_use_resource)
+
+    # If the computation only returned one value, makes it a tuple.
+    if not isinstance(outputs, (list, tuple)):
+      outputs = (outputs,)
 
-    context = TPUReplicateContext(name=graph.unique_name("cluster"))
     try:
-      context.Enter()
-
-      metadata = tpu_ops.tpu_replicate_metadata(
-          num_replicas=num_replicas, **metadata_kwargs)
-
-      with tpu_function.tpu_shard_context(
-          num_replicas), ops.control_dependencies([metadata]):
-
-        # The EncapsulateTPUComputations rewrite needs to identify the
-        # replicated arguments inside each computation. Adds identity operators
-        # tagged with an attribute _tpu_replicated_input to identify the
-        # replicated inputs.
-        # pylint: disable=protected-access
-        with graph._attr_scope({"_tpu_replicated_input":
-                                attr_value_pb2.AttrValue(b=True)}):
-          computation_inputs = [
-              array_ops.identity(x, name="replicated_input_{}".format(i))
-              for i, x in enumerate(computation_inputs)]
-        # pylint: enable=protected-access
-
-        # If there is an infeed queue, adds the dequeued values to the
-        # computation's inputs.
-        if infeed_queue is not None:
-          infeed_queue.set_number_of_shards(num_replicas)
-          for t in infeed_queue.generate_dequeue_op():
-            computation_inputs.append(t)
-
-        # Only resource variables work inside a TPU computation, so turn on
-        # resource variables for the computation.
-        # TODO(phawkins): consider removing this code. It will
-        # be less confusing to clients if they knowingly choose to use resource
-        # variables.
-        vscope = variable_scope.get_variable_scope()
-        saved_use_resource = vscope.use_resource
-        vscope.set_use_resource(True)
-
-        outputs = computation(*computation_inputs)
-
-        vscope.set_use_resource(saved_use_resource)
-
-      # If the computation only returned one value, makes it a tuple.
-      if not isinstance(outputs, (list, tuple)):
-        outputs = (outputs,)
-
-      try:
-        with ops.device(core(0)):
-          outputs = [
-              o if isinstance(o, ops.Operation) else ops.convert_to_tensor(o)
-              for o in outputs
-          ]
-      except Exception as e:
-        raise ValueError(
-            "TPU function return values must all either be Operations or "
-            "convertible to Tensors. Got '%s'" % str(e))
-
-      # Separates the returned Operations and Tensors.
-      output_operations = [o for o in outputs if isinstance(o, ops.Operation)]
-      output_tensors = [o for o in outputs
-                        if not isinstance(o, ops.Operation)]
-
-      if outputs != output_tensors + output_operations:
-        raise ValueError(
-            "TPU functions must return zero-or more Tensor values followed by "
-            "zero or more Operations.")
-      output_arity = len(output_tensors)
-
-      # Wraps outputs in Identity ops. Otherwise a replicated input copied
-      # straight to an output would bypass the replicate(). This would be bad
-      # because the TPUReplicatedInput/TPUReplicatedOutput operator would not
-      # be rewritten away, leading to a runtime error.
-      # TODO(phawkins): extend the rewrite to elide these nodes instead.
-      new_output_tensors = []
-      for t in output_tensors:
-        with ops.device(t.device if t.device else core(0)):
-          new_output_tensors.append(array_ops.identity(t))
-      output_tensors = new_output_tensors
-    finally:
-      context.report_unsupported_operations()
-      context.Exit()
-
-    # Fan-out: Builds a TPUReplicatedOutput node for each output.
-    outputs = [tpu_ops.tpu_replicated_output(output_tensors[i], num_replicas,
-                                             name="output{}".format(i))
-               for i in xrange(output_arity)]
-
-    with ops.control_dependencies(output_operations):
-      if output_arity == 0:
-        # Returns a list of NoOps dependent on the replication Op, indexed by
-        # [replica_num].
-        return [
-            control_flow_ops.no_op(name="%s_shard_%d" % (name, i))
-            for i in range(num_replicas)
-        ]
-      else:
-        # Wraps the outputs in identity operators so the names of any possible
-        # `fetch` nodes are preserved by the replication rewrite.
-        return [
-            [array_ops.identity(outputs[out][replica],
-                                name="output_%d_shard_%d" % (out, replica))
-             for out in xrange(output_arity)]
-            for replica in xrange(num_replicas)
+      with ops.device(core(0)):
+        outputs = [
+            o if isinstance(o, ops.Operation) else ops.convert_to_tensor(o)
+            for o in outputs
         ]
+    except Exception as e:
+      raise ValueError(
+          "TPU function return values must all either be Operations or "
+          "convertible to Tensors. Got '%s'" % str(e))
+
+    # Separates the returned Operations and Tensors.
+    output_operations = [o for o in outputs if isinstance(o, ops.Operation)]
+    output_tensors = [o for o in outputs
+                      if not isinstance(o, ops.Operation)]
+
+    if outputs != output_tensors + output_operations:
+      raise ValueError(
+          "TPU functions must return zero-or more Tensor values followed by "
+          "zero or more Operations.")
+    output_arity = len(output_tensors)
+
+    # Wraps outputs in Identity ops. Otherwise a replicated input copied
+    # straight to an output would bypass the replicate(). This would be bad
+    # because the TPUReplicatedInput/TPUReplicatedOutput operator would not
+    # be rewritten away, leading to a runtime error.
+    # TODO(phawkins): extend the rewrite to elide these nodes instead.
+    new_output_tensors = []
+    for t in output_tensors:
+      with ops.device(t.device if t.device else core(0)):
+        new_output_tensors.append(array_ops.identity(t))
+    output_tensors = new_output_tensors
+  finally:
+    context.report_unsupported_operations()
+    context.Exit()
+
+  # Fan-out: Builds a TPUReplicatedOutput node for each output.
+  outputs = [tpu_ops.tpu_replicated_output(output_tensors[i], num_replicas,
+                                           name="output{}".format(i))
+             for i in xrange(output_arity)]
+
+  with ops.control_dependencies(output_operations):
+    if output_arity == 0:
+      # Returns a list of NoOps dependent on the replication Op, indexed by
+      # [replica_num].
+      return [
+          control_flow_ops.no_op(name="shard_%d" % i)
+          for i in range(num_replicas)
+      ]
+    else:
+      # Wraps the outputs in identity operators so the names of any possible
+      # `fetch` nodes are preserved by the replication rewrite.
+      return [
+          [array_ops.identity(outputs[out][replica],
+                              name="output_%d_shard_%d" % (out, replica))
+           for out in xrange(output_arity)]
+          for replica in xrange(num_replicas)
+      ]
 
 
 def shard(computation,
@@ -450,7 +448,7 @@ def shard(computation,
       `DeviceAssignment` may be omitted if each shard of the computation uses
       only one core, and there is either only one shard, or the number of shards
       is equal to the number of cores in the TPU system.
-    name: The name of the operator.
+    name: (Deprecated) Does nothing.
   Returns:
     A list of output tensors.
   Raises:
@@ -579,7 +577,7 @@ def batch_parallel(computation,
       `DeviceAssignment` may be omitted if each shard of the computation uses
       only one core, and there is either only one shard, or the number of shards
       is equal to the number of cores in the TPU system.
-    name: The name of the operator.
+    name: (Deprecated) Does nothing.
   Returns:
     A list of output tensors.
   Raises:
@@ -613,7 +611,7 @@ def rewrite(computation,
       mapping between logical cores in the computation with physical cores in
       the TPU topology. May be omitted for a single-core computation, in which
       case the core attached to task 0, TPU device 0 is used.
-    name: The name of the operator.
+    name: (Deprecated) Does nothing.
   Returns:
     A list of output tensors.
   """
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_config.py b/tensorflow/contrib/tpu/python/tpu/tpu_config.py
index 38b5ea23103730630ae8e1cdd7b9180a501013c5..cc1a7fd801506e3f0b758c4848205f1c375403d2 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_config.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_config.py
@@ -35,10 +35,16 @@ _TF_CONFIG_ENV = run_config_lib._TF_CONFIG_ENV
 _SERVICE_KEY = run_config_lib._SERVICE_KEY
 _TPU_WORKER_JOB_NAME = 'tpu_worker_job_name'
 _NUM_CORES_PER_HOST = 8
-
 # pylint: enable=protected-access
 
 
+class InputPipelineConfig(object):
+  r"""Please see the definition of these values in TPUConfig."""
+  PER_SHARD_V1 = 1
+  PER_HOST_V1 = 2
+  PER_HOST_V2 = 3
+
+
 # TODO(b/72511246) Provide a simplified api to configure model parallelism.
 class TPUConfig(
     collections.namedtuple('TPUConfig', [
@@ -68,13 +74,16 @@ class TPUConfig(
       partitioned across 4 cores which span two cores in both x and y
       coordinates.  Please refer to @{tf.contrib.tpu.Topology} for the
       geometry of a TPU mesh.
-    per_host_input_for_training: If `True`, `input_fn` is invoked Per-Host
-      rather than Per-Core. With Per-Host input pipeline deployment, `input_fn`
-      is invoked once on each host. With Per-Core input pipeline deployment, it
-      is invoked once for each core. To be precise, with a global batch size
-      `train_batch_size` in `TPUEstimator` constructor, the batch size for each
-      shard is `train_batch_size` // #hosts. With Per-Core input pipeline
-      deployment, the shard batch size is `train_batch_size` // #cores.
+    per_host_input_for_training: If `True`, `PER_HOST_V1`, or `PER_HOST_V2`,
+      `input_fn` is invoked per-host rather than per-core. With per-host input
+      pipeline configuration, `input_fn` is invoked once on each host. With the
+      per-core input pipeline configuration, it is invoked once for each core.
+      With a global batch size `train_batch_size` in `TPUEstimator` constructor,
+      the batch size for each shard is `train_batch_size` // #hosts in the
+      `True` or `PER_HOST_V1` mode. In `PER_HOST_V2` mode, it is
+      `train_batch_size` // #cores. With the per-core input pipeline
+      configuration, the shard batch size is also `train_batch_size` // #cores.
+      Note: per_host_input_for_training==PER_SHARD_V1 only supports mode.TRAIN.
     tpu_job_name: The name of the TPU job. Typically, this name is auto-inferred
       within TPUEstimator, however when using ClusterSpec propagation in more
       esoteric cluster configurations, you may need to specify the job name as a
@@ -117,6 +126,13 @@ class TPUConfig(
         raise ValueError('computation_shape elements can only be 1 or 2; got '
                          'computation_shape={}'.format(computation_shape))
 
+    # per_host_input_for_training may be True, False, or integer in [1..3].
+    # Map legacy values (True, False) to numeric values.
+    if per_host_input_for_training is False:
+      per_host_input_for_training = InputPipelineConfig.PER_SHARD_V1
+    elif per_host_input_for_training is True:
+      per_host_input_for_training = InputPipelineConfig.PER_HOST_V1
+
     # Check initial_infeed_sleep_secs.
     if initial_infeed_sleep_secs:
       util_lib.check_positive_integer(initial_infeed_sleep_secs,
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_context.py b/tensorflow/contrib/tpu/python/tpu/tpu_context.py
index 3bac2db77e95520a6c9c4c17658267a9a6588d94..fbc1173e49fd6e8912f6bfae8a88198eda4f6d5b 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_context.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_context.py
@@ -24,6 +24,7 @@ import copy
 import numpy as np
 
 from tensorflow.contrib.tpu.python.tpu import device_assignment  as tpu_device_assignment
+from tensorflow.contrib.tpu.python.tpu import tpu_config
 from tensorflow.contrib.tpu.python.tpu import tpu_system_metadata as tpu_system_metadata_lib
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.platform import tf_logging as logging
@@ -205,7 +206,13 @@ class _TPUContext(object):
     """Return true if input_fn is invoked per-core (other than per-host)."""
     mode = self._assert_mode()
     return (mode == model_fn_lib.ModeKeys.TRAIN and
-            not self._config.tpu_config.per_host_input_for_training)
+            (self._config.tpu_config.per_host_input_for_training is
+             tpu_config.InputPipelineConfig.PER_SHARD_V1))
+
+  def is_input_per_host_with_iterators(self):
+    """Return true if input_fn should be run in the per-host v2 config."""
+    return (self._config.tpu_config.per_host_input_for_training is
+            tpu_config.InputPipelineConfig.PER_HOST_V2)
 
   def is_running_on_cpu(self, is_export_mode=False):
     """Determines whether the input_fn and model_fn should be invoked on CPU.
@@ -271,7 +278,8 @@ class _TPUContext(object):
       return global_batch_size
 
     # On TPU
-    if self.is_input_sharded_per_core():
+    if self.is_input_sharded_per_core() or (
+        self.is_input_per_host_with_iterators()):
       # We prohibit per core input sharding for the model parallelism case,
       # therefore it is safe to use num_cores here.
       return global_batch_size // self.num_cores
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index 52a3ee8283dc374ab14b07e0265f5a7c15a5589f..6834600b7919ff7c3a2f2e4b01e843b711329bbf 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -25,6 +25,7 @@ import threading
 import time
 import traceback
 
+import numpy as np
 import six
 from six.moves import queue as Queue  # pylint: disable=redefined-builtin
 from six.moves import xrange  # pylint: disable=redefined-builtin
@@ -48,6 +49,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
@@ -61,6 +63,7 @@ from tensorflow.python.training import evaluation
 from tensorflow.python.training import session_run_hook
 from tensorflow.python.training import training
 from tensorflow.python.training import training_util
+from tensorflow.python.util import nest
 from tensorflow.python.util import tf_inspect
 
 _INITIAL_LOSS = 1e7
@@ -69,6 +72,7 @@ _TPU_ESTIMATOR = 'tpu_estimator'
 _ITERATIONS_PER_LOOP_VAR = 'iterations_per_loop'
 _BATCH_SIZE_KEY = 'batch_size'
 _CROSS_REPLICA_SUM_OP = 'CrossReplicaSum'
+_ONE_GIGABYTE = 1024 * 1024 * 1024
 
 _RESERVED_PARAMS_KEYS = [_BATCH_SIZE_KEY]
 
@@ -676,8 +680,11 @@ def generate_per_host_enqueue_ops_fn_for_host(
         raise TypeError(
             'For mode PREDICT, `input_fn` must return `Dataset` instead of '
             '`features` and `labels`.')
+      if batch_axis is not None:
+        raise TypeError('For mode PREDICT, batch_axis is not supported yet.')
       inputs = _InputsWithStoppingSignals(
-          dataset=inputs.dataset, batch_size=ctx.batch_size_for_input_fn)
+          dataset=inputs.dataset, batch_size=ctx.batch_size_for_input_fn,
+          add_padding=True)
 
     if is_dataset:
       hooks.append(inputs.dataset_initializer_hook())
@@ -733,6 +740,61 @@ def generate_per_host_enqueue_ops_fn_for_host(
   return enqueue_ops_fn, captured_infeed_queue, hooks, is_dataset
 
 
+def generate_per_host_v2_enqueue_ops_fn_for_host(
+    ctx, input_fn, inputs_structure_recorder, device, host_id):
+  """Generates infeed enqueue ops for per-host input_fn on a single host."""
+  del host_id  # unused
+  captured_infeed_queue = _CapturedObject()
+  hooks = []
+
+  with ops.device(device):
+    inputs = _Inputs.from_input_fn(input_fn())
+
+    is_dataset = inputs.is_dataset
+    if not is_dataset:
+      raise TypeError('`input_fn` must return a `Dataset` for the PER_HOST_V2 '
+                      'input pipeline configuration.')
+    if ctx.mode == model_fn_lib.ModeKeys.PREDICT:
+      # TODO(b/XXX): Add predict support for PER_HOST_V2
+      raise TypeError('Most PREDICT not yet supported in PER_HOST_V2 mode.')
+
+    hooks.append(inputs.dataset_initializer_hook())
+
+  def enqueue_ops_fn():
+    """Generates the per_host enqueue ops."""
+    control_deps = []
+    per_host_sharded_inputs = []
+    num_replicas_per_host = ctx.num_of_replicas_per_host
+    with ops.device(device):
+      if not inputs.is_dataset:
+        raise TypeError('`input_fn` must return a `Dataset` for this mode.')
+      for _ in range(num_replicas_per_host):
+        # Use control dependencies to ensure a deterministic ordering.
+        with ops.control_dependencies(control_deps):
+          features, labels = inputs.features_and_labels()  # Calls get_next()
+
+        inputs_structure_recorder.validate_and_record_structure(
+            features, labels)
+        flattened_inputs = (
+            inputs_structure_recorder.flatten_features_and_labels(
+                features, labels))
+
+        control_deps.extend(flattened_inputs)
+        per_host_sharded_inputs.append(flattened_inputs)
+
+    infeed_queue = tpu_feed.InfeedQueue(
+        number_of_tuple_elements=len(per_host_sharded_inputs[0]))
+    captured_infeed_queue.capture(infeed_queue)
+    infeed_queue.set_configuration_from_sharded_input_tensors(
+        per_host_sharded_inputs)
+
+    per_host_enqueue_ops = infeed_queue.generate_enqueue_ops(
+        per_host_sharded_inputs, tpu_ordinal_function=ctx.tpu_ordinal_function)
+    return per_host_enqueue_ops
+
+  return enqueue_ops_fn, captured_infeed_queue, hooks, is_dataset
+
+
 class _InputPipeline(object):
   """`_InputPipeline` handles invoking `input_fn` and piping to infeed queue.
 
@@ -924,8 +986,7 @@ class _InputPipeline(object):
       # In the model-parallel case, both the host-side and device-side
       # computations must agree on the core on which infeed takes place. We
       # choose to perform infeed on logical core 0 of each replica.
-      with ops.device(tpu.core(0)):
-        values = self._infeed_queue.generate_dequeue_op()
+      values = self._infeed_queue.generate_dequeue_op(tpu_device=0)
       # The unflatten process uses the structure information recorded above.
       return self._inputs_structure_recorder.unflatten_features_and_labels(
           values)
@@ -969,10 +1030,17 @@ class _InputPipeline(object):
         host_device = tpu_host_placement_fn(host_id=host_id)
         with ops.device(host_device):
           with ops.name_scope('input_pipeline_task%d' % (host_id)):
-            enqueue_ops_fn, captured_infeed_queue, hooks, is_dataset = (
-                generate_per_host_enqueue_ops_fn_for_host(
-                    self._ctx, self._input_fn, self._inputs_structure_recorder,
-                    self._batch_axis, host_device, host_id))
+            if self._ctx.is_input_per_host_with_iterators():
+              enqueue_ops_fn, captured_infeed_queue, hooks, is_dataset = (
+                  generate_per_host_v2_enqueue_ops_fn_for_host(
+                      self._ctx, self._input_fn,
+                      self._inputs_structure_recorder, host_device, host_id))
+            else:
+              enqueue_ops_fn, captured_infeed_queue, hooks, is_dataset = (
+                  generate_per_host_enqueue_ops_fn_for_host(
+                      self._ctx, self._input_fn,
+                      self._inputs_structure_recorder, self._batch_axis,
+                      host_device, host_id))
             all_hooks.extend(hooks)
 
             # NOTE(xiejw): We dispatch here based on the return type of the
@@ -1037,8 +1105,8 @@ class _ModelFnWrapper(object):
     self._params = params
     self._ctx = ctx
 
-  def call_without_tpu(self, features, labels):
-    return self._call_model_fn(features, labels)
+  def call_without_tpu(self, features, labels, is_export_mode):
+    return self._call_model_fn(features, labels, is_export_mode=is_export_mode)
 
   def convert_to_single_tpu_train_step(self, dequeue_fn):
     """Converts user provided model_fn` as a single train step on TPU.
@@ -1197,7 +1265,7 @@ class _ModelFnWrapper(object):
 
     return predict_step, host_calls, captured_scaffold_fn
 
-  def _call_model_fn(self, features, labels, is_export_mode=True):
+  def _call_model_fn(self, features, labels, is_export_mode=False):
     """Calls the model_fn with required parameters."""
     model_fn_args = util.fn_args(self._model_fn)
     kwargs = {}
@@ -1223,7 +1291,11 @@ class _ModelFnWrapper(object):
                        'required by TPUEstimator to pass batch size as '
                        'params[\'batch_size\']'.format(self._model_fn))
 
-    batch_size_for_model_fn = self._ctx.batch_size_for_model_fn
+    if is_export_mode:
+      batch_size_for_model_fn = None
+    else:
+      batch_size_for_model_fn = self._ctx.batch_size_for_model_fn
+
     if batch_size_for_model_fn is not None:
       params[_BATCH_SIZE_KEY] = batch_size_for_model_fn
 
@@ -1618,11 +1690,6 @@ class TPUEstimator(estimator_lib.Estimator):
   2. `input_fn` must return a `Dataset` instance rather than `features`. In
   fact, .train() and .evaluate() also support Dataset as return value.
 
-  3. Each batch returned by `Dataset`'s iterator must have the *same static*
-     shape. This means two things:
-     - batch_size cannot be `None`
-     - the final batch must be padded by user to a full batch.
-
   Example (MNIST):
   ----------------
   ```
@@ -1637,41 +1704,9 @@ class TPUEstimator(estimator_lib.Estimator):
         [total_examples, height, width, 3], minval=-1, maxval=1)
 
     dataset = tf.data.Dataset.from_tensor_slices(images)
-    dataset = dataset.batch(batch_size)
     dataset = dataset.map(lambda images: {'image': images})
 
-    def pad(tensor, missing_count):
-        # Pads out the batch dimension to the complete batch_size.
-        rank = len(tensor.shape)
-        assert rank > 0
-        padding = tf.stack([[0, missing_count]] + [[0, 0]] * (rank - 1))
-        padded_shape = (batch_size,) + tuple(tensor.shape[1:])
-        padded_tensor = tf.pad(tensor, padding)
-        padded_tensor.set_shape(padded_shape)
-        return padded_tensor
-
-    def pad_batch_if_incomplete(batch_features):
-      # Pads out the batch dimension for all features.
-      real_batch_size = tf.shape(batch_features["image"])[0]
-
-      missing_count = tf.constant(batch_size, tf.int32) - real_batch_size
-
-      padded_features = {
-          key: pad(tensor, missing_count)
-          for key, tensor in batch_features.iteritems()
-      }
-      padding_mask = tf.concat(
-          [
-              tf.zeros((real_batch_size, 1), dtype=tf.int32),
-              tf.ones((missing_count, 1), dtype=tf.int32)
-          ],
-          axis=0)
-      padding_mask.set_shape((batch_size, 1))
-      padded_features["is_padding"] = padding_mask
-      return padded_features
-
-    dataset = dataset.map(pad_batch_if_incomplete)
-
+    dataset = dataset.batch(batch_size)
     return dataset
 
   def model_fn(features, labels, params, mode):
@@ -1751,7 +1786,7 @@ class TPUEstimator(estimator_lib.Estimator):
         labels to match up with the corresponding images. If None is supplied,
         and per_host_input_for_training is True, batches will be sharded based
         on the major dimension. If tpu_config.per_host_input_for_training is
-        False, batch_axis is ignored.
+        False or `PER_HOST_V2`, batch_axis is ignored.
 
     Raises:
       ValueError: `params` has reserved keys already.
@@ -1771,7 +1806,8 @@ class TPUEstimator(estimator_lib.Estimator):
         raise ValueError('`train_batch_size` cannot be `None`')
       util_lib.check_positive_integer(train_batch_size, 'train_batch_size')
 
-      if (not config.tpu_config.per_host_input_for_training and
+      if (config.tpu_config.per_host_input_for_training is
+          tpu_config.InputPipelineConfig.PER_SHARD_V1 and
           config.tpu_config.computation_shape):
         raise ValueError(
             'Model parallelism only supports per host input for training. '
@@ -1808,6 +1844,8 @@ class TPUEstimator(estimator_lib.Estimator):
         eval_batch_size, predict_batch_size,
         use_tpu)
 
+    self._is_input_fn_invoked = None
+
   def _create_global_step(self, graph):
     """Creates a global step suitable for TPUs.
 
@@ -1890,6 +1928,9 @@ class TPUEstimator(estimator_lib.Estimator):
     if 'mode' in input_fn_args:
       kwargs['mode'] = mode
 
+    # Records the fact input_fn has been invoked.
+    self._is_input_fn_invoked = True
+
     with self._ctx.with_mode(mode) as ctx:
       # Setting the batch size in params first. This helps user to have same
       # input_fn for use_tpu=True/False.
@@ -1937,15 +1978,24 @@ class TPUEstimator(estimator_lib.Estimator):
       with self._ctx.with_mode(mode) as ctx:
         model_fn_wrapper = _ModelFnWrapper(model_fn, config, params, ctx)
 
-        # For export_savedmodel, input_fn is never passed to Estimator. So,
-        # if features is callable, it means it is the input_fn passed by
-        # TPUEstimator._call_input_fn. Then we can know if the mode == PREDICT,
-        # it implies, it is the .predict API, not export_savedmodel API.
-        is_export_mode = not callable(features)
+        if mode != model_fn_lib.ModeKeys.PREDICT:
+          is_export_mode = False
+        else:
+          # For export_savedmodel, input_fn is never passed to Estimator. So, by
+          # checking the self._is_input_fn_invoked bit, we can know, given the
+          # mode == PREDICT, it is the .predict API, not export_savedmodel API.
+          if self._is_input_fn_invoked:
+            is_export_mode = False
+          else:
+            is_export_mode = True
+
+        # Clear the bit.
+        self._is_input_fn_invoked = None
 
         if ctx.is_running_on_cpu(is_export_mode=is_export_mode):
           logging.info('Running %s on CPU', mode)
-          return model_fn_wrapper.call_without_tpu(features, labels)
+          return model_fn_wrapper.call_without_tpu(
+              features, labels, is_export_mode=is_export_mode)
 
         assert labels is None, '`labels` passed to `model_fn` must be `None`.'
         # TPUEstimator._call_input_fn passes `input_fn` as features to here.
@@ -1969,7 +2019,8 @@ class TPUEstimator(estimator_lib.Estimator):
                   host_ops,
                   run_infeed_loop_on_coordinator=(
                       run_infeed_loop_on_coordinator)),
-              ExamplesPerSecondHook(ctx.global_batch_size),
+              ExamplesPerSecondHook(ctx.global_batch_size,
+                                    output_dir=self.model_dir),
               InstallSignalHandlerHook(),
               training.LoggingTensorHook(
                   {
@@ -2083,12 +2134,18 @@ class TPUEstimator(estimator_lib.Estimator):
           host_ops = host_call_ret['host_call']
 
         predictions = host_call_ret['predictions']
-        stopping_signals = host_call_ret['signals']
+        _verify_cross_hosts_transfer_size(
+            predictions, message=(
+                'The estimated size for TPUEstimatorSpec.predictions is too '
+                'large.'))
+        signals = host_call_ret['signals']
 
         with ops.control_dependencies(host_ops):
           host_ops = []  # Empty, we do do not need it anymore.
           scalar_stopping_signal = _StopSignals.as_scalar_stopping_signal(
-              stopping_signals)
+              signals)
+          predictions = _PaddingSignals.slice_tensor_or_dict(
+              predictions, signals)
 
         hooks = [
             _StoppingPredictHook(scalar_stopping_signal),
@@ -2096,13 +2153,6 @@ class TPUEstimator(estimator_lib.Estimator):
                                                      host_ops),
         ] + input_hooks
 
-        # TODO(b/73813593): Delete this logging once the bug is resolved.
-        logging.info(
-            'If the Tensors in TPUEstimatorSpec.predictions dict are large, '
-            'you might observe the TPU program getting stuck (b/73813593). '
-            'Consider using small Tensors in the predictions dict to verify '
-            'the issue and report on the bug.')
-
         return model_fn_lib.EstimatorSpec(
             mode,
             prediction_hooks=hooks,
@@ -2122,8 +2172,7 @@ def _eval_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
   def multi_tpu_eval_steps_on_single_shard():
     return training_loop.repeat(
         iterations_per_loop_var,
-        single_tpu_eval_step, [_ZERO_LOSS],
-        name='loop')
+        single_tpu_eval_step, [_ZERO_LOSS])
 
   (loss,) = tpu.shard(
       multi_tpu_eval_steps_on_single_shard,
@@ -2146,8 +2195,7 @@ def _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
   def multi_tpu_train_steps_on_single_shard():
     return training_loop.repeat(
         iterations_per_loop_var,
-        single_tpu_train_step, [_INITIAL_LOSS],
-        name=b'loop')
+        single_tpu_train_step, [_INITIAL_LOSS])
 
   (loss,) = tpu.shard(
       multi_tpu_train_steps_on_single_shard,
@@ -2378,6 +2426,10 @@ class _Inputs(object):
   def features_and_labels(self):
     """Gets `features` and `labels`."""
     if self.is_dataset:
+      if self._iterator is None:
+        raise RuntimeError('Internal error: Must call dataset_initializer_hook '
+                           'before calling features_and_labels(). Please file '
+                           'a bug!')
       return _Inputs._parse_inputs(self._iterator.get_next())
 
     return (self._features, self._labels)
@@ -2390,20 +2442,19 @@ class _Inputs(object):
     return self._dataset
 
 
-# TODO(xiejw): Extend this to support final partial batch.
 class _InputsWithStoppingSignals(_Inputs):
   """Inputs with `_StopSignals` inserted into the dataset."""
 
-  def __init__(self, dataset, batch_size):
+  def __init__(self, dataset, batch_size, add_padding=False):
 
     assert dataset is not None
 
     user_provided_dataset = dataset.map(
         _InputsWithStoppingSignals.insert_stopping_signal(
-            stop=False, batch_size=batch_size))
+            stop=False, batch_size=batch_size, add_padding=add_padding))
     final_batch_dataset = dataset.take(1).map(
         _InputsWithStoppingSignals.insert_stopping_signal(
-            stop=True, batch_size=batch_size))
+            stop=True, batch_size=batch_size, add_padding=add_padding))
     dataset = user_provided_dataset.concatenate(final_batch_dataset).prefetch(2)
 
     super(_InputsWithStoppingSignals, self).__init__(dataset=dataset)
@@ -2433,7 +2484,7 @@ class _InputsWithStoppingSignals(_Inputs):
     return signals
 
   @staticmethod
-  def insert_stopping_signal(stop, batch_size):
+  def insert_stopping_signal(stop, batch_size, add_padding=False):
     """Inserts stopping_signal into dataset via _map_fn.
 
     Here we change the data structure in the dataset, such that the return value
@@ -2444,6 +2495,7 @@ class _InputsWithStoppingSignals(_Inputs):
     Args:
       stop: bool, state of current stopping signals.
       batch_size: int, batch size.
+      add_padding: bool, whether to pad the tensor to full batch size.
 
     Returns:
       A map_fn passed to dataset.map API.
@@ -2457,11 +2509,25 @@ class _InputsWithStoppingSignals(_Inputs):
         args = args[0]
       features, labels = _Inputs._parse_inputs(args)
       new_input_dict = {}
-      new_input_dict['features'] = features
-      if labels is not None:
-        new_input_dict['labels'] = labels
+
+      if add_padding:
+        padding_mask, features, labels = (
+            _PaddingSignals.pad_features_and_labels(
+                features, labels, batch_size))
+
+        new_input_dict['features'] = features
+        if labels is not None:
+          new_input_dict['labels'] = labels
+
+      else:
+        new_input_dict['features'] = features
+        if labels is not None:
+          new_input_dict['labels'] = labels
+        padding_mask = None
+
       new_input_dict['signals'] = _StopSignals(
-          stop=stop, batch_size=batch_size).as_dict()
+          stop=stop, batch_size=batch_size, padding_mask=padding_mask).as_dict()
+
       return new_input_dict
 
     return _map_fn
@@ -2470,23 +2536,28 @@ class _InputsWithStoppingSignals(_Inputs):
 class _StopSignals(object):
   """Signals class holding all logic to handle TPU stopping condition."""
 
-  NON_STOPPING_SIGNAL = 0.0
-  STOPPING_SIGNAL = 1.0
+  NON_STOPPING_SIGNAL = False
+  STOPPING_SIGNAL = True
 
-  def __init__(self, stop, batch_size):
+  def __init__(self, stop, batch_size, padding_mask=None):
     self._stop = stop
     self._batch_size = batch_size
+    self._padding_mask = padding_mask
 
   def as_dict(self):
+    """Returns the signals as Python dict."""
     shape = [self._batch_size, 1]
-    dtype = dtypes.float32
+    dtype = dtypes.bool
 
     if self._stop:
       stopping = array_ops.ones(shape=shape, dtype=dtype)
     else:
       stopping = array_ops.zeros(shape=shape, dtype=dtype)
 
-    return {'stopping': stopping}
+    signals = {'stopping': stopping}
+    if self._padding_mask is not None:
+      signals['padding_mask'] = self._padding_mask
+    return signals
 
   @staticmethod
   def as_scalar_stopping_signal(signals):
@@ -2494,7 +2565,118 @@ class _StopSignals(object):
 
   @staticmethod
   def should_stop(scalar_stopping_signal):
-    return scalar_stopping_signal >= _StopSignals.STOPPING_SIGNAL
+    if isinstance(scalar_stopping_signal, ops.Tensor):
+      # STOPPING_SIGNAL is a constant True. Here, the logical_and is just the TF
+      # way to express the bool check whether scalar_stopping_signal is True.
+      return math_ops.logical_and(
+          scalar_stopping_signal, _StopSignals.STOPPING_SIGNAL)
+    else:
+      # For non Tensor case, it is used in SessionRunHook. So, we cannot modify
+      # the graph anymore. Here, we use pure Python.
+      return bool(scalar_stopping_signal)
+
+
+class _PaddingSignals(object):
+  """Signals class holding all logic to handle padding."""
+
+  @staticmethod
+  def pad_features_and_labels(features, labels, batch_size):
+    """Pads out the batch dimension of features and labels."""
+    real_batch_size = array_ops.shape(
+        _PaddingSignals._find_any_tensor(features))[0]
+
+    batch_size_tensor = constant_op.constant(batch_size, dtypes.int32)
+
+    check_greater = check_ops.assert_greater_equal(
+        batch_size_tensor, real_batch_size,
+        data=(batch_size_tensor, real_batch_size),
+        message='The real batch size should not be greater than batch_size.')
+
+    with ops.control_dependencies([check_greater]):
+      missing_count = batch_size_tensor - real_batch_size
+
+    def pad_single_tensor(tensor):
+      """Pads out the batch dimension of a tensor to the complete batch_size."""
+      rank = len(tensor.shape)
+      assert rank > 0
+      padding = array_ops.stack([[0, missing_count]] + [[0, 0]] * (rank - 1))
+      padded_shape = (batch_size,) + tuple(tensor.shape[1:])
+      padded_tensor = array_ops.pad(tensor, padding)
+      padded_tensor.set_shape(padded_shape)
+      return padded_tensor
+
+    def nest_pad(tensor_or_dict):
+      return nest.map_structure(pad_single_tensor, tensor_or_dict)
+
+    features = nest_pad(features)
+    if labels is not None:
+      labels = nest_pad(labels)
+
+    padding_mask = _PaddingSignals._padding_mask(
+        real_batch_size, missing_count, batch_size)
+
+    return padding_mask, features, labels
+
+  @staticmethod
+  def slice_tensor_or_dict(tensor_or_dict, signals):
+    """Slice the real Tensors according to padding mask in signals."""
+
+    padding_mask = signals['padding_mask']
+    batch_size = array_ops.shape(padding_mask)[0]
+
+    def verify_batch_size(tensor):
+      check_batch_size = math_ops.equal(batch_size, tensor.shape[0])
+      with ops.control_dependencies([check_batch_size]):
+        return array_ops.identity(tensor)
+
+    def slice_single_tensor(tensor):
+      rank = len(tensor.shape)
+      assert rank > 0
+      real_batch_size = batch_size - math_ops.reduce_sum(padding_mask)
+      return verify_batch_size(tensor)[0:real_batch_size]
+
+    # As we split the Tensors to all TPU cores and concat them back, it is
+    # important to ensure the real data is placed before padded ones, i.e.,
+    # order is preserved. By that, the sliced padding mask should have all 0's.
+    # If this assertion failed, # the slice logic here would not hold.
+    sliced_padding_mask = slice_single_tensor(padding_mask)
+    assert_padding_mask = math_ops.equal(
+        math_ops.reduce_sum(sliced_padding_mask), 0)
+
+    with ops.control_dependencies([assert_padding_mask]):
+      should_stop = _StopSignals.should_stop(
+          _StopSignals.as_scalar_stopping_signal(signals))
+
+    is_full_batch = math_ops.equal(math_ops.reduce_sum(padding_mask), 0)
+
+    def slice_fn(tensor):
+      # If the current batch is full batch or part of stopping signals, we do
+      # not need to slice to save performance.
+      return control_flow_ops.cond(
+          math_ops.logical_or(should_stop, is_full_batch),
+          (lambda: verify_batch_size(tensor)),
+          (lambda: slice_single_tensor(tensor)))
+
+    return nest.map_structure(slice_fn, tensor_or_dict)
+
+  @staticmethod
+  def _find_any_tensor(batch_features):
+    tensors = [x for x in nest.flatten(batch_features)
+               if isinstance(x, ops.Tensor)]
+    if not tensors:
+      raise ValueError('Cannot find any Tensor in features dict.')
+    return tensors[0]
+
+  @staticmethod
+  def _padding_mask(real_batch_size, missing_count, batch_size):
+    padding_mask = array_ops.concat(
+        [
+            array_ops.zeros((real_batch_size,), dtype=dtypes.int32),
+            array_ops.ones((missing_count,), dtype=dtypes.int32)
+        ],
+        axis=0)
+    padding_mask.set_shape((batch_size,))
+    return padding_mask
 
 
 class _SignalsHelper(object):
@@ -2515,3 +2697,21 @@ class _SignalsHelper(object):
   @staticmethod
   def as_tensor_list(signals):
     return [signals[key] for key in sorted(signals.iterkeys())]
+
+
+def _verify_cross_hosts_transfer_size(tensor_dict, message):
+  total_size = 0
+  tensor_structure = {}
+  for key, tensor in tensor_dict.items():
+    shape = tensor.shape
+    size = np.product(shape) * tensor.dtype.size
+    tensor_structure[key] = shape
+    total_size += size
+  if total_size >= _ONE_GIGABYTE:
+    raise ValueError(
+        '{} The transfer size is larger than the protobuf limit. Please '
+        'consider to use Tensors with smaller shapes or reduce batch '
+        'size. Given:\n'
+        '{}'.format(message, '\n'.join([
+            ' -- Key: {}, Shape: {}'.format(k, v)
+            for k, v in tensor_structure.items()])))
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator_signals_test.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator_signals_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e90957e6dea7ff1777dd3e26cdf1c6fdb340dd3
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator_signals_test.py
@@ -0,0 +1,291 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TPU Estimator Signalling Tests."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.tpu.python.tpu import tpu_estimator
+from tensorflow.python import data as dataset_lib
+from tensorflow.python.client import session
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import test
+
+
+def make_input_fn(num_samples):
+  a = np.linspace(0, 100.0, num=num_samples)
+  b = np.reshape(np.array(a, dtype=np.float32), (len(a), 1))
+
+  def input_fn(params):
+    batch_size = params['batch_size']
+    da1 = dataset_lib.Dataset.from_tensor_slices(a)
+    da2 = dataset_lib.Dataset.from_tensor_slices(b)
+
+    dataset = dataset_lib.Dataset.zip((da1, da2))
+    dataset = dataset.map(lambda fa, fb: {'a': fa, 'b': fb})
+    dataset = dataset.batch(batch_size)
+    return dataset
+  return input_fn, (a, b)
+
+
+def make_input_fn_with_labels(num_samples):
+  a = np.linspace(0, 100.0, num=num_samples)
+  b = np.reshape(np.array(a, dtype=np.float32), (len(a), 1))
+
+  def input_fn(params):
+    batch_size = params['batch_size']
+    da1 = dataset_lib.Dataset.from_tensor_slices(a)
+    da2 = dataset_lib.Dataset.from_tensor_slices(b)
+
+    dataset = dataset_lib.Dataset.zip((da1, da2))
+    dataset = dataset.map(lambda fa, fb: ({'a': fa}, fb))
+    dataset = dataset.batch(batch_size)
+    return dataset
+  return input_fn, (a, b)
+
+
+class TPUEstimatorStoppingSignalsTest(test.TestCase):
+
+  def test_normal_output_without_signals(self):
+    num_samples = 4
+    batch_size = 2
+
+    params = {'batch_size': batch_size}
+    input_fn, (a, b) = make_input_fn(num_samples=num_samples)
+
+    with ops.Graph().as_default():
+      dataset = input_fn(params)
+      features = dataset.make_one_shot_iterator().get_next()
+
+      # With tf.data.Dataset.batch, the batch is None, i.e., dynamic shape.
+      self.assertIsNone(features['a'].shape.as_list()[0])
+
+      with session.Session() as sess:
+        result = sess.run(features)
+        self.assertAllEqual(a[:batch_size], result['a'])
+        self.assertAllEqual(b[:batch_size], result['b'])
+
+        # This run should work as num_samples / batch_size = 2.
+        result = sess.run(features)
+        self.assertAllEqual(a[batch_size:num_samples], result['a'])
+        self.assertAllEqual(b[batch_size:num_samples], result['b'])
+
+        with self.assertRaises(errors.OutOfRangeError):
+          # Given num_samples and batch_size, this run should fail.
+          sess.run(features)
+
+  def test_output_with_stopping_signals(self):
+    num_samples = 4
+    batch_size = 2
+
+    params = {'batch_size': batch_size}
+    input_fn, (a, b) = make_input_fn(num_samples=num_samples)
+
+    with ops.Graph().as_default():
+      dataset = input_fn(params)
+      inputs = tpu_estimator._InputsWithStoppingSignals(dataset, batch_size)
+      hook = inputs.dataset_initializer_hook()
+      features, _ = inputs.features_and_labels()
+      signals = inputs.signals()
+
+      # With tf.data.Dataset.batch, the batch is None, i.e., dynamic shape.
+      self.assertIsNone(features['a'].shape.as_list()[0])
+
+      with session.Session() as sess:
+        hook.begin()
+        hook.after_create_session(sess, coord=None)
+
+        result, evaluated_signals = sess.run([features, signals])
+        self.assertAllEqual(a[:batch_size], result['a'])
+        self.assertAllEqual(b[:batch_size], result['b'])
+        self.assertAllEqual([[0.]] * batch_size, evaluated_signals['stopping'])
+
+        # This run should work as num_samples / batch_size = 2.
+        result, evaluated_signals = sess.run([features, signals])
+        self.assertAllEqual(a[batch_size:num_samples], result['a'])
+        self.assertAllEqual(b[batch_size:num_samples], result['b'])
+        self.assertAllEqual([[0.]] * batch_size, evaluated_signals['stopping'])
+
+        # This run should work, *but* see STOP ('1') as signals
+        _, evaluated_signals = sess.run([features, signals])
+        self.assertAllEqual([[1.]] * batch_size, evaluated_signals['stopping'])
+
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(features)
+
+
+class TPUEstimatorStoppingSignalsWithPaddingTest(test.TestCase):
+
+  def test_num_samples_divisible_by_batch_size(self):
+    num_samples = 4
+    batch_size = 2
+
+    params = {'batch_size': batch_size}
+    input_fn, (a, b) = make_input_fn(num_samples=num_samples)
+
+    with ops.Graph().as_default():
+      dataset = input_fn(params)
+      inputs = tpu_estimator._InputsWithStoppingSignals(dataset, batch_size,
+                                                        add_padding=True)
+      hook = inputs.dataset_initializer_hook()
+      features, _ = inputs.features_and_labels()
+      signals = inputs.signals()
+
+      # With padding, all shapes are static now.
+      self.assertEqual(batch_size, features['a'].shape.as_list()[0])
+
+      with session.Session() as sess:
+        hook.begin()
+        hook.after_create_session(sess, coord=None)
+
+        result, evaluated_signals = sess.run([features, signals])
+        self.assertAllEqual(a[:batch_size], result['a'])
+        self.assertAllEqual(b[:batch_size], result['b'])
+        self.assertAllEqual([[0.]] * batch_size, evaluated_signals['stopping'])
+        self.assertAllEqual([0.] * batch_size,
+                            evaluated_signals['padding_mask'])
+
+        # This run should work as num_samples / batch_size = 2.
+        result, evaluated_signals = sess.run([features, signals])
+        self.assertAllEqual(a[batch_size:num_samples], result['a'])
+        self.assertAllEqual(b[batch_size:num_samples], result['b'])
+        self.assertAllEqual([[0.]] * batch_size, evaluated_signals['stopping'])
+        self.assertAllEqual([0.] * batch_size,
+                            evaluated_signals['padding_mask'])
+
+        # This run should work, *but* see STOP ('1') as signals
+        _, evaluated_signals = sess.run([features, signals])
+        self.assertAllEqual([[1.]] * batch_size, evaluated_signals['stopping'])
+
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(features)
+
+  def test_num_samples_not_divisible_by_batch_size(self):
+    num_samples = 5
+    batch_size = 2
+
+    params = {'batch_size': batch_size}
+    input_fn, (a, b) = make_input_fn_with_labels(num_samples=num_samples)
+
+    with ops.Graph().as_default():
+      dataset = input_fn(params)
+      inputs = tpu_estimator._InputsWithStoppingSignals(dataset, batch_size,
+                                                        add_padding=True)
+      hook = inputs.dataset_initializer_hook()
+      features, labels = inputs.features_and_labels()
+      signals = inputs.signals()
+
+      # With padding, all shapes are static.
+      self.assertEqual(batch_size, features['a'].shape.as_list()[0])
+
+      with session.Session() as sess:
+        hook.begin()
+        hook.after_create_session(sess, coord=None)
+
+        evaluated_features, evaluated_labels, evaluated_signals = (
+            sess.run([features, labels, signals]))
+        self.assertAllEqual(a[:batch_size], evaluated_features['a'])
+        self.assertAllEqual(b[:batch_size], evaluated_labels)
+        self.assertAllEqual([[0.]] * batch_size, evaluated_signals['stopping'])
+        self.assertAllEqual([0.] * batch_size,
+                            evaluated_signals['padding_mask'])
+
+        # This run should work as num_samples / batch_size >= 2.
+        evaluated_features, evaluated_labels, evaluated_signals = (
+            sess.run([features, labels, signals]))
+        self.assertAllEqual(a[batch_size:2*batch_size], evaluated_features['a'])
+        self.assertAllEqual(b[batch_size:2*batch_size], evaluated_labels)
+        self.assertAllEqual([[0.]] * batch_size, evaluated_signals['stopping'])
+        self.assertAllEqual([0.] * batch_size,
+                            evaluated_signals['padding_mask'])
+
+        # This is the final partial batch.
+        evaluated_features, evaluated_labels, evaluated_signals = (
+            sess.run([features, labels, signals]))
+        real_batch_size = num_samples % batch_size
+
+        # Assert the real part.
+        self.assertAllEqual(a[2*batch_size:num_samples],
+                            evaluated_features['a'][:real_batch_size])
+        self.assertAllEqual(b[2*batch_size:num_samples],
+                            evaluated_labels[:real_batch_size])
+        # Assert the padded part.
+        self.assertAllEqual([0.0] * (batch_size - real_batch_size),
+                            evaluated_features['a'][real_batch_size:])
+        self.assertAllEqual([[0.0]] * (batch_size - real_batch_size),
+                            evaluated_labels[real_batch_size:])
+
+        self.assertAllEqual([[0.]] * batch_size, evaluated_signals['stopping'])
+
+        padding = ([.0] * real_batch_size
+                   + [1.] * (batch_size - real_batch_size))
+        self.assertAllEqual(padding, evaluated_signals['padding_mask'])
+
+        # This run should work, *but* see STOP ('1') as signals
+        _, evaluated_signals = sess.run([features, signals])
+        self.assertAllEqual([[1.]] * batch_size, evaluated_signals['stopping'])
+
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(features)
+
+  def test_slice(self):
+    num_samples = 3
+    batch_size = 2
+
+    params = {'batch_size': batch_size}
+    input_fn, (a, b) = make_input_fn(num_samples=num_samples)
+
+    with ops.Graph().as_default():
+      dataset = input_fn(params)
+      inputs = tpu_estimator._InputsWithStoppingSignals(dataset, batch_size,
+                                                        add_padding=True)
+      hook = inputs.dataset_initializer_hook()
+      features, _ = inputs.features_and_labels()
+      signals = inputs.signals()
+
+      sliced_features = (
+          tpu_estimator._PaddingSignals.slice_tensor_or_dict(
+              features, signals))
+
+      with session.Session() as sess:
+        hook.begin()
+        hook.after_create_session(sess, coord=None)
+
+        result, evaluated_signals = sess.run([sliced_features, signals])
+        self.assertAllEqual(a[:batch_size], result['a'])
+        self.assertAllEqual(b[:batch_size], result['b'])
+        self.assertAllEqual([[0.]] * batch_size, evaluated_signals['stopping'])
+
+        # This is the final partial batch.
+        result, evaluated_signals = sess.run([sliced_features, signals])
+        self.assertEqual(1, len(result['a']))
+        self.assertAllEqual(a[batch_size:num_samples], result['a'])
+        self.assertAllEqual(b[batch_size:num_samples], result['b'])
+        self.assertAllEqual([[0.]] * batch_size, evaluated_signals['stopping'])
+
+        # This run should work, *but* see STOP ('1') as signals
+        _, evaluated_signals = sess.run([sliced_features, signals])
+        self.assertAllEqual([[1.]] * batch_size, evaluated_signals['stopping'])
+
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(sliced_features)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_feed.py b/tensorflow/contrib/tpu/python/tpu/tpu_feed.py
index 42ac6eb680437ec82287468bcba2b770ac0e5749..604e6600c81a4136a1f10e79a725a887a96f4d86 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_feed.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_feed.py
@@ -23,6 +23,7 @@ from __future__ import print_function
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.contrib.tpu.python.ops import tpu_ops
+from tensorflow.contrib.tpu.python.tpu import tpu
 from tensorflow.contrib.tpu.python.tpu import tpu_sharding
 
 from tensorflow.python.framework import dtypes
@@ -368,13 +369,20 @@ class InfeedQueue(object):
       policy.freeze()
     self._validate()
 
-  def generate_dequeue_op(self):
+  def generate_dequeue_op(self, tpu_device=0):
     """Generates the device-side Op to dequeue a tuple from the queue.
 
     Implicitly freezes the queue configuration if it is not already
     frozen, which will raise errors if the shapes and types have not
     been fully specified.
 
+    Args:
+      tpu_device: The TPU device ordinal where the infeed instruction should be
+        placed. If None, no explicit placement will be performed, and it is up
+        to the user to call this API from within a proper TPU device scope.
+        The XLA code will fail if the TPU dequeue instruction is not bound to
+        any device.
+
     Returns:
       A list of Outputs corresponding to a shard of infeed dequeued
       into XLA, suitable for use within a replicated block.
@@ -392,8 +400,13 @@ class InfeedQueue(object):
         policy.get_sharded_shape(shape)
         for (shape, policy) in zip(self._tuple_shapes, self._sharding_policies)
     ]
-    return tpu_ops.infeed_dequeue_tuple(
-        dtypes=self._tuple_types, shapes=sharded_shapes, name=full_name)
+    if tpu_device is not None:
+      with ops.device(tpu.core(tpu_device)):
+        return tpu_ops.infeed_dequeue_tuple(
+            dtypes=self._tuple_types, shapes=sharded_shapes, name=full_name)
+    else:
+      return tpu_ops.infeed_dequeue_tuple(
+          dtypes=self._tuple_types, shapes=sharded_shapes, name=full_name)
 
   def _generate_enqueue_op(self,
                            inputs,
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py b/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py
index 493d1848c072caa5254fc87c67badc2e99ec16ee..3ae350c7bb345cabdb74783c3233354d67394d3a 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py
@@ -72,9 +72,9 @@ def _query_tpu_system_metadata(master_address, run_config,
               tpu_core_count += 1
           break
     except errors.DeadlineExceededError:
-      msg = ('Fail to connect Tensorflow master. It could be the TPU worker is '
-             'not ready (still under scheduling) or Tensorflow '
-             'master address is correct: got (%s).' %
+      msg = ('Failed to connect to the Tensorflow master. The TPU worker may '
+             'not be ready (still scheduling) or the Tensorflow master address '
+             'is incorrect: got (%s).' %
              (master_address))
 
       # TODO(xiejw): For local or grpc master we might not need retry logic
@@ -120,7 +120,8 @@ def _query_tpu_system_metadata(master_address, run_config,
     logging.info('*** Num TPU Workers: %d', metadata.num_hosts)
     logging.info('*** Num TPU Cores Per Worker: %d',
                  metadata.num_of_cores_per_host)
-    logging.info('*** Available Devices: %s', metadata.devices)
+    for device in metadata.devices:
+      logging.info('*** Available Device: %s', device)
   else:
     logging.info('Failed to find TPU: %s', metadata)
   return metadata
diff --git a/tensorflow/contrib/tpu/python/tpu/training_loop.py b/tensorflow/contrib/tpu/python/tpu/training_loop.py
index 82a75d02552b7b013452945a76b16c2c2fb9fa82..10a8bccf3b23add75188e16eb3591c32eb8621ee 100644
--- a/tensorflow/contrib/tpu/python/tpu/training_loop.py
+++ b/tensorflow/contrib/tpu/python/tpu/training_loop.py
@@ -44,7 +44,7 @@ def while_loop(condition, body, inputs=None, infeed_queue=None, name=None):
       None (equivalent to an empty list).
     infeed_queue: if not None, the infeed queue from which to append a tuple
       of arguments as inputs to condition.
-    name: an optional name for the loop.
+    name: (Deprecated) Does nothing.
 
   Returns:
     The final values of the loop-carried tensors.
@@ -52,7 +52,7 @@ def while_loop(condition, body, inputs=None, infeed_queue=None, name=None):
   Raises:
     TypeError: if body or condition has the wrong signature.
   """
-
+  del name
   # Converts inputs to Tensors.
   inputs = [] if inputs is None else [ops.convert_to_tensor(x) for
                                       x in inputs]
@@ -166,7 +166,7 @@ def while_loop(condition, body, inputs=None, infeed_queue=None, name=None):
   if input_arity == 0:
     inputs = [array_ops.constant(0)]
   return control_flow_ops.while_loop(condition_wrapper, body_wrapper, inputs,
-                                     name=name)
+                                     name="")
 
 
 def repeat(n, body, inputs=None, infeed_queue=None, name=None):
@@ -183,7 +183,7 @@ def repeat(n, body, inputs=None, infeed_queue=None, name=None):
       None (equivalent to an empty list).
     infeed_queue: if not None, the infeed queue from which to append a tuple
       of arguments as inputs to condition.
-    name: an optional name for the loop.
+    name: (Deprecated) Does nothing.
   Returns:
     The final values of the loop-carried tensors.
   Raises:
diff --git a/tensorflow/contrib/tpu/tpu_estimator.md b/tensorflow/contrib/tpu/tpu_estimator.md
index 4ef8f9eebdb165e5fe221be8670276bf943159b3..639e70816926aaed850cee62dca6aa819b38de8b 100644
--- a/tensorflow/contrib/tpu/tpu_estimator.md
+++ b/tensorflow/contrib/tpu/tpu_estimator.md
@@ -172,7 +172,7 @@ It is always recommended to port a small, simple model first to make sure that
 you are familiar with the basic concepts of `TPUEstimator` and test end-to-end
 behavior. Once your simple model runs, gradually add more functionality.
 In addition, there are several sample models, available at
-[github.com/tensorflow/tpu-demos](https://github.com/tensorflow/tpu-demos).
+[github.com/tensorflow/tpu](https://github.com/tensorflow/tpu).
 
 To convert your code from the vanilla `Estimator` class to use TPUs, change the
 following (note some of the details may change over time):
diff --git a/tensorflow/contrib/training/BUILD b/tensorflow/contrib/training/BUILD
index 6ae2f382528c37ae647b73ea01a7f88c07580c78..4d2bfd3e434e60b3fac408931688e8e486b7e494 100644
--- a/tensorflow/contrib/training/BUILD
+++ b/tensorflow/contrib/training/BUILD
@@ -308,18 +308,6 @@ py_test(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tf_proto_library(
     name = "protos_all",
     srcs = glob(["**/*.proto"]),
diff --git a/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py b/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py
index dbdbb08a8252c799924812c83fff7f0631424761..f305197c190b67355338c407a7895a0507941ddb 100644
--- a/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py
+++ b/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
@@ -517,6 +518,7 @@ class BatchSequencesWithStatesTestWithCApi(BatchSequencesWithStatesTest):
     ops._USE_C_API = self._prev_value
 
 
+@test_util.with_c_api
 class PaddingTest(test.TestCase):
 
   def testPaddingInvalidLengths(self):
diff --git a/tensorflow/contrib/training/python/training/evaluation.py b/tensorflow/contrib/training/python/training/evaluation.py
index 1a5fb45be0ad463f2b4189d97ce4bd41a67a1937..4bb53e867811b27dc95857cfdfe936dd2e3b5c6e 100644
--- a/tensorflow/contrib/training/python/training/evaluation.py
+++ b/tensorflow/contrib/training/python/training/evaluation.py
@@ -36,9 +36,8 @@ out the metrics values to stdout:
 
   # Choose the metrics to compute:
   names_to_values, names_to_updates = tf.contrib.metrics.aggregate_metric_map({
-      "accuracy": tf.contrib.metrics.streaming_accuracy(predictions, labels),
-      "mse": tf.contrib.metrics.streaming_mean_squared_error(
-        predictions, labels),
+      "accuracy": tf.metrics.accuracy(labels, predictions),
+      "mse": tf.metrics.mean_squared_error(labels, predictions),
   })
 
   # Define the summaries to write:
@@ -81,9 +80,8 @@ more summaries and call the evaluate_repeatedly method:
 
   # Choose the metrics to compute:
   names_to_values, names_to_updates = tf.contrib.metrics.aggregate_metric_map({
-      "accuracy": tf.contrib.metrics.streaming_accuracy(predictions, labels),
-      "mse": tf.contrib.metrics.streaming_mean_squared_error(
-          predictions, labels),
+      "accuracy": tf.metrics.accuracy(labels, predictions),
+      "mse": tf.metrics.mean_squared_error(labels, predictions),
   })
 
   # Define the summaries to write:
diff --git a/tensorflow/contrib/training/python/training/evaluation_test.py b/tensorflow/contrib/training/python/training/evaluation_test.py
index b07039916c203940039732c12938e7f342fa72a3..c36d00e8425ccbfe9338b50fc492dc1334d59731 100644
--- a/tensorflow/contrib/training/python/training/evaluation_test.py
+++ b/tensorflow/contrib/training/python/training/evaluation_test.py
@@ -27,7 +27,6 @@ import numpy as np
 from tensorflow.contrib.framework.python.ops import variables
 from tensorflow.contrib.layers.python.layers import layers
 from tensorflow.contrib.losses.python.losses import loss_ops
-from tensorflow.contrib.metrics.python.ops import metric_ops
 from tensorflow.contrib.training.python.training import evaluation
 from tensorflow.contrib.training.python.training import training
 from tensorflow.core.protobuf import config_pb2
@@ -38,6 +37,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import metrics
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import gfile
@@ -196,7 +196,8 @@ class EvaluateOnceTest(test.TestCase):
     logits = logistic_classifier(inputs)
     predictions = math_ops.round(logits)
 
-    accuracy, update_op = metric_ops.streaming_accuracy(predictions, labels)
+    accuracy, update_op = metrics.accuracy(
+        predictions=predictions, labels=labels)
 
     checkpoint_path = evaluation.wait_for_new_checkpoint(checkpoint_dir)
 
@@ -311,7 +312,8 @@ class EvaluateRepeatedlyTest(test.TestCase):
     logits = logistic_classifier(inputs)
     predictions = math_ops.round(logits)
 
-    accuracy, update_op = metric_ops.streaming_accuracy(predictions, labels)
+    accuracy, update_op = metrics.accuracy(
+        predictions=predictions, labels=labels)
 
     final_values = evaluation.evaluate_repeatedly(
         checkpoint_dir=checkpoint_dir,
@@ -365,7 +367,8 @@ class EvaluateRepeatedlyTest(test.TestCase):
     logits = logistic_classifier(inputs)
     predictions = math_ops.round(logits)
 
-    accuracy, update_op = metric_ops.streaming_accuracy(predictions, labels)
+    accuracy, update_op = metrics.accuracy(
+        predictions=predictions, labels=labels)
 
     timeout_fn_calls = [0]
     def timeout_fn():
@@ -417,9 +420,8 @@ class EvaluateRepeatedlyTest(test.TestCase):
     self.assertEqual(final_values['my_var'], expected_value)
 
   def _create_names_to_metrics(self, predictions, labels):
-    accuracy0, update_op0 = metric_ops.streaming_accuracy(predictions, labels)
-    accuracy1, update_op1 = metric_ops.streaming_accuracy(
-        predictions + 1, labels)
+    accuracy0, update_op0 = metrics.accuracy(labels, predictions)
+    accuracy1, update_op1 = metrics.accuracy(labels, predictions + 1)
 
     names_to_values = {'Accuracy': accuracy0, 'Another_accuracy': accuracy1}
     names_to_updates = {'Accuracy': update_op0, 'Another_accuracy': update_op1}
diff --git a/tensorflow/contrib/training/python/training/hparam_test.py b/tensorflow/contrib/training/python/training/hparam_test.py
index 16397622edd382bc6dcb12870de5fa22130a2c2b..96eff86d8d48bb7f61b0fe9db2ccf2fe12c741bb 100644
--- a/tensorflow/contrib/training/python/training/hparam_test.py
+++ b/tensorflow/contrib/training/python/training/hparam_test.py
@@ -38,40 +38,60 @@ class HParamsTest(test.TestCase):
     self.assertFalse('bar' in hparams)
 
   def testSomeValues(self):
-    hparams = hparam.HParams(aaa=1, b=2.0, c_c='relu6')
-    self.assertDictEqual({'aaa': 1, 'b': 2.0, 'c_c': 'relu6'}, hparams.values())
-    expected_str = '[(\'aaa\', 1), (\'b\', 2.0), (\'c_c\', \'relu6\')]'
+    hparams = hparam.HParams(aaa=1, b=2.0, c_c='relu6', d='/a/b=c/d')
+    self.assertDictEqual(
+        {'aaa': 1, 'b': 2.0, 'c_c': 'relu6', 'd': '/a/b=c/d'},
+        hparams.values())
+    expected_str = ('[(\'aaa\', 1), (\'b\', 2.0), (\'c_c\', \'relu6\'), '
+                    '(\'d\', \'/a/b=c/d\')]')
     self.assertEqual(expected_str, str(hparams.__str__()))
     self.assertEqual(expected_str, str(hparams))
     self.assertEqual(1, hparams.aaa)
     self.assertEqual(2.0, hparams.b)
     self.assertEqual('relu6', hparams.c_c)
+    self.assertEqual('/a/b=c/d', hparams.d)
     hparams.parse('aaa=12')
     self.assertDictEqual({
         'aaa': 12,
         'b': 2.0,
-        'c_c': 'relu6'
+        'c_c': 'relu6',
+        'd': '/a/b=c/d'
     }, hparams.values())
     self.assertEqual(12, hparams.aaa)
     self.assertEqual(2.0, hparams.b)
     self.assertEqual('relu6', hparams.c_c)
+    self.assertEqual('/a/b=c/d', hparams.d)
     hparams.parse('c_c=relu4, b=-2.0e10')
     self.assertDictEqual({
         'aaa': 12,
         'b': -2.0e10,
-        'c_c': 'relu4'
+        'c_c': 'relu4',
+        'd': '/a/b=c/d'
     }, hparams.values())
     self.assertEqual(12, hparams.aaa)
     self.assertEqual(-2.0e10, hparams.b)
     self.assertEqual('relu4', hparams.c_c)
+    self.assertEqual('/a/b=c/d', hparams.d)
     hparams.parse('c_c=,b=0,')
-    self.assertDictEqual({'aaa': 12, 'b': 0, 'c_c': ''}, hparams.values())
+    self.assertDictEqual({'aaa': 12, 'b': 0, 'c_c': '', 'd': '/a/b=c/d'},
+                         hparams.values())
     self.assertEqual(12, hparams.aaa)
     self.assertEqual(0.0, hparams.b)
     self.assertEqual('', hparams.c_c)
+    self.assertEqual('/a/b=c/d', hparams.d)
     hparams.parse('c_c=2.3",b=+2,')
     self.assertEqual(2.0, hparams.b)
     self.assertEqual('2.3"', hparams.c_c)
+    hparams.parse('d=/a/b/c/d,aaa=11,')
+    self.assertEqual(11, hparams.aaa)
+    self.assertEqual(2.0, hparams.b)
+    self.assertEqual('2.3"', hparams.c_c)
+    self.assertEqual('/a/b/c/d', hparams.d)
+    hparams.parse('b=1.5,d=/a=b/c/d,aaa=10,')
+    self.assertEqual(10, hparams.aaa)
+    self.assertEqual(1.5, hparams.b)
+    self.assertEqual('2.3"', hparams.c_c)
+    self.assertEqual('/a=b/c/d', hparams.d)
     with self.assertRaisesRegexp(ValueError, 'Unknown hyperparameter'):
       hparams.parse('x=123')
     with self.assertRaisesRegexp(ValueError, 'Could not parse'):
@@ -84,17 +104,19 @@ class HParamsTest(test.TestCase):
       hparams.parse('b=relu')
     with self.assertRaisesRegexp(ValueError, 'Must not pass a list'):
       hparams.parse('aaa=[123]')
-    self.assertEqual(12, hparams.aaa)
-    self.assertEqual(2.0, hparams.b)
+    self.assertEqual(10, hparams.aaa)
+    self.assertEqual(1.5, hparams.b)
     self.assertEqual('2.3"', hparams.c_c)
+    self.assertEqual('/a=b/c/d', hparams.d)
     # Exports to proto.
     hparam_def = hparams.to_proto()
     # Imports from proto.
     hparams2 = hparam.HParams(hparam_def=hparam_def)
     # Verifies that all hparams are restored.
-    self.assertEqual(12, hparams2.aaa)
-    self.assertEqual(2.0, hparams2.b)
+    self.assertEqual(10, hparams2.aaa)
+    self.assertEqual(1.5, hparams2.b)
     self.assertEqual('2.3"', hparams2.c_c)
+    self.assertEqual('/a=b/c/d', hparams2.d)
 
   def testSetFromMap(self):
     hparams = hparam.HParams(a=1, b=2.0, c='tanh')
diff --git a/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py b/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py
index 72231948856b38edd3d022a99a62e6d4c8c5649e..99d486b1833ccaa81a873d457a8edb06f3d9c7a5 100644
--- a/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py
+++ b/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py
@@ -1574,8 +1574,9 @@ def _padding(sequences, num_unroll):
   if not sequences:
     return 0, {}
 
-  sequences_dict = {}
-  for key, value in sequences.items():
+  # Sort 'sequences_dict' so 'length' will have a predictable value below.
+  sequences_dict = collections.OrderedDict()
+  for key, value in sorted(sequences.items()):
     if not (isinstance(value, sparse_tensor.SparseTensor) or
             isinstance(value, sparse_tensor.SparseTensorValue)):
       sequences_dict[key] = ops.convert_to_tensor(value)
diff --git a/tensorflow/contrib/util/BUILD b/tensorflow/contrib/util/BUILD
index 6c766e4f1c04fd9cca0b6e03382737d42b6fda20..d9ccda8e89a4c9a1b3f3d24915b9ad3fb4d9be5f 100644
--- a/tensorflow/contrib/util/BUILD
+++ b/tensorflow/contrib/util/BUILD
@@ -75,15 +75,3 @@ py_library(
         "//tensorflow/python:util",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/verbs/BUILD b/tensorflow/contrib/verbs/BUILD
index 80a5d07ea43531ed2532443b6ff9327b9ece6df7..9720fd6e8657de18cf8d7565f834568ae52fdbda 100644
--- a/tensorflow/contrib/verbs/BUILD
+++ b/tensorflow/contrib/verbs/BUILD
@@ -11,18 +11,6 @@ load("//tensorflow:tensorflow.bzl", "tf_cuda_library")
 
 exports_files(["LICENSE"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 filegroup(
     name = "c_srcs",
     data = glob([
diff --git a/tensorflow/contrib/verbs/rdma.h b/tensorflow/contrib/verbs/rdma.h
index 94203ee2b3654bffe82d203cde8780a64f63ba2a..c9df6beb6b1d67f14d26d7f0420cb53b6347bf99 100644
--- a/tensorflow/contrib/verbs/rdma.h
+++ b/tensorflow/contrib/verbs/rdma.h
@@ -262,7 +262,7 @@ class RdmaTensorRequest {
   // Receive tensor content (RDMA write was completed).
   //
   // Decode proto if required and/or move to GPU if the content was not
-  // written to it directly (GPU direct is not avaliable). Afterwards,
+  // written to it directly (GPU direct is not available). Afterwards,
   // invoke Done().
   void RecvTensorContent();
 
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index c92893d997bfc7709529c5cf3a229f857be6e741..7d5ae1c5b57bef0d1958eaf13c6b2fdb0cceefc9 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -144,11 +144,14 @@ load(
     "tf_cuda_tests_tags",
     "if_static",
 )
+load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
 load(
     "//third_party/mkl:build_defs.bzl",
     "if_mkl",
 )
 
+exports_files(["ops/ops.pbtxt"])
+
 # -----------------------------------------------------------------------------
 # Public targets
 
@@ -313,6 +316,7 @@ cc_library(
         "lib/gtl/optional.h",
         "lib/gtl/priority_queue_util.h",
         "lib/hash/crc32c.h",
+        "lib/hash/hash.h",
         "lib/histogram/histogram.h",
         "lib/io/buffered_inputstream.h",
         "lib/io/compression.h",
@@ -345,6 +349,7 @@ cc_library(
         "platform/env.h",
         "platform/env_time.h",
         "platform/file_system.h",
+        "platform/file_system_helper.h",
         "platform/fingerprint.h",
         "platform/init_main.h",
         "platform/logging.h",
@@ -353,6 +358,7 @@ cc_library(
         "platform/mutex.h",
         "platform/net.h",
         "platform/notification.h",
+        "platform/null_file_system.h",
         "platform/prefetch.h",
         "platform/profile_utils/clock_cycle_profiler.h",
         "platform/profile_utils/cpu_utils.h",
@@ -377,13 +383,13 @@ cc_library(
 )
 
 cc_library(
-    name = "session_message",
-    srcs = ["util/session_message.cc"],
-    hdrs = ["util/session_message.h"],
+    name = "stacktrace",
+    srcs = glob(["platform/*/stacktrace.h"]),
+    hdrs = ["platform/stacktrace.h"],
     deps = [
-        ":framework",
-        ":lib",
-        ":protos_all_cc",
+        ":abi",
+        ":lib_platform",
+        "//tensorflow/core/platform/default/build_config:stacktrace",
     ],
 )
 
@@ -392,8 +398,20 @@ cc_library(
     srcs = ["platform/stacktrace_handler.cc"],
     hdrs = ["platform/stacktrace_handler.h"],
     deps = [
-        ":lib",
+        ":abi",
         ":lib_platform",
+        ":stacktrace",
+    ],
+)
+
+cc_library(
+    name = "session_message",
+    srcs = ["util/session_message.cc"],
+    hdrs = ["util/session_message.h"],
+    deps = [
+        ":framework",
+        ":lib",
+        ":protos_all_cc",
     ],
 )
 
@@ -441,6 +459,7 @@ tf_cuda_library(
         "framework/attr_value_util.h",
         "framework/bfloat16.h",
         "framework/cancellation.h",
+        "framework/collective.h",
         "framework/common_shape_fns.h",
         "framework/control_flow.h",  # TODO(josh11b): Make internal?
         "framework/dataset.h",
@@ -593,6 +612,7 @@ cc_library(
         "platform/prefetch.h",
         "platform/thread_annotations.h",
         "platform/types.h",
+        "platform/cpu_info.h",
     ] + if_windows(["platform/windows/integral_types.h"]),
     visibility = ["//visibility:public"],
     deps =
@@ -610,6 +630,7 @@ tf_gen_op_libs(
     op_lib_names = [
         "batch_ops",
         "bitwise_ops",
+        "boosted_trees_ops",
         "candidate_sampling_ops",
         "checkpoint_ops",
         "control_flow_ops",
@@ -632,6 +653,7 @@ tf_gen_op_libs(
         "random_ops",
         "remote_fused_graph_ops",
         "resource_variable_ops",
+        "scoped_allocator_ops",
         "sdca_ops",
         "set_ops",
         "script_ops",
@@ -685,6 +707,34 @@ cc_library(
     alwayslink = 1,
 )
 
+cc_library(
+    name = "cudnn_rnn_ops",
+    srcs = [
+        "ops/cudnn_rnn_ops.cc",
+    ],
+    linkstatic = 1,
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:stream_executor",
+        "//tensorflow/core/kernels:bounds_check_lib",
+        "//third_party/eigen3",
+        "@farmhash_archive//:farmhash",
+    ],
+    alwayslink = 1,
+)
+
+tf_gen_op_libs(
+    op_lib_names = [
+        "cudnn_rnn_ops",
+    ],
+    deps = [
+        ":lib",
+    ],
+)
+
 cc_library(
     name = "ops",
     visibility = ["//visibility:public"],
@@ -693,10 +743,12 @@ cc_library(
         ":audio_ops_op_lib",
         ":batch_ops_op_lib",
         ":bitwise_ops_op_lib",
+        ":boosted_trees_ops_op_lib",
         ":candidate_sampling_ops_op_lib",
         ":checkpoint_ops_op_lib",
         ":control_flow_ops_op_lib",
         ":ctc_ops_op_lib",
+        ":cudnn_rnn_ops_op_lib",
         ":data_flow_ops_op_lib",
         ":dataset_ops_op_lib",
         ":function_ops_op_lib",
@@ -715,11 +767,13 @@ cc_library(
         ":random_ops_op_lib",
         ":remote_fused_graph_ops_op_lib",
         ":resource_variable_ops_op_lib",
+        ":scoped_allocator_ops_op_lib",
         ":script_ops_op_lib",
         ":sdca_ops_op_lib",
         ":sendrecv_ops_op_lib",
         ":set_ops_op_lib",
         ":sparse_ops_op_lib",
+        ":summary_ops_op_lib",
         ":spectral_ops_op_lib",
         ":state_ops_op_lib",
         ":stateless_random_ops_op_lib",
@@ -831,10 +885,12 @@ cc_library(
         "//tensorflow/core/kernels:audio",
         "//tensorflow/core/kernels:batch_kernels",
         "//tensorflow/core/kernels:bincount_op",
+        "//tensorflow/core/kernels:boosted_trees_ops",
         "//tensorflow/core/kernels:candidate_sampler_ops",
         "//tensorflow/core/kernels:checkpoint_ops",
         "//tensorflow/core/kernels:control_flow_ops",
         "//tensorflow/core/kernels:ctc_ops",
+        "//tensorflow/core/kernels:cudnn_rnn_kernels",
         "//tensorflow/core/kernels:data_flow",
         "//tensorflow/core/kernels:dataset_ops",
         "//tensorflow/core/kernels:fake_quant_ops",
@@ -858,6 +914,7 @@ cc_library(
         "//tensorflow/core/kernels:remote_fused_graph_ops",
         "//tensorflow/core/kernels:required",
         "//tensorflow/core/kernels:resource_variable_ops",
+        "//tensorflow/core/kernels:scoped_allocator_ops",
         "//tensorflow/core/kernels:sdca_ops",
         "//tensorflow/core/kernels:set_kernels",
         "//tensorflow/core/kernels:sparse",
@@ -887,6 +944,9 @@ cc_library(
         "//tensorflow/core/kernels:mkl_softmax_op",
         "//tensorflow/core/kernels:mkl_tfconv_op",
         "//tensorflow/core/kernels:mkl_aggregate_ops",
+    ]) + if_cuda([
+        "//tensorflow/core/grappler/optimizers:gpu_swapping_kernels",
+        "//tensorflow/core/grappler/optimizers:gpu_swapping_ops",
     ]),
 )
 
@@ -1034,6 +1094,7 @@ filegroup(
             "util/tensor_bundle/*.h",
             "util/tensor_bundle/*.cc",
             "common_runtime/gpu/**/*",
+            "common_runtime/eager/*",
             "common_runtime/gpu_device_factory.*",
         ],
     ),
@@ -1059,6 +1120,7 @@ filegroup(
             "**/*testlib*",
             "**/*main.cc",
             "common_runtime/gpu/**/*",
+            "common_runtime/eager/*",
             "common_runtime/gpu_device_factory.*",
             "graph/dot.*",
         ],
@@ -1402,6 +1464,13 @@ tf_pyclif_proto_library(
     visibility = ["//visibility:public"],
 )
 
+tf_pyclif_proto_library(
+    name = "protobuf/device_properties_pyclif",
+    proto_lib = ":protos_all_cc",
+    proto_srcfile = "protobuf/device_properties.proto",
+    visibility = ["//visibility:public"],
+)
+
 tf_pyclif_proto_library(
     name = "protobuf/meta_graph_pyclif",
     proto_lib = ":protos_all_cc",
@@ -1410,9 +1479,9 @@ tf_pyclif_proto_library(
 )
 
 tf_pyclif_proto_library(
-    name = "protobuf/device_properties_pyclif",
+    name = "protobuf/saved_model_pyclif",
     proto_lib = ":protos_all_cc",
-    proto_srcfile = "protobuf/device_properties.proto",
+    proto_srcfile = "protobuf/saved_model.proto",
     visibility = ["//visibility:public"],
 )
 
@@ -1578,6 +1647,7 @@ cc_library(
             "platform/**/env_time.cc",
             "platform/**/cuda_libdevice_path.cc",
             "platform/**/device_tracer.cc",
+            "platform/abi.cc",
             "platform/variant_coding.cc",
             "platform/**/variant_cord_coding.cc",
         ],
@@ -1589,6 +1659,7 @@ cc_library(
             "platform/**/stream_executor.h",
             "platform/**/env_time.cc",
             "platform/**/device_tracer.cc",
+            "platform/abi.cc",
             "platform/variant_coding.cc",
             "platform/**/variant_cord_coding.cc",
         ] +
@@ -1602,6 +1673,7 @@ cc_library(
     deps = tf_additional_lib_deps() + [
         ":lib_hash_crc32c_accelerate_internal",
         ":lib_proto_parsing",
+        ":abi",
         "//third_party/eigen3",
         "//tensorflow/core/platform/default/build_config:platformlib",
         "@snappy",
@@ -1958,7 +2030,6 @@ cc_header_only_library(
     deps = [
         ":framework",
         ":reader_base",
-        "@nsync//:nsync_headers",
     ],
 )
 
@@ -2065,10 +2136,13 @@ tf_cuda_library(
 
 CORE_CPU_BASE_HDRS = GRAPH_HDRS + [
     "common_runtime/device.h",
+    "common_runtime/device_mgr.h",
     "common_runtime/eval_const_tensor.h",
     "common_runtime/graph_runner.h",
     "common_runtime/shape_refiner.h",
     "framework/versions.h",
+    "common_runtime/process_function_library_runtime.h",
+    "common_runtime/function.h",
 ]
 
 tf_cuda_library(
@@ -2109,25 +2183,27 @@ tf_cuda_library(
 CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
     "common_runtime/allocator_retry.h",
     "common_runtime/bfc_allocator.h",
+    "common_runtime/collective_executor_mgr.h",
+    "common_runtime/collective_param_resolver_local.h",
+    "common_runtime/collective_rma_local.h",
+    "common_runtime/device_resolver_local.h",
+    "common_runtime/buf_rendezvous.h",
     "common_runtime/build_graph_options.h",
     "common_runtime/constant_folding.h",
     "common_runtime/copy_tensor.h",
     "common_runtime/costmodel_manager.h",
     "common_runtime/debugger_state_interface.h",
     "common_runtime/device_factory.h",
-    "common_runtime/device_mgr.h",
     "common_runtime/device_set.h",
     "common_runtime/dma_helper.h",
     "common_runtime/eigen_thread_pool.h",
     "common_runtime/executor.h",
-    "common_runtime/function.h",
     "common_runtime/graph_optimizer.h",
     "common_runtime/local_device.h",
     "common_runtime/memory_types.h",
     "common_runtime/mkl_cpu_allocator.h",
     "common_runtime/optimization_registry.h",
     "common_runtime/pending_counts.h",
-    "common_runtime/process_function_library_runtime.h",
     "common_runtime/process_util.h",
     "common_runtime/profile_handler.h",
     "common_runtime/renamed_device.h",
@@ -2150,7 +2226,11 @@ tf_cuda_library(
         "common_runtime/accumulate_n_optimizer.cc",
         "common_runtime/allocator_retry.cc",
         "common_runtime/bfc_allocator.cc",
+        "common_runtime/buf_rendezvous.cc",
         "common_runtime/build_graph_options.cc",
+        "common_runtime/collective_executor_mgr.cc",
+        "common_runtime/collective_param_resolver_local.cc",
+        "common_runtime/collective_rma_local.cc",
         "common_runtime/constant_folding.cc",
         "common_runtime/copy_tensor.cc",
         "common_runtime/costmodel_manager.cc",
@@ -2158,6 +2238,7 @@ tf_cuda_library(
         "common_runtime/device.cc",
         "common_runtime/device_factory.cc",
         "common_runtime/device_mgr.cc",
+        "common_runtime/device_resolver_local.cc",
         "common_runtime/device_set.cc",
         "common_runtime/executor.cc",
         "common_runtime/function.cc",
@@ -2765,6 +2846,11 @@ tf_cc_tests(
     name = "higher_level_tests",
     size = "small",
     srcs = [
+        "common_runtime/buf_rendezvous_test.cc",
+        "common_runtime/collective_executor_mgr_test.cc",
+        "common_runtime/collective_param_resolver_local_test.cc",
+        "common_runtime/collective_rma_local_test.cc",
+        "common_runtime/device_resolver_local_test.cc",
         "common_runtime/device_set_test.cc",
         "common_runtime/optimization_registry_test.cc",
         "common_runtime/pending_counts_test.cc",
@@ -2900,6 +2986,23 @@ tf_cc_tests(
     ],
 )
 
+tf_cc_test(
+    name = "cudnn_rnn_ops_test_cc",
+    size = "small",
+    srcs = [
+        "ops/cudnn_rnn_ops_test.cc",
+    ],
+    deps = [
+        ":cudnn_rnn_ops",
+        "//tensorflow/core",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_cc_test_mkl(
     name = "mkl_runtime_tests",
     size = "small",
@@ -3171,6 +3274,18 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "common_runtime_process_util_test",
+    size = "small",
+    srcs = ["common_runtime/process_util_test.cc"],
+    linkstatic = tf_kernel_tests_linkstatic(),
+    deps = [
+        ":core_cpu_internal",
+        ":test",
+        ":test_main",
+    ],
+)
+
 tf_cc_test(
     name = "common_runtime_rendezvous_util_test",
     size = "small",
@@ -3758,18 +3873,6 @@ cc_library(
 # -----------------------------------------------------------------------------
 # Google-internal targets go here (must be at the end).
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 alias(
     name = "android_srcs_no_runtime",
     actual = ":mobile_srcs_no_runtime",
diff --git a/tensorflow/core/api_def/BUILD b/tensorflow/core/api_def/BUILD
index 58dbac4e8edac7079d315fbfcdafbd136793df0b..19d643880966f7607405539a5ad43d8e03dc13fb 100644
--- a/tensorflow/core/api_def/BUILD
+++ b/tensorflow/core/api_def/BUILD
@@ -17,18 +17,6 @@ load(
     "tf_cc_test",
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 filegroup(
     name = "base_api_def",
     srcs = glob(["base_api/*"]),
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b1921e3507bb1a6e8f175305400e4bfbad068d38
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
@@ -0,0 +1,87 @@
+op {
+  graph_op_name: "BoostedTreesCalculateBestGainsPerFeature"
+  visibility: HIDDEN
+  in_arg {
+    name: "node_id_range"
+    description: <<END
+A Rank 1 tensor (shape=[2]) to specify the range [first, last] of node ids to process within `stats_summary_list`. The nodes are iterated between the two nodes specified by the tensor, as like `for node_id in range(node_id_range[0], node_id_range[1]+1)` (Note that the last index node_id_range[1] is inclusive).
+END
+  }
+  in_arg {
+    name: "stats_summary_list"
+    description: <<END
+A list of Rank 3 tensor (#shape=[max_splits, bucket, 2]) for accumulated stats summary (gradient/hessian) per node per buckets for each feature. The first dimension of the tensor is the maximum number of splits, and thus not all elements of it will be used, but only the indexes specified by node_ids will be used.
+END
+  }
+  out_arg {
+    name: "node_ids_list"
+    description: <<END
+An output list of Rank 1 tensors indicating possible split node ids for each feature. The length of the list is num_features, but each tensor has different size as each feature provides different possible nodes. See above for details like shapes and sizes.
+END
+  }
+  out_arg {
+    name: "gains_list"
+    description: <<END
+An output list of Rank 1 tensors indicating the best gains for each feature to split for certain nodes. See above for details like shapes and sizes.
+END
+  }
+  out_arg {
+    name: "thresholds_list"
+    description: <<END
+An output list of Rank 1 tensors indicating the bucket id to compare with (as a threshold) for split in each node. See above for details like shapes and sizes.
+END
+  }
+  out_arg {
+    name: "left_node_contribs_list"
+    description: <<END
+A list of Rank 2 tensors indicating the contribution of the left nodes when branching from parent nodes (given by the tensor element in the output node_ids_list) to the left direction by the given threshold for each feature. This value will be used to make the left node value by adding to the parent node value. Second dimension size is 1 for 1-dimensional logits, but would be larger for multi-class problems. See above for details like shapes and sizes.
+END
+  }
+  out_arg {
+    name: "right_node_contribs_list"
+    description: <<END
+A list of Rank 2 tensors, with the same shape/conditions as left_node_contribs_list, but just that the value is for the right node.
+END
+  }
+  attr {
+    name: "l1"
+    description: <<END
+l1 regularization factor on leaf weights, per instance based.
+END
+  }
+  attr {
+    name: "l2"
+    description: <<END
+l2 regularization factor on leaf weights, per instance based.
+END
+  }
+  attr {
+    name: "tree_complexity"
+    description: <<END
+adjustment to the gain, per leaf based.
+END
+  }
+  attr {
+    name: "max_splits"
+    description: <<END
+the number of nodes that can be split in the whole tree. Used as a dimension of output tensors.
+END
+  }
+  attr {
+    name: "num_features"
+    description: <<END
+inferred from the size of `stats_summary_list`; the number of total features.
+END
+  }
+  summary: "Calculates gains for each feature and returns the best possible split information for the feature."
+  description: <<END
+The split information is the best threshold (bucket id), gains and left/right node contributions per node for each feature.
+
+It is possible that not all nodes can be split on each feature. Hence, the list of possible nodes can differ between the features. Therefore, we return `node_ids_list` for each feature, containing the list of nodes that this feature can be used to split.
+
+In this manner, the output is the best split per features and per node, so that it needs to be combined later to produce the best split for each node (among all possible features).
+
+The length of output lists are all of the same length, `num_features`.
+The output shapes are compatible in a way that the first dimension of all tensors of all lists are the same and equal to the number of possible split nodes for each feature.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesCreateEnsemble.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCreateEnsemble.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..aee73b910f0ae8b542b3741ceeeadb9624126a27
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCreateEnsemble.pbtxt
@@ -0,0 +1,23 @@
+op {
+  graph_op_name: "BoostedTreesCreateEnsemble"
+  visibility: HIDDEN
+  in_arg {
+    name: "tree_ensemble_handle"
+    description: <<END
+Handle to the tree ensemble resource to be created.
+END
+  }
+  in_arg {
+    name: "stamp_token"
+    description: <<END
+Token to use as the initial value of the resource stamp.
+END
+  }
+  in_arg {
+    name: "tree_ensemble_serialized"
+    description: <<END
+Serialized proto of the tree ensemble.
+END
+  }
+  summary: "Creates a tree ensemble model and returns a handle to it."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesDeserializeEnsemble.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesDeserializeEnsemble.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b1602ba045b95db060295a6b05f17dd8a06924d8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesDeserializeEnsemble.pbtxt
@@ -0,0 +1,26 @@
+op {
+  graph_op_name: "BoostedTreesDeserializeEnsemble"
+  visibility: HIDDEN
+  in_arg {
+    name: "tree_ensemble_handle"
+    description: <<END
+Handle to the tree ensemble.
+END
+  }
+  in_arg {
+    name: "stamp_token"
+    description: <<END
+Token to use as the new value of the resource stamp.
+END
+  }
+  in_arg {
+    name: "tree_ensemble_serialized"
+    description: <<END
+Serialized proto of the ensemble.
+END
+  }
+  summary: "Deserializes a serialized tree ensemble config and replaces current tree"
+  description: <<END
+ensemble.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesEnsembleResourceHandleOp.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesEnsembleResourceHandleOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1bce5639a2049dba897fe45680fe98fa45f76c24
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesEnsembleResourceHandleOp.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "BoostedTreesEnsembleResourceHandleOp"
+  visibility: HIDDEN
+  summary: "Creates a handle to a BoostedTreesEnsembleResource"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesGetEnsembleStates.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesGetEnsembleStates.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ef45a92498dadb9b911fbb99a1365f81a72060f2
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesGetEnsembleStates.pbtxt
@@ -0,0 +1,35 @@
+op {
+  graph_op_name: "BoostedTreesGetEnsembleStates"
+  visibility: HIDDEN
+  in_arg {
+    name: "tree_ensemble_handle"
+    description: <<END
+Handle to the tree ensemble.
+END
+  }
+  out_arg {
+    name: "stamp_token"
+    description: <<END
+Stamp token of the tree ensemble resource.
+END
+  }
+  out_arg {
+    name: "num_trees"
+    description: <<END
+The number of trees in the tree ensemble resource.
+END
+  }
+  out_arg {
+    name: "num_finalized_trees"
+    description: <<END
+The number of trees that were finished successfully.
+END
+  }
+  out_arg {
+    name: "num_attempted_layers"
+    description: <<END
+The number of layers we attempted to build (but not necessarily succeeded).
+END
+  }
+  summary: "Retrieves the tree ensemble resource stamp token."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesMakeStatsSummary.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesMakeStatsSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dc0856c900d1b1238d0641fef3f2f57b95a209fe
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesMakeStatsSummary.pbtxt
@@ -0,0 +1,56 @@
+op {
+  graph_op_name: "BoostedTreesMakeStatsSummary"
+  visibility: HIDDEN
+  in_arg {
+    name: "node_ids"
+    description: <<END
+int32 Rank 1 Tensor containing node ids, which each example falls into for the requested layer.
+END
+  }
+  in_arg {
+    name: "gradients"
+    description: <<END
+float32; Rank 2 Tensor (shape=[#examples, 1]) for gradients.
+END
+  }
+  in_arg {
+    name: "hessians"
+    description: <<END
+float32; Rank 2 Tensor (shape=[#examples, 1]) for hessians.
+END
+  }
+  in_arg {
+    name: "bucketized_features_list"
+    description: <<END
+int32 list of Rank 1 Tensors, each containing the bucketized feature (for each feature column).
+END
+  }
+  out_arg {
+    name: "stats_summary"
+    description: <<END
+output Rank 4 Tensor (shape=[#features, #splits, #buckets, 2]) containing accumulated stats put into the corresponding node and bucket. The first index of 4th dimension refers to gradients, and the second to hessians.
+END
+  }
+  attr {
+    name: "max_splits"
+    description: <<END
+int; the maximum number of splits possible in the whole tree.
+END
+  }
+  attr {
+    name: "num_buckets"
+    description: <<END
+int; equals to the maximum possible value of bucketized feature.
+END
+  }
+  attr {
+    name: "num_features"
+    description: <<END
+int; inferred from the size of bucketized_features_list; the number of features.
+END
+  }
+  summary: "Makes the summary of accumulated stats for the batch."
+  description: <<END
+The summary stats contains gradients and hessians accumulated into the corresponding node and bucket for each example.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesPredict.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesPredict.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b23e77a1fa874e8b7569805527912a84c86111e5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesPredict.pbtxt
@@ -0,0 +1,41 @@
+op {
+  graph_op_name: "BoostedTreesPredict"
+  visibility: HIDDEN
+  in_arg {
+    name: "bucketized_features"
+    description: <<END
+A list of rank 1 Tensors containing bucket id for each
+feature.
+END
+  }
+  out_arg {
+    name: "logits"
+    description: <<END
+Output rank 2 Tensor containing logits for each example.
+END
+  }
+  attr {
+    name: "num_bucketized_features"
+    description: <<END
+Inferred.
+END
+  }
+  attr {
+    name: "logits_dimension"
+    description: <<END
+scalar, dimension of the logits, to be used for partial logits
+shape.
+END
+  }
+  attr {
+    name: "max_depth"
+    description: <<END
+scalar, max depth of trees. To be used for parallelization costs.
+END
+  }
+  summary: "Runs multiple additive regression ensemble predictors on input instances and"
+  description: <<END
+computes the logits. It is designed to be used during prediction.
+It traverses all the trees and calculates the final score for each instance.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesSerializeEnsemble.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesSerializeEnsemble.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c0b3688d8a3f23d24178343d5b5e6b3ab5741b9f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesSerializeEnsemble.pbtxt
@@ -0,0 +1,23 @@
+op {
+  graph_op_name: "BoostedTreesSerializeEnsemble"
+  visibility: HIDDEN
+  in_arg {
+    name: "tree_ensemble_handle"
+    description: <<END
+Handle to the tree ensemble.
+END
+  }
+  out_arg {
+    name: "stamp_token"
+    description: <<END
+Stamp token of the tree ensemble resource.
+END
+  }
+  out_arg {
+    name: "tree_ensemble_serialized"
+    description: <<END
+Serialized proto of the ensemble.
+END
+  }
+  summary: "Serializes the tree ensemble to a proto."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesTrainingPredict.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesTrainingPredict.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7203d3cb5891ff78c441badba690a4a7d018cb83
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesTrainingPredict.pbtxt
@@ -0,0 +1,69 @@
+op {
+  graph_op_name: "BoostedTreesTrainingPredict"
+  visibility: HIDDEN
+  in_arg {
+    name: "cached_tree_ids"
+    description: <<END
+Rank 1 Tensor containing cached tree ids which is the starting
+tree of prediction.
+END
+  }
+  in_arg {
+    name: "cached_node_ids"
+    description: <<END
+Rank 1 Tensor containing cached node id which is the starting
+node of prediction.
+END
+  }
+  in_arg {
+    name: "bucketized_features"
+    description: <<END
+A list of rank 1 Tensors containing bucket id for each
+feature.
+END
+  }
+  out_arg {
+    name: "partial_logits"
+    description: <<END
+Rank 2 Tensor containing logits update (with respect to cached
+values stored) for each example.
+END
+  }
+  out_arg {
+    name: "tree_ids"
+    description: <<END
+Rank 1 Tensor containing new tree ids for each example.
+END
+  }
+  out_arg {
+    name: "node_ids"
+    description: <<END
+Rank 1 Tensor containing new node ids in the new tree_ids.
+END
+  }
+  attr {
+    name: "num_bucketized_features"
+    description: <<END
+Inferred.
+END
+  }
+  attr {
+    name: "logits_dimension"
+    description: <<END
+scalar, dimension of the logits, to be used for partial logits
+shape.
+END
+  }
+  attr {
+    name: "max_depth"
+    description: <<END
+scalar, max depth of trees. To be used for parallelization costs.
+END
+  }
+  summary: "Runs multiple additive regression ensemble predictors on input instances and"
+  description: <<END
+computes the update to cached logits. It is designed to be used during training.
+It traverses the trees starting from cached tree id and cached node id and
+calculates the updates to be pushed to the cache.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesUpdateEnsemble.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesUpdateEnsemble.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..00f8953875783e87f40ce8c04187919424bae6da
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesUpdateEnsemble.pbtxt
@@ -0,0 +1,82 @@
+op {
+  graph_op_name: "BoostedTreesUpdateEnsemble"
+  visibility: HIDDEN
+  in_arg {
+    name: "tree_ensemble_handle"
+    description: <<END
+Handle to the ensemble variable.
+END
+  }
+  in_arg {
+    name: "feature_ids"
+    description: <<END
+Rank 1 tensor with ids for each feature. This is the real id of
+the feature that will be used in the split.
+END
+  }
+  in_arg {
+    name: "node_ids"
+    description: <<END
+List of rank 1 tensors representing the nodes for which this feature
+has a split.
+END
+  }
+  in_arg {
+    name: "gains"
+    description: <<END
+List of rank 1 tensors representing the gains for each of the feature's
+split.
+END
+  }
+  in_arg {
+    name: "thresholds"
+    description: <<END
+List of rank 1 tensors representing the thesholds for each of the
+feature's split.
+END
+  }
+  in_arg {
+    name: "left_node_contribs"
+    description: <<END
+List of rank 2 tensors with left leaf contribs for each of
+the feature's splits. Will be added to the previous node values to constitute
+the values of the left nodes.
+END
+  }
+  in_arg {
+    name: "right_node_contribs"
+    description: <<END
+List of rank 2 tensors with right leaf contribs for each
+of the feature's splits. Will be added to the previous node values to constitute
+the values of the right nodes.
+END
+  }
+  attr {
+    name: "max_depth"
+    description: <<END
+Max depth of the tree to build.
+END
+  }
+  attr {
+    name: "learning_rate"
+    description: <<END
+shrinkage const for each new tree.
+END
+  }
+  attr {
+    name: "pruning_mode"
+    description: <<END
+0-No pruning, 1-Pre-pruning, 2-Post-pruning.
+END
+  }
+  attr {
+    name: "num_features"
+    description: <<END
+Number of features that have best splits returned. INFERRED.
+END
+  }
+  summary: "Updates the tree ensemble by either adding a layer to the last tree being grown"
+  description: <<END
+or by starting a new tree.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CloseSummaryWriter.pbtxt b/tensorflow/core/api_def/base_api/api_def_CloseSummaryWriter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f6fd7d93169306fdf5ca62d27635e1f86f37bc4d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CloseSummaryWriter.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "CloseSummaryWriter"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CreateSummaryDbWriter.pbtxt b/tensorflow/core/api_def/base_api/api_def_CreateSummaryDbWriter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..28da46a0f8e452f65d06a13c4b0d0b03b2a75757
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CreateSummaryDbWriter.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "CreateSummaryDbWriter"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CreateSummaryFileWriter.pbtxt b/tensorflow/core/api_def/base_api/api_def_CreateSummaryFileWriter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2ce2c4d37e5001681ffa733bf4726c6bea652029
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CreateSummaryFileWriter.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "CreateSummaryFileWriter"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CudnnRNN.pbtxt b/tensorflow/core/api_def/base_api/api_def_CudnnRNN.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..daeb5fe9a223d7d1254725325921a28a7d165902
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CudnnRNN.pbtxt
@@ -0,0 +1,36 @@
+op {
+  graph_op_name: "CudnnRNN"
+  summary: "A RNN backed by cuDNN."
+  description: <<END
+Computes the RNN from the input and initial states, with respect to the params
+buffer.
+
+rnn_mode: Indicates the type of the RNN model.
+input_mode: Indicate whether there is a linear projection between the input and
+  The actual computation before the first layer. 'skip_input' is only allowed
+  when input_size == num_units; 'auto_select' implies 'skip_input' when
+  input_size == num_units; otherwise, it implies 'linear_input'.
+direction: Indicates whether a bidirectional model will be used.
+  dir = (direction == bidirectional) ? 2 : 1
+dropout: dropout probability. When set to 0., dropout is disabled.
+seed: the 1st part of a seed to initialize dropout.
+seed2: the 2nd part of a seed to initialize dropout.
+input: a 3-D tensor with the shape of [seq_length, batch_size, input_size].
+input_h: a 3-D tensor with the shape of [num_layer * dir, batch_size,
+    num_units].
+input_c: For LSTM, a 3-D tensor with the shape of
+    [num_layer * dir, batch, num_units]. For other models, it is ignored.
+params: a 1-D tensor that contains the weights and biases in an opaque layout.
+    The size must be created through CudnnRNNParamsSize, and initialized
+    separately. Note that they might not be compatible across different
+    generations. So it is a good idea to save and restore
+output: a 3-D tensor with the shape of [seq_length, batch_size,
+    dir * num_units].
+output_h: the same shape has input_h.
+output_c: the same shape as input_c for LSTM. An empty tensor for other models.
+is_training: Indicates whether this operation is used for inferenece or
+  training.
+reserve_space: an opaque tensor that can be used in backprop calculation. It
+  is only produced if is_training is false.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CudnnRNNBackprop.pbtxt b/tensorflow/core/api_def/base_api/api_def_CudnnRNNBackprop.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..075ec52648e37397c95cb5ad302dcc9d951caada
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CudnnRNNBackprop.pbtxt
@@ -0,0 +1,45 @@
+op {
+  graph_op_name: "CudnnRNNBackprop"
+  summary: "Backprop step of CudnnRNN."
+  description: <<END
+Compute the backprop of both data and weights in a RNN.
+
+rnn_mode: Indicates the type of the RNN model.
+input_mode: Indicate whether there is a linear projection between the input and
+    The actual computation before the first layer. 'skip_input' is only allowed
+    when input_size == num_units; 'auto_select' implies 'skip_input' when
+    input_size == num_units; otherwise, it implies 'linear_input'.
+direction: Indicates whether a bidirectional model will be used.
+    dir = (direction == bidirectional) ? 2 : 1
+dropout: dropout probability. When set to 0., dropout is disabled.
+seed: the 1st part of a seed to initialize dropout.
+seed2: the 2nd part of a seed to initialize dropout.
+input: a 3-D tensor with the shape of [seq_length, batch_size, input_size].
+input_h: a 3-D tensor with the shape of [num_layer * dir, batch_size,
+    num_units].
+input_c: For LSTM, a 3-D tensor with the shape of
+    [num_layer * dir, batch, num_units]. For other models, it is ignored.
+params: a 1-D tensor that contains the weights and biases in an opaque layout.
+    The size must be created through CudnnRNNParamsSize, and initialized
+    separately. Note that they might not be compatible across different
+    generations. So it is a good idea to save and restore
+output: a 3-D tensor with the shape of [seq_length, batch_size,
+    dir * num_units].
+output_h: the same shape has input_h.
+output_c: the same shape as input_c for LSTM. An empty tensor for other models.
+output_backprop: A 3-D tensor with the same shape as output in the forward pass.
+output_h_backprop: A 3-D tensor with the same shape as output_h in the forward
+    pass.
+output_c_backprop: A 3-D tensor with the same shape as output_c in the forward
+    pass.
+reserve_space: The same reserve_space produced in for forward operation.
+input_backprop: The backprop to input in the forward pass. Has the same shape
+    as input.
+input_h_backprop: The backprop to input_h in the forward pass. Has the same
+    shape as input_h.
+input_c_backprop: The backprop to input_c in the forward pass. Has the same
+    shape as input_c.
+params_backprop: The backprop to the params buffer in the forward pass. Has the
+    same shape as params.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CudnnRNNCanonicalToParams.pbtxt b/tensorflow/core/api_def/base_api/api_def_CudnnRNNCanonicalToParams.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..abf81a2071172c5b00fec662e1401a46fc49c450
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CudnnRNNCanonicalToParams.pbtxt
@@ -0,0 +1,35 @@
+op {
+  graph_op_name: "CudnnRNNCanonicalToParams"
+  summary: "Converts CudnnRNN params from canonical form to usable form."
+  description: <<END
+Writes a set of weights into the opaque params buffer so they can be used in
+upcoming training or inferences.
+
+Note that the params buffer may not be compatible across different GPUs. So any
+save and restoration should be converted to and from the canonical weights and
+biases.
+
+num_layers: Specifies the number of layers in the RNN model.
+num_units: Specifies the size of the hidden state.
+input_size: Specifies the size of the input state.
+weights: the canonical form of weights that can be used for saving
+    and restoration. They are more likely to be compatible across different
+    generations.
+biases: the canonical form of biases that can be used for saving
+    and restoration. They are more likely to be compatible across different
+    generations.
+num_params: number of parameter sets for all layers.
+    Each layer may contain multiple parameter sets, with each set consisting of
+    a weight matrix and a bias vector.
+rnn_mode: Indicates the type of the RNN model.
+input_mode: Indicate whether there is a linear projection between the input and
+    The actual computation before the first layer. 'skip_input' is only allowed
+    when input_size == num_units; 'auto_select' implies 'skip_input' when
+    input_size == num_units; otherwise, it implies 'linear_input'.
+direction: Indicates whether a bidirectional model will be used.
+    dir = (direction == bidirectional) ? 2 : 1
+dropout: dropout probability. When set to 0., dropout is disabled.
+seed: the 1st part of a seed to initialize dropout.
+seed2: the 2nd part of a seed to initialize dropout.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CudnnRNNParamsSize.pbtxt b/tensorflow/core/api_def/base_api/api_def_CudnnRNNParamsSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..31fb85d4fb3f59ae82737128cc88d2cbdbc996ea
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CudnnRNNParamsSize.pbtxt
@@ -0,0 +1,27 @@
+op {
+  graph_op_name: "CudnnRNNParamsSize"
+  summary: "Computes size of weights that can be used by a Cudnn RNN model."
+  description: <<END
+Return the params size that can be used by the Cudnn RNN model. Subsequent
+weight allocation and initialization should use this size.
+
+num_layers: Specifies the number of layers in the RNN model.
+num_units: Specifies the size of the hidden state.
+input_size: Specifies the size of the input state.
+rnn_mode: Indicates the type of the RNN model.
+input_mode: Indicate whether there is a linear projection between the input and
+  The actual computation before the first layer. 'skip_input' is only allowed
+  when input_size == num_units; 'auto_select' implies 'skip_input' when
+  input_size == num_units; otherwise, it implies 'linear_input'.
+direction: Indicates whether a bidirectional model will be used.
+  dir = (direction == bidirectional) ? 2 : 1
+dropout: dropout probability. When set to 0., dropout is disabled.
+seed: the 1st part of a seed to initialize dropout.
+seed2: the 2nd part of a seed to initialize dropout.
+params_size: The size of the params buffer that should be allocated and
+  initialized for this RNN model. Note that this params buffer may not be
+  compatible across GPUs. Please use CudnnRNNParamsWeights and
+  CudnnRNNParamsBiases to save and restore them in a way that is compatible
+  across different runs.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CudnnRNNParamsToCanonical.pbtxt b/tensorflow/core/api_def/base_api/api_def_CudnnRNNParamsToCanonical.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..47753bf8fcf9aa3b1d0938b974aa788f1e2c5df1
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CudnnRNNParamsToCanonical.pbtxt
@@ -0,0 +1,35 @@
+op {
+  graph_op_name: "CudnnRNNParamsToCanonical"
+  summary: "Retrieves CudnnRNN params in canonical form."
+  description: <<END
+Retrieves a set of weights from the opaque params buffer that can be saved and
+restored in a way compatible with future runs.
+
+Note that the params buffer may not be compatible across different GPUs. So any
+save and restoration should be converted to and from the canonical weights and
+biases.
+
+num_layers: Specifies the number of layers in the RNN model.
+num_units: Specifies the size of the hidden state.
+input_size: Specifies the size of the input state.
+num_params: number of parameter sets for all layers.
+    Each layer may contain multiple parameter sets, with each set consisting of
+    a weight matrix and a bias vector.
+weights: the canonical form of weights that can be used for saving
+    and restoration. They are more likely to be compatible across different
+    generations.
+biases: the canonical form of biases that can be used for saving
+    and restoration. They are more likely to be compatible across different
+    generations.
+rnn_mode: Indicates the type of the RNN model.
+input_mode: Indicate whether there is a linear projection between the input and
+    The actual computation before the first layer. 'skip_input' is only allowed
+    when input_size == num_units; 'auto_select' implies 'skip_input' when
+    input_size == num_units; otherwise, it implies 'linear_input'.
+direction: Indicates whether a bidirectional model will be used.
+    dir = (direction == bidirectional) ? 2 : 1
+dropout: dropout probability. When set to 0., dropout is disabled.
+seed: the 1st part of a seed to initialize dropout.
+seed2: the 2nd part of a seed to initialize dropout.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxArgs.pbtxt b/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxArgs.pbtxt
index 561c86ddf68a4fb093d263d076fb6ccc8d408733..599bbce65f44a3c6798be2d5cee3b8f6f2e2635a 100644
--- a/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxArgs.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxArgs.pbtxt
@@ -6,7 +6,7 @@ Attributes `[min; max]` define the clamping range for the `inputs` data.
 `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
 when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
 then de-quantized and output as floats in `[min; max]` interval.
-`num_bits` is the bitwidth of the quantization; between 2 and 8, inclusive.
+`num_bits` is the bitwidth of the quantization; between 2 and 16, inclusive.
 
 Quantization is called fake since the output is still in floating point.
 END
diff --git a/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxVars.pbtxt b/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxVars.pbtxt
index 2713c01b27f6bc45eb6117047243f06873d4dd87..1976ffb8aac29f6aaac307c6f126b78f17dcadb8 100644
--- a/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxVars.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxVars.pbtxt
@@ -8,7 +8,7 @@ and `max` to 'outputs' tensor of same shape as `inputs`.
 `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
 when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
 then de-quantized and output as floats in `[min; max]` interval.
-`num_bits` is the bitwidth of the quantization; between 2 and 8, inclusive.
+`num_bits` is the bitwidth of the quantization; between 2 and 16, inclusive.
 
 This operation has a gradient and thus allows for training `min` and `max`
 values.
diff --git a/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxVarsPerChannel.pbtxt b/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxVarsPerChannel.pbtxt
index e293d4d084bc90f24ee0cc1111f750ddfa46465b..c0fac6a445895eb1ebbfd621d9afd49d99ed80f4 100644
--- a/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxVarsPerChannel.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxVarsPerChannel.pbtxt
@@ -9,7 +9,7 @@ to 'outputs' tensor of same shape as `inputs`.
 `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
 when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
 then de-quantized and output as floats in `[min; max]` interval.
-`num_bits` is the bitwidth of the quantization; between 2 and 8, inclusive.
+`num_bits` is the bitwidth of the quantization; between 2 and 16, inclusive.
 
 This operation has a gradient and thus allows for training `min` and `max`
 values.
diff --git a/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxVarsPerChannelGradient.pbtxt b/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxVarsPerChannelGradient.pbtxt
index 8a4ab368b5a8c4d8ac756513da14796ff3a41551..2051903f6dae6bb51490dbec137e7cd7592fb6c6 100644
--- a/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxVarsPerChannelGradient.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxVarsPerChannelGradient.pbtxt
@@ -40,7 +40,7 @@ END
   attr {
     name: "num_bits"
     description: <<END
-The bitwidth of the quantization; between 2 and 8, inclusive.
+The bitwidth of the quantization; between 2 and 16, inclusive.
 END
   }
   attr {
diff --git a/tensorflow/core/api_def/base_api/api_def_FlushSummaryWriter.pbtxt b/tensorflow/core/api_def/base_api/api_def_FlushSummaryWriter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3ada43c9b8b5e25b72fa6e6d7b0a313965dd9d5a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_FlushSummaryWriter.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "FlushSummaryWriter"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_For.pbtxt b/tensorflow/core/api_def/base_api/api_def_For.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a7cd8e1a26e2c75c5d6aaea65699c0545c1be445
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_For.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "For"
+  in_arg { name: "start" description: "The lower bound. An int32" }
+  in_arg { name: "limit" description: "The upper bound. An int32" }
+  in_arg { name: "delta" description: "The increment. An int32" }
+  in_arg {
+    name: "input"
+    description: "A list of input tensors whose types are T."
+  }
+  out_arg {
+    name: "output"
+    description: "A list of output tensors whose types are T."
+  }
+  attr { name: "T"  description: "A list of dtypes." }
+  attr {
+    name: "body"
+    description: <<END
+    A function that takes a list of tensors (int32, T) and returns another
+    list of tensors (T).
+END
+  }
+  summary: <<END
+  ```python
+   output = input;
+   for i in range(start, limit, delta)
+     output = body(i, output);
+  ```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_If.pbtxt b/tensorflow/core/api_def/base_api/api_def_If.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7ba5a3f37e1cd55e9f13b3c8f8f11f1f20346de7
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_If.pbtxt
@@ -0,0 +1,40 @@
+op {
+  graph_op_name: "If"
+  in_arg { name: "cond"  description: "The predicate." }
+  in_arg {
+    name: "cond"
+    description: <<END
+      A Tensor. If the tensor is a scalar of non-boolean type, the
+      scalar is converted to a boolean according to the
+      following rule: if the scalar is a numerical value, non-zero means
+      `True` and zero means False; if the scalar is a string, non-empty
+      means `True` and empty means `False`. If the tensor is not a scalar,
+      being empty means False and being non-empty means True.
+END
+  }
+  in_arg {
+    name: "input"
+    description: "A list of input tensors."
+  }
+  out_arg {
+    name: "output"
+    description: "A list of return values."
+  }
+  attr { name: "Tin"  description: "A list of input types." }
+  attr { name: "Tout"  description: "A list of output types." }
+  attr {
+    name: "then_branch"
+    description: <<END
+      A function that takes 'inputs' and returns a list of tensors, whose
+      types are the same as what else_branch returns.
+END
+  }
+  attr {
+    name: "else_branch"
+    description: <<END
+    A function that takes 'inputs' and returns a list of tensors, whose
+    types are the same as what then_branch returns.
+END
+  }
+  summary: "output = cond ? then_branch(input) : else_branch(input)"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ImportEvent.pbtxt b/tensorflow/core/api_def/base_api/api_def_ImportEvent.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d8813b58f3e53e5916edcabafc1fd28388fea8d8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ImportEvent.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ImportEvent"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_IsBoostedTreesEnsembleInitialized.pbtxt b/tensorflow/core/api_def/base_api/api_def_IsBoostedTreesEnsembleInitialized.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d54b7ef32a3237607c6d31934aa43f11859a248b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_IsBoostedTreesEnsembleInitialized.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "IsBoostedTreesEnsembleInitialized"
+  visibility: HIDDEN
+  in_arg {
+    name: "tree_ensemble_handle"
+    description: <<END
+Handle to the tree ensemble resouce.
+END
+  }
+  out_arg {
+    name: "is_initialized"
+    description: <<END
+output boolean on whether it is initialized or not.
+END
+  }
+  summary: "Checks whether a tree ensemble has been initialized."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MatrixSolveLs.pbtxt b/tensorflow/core/api_def/base_api/api_def_MatrixSolveLs.pbtxt
index 51d91399f8a53325b03e67e643c1375c2bd7cf22..e667c328ae58092c7059d1466f90a5c0935f3c89 100644
--- a/tensorflow/core/api_def/base_api/api_def_MatrixSolveLs.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_MatrixSolveLs.pbtxt
@@ -49,14 +49,14 @@ in the batch:
 If `fast` is `True`, then the solution is computed by solving the normal
 equations using Cholesky decomposition. Specifically, if \\(m \ge n\\) then
 \\(X = (A^H A + \lambda I)^{-1} A^H B\\), which solves the least-squares
-problem \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||A Z - B||_F^2 +
-\lambda ||Z||_F^2\\). If \\(m \lt n\\) then `output` is computed as
+problem \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||A Z - B||_F^2 + \lambda ||Z||_F^2\\). 
+If \\(m \lt n\\) then `output` is computed as
 \\(X = A^H (A A^H + \lambda I)^{-1} B\\), which (for \\(\lambda = 0\\)) is the
 minimum-norm solution to the under-determined linear system, i.e.
 \\(X = \mathrm{argmin}_{Z \in \mathbb{C}^{n \times k} } ||Z||_F^2 \\),
 subject to \\(A Z = B\\). Notice that the fast path is only numerically stable
 when \\(A\\) is numerically full rank and has a condition number
-\\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or\\(\lambda\\) is
+\\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or \\(\lambda\\) is
 sufficiently large.
 
 If `fast` is `False` an algorithm based on the numerically robust complete
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceScatterAdd.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceScatterAdd.pbtxt
index 9e0de08267288e32e34cef323761cf4566fce128..4eb6eb4e4da44115bc1889241881b39e8e066fde 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResourceScatterAdd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceScatterAdd.pbtxt
@@ -34,7 +34,7 @@ This operation computes
 Duplicate entries are handled correctly: if multiple `indices` reference
 the same location, their contributions add.
 
-Requires `updates.shape = indices.shape + ref.shape[1:]`.
+Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
 
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
 <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceScatterDiv.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceScatterDiv.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..47148f7b03e1e2029e3151133165c694ce8e7110
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceScatterDiv.pbtxt
@@ -0,0 +1,43 @@
+op {
+  graph_op_name: "ResourceScatterDiv"
+  in_arg {
+    name: "resource"
+    description: <<END
+Should be from a `Variable` node.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A tensor of indices into the first dimension of `ref`.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+A tensor of updated values to add to `ref`.
+END
+  }
+  summary: "Divides sparse updates into the variable referenced by `resource`."
+  description: <<END
+This operation computes
+
+    # Scalar indices
+    ref[indices, ...] /= updates[...]
+
+    # Vector indices (for each i)
+    ref[indices[i], ...] /= updates[i, ...]
+
+    # High rank indices (for each i, ..., j)
+    ref[indices[i, ..., j], ...] /= updates[i, ..., j, ...]
+
+Duplicate entries are handled correctly: if multiple `indices` reference
+the same location, their contributions multiply.
+
+Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+
+<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+</div>
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceScatterMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceScatterMax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..71f06d9a4349ecc36f1d4d276caee5f167d8c999
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceScatterMax.pbtxt
@@ -0,0 +1,43 @@
+op {
+  graph_op_name: "ResourceScatterMax"
+  in_arg {
+    name: "resource"
+    description: <<END
+Should be from a `Variable` node.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A tensor of indices into the first dimension of `ref`.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+A tensor of updated values to add to `ref`.
+END
+  }
+  summary: "Reduces sparse updates into the variable referenced by `resource` using the `max` operation."
+  description: <<END
+This operation computes
+
+    # Scalar indices
+    ref[indices, ...] = max(ref[indices, ...], updates[...])
+
+    # Vector indices (for each i)
+    ref[indices[i], ...] = max(ref[indices[i], ...], updates[i, ...])
+
+    # High rank indices (for each i, ..., j)
+    ref[indices[i, ..., j], ...] = max(ref[indices[i, ..., j], ...], updates[i, ..., j, ...])
+
+Duplicate entries are handled correctly: if multiple `indices` reference
+the same location, their contributions are combined.
+
+Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+
+<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+</div>
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceScatterMin.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceScatterMin.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..08e40ee2a8039c3afbf98b534c7fd6f10faabeff
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceScatterMin.pbtxt
@@ -0,0 +1,43 @@
+op {
+  graph_op_name: "ResourceScatterMin"
+  in_arg {
+    name: "resource"
+    description: <<END
+Should be from a `Variable` node.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A tensor of indices into the first dimension of `ref`.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+A tensor of updated values to add to `ref`.
+END
+  }
+  summary: "Reduces sparse updates into the variable referenced by `resource` using the `min` operation."
+  description: <<END
+This operation computes
+
+    # Scalar indices
+    ref[indices, ...] = min(ref[indices, ...], updates[...])
+
+    # Vector indices (for each i)
+    ref[indices[i], ...] = min(ref[indices[i], ...], updates[i, ...])
+
+    # High rank indices (for each i, ..., j)
+    ref[indices[i, ..., j], ...] = min(ref[indices[i, ..., j], ...], updates[i, ..., j, ...])
+
+Duplicate entries are handled correctly: if multiple `indices` reference
+the same location, their contributions are combined.
+
+Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+
+<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+</div>
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceScatterMul.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceScatterMul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5c63549d81009f3b6d54795325196ce87c396cf4
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceScatterMul.pbtxt
@@ -0,0 +1,43 @@
+op {
+  graph_op_name: "ResourceScatterMul"
+  in_arg {
+    name: "resource"
+    description: <<END
+Should be from a `Variable` node.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A tensor of indices into the first dimension of `ref`.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+A tensor of updated values to add to `ref`.
+END
+  }
+  summary: "Multiplies sparse updates into the variable referenced by `resource`."
+  description: <<END
+This operation computes
+
+    # Scalar indices
+    ref[indices, ...] *= updates[...]
+
+    # Vector indices (for each i)
+    ref[indices[i], ...] *= updates[i, ...]
+
+    # High rank indices (for each i, ..., j)
+    ref[indices[i, ..., j], ...] *= updates[i, ..., j, ...]
+
+Duplicate entries are handled correctly: if multiple `indices` reference
+the same location, their contributions multiply.
+
+Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+
+<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+</div>
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceScatterSub.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceScatterSub.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e71e60cbee5c69a44852ddbf835072fbdbd623eb
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceScatterSub.pbtxt
@@ -0,0 +1,43 @@
+op {
+  graph_op_name: "ResourceScatterSub"
+  in_arg {
+    name: "resource"
+    description: <<END
+Should be from a `Variable` node.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A tensor of indices into the first dimension of `ref`.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+A tensor of updated values to add to `ref`.
+END
+  }
+  summary: "Subtracts sparse updates from the variable referenced by `resource`."
+  description: <<END
+This operation computes
+
+    # Scalar indices
+    ref[indices, ...] -= updates[...]
+
+    # Vector indices (for each i)
+    ref[indices[i], ...] -= updates[i, ...]
+
+    # High rank indices (for each i, ..., j)
+    ref[indices[i, ..., j], ...] -= updates[i, ..., j, ...]
+
+Duplicate entries are handled correctly: if multiple `indices` reference
+the same location, their contributions add.
+
+Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+
+<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+</div>
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterAdd.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterAdd.pbtxt
index 4b5201f025b438a1e6bba41035004b82ab876de7..9da9d09ea693036ea21b5e89b0a9a4d59f67b834 100644
--- a/tensorflow/core/api_def/base_api/api_def_ScatterAdd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterAdd.pbtxt
@@ -51,7 +51,7 @@ This makes it easier to chain operations that need to use the reset value.
 Duplicate entries are handled correctly: if multiple `indices` reference
 the same location, their contributions add.
 
-Requires `updates.shape = indices.shape + ref.shape[1:]`.
+Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
 
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
 <img style="width:100%" src="https://www.tensorflow.org/images/ScatterAdd.png" alt>
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterDiv.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterDiv.pbtxt
index 771cf0b591367e18f007e91bf66bc1cfd02ab459..8e99718c7e3751c1bf4ef4d03e558be3c0ada51e 100644
--- a/tensorflow/core/api_def/base_api/api_def_ScatterDiv.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterDiv.pbtxt
@@ -53,6 +53,6 @@ This makes it easier to chain operations that need to use the reset value.
 Duplicate entries are handled correctly: if multiple `indices` reference
 the same location, their contributions divide.
 
-Requires `updates.shape = indices.shape + ref.shape[1:]`.
+Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterMax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7b52dad4a163643af659320f324ce6558fcffcd8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterMax.pbtxt
@@ -0,0 +1,60 @@
+op {
+  graph_op_name: "ScatterMax"
+  in_arg {
+    name: "ref"
+    description: <<END
+Should be from a `Variable` node.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A tensor of indices into the first dimension of `ref`.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+A tensor of updated values to reduce into `ref`.
+END
+  }
+  out_arg {
+    name: "output_ref"
+    description: <<END
+= Same as `ref`.  Returned as a convenience for operations that want
+to use the updated values after the update is done.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If True, the update will be protected by a lock;
+otherwise the behavior is undefined, but may exhibit less contention.
+END
+  }
+  summary: "Reduces sparse updates into a variable reference using the `max` operation."
+  description: <<END
+This operation computes
+
+    # Scalar indices
+    ref[indices, ...] = max(ref[indices, ...], updates[...])
+
+    # Vector indices (for each i)
+    ref[indices[i], ...] = max(ref[indices[i], ...], updates[i, ...])
+
+    # High rank indices (for each i, ..., j)
+    ref[indices[i, ..., j], ...] = max(ref[indices[i, ..., j], ...], updates[i, ..., j, ...])
+
+This operation outputs `ref` after the update is done.
+This makes it easier to chain operations that need to use the reset value.
+
+Duplicate entries are handled correctly: if multiple `indices` reference
+the same location, their contributions combine.
+
+Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+
+<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="https://www.tensorflow.org/images/ScatterAdd.png" alt>
+</div>
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterMin.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterMin.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..721ac0ff35f934583e227317515b0ba3298de747
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterMin.pbtxt
@@ -0,0 +1,60 @@
+op {
+  graph_op_name: "ScatterMin"
+  in_arg {
+    name: "ref"
+    description: <<END
+Should be from a `Variable` node.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A tensor of indices into the first dimension of `ref`.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+A tensor of updated values to reduce into `ref`.
+END
+  }
+  out_arg {
+    name: "output_ref"
+    description: <<END
+= Same as `ref`.  Returned as a convenience for operations that want
+to use the updated values after the update is done.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If True, the update will be protected by a lock;
+otherwise the behavior is undefined, but may exhibit less contention.
+END
+  }
+  summary: "Reduces sparse updates into a variable reference using the `min` operation."
+  description: <<END
+This operation computes
+
+    # Scalar indices
+    ref[indices, ...] = min(ref[indices, ...], updates[...])
+
+    # Vector indices (for each i)
+    ref[indices[i], ...] = min(ref[indices[i], ...], updates[i, ...])
+
+    # High rank indices (for each i, ..., j)
+    ref[indices[i, ..., j], ...] = min(ref[indices[i, ..., j], ...], updates[i, ..., j, ...])
+
+This operation outputs `ref` after the update is done.
+This makes it easier to chain operations that need to use the reset value.
+
+Duplicate entries are handled correctly: if multiple `indices` reference
+the same location, their contributions combine.
+
+Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+
+<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="https://www.tensorflow.org/images/ScatterAdd.png" alt>
+</div>
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterMul.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterMul.pbtxt
index a51f571b00d7fc68a24dbfc4a0104522f8c0f559..b9e293ba9efba10de9ccd774111899adf4342c90 100644
--- a/tensorflow/core/api_def/base_api/api_def_ScatterMul.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterMul.pbtxt
@@ -53,6 +53,6 @@ This makes it easier to chain operations that need to use the reset value.
 Duplicate entries are handled correctly: if multiple `indices` reference
 the same location, their contributions multiply.
 
-Requires `updates.shape = indices.shape + ref.shape[1:]`.
+Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterSub.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterSub.pbtxt
index c0d3a4a1337ee1e1a32114adc51c930e014bc268..d12b3e68c25c22825349bf7affbb09de8fdf98ac 100644
--- a/tensorflow/core/api_def/base_api/api_def_ScatterSub.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterSub.pbtxt
@@ -51,7 +51,7 @@ This makes it easier to chain operations that need to use the reset value.
 Duplicate entries are handled correctly: if multiple `indices` reference
 the same location, their (negated) contributions add.
 
-Requires `updates.shape = indices.shape + ref.shape[1:]`.
+Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
 
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
 <img style="width:100%" src="https://www.tensorflow.org/images/ScatterSub.png" alt>
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterUpdate.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterUpdate.pbtxt
index c44dbbd2332828242792d9cdd4a218e7457c7d2b..4804908afc61356db76391a4d425b0857c52412d 100644
--- a/tensorflow/core/api_def/base_api/api_def_ScatterUpdate.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterUpdate.pbtxt
@@ -54,7 +54,7 @@ If values in `ref` is to be updated more than once, because there are
 duplicate entries in `indices`, the order at which the updates happen
 for each value is undefined.
 
-Requires `updates.shape = indices.shape + ref.shape[1:]`.
+Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
 
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
 <img style="width:100%" src="https://www.tensorflow.org/images/ScatterUpdate.png" alt>
diff --git a/tensorflow/core/api_def/base_api/api_def_SelfAdjointEig.pbtxt b/tensorflow/core/api_def/base_api/api_def_SelfAdjointEig.pbtxt
index 51d63eeb5695d6a428e990ba43e54102db58b58e..7be9a958ab55d27b4b9fe3dd023e44ae828e042c 100644
--- a/tensorflow/core/api_def/base_api/api_def_SelfAdjointEig.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SelfAdjointEig.pbtxt
@@ -19,6 +19,7 @@ form square matrices, with the same constraints as the single matrix
 SelfAdjointEig.
 
 The result is a [..., M+1, M] matrix with [..., 0,:] containing the
-eigenvalues, and subsequent [...,1:, :] containing the eigenvectors.
+eigenvalues, and subsequent [...,1:, :] containing the eigenvectors. The eigenvalues
+are sorted in non-decreasing order.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_SelfAdjointEigV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_SelfAdjointEigV2.pbtxt
index 4a5e1252586ea8b3e03b2545e0d8646288ddc408..fae9e84fc85be06184b19308d87c90632347e2f6 100644
--- a/tensorflow/core/api_def/base_api/api_def_SelfAdjointEigV2.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SelfAdjointEigV2.pbtxt
@@ -31,7 +31,8 @@ END
   summary: "Computes the eigen decomposition of one or more square self-adjoint matrices."
   description: <<END
 Computes the eigenvalues and (optionally) eigenvectors of each inner matrix in
-`input` such that `input[..., :, :] = v[..., :, :] * diag(e[..., :])`.
+`input` such that `input[..., :, :] = v[..., :, :] * diag(e[..., :])`. The eigenvalues
+are sorted in non-decreasing order.
 
 ```python
 # a is a tensor.
diff --git a/tensorflow/core/api_def/base_api/api_def_SummaryWriter.pbtxt b/tensorflow/core/api_def/base_api/api_def_SummaryWriter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1fe57ecf195c85217bd174dbc503b28f26adade9
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SummaryWriter.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SummaryWriter"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_While.pbtxt b/tensorflow/core/api_def/base_api/api_def_While.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..95a19c6dff99a51aa3228923b6408b3d7a995835
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_While.pbtxt
@@ -0,0 +1,33 @@
+op {
+  graph_op_name: "While"
+  in_arg {
+    name: "input"
+    description: "A list of input tensors whose types are T."
+  }
+  out_arg {
+    name: "output"
+    description: "A list of output tensors whose types are T."
+  }
+  attr { name: "T"  description: "dtype in use." }
+  attr {
+    name: "cond"
+    description: <<END
+      A function takes 'input' and returns a tensor.  If the tensor is
+      a scalar of non-boolean, the scalar is converted to a boolean
+      according to the following rule: if the scalar is a numerical
+      value, non-zero means True and zero means False; if the scalar is
+      a string, non-empty means True and empty means False. If the
+      tensor is not a scalar, non-emptiness means True and False
+      otherwise.
+END
+  }
+  attr {
+    name: "body"
+    description: <<END
+      A function that takes a list of tensors and returns another
+      list of tensors. Both lists have the same types as specified
+      by T.
+END
+  }
+  summary: "output = input; While (Cond(output)) { output = Body(output) }"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_WriteAudioSummary.pbtxt b/tensorflow/core/api_def/base_api/api_def_WriteAudioSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..520952cd4117867f61cd3c536b8a7cc5beeeab62
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_WriteAudioSummary.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "WriteAudioSummary"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_WriteGraphSummary.pbtxt b/tensorflow/core/api_def/base_api/api_def_WriteGraphSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3653477b2067875ee772dedc5015bc550de1ec12
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_WriteGraphSummary.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "WriteGraphSummary"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_WriteHistogramSummary.pbtxt b/tensorflow/core/api_def/base_api/api_def_WriteHistogramSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..26e1482630596d9ecb80917ff91adb2bd1131692
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_WriteHistogramSummary.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "WriteHistogramSummary"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_WriteImageSummary.pbtxt b/tensorflow/core/api_def/base_api/api_def_WriteImageSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..78db8700f0c7231f24fd1db3a0eedbcc4f43deeb
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_WriteImageSummary.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "WriteImageSummary"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_WriteScalarSummary.pbtxt b/tensorflow/core/api_def/base_api/api_def_WriteScalarSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7bae8638d258b6fdf217fbdbd8705369f57e0bb3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_WriteScalarSummary.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "WriteScalarSummary"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_WriteSummary.pbtxt b/tensorflow/core/api_def/base_api/api_def_WriteSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..db86883e21ea47a9e74788f8d76a166a235674da
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_WriteSummary.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "WriteSummary"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ArgMax.pbtxt b/tensorflow/core/api_def/python_api/api_def_ArgMax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4c23a432f2b747a8a406c6152d36fc0ba5f1118f
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ArgMax.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ArgMax"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ArgMin.pbtxt b/tensorflow/core/api_def/python_api/api_def_ArgMin.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..daa14f638663ef42fc50667e1bb1d5236e3d3361
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ArgMin.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ArgMin"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_CloseSummaryWriter.pbtxt b/tensorflow/core/api_def/python_api/api_def_CloseSummaryWriter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f6fd7d93169306fdf5ca62d27635e1f86f37bc4d
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_CloseSummaryWriter.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "CloseSummaryWriter"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_CountUpTo.pbtxt b/tensorflow/core/api_def/python_api/api_def_CountUpTo.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f41be2f540d241776bee3fcb1bba496d4baebeab
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_CountUpTo.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "CountUpTo"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_CreateSummaryDbWriter.pbtxt b/tensorflow/core/api_def/python_api/api_def_CreateSummaryDbWriter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..28da46a0f8e452f65d06a13c4b0d0b03b2a75757
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_CreateSummaryDbWriter.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "CreateSummaryDbWriter"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_CreateSummaryFileWriter.pbtxt b/tensorflow/core/api_def/python_api/api_def_CreateSummaryFileWriter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2ce2c4d37e5001681ffa733bf4726c6bea652029
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_CreateSummaryFileWriter.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "CreateSummaryFileWriter"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_CudnnRNN.pbtxt b/tensorflow/core/api_def/python_api/api_def_CudnnRNN.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b13586b63ba3418c452e44b0f007c42885498f9f
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_CudnnRNN.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "CudnnRNN"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_CudnnRNNBackprop.pbtxt b/tensorflow/core/api_def/python_api/api_def_CudnnRNNBackprop.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..81c4efc60b7f6338a0197e5898c6e7eddd5069bf
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_CudnnRNNBackprop.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "CudnnRNNBackprop"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_CudnnRNNCanonicalToParams.pbtxt b/tensorflow/core/api_def/python_api/api_def_CudnnRNNCanonicalToParams.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..164a306034af2b85fb803b431367e337bc65b34f
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_CudnnRNNCanonicalToParams.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "CudnnRNNCanonicalToParams"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_CudnnRNNParamsSize.pbtxt b/tensorflow/core/api_def/python_api/api_def_CudnnRNNParamsSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..00f97f05b11d3bdb049c55beba2fe9ce18e14ff0
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_CudnnRNNParamsSize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "CudnnRNNParamsSize"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_CudnnRNNParamsToCanonical.pbtxt b/tensorflow/core/api_def/python_api/api_def_CudnnRNNParamsToCanonical.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..841bc0cf55e0800e3350f0eb68d37f42c788d79e
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_CudnnRNNParamsToCanonical.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "CudnnRNNParamsToCanonical"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Div.pbtxt b/tensorflow/core/api_def/python_api/api_def_Div.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8e5537c8bfea638b585c04c514264d930054fde5
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Div.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Div"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Erf.pbtxt b/tensorflow/core/api_def/python_api/api_def_Erf.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..391167254edb69725c778e6319bf8a9f6038589f
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Erf.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Erf"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_FlushSummaryWriter.pbtxt b/tensorflow/core/api_def/python_api/api_def_FlushSummaryWriter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3ada43c9b8b5e25b72fa6e6d7b0a313965dd9d5a
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_FlushSummaryWriter.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "FlushSummaryWriter"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_For.pbtxt b/tensorflow/core/api_def/python_api/api_def_For.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a58ddf56fe1a8516c57ca203f14ea76414ab55f5
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_For.pbtxt
@@ -0,0 +1 @@
+op { graph_op_name: "For" visibility: HIDDEN }
diff --git a/tensorflow/core/api_def/python_api/api_def_Identity.pbtxt b/tensorflow/core/api_def/python_api/api_def_Identity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..00f2afde271ebade0ac7d1ae75dc9dff6f692ab5
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Identity.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Identity"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_If.pbtxt b/tensorflow/core/api_def/python_api/api_def_If.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a44db5da081692ee26a7931850236d31d2231627
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_If.pbtxt
@@ -0,0 +1 @@
+op { graph_op_name: "If" visibility: HIDDEN }
diff --git a/tensorflow/core/api_def/python_api/api_def_ImportEvent.pbtxt b/tensorflow/core/api_def/python_api/api_def_ImportEvent.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d8813b58f3e53e5916edcabafc1fd28388fea8d8
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ImportEvent.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ImportEvent"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Mod.pbtxt b/tensorflow/core/api_def/python_api/api_def_Mod.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..48d828ca72a5abfebf1815980e82e1a3f471c175
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Mod.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Mod"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Rank.pbtxt b/tensorflow/core/api_def/python_api/api_def_Rank.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..05aa12f2fa238f540f653e42576899c3e1b799da
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Rank.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Rank"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceScatterDiv.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceScatterDiv.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..56b5a46d107e25e6ee501c9389dafda20dd34a04
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceScatterDiv.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceScatterDiv"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceScatterMax.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceScatterMax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8119bcc6c6504de57e2c3a53dbb4c7bd03003ad4
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceScatterMax.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceScatterMax"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceScatterMin.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceScatterMin.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d874aef3fe33415e69f483841a6851f3b8c30523
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceScatterMin.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceScatterMin"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceScatterMul.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceScatterMul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..365a37fa0d73406c20e6908b84cc8b7d005f36c1
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceScatterMul.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceScatterMul"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceScatterSub.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceScatterSub.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..72dc5bf8893b249a7dc9b58f53222213d26faca1
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceScatterSub.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceScatterSub"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Round.pbtxt b/tensorflow/core/api_def/python_api/api_def_Round.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..74428e2f58323e47ca672d50f5193bac2977b1f9
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Round.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Round"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ScatterAdd.pbtxt b/tensorflow/core/api_def/python_api/api_def_ScatterAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4f5b6decf6d1cfc6b3fbd8492824ab95958b060b
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ScatterAdd.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ScatterAdd"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ScatterNdUpdate.pbtxt b/tensorflow/core/api_def/python_api/api_def_ScatterNdUpdate.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ccf4a9cce807c4cbc5fe2fdc1e2a7057a0bc5464
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ScatterNdUpdate.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ScatterNdUpdate"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ScatterUpdate.pbtxt b/tensorflow/core/api_def/python_api/api_def_ScatterUpdate.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e4c41c1226674fbb21899ea31be8668b0d8f6ece
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ScatterUpdate.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ScatterUpdate"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ShapeN.pbtxt b/tensorflow/core/api_def/python_api/api_def_ShapeN.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b2dbe74b09689b6c4fb1c54640205c7281d23780
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ShapeN.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ShapeN"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Sign.pbtxt b/tensorflow/core/api_def/python_api/api_def_Sign.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c2ee91dd12ed16ba27a9c4ae45b48194bc5a8b03
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Sign.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Sign"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Sqrt.pbtxt b/tensorflow/core/api_def/python_api/api_def_Sqrt.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..59e2dfe8366813242337c9490d74ca317e525636
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Sqrt.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Sqrt"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Square.pbtxt b/tensorflow/core/api_def/python_api/api_def_Square.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7b39ae25fa062b4271dcc2aee6523847c97b1e4d
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Square.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Square"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SummaryWriter.pbtxt b/tensorflow/core/api_def/python_api/api_def_SummaryWriter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1fe57ecf195c85217bd174dbc503b28f26adade9
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SummaryWriter.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SummaryWriter"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_While.pbtxt b/tensorflow/core/api_def/python_api/api_def_While.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f47a9b0fceb77af3dc9fcad3569f8e18b7f44188
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_While.pbtxt
@@ -0,0 +1 @@
+op { graph_op_name: "While" visibility: HIDDEN }
diff --git a/tensorflow/core/api_def/python_api/api_def_WriteAudioSummary.pbtxt b/tensorflow/core/api_def/python_api/api_def_WriteAudioSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..520952cd4117867f61cd3c536b8a7cc5beeeab62
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_WriteAudioSummary.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "WriteAudioSummary"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_WriteGraphSummary.pbtxt b/tensorflow/core/api_def/python_api/api_def_WriteGraphSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3653477b2067875ee772dedc5015bc550de1ec12
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_WriteGraphSummary.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "WriteGraphSummary"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_WriteHistogramSummary.pbtxt b/tensorflow/core/api_def/python_api/api_def_WriteHistogramSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..26e1482630596d9ecb80917ff91adb2bd1131692
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_WriteHistogramSummary.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "WriteHistogramSummary"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_WriteImageSummary.pbtxt b/tensorflow/core/api_def/python_api/api_def_WriteImageSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..78db8700f0c7231f24fd1db3a0eedbcc4f43deeb
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_WriteImageSummary.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "WriteImageSummary"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_WriteScalarSummary.pbtxt b/tensorflow/core/api_def/python_api/api_def_WriteScalarSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7bae8638d258b6fdf217fbdbd8705369f57e0bb3
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_WriteScalarSummary.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "WriteScalarSummary"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_WriteSummary.pbtxt b/tensorflow/core/api_def/python_api/api_def_WriteSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..db86883e21ea47a9e74788f8d76a166a235674da
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_WriteSummary.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "WriteSummary"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/common_runtime/buf_rendezvous.cc b/tensorflow/core/common_runtime/buf_rendezvous.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b57eb2943a7fac79804587043e1958c279923daf
--- /dev/null
+++ b/tensorflow/core/common_runtime/buf_rendezvous.cc
@@ -0,0 +1,166 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/buf_rendezvous.h"
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/notification.h"
+
+namespace tensorflow {
+
+BufRendezvous::~BufRendezvous() {
+  mutex_lock l(mu_);
+  if (!hook_table_.empty()) {
+    PurgeTable(errors::Internal("Delete called on non-empty BufRendezvous"),
+               &hook_table_);
+  }
+}
+
+void BufRendezvous::StartAbort(const Status& s) {
+  CHECK(!s.ok());
+  HookTable dummy_table;
+  {
+    mutex_lock l(mu_);
+    status_.Update(s);
+    hook_table_.swap(dummy_table);
+  }
+  PurgeTable(s, &dummy_table);
+}
+
+void BufRendezvous::PurgeTable(const Status& s, HookTable* table) {
+  for (auto& it : *table) {
+    Hook* h = it.second;
+    if (h->cons_cb != nullptr) {
+      h->cons_cb(s, nullptr);
+    }
+    if (h->prod_cb != nullptr) {
+      h->prod_cb(s);
+    }
+    delete h;
+  }
+  table->clear();
+}
+
+string BufRendezvous::Hook::DebugString() const {
+  return strings::StrCat("[dev:", (prod_dev ? prod_dev->name() : "none"),
+                         ", ctx:", reinterpret_cast<uint64>(prod_ctx),
+                         ", val:", reinterpret_cast<uint64>(prod_value),
+                         ", pcb:", reinterpret_cast<uint64>(&prod_cb),
+                         ", ccb:", reinterpret_cast<uint64>(&cons_cb), "]");
+}
+
+void BufRendezvous::ProvideBuf(const string& key, Device* dev,
+                               DeviceContext* dev_ctx, const Tensor* v,
+                               const AllocatorAttributes& attr,
+                               const ProducerCallback& done) {
+  Hook* h = nullptr;
+  Status providebuf_status;
+  do {
+    mutex_lock l(mu_);
+    if (!status_.ok()) {
+      providebuf_status = status_;
+      break;
+    } else {
+      auto it = hook_table_.find(key);
+      if (it == hook_table_.end()) {
+        h = new Hook;
+        it = hook_table_.insert(std::make_pair(key, h)).first;
+      } else {
+        if (it->second->prod_cb != nullptr) {
+          providebuf_status = errors::Internal(
+              "BufRendezvous::ProvideBuf already called for key ", key);
+          break;
+        }
+        h = it->second;
+      }
+      // Populate Hook with all of the prod values.
+      h->prod_dev = dev;
+      h->prod_ctx = dev_ctx;
+      h->prod_value = v;
+      h->prod_attr = attr;
+      h->prod_cb = done;
+      // If consumer is waiting, kick off right away, removing Hook from table.
+      if (h->cons_cb != nullptr) {
+        hook_table_.erase(it);
+      } else {
+        h = nullptr;
+      }
+    }
+  } while (false);
+  if (h) {
+    h->cons_cb(Status::OK(), h);
+  }
+  if (!providebuf_status.ok()) {
+    done(providebuf_status);
+  }
+}
+
+void BufRendezvous::ConsumeBuf(const string& key,
+                               const ConsumerCallback& done) {
+  Hook* existing_hook = nullptr;
+  Status consumebuf_status;
+  do {
+    mutex_lock l(mu_);
+    if (!status_.ok()) {
+      consumebuf_status = status_;
+      break;
+    }
+    auto it = hook_table_.find(key);
+    if (it != hook_table_.end()) {
+      // Prepare to consume immediately.
+      if (it->second->cons_cb) {
+        consumebuf_status =
+            errors::Internal("Second consumer arrived for key ", key);
+        break;
+      }
+      existing_hook = it->second;
+      hook_table_.erase(it);
+      existing_hook->cons_cb = done;
+    } else {
+      // Hang consumer callback on the Hook.
+      Hook* h = new Hook;
+      hook_table_[key] = h;
+      h->cons_cb = done;
+      return;
+    }
+  } while (false);
+  if (existing_hook) {
+    existing_hook->cons_cb(Status::OK(), existing_hook);
+    return;
+  }
+  if (!consumebuf_status.ok()) {
+    done(consumebuf_status, nullptr);
+    return;
+  }
+}
+
+/*static*/
+void BufRendezvous::DoneWithHook(Hook* h) {
+  h->prod_cb(Status::OK());
+  delete h;
+}
+
+void BufRendezvous::LogContents() {
+  mutex_lock l(mu_);
+  LOG(INFO) << strings::StrCat("BufRendezvous ",
+                               strings::Hex(reinterpret_cast<uint64>(this)),
+                               " step_id=", step_id_, " current contents:");
+  for (auto it : hook_table_) {
+    LOG(INFO) << it.first << ":" << it.second->DebugString();
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/buf_rendezvous.h b/tensorflow/core/common_runtime/buf_rendezvous.h
new file mode 100644
index 0000000000000000000000000000000000000000..e94e88b323ec74a36948ffff4e5718f211efbbb6
--- /dev/null
+++ b/tensorflow/core/common_runtime/buf_rendezvous.h
@@ -0,0 +1,103 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_COMMON_RUNTIME_BUF_RENDEZVOUS_H_
+#define TENSORFLOW_COMMON_RUNTIME_BUF_RENDEZVOUS_H_
+
+#include <functional>
+#include <string>
+
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+class Device;
+class DeviceContext;
+class Tensor;
+
+// EXPERIMENTAL: RDMA oriented producer/consumer rendezvous on a local
+// Tensor value for which DMAHelper::CanUseDMA() is true, i.e. dense
+// numeric types.  Similar to Rendezvous but never owns a Ref on the
+// tensor, instead it uses an explicit callback to the producer when
+// the consumer side is finished with the value.  This allows the
+// producer to perform in-place updates on the source buffer or to take
+// other actions that depend on knowing the consumer has passed a certain
+// execution point.
+class BufRendezvous {
+ public:
+  explicit BufRendezvous(uint64 step_id) : step_id_(step_id) {}
+
+  ~BufRendezvous();
+
+  // Inform all all waiting parties that this BufRendezvous is defunct
+  // because of an error Status interrupting the Step.
+  void StartAbort(const Status& s);
+
+  struct Hook;
+  // Provided by the consumer to be called when access to the buffer
+  // is available.  If the Status arg is not OK, then hook will not
+  // be populated.  Ownership of Hook passes to consumer with the
+  // callback.
+  typedef std::function<void(const Status&, Hook*)> ConsumerCallback;
+  // Provided by the producer to be called when the consumer has finished
+  // reading the buffer and will no longer access it.
+  typedef std::function<void(const Status&)> ProducerCallback;
+
+  struct Hook {
+    Device* prod_dev;
+    DeviceContext* prod_ctx;
+    const Tensor* prod_value;
+    AllocatorAttributes prod_attr;
+    ProducerCallback prod_cb;
+    ConsumerCallback cons_cb;
+    Hook()
+        : prod_dev(nullptr),
+          prod_ctx(nullptr),
+          prod_value(nullptr),
+          prod_cb(nullptr),
+          cons_cb(nullptr) {}
+    string DebugString() const;
+  };
+
+  // Called to advertise availability of a Tensor value corresponding
+  // to key.  That value must stay valid until done is called.
+  void ProvideBuf(const string& key, Device* dev, DeviceContext* dev_ctx,
+                  const Tensor* v, const AllocatorAttributes& attr,
+                  const ProducerCallback& done);
+
+  // Called to request access to a Tensor value corresponding to key.
+  // Consumer is provide with a Hook as soon as availble.
+  void ConsumeBuf(const string& key, const ConsumerCallback& done);
+
+  // Consumer must call this function when it's done reading the Hook provided
+  // by the ConsumerCallback.  This function will invoke the producer callback
+  // and then delete h.
+  static void DoneWithHook(Hook* h);
+
+  // Write the current contents of the table to the INFO log.
+  void LogContents();
+
+ protected:
+  const uint64 step_id_;
+  mutex mu_;
+  Status status_ GUARDED_BY(mu_);
+  typedef gtl::FlatMap<string, Hook*> HookTable;
+  HookTable hook_table_ GUARDED_BY(mu_);
+
+  void PurgeTable(const Status& s, HookTable* table);
+};
+}  // namespace tensorflow
+#endif  // TENSORFLOW_COMMON_RUNTIME_BUF_RENDEZVOUS_H_
diff --git a/tensorflow/core/common_runtime/buf_rendezvous_test.cc b/tensorflow/core/common_runtime/buf_rendezvous_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0e798235bf0649428409a2fa72ac3067736c347a
--- /dev/null
+++ b/tensorflow/core/common_runtime/buf_rendezvous_test.cc
@@ -0,0 +1,197 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/buf_rendezvous.h"
+
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+#define NUM_DEVS 3
+
+class BufRendezvousTest : public ::testing::Test {
+ protected:
+  BufRendezvousTest() {
+    br_.reset(new BufRendezvous(123));
+    fake_dev_ptr_ = reinterpret_cast<Device*>(512LLU);
+    fake_dev_ctx_ = reinterpret_cast<DeviceContext*>(1024LLU);
+    a_ = Tensor(DT_FLOAT, TensorShape({24}));
+    b_ = Tensor(DT_FLOAT, TensorShape({24}));
+  }
+
+  Device* fake_dev_ptr_ = nullptr;
+  DeviceContext* fake_dev_ctx_ = nullptr;
+  Tensor a_;
+  Tensor b_;
+  AllocatorAttributes aa_;
+  std::unique_ptr<BufRendezvous> br_;
+};
+
+TEST_F(BufRendezvousTest, CorrectUseProducerFirst) {
+  Status prod_status;
+  Status cons_status;
+  bool prod_callback_called = false;
+  bool cons_callback_called = false;
+  Notification note;
+  br_->ProvideBuf(
+      "key0", fake_dev_ptr_, fake_dev_ctx_, &a_, aa_,
+      [&note, &prod_status, &prod_callback_called](const Status& s) {
+        prod_status = s;
+        prod_callback_called = true;
+        note.Notify();
+      });
+  EXPECT_FALSE(prod_callback_called);
+  br_->ConsumeBuf("key0", [this, &cons_status, &cons_callback_called](
+                              const Status& s, BufRendezvous::Hook* h) {
+    cons_status = s;
+    cons_callback_called = true;
+    ASSERT_TRUE(h != nullptr);
+    EXPECT_EQ(h->prod_dev, fake_dev_ptr_);
+    EXPECT_EQ(h->prod_ctx, fake_dev_ctx_);
+    EXPECT_EQ(h->prod_value, &a_);
+    br_->DoneWithHook(h);
+  });
+  EXPECT_TRUE(cons_callback_called);
+  note.WaitForNotification();
+  EXPECT_TRUE(prod_callback_called);
+  TF_EXPECT_OK(cons_status);
+  TF_EXPECT_OK(prod_status);
+}
+
+TEST_F(BufRendezvousTest, CorrectUseConsumerFirst) {
+  Status prod_status;
+  Status cons_status;
+  bool prod_callback_called = false;
+  bool cons_callback_called = false;
+  Notification note;
+  br_->ConsumeBuf("key0", [this, &cons_status, &cons_callback_called](
+                              const Status& s, BufRendezvous::Hook* h) {
+    cons_status = s;
+    cons_callback_called = true;
+    ASSERT_TRUE(h != nullptr);
+    EXPECT_EQ(h->prod_dev, fake_dev_ptr_);
+    EXPECT_EQ(h->prod_ctx, fake_dev_ctx_);
+    EXPECT_EQ(h->prod_value, &a_);
+    br_->DoneWithHook(h);
+  });
+  EXPECT_FALSE(cons_callback_called);
+  br_->ProvideBuf(
+      "key0", fake_dev_ptr_, fake_dev_ctx_, &a_, aa_,
+      [&note, &prod_status, &prod_callback_called](const Status& s) {
+        prod_status = s;
+        prod_callback_called = true;
+        note.Notify();
+      });
+  EXPECT_TRUE(cons_callback_called);
+  note.WaitForNotification();
+  EXPECT_TRUE(prod_callback_called);
+  TF_EXPECT_OK(cons_status);
+  TF_EXPECT_OK(prod_status);
+}
+
+TEST_F(BufRendezvousTest, ErrorDuplicatePut) {
+  bool prod_callback_called = false;
+  br_->ProvideBuf("key0", fake_dev_ptr_, fake_dev_ctx_, &a_, aa_,
+                  [this, &prod_callback_called](const Status& s) {
+                    prod_callback_called = true;
+                  });
+  Status bad_status;
+  Notification note;
+  br_->ProvideBuf("key0", fake_dev_ptr_, fake_dev_ctx_, &a_, aa_,
+                  [&bad_status, &note](const Status& s) {
+                    bad_status = s;
+                    note.Notify();
+                  });
+  note.WaitForNotification();
+  EXPECT_FALSE(bad_status.ok());
+  EXPECT_EQ("BufRendezvous::ProvideBuf already called for key key0",
+            bad_status.error_message());
+  EXPECT_FALSE(prod_callback_called);
+  br_.reset();
+}
+
+TEST_F(BufRendezvousTest, ErrorDeleteNonEmpty) {
+  Status cons_status;
+  br_->ConsumeBuf(
+      "key0", [this, &cons_status](const Status& s, BufRendezvous::Hook* h) {
+        cons_status = s;
+        EXPECT_EQ(h, nullptr);
+      });
+  EXPECT_TRUE(cons_status.ok());
+  br_.reset();
+  EXPECT_FALSE(cons_status.ok());
+  EXPECT_EQ("Delete called on non-empty BufRendezvous",
+            cons_status.error_message());
+}
+
+TEST_F(BufRendezvousTest, AbortNonEmpty) {
+  Status cons_status;
+  Status prod_status;
+  Notification prod_note;
+  Notification cons_note;
+  br_->ConsumeBuf("key0", [this, &cons_note, &cons_status](
+                              const Status& s, BufRendezvous::Hook* h) {
+    cons_status = s;
+    cons_note.Notify();
+  });
+  br_->ProvideBuf("key1", fake_dev_ptr_, fake_dev_ctx_, &a_, aa_,
+                  [this, &prod_note, &prod_status](const Status& s) {
+                    prod_status = s;
+                    prod_note.Notify();
+                  });
+  br_->StartAbort(errors::Internal("Falling sky detected"));
+  prod_note.WaitForNotification();
+  cons_note.WaitForNotification();
+  EXPECT_FALSE(prod_status.ok());
+  EXPECT_EQ(prod_status.error_message(), "Falling sky detected");
+  EXPECT_FALSE(cons_status.ok());
+  EXPECT_EQ(cons_status.error_message(), "Falling sky detected");
+}
+
+TEST_F(BufRendezvousTest, AbortEmpty) {
+  br_->StartAbort(errors::Internal("Falling sky detected"));
+}
+
+TEST_F(BufRendezvousTest, UseAfterAbort) {
+  br_->StartAbort(errors::Internal("Falling sky detected"));
+  Status cons_status;
+  Status prod_status;
+  Notification prod_note;
+  Notification cons_note;
+  br_->ConsumeBuf("key0", [this, &cons_note, &cons_status](
+                              const Status& s, BufRendezvous::Hook* h) {
+    cons_status = s;
+    cons_note.Notify();
+  });
+  br_->ProvideBuf("key1", fake_dev_ptr_, fake_dev_ctx_, &a_, aa_,
+                  [this, &prod_note, &prod_status](const Status& s) {
+                    prod_status = s;
+                    prod_note.Notify();
+                  });
+  prod_note.WaitForNotification();
+  cons_note.WaitForNotification();
+  EXPECT_FALSE(prod_status.ok());
+  EXPECT_EQ(prod_status.error_message(), "Falling sky detected");
+  EXPECT_FALSE(cons_status.ok());
+  EXPECT_EQ(cons_status.error_message(), "Falling sky detected");
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/collective_executor_mgr.cc b/tensorflow/core/common_runtime/collective_executor_mgr.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a5c4946e58edf900ef3e42586f6b484d8f5e4891
--- /dev/null
+++ b/tensorflow/core/common_runtime/collective_executor_mgr.cc
@@ -0,0 +1,114 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/collective_executor_mgr.h"
+
+#include "tensorflow/core/common_runtime/build_graph_options.h"
+#include "tensorflow/core/common_runtime/collective_rma_local.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+
+namespace tensorflow {
+namespace {
+// TODO(tucker): Temporary class just until a real CollectiveExecutor
+// implementation is submitted in a later CL.
+class DummyCollectiveExecutor : public CollectiveExecutor {
+ public:
+  explicit DummyCollectiveExecutor(CollectiveExecutorMgr* ce_mgr)
+      : CollectiveExecutor(ce_mgr) {}
+
+  ~DummyCollectiveExecutor() override {}
+
+  void RecvFromPeer(const string& peer_device, const string& peer_task,
+                    bool peer_is_local, const string& key, Device* to_device,
+                    DeviceContext* to_device_ctx,
+                    const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor,
+                    const DeviceLocality& client_locality,
+                    const StatusCallback& done) override {
+    done(errors::Internal("Unimplemented"));
+  }
+
+  void PostToPeer(const string& peer_device, const string& peer_task,
+                  const string& key, Device* from_device,
+                  DeviceContext* from_device_ctx,
+                  const AllocatorAttributes& from_alloc_attr,
+                  const Tensor* from_tensor,
+                  const DeviceLocality& client_locality,
+                  const StatusCallback& done) override {
+    done(errors::Internal("Unimplemented"));
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(DummyCollectiveExecutor);
+};
+}  // namespace
+
+CollectiveExecutorMgr::CollectiveExecutorMgr(
+    const ConfigProto& config, const DeviceMgr* dev_mgr,
+    DeviceResolverInterface* dev_resolver,
+    ParamResolverInterface* param_resolver)
+    : dev_mgr_(dev_mgr),
+      dev_resolver_(dev_resolver),
+      param_resolver_(param_resolver) {}
+
+CollectiveExecutorMgr::~CollectiveExecutorMgr() {
+  for (auto iter : executor_table_) {
+    iter.second->Unref();
+  }
+}
+
+CollectiveExecutor* CollectiveExecutorMgr::FindOrCreate(int64 step_id) {
+  CollectiveExecutor* ce = nullptr;
+  {
+    mutex_lock l(exec_mu_);
+    auto it = executor_table_.find(step_id);
+    if (it != executor_table_.end()) {
+      ce = it->second;
+    } else {
+      ce = new DummyCollectiveExecutor(this);
+      executor_table_[step_id] = ce;
+    }
+    ce->Ref();
+  }
+  return ce;
+}
+
+void CollectiveExecutorMgr::Cleanup(int64 step_id) {
+  CollectiveExecutor* ce = nullptr;
+  {
+    mutex_lock l(exec_mu_);
+    auto it = executor_table_.find(step_id);
+    if (it != executor_table_.end()) {
+      ce = it->second;
+      executor_table_.erase(it);
+    }
+  }
+  if (ce) ce->Unref();
+}
+
+void CollectiveExecutorMgr::GetStepSequenceAsync(
+    const GetStepSequenceRequest* request, GetStepSequenceResponse* response,
+    const StatusCallback& done) {
+  done(errors::Internal(
+      "CollectiveExecutorMgr does not implement GetStepSequence."));
+}
+
+void CollectiveExecutorMgr::RefreshStepIdSequenceAsync(
+    int64 graph_key, const StatusCallback& done) {
+  done(errors::Internal(
+      "CollectiveExecutorMgr does not implement RefreshStepIdSequence."));
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/collective_executor_mgr.h b/tensorflow/core/common_runtime/collective_executor_mgr.h
new file mode 100644
index 0000000000000000000000000000000000000000..4b42e2b4d16c5804e0660079c7a149442b47edb0
--- /dev/null
+++ b/tensorflow/core/common_runtime/collective_executor_mgr.h
@@ -0,0 +1,70 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_COMMON_RUNTIME_COLLECTIVE_EXECUTOR_MGR_H_
+#define TENSORFLOW_COMMON_RUNTIME_COLLECTIVE_EXECUTOR_MGR_H_
+
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+
+namespace tensorflow {
+class ConfigProto;
+class DeviceMgr;
+
+class CollectiveExecutorMgr : public CollectiveExecutorMgrInterface {
+ public:
+  CollectiveExecutorMgr(const ConfigProto& config, const DeviceMgr* dev_mgr,
+                        DeviceResolverInterface* dev_resolver,
+                        ParamResolverInterface* param_resolver);
+
+  virtual ~CollectiveExecutorMgr();
+
+  CollectiveExecutor* FindOrCreate(int64 step_id) override;
+
+  void Cleanup(int64 step_id) override;
+
+  ParamResolverInterface* GetParamResolver() const override {
+    return param_resolver_.get();
+  }
+
+  DeviceResolverInterface* GetDeviceResolver() const override {
+    return dev_resolver_.get();
+  }
+
+  void GetStepSequenceAsync(const GetStepSequenceRequest* request,
+                            GetStepSequenceResponse* response,
+                            const StatusCallback& done) override;
+
+  void RefreshStepIdSequenceAsync(int64 graph_key,
+                                  const StatusCallback& done) override;
+
+  int64 NextStepId(int64 graph_key) override {
+    return CollectiveExecutor::kInvalidId;
+  }
+
+  void RetireStepId(int64 graph_key, int64 step_id) override {}
+
+ protected:
+  const DeviceMgr* dev_mgr_;
+  std::unique_ptr<DeviceResolverInterface> dev_resolver_;
+  std::unique_ptr<ParamResolverInterface> param_resolver_;
+  CollectiveRemoteAccess* remote_access_;
+  string task_name_;
+  mutex exec_mu_;
+  // Map from step_id to CollectiveExecutor
+  gtl::FlatMap<int64, CollectiveExecutor*> executor_table_ GUARDED_BY(exec_mu_);
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_COMMON_RUNTIME_COLLECTIVE_EXECUTOR_MGR_H_
diff --git a/tensorflow/core/common_runtime/collective_executor_mgr_test.cc b/tensorflow/core/common_runtime/collective_executor_mgr_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..34c9163d6a40ba47323afc306cc2803b643e1d8b
--- /dev/null
+++ b/tensorflow/core/common_runtime/collective_executor_mgr_test.cc
@@ -0,0 +1,98 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/collective_executor_mgr.h"
+
+#include "tensorflow/core/common_runtime/collective_param_resolver_local.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/device_resolver_local.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+
+namespace tensorflow {
+namespace {
+
+#define NUM_DEVS 3
+
+class CollectiveExecutorMgrTest : public ::testing::Test {
+ protected:
+  CollectiveExecutorMgrTest() {
+    ConfigProto cp;
+    SessionOptions options;
+    auto* device_count = options.config.mutable_device_count();
+    string task_name = "/job:localhost/replica:0/task:0";
+    device_count->insert({"CPU", NUM_DEVS});
+    TF_CHECK_OK(DeviceFactory::AddDevices(options, task_name, &devices_));
+    device_mgr_.reset(new DeviceMgr(devices_));
+    DeviceResolverLocal* drl = new DeviceResolverLocal(device_mgr_.get());
+    cme_.reset(new CollectiveExecutorMgr(
+        cp, device_mgr_.get(), drl,
+        new CollectiveParamResolverLocal(device_mgr_.get(), drl, task_name)));
+  }
+
+  std::unique_ptr<CollectiveExecutorMgr> cme_;
+  std::vector<Device*> devices_;
+  std::unique_ptr<DeviceMgr> device_mgr_;
+};
+
+TEST_F(CollectiveExecutorMgrTest, FindOrCreate) {
+  CollectiveExecutor::Handle* h =
+      new CollectiveExecutor::Handle(cme_->FindOrCreate(1), true);
+  EXPECT_TRUE(h->get());
+  CollectiveExecutor::Handle* h2 =
+      new CollectiveExecutor::Handle(cme_->FindOrCreate(1), true);
+  EXPECT_EQ(h->get(), h2->get());
+  CollectiveExecutor* ce = h->get();
+  delete h;
+  delete h2;
+  CollectiveExecutor::Handle h3(cme_->FindOrCreate(1), true);
+  EXPECT_EQ(ce, h3.get());
+  cme_->Cleanup(1);
+}
+
+TEST_F(CollectiveExecutorMgrTest, StepSequenceRelated) {
+  EXPECT_EQ(CollectiveExecutor::kInvalidId, cme_->NextStepId(123));
+  Notification ss_note;
+  Status ss_status;
+  cme_->RefreshStepIdSequenceAsync(
+      123, [this, &ss_status, &ss_note](const Status& s) {
+        ss_status = s;
+        ss_note.Notify();
+      });
+  ss_note.WaitForNotification();
+  EXPECT_FALSE(ss_status.ok());
+  EXPECT_EQ(ss_status.error_message(),
+            "CollectiveExecutorMgr does not implement RefreshStepIdSequence.");
+  Notification gs_note;
+  Status gs_status;
+  GetStepSequenceRequest* req = nullptr;
+  GetStepSequenceResponse* resp = nullptr;
+  cme_->GetStepSequenceAsync(req, resp,
+                             [this, &gs_status, &gs_note](const Status& s) {
+                               gs_status = s;
+                               gs_note.Notify();
+                             });
+  gs_note.WaitForNotification();
+  EXPECT_FALSE(gs_status.ok());
+  EXPECT_EQ(gs_status.error_message(),
+            "CollectiveExecutorMgr does not implement GetStepSequence.");
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local.cc b/tensorflow/core/common_runtime/collective_param_resolver_local.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b34950b2f471d756f3d553b296e437c796320fcb
--- /dev/null
+++ b/tensorflow/core/common_runtime/collective_param_resolver_local.cc
@@ -0,0 +1,666 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/collective_param_resolver_local.h"
+
+#include "tensorflow/core/common_runtime/device_mgr.h"
+
+namespace tensorflow {
+
+CollectiveParamResolverLocal::CollectiveParamResolverLocal(
+    const DeviceMgr* dev_mgr, DeviceResolverInterface* dev_resolver,
+    const string& task_name)
+    : dev_mgr_(dev_mgr), dev_resolver_(dev_resolver), task_name_(task_name) {}
+
+void CollectiveParamResolverLocal::CompleteGroupAsync(
+    const CompleteGroupRequest* request, CompleteGroupResponse* response,
+    CancellationManager* cancel_mgr, const StatusCallback& done) {
+  done(
+      errors::Internal("CompleteGroup is not implemented by "
+                       "CollectiveParamResolverLocal which is "
+                       "intended only for non-distributed deployment."));
+}
+
+void CollectiveParamResolverLocal::CompleteGroupLocal(
+    const string& device, CollectiveParams* cp, const GroupRecCallback& done) {
+  VLOG(1) << "CompleteGroupLocal " << cp << ": " << cp->ToString();
+  std::vector<StatusCallback> to_be_called;
+  GroupRec* gr = nullptr;
+  {
+    mutex_lock l(group_mu_);
+    auto it = group_table_.find(cp->group.group_key);
+    if (it == group_table_.end()) {
+      gr = new GroupRec;
+      gr->group.group_key = cp->group.group_key;
+      gr->group.group_size = cp->group.group_size;
+      gr->group.device_type = cp->group.device_type;
+      group_table_[gr->group.group_key].reset(gr);
+      VLOG(2) << "New group_key=" << gr->group.group_key
+              << " group_size=" << gr->group.group_size;
+    } else {
+      gr = it->second.get();
+    }
+  }
+  Status status;
+  {
+    mutex_lock gr_lock(gr->mu);
+    if (!gr->device_set.empty()) {
+      // Check for consistency with existing GroupRec.
+      if (cp->group.device_type != gr->group.device_type) {
+        status = errors::Internal(
+            "Collective Op ", cp->name, " is assigned to device ", device,
+            " with type ", cp->group.device_type.type_string(),
+            " and group_key ", cp->group.group_key, " but that group has type ",
+            gr->group.device_type.type_string());
+      } else if (cp->group.group_size != gr->group.group_size) {
+        status = errors::Internal(
+            "Collective Op ", cp->name, " has group_size ",
+            cp->group.group_size, " and group_key", cp->group.group_key,
+            " but that group has size ", gr->group.group_size);
+      }
+    }
+    if (status.ok()) {
+      // Insert device if not already present.
+      auto it = gr->device_set.find(device);
+      if (it == gr->device_set.end()) {
+        if (gr->device_set.size() == gr->group.group_size) {
+          // The group is already full.
+          status = errors::Internal(
+              "Collective Op ", cp->name, " is assigned to device ", device,
+              " and group_key ", cp->group.group_key,
+              " but that group doesn't contain that device.");
+        } else {
+          // This is a new device that has not yet joined the group.
+          gr->device_set.insert(device);
+          gr->device_list.push_back(device);
+          DeviceNameUtils::ParsedName parsed_device;
+          DeviceNameUtils::ParseFullName(device, &parsed_device);
+          string task_name = strings::StrCat("/job:", parsed_device.job,
+                                             "/replica:", parsed_device.replica,
+                                             "/task:", parsed_device.task);
+          gr->task_set.insert(task_name);
+          gr->task_list.push_back(task_name);
+          gr->group.num_tasks = static_cast<int32>(gr->task_set.size());
+          VLOG(1) << "group_key=" << gr->group.group_key
+                  << " group_size=" << gr->group.group_size
+                  << " dev_set=" << gr->device_set.size();
+        }
+      }
+    }
+
+    if (status.ok()) {
+      // If the group is not yet complete, queue to wait for it.
+      VLOG(2) << "group_size " << gr->group.group_size << " set size "
+              << gr->device_set.size() << " gr " << gr;
+
+      if (gr->device_set.size() < gr->group.group_size) {
+        gr->waiting.push_back(std::bind(done, std::placeholders::_1, gr));
+        return;
+      }
+      CHECK_EQ(gr->device_set.size(), gr->group.group_size);
+      if (!gr->waiting.empty()) {
+        std::swap(to_be_called, gr->waiting);
+      }
+    }
+  }
+  done(status, gr);
+  for (int i = 0; i < to_be_called.size(); ++i) {
+    to_be_called[i](Status::OK());
+  }
+}
+
+namespace {
+
+struct DevRec {
+  string task;
+  string device;
+  int original_rank;
+  int local_rank;
+  int global_rank;
+  const DeviceLocality* locality;
+};
+typedef std::unordered_map<string, DevRec> TaskDeviceMap;
+typedef std::unordered_map<string, TaskDeviceMap> GlobalDeviceMap;
+
+// Create a populated GlobalDeviceMap from CollInstanceParams and localities.
+GlobalDeviceMap BuildDevRecs(const CollInstanceParams& ip,
+                             const std::vector<DeviceLocality>& localities) {
+  GlobalDeviceMap gdm;
+  CHECK_EQ(ip.device_names.size(), ip.task_names.size());
+  CHECK_EQ(ip.device_names.size(), localities.size());
+  for (int i = 0; i < ip.device_names.size(); ++i) {
+    TaskDeviceMap& tdm = gdm[ip.task_names[i]];
+    DevRec* dr = &tdm[ip.device_names[i]];
+    dr->task = ip.task_names[i];
+    dr->device = ip.device_names[i];
+    dr->original_rank = i;
+    dr->local_rank = 0;   // Will be populated later by OrderTaskDeviceMap.
+    dr->global_rank = 0;  // Will be populated later by EstablishGlobalRank.
+    dr->locality = &localities[i];
+  }
+  return gdm;
+}
+
+void OrderTaskDeviceMap(TaskDeviceMap* tdm) {
+  CHECK_GT(tdm->size(), 0);  // Should never be called with 0 devices
+  int least_rank = -1;
+  string next_device;
+  std::set<string> selected;
+  // Starting device is one with the least initial rank.
+  for (const auto& it : *tdm) {
+    if (least_rank < 0 || it.second.original_rank < least_rank) {
+      least_rank = it.second.original_rank;
+      next_device = it.second.device;
+    }
+  }
+  CHECK_GE(least_rank, 0);
+  DeviceNameUtils::ParsedName parsed_name;
+  CHECK(DeviceNameUtils::ParseFullName(next_device, &parsed_name));
+  // NOTE: InterconnectLink has only a device_id, nothing more, so for
+  // the time being if there's more than one device at a task we
+  // assume they're all GPUs.
+
+  int next_rank = 0;
+  while (true) {
+    selected.insert(next_device);
+    DevRec* dr = &(*tdm)[next_device];
+    dr->local_rank = next_rank;
+    ++next_rank;
+    if (selected.size() == tdm->size()) {
+      break;
+    }
+    // For the present time we assume Locality links only cover GPUs.
+    // For multiple CPUs, just take them in order.
+    const InterconnectLink* best_link = nullptr;
+    if (parsed_name.type == "GPU") {
+      for (const InterconnectLink& il : dr->locality->links().link()) {
+        parsed_name.id = il.device_id();
+        string endpoint_device =
+            DeviceNameUtils::ParsedNameToString(parsed_name);
+        if (selected.find(endpoint_device) != selected.end()) {
+          continue;
+        }
+        if (best_link == nullptr || il.strength() > best_link->strength()) {
+          best_link = &il;
+        }
+      }
+    }
+    if (best_link != nullptr) {
+      // Follow the best edge
+      parsed_name.id = best_link->device_id();
+      next_device = DeviceNameUtils::ParsedNameToString(parsed_name);
+    } else {
+      // No good edges, alas. Pick the lowest initial rank among remaining
+      // devices.
+      least_rank = -1;
+      for (const auto& it : *tdm) {
+        if (selected.find(it.second.device) != selected.end()) {
+          continue;
+        }
+        if (least_rank < 0 || it.second.original_rank < least_rank) {
+          least_rank = it.second.original_rank;
+          next_device = it.second.device;
+        }
+      }
+      CHECK_GE(least_rank, 0);
+    }
+  }
+}
+
+// The first time a shared CollectiveParams is established for a
+// shared set of instances we compute a good rank order for all the
+// devices in the group, that is appropriate for a ring algorithm.
+// This order need not be the same across different instance groups
+// sharing the same device group where there is more than one good
+// order.
+GlobalDeviceMap EstablishGlobalRank(
+    CollectiveParams* cp, const std::vector<DeviceLocality>& localities) {
+  VLOG(1) << "EstablishGlobalRank";
+  GlobalDeviceMap gdm = BuildDevRecs(cp->instance, localities);
+  for (auto& iter : gdm) {
+    TaskDeviceMap& tdm = iter.second;
+    OrderTaskDeviceMap(&tdm);
+  }
+  // Connect the global rank order by the order in which tasks first appear.
+  std::set<string> ordered_tasks;
+  int next_rank = 0;
+  for (int i = 0; i < cp->instance.task_names.size(); ++i) {
+    const string& task_name = cp->instance.task_names[i];
+    if (ordered_tasks.find(task_name) != ordered_tasks.end()) {
+      continue;
+    }
+    ordered_tasks.insert(task_name);
+    TaskDeviceMap* tdm = &gdm[task_name];
+    for (auto& it : *tdm) {
+      it.second.global_rank = it.second.local_rank + next_rank;
+    }
+    next_rank += tdm->size();
+  }
+  return gdm;
+}
+
+// Sort cp->instance.device_names lexicographically, but do by first
+// computing a reordering permutation so we can keep cp->instance.task_names
+// in corresponding order.
+void SortDevicesAndTasks(CollectiveParams* cp) {
+  VLOG(1) << "SortDevicesAndTasks " << cp << " instance " << &cp->instance;
+  CHECK(cp);
+  CHECK_EQ(cp->group.group_size, cp->instance.device_names.size());
+  CHECK_EQ(cp->group.group_size, cp->instance.task_names.size());
+  std::vector<int> perm(cp->group.group_size);
+  // TODO(tucker): substitute std::iota when the windows build supports it.
+  // std::iota(perm.begin(), perm.end(), 0);
+  for (int i = 0; i < perm.size(); ++i) {
+    perm[i] = i;
+  }
+  std::sort(perm.begin(), perm.end(), [cp](const int& a, const int& b) {
+    return cp->instance.device_names[a] < cp->instance.device_names[b];
+  });
+  std::vector<string> new_devs;
+  std::vector<string> new_tasks;
+  new_devs.reserve(cp->group.group_size);
+  new_tasks.reserve(cp->group.group_size);
+  for (int pi : perm) {
+    new_devs.push_back(cp->instance.device_names[pi]);
+    new_tasks.push_back(cp->instance.task_names[pi]);
+  }
+  cp->instance.device_names = std::move(new_devs);
+  cp->instance.task_names = std::move(new_tasks);
+  VLOG(1) << "Modified device_names on " << cp;
+}
+
+// Establish the requested number of subdivision permutations based on the
+// ring order implicit in the device order.
+void GenerateSubdivPerms(const string& device, int source_rank,
+                         CollectiveParams* cp) {
+  CHECK_GT(cp->instance.impl_details.subdiv_offsets.size(), 0);
+  cp->instance.impl_details.subdiv_permutations.resize(
+      cp->instance.impl_details.subdiv_offsets.size());
+  // Each subdiv permutation is a ring formed by rotating each
+  // single-task subsequence of devices by an offset.  This makes most
+  // sense when each task has the same number of devices but we can't
+  // depend on that being the case so we'll compute something that
+  // works in any case.
+
+  // Start by counting the devices in each task.
+  // Precondition: device_names must be sorted so that all devices in
+  // the same task are adjacent.
+  VLOG(2) << "Sorted task names: "
+          << str_util::Join(cp->instance.task_names, ", ");
+  std::vector<int> dev_per_task;
+  const string* prior_task_name = &cp->instance.task_names[0];
+  int dev_count = 1;
+  for (int di = 1; di < cp->group.group_size; ++di) {
+    if (cp->instance.task_names[di] != *prior_task_name) {
+      dev_per_task.push_back(dev_count);
+      dev_count = 1;
+      prior_task_name = &cp->instance.task_names[di];
+    } else {
+      ++dev_count;
+    }
+  }
+  dev_per_task.push_back(dev_count);
+  CHECK_EQ(cp->group.num_tasks, dev_per_task.size());
+
+  // Generate a ring permutation for each requested offset.
+  CHECK_GT(cp->instance.impl_details.subdiv_offsets.size(), 0);
+  VLOG(2) << "Setting up perms for cp " << cp << " subdiv_permutations "
+          << &cp->instance.impl_details.subdiv_permutations;
+  cp->instance.impl_details.subdiv_permutations.resize(
+      cp->instance.impl_details.subdiv_offsets.size());
+  cp->subdiv_rank.resize(cp->instance.impl_details.subdiv_offsets.size(), -1);
+  for (int sdi = 0; sdi < cp->instance.impl_details.subdiv_offsets.size();
+       ++sdi) {
+    std::vector<int>& perm = cp->instance.impl_details.subdiv_permutations[sdi];
+    CHECK_EQ(perm.size(), 0);
+    int offset = cp->instance.impl_details.subdiv_offsets[sdi];
+    int prior_dev_count = 0;
+    for (int ti = 0; ti < cp->group.num_tasks; ++ti) {
+      for (int di = 0; di < dev_per_task[ti]; ++di) {
+        int offset_di = (di + offset) % dev_per_task[ti];
+        int permuted_di = prior_dev_count + offset_di;
+        perm.push_back(permuted_di);
+        if (cp->instance.device_names[prior_dev_count + di] == device) {
+          CHECK_EQ(prior_dev_count + di, cp->default_rank);
+          cp->subdiv_rank[sdi] = permuted_di;
+        }
+      }
+      prior_dev_count += dev_per_task[ti];
+    }
+    CHECK_EQ(cp->group.group_size, perm.size());
+  }
+
+  if (cp->instance.type == BROADCAST_COLLECTIVE) {
+    CHECK_GE(source_rank, 0);
+    cp->subdiv_source_rank.resize(
+        cp->instance.impl_details.subdiv_offsets.size(), -1);
+    for (int sdi = 0; sdi < cp->subdiv_source_rank.size(); ++sdi) {
+      for (int j = 0; j < cp->group.group_size; ++j) {
+        if (cp->instance.impl_details.subdiv_permutations[sdi][j] ==
+            source_rank) {
+          cp->subdiv_source_rank[sdi] = j;
+          break;
+        }
+      }
+      CHECK_GE(cp->subdiv_source_rank[sdi], 0);
+    }
+  }
+
+  if (VLOG_IS_ON(1)) {
+    // Log the computed ring order for each subdiv.
+    string buf;
+    for (int sdi = 0;
+         sdi < cp->instance.impl_details.subdiv_permutations.size(); ++sdi) {
+      buf = strings::StrCat("Subdiv ", sdi, " device order:\n");
+      for (int di = 0;
+           di < cp->instance.impl_details.subdiv_permutations[sdi].size();
+           ++di) {
+        int idx = cp->instance.impl_details.subdiv_permutations[sdi][di];
+        strings::StrAppend(&buf, cp->instance.device_names[idx], "\n");
+      }
+      strings::StrAppend(&buf, " subdiv_offsets: ");
+      for (auto o : cp->instance.impl_details.subdiv_offsets)
+        strings::StrAppend(&buf, o, " ");
+      strings::StrAppend(&buf, " SubdivRank: ");
+      for (auto d : cp->subdiv_rank) strings::StrAppend(&buf, d, " ");
+      VLOG(1) << buf;
+    }
+  }
+}
+
+}  // namespace
+
+void CollectiveParamResolverLocal::CompleteTaskIsLocal(const string& task_name,
+                                                       CollectiveParams* cp) {
+  cp->task.is_local.resize(cp->group.group_size, false);
+  for (int i = 0; i < cp->group.group_size; ++i) {
+    cp->task.is_local[i] = (cp->instance.task_names[i] == task_name);
+  }
+}
+
+void CollectiveParamResolverLocal::SetDefaultRank(const string& device,
+                                                  CollectiveParams* cp) {
+  CHECK_EQ(cp->group.group_size, cp->instance.device_names.size()) << cp;
+  for (int i = 0; i < cp->group.group_size; ++i) {
+    if (cp->instance.device_names[i] == device) {
+      cp->default_rank = i;
+      break;
+    }
+  }
+}
+
+Status CollectiveParamResolverLocal::InitInstanceSharedParams(
+    GroupRec* gr, const CollectiveParams* cp, InstanceRec* ir) {
+  VLOG(1) << "InitInstanceSharedParams " << ir;
+  ir->shared.instance = cp->instance;
+  {
+    mutex_lock gl(gr->mu);
+    ir->shared.group = gr->group;
+    ir->shared.instance.device_names.assign(gr->device_list.begin(),
+                                            gr->device_list.end());
+    ir->shared.instance.task_names.assign(gr->task_list.begin(),
+                                          gr->task_list.end());
+    VLOG(2) << "Initialized names for instance: "
+            << ir->shared.instance.ToString();
+  }
+  ir->shared.default_rank = -1;
+
+  // Sort devce_names lexicographcally, keeping task_names in
+  // corresponding order.
+  SortDevicesAndTasks(&ir->shared);
+
+  // Get Locality data for all devices.
+
+  // Set is_local and task_names in *shared prior to invoking
+  // GetDeviceLocalitiesAsync.  In a distributed context this function can be
+  // called by a derived class, some of the devices may be non-local and
+  // GetDeviceLocalitiesAsync will use those fields to launch RPCs.
+  CompleteTaskIsLocal(task_name_, &ir->shared);
+  std::vector<DeviceLocality> localities;
+  Notification note;
+  Status status;
+  dev_resolver_->GetDeviceLocalitiesAsync(ir->shared.instance, &localities,
+                                          [&note, &status](const Status& s) {
+                                            status = s;
+                                            note.Notify();
+                                          });
+  note.WaitForNotification();
+  if (status.ok()) {
+    CompleteDefaultRanking(gr, cp, ir, localities);
+  }
+  return status;
+}
+
+void CollectiveParamResolverLocal::CompleteDefaultRanking(
+    GroupRec* gr, const CollectiveParams* cp, InstanceRec* ir,
+    const std::vector<DeviceLocality>& localities) {
+  // Establish an instance-specific default rank order for devices
+  // based on localities.  This rank order should be a good ring
+  // order, if possible.
+  GlobalDeviceMap gdm = EstablishGlobalRank(&ir->shared, localities);
+  // Reflect the new global ranking on shared
+  size_t num_devices = ir->shared.group.group_size;
+  std::vector<string> new_device_names(num_devices, "");
+  std::vector<string> new_task_names(num_devices, "");
+  for (const auto& git : gdm) {
+    const TaskDeviceMap& tdm = git.second;
+    for (const auto& tit : tdm) {
+      const DevRec& dr = tit.second;
+      new_device_names[dr.global_rank] =
+          ir->shared.instance.device_names[dr.original_rank];
+      new_task_names[dr.global_rank] =
+          ir->shared.instance.task_names[dr.original_rank];
+    }
+  }
+
+  ir->shared.instance.device_names = new_device_names;
+  ir->shared.instance.task_names = new_task_names;
+  if (VLOG_IS_ON(2)) {
+    string buf;
+    for (const auto& d : cp->instance.device_names)
+      strings::StrAppend(&buf, "\n", d);
+    VLOG(2) << "Optimized device order for " << ir->shared.name << ": " << buf;
+  }
+}
+
+void CollectiveParamResolverLocal::CallbackWithStatus(
+    const InstanceRecCallback& done, InstanceRec* irec) {
+  Status s;
+  {
+    mutex_lock l(irec->out_mu);
+    s = irec->status;
+  }
+  done(s, irec);
+}
+
+void CollectiveParamResolverLocal::FindInstanceRec(
+    GroupRec* gr, CollectiveParams* cp, const InstanceRecCallback& done) {
+  InstanceRec* irec = nullptr;
+  bool exit_outside_locks = false;
+  {
+    mutex_lock l(instance_mu_);
+    auto it = instance_table_.find(cp->instance.instance_key);
+    if (it != instance_table_.end()) {
+      irec = it->second.get();
+      {
+        mutex_lock l(irec->in_mu);
+        if (irec->is_init) {
+          exit_outside_locks = true;
+        } else {
+          irec->init_waiters.push_back([this, gr, cp, done](InstanceRec* irec) {
+            CallbackWithStatus(done, irec);
+          });
+          return;
+        }
+      }
+    } else {
+      // Create new InstanceRec.
+      irec = new InstanceRec;
+      instance_table_[cp->instance.instance_key].reset(irec);
+    }
+  }
+  if (exit_outside_locks) {
+    CallbackWithStatus(done, irec);
+    return;
+  }
+  // Initialize the new InstanceRec while holding out_mu.
+  {
+    mutex_lock il(irec->out_mu);
+    irec->known.resize(cp->group.group_size, false);
+    irec->status = InitInstanceSharedParams(gr, cp, irec);
+  }
+  // Prepare to invoke any waiters that accumlated during initialization.
+  std::vector<IRConsumer> init_waiters;
+  {
+    mutex_lock tl(instance_mu_);
+    {
+      mutex_lock l(irec->in_mu);
+      irec->is_init = true;
+      if (!irec->init_waiters.empty()) {
+        std::swap(init_waiters, irec->init_waiters);
+      }
+    }
+  }
+  CallbackWithStatus(done, irec);
+  for (auto& f : init_waiters) {
+    f(irec);
+  }
+}
+
+void CollectiveParamResolverLocal::CompleteParamsAsync(
+    const string& device, CollectiveParams* cp, CancellationManager* cancel_mgr,
+    const StatusCallback& done) {
+  VLOG(1) << "CompleteParams " << device << " for " << cp << ": "
+          << cp->ToString();
+  CompleteGroupLocal(
+      device, cp, [this, device, cp, done](const Status& s, GroupRec* gr) {
+        if (s.ok()) {
+          CompleteInstanceLocal(device, gr, cp, cp->is_source, done);
+        } else {
+          done(s);
+        }
+      });
+}
+
+void CollectiveParamResolverLocal::CompleteInstanceAsync(
+    const CompleteInstanceRequest* request, CompleteInstanceResponse* response,
+    CancellationManager* cancel_mgr, const StatusCallback& done) {
+  done(
+      errors::Internal("CompleteInstance is not implemented by "
+                       "CollectiveParamResolverLocal which is "
+                       "intended only for non-distributed deployment."));
+}
+
+void CollectiveParamResolverLocal::CompleteInstanceLocal(
+    const string& device, GroupRec* gr, CollectiveParams* cp, bool is_source,
+    const StatusCallback& done) {
+  VLOG(1) << "CompleteInstanceLocal " << device
+          << " instance_key: " << cp->instance.instance_key << " gr " << gr;
+
+  // Populate the group portion of *cp from *gr.  Most of it should already
+  // match.
+  DCHECK_EQ(cp->group.group_key, gr->group.group_key);
+  DCHECK_EQ(cp->group.group_size, gr->group.group_size);
+  DCHECK_EQ(cp->group.device_type, gr->group.device_type);
+  cp->group = gr->group;
+
+  // Get the shared InstanceRec for this instance.
+  FindInstanceRec(gr, cp,
+                  [this, device, gr, cp, is_source, done](const Status& s,
+                                                          InstanceRec* ir) {
+                    if (s.ok()) {
+                      CompleteInstanceFromInitializedIRec(device, gr, cp, ir,
+                                                          is_source, done);
+                    } else {
+                      done(s);
+                    }
+                  });
+}
+
+void CollectiveParamResolverLocal::CompleteInstanceFromInitializedIRec(
+    const string& device, GroupRec* gr, CollectiveParams* cp, InstanceRec* ir,
+    bool is_source, const StatusCallback& done) {
+  // Populate the fields common across instance.
+  {
+    mutex_lock l(ir->out_mu);
+    // custom operator= does a deep copy.
+    cp->instance = ir->shared.instance;
+  }
+  // Populate the fields common across task, also default_rank.
+  SetDefaultRank(device, cp);
+  CompleteTaskIsLocal(task_name_, cp);
+  // If broadcast, may need to wait for source discovery.
+  if (cp->instance.type == BROADCAST_COLLECTIVE) {
+    CompleteInstanceSource(ir, cp, is_source,
+                           [this, ir, device, cp, done](InstanceRec* irec) {
+                             CHECK_EQ(ir, irec);
+                             Status s;
+                             int source_rank;
+                             {
+                               mutex_lock l(irec->out_mu);
+                               s = irec->status;
+                               source_rank = ir->source_rank;
+                             }
+                             if (s.ok()) {
+                               GenerateSubdivPerms(device, source_rank, cp);
+                             }
+                             done(s);
+                           });
+    return;
+  } else {
+    GenerateSubdivPerms(device, 0, cp);
+  }
+  done(Status::OK());
+}
+
+void CollectiveParamResolverLocal::CompleteInstanceSource(InstanceRec* ir,
+                                                          CollectiveParams* cp,
+                                                          bool is_source,
+                                                          const IRConsumer& f) {
+  std::vector<IRConsumer> ready_waiters;
+  {
+    mutex_lock l(ir->out_mu);
+    CHECK_EQ(cp->group.group_size, ir->known.size());
+    CHECK_GE(cp->default_rank, 0);
+    if (!ir->known[cp->default_rank]) {
+      ir->known[cp->default_rank] = true;
+      ++ir->known_count;
+      if (is_source) {
+        if (ir->source_rank >= 0) {
+          ir->status = errors::Internal("Instance ", cp->instance.instance_key,
+                                        " already has source ", ir->source_rank,
+                                        ", recevied second claim from ",
+                                        cp->default_rank);
+        } else {
+          ir->source_rank = cp->default_rank;
+        }
+      }
+    }
+    if (ir->known_count < ir->shared.group.group_size) {
+      ir->known_waiters.push_back(f);
+      return;
+    }
+    CHECK_EQ(ir->known_count, ir->shared.group.group_size);
+    CHECK_GE(ir->source_rank, 0);
+    if (!ir->known_waiters.empty()) {
+      ready_waiters = std::move(ir->known_waiters);
+    }
+  }
+  f(ir);
+  for (auto& f : ready_waiters) {
+    f(ir);
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local.h b/tensorflow/core/common_runtime/collective_param_resolver_local.h
new file mode 100644
index 0000000000000000000000000000000000000000..ff3415b0a909b22cb573ff0e6d6cb210924c092a
--- /dev/null
+++ b/tensorflow/core/common_runtime/collective_param_resolver_local.h
@@ -0,0 +1,209 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_COMMON_RUNTIME_COLLECTIVE_PARAM_RESOLVER_LOCAL_H_
+#define TENSORFLOW_COMMON_RUNTIME_COLLECTIVE_PARAM_RESOLVER_LOCAL_H_
+
+#include <string>
+
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+
+namespace tensorflow {
+class CompleteGroupRequest;
+class CompleteGroupResponse;
+class CompleteInstanceRequest;
+class CompleteInstanceResponse;
+class DeviceMgr;
+
+// Implements ParamResolverInterface for a single-task context.
+// It also implements the functionality necessary to serve as the
+// group leader for param resolution in a multi-task context.
+class CollectiveParamResolverLocal : public ParamResolverInterface {
+ public:
+  CollectiveParamResolverLocal(const DeviceMgr* dev_mgr,
+                               DeviceResolverInterface* dev_resolver,
+                               const string& task_name);
+
+  ~CollectiveParamResolverLocal() override {}
+
+  void CompleteParamsAsync(const string& device, CollectiveParams* cp,
+                           CancellationManager* cancel_mgr,
+                           const StatusCallback& done) override;
+
+  void CompleteGroupAsync(const CompleteGroupRequest* request,
+                          CompleteGroupResponse* response,
+                          CancellationManager* cancel_mgr,
+                          const StatusCallback& done) override;
+
+  void CompleteInstanceAsync(const CompleteInstanceRequest* request,
+                             CompleteInstanceResponse* response,
+                             CancellationManager* cancel_mgr,
+                             const StatusCallback& done) override;
+
+ protected:
+  // Used to complete/verify CollGroup.
+  struct GroupRec {
+    CollGroupParams group;
+    mutex mu;
+    Status status GUARDED_BY(mu);
+    std::set<string> device_set GUARDED_BY(mu);
+    std::vector<string> device_list GUARDED_BY(mu);
+    std::set<string> task_set GUARDED_BY(mu);
+    std::vector<string> task_list GUARDED_BY(mu);
+    std::vector<StatusCallback> waiting GUARDED_BY(mu);
+  };
+
+  // Finds the GroupRec that corresponds to cp->group_key.
+  // Also populates cp->group from that group_rec.
+  // Will wait until GroupRec is fully populated or an error arises before
+  // calling done.  Callback GroupRec* arg is only valid if status is ok.
+  // Ownership of GroupRec stays with this object and does not pass to the
+  // callback.
+  typedef std::function<void(const Status& s, GroupRec* gr)> GroupRecCallback;
+  void CompleteGroupLocal(const string& device, CollectiveParams* cp,
+                          const GroupRecCallback& done)
+      LOCKS_EXCLUDED(group_mu_);
+
+  // Used to complete/verify CollInstance.
+  struct InstanceRec;
+  typedef std::function<void(InstanceRec*)> IRConsumer;
+  struct InstanceRec {
+    // This structure has two mutexes so that a possibly long
+    // initialization can be done without holding the instance_mu_
+    // table lock the whole time (which can cause an excessive number
+    // of threads to block on it), and because the compiler may not
+    // permit mutex locks to be taken in more than one order.
+    //
+    // out_mu guards access to most of the fields.
+    // in_mu guards access to a queue of comsumer callbacks wanting to
+    // read the fields guarded by out_mu.
+    //
+    // The in_mu should be locked only while holding instance_mu_; the
+    // out_mu should be locked only while not holding
+    // instance_mu_.
+    //
+    // When is_init is false (the initial value) any potential user
+    // other than the creator should queue a callback on init_waiters.
+    // As soon as the shared member of this structure is fully
+    // initialized is_init will be set true and those callbacks will
+    // be invoked.
+    //
+    // Once inserted in the table this structure will never be replaced
+    // so users can capture the pointer while holding instance_mu_,
+    // drop that lock, then take a lock on out_mu before
+    // reading/modifying its values.
+    mutex in_mu;
+    bool is_init GUARDED_BY(in_mu);
+    std::vector<IRConsumer> init_waiters GUARDED_BY(in_mu);
+
+    // Values to be shared by all instances, constant after initialization.
+    mutex out_mu;
+    CollectiveParams shared GUARDED_BY(out_mu);
+    // If an error occurs during initialization this structure stays in
+    // the table with a non-OK status.  Purging the table and restarting
+    // needs to be done at a higher level.
+    Status status GUARDED_BY(out_mu);
+
+    // These fields are used to count the instances that have called
+    // in and become known while resolving broadcast source identity.
+    int source_rank GUARDED_BY(out_mu);
+    int known_count GUARDED_BY(out_mu);
+    std::vector<bool> known GUARDED_BY(out_mu);
+    std::vector<IRConsumer> known_waiters GUARDED_BY(out_mu);
+
+    InstanceRec() : is_init(false), source_rank(-1), known_count(0) {}
+  };
+
+  // Find the InstanceRec with the same instance_key as cp.  If it doesn't
+  // already exist, create and initialize from gr and cp.
+  //
+  // Precondition: *gr must be a complete GroupRec, i.e. the value set
+  // by CompleteGroupLocal. *cp must be populated with all the fields
+  // required by InitInstanceSharedParams.  Ownership of InstanceRec stays
+  // with this object and does not pass to the callback.
+  typedef std::function<void(const Status& s, InstanceRec* ir)>
+      InstanceRecCallback;
+  void FindInstanceRec(GroupRec* gr, CollectiveParams* cp,
+                       const InstanceRecCallback& done)
+      LOCKS_EXCLUDED(instance_mu_, gr->mu, group_mu_);
+
+  // Populate *ir with device membership from gr, then initialize to be specific
+  // to cp->instance_key, i.e. order the devices and tasks.
+  //
+  // Preconditions:
+  //  cp is populated with all DeviceLocalities
+  Status InitInstanceSharedParams(GroupRec* gr, const CollectiveParams* cp,
+                                  InstanceRec* ir)
+      EXCLUSIVE_LOCKS_REQUIRED(ir->out_mu) LOCKS_EXCLUDED(gr->mu);
+
+  // Establishes the final order of ir->shared.instance.device_names and
+  // ir->shared.instance.task_names by considering localities of all devices.
+  void CompleteDefaultRanking(GroupRec* gr, const CollectiveParams* cp,
+                              InstanceRec* ir,
+                              const std::vector<DeviceLocality>& localities)
+      EXCLUSIVE_LOCKS_REQUIRED(ir->out_mu);
+
+  // Finish populating *cp.
+  // Precondition: *gr has been fully populated by CompleteGroupLocal.
+  void CompleteInstanceLocal(const string& device, GroupRec* gr,
+                             CollectiveParams* cp, bool is_source,
+                             const StatusCallback& done)
+      LOCKS_EXCLUDED(instance_mu_, gr->mu, group_mu_);
+
+  // Finish populating *cp from fully initialized *ir.
+  // Precondition: *gr and *ir are fully populated.
+  void CompleteInstanceFromInitializedIRec(const string& device, GroupRec* gr,
+                                           CollectiveParams* cp,
+                                           InstanceRec* ir, bool is_source,
+                                           const StatusCallback& done)
+      LOCKS_EXCLUDED(ir->out_mu);
+
+  // Complete source data for a broadcast instance.
+  // Precondition: *cp has complete group data and default_rank.
+  void CompleteInstanceSource(InstanceRec* ir, CollectiveParams* cp,
+                              bool is_source, const IRConsumer& f)
+      LOCKS_EXCLUDED(ir->out_mu);
+
+  // If cp.device_names contains only devices local to this process
+  // populates *localities, else returns an error.
+  Status GetLocalDeviceLocalities(const CollectiveParams& cp,
+                                  std::vector<DeviceLocality>* localities);
+
+  // Sets CollTaskParams.is_local and CollectiveParams.default_rank.
+  // Precondition: cp->device_names is fully populated and in final order.
+  void CompleteTaskIsLocal(const string& task_name, CollectiveParams* cp);
+
+  // Sets cp->instance_default_rank according to location of device in
+  // current ordering of cp->instance.device_names.
+  void SetDefaultRank(const string& device, CollectiveParams* cp);
+
+  // Helper to grab status under lock, invoke callback out of lock.
+  void CallbackWithStatus(const InstanceRecCallback& done, InstanceRec* irec)
+      LOCKS_EXCLUDED(irec->out_mu);
+
+  const DeviceMgr* dev_mgr_;
+  DeviceResolverInterface* dev_resolver_;
+  string task_name_;
+  mutex group_mu_;
+  gtl::FlatMap<int32, std::unique_ptr<GroupRec>> group_table_
+      GUARDED_BY(group_mu_);
+  mutex instance_mu_;
+  gtl::FlatMap<int32, std::unique_ptr<InstanceRec>> instance_table_
+      GUARDED_BY(instance_mu_);
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMMON_RUNTIME_COLLECTIVE_PARAM_RESOLVER_LOCAL_H_
diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc b/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4e3c7125f2b319e4fc3fa227187277688977a313
--- /dev/null
+++ b/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc
@@ -0,0 +1,151 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/collective_executor_mgr.h"
+
+#include "tensorflow/core/common_runtime/collective_param_resolver_local.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/device_resolver_local.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+
+namespace tensorflow {
+namespace {
+
+#define NUM_DEVS 3
+
+class CollectiveParamResolverLocalTest : public ::testing::Test {
+ protected:
+  CollectiveParamResolverLocalTest() {
+    ConfigProto cp;
+    SessionOptions options;
+    string task_name = "/job:localhost/replica:0/task:0";
+    auto* device_count = options.config.mutable_device_count();
+    device_count->insert({"CPU", NUM_DEVS});
+    TF_CHECK_OK(DeviceFactory::AddDevices(options, task_name, &devices_));
+    device_mgr_.reset(new DeviceMgr(devices_));
+    drl_.reset(new DeviceResolverLocal(device_mgr_.get()));
+    prl_.reset(new CollectiveParamResolverLocal(device_mgr_.get(), drl_.get(),
+                                                task_name));
+  }
+
+  std::vector<Device*> devices_;
+  std::unique_ptr<DeviceMgr> device_mgr_;
+  std::unique_ptr<DeviceResolverLocal> drl_;
+  std::unique_ptr<CollectiveParamResolverLocal> prl_;
+};
+
+TEST_F(CollectiveParamResolverLocalTest, CompleteParamsReduction1Task) {
+  CollectiveParams cps[NUM_DEVS];
+  Status statuses[NUM_DEVS];
+  Notification note[NUM_DEVS];
+  for (int i = 0; i < NUM_DEVS; ++i) {
+    CollectiveParams* cp = &cps[i];
+    cp->group.group_key = 1;
+    cp->group.group_size = 3;
+    cp->group.device_type = DeviceType("CPU");
+    cp->group.num_tasks = 1;
+    cp->instance.instance_key = 7;
+    cp->instance.type = REDUCTION_COLLECTIVE;
+    cp->instance.data_type = DataType(DT_FLOAT);
+    cp->instance.shape = TensorShape({5});
+    cp->instance.device_names.push_back(
+        strings::StrCat("/job:localhost/replica:0/task:0/device:CPU:", i));
+    cp->instance.impl_details.subdiv_offsets.push_back(0);
+    cp->is_source = false;
+    Env::Default()->SchedClosure([this, i, cp, &note, &statuses]() {
+      prl_->CompleteParamsAsync(cp->instance.device_names[0], cp,
+                                nullptr /*CancellationManager*/,
+                                [this, &statuses, &note, i](const Status& s) {
+                                  statuses[i] = s;
+                                  note[i].Notify();
+                                });
+    });
+  }
+  for (int i = 0; i < NUM_DEVS; ++i) {
+    note[i].WaitForNotification();
+  }
+  for (int i = 0; i < NUM_DEVS; ++i) {
+    TF_ASSERT_OK(statuses[i]);
+    ASSERT_EQ(cps[i].instance.device_names.size(), 3);
+    for (int j = 0; j < NUM_DEVS; ++j) {
+      EXPECT_EQ(
+          strings::StrCat("/job:localhost/replica:0/task:0/device:CPU:", j),
+          cps[i].instance.device_names[j]);
+      EXPECT_TRUE(cps[i].task.is_local[j]);
+    }
+    EXPECT_EQ(cps[i].subdiv_rank[0], i);
+    EXPECT_EQ(cps[i].subdiv_source_rank.size(), 0);
+    EXPECT_FALSE(cps[i].is_source);
+    EXPECT_EQ(cps[i].default_rank, i);
+  }
+}
+
+TEST_F(CollectiveParamResolverLocalTest, CompleteParamsBroadcast1Task) {
+  CollectiveParams cps[NUM_DEVS];
+  Status statuses[NUM_DEVS];
+  Notification note[NUM_DEVS];
+  for (int i = 0; i < NUM_DEVS; ++i) {
+    CollectiveParams* cp = &cps[i];
+    cp->group.group_key = 1;
+    cp->group.group_size = 3;
+    cp->group.device_type = DeviceType("CPU");
+    cp->group.num_tasks = 1;
+    cp->instance.instance_key = 3;
+    cp->instance.type = BROADCAST_COLLECTIVE;
+    cp->instance.data_type = DataType(DT_FLOAT);
+    cp->instance.shape = TensorShape({5});
+    cp->instance.device_names.push_back(
+        strings::StrCat("/job:localhost/replica:0/task:0/device:CPU:", i));
+    cp->instance.impl_details.subdiv_offsets.push_back(0);
+    cp->is_source = (i == 1);
+    Env::Default()->SchedClosure([this, i, cp, &note, &statuses]() {
+      prl_->CompleteParamsAsync(cp->instance.device_names[0], cp,
+                                nullptr /*CancellationManager*/,
+                                [this, &statuses, &note, i](const Status& s) {
+                                  statuses[i] = s;
+                                  note[i].Notify();
+                                });
+    });
+  }
+  for (int i = 0; i < NUM_DEVS; ++i) {
+    note[i].WaitForNotification();
+  }
+  for (int i = 0; i < NUM_DEVS; ++i) {
+    TF_ASSERT_OK(statuses[i]);
+    ASSERT_EQ(cps[i].instance.device_names.size(), 3);
+    for (int j = 0; j < NUM_DEVS; ++j) {
+      EXPECT_EQ(
+          strings::StrCat("/job:localhost/replica:0/task:0/device:CPU:", j),
+          cps[i].instance.device_names[j]);
+      EXPECT_TRUE(cps[i].task.is_local[j]);
+    }
+    ASSERT_GT(cps[i].subdiv_rank.size(), 0);
+    EXPECT_EQ(cps[i].subdiv_rank[0], i);
+    ASSERT_GT(cps[i].subdiv_source_rank.size(), 0);
+    EXPECT_EQ(cps[i].subdiv_source_rank[0], 1);
+    EXPECT_EQ(cps[i].is_source, (i == 1));
+    EXPECT_EQ(cps[i].default_rank, i);
+  }
+}
+
+// TEST_F(CollectiveParamResolverLocalTest,
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/collective_rma_local.cc b/tensorflow/core/common_runtime/collective_rma_local.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ad9b32ce3514dcfb29662d781ca6f1febd406c89
--- /dev/null
+++ b/tensorflow/core/common_runtime/collective_rma_local.cc
@@ -0,0 +1,108 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/collective_rma_local.h"
+
+#include "tensorflow/core/common_runtime/copy_tensor.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
+
+namespace tensorflow {
+
+void CollectiveRemoteAccessLocal::StartAbort(const Status& s) {
+  buf_rendezvous_.StartAbort(s);
+}
+
+void CollectiveRemoteAccessLocal::RecvFromPeer(
+    const string& peer_device, const string& peer_task, bool peer_is_local,
+    const string& key, Device* to_device, DeviceContext* to_device_ctx,
+    const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor,
+    const DeviceLocality& client_locality, const StatusCallback& done) {
+  VLOG(1) << "RecvFromPeer " << this << " from " << peer_device << " key "
+          << key;
+  if (!peer_is_local) {
+    done(
+        errors::Internal("CollectiveRemoteAccessLocal::RecvFromPeer "
+                         "called with peer_is_local=false"));
+    return;
+  }
+  buf_rendezvous_.ConsumeBuf(
+      key, [this, to_tensor, to_device_ctx, to_device, to_alloc_attr, done](
+               const Status& s, BufRendezvous::Hook* hook) {
+        if (!s.ok()) {
+          done(s);
+          delete hook;
+        } else {
+          int64 recv_bytes = to_tensor->TotalBytes();
+          CHECK_EQ(recv_bytes, hook->prod_value->TotalBytes());
+          MemCpyAsync(hook->prod_ctx,    // src DeviceContext
+                      to_device_ctx,     // dst DeviceContext
+                      hook->prod_dev,    // src Device
+                      to_device,         // dst Device
+                      hook->prod_attr,   // src AllocatorAttributes
+                      to_alloc_attr,     // dst AllocatorAttributes
+                      hook->prod_value,  // src Tensor*
+                      to_tensor,         // dst Tensor*
+                      [hook, done](const Status& s) {
+                        done(s);
+                        hook->prod_cb(s);
+                        delete hook;
+                      });
+        }
+      });
+}
+
+void CollectiveRemoteAccessLocal::PostToPeer(
+    const string& peer_device, const string& peer_task, const string& key,
+    Device* from_device, DeviceContext* from_device_ctx,
+    const AllocatorAttributes& from_alloc_attr, const Tensor* from_tensor,
+    const DeviceLocality& client_locality, const StatusCallback& done) {
+  VLOG(1) << "PostToPeer " << this << " key " << key
+          << " step_id_=" << step_id_;
+  buf_rendezvous_.ProvideBuf(key, from_device, from_device_ctx, from_tensor,
+                             from_alloc_attr, done);
+}
+
+/*static*/
+void CollectiveRemoteAccessLocal::MemCpyAsync(
+    DeviceContext* src_dev_ctx, DeviceContext* dst_dev_ctx, Device* src_dev,
+    Device* dst_dev, const AllocatorAttributes& src_attr,
+    const AllocatorAttributes& dst_attr, const Tensor* src, Tensor* dst,
+    const StatusCallback& done) {
+  // We want a real copy to happen, i.e. the bytes inside of src should be
+  // transferred to the buffer backing dst.  If src and dst are on different
+  // devices then CopyTensor::ViaDMA will do just that.  But if they're both
+  // the same CPU, then it will actually just reset dst to point to src.
+  // Since this routine is used for copying between devices and within a
+  // device, we need to detect and bypass the wrong-semantics case.
+  const DeviceType src_device_type(
+      src_attr.on_host() ? DEVICE_CPU : src_dev->attributes().device_type());
+  const DeviceType dst_device_type(
+      dst_attr.on_host() ? DEVICE_CPU : dst_dev->attributes().device_type());
+  const bool non_cpu_src = src_device_type != DeviceType(DEVICE_CPU);
+  const bool non_cpu_dst = dst_device_type != DeviceType(DEVICE_CPU);
+  if (non_cpu_src) CHECK(src_dev_ctx);
+  if (non_cpu_dst) CHECK(dst_dev_ctx);
+  if (non_cpu_src || non_cpu_dst) {
+    CopyTensor::ViaDMA("",  // edge name (non-existent)
+                       src_dev_ctx, dst_dev_ctx, src_dev, dst_dev, src_attr,
+                       dst_attr, src, dst, done);
+  } else {
+    int64 bytes = src->TotalBytes();
+    DCHECK_EQ(dst->TotalBytes(), bytes);
+    memcpy(DMAHelper::base(dst), DMAHelper::base(src), bytes);
+    done(Status::OK());
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/collective_rma_local.h b/tensorflow/core/common_runtime/collective_rma_local.h
new file mode 100644
index 0000000000000000000000000000000000000000..d25dd5f04acb4814931f33bc8fa710ebdc215d68
--- /dev/null
+++ b/tensorflow/core/common_runtime/collective_rma_local.h
@@ -0,0 +1,88 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_COMMON_RUNTIME_COLLECTIVE_RMA_LOCAL_ACCESS_H_
+#define TENSORFLOW_COMMON_RUNTIME_COLLECTIVE_RMA_LOCAL_ACCESS_H_
+#include "tensorflow/core/common_runtime/buf_rendezvous.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/framework/rendezvous.h"
+
+namespace tensorflow {
+
+// Basic implementation of PerStepCollectiveRemoteAccess.
+class CollectiveRemoteAccessLocal : public PerStepCollectiveRemoteAccess {
+ public:
+  CollectiveRemoteAccessLocal(const DeviceMgr* dev_mgr,
+                              DeviceResolverInterface* dev_resolver,
+                              int64 step_id)
+      : dev_mgr_(dev_mgr),
+        dev_resolver_(dev_resolver),
+        buf_rendezvous_(step_id),
+        step_id_(step_id) {}
+
+  virtual ~CollectiveRemoteAccessLocal() {}
+
+  void StartAbort(const Status& s);
+
+  void RecvFromPeer(const string& peer_device, const string& peer_task,
+                    bool peer_is_local, const string& key, Device* to_device,
+                    DeviceContext* to_device_ctx,
+                    const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor,
+                    const DeviceLocality& client_locality,
+                    const StatusCallback& done) override;
+
+  void PostToPeer(const string& peer_device, const string& peer_task,
+                  const string& key, Device* from_device,
+                  DeviceContext* from_device_ctx,
+                  const AllocatorAttributes& from_alloc_attr,
+                  const Tensor* from_tensor,
+                  const DeviceLocality& client_locality,
+                  const StatusCallback& done) override;
+
+  void GetDeviceLocalitiesAsync(const CollInstanceParams& ci_params,
+                                std::vector<DeviceLocality>* localities,
+                                const StatusCallback& done) override {
+    dev_resolver_->GetDeviceLocalitiesAsync(ci_params, localities, done);
+  }
+
+  void GetLocalityAsync(const string& device, const string& task,
+                        DeviceLocality* locality,
+                        const StatusCallback& done) override {
+    dev_resolver_->GetLocalityAsync(device, task, locality, done);
+  }
+
+  void ClearTask(const string& task) override {
+    dev_resolver_->ClearTask(task);
+  }
+
+  // Copy utility that always copies bytes from src to dst even if
+  // they are on the same device, unlike CopyTensor::ViaDMA which will
+  // just change the dst buffer pointer in that case.
+  static void MemCpyAsync(DeviceContext* src_dev_ctx,
+                          DeviceContext* dst_dev_ctx, Device* src_dev,
+                          Device* dst_dev, const AllocatorAttributes& src_attr,
+                          const AllocatorAttributes& dst_attr,
+                          const Tensor* src, Tensor* dst,
+                          const StatusCallback& done);
+
+ protected:
+  const DeviceMgr* dev_mgr_;               // not owned
+  DeviceResolverInterface* dev_resolver_;  // not owned
+  BufRendezvous buf_rendezvous_;
+  int64 step_id_;
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_COMMON_RUNTIME_COLLECTIVE_RMA_LOCAL_ACCESS_H_
diff --git a/tensorflow/core/common_runtime/collective_rma_local_test.cc b/tensorflow/core/common_runtime/collective_rma_local_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..dcd4272d96b5f855660509bf69de4585128f836c
--- /dev/null
+++ b/tensorflow/core/common_runtime/collective_rma_local_test.cc
@@ -0,0 +1,148 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/collective_rma_local.h"
+
+#include "tensorflow/core/common_runtime/buf_rendezvous.h"
+#include "tensorflow/core/common_runtime/collective_param_resolver_local.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/device_resolver_local.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+
+namespace tensorflow {
+namespace {
+
+#define NUM_DEVS 3
+static const int kStepId = 123;
+
+class CollectiveRemoteAccessLocalTest : public ::testing::Test {
+ protected:
+  const string kTaskName = "/job:localhost/replica:0/task:0";
+
+  CollectiveRemoteAccessLocalTest() {
+    ConfigProto cp;
+    SessionOptions options;
+    auto* device_count = options.config.mutable_device_count();
+    device_count->insert({"CPU", NUM_DEVS});
+    TF_CHECK_OK(DeviceFactory::AddDevices(options, kTaskName, &devices_));
+    device_mgr_.reset(new DeviceMgr(devices_));
+    drl_.reset(new DeviceResolverLocal(device_mgr_.get()));
+    prl_.reset(new CollectiveParamResolverLocal(device_mgr_.get(), drl_.get(),
+                                                kTaskName));
+    rma_.reset(new CollectiveRemoteAccessLocal(device_mgr_.get(), drl_.get(),
+                                               kStepId));
+  }
+
+  std::vector<Device*> devices_;
+  std::unique_ptr<DeviceMgr> device_mgr_;
+  std::unique_ptr<DeviceResolverLocal> drl_;
+  std::unique_ptr<CollectiveParamResolverLocal> prl_;
+  std::unique_ptr<CollectiveRemoteAccessLocal> rma_;
+};
+
+TEST_F(CollectiveRemoteAccessLocalTest, PostRecvCPU0) {
+  Device* cpu0 = nullptr;
+  AllocatorAttributes attr;
+  DeviceLocality dev_locality;
+  TF_ASSERT_OK(device_mgr_->LookupDevice(kTaskName + "/device:CPU:0", &cpu0));
+  Tensor sink_tensor(DT_FLOAT, TensorShape({8}));
+  Notification recv_note;
+  Status recv_status;
+  rma_->RecvFromPeer(kTaskName + "/device:CPU:0", kTaskName, true /*is_local*/,
+                     "key_0", cpu0 /*to_device*/, nullptr /*to_device_ctx*/,
+                     attr /*to_alloc_attr*/, &sink_tensor, dev_locality,
+                     [this, &recv_note, &recv_status](const Status& s) {
+                       recv_status = s;
+                       recv_note.Notify();
+                     });
+  Tensor source_tensor(DT_FLOAT, TensorShape({8}));
+  for (int i = 0; i < 8; ++i) {
+    source_tensor.flat<float>()(i) = i / 2;
+  }
+  // Tensors have distinct storage.
+  EXPECT_NE(DMAHelper::base(&source_tensor), DMAHelper::base(&sink_tensor));
+  Notification send_note;
+  Status send_status;
+  rma_->PostToPeer(kTaskName + "/device:CPU:0", kTaskName, "key_0",
+                   cpu0 /*from_device*/, nullptr /*from_device_ctx*/,
+                   attr /*to_alloc_attr*/, &source_tensor, dev_locality,
+                   [this, &send_note, &send_status](const Status& s) {
+                     send_status = s;
+                     send_note.Notify();
+                   });
+  recv_note.WaitForNotification();
+  send_note.WaitForNotification();
+  TF_EXPECT_OK(recv_status);
+  TF_EXPECT_OK(send_status);
+  // Sink tensor gets the source tensor values.
+  for (int i = 0; i < 8; ++i) {
+    EXPECT_EQ(sink_tensor.flat<float>()(i), i / 2);
+  }
+  // And still has distinct storage.
+  EXPECT_NE(DMAHelper::base(&source_tensor), DMAHelper::base(&sink_tensor));
+}
+
+TEST_F(CollectiveRemoteAccessLocalTest, PostRecvCPU1_2) {
+  Device* cpu2 = nullptr;
+  AllocatorAttributes attr;
+  DeviceLocality dev_locality;
+  TF_ASSERT_OK(device_mgr_->LookupDevice(kTaskName + "/device:CPU:2", &cpu2));
+  Tensor sink_tensor(DT_FLOAT, TensorShape({8}));
+  Notification recv_note;
+  Status recv_status;
+  rma_->RecvFromPeer(kTaskName + "/device:CPU:1", kTaskName, true /*is_local*/,
+                     "key_0", cpu2 /*to_device*/, nullptr /*to_device_ctx*/,
+                     attr /*to_alloc_attr*/, &sink_tensor, dev_locality,
+                     [this, &recv_note, &recv_status](const Status& s) {
+                       recv_status = s;
+                       recv_note.Notify();
+                     });
+  Tensor source_tensor(DT_FLOAT, TensorShape({8}));
+  for (int i = 0; i < 8; ++i) {
+    source_tensor.flat<float>()(i) = i / 2;
+  }
+  // Tensors have distinct storage.
+  EXPECT_NE(DMAHelper::base(&source_tensor), DMAHelper::base(&sink_tensor));
+  Device* cpu1 = nullptr;
+  TF_ASSERT_OK(device_mgr_->LookupDevice(kTaskName + "/device:CPU:1", &cpu1));
+  Notification send_note;
+  Status send_status;
+  rma_->PostToPeer(kTaskName + "/device:CPU:2", kTaskName, "key_0",
+                   cpu1 /*from_device*/, nullptr /*from_device_ctx*/,
+                   attr /*to_alloc_attr*/, &source_tensor, dev_locality,
+                   [this, &send_note, &send_status](const Status& s) {
+                     send_status = s;
+                     send_note.Notify();
+                   });
+  recv_note.WaitForNotification();
+  send_note.WaitForNotification();
+  TF_EXPECT_OK(recv_status);
+  TF_EXPECT_OK(send_status);
+  // Sink tensor gets the source tensor values.
+  for (int i = 0; i < 8; ++i) {
+    EXPECT_EQ(sink_tensor.flat<float>()(i), i / 2);
+  }
+  // And still has distinct storage.
+  EXPECT_NE(DMAHelper::base(&source_tensor), DMAHelper::base(&sink_tensor));
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/constant_folding_test.cc b/tensorflow/core/common_runtime/constant_folding_test.cc
index 6ac9319ad1e2c4953c2d82257dac6a3aeeffcd5c..16b61315f29322565492da8c168c6fbc89d6daf1 100644
--- a/tensorflow/core/common_runtime/constant_folding_test.cc
+++ b/tensorflow/core/common_runtime/constant_folding_test.cc
@@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/null_file_system.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session_options.h"
 
diff --git a/tensorflow/core/common_runtime/device_resolver_local.cc b/tensorflow/core/common_runtime/device_resolver_local.cc
new file mode 100644
index 0000000000000000000000000000000000000000..17ef4a228449560c619b51dabbac0e67ec1d4db8
--- /dev/null
+++ b/tensorflow/core/common_runtime/device_resolver_local.cc
@@ -0,0 +1,49 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/device_resolver_local.h"
+
+#include "tensorflow/core/common_runtime/device_mgr.h"
+
+namespace tensorflow {
+
+void DeviceResolverLocal::GetDeviceLocalitiesAsync(
+    const CollInstanceParams& ci_params,
+    std::vector<DeviceLocality>* localities, const StatusCallback& done) {
+  localities->clear();
+  for (const string& device_name : ci_params.device_names) {
+    Device* dev;
+    Status s = dev_mgr_->LookupDevice(device_name, &dev);
+    if (!s.ok()) {
+      done(s);
+      return;
+    }
+    localities->push_back(dev->attributes().locality());
+  }
+  done(Status::OK());
+}
+
+void DeviceResolverLocal::GetLocalityAsync(const string& device,
+                                           const string& task,
+                                           DeviceLocality* locality,
+                                           const StatusCallback& done) {
+  Device* dev;
+  Status s = dev_mgr_->LookupDevice(device, &dev);
+  if (s.ok()) {
+    *locality = dev->attributes().locality();
+  }
+  done(s);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/device_resolver_local.h b/tensorflow/core/common_runtime/device_resolver_local.h
new file mode 100644
index 0000000000000000000000000000000000000000..098eccdf842ea754c445e9cb83a2b270ec82e386
--- /dev/null
+++ b/tensorflow/core/common_runtime/device_resolver_local.h
@@ -0,0 +1,48 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_COMMON_RUNTIME_DEVICE_RESOLVER_LOCAL_H_
+#define TENSORFLOW_COMMON_RUNTIME_DEVICE_RESOLVER_LOCAL_H_
+
+#include <string>
+
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/framework/device_attributes.pb.h"
+
+namespace tensorflow {
+class DeviceMgr;
+
+// Implements DeviceResolverInterface in a single-task context.
+class DeviceResolverLocal : public DeviceResolverInterface {
+ public:
+  DeviceResolverLocal(const DeviceMgr* dev_mgr) : dev_mgr_(dev_mgr) {}
+
+  virtual ~DeviceResolverLocal() {}
+
+  void GetDeviceLocalitiesAsync(const CollInstanceParams& ci_params,
+                                std::vector<DeviceLocality>* localities,
+                                const StatusCallback& done) override;
+
+  void GetLocalityAsync(const string& device, const string& task,
+                        DeviceLocality* locality,
+                        const StatusCallback& done) override;
+
+  void ClearTask(const string& task) override {}
+
+ protected:
+  const DeviceMgr* dev_mgr_;
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_COMMON_RUNTIME_DEVICE_RESOLVER_LOCAL_H_
diff --git a/tensorflow/core/common_runtime/device_resolver_local_test.cc b/tensorflow/core/common_runtime/device_resolver_local_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f5a6471ff731578d377ccfc9ad146847ae3f221c
--- /dev/null
+++ b/tensorflow/core/common_runtime/device_resolver_local_test.cc
@@ -0,0 +1,87 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/device_resolver_local.h"
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+
+namespace tensorflow {
+namespace {
+
+#define NUM_DEVS 3
+
+class DeviceResolverLocalTest : public ::testing::Test {
+ protected:
+  DeviceResolverLocalTest() {
+    ConfigProto cp;
+    SessionOptions options;
+    string task_name = "/job:localhost/replica:0/task:0";
+    auto* device_count = options.config.mutable_device_count();
+    device_count->insert({"CPU", NUM_DEVS});
+    TF_CHECK_OK(DeviceFactory::AddDevices(options, task_name, &devices_));
+    device_mgr_.reset(new DeviceMgr(devices_));
+    drl_.reset(new DeviceResolverLocal(device_mgr_.get()));
+  }
+
+  std::vector<Device*> devices_;
+  std::unique_ptr<DeviceMgr> device_mgr_;
+  std::unique_ptr<DeviceResolverLocal> drl_;
+};
+
+TEST_F(DeviceResolverLocalTest, GetDeviceLocalitiesKnown) {
+  CollectiveParams cp;
+  std::vector<DeviceLocality> localities;
+  cp.instance.device_names.push_back(
+      "/job:localhost/replica:0/task:0/device:CPU:1");
+  cp.instance.device_names.push_back(
+      "/job:localhost/replica:0/task:0/device:CPU:2");
+  Notification note;
+  Status status;
+  drl_->GetDeviceLocalitiesAsync(cp.instance, &localities,
+                                 [this, &note, &status](const Status& s) {
+                                   status = s;
+                                   note.Notify();
+                                 });
+  note.WaitForNotification();
+  TF_EXPECT_OK(status);
+  EXPECT_EQ(2, localities.size());
+}
+
+TEST_F(DeviceResolverLocalTest, GetDeviceLocalitiesUnknown) {
+  CollectiveParams cp;
+  std::vector<DeviceLocality> localities;
+  // In some builds there may be 1 GPU, but there should never be 9.
+  cp.instance.device_names.push_back(
+      "/job:localhost/replica:0/task:0/device:GPU:9");
+  Notification note;
+  Status status;
+  drl_->GetDeviceLocalitiesAsync(cp.instance, &localities,
+                                 [this, &note, &status](const Status& s) {
+                                   status = s;
+                                   note.Notify();
+                                 });
+  note.WaitForNotification();
+  EXPECT_FALSE(status.ok());
+  EXPECT_EQ(0, localities.size());
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc
index 25cfb9e524cd12c92fc5edb01f0d4bed64fb872f..0479061daffb5b73d21f351a6ae3dd1108f21888 100644
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/graph_optimizer.h"
 #include "tensorflow/core/common_runtime/memory_types.h"
 #include "tensorflow/core/common_runtime/optimization_registry.h"
+#include "tensorflow/core/common_runtime/process_util.h"
 #include "tensorflow/core/common_runtime/step_stats_collector.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/graph.pb_text.h"
@@ -69,20 +70,6 @@ auto* direct_session_runs = monitoring::Counter<0>::New(
     "/tensorflow/core/direct_session_runs",
     "The number of times DirectSession::Run() has been called.");
 
-int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) {
-  const int32 t = options.config.inter_op_parallelism_threads();
-  if (t != 0) return t;
-  // Default to using the number of cores available in the process.
-  return port::NumSchedulableCPUs();
-}
-
-thread::ThreadPool* NewThreadPoolFromSessionOptions(
-    const SessionOptions& options) {
-  const int32 num_threads = NumInterOpThreadsFromSessionOptions(options);
-  VLOG(1) << "Direct session inter op parallelism threads: " << num_threads;
-  return new thread::ThreadPool(options.env, "Compute", num_threads);
-}
-
 Status NewThreadPoolFromThreadPoolOptions(
     const SessionOptions& options,
     const ThreadPoolOptionProto& thread_pool_options, int pool_number,
diff --git a/tensorflow/core/common_runtime/direct_session_test.cc b/tensorflow/core/common_runtime/direct_session_test.cc
index ee3896061858bd65d03171b97cae0ec850f82ad9..f95cecfc66785b9aa89d95bdbb916c36ef167f71 100644
--- a/tensorflow/core/common_runtime/direct_session_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_test.cc
@@ -39,6 +39,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
@@ -155,22 +156,22 @@ TEST_F(DirectSessionMinusAXTest, RunSimpleNetwork_Callable) {
 
     Status s = session->RunCallable(handle, {}, nullptr, nullptr);
     EXPECT_TRUE(errors::IsInvalidArgument(s));
-    EXPECT_TRUE(StringPiece(s.error_message())
-                    .contains("`fetch_tensors` must be provided"));
+    EXPECT_TRUE(str_util::StrContains(s.error_message(),
+                                      "`fetch_tensors` must be provided"));
 
     TF_ASSERT_OK(session->ReleaseCallable(handle));
 
     std::vector<Tensor> outputs;
     s = session->RunCallable(handle, {}, &outputs, nullptr);
     EXPECT_TRUE(errors::IsInvalidArgument(s));
-    EXPECT_TRUE(
-        StringPiece(s.error_message())
-            .contains("Attempted to run callable after handle was released"));
+    EXPECT_TRUE(str_util::StrContains(
+        s.error_message(),
+        "Attempted to run callable after handle was released"));
 
     s = session->RunCallable(handle + 1, {}, &outputs, nullptr);
     EXPECT_TRUE(errors::IsInvalidArgument(s));
     EXPECT_TRUE(
-        StringPiece(s.error_message()).contains("No such callable handle"));
+        str_util::StrContains(s.error_message(), "No such callable handle"));
   }
 }
 
@@ -567,7 +568,7 @@ TEST(DirectSessionTest, MultipleFeedTest) {
       {first_identity->name() + ":0", second_identity->name() + ":0"}, {},
       &outputs);
   EXPECT_TRUE(errors::IsInvalidArgument(s));
-  EXPECT_TRUE(StringPiece(s.error_message()).contains("fed more than once"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), "fed more than once"));
 }
 
 TEST(DirectSessionTest, MultipleFeedTest_Callable) {
@@ -650,7 +651,7 @@ TEST(DirectSessionTest, MultipleFeedTest_Callable) {
           {first_identity->name() + ":0", second_identity->name() + ":0"}, {}),
       &handle);
   EXPECT_TRUE(errors::IsInvalidArgument(s));
-  EXPECT_TRUE(StringPiece(s.error_message()).contains("fed more than once"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), "fed more than once"));
 }
 
 TEST(DirectSessionTest, FetchMultipleTimes) {
@@ -845,8 +846,8 @@ TEST(DirectSessionTest, PartialRunMissingFeed) {
   s = session->PRun(handle, {{first_const->name(), value_11}},
                     {third_identity->name() + ":0"}, &outputs);
   ASSERT_TRUE(errors::IsInvalidArgument(s));
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("can't be computed from the feeds"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(),
+                                    "can't be computed from the feeds"));
 }
 
 TEST(DirectSessionTest, PartialRunMultiOutputFeed) {
@@ -875,8 +876,8 @@ TEST(DirectSessionTest, PartialRunMultiOutputFeed) {
   // Fetch fourth_identity without feeds.
   s = session->PRun(handle, {}, {fourth_identity->name() + ":0"}, &outputs);
   ASSERT_TRUE(errors::IsInvalidArgument(s));
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("can't be computed from the feeds"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(),
+                                    "can't be computed from the feeds"));
 
   // Feed switch_node:1 and fetch fourth_identity.
   s = session->PRun(handle, {{switch_node->name() + ":1", bool_value}},
diff --git a/tensorflow/core/common_runtime/eager/BUILD b/tensorflow/core/common_runtime/eager/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..941a0e61c75cbf807636a5feed65339bffef4338
--- /dev/null
+++ b/tensorflow/core/common_runtime/eager/BUILD
@@ -0,0 +1,168 @@
+package(
+    default_visibility = [
+        "//tensorflow:internal",
+        "//tensorflow_models:__subpackages__",
+    ],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cc_test",
+    "tf_cuda_library",
+)
+
+tf_cuda_library(
+    name = "eager_executor",
+    srcs = [
+        "eager_executor.cc",
+    ],
+    hdrs = [
+        "eager_executor.h",
+    ],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/core:core_cpu_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+tf_cuda_library(
+    name = "context",
+    srcs = [
+        "context.cc",
+    ],
+    hdrs = [
+        "context.h",
+    ],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":eager_executor",
+        ":kernel_and_device",
+        "//tensorflow/core:core_cpu_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:session_options",
+    ],
+)
+
+tf_cuda_library(
+    name = "tensor_handle",
+    srcs = [
+        "tensor_handle.cc",
+    ],
+    hdrs = [
+        "tensor_handle.h",
+    ],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":context",
+        ":eager_executor",
+        ":kernel_and_device",
+        "//tensorflow/core:core_cpu_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:session_options",
+    ],
+)
+
+tf_cuda_library(
+    name = "copy_to_device_node",
+    hdrs = [
+        "copy_to_device_node.h",
+    ],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":context",
+        ":eager_executor",
+        ":tensor_handle",
+        "//tensorflow/core:core_cpu_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:session_options",
+    ],
+)
+
+tf_cuda_library(
+    name = "kernel_and_device",
+    srcs = [
+        "kernel_and_device.cc",
+    ],
+    hdrs = [
+        "kernel_and_device.h",
+    ],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/core:core_cpu_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+tf_cc_test(
+    name = "kernel_and_device_test",
+    srcs = ["kernel_and_device_test.cc"],
+    deps = [
+        ":kernel_and_device",
+        "//tensorflow/c/eager:runtime",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:client_session",
+        "//tensorflow/cc:ops",
+        "//tensorflow/cc:scope",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+cc_library(
+    name = "execute",
+    srcs = ["execute.cc"],
+    hdrs = ["execute.h"],
+    deps = [
+        ":context",
+        ":copy_to_device_node",
+        ":kernel_and_device",
+        ":tensor_handle",
+        "//tensorflow/core:core_cpu_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+cc_library(
+    name = "execute_node",
+    hdrs = ["execute_node.h"],
+    deps = [
+        ":context",
+        ":eager_executor",
+        ":execute",
+        ":kernel_and_device",
+        ":tensor_handle",
+        "//tensorflow/core:core_cpu_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
diff --git a/tensorflow/core/common_runtime/eager/context.cc b/tensorflow/core/common_runtime/eager/context.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d3fe6a7edeabecdaba0d894cca700608b896026d
--- /dev/null
+++ b/tensorflow/core/common_runtime/eager/context.cc
@@ -0,0 +1,143 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/eager/context.h"
+
+#include "tensorflow/core/common_runtime/process_util.h"
+
+namespace tensorflow {
+
+EagerContext::EagerContext(const SessionOptions& opts,
+                           ContextDevicePlacementPolicy default_policy,
+                           bool async, std::unique_ptr<DeviceMgr> device_mgr,
+                           Rendezvous* rendezvous)
+    : policy_(default_policy),
+      device_manager_(std::move(device_mgr)),
+      devices_(device_manager_->ListDevices()),
+      rendezvous_(rendezvous),
+      thread_pool_(NewThreadPoolFromSessionOptions(opts)),
+      pflr_(new ProcessFunctionLibraryRuntime(
+          device_manager_.get(), opts.env, TF_GRAPH_DEF_VERSION, &func_lib_def_,
+          {}, thread_pool_.get())),
+      log_device_placement_(opts.config.log_device_placement()),
+      async_default_(async) {
+  if (async_default_) {
+    executor_.EnableAsync();
+  }
+
+  for (auto* device : devices_) {
+    devices_map_[device->name()] = device;
+  }
+}
+
+bool EagerContext::Async() const {
+  mutex_lock l(async_map_mu_);
+  return gtl::FindWithDefault(thread_local_async_, std::this_thread::get_id(),
+                              async_default_);
+}
+
+Status EagerContext::SetAsyncForThread(bool async) {
+  {
+    tensorflow::mutex_lock l(async_map_mu_);
+    thread_local_async_[std::this_thread::get_id()] = async;
+  }
+  if (async) {
+    executor_.EnableAsync();
+  } else {
+    // TODO(agarwal): Currently we add a wait here to handle cases where a
+    // sync op has a control dependency on an async op, and the latter has not
+    // executed yet. This wait can be removed by storing all the control
+    // inputs and waiting for them when executing ops.
+    return executor_.WaitForAllPendingNodes();
+  }
+  return Status::OK();
+}
+
+void EagerContext::ClearCaches() {
+  mutex_lock ml(cache_mu_);
+  gtl::STLDeleteValues(&kernel_cache_);
+}
+
+void EagerContext::SetThreadLocalDevicePlacementPolicy(
+    ContextDevicePlacementPolicy policy) {
+  mutex_lock ml(policy_map_mu_);
+  thread_local_policies_[std::this_thread::get_id()] = policy;
+}
+
+ContextDevicePlacementPolicy EagerContext::GetDevicePlacementPolicy() {
+  mutex_lock ml(policy_map_mu_);
+  auto policy_map_it = thread_local_policies_.find(std::this_thread::get_id());
+  if (policy_map_it != thread_local_policies_.end()) {
+    return policy_map_it->second;
+  }
+  return policy_;
+}
+
+EagerContext::~EagerContext() {
+  executor_.WaitForAllPendingNodes().IgnoreError();
+  ClearCaches();
+  rendezvous_->Unref();
+}
+
+bool EagerContext::FindFunctionByName(const string& name) {
+  mutex_lock l(functions_mu_);
+  return func_lib_def_.Find(name) != nullptr;
+}
+
+Status EagerContext::FindFunctionOpData(
+    const string& name, const tensorflow::OpRegistrationData** op_data) {
+  mutex_lock l(functions_mu_);
+  return func_lib_def_.LookUp(name, op_data);
+}
+
+const FunctionDef* EagerContext::FindFunctionDef(const string& name) {
+  mutex_lock l(functions_mu_);
+  return func_lib_def_.Find(name);
+}
+
+Status EagerContext::FindDeviceByName(const string& name, Device** result) {
+  auto it = devices_map_.find(name);
+  if (it == devices_map_.end()) {
+    return errors::InvalidArgument(name, " unknown device.");
+  }
+  *result = it->second;
+  return Status::OK();
+}
+
+Status EagerContext::AddFunctionDef(const FunctionDef& fdef) {
+  mutex_lock l(functions_mu_);
+  return func_lib_def_.AddFunctionDef(fdef);
+}
+
+KernelAndDevice* EagerContext::GetCachedKernel(Fprint128 cache_key) {
+  tf_shared_lock l(cache_mu_);
+  return gtl::FindPtrOrNull(kernel_cache_, cache_key);
+}
+
+void EagerContext::AddKernelToCache(Fprint128 cache_key,
+                                    KernelAndDevice* kernel) {
+  mutex_lock ml(cache_mu_);
+  gtl::InsertOrUpdate(&kernel_cache_, cache_key, kernel);
+}
+
+void EagerContext::SetShouldStoreMetadata(bool value) {
+  should_store_metadata_.store(value);
+  if (!value) {
+    mutex_lock ml(metadata_mu_);
+    run_metadata_.Clear();
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/context.h b/tensorflow/core/common_runtime/eager/context.h
new file mode 100644
index 0000000000000000000000000000000000000000..6665df27d09a73d4c30756cf01e383834fcae339
--- /dev/null
+++ b/tensorflow/core/common_runtime/eager/context.h
@@ -0,0 +1,193 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_CONTEXT_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_CONTEXT_H_
+
+#include <algorithm>
+#include <cstddef>
+#include <map>
+#include <memory>
+#include <queue>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/eager/eager_executor.h"
+#include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/rendezvous_mgr.h"
+#include "tensorflow/core/framework/rendezvous.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/lib/gtl/stl_util.h"
+#include "tensorflow/core/platform/fingerprint.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/public/version.h"
+
+namespace tensorflow {
+
+// Note: there's a copy enum in eager/c_api.h. It should be kept in sync.
+enum ContextDevicePlacementPolicy {
+  // Running operations with input tensors on the wrong device will fail.
+  DEVICE_PLACEMENT_EXPLICIT = 0,
+  // Copy the tensor to the right device but log a warning.
+  DEVICE_PLACEMENT_WARN = 1,
+  // Silently copy the tensor, which has a performance cost since the operation
+  // will be blocked till the copy completes. This is the default policy.
+  DEVICE_PLACEMENT_SILENT = 2,
+  // Default placement policy which silently copies int32 tensors but not other
+  // dtypes.
+  DEVICE_PLACEMENT_SILENT_FOR_INT32 = 3,
+};
+
+class EagerContext {
+ public:
+  explicit EagerContext(const SessionOptions& opts,
+                        ContextDevicePlacementPolicy default_policy, bool async,
+                        std::unique_ptr<DeviceMgr> device_mgr,
+                        Rendezvous* rendezvous);
+
+  ~EagerContext();
+
+  // Returns the function library runtime for the given device.
+  FunctionLibraryRuntime* func_lib(Device* d) const {
+    return pflr_->GetFLR(d->name());
+  }
+
+  // True if running in asynchronous mode.
+  bool Async() const;
+
+  EagerExecutor* Executor() { return &executor_; }
+
+  // Sets whether this thread should run in synchronous or asynchronous mode.
+  Status SetAsyncForThread(bool async);
+
+  // TODO(apassos) make this return a constant reference
+  gtl::FlatMap<string, Device*, StringPieceHasher>* device_map() {
+    return &devices_map_;
+  }
+
+  // TODO(apassos) make this return a constant reference
+  std::vector<Device*>* devices() { return &devices_; }
+
+  // Clears the kernel caches.
+  void ClearCaches();
+
+  // Sets the device placement policy for the current thread.
+  void SetThreadLocalDevicePlacementPolicy(ContextDevicePlacementPolicy policy);
+
+  // Returns the device placement policy for the current thread.
+  ContextDevicePlacementPolicy GetDevicePlacementPolicy();
+
+  Status AsyncWait() { return executor_.WaitForAllPendingNodes(); }
+
+  Status GetStatus() { return executor_.status(); }
+
+  void ClearAsyncError() { executor_.ClearError(); }
+
+  bool FindFunctionByName(const string& name);
+
+  Status FindFunctionOpData(const string& name,
+                            const tensorflow::OpRegistrationData** op_data);
+
+  const FunctionDef* FindFunctionDef(const string& name);
+
+  Status FindDeviceByName(const string& name, Device** result);
+
+  Device* HostCPU() { return devices_[0]; }
+
+  uint64 NextId() { return executor_.NextId(); }
+
+  void ExecutorAdd(EagerNode* node) { executor_.Add(node); }
+
+  Status AddFunctionDef(const FunctionDef& fdef);
+
+  KernelAndDevice* GetCachedKernel(Fprint128 cache_key);
+
+  void AddKernelToCache(Fprint128 cache_key, KernelAndDevice* kernel);
+
+  bool LogDevicePlacement() { return log_device_placement_; }
+
+  Rendezvous* GetRendezvous() { return rendezvous_; }
+
+  mutex* FunctionsMu() { return &functions_mu_; }
+
+  tensorflow::DeviceMgr* device_mgr() { return device_manager_.get(); }
+
+  // TODO(apassos) remove the need for this
+  void ReleaseDeviceMgr() { device_manager_.release(); }
+
+  // TODO(apassos) clean up RunMetadata storage.
+  mutex* MetadataMu() { return &metadata_mu_; }
+  bool ShouldStoreMetadata() { return should_store_metadata_.load(); }
+  void SetShouldStoreMetadata(bool value);
+  RunMetadata* RunMetadataProto() { return &run_metadata_; }
+
+  FunctionLibraryDefinition* FuncLibDef() { return &func_lib_def_; }
+
+ private:
+  const ContextDevicePlacementPolicy policy_;
+
+  // Note: we cannot use C++11 thread_local here as there is no concept of a
+  // thread-local-object-local variable in C++11.
+  mutex policy_map_mu_;
+  std::unordered_map<std::thread::id, ContextDevicePlacementPolicy>
+      thread_local_policies_ GUARDED_BY(policy_map_mu_);
+
+  std::unique_ptr<DeviceMgr> device_manager_;
+  // Devices owned by device_manager
+  std::vector<Device*> devices_;
+  // All devices are not owned.
+  gtl::FlatMap<string, Device*, StringPieceHasher> devices_map_;
+  Rendezvous* const rendezvous_;
+
+  mutex functions_mu_;
+  FunctionLibraryDefinition func_lib_def_ GUARDED_BY(functions_mu_){
+      OpRegistry::Global(), {}};
+
+  std::unique_ptr<thread::ThreadPool> thread_pool_;
+
+  // One FunctionLibraryRuntime per device.
+  // func_libs[i] is the FunctionLibraryRuntime corresponding to
+  // session->devices[i].
+  const std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_;
+
+  mutex cache_mu_;
+  std::unordered_map<Fprint128, KernelAndDevice*, Fprint128Hasher> kernel_cache_
+      GUARDED_BY(cache_mu_);
+
+  // Whether we should compute RunMetadata.
+  std::atomic<bool> should_store_metadata_{false};
+  mutex metadata_mu_;
+  RunMetadata run_metadata_ GUARDED_BY(metadata_mu_);
+  const bool log_device_placement_;
+  // EagerExecutor for async execution.
+  EagerExecutor executor_;
+
+  // True if the default value for execution mode is async. Note that this value
+  // can be overridden per thread based on `thread_local_async` overrides.
+  const bool async_default_;
+  mutable mutex async_map_mu_;
+  std::unordered_map<std::thread::id, bool> thread_local_async_
+      GUARDED_BY(async_map_mu_);
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_CONTEXT_H_
diff --git a/tensorflow/core/common_runtime/eager/copy_to_device_node.h b/tensorflow/core/common_runtime/eager/copy_to_device_node.h
new file mode 100644
index 0000000000000000000000000000000000000000..8a887540b066055fc1f59e64e0cead9f2512178e
--- /dev/null
+++ b/tensorflow/core/common_runtime/eager/copy_to_device_node.h
@@ -0,0 +1,69 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_COPY_TO_DEVICE_NODE_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_COPY_TO_DEVICE_NODE_H_
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/eager/eager_executor.h"
+#include "tensorflow/core/common_runtime/eager/tensor_handle.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+class CopyToDeviceNode : public EagerNode {
+ public:
+  CopyToDeviceNode(TensorHandle* src, Device* dstd, EagerContext* ctx)
+      : EagerNode(ctx->NextId()),
+        src_(src),
+        dstd_(dstd),
+        ctx_(ctx),
+        dst_(new TensorHandle(id, src_->dtype, ctx)) {
+    src_->Ref();
+    dst_->Ref();
+  }
+
+  ~CopyToDeviceNode() override {
+    src_->Unref();
+    dst_->Unref();
+  }
+
+  Status Run() override {
+    TensorHandle* temp = nullptr;
+    TF_RETURN_IF_ERROR(src_->CopyToDevice(ctx_, dstd_, &temp));
+    const Tensor* tensor = nullptr;
+    Device* device = nullptr;
+    Device* op_device = nullptr;
+    Status status = temp->TensorAndDevice(&tensor, &device, &op_device);
+    // `temp` is a ready handle. So the following call should return OK.
+    TF_DCHECK_OK(status) << status.error_message();
+    DCHECK(tensor);
+    dst_->SetTensorAndDevice(*tensor, device, op_device);
+    temp->Unref();
+    return Status::OK();
+  }
+
+  TensorHandle* dst() { return dst_; }
+
+ private:
+  TensorHandle* src_;
+  Device* dstd_;
+  EagerContext* ctx_;
+  TensorHandle* dst_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_COPY_TO_DEVICE_NODE_H_
diff --git a/tensorflow/core/common_runtime/eager/eager_executor.cc b/tensorflow/core/common_runtime/eager/eager_executor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b699036e9697576adca403d5919b341d8f919db0
--- /dev/null
+++ b/tensorflow/core/common_runtime/eager/eager_executor.cc
@@ -0,0 +1,152 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/eager/eager_executor.h"
+
+namespace tensorflow {
+
+EagerNode::EagerNode(tensorflow::uint64 id) : id(id) {}
+
+EagerExecutor::~EagerExecutor() {
+  tensorflow::mutex_lock l(node_queue_mutex_);
+  thread_done_ = true;
+  nodes_pending_.notify_all();
+}
+
+tensorflow::uint64 EagerExecutor::NextId() {
+  tensorflow::mutex_lock l(next_id_mutex_);
+  return next_id_++;
+}
+
+void EagerExecutor::EnableAsync() {
+  tensorflow::mutex_lock l(node_queue_mutex_);
+  if (thread_ == nullptr) {
+    thread_.reset(tensorflow::Env::Default()->StartThread(
+        tensorflow::ThreadOptions(), "eager_async_executor",
+        std::bind(&EagerExecutor::Run, this)));
+  }
+}
+
+void EagerExecutor::Add(EagerNode* node) {
+  tensorflow::mutex_lock l(node_queue_mutex_);
+  DCHECK(thread_) << "EnableAsync should have been called before Add";
+  if (!status_.ok()) {
+    delete node;
+    return;
+  }
+  int64 qlen = node_queue_.size();
+  if (qlen > 0) {
+    if (node_queue_.back()->id >= node->id) {
+      status_ = tensorflow::errors::InvalidArgument(
+          "Inserting EagerNode with non-increasing ids:",
+          node_queue_.back()->id, " vs ", node->id);
+      delete node;
+      return;
+    }
+    node_queue_.push(node);
+  } else {
+    node_queue_.push(node);
+    nodes_pending_.notify_all();
+  }
+}
+
+tensorflow::Status EagerExecutor::WaitFor(tensorflow::uint64 node_id) {
+  return WaitImpl(false, node_id);
+}
+
+tensorflow::Status EagerExecutor::WaitForAllPendingNodes() {
+  return WaitImpl(true, 0);
+}
+
+tensorflow::Status EagerExecutor::WaitImpl(bool wait_all,
+                                           tensorflow::uint64 node_id) {
+  tensorflow::condition_variable cond;
+  tensorflow::mutex_lock l(node_queue_mutex_);
+  // Don't wait if an error is already set.
+  if (!status_.ok()) return status_;
+  if (node_queue_.empty()) return tensorflow::Status::OK();
+  if (wait_all) {
+    node_id = node_queue_.back()->id;
+  } else if (node_id < node_queue_.front()->id) {
+    // Note that we are relying on the ops being dispatched sequentially from
+    // the queue.
+    return tensorflow::Status::OK();
+  }
+  node_done_notifications_.insert(std::make_pair(node_id, &cond));
+  cond.wait(l);
+  // Note that we could be woken up if an error occurs, even though the node has
+  // not actually executed.
+  return status_;
+}
+
+void EagerExecutor::ClearError() {
+  tensorflow::mutex_lock l(node_queue_mutex_);
+  if (status_.ok()) return;
+  // If an error was set, node_done_notifications_ and node_queue_ should have
+  // been cleared, and no new entries should have been added since.
+  DCHECK(node_done_notifications_.empty());
+  DCHECK(node_queue_.empty());
+  status_ = tensorflow::Status::OK();
+  nodes_pending_.notify_all();
+}
+
+tensorflow::Status EagerExecutor::status() {
+  tensorflow::mutex_lock l(node_queue_mutex_);
+  return status_;
+}
+
+void EagerExecutor::Run() {
+  while (true) {
+    std::unique_ptr<EagerNode> curr_node;
+    {
+      tensorflow::mutex_lock l(node_queue_mutex_);
+      while (node_queue_.empty() || !status_.ok()) {
+        if (thread_done_) return;
+        nodes_pending_.wait(l);
+      }
+      curr_node.reset(node_queue_.front());
+    }
+    tensorflow::Status status = curr_node->Run();
+    const bool ok = status.ok();
+    tensorflow::mutex_lock l(node_queue_mutex_);
+    node_queue_.pop();
+    if (!ok) {
+      status_ = status;
+      // TODO(agarwal): mark all affected handles as corrupted before clearing
+      // this queue.
+      // We remove any pending ops so that we don't try to execute them if
+      // ClearError is called.
+      for (int i = 0; i < node_queue_.size(); ++i) {
+        delete node_queue_.front();
+        node_queue_.pop();
+      }
+    }
+    if (!node_done_notifications_.empty()) {
+      tensorflow::uint64 node_id = curr_node->id;
+      // Note that we notify all waiting threads in case an error has occurred.
+      // These calling threads are responsible for checking status_ before
+      // proceeding.
+      const auto range = ok ? node_done_notifications_.equal_range(node_id)
+                            : make_pair(node_done_notifications_.begin(),
+                                        node_done_notifications_.end());
+      for (auto it = range.first; it != range.second; ++it) {
+        it->second->notify_all();
+      }
+      node_done_notifications_.erase(range.first, range.second);
+    }
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/eager_executor.h b/tensorflow/core/common_runtime/eager/eager_executor.h
new file mode 100644
index 0000000000000000000000000000000000000000..021daeb21d2ecb033b8017f012a148dafa092c01
--- /dev/null
+++ b/tensorflow/core/common_runtime/eager/eager_executor.h
@@ -0,0 +1,138 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EAGER_EXECUTOR_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EAGER_EXECUTOR_H_
+
+#include <algorithm>
+#include <cstddef>
+#include <map>
+#include <memory>
+#include <queue>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/rendezvous_mgr.h"
+#include "tensorflow/core/framework/rendezvous.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/lib/gtl/stl_util.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/public/version.h"
+
+namespace tensorflow {
+
+// A unit of execution for the EagerExecutor class below. Example subclasses
+// encapsulate execution of a TFE_Op, or copying a TFE_TensorHandle from one
+// device to another.
+class EagerNode {
+ public:
+  explicit EagerNode(uint64 id);
+
+  virtual ~EagerNode() {}
+
+  // Runs the computation corresponding to this node and blocks till the
+  // execution is done.
+  virtual Status Run() = 0;
+
+  // An id unique to the TFE_Context under which this node is created. Allocated
+  // monotonically.
+  const uint64 id;
+};
+
+// A class for handling async execution (see TFE_ContextSetAsync).
+// Note that this class is thread-safe.
+// TODO(agarwal): TFE_OpAddInput may currently block if it tries to access the
+// device of the input handle. Fix that.
+// TODO(agarwal): On error, mark all affected handles as corrupted.
+// TODO(agarwal): Implement support for control dependencies.
+// TODO(agarwal): Support out-of-order execution and dispatching multiple
+// EagerNode in parallel.
+// TODO(agarwal): Implement optimizations over EagerNode traces.
+class EagerExecutor {
+ public:
+  ~EagerExecutor();
+
+  // This is called whenever async mode is enabled. Note that it may be called
+  // multiple times as different calling threads may switch async mode on or off
+  // independently.
+  void EnableAsync();
+
+  // Helper function to create monotonically increasing ids unique to this
+  // object.
+  uint64 NextId();
+
+  // Schedules `node` for execution.
+  // Note that Add must be called in monotonically increasing order of node->id.
+  void Add(EagerNode* node);
+
+  // Causes the caller to block till node with id `node_id` has finished
+  // execution.
+  Status WaitFor(uint64 node_id);
+
+  // Blocks till all currently pending ops are done.
+  Status WaitForAllPendingNodes();
+
+  // Clears all currently set errors which re-enables async execution.
+  void ClearError();
+
+  // Returns Status based on any errors that occurred during async execution.
+  Status status();
+
+ private:
+  // Starts execution of pending EagerNodes. This function loops till
+  // thread_done_ is set to true. If any errors are encontered, these are set
+  // inside `status_`. The loop blocks anytime there are no pending nodes, or if
+  // `status_` is not ok.
+  void Run();
+
+  Status WaitImpl(bool wait_all, uint64 node_id);
+
+  mutex node_queue_mutex_;
+
+  // Used to signal that some EagerNodes are pending execution.
+  condition_variable nodes_pending_ GUARDED_BY(node_queue_mutex_);
+
+  // Queue of pending EagerNodes.
+  std::queue<EagerNode*> node_queue_ GUARDED_BY(node_queue_mutex_);
+
+  // `status_` is set based on any errors raised during execution of a
+  // EagerNode.  It remains set until ClearError is called.
+  Status status_ GUARDED_BY(node_queue_mutex_);
+
+  // Map from id of a EagerNode to condition_variables (not owned by the map).
+  // These condition_variables are notified and removed when that EagerNode is
+  // done executing, or if an error is found in execution of any EagerNode.
+  std::multimap<uint64, condition_variable*> node_done_notifications_
+      GUARDED_BY(node_queue_mutex_);
+
+  // Thread object that calls the `Run` method. Currently we use only one thread
+  // for executing the EagerNodes one-by-one.
+  std::unique_ptr<Thread> thread_ GUARDED_BY(node_queue_mutex_);
+
+  // Indicates that `thread_` should stop as soon as it is done executing the
+  // current EagerNode.
+  bool thread_done_ GUARDED_BY(node_queue_mutex_) = false;
+
+  mutex next_id_mutex_;
+  uint64 next_id_ GUARDED_BY(next_id_mutex_) = 1;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EAGER_EXECUTOR_H_
diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..98e8471102b5e0a8813f97d5f99f9a383f0b6225
--- /dev/null
+++ b/tensorflow/core/common_runtime/eager/execute.cc
@@ -0,0 +1,130 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/eager/execute.h"
+
+#include <vector>
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/common_runtime/eager/copy_to_device_node.h"
+#include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
+#include "tensorflow/core/common_runtime/eager/tensor_handle.h"
+#include "tensorflow/core/framework/step_stats.pb.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+
+Status EagerExecute(EagerContext* ctx, Device* device,
+                    const gtl::InlinedVector<TensorHandle*, 4>& op_inputs,
+                    KernelAndDevice* kernel, NodeExecStats* maybe_stats,
+                    TensorHandle** retvals, int num_retvals) {
+  if (device == nullptr) {
+    // TODO(apassos) debug how the assignment below might return a different
+    // device from the one requested above.
+    device = kernel->device();
+  }
+
+  std::vector<Tensor> outputs(1);
+  const MemoryTypeVector* output_memory_types = nullptr;
+  output_memory_types = &kernel->kernel()->output_memory_types();
+  std::vector<Tensor> inputs(op_inputs.size());
+  for (int i = 0; i < op_inputs.size(); ++i) {
+    const Tensor* input_tensor = nullptr;
+    TF_RETURN_IF_ERROR(op_inputs[i]->Tensor(&input_tensor));
+    inputs[i] = *input_tensor;
+  }
+  // WARNING: kernel->Run utilizes the FunctionLibraryRuntime
+  // (ctx->func_lib(device)), which in turn holds a pointer to func_lib_def.
+  // But knowledge of the implementation
+  // of FunctionLibraryRuntime tells us that func_lib_def is not accessed by
+  // FunctionLibraryRuntime::Run(), so there is no thread-safety concern here.
+  // This is quite subtle. Re-work things to make this better?  (Would it make
+  // sense for FunctionLibraryRuntime to ensure thread-safe access to
+  // FunctionLibraryDefinition?).  TODO(apassos) figure out how to record stats
+  // for ops which are a part of functions.
+  // TODO(agarwal): change Run to take vector of handles ?
+  TF_RETURN_IF_ERROR(kernel->Run(&inputs, &outputs, maybe_stats));
+  if (maybe_stats != nullptr) {
+    maybe_stats->set_op_end_rel_micros(Env::Default()->NowMicros() -
+                                       maybe_stats->all_start_micros());
+    mutex_lock ml(*ctx->MetadataMu());
+    if (ctx->ShouldStoreMetadata()) {
+      auto* step_stats = ctx->RunMetadataProto()->mutable_step_stats();
+      // Lazily initialize the RunMetadata with information about all devices if
+      // this is the first call.
+      while (step_stats->dev_stats_size() < ctx->devices()->size()) {
+        step_stats->add_dev_stats();
+      }
+      // Find the current device's index.
+      int device_idx = 0;
+      for (int i = 0; i < ctx->devices()->size(); ++i) {
+        if (ctx->devices()->at(i) == device) {
+          device_idx = i;
+          break;
+        }
+      }
+      // Populate the device stats for this device.
+      auto* dev_stats = step_stats->mutable_dev_stats(device_idx);
+      dev_stats->set_device(device->name());
+      *dev_stats->add_node_stats() = *maybe_stats;
+    }
+  }
+  DCHECK_EQ(num_retvals, outputs.size());
+  Device* op_device = device;
+  for (int i = 0; i < num_retvals; ++i) {
+    Device* d = op_device;
+    if (d != nullptr && output_memory_types != nullptr &&
+        (*output_memory_types)[i] == HOST_MEMORY) {
+      d = nullptr;
+    }
+    if (retvals[i] == nullptr) {
+      retvals[i] = new TensorHandle(outputs[i], d, op_device, ctx);
+    } else {
+      retvals[i]->SetTensorAndDevice(outputs[i], d, op_device);
+    }
+  }
+  return Status::OK();
+}
+
+Status EagerCopyToDevice(TensorHandle* h, EagerContext* ctx,
+                         const char* device_name, TensorHandle** result) {
+  TF_RETURN_IF_ERROR(ctx->GetStatus());
+  Device* dstd = ctx->HostCPU();
+  if (device_name != nullptr && strlen(device_name) > 0) {
+    TF_RETURN_IF_ERROR(ctx->device_mgr()->LookupDevice(device_name, &dstd));
+  }
+  if (ctx->Async()) {
+    // Note that `h` may not be currently ready. However execution order will
+    // make sure that `h` is ready before the copy is actually done.
+    CopyToDeviceNode* node = new CopyToDeviceNode(h, dstd, ctx);
+    TensorHandle* output = node->dst();
+    // Note that calling Add makes `node` accessible by the EagerExecutor
+    // thread. So further accesses need to be thread-safe.
+    ctx->ExecutorAdd(node);
+    *result = output;
+    return Status::OK();
+  } else {
+    TF_RETURN_IF_ERROR(h->CopyToDevice(ctx, dstd, result));
+    return Status::OK();
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/execute.h b/tensorflow/core/common_runtime/eager/execute.h
new file mode 100644
index 0000000000000000000000000000000000000000..0f6ad031e1dd03bc9c3b90778ed1b37e958085e0
--- /dev/null
+++ b/tensorflow/core/common_runtime/eager/execute.h
@@ -0,0 +1,41 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EXECUTE_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EXECUTE_H_
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
+#include "tensorflow/core/common_runtime/eager/tensor_handle.h"
+#include "tensorflow/core/framework/step_stats.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+
+namespace tensorflow {
+
+// Low-level utility to execute the kernel specified by kernel on device device,
+// with the inputs op_inputs, in the context ctx.
+Status EagerExecute(EagerContext* ctx, Device* device,
+                    const gtl::InlinedVector<TensorHandle*, 4>& op_inputs,
+                    KernelAndDevice* kernel, NodeExecStats* maybe_stats,
+                    TensorHandle** retvals, int num_retvals);
+
+// Low-level utility to copy a tensor handle from one device to another.
+Status EagerCopyToDevice(TensorHandle* h, EagerContext* ctx,
+                         const char* device_name, TensorHandle** result);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EXECUTE_H_
diff --git a/tensorflow/core/common_runtime/eager/execute_node.h b/tensorflow/core/common_runtime/eager/execute_node.h
new file mode 100644
index 0000000000000000000000000000000000000000..93018dd96914c0d091c7242a9c053fabce434e78
--- /dev/null
+++ b/tensorflow/core/common_runtime/eager/execute_node.h
@@ -0,0 +1,88 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EXECUTE_NODE_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EXECUTE_NODE_H_
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/common_runtime/eager/eager_executor.h"
+#include "tensorflow/core/common_runtime/eager/execute.h"
+#include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
+#include "tensorflow/core/common_runtime/eager/tensor_handle.h"
+#include "tensorflow/core/framework/step_stats.pb.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+
+namespace tensorflow {
+
+class ExecuteNode : public EagerNode {
+ public:
+  ExecuteNode(uint64 id, EagerContext* ctx, Device* op_device,
+              const tensorflow::gtl::InlinedVector<TensorHandle*, 4>& inputs,
+              KernelAndDevice* kernel, NodeExecStats* maybe_stats,
+              const DataTypeVector& output_dtypes,
+              const tensorflow::gtl::InlinedVector<TensorHandle*, 2>& retvals)
+      : EagerNode(id),
+        ctx_(ctx),
+        op_device_(op_device),
+        inputs_(inputs),
+        kernel_(kernel),
+        maybe_stats_(maybe_stats),
+        retvals_(retvals) {
+    for (auto handle : inputs_) {
+      handle->Ref();
+    }
+    for (auto handle : retvals_) {
+      handle->Ref();
+    }
+  }
+
+  ~ExecuteNode() override {
+    for (auto handle : inputs_) {
+      handle->Unref();
+    }
+    for (auto handle : retvals_) {
+      handle->Unref();
+    }
+  }
+
+  tensorflow::Status Run() override {
+    const Status status =
+        EagerExecute(ctx_, op_device_, inputs_, kernel_, maybe_stats_.get(),
+                     retvals_.begin(), retvals_.size());
+    if (status.ok()) {
+      return status;
+    } else {
+      return Status(status.code(),
+                    strings::StrCat("Got error, \"", status.error_message(),
+                                    "\" while executing kernel ",
+                                    kernel_->kernel()->def().DebugString()));
+    }
+  }
+
+ private:
+  tensorflow::EagerContext* ctx_;
+  tensorflow::Device* op_device_;
+  tensorflow::gtl::InlinedVector<TensorHandle*, 4> inputs_;
+  tensorflow::KernelAndDevice* kernel_;
+  std::unique_ptr<NodeExecStats> maybe_stats_;
+  tensorflow::gtl::InlinedVector<TensorHandle*, 2> retvals_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EXECUTE_NODE_H_
diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.cc b/tensorflow/core/common_runtime/eager/kernel_and_device.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0a4895a938a72a41cf3ad494ecca2ef9fb3e9648
--- /dev/null
+++ b/tensorflow/core/common_runtime/eager/kernel_and_device.cc
@@ -0,0 +1,132 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
+
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/rendezvous_mgr.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/lib/gtl/stl_util.h"
+#include "tensorflow/core/platform/fingerprint.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/public/version.h"
+#include "tensorflow/core/util/tensor_slice_reader_cache.h"
+
+namespace tensorflow {
+
+// static
+Status KernelAndDevice::InitOp(Device* device, const NodeDef& ndef,
+                               KernelAndDevice* out) {
+  OpKernel* k = nullptr;
+  Status s = CreateOpKernel(device->device_type().c_str(), device,
+                            device->GetAllocator(AllocatorAttributes()),
+                            nullptr, ndef, TF_GRAPH_DEF_VERSION, &k);
+  out->device_ = device;
+  out->kernel_.reset(k);
+  out->flib_ = nullptr;
+  return s;
+}
+
+// static
+Status KernelAndDevice::Init(const NodeDef& ndef, FunctionLibraryRuntime* flib,
+                             KernelAndDevice* out) {
+  OpKernel* k = nullptr;
+  Status s = flib->CreateKernel(ndef, &k);
+  out->device_ = flib->device();
+  out->kernel_.reset(k);
+  out->flib_ = flib;
+  return s;
+}
+
+Status KernelAndDevice::Run(std::vector<Tensor>* input_tensors,
+                            std::vector<Tensor>* output_tensors,
+                            NodeExecStats* stats) {
+  gtl::InlinedVector<TensorValue, 4> inputs;
+  for (Tensor& t : *input_tensors) {
+    inputs.push_back(TensorValue(&t));
+  }
+
+  std::vector<AllocatorAttributes> out_attrs(kernel_->num_outputs());
+  for (size_t i = 0; i < out_attrs.size(); ++i) {
+    out_attrs[i].set_on_host(kernel_->output_memory_types()[i] ==
+                             tensorflow::HOST_MEMORY);
+  }
+
+  OpKernelContext::Params params;
+  params.device = device_;
+  params.frame_iter = FrameAndIter(0, 0);
+  params.inputs = &inputs;
+  params.op_kernel = kernel_.get();
+  params.resource_manager = device_->resource_manager();
+  params.output_attr_array = gtl::vector_as_array(&out_attrs);
+  params.function_library = flib_;
+  params.slice_reader_cache = &slice_reader_cache_;
+  params.rendezvous = rendez_;
+  if (stats != nullptr) {
+    params.track_allocations = true;
+  }
+  // TODO(apassos): use a thread pool.
+  std::function<void(std::function<void()>)> runner =
+      [](std::function<void()> f) { f(); };
+  params.runner = &runner;
+
+  OpKernelContext context(&params);
+
+  if (kernel_->def().op() == "_Recv") {
+    // TODO(apassos) do not special-case _Recv. Currently the GPU device fails
+    // if trying to run _Recv->Compute(), specifically checking for _Recv. To go
+    // around this we call _Recv->ComputeAsync, to mimic graph mode behavior.
+    AsyncOpKernel* async = kernel_->AsAsync();
+    Notification done;
+    device_->ComputeAsync(async, &context, [&done]() { done.Notify(); });
+    done.WaitForNotification();
+  } else {
+    device_->Compute(kernel_.get(), &context);
+  }
+  if (!context.status().ok()) return context.status();
+
+  output_tensors->clear();
+  for (int i = 0; i < context.num_outputs(); ++i) {
+    output_tensors->push_back(Tensor(*context.mutable_output(i)));
+  }
+  if (stats != nullptr) {
+    for (const auto& allocator_pair : context.wrapped_allocators()) {
+      AllocatorMemoryUsed* memory = stats->add_memory();
+      memory->set_allocator_name(allocator_pair.first->Name());
+      auto sizes = allocator_pair.second->GetSizes();
+      memory->set_total_bytes(std::get<0>(sizes));
+      memory->set_peak_bytes(std::get<1>(sizes));
+      memory->set_live_bytes(std::get<2>(sizes));
+
+      AllocatorStats allocator_stats;
+      allocator_pair.first->GetStats(&allocator_stats);
+      memory->set_allocator_bytes_in_use(allocator_stats.bytes_in_use);
+      allocator_pair.second->GetRecordsAndUnRef();
+    }
+    auto* ms = stats->mutable_memory_stats();
+    ms->set_temp_memory_size(context.temp_memory_allocated());
+    for (const auto& alloc_id : context.persistent_alloc_ids()) {
+      ms->mutable_persistent_tensor_alloc_ids()->Add(alloc_id);
+    }
+
+    ms->set_persistent_memory_size(context.persistent_memory_allocated());
+  }
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.h b/tensorflow/core/common_runtime/eager/kernel_and_device.h
new file mode 100644
index 0000000000000000000000000000000000000000..46ec550c780aaa3cd5cad5f02a4dfe9a75572277
--- /dev/null
+++ b/tensorflow/core/common_runtime/eager/kernel_and_device.h
@@ -0,0 +1,85 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_KERNEL_AND_DEVICE_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_KERNEL_AND_DEVICE_H_
+
+// Support for eager execution of TensorFlow kernels.
+
+#include <memory>
+#include <unordered_map>
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/platform/fingerprint.h"
+#include "tensorflow/core/util/tensor_slice_reader_cache.h"
+
+namespace tensorflow {
+
+// KernelAndDevice encapsulates an instantiated kernel and the device it is on.
+//
+// Also see:
+// https://www.tensorflow.org/code/tensorflow/core/common_runtime/kernel_benchmark_testlib.h
+// and
+// https://www.tensorflow.org/code/tensorflow/core/kernels/ops_testutil.h
+class KernelAndDevice {
+ public:
+  // Populates 'out' with a kernel appropriate for 'ndef'.
+  //
+  // The provided FunctionLibraryRuntime MUST outlive all calls to
+  // Run() on the returned KernelAndDevice.
+  //
+  // TODO(ashankar): Figure out thread-safety concerns around
+  // FunctionLibraryRuntime (in particular, how the underlying
+  // FunctionLibraryDefinition might be mutated by another thread as new
+  // functions are registered with it).  Conservatively, thread-safe usage of
+  // the FunctionLibraryRuntime is pushed on to the caller (see locking in
+  // c_api.cc).
+  static Status Init(const NodeDef& ndef, FunctionLibraryRuntime* flib,
+                     KernelAndDevice* out);
+  // TODO(ashankar): Remove this
+  static Status InitOp(Device* device, const NodeDef& ndef,
+                       KernelAndDevice* out);
+
+  KernelAndDevice(tensorflow::Rendezvous* rendez)
+      : device_(nullptr), flib_(nullptr), rendez_(rendez) {}
+
+  // TODO(ashankar): Handle list-valued inputs.
+  Status Run(std::vector<Tensor>* inputs, std::vector<Tensor>* outputs,
+             NodeExecStats* stats);
+
+  const OpKernel* kernel() const { return kernel_.get(); }
+
+  Device* device() const { return device_; }
+
+  DataTypeVector* mutable_output_dtypes() { return &output_dtypes_; }
+  const DataTypeVector& output_dtypes() { return output_dtypes_; }
+
+ private:
+  std::unique_ptr<OpKernel> kernel_;
+  Device* device_;
+  FunctionLibraryRuntime* flib_;
+  checkpoint::TensorSliceReaderCacheWrapper slice_reader_cache_;
+  Rendezvous* rendez_;
+  DataTypeVector output_dtypes_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_KERNEL_AND_DEVICE_H_
diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device_test.cc b/tensorflow/core/common_runtime/eager/kernel_and_device_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..dd055c3c3eb6df3eb440a78b7a8d3e72ff9335bd
--- /dev/null
+++ b/tensorflow/core/common_runtime/eager/kernel_and_device_test.cc
@@ -0,0 +1,140 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
+
+#include <memory>
+#include <vector>
+
+#include "tensorflow/c/eager/runtime.h"
+#include "tensorflow/cc/client/client_session.h"
+#include "tensorflow/cc/framework/ops.h"
+#include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/public/version.h"
+
+namespace tensorflow {
+namespace {
+
+class TestEnv {
+ public:
+  TestEnv() : flib_def_(OpRegistry::Global(), {}) {
+    Device* device =
+        DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0");
+    device_mgr_.reset(new DeviceMgr({device}));
+    flib_runtime_ = NewFunctionLibraryRuntime(device_mgr_.get(), Env::Default(),
+                                              device, TF_GRAPH_DEF_VERSION,
+                                              &flib_def_, nullptr, {}, nullptr);
+  }
+
+  FunctionLibraryRuntime* function_library_runtime() const {
+    return flib_runtime_.get();
+  }
+
+ private:
+  FunctionLibraryDefinition flib_def_;
+  std::unique_ptr<DeviceMgr> device_mgr_;
+  std::unique_ptr<FunctionLibraryRuntime> flib_runtime_;
+};
+
+void BM_CreateGraph(int iters) {
+  for (int i = 0; i < iters; ++i) {
+    Scope root = Scope::NewRootScope();
+    auto C = ops::Const(root, {{1.0, 2.0}, {3.0, 4.0}});
+    auto M = ops::MatMul(root, C, C);
+    TF_CHECK_OK(root.status());
+  }
+}
+BENCHMARK(BM_CreateGraph);
+
+void BM_RunGraph(int iters) {
+  tensorflow::testing::StopTiming();
+  Scope root = Scope::NewRootScope();
+  auto C = ops::Const(root, {{1.0, 2.0}, {3.0, 4.0}});
+  auto M = ops::MatMul(root, C, C);
+  SessionOptions opts;
+  opts.config.set_inter_op_parallelism_threads(1);
+  opts.config.set_intra_op_parallelism_threads(1);
+  ClientSession sess(root, opts);
+  std::vector<Tensor> outputs;
+  tensorflow::testing::StartTiming();
+  for (int i = 0; i < iters; ++i) {
+    outputs.clear();
+    TF_CHECK_OK(sess.Run({M}, &outputs));
+  }
+}
+BENCHMARK(BM_RunGraph);
+
+void BM_CreateAndDestroySession(int iters) {
+  tensorflow::testing::StopTiming();
+  Scope root = Scope::NewRootScope();
+  auto C = ops::Const(root, {{1.0, 2.0}, {3.0, 4.0}});
+  auto M = ops::MatMul(root, C, C);
+  tensorflow::testing::StartTiming();
+  for (int i = 0; i < iters; ++i) {
+    ClientSession sess(root);
+  }
+}
+BENCHMARK(BM_CreateAndDestroySession);
+
+void BM_KernelAndDeviceInit(int iters) {
+  tensorflow::testing::StopTiming();
+  NodeDef ndef(AttrBuilder("MatMul")
+                   .Set("T", DT_FLOAT)
+                   .Set("transpose_a", false)
+                   .Set("transpose_b", false)
+                   .NumInputs(2)
+                   .BuildNodeDef());
+  TestEnv env;
+  KernelAndDevice k(nullptr);
+  tensorflow::testing::StartTiming();
+  for (int i = 0; i < iters; ++i) {
+    TF_CHECK_OK(
+        KernelAndDevice::Init(ndef, env.function_library_runtime(), &k));
+  }
+}
+BENCHMARK(BM_KernelAndDeviceInit);
+
+void BM_KernelAndDeviceRun(int iters) {
+  tensorflow::testing::StopTiming();
+  Tensor t(Input({{1.0f, 2.0f}, {3.0f, 4.0f}}).tensor());
+  std::vector<Tensor> inputs;
+  inputs.push_back(t);
+  inputs.push_back(t);
+  std::vector<Tensor> outputs;
+  NodeDef ndef(AttrBuilder("MatMul")
+                   .Set("T", DT_FLOAT)
+                   .Set("transpose_a", false)
+                   .Set("transpose_b", false)
+                   .NumInputs(inputs.size())
+                   .BuildNodeDef());
+  TestEnv env;
+  KernelAndDevice kernel(nullptr);
+  TF_CHECK_OK(
+      KernelAndDevice::Init(ndef, env.function_library_runtime(), &kernel));
+  tensorflow::testing::StartTiming();
+  for (int i = 0; i < iters; ++i) {
+    TF_CHECK_OK(kernel.Run(&inputs, &outputs, nullptr));
+  }
+}
+BENCHMARK(BM_KernelAndDeviceRun);
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.cc b/tensorflow/core/common_runtime/eager/tensor_handle.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8e11f7b7104ce22cc585e2b03fcfd914e0eb80aa
--- /dev/null
+++ b/tensorflow/core/common_runtime/eager/tensor_handle.cc
@@ -0,0 +1,179 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/eager/tensor_handle.h"
+
+#include <algorithm>
+#include <cstddef>
+#include <map>
+#include <memory>
+#include <queue>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/copy_tensor.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/common_runtime/eager/eager_executor.h"
+#include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/rendezvous_mgr.h"
+#include "tensorflow/core/framework/rendezvous.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/lib/gtl/stl_util.h"
+#include "tensorflow/core/platform/fingerprint.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/public/version.h"
+
+namespace tensorflow {
+
+bool TensorHandle::IsReady() {
+  if (node_id == 0) return true;
+  mutex_lock l(ctx_mutex_);
+  return is_ready_;
+}
+
+Status TensorHandle::WaitReady() {
+  if (node_id == 0) return Status::OK();
+  EagerExecutor* executor = nullptr;
+  {
+    mutex_lock l(ctx_mutex_);
+    if (is_ready_) return Status::OK();
+    executor = ctx_->Executor();
+  }
+  return executor->WaitFor(node_id);
+}
+
+Status TensorHandle::Tensor(const tensorflow::Tensor** t) {
+  TF_RETURN_IF_ERROR(WaitReady());
+  DCHECK(IsReady());
+  *t = &tensor_;
+  return Status::OK();
+}
+
+Status TensorHandle::Device(tensorflow::Device** d) {
+  TF_RETURN_IF_ERROR(WaitReady());
+  DCHECK(IsReady());
+  *d = device_;
+  return Status::OK();
+}
+
+Status TensorHandle::OpDevice(tensorflow::Device** d) {
+  TF_RETURN_IF_ERROR(WaitReady());
+  DCHECK(IsReady());
+  *d = op_device_;
+  return Status::OK();
+}
+
+Status TensorHandle::TensorAndDevice(const tensorflow::Tensor** tensor,
+                                     tensorflow::Device** device,
+                                     tensorflow::Device** op_device) {
+  TF_RETURN_IF_ERROR(WaitReady());
+  DCHECK(IsReady());
+  *tensor = &tensor_;
+  *device = device_;
+  *op_device = op_device_;
+  return Status::OK();
+}
+
+void TensorHandle::SetTensorAndDevice(const tensorflow::Tensor& tensor,
+                                      tensorflow::Device* device,
+                                      tensorflow::Device* op_device) {
+  mutex_lock l(ctx_mutex_);
+  DCHECK(node_id > 0 && !is_ready_)
+      << "SetTensorAndDevice should be only called  "
+      << "on non-ready handles.";
+  is_ready_ = true;
+  tensor_ = tensor;
+  device_ = device;
+  op_device_ = op_device;
+}
+
+Status TensorHandle::CopyToDevice(EagerContext* ctx, tensorflow::Device* dstd,
+                                  TensorHandle** output) {
+  const tensorflow::Tensor* src = nullptr;
+  tensorflow::Device* srcd = nullptr;
+  // TODO(agarwal): src_opd is unused. Perhaps allow TensorAndDevice to accept
+  // nullptr.
+  tensorflow::Device* src_opd = nullptr;
+  TF_RETURN_IF_ERROR(TensorAndDevice(&src, &srcd, &src_opd));
+  if (srcd == nullptr) srcd = ctx->HostCPU();
+  bool is_same_device = (srcd == dstd) || (srcd->name() == dstd->name());
+  const bool dst_cpu = dstd->tensorflow_gpu_device_info() == nullptr;
+  const bool src_cpu = srcd->tensorflow_gpu_device_info() == nullptr;
+  // both_on_cpu can be true and yet is_same_device is false, if one of src/dst
+  // has device type XLA_CPU, and the other CPU.
+  const bool both_on_cpu = src_cpu && dst_cpu;
+  if (is_same_device || both_on_cpu) {
+    dstd = dst_cpu ? nullptr : dstd;
+    *output = new tensorflow::TensorHandle(*src, dstd, dstd, ctx);
+    return tensorflow::Status::OK();
+  }
+  if (!dst_cpu && (src->dtype() != tensorflow::DT_VARIANT &&
+                   !tensorflow::DataTypeCanUseMemcpy(src->dtype()))) {
+    return tensorflow::errors::InvalidArgument(
+        "Can't copy Tensor with type ",
+        tensorflow::DataTypeString(src->dtype()), " to device ", dstd->name(),
+        ".");
+  }
+  tensorflow::AllocatorAttributes attr;
+  if (src->dtype() == tensorflow::DT_VARIANT) {
+    attr.set_on_host(true);
+  }
+  tensorflow::Tensor dst(dstd->GetAllocator(attr), src->dtype(), src->shape());
+  if (src->shape().num_elements() == 0) {
+    dstd = dst_cpu ? nullptr : dstd;
+    *output = new tensorflow::TensorHandle(dst, dstd, dstd, ctx);
+    return tensorflow::Status::OK();
+  }
+  tensorflow::DeviceContext* src_device_context = nullptr;
+  if (!src_cpu) {
+    src_device_context = srcd->tensorflow_gpu_device_info()->default_context;
+  }
+  tensorflow::DeviceContext* dst_device_context = nullptr;
+  if (!dst_cpu) {
+    dst_device_context = dstd->tensorflow_gpu_device_info()->default_context;
+  }
+  // TODO(ashankar): The Sync() call below may be more aggressive than
+  // necessary. It is based on knowledge of implementation details - that
+  // GPU devices are implemented using 3 streams - one for host->device copies,
+  // one for device->host copies and one for sending operations to the GPU.
+  // With that setup, Sync()ing across all 3 streams should be sufficient
+  // but more than necessary (since it waits for operations that might have
+  // nothing to do with this tensor to complete).
+  TF_RETURN_IF_ERROR(srcd->Sync());
+  tensorflow::Notification n;
+  tensorflow::Status status;
+  tensorflow::CopyTensor::ViaDMA("copy", src_device_context, dst_device_context,
+                                 srcd, dstd, tensorflow::AllocatorAttributes(),
+                                 tensorflow::AllocatorAttributes(), src, &dst,
+                                 [&status, &n](const tensorflow::Status& s) {
+                                   status = s;
+                                   n.Notify();
+                                 });
+  n.WaitForNotification();
+  if (status.ok()) {
+    dstd = dst_cpu ? nullptr : dstd;
+    *output = new tensorflow::TensorHandle(dst, dstd, dstd, ctx);
+  }
+  return status;
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.h b/tensorflow/core/common_runtime/eager/tensor_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..d66c4d95e2a5513680f81e3f7c1875266b2dfb02
--- /dev/null
+++ b/tensorflow/core/common_runtime/eager/tensor_handle.h
@@ -0,0 +1,142 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_TENSOR_HANDLE_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_TENSOR_HANDLE_H_
+
+#include <algorithm>
+#include <cstddef>
+#include <map>
+#include <memory>
+#include <queue>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/common_runtime/eager/eager_executor.h"
+#include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/rendezvous_mgr.h"
+#include "tensorflow/core/framework/rendezvous.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/lib/gtl/stl_util.h"
+#include "tensorflow/core/platform/fingerprint.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/public/version.h"
+
+namespace tensorflow {
+
+// Associates a Tensor and a Device, used in the eager runtime. Internal version
+// executor_of the TFE_TensorHandle struct and the python EagerTensor class
+// (unrelated to python TensorHandle).
+class TensorHandle : public core::RefCounted {
+ public:
+  TensorHandle(const Tensor& t, Device* d, Device* op_device, EagerContext* ctx)
+      : dtype(t.dtype()),
+        node_id(0),
+        tensor_(t),
+        device_(d),
+        op_device_(op_device),
+        ctx_(ctx),
+        is_ready_(true) {}
+
+  TensorHandle(uint64 node_id, DataType dtype, EagerContext* ctx)
+      : dtype(dtype),
+        node_id(node_id),
+        tensor_(dtype),
+        device_(nullptr),
+        op_device_(nullptr),
+        ctx_(ctx),
+        is_ready_(ctx == nullptr) {
+    DCHECK_GT(node_id, 0);
+  }
+
+  ~TensorHandle() override {}
+
+  Status Tensor(const tensorflow::Tensor** t);
+
+  Status Device(tensorflow::Device** d);
+
+  Status OpDevice(tensorflow::Device** d);
+
+  Status TensorAndDevice(const tensorflow::Tensor** tensor,
+                         tensorflow::Device** device,
+                         tensorflow::Device** op_device);
+
+  // Note that this can be called at most once, and only on non-ready handles,
+  // and makes them ready.
+  void SetTensorAndDevice(const tensorflow::Tensor& tensor,
+                          tensorflow::Device* device,
+                          tensorflow::Device* op_device);
+
+  Status CopyToDevice(EagerContext* ctx, tensorflow::Device* dstd,
+                      TensorHandle** output);
+
+  // Warning: can return nullptr for CPU tensors.
+  EagerContext* Context() {
+    mutex_lock ml(ctx_mutex_);
+    return ctx_;
+  }
+
+  // dtype for the handle. It must be the same as t.dtype() once the handle is
+  // ready.
+  const DataType dtype;
+
+ private:
+  // If the contents of the Tensor pointed to by this handle is yet to be
+  // computed by a EagerNode, this function will block till that compuatation is
+  // done and the handle is "ready".
+  Status WaitReady();
+
+  bool IsReady();
+
+  // Id for the EagerNode that will compute the value pointed to by this handle.
+  // If the value is 0, the handle is already ready, but not vice-versa.
+  const uint64 node_id;
+
+  tensorflow::Tensor tensor_;
+
+  // TODO(ashankar): device_ == nullptr iff local CPU
+  // This was expedient, but perhaps worth revisiting ('device_' should always
+  // be a valid pointer?)
+  // This can be done if TFE_NewOp() and the TFE_TensorHandle constructors are
+  // provided with the appropriate TFE_Context.
+  //
+  // TODO(ashankar): Reference count TFE_Context to ensure that 'device_' of a
+  // TFE_TensorHandle does not outlive the TFE_Context from which it came?
+  tensorflow::Device* device_;
+
+  // Device in which the op producing this tensor was executed. Equals to
+  // device_ for constant tensors.
+  tensorflow::Device* op_device_;
+
+  mutex ctx_mutex_;
+
+  // `ctx` is only guaranteed to be set if the handle is not "ready". This is
+  // typically true when the handle was produced during async execution.
+  // `ctx` object is not owned and should outlive this handle.
+  EagerContext* ctx_ GUARDED_BY(ctx_mutex_);
+  bool is_ready_ GUARDED_BY(ctx_mutex_);
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_TENSOR_HANDLE_H_
diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc
index b06b75d6585f01640374eb7ab9842bf441cf9411..0c461a9ee98ca61fb3d3f165d93adf0e5cec7ee7 100644
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@@ -258,6 +258,13 @@ struct NodeItem {
   // Return array of per-output allocator attributes.
   const AllocatorAttributes* output_attrs() const { return output_attr_base(); }
 
+  // Return array of expected input index from which each output should
+  // be forwarded:
+  // kNeverForward (-2) for DO NOT FORWARD (must allocate).
+  // kNoReservation (-1) for no expected forwarding.
+  // 0... for forward from that input.
+  const int* forward_from() const { return forward_from_base(); }
+
  private:
   friend class GraphView;
 
@@ -267,6 +274,7 @@ struct NodeItem {
   //   AllocatorAttributes output_attr[num_outputs];
   //   uint8               input_type[num_inputs];
   //   uint8               output_type[num_outputs];
+  //   int                 forward_from[num_outputs];
 
   // Return pointer to variable length section.
   char* var() const {
@@ -292,6 +300,13 @@ struct NodeItem {
         sizeof(AllocatorAttributes) * num_outputs + sizeof(uint8) * num_inputs);
   }
 
+  int* forward_from_base() const {
+    return reinterpret_cast<int*>(var() + sizeof(EdgeInfo) * num_output_edges +
+                                  sizeof(AllocatorAttributes) * num_outputs +
+                                  sizeof(uint8) * num_inputs +
+                                  sizeof(uint8) * num_outputs);
+  }
+
   TF_DISALLOW_COPY_AND_ASSIGN(NodeItem);
 };
 
@@ -466,7 +481,8 @@ size_t GraphView::NodeItemBytes(const Node* n) {
       + num_output_edges * sizeof(EdgeInfo)        // output_edges[...]
       + num_outputs * sizeof(AllocatorAttributes)  // output_attr[...]
       + num_inputs * sizeof(uint8)                 // input_type[num_inputs]
-      + num_outputs * sizeof(uint8);               // output_type[num_outputs]
+      + num_outputs * sizeof(uint8)                // output_type[num_outputs]
+      + num_outputs * sizeof(int);                 // forward_from[num_outputs]
   static constexpr size_t kItemAlignment = sizeof(NodeItem*);
   static_assert(kItemAlignment % alignof(NodeItem) == 0,
                 "NodeItem must be aligned with kItemAlignment");
@@ -737,8 +753,8 @@ Status InferAllocAttr(const Node* n, const Node* dst,
       VLOG(2) << "node " << n->name() << " is the sink of an RPC in";
     } else if ((local_dev_name.type == "CPU" || n->IsHostRecv()) &&
                parsed_src_name.type != "CPU") {
-      // Value is going to be the sink of a local DMA from GPU to CPU (or other
-      // types of accelerators).
+      // Value is going to be the sink of a local DMA from GPU to CPU (or
+      // other types of accelerators).
       attr->set_gpu_compatible(true);
       VLOG(2) << "node " << n->name() << " is the sink of a gpu->cpu copy";
     } else {
@@ -1022,7 +1038,8 @@ class ExecutorState {
     int total_input_tensors = 0;
     std::vector<const Node*>* nodes = nullptr;
 
-    // Lock ordering: ExecutorState.mu_ < mu.
+    // Lock ordering: ExecutorState.mu_ < mu;
+    // during structured traversal: parent_frame->mu < mu.
     mutex mu;
 
     void InitializeFrameInfo(const string& enter_name) {
@@ -1090,7 +1107,8 @@ class ExecutorState {
     void ActivateLoopInvs(const GraphView* gview, int64 iter,
                           TaggedNodeSeq* ready) EXCLUSIVE_LOCKS_REQUIRED(mu);
 
-    // Add a new loop invariant and make it available to all active iterations.
+    // Add a new loop invariant and make it available to all active
+    // iterations.
     void AddLoopInv(const NodeItem* item, const Entry& value,
                     TaggedNodeSeq* ready) EXCLUSIVE_LOCKS_REQUIRED(mu);
 
@@ -1147,8 +1165,8 @@ class ExecutorState {
         if (front_index_ == ready_.size()) {
           ready_.clear();
         } else {
-          // Lots of unused entries at beginning of vector: move everything down
-          // to start of vector.
+          // Lots of unused entries at beginning of vector: move everything
+          // down to start of vector.
           ready_.erase(ready_.begin(), ready_.begin() + front_index_);
         }
         front_index_ = 0;
@@ -1596,6 +1614,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_usec) {
       params.frame_iter = FrameAndIter(input_frame->frame_id, input_iter);
       params.is_input_dead = is_input_dead;
       params.output_attr_array = item.output_attrs();
+      params.forward_from_array = nullptr;  // later: item.forward_from();
 
       if (item.kernel_is_async) {
         // Asynchronous computes.
@@ -2333,8 +2352,9 @@ void ExecutorState::DeleteFrame(FrameState* frame, TaggedNodeSeq* ready) {
   FrameState* parent_frame = frame->parent_frame;
   const int64 parent_iter = frame->parent_iter;
   if (parent_frame != nullptr) {
-    mutex_lock paranet_frame_lock(parent_frame->mu);
+    mutex_lock parent_frame_lock(parent_frame->mu);
     // Propagate all the dead exits to the parent frame.
+    mutex_lock this_frame_lock(frame->mu);
     for (const Node* node : frame->dead_exits) {
       auto parent_iter_state = parent_frame->GetIteration(parent_iter);
       for (const Edge* e : node->out_edges()) {
@@ -2603,7 +2623,7 @@ void ExecutorImpl::RunAsync(const Args& args, DoneCallback done) {
   (new ExecutorState(args, this))->RunAsync(std::move(done));
 }
 
-}  // end namespace
+}  // namespace
 
 Status NewLocalExecutor(const LocalExecutorParams& params,
                         std::unique_ptr<const Graph> graph,
@@ -2629,4 +2649,4 @@ Status CreateNonCachedKernel(Device* device, FunctionLibraryRuntime* flib,
 
 void DeleteNonCachedKernel(OpKernel* kernel) { delete kernel; }
 
-}  // end namespace tensorflow
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc
index 37c59a16f56809fe8d5f88c05b824bcbdcc7cf4e..d310520ebde5a6143b84e5879c9e1c23ab0b5935 100644
--- a/tensorflow/core/common_runtime/function.cc
+++ b/tensorflow/core/common_runtime/function.cc
@@ -796,16 +796,17 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle,
       done(status);
     };
   }
-  if (!parent_->IsInstantiatedOnDevice(device_name_, handle)) {
-    parent_->Run(run_opts, handle, args, rets, done);
-    return;
-  }
 
   if (run_opts.runner == nullptr) {
     run_opts.runner = &default_runner_;
   }
   DCHECK(run_opts.runner != nullptr);
 
+  if (!parent_->IsInstantiatedOnDevice(device_name_, handle)) {
+    parent_->Run(run_opts, handle, args, rets, done);
+    return;
+  }
+
   Executor::Args* exec_args = new Executor::Args;
   // Inherit the step_id from the caller.
   exec_args->step_id = run_opts.step_id;
diff --git a/tensorflow/core/common_runtime/function_test.cc b/tensorflow/core/common_runtime/function_test.cc
index d17ef4d4590e5932e43a0bb01fe1e05ab2c4f873..61b2f0e60f7ea6ca7f7b36f21845766399489795 100644
--- a/tensorflow/core/common_runtime/function_test.cc
+++ b/tensorflow/core/common_runtime/function_test.cc
@@ -39,6 +39,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/public/version.h"
@@ -53,8 +54,8 @@ Status GetOpSig(const string& op, const OpDef** sig) {
   return OpRegistry::Global()->LookUpOpDef(op, sig);
 }
 
-void HasError(const Status& s, const string& substr) {
-  EXPECT_TRUE(StringPiece(s.ToString()).contains(substr))
+void HasError(const Status& s, StringPiece substr) {
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), substr))
       << s << ", expected substring " << substr;
 }
 
@@ -240,7 +241,7 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
     Status status2 = Run(flr, handle, opts, args, std::move(rets));
     EXPECT_TRUE(errors::IsInvalidArgument(status2));
     EXPECT_TRUE(
-        StringPiece(status2.error_message()).contains("remote execution."));
+        str_util::StrContains(status2.error_message(), "remote execution."));
 
     return status;
   }
@@ -310,7 +311,7 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
     Status status2 = Run(flr, handle, opts, args, std::move(rets));
     EXPECT_TRUE(errors::IsInvalidArgument(status2));
     EXPECT_TRUE(
-        StringPiece(status2.error_message()).contains("remote execution."));
+        str_util::StrContains(status2.error_message(), "remote execution."));
 
     return status;
   }
diff --git a/tensorflow/core/common_runtime/function_threadpool_test.cc b/tensorflow/core/common_runtime/function_threadpool_test.cc
index 6223a4e648baeb9cd7a2595c74881cddbf9a6f0b..2d09e83d013591ceaa91c9a7e0fe929a328742a3 100644
--- a/tensorflow/core/common_runtime/function_threadpool_test.cc
+++ b/tensorflow/core/common_runtime/function_threadpool_test.cc
@@ -39,6 +39,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/public/version.h"
@@ -153,7 +154,7 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
     Status status2 = Run(flr, handle, opts, args, std::move(rets));
     EXPECT_TRUE(errors::IsInvalidArgument(status2));
     EXPECT_TRUE(
-        StringPiece(status2.error_message()).contains("remote execution."));
+        str_util::StrContains(status2.error_message(), "remote execution."));
 
     return status;
   }
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index 8357cc5a7201b3b590c6965648eed72116167459..0b9e8f9cc2d8b65834c7963112427ac2e24a5789 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -257,6 +257,7 @@ BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name,
                                                          physical_device_desc)),
       gpu_allocator_(gpu_allocator),
       cpu_allocator_(cpu_allocator),
+      scoped_allocator_mgr_(new ScopedAllocatorMgr(name)),
       tf_gpu_id_(tf_gpu_id),
       sync_every_op_(sync_every_op),
       max_streams_(max_streams) {
@@ -840,6 +841,17 @@ void BaseGPUDevice::ReinitializeGpuDevice(OpKernelContext* context,
   }
 }
 
+Allocator* BaseGPUDevice::GetScopedAllocator(AllocatorAttributes attr,
+                                             int64 step_id) {
+  if (attr.scope_id > 0) {
+    return scoped_allocator_mgr_->GetContainer(step_id)->GetInstance(
+        attr.scope_id);
+  }
+  LOG(FATAL) << "Unexpected call to BaseGPUDevice::GetScopedAllocator "
+             << "attr.scope_id = " << attr.scope_id;
+  return gpu_allocator_;
+}
+
 const int BaseGPUDeviceFactory::InterconnectMap::kSameDeviceStrength = 1000;
 const int BaseGPUDeviceFactory::InterconnectMap::kStreamExecutorStrength = 1;
 
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h
index d817c7dd1f3af5656e48c3b2a0420270a7938447..cc5c3881dd24fec24c027406d8e4577e81042433 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.h
@@ -17,8 +17,8 @@ limitations under the License.
 #error This file must only be included when building with Cuda support
 #endif
 
-#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_GPU_DEVICE_H_
-#define TENSORFLOW_COMMON_RUNTIME_GPU_GPU_DEVICE_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_DEVICE_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_DEVICE_H_
 
 #include <memory>
 #include <string>
@@ -33,6 +33,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
 #include "tensorflow/core/common_runtime/gpu_device_context.h"
 #include "tensorflow/core/common_runtime/local_device.h"
+#include "tensorflow/core/common_runtime/scoped_allocator_mgr.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -95,11 +96,19 @@ class BaseGPUDevice : public LocalDevice {
   // corresponds to the cuda context.
   gpu::StreamExecutor* executor() const { return executor_; }
 
+  Allocator* GetScopedAllocator(AllocatorAttributes attr,
+                                int64 step_id) override;
+
+  ScopedAllocatorMgr* GetScopedAllocatorMgr() const override {
+    return scoped_allocator_mgr_.get();
+  }
+
  protected:
   Allocator* gpu_allocator_;  // not owned
   Allocator* cpu_allocator_;  // not owned
 
   gpu::StreamExecutor* executor_;  // not owned
+  std::unique_ptr<ScopedAllocatorMgr> scoped_allocator_mgr_;
 
  private:
   struct StreamGroup {
@@ -205,4 +214,4 @@ class BaseGPUDeviceFactory : public DeviceFactory {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_COMMON_RUNTIME_GPU_GPU_DEVICE_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_DEVICE_H_
diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc
index 2452efc77952a89dd7b01989f684ac04a8a5ca90..af6a59a85df1cf3dc6a78c4eb81b78a61d095954 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc
@@ -30,10 +30,6 @@ EventMgr::EventMgr(gpu::StreamExecutor* se, const GPUOptions& gpu_options)
       polling_active_delay_usecs_(gpu_options.polling_active_delay_usecs()
                                       ? gpu_options.polling_active_delay_usecs()
                                       : 10),
-      polling_inactive_delay_msecs_(
-          gpu_options.polling_inactive_delay_msecs()
-              ? gpu_options.polling_inactive_delay_msecs()
-              : 1),
       accumulated_stream_(nullptr),
       accumulated_tensors_(new TensorReferenceVector),
       accumulated_tensor_bytes_(0),
@@ -78,16 +74,22 @@ EventMgr::~EventMgr() {
 
 void EventMgr::StartPollingLoop() {
   CHECK(polling_stopped_ == nullptr);
-  stop_polling_.reset(new Notification);
+  {
+    mutex_lock l(mu_);
+    stop_polling_ = false;
+  }
   polling_stopped_.reset(new Notification);
   threadpool_.Schedule([this]() { PollLoop(); });
 }
 
 void EventMgr::StopPollingLoop() {
-  if (stop_polling_) {
-    stop_polling_->Notify();
+  if (polling_stopped_) {
+    {
+      mutex_lock l(mu_);
+      stop_polling_ = true;
+      events_pending_.notify_all();
+    }
     polling_stopped_->WaitForNotification();
-    stop_polling_.reset(nullptr);
     polling_stopped_.reset(nullptr);
   }
 }
@@ -121,28 +123,31 @@ void EventMgr::FlushAccumulatedTensors() {
   accumulated_stream_ = nullptr;
 }
 
-// A polling loop to detect completion of GPU events.  There's a
-// tradeoff between achieving low latency detection, which argues for
-// little delay between calls, and minimizing CPU use and lock
-// contention, which argue for longer delay.  The current strategy is
-// to poll frequently when the queue is non-empty, and infrequently
-// otherwise.
+// A polling loop to detect completion of GPU events.
+//
+// While one or more events is outstanding, poll for completed events.  When no
+// events are outstanding, we sleep until one is enqueued.
 void EventMgr::PollLoop() {
-  bool queue_empty = false;
-  while (!stop_polling_->HasBeenNotified()) {
-    if (queue_empty) {
-      mutex_lock l(mu_);
-      WaitForMilliseconds(&l, &events_pending_, polling_inactive_delay_msecs_);
-    } else {
-      Env::Default()->SleepForMicroseconds(polling_active_delay_usecs_);
-    }
-    ToFreeVector to_free;
+  ToFreeVector to_free;
+  while (true) {
+    bool events_still_pending;
     {
       mutex_lock l(mu_);
+      if (stop_polling_) {
+        break;
+      }
+      if (used_events_.empty()) {
+        events_pending_.wait(l);
+      }
       PollEvents(true, &to_free);
-      queue_empty = used_events_.empty();
+      events_still_pending = !used_events_.empty();
     }
     FreeMemory(to_free);
+    to_free.clear();
+
+    if (events_still_pending) {
+      Env::Default()->SleepForMicroseconds(polling_active_delay_usecs_);
+    }
   }
   polling_stopped_->Notify();
 }
diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h
index 9692b24084ab577c3d27ed32248d430fd0d65fa0..d23898e1f26a2e0c8363f6080c0b8e301ec7fd67 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h
@@ -94,7 +94,6 @@ class EventMgr {
   perftools::gputools::StreamExecutor* const exec_;
   const int64 deferred_bytes_threshold_;
   const int32 polling_active_delay_usecs_;
-  const int32 polling_inactive_delay_msecs_;
   mutex mu_;
   condition_variable events_pending_ GUARDED_BY(mu_);
 
@@ -180,7 +179,7 @@ class EventMgr {
   // A FIFO queue of InUse events and associated tensors.
   std::deque<InUse> used_events_ GUARDED_BY(mu_);
 
-  std::unique_ptr<Notification> stop_polling_;
+  bool stop_polling_ GUARDED_BY(mu_);
   std::unique_ptr<Notification> polling_stopped_;
 
   // The main PollLoop for the event manager runs in this threadpool.
diff --git a/tensorflow/core/common_runtime/graph_execution_state.cc b/tensorflow/core/common_runtime/graph_execution_state.cc
index f5e3d782428875ddf8cf4002f29f8ec71ea0eac5..2f17af273ff8cdc83a112ef350fde88346c7e13d 100644
--- a/tensorflow/core/common_runtime/graph_execution_state.cc
+++ b/tensorflow/core/common_runtime/graph_execution_state.cc
@@ -237,6 +237,42 @@ void GraphExecutionState::RestoreStatefulNodes(Graph* graph) {
   }
 }
 
+Status GraphExecutionState::PruneGraph(
+    const BuildGraphOptions& options, Graph* graph,
+    subgraph::RewriteGraphMetadata* out_rewrite_metadata) {
+  std::vector<std::unique_ptr<subgraph::PruneRewrite>> feed_rewrites;
+  feed_rewrites.reserve(options.callable_options.feed_size());
+  std::vector<std::unique_ptr<subgraph::PruneRewrite>> fetch_rewrites;
+  fetch_rewrites.reserve(options.callable_options.fetch_size());
+  const DeviceAttributes* device_info =
+      &device_set_->client_device()->attributes();
+  if (options.use_function_convention) {
+    for (int i = 0; i < options.callable_options.feed_size(); ++i) {
+      feed_rewrites.emplace_back(new subgraph::ArgFeedRewrite(
+          &options.callable_options.feed(i), device_info, i));
+    }
+    for (int i = 0; i < options.callable_options.fetch_size(); ++i) {
+      fetch_rewrites.emplace_back(new subgraph::RetvalFetchRewrite(
+          &options.callable_options.fetch(i), device_info, i));
+    }
+  } else {
+    for (const string& feed : options.callable_options.feed()) {
+      feed_rewrites.emplace_back(
+          new subgraph::RecvFeedRewrite(&feed, device_info));
+    }
+    for (const string& fetch : options.callable_options.fetch()) {
+      fetch_rewrites.emplace_back(
+          new subgraph::SendFetchRewrite(&fetch, device_info));
+    }
+  }
+  std::vector<string> target_node_names(
+      options.callable_options.target().begin(),
+      options.callable_options.target().end());
+  return subgraph::RewriteGraphForExecution(graph, feed_rewrites,
+                                            fetch_rewrites, target_node_names,
+                                            out_rewrite_metadata);
+}
+
 Status GraphExecutionState::InitBaseGraph(const BuildGraphOptions& options) {
   const GraphDef* graph_def = &original_graph_def_;
 
@@ -251,10 +287,8 @@ Status GraphExecutionState::InitBaseGraph(const BuildGraphOptions& options) {
       session_options_->config.graph_options().place_pruned_graph()) {
     // Rewrite the graph before placement.
     rewrite_metadata_.reset(new subgraph::RewriteGraphMetadata);
-    TF_RETURN_IF_ERROR(subgraph::RewriteGraphForExecution(
-        new_graph.get(), options.callable_options,
-        device_set_->client_device()->attributes(),
-        options.use_function_convention, rewrite_metadata_.get()));
+    TF_RETURN_IF_ERROR(
+        PruneGraph(options, new_graph.get(), rewrite_metadata_.get()));
   }
 
   // Save stateful placements before placing.
@@ -404,12 +438,7 @@ Status GraphExecutionState::BuildGraph(const BuildGraphOptions& options,
   subgraph::RewriteGraphMetadata rewrite_metadata;
   if (session_options_ == nullptr ||
       !session_options_->config.graph_options().place_pruned_graph()) {
-    // Extract the subset of the graph that needs to be run, adding feed/fetch
-    // ops as needed.
-    TF_RETURN_IF_ERROR(subgraph::RewriteGraphForExecution(
-        ng.get(), options.callable_options,
-        device_set_->client_device()->attributes(),
-        options.use_function_convention, &rewrite_metadata));
+    TF_RETURN_IF_ERROR(PruneGraph(options, ng.get(), &rewrite_metadata));
   } else {
     // This GraphExecutionState represents a graph that was
     // pruned when this was constructed, so we copy the metadata from
diff --git a/tensorflow/core/common_runtime/graph_execution_state.h b/tensorflow/core/common_runtime/graph_execution_state.h
index 2312e1a89fd1fd5734fab4316c25ca2e39f16ae5..2154ef5bd3e09f69728360e62b435354ca33e160 100644
--- a/tensorflow/core/common_runtime/graph_execution_state.h
+++ b/tensorflow/core/common_runtime/graph_execution_state.h
@@ -177,6 +177,11 @@ class GraphExecutionState {
   void SaveStatefulNodes(Graph* graph);
   void RestoreStatefulNodes(Graph* graph);
 
+  // Extract the subset of the graph that needs to be run, adding feed/fetch
+  // ops as needed.
+  Status PruneGraph(const BuildGraphOptions& options, Graph* graph,
+                    subgraph::RewriteGraphMetadata* out_rewrite_metadata);
+
   Status OptimizeGraph(const BuildGraphOptions& options,
                        std::unique_ptr<Graph>* optimized_graph);
 
diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.cc b/tensorflow/core/common_runtime/mkl_cpu_allocator.cc
index 43a909466ed4b6fe6ea32b1ad72a1154390288ac..829c19204af19119667fb455aad6505b388de94e 100644
--- a/tensorflow/core/common_runtime/mkl_cpu_allocator.cc
+++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.cc
@@ -19,9 +19,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-constexpr const char* MklCPUAllocator::kMaxLimitStr;
-constexpr const size_t MklCPUAllocator::kDefaultMaxLimit;
-
 }  // namespace tensorflow
 
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/common_runtime/placer.cc b/tensorflow/core/common_runtime/placer.cc
index e128b9257f2369e25c911f9a9e1d08475706d561..86851c2c075a60a57c6f169cbc7ad81253a94227 100644
--- a/tensorflow/core/common_runtime/placer.cc
+++ b/tensorflow/core/common_runtime/placer.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 
 namespace tensorflow {
 
@@ -151,7 +152,8 @@ class ColocationGraph {
       if (attr_value != nullptr && attr_value->has_list()) {
         for (const string& class_spec : attr_value->list().s()) {
           StringPiece spec(class_spec);
-          if (spec.Consume(kColocationGroupPrefixStringPiece)) {
+          if (str_util::ConsumePrefix(&spec,
+                                      kColocationGroupPrefixStringPiece)) {
             found_spec = true;
             TF_RETURN_IF_ERROR(
                 ColocateNodeToGroup(&colocation_group_root, node, spec));
diff --git a/tensorflow/core/common_runtime/placer_test.cc b/tensorflow/core/common_runtime/placer_test.cc
index 098024d2195aad8ef651120181ab271be168f92a..5ad251c892f175dceccc0304bceedc1405bc0123 100644
--- a/tensorflow/core/common_runtime/placer_test.cc
+++ b/tensorflow/core/common_runtime/placer_test.cc
@@ -34,6 +34,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -262,9 +263,9 @@ class PlacerTest : public ::testing::Test {
                 ->attributes()                                          \
                 .device_type())
 
-#define EXPECT_DEVICE_CONTAINS(g, name, device_substr)                        \
-  EXPECT_TRUE(StringPiece(GetNodeByName((g), (name))->assigned_device_name()) \
-                  .contains(device_substr))
+#define EXPECT_DEVICE_CONTAINS(g, name, device_substr) \
+  EXPECT_TRUE(::tensorflow::str_util::StrContains(     \
+      GetNodeByName((g), (name))->assigned_device_name(), device_substr))
 
 // Test that a graph with no constraints will successfully assign nodes to the
 // "best available" device (i.e. prefer GPU over CPU).
@@ -488,11 +489,10 @@ TEST_F(PlacerTest, TestAssignedGpuDeviceToCpuDevice) {
 
   Status s = Place(&g);
   EXPECT_EQ(error::INTERNAL, s.code());
-  EXPECT_TRUE(
-      StringPiece(s.error_message())
-          .contains(
-              "Assigned device '/job:a/replica:0/task:0/device:fakegpu:0' "
-              "does not have registered OpKernel support for TestInput"));
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(),
+      "Assigned device '/job:a/replica:0/task:0/device:fakegpu:0' "
+      "does not have registered OpKernel support for TestInput"));
 }
 
 // Test that graphs with reference connections are correctly placed.
@@ -541,15 +541,15 @@ TEST_F(PlacerTest, TestReferenceConnection) {
   {
     Status s = ReferenceTestHelper("VariableCPU", "AssignGPU", "FakeCPU");
     EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-    EXPECT_TRUE(StringPiece(s.error_message())
-                    .contains("no device type supports both of those nodes"));
+    EXPECT_TRUE(str_util::StrContains(
+        s.error_message(), "no device type supports both of those nodes"));
   }
   TF_EXPECT_OK(ReferenceTestHelper("VariableGPU", "TestAssign", "FakeGPU"));
   {
     Status s = ReferenceTestHelper("VariableGPU", "AssignCPU", "FakeCPU");
     EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-    EXPECT_TRUE(StringPiece(s.error_message())
-                    .contains("no device type supports both of those nodes"));
+    EXPECT_TRUE(str_util::StrContains(
+        s.error_message(), "no device type supports both of those nodes"));
   }
   TF_EXPECT_OK(ReferenceTestHelper("VariableGPU", "AssignGPU", "FakeGPU"));
 }
@@ -760,8 +760,9 @@ TEST_F(PlacerTest, TestInvalidMultipleColocationGroups) {
   }
 
   Status s = Place(&g);
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("Cannot colocate nodes 'foo' and 'in' because no "
+  EXPECT_TRUE(
+      str_util::StrContains(s.error_message(),
+                            "Cannot colocate nodes 'foo' and 'in' because no "
                             "device type supports both of those nodes and the "
                             "other nodes colocated with them"));
 }
@@ -824,11 +825,11 @@ TEST_F(PlacerTest, TestColocationGroupWithUnsatisfiableReferenceConnections) {
   }
 
   Status s = Place(&g);
-  EXPECT_TRUE(
-      StringPiece(s.error_message())
-          .contains("Cannot colocate nodes 'var3' and 'assign3' because no "
-                    "device type supports both of those nodes and the other "
-                    "nodes colocated with them."));
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(),
+      "Cannot colocate nodes 'var3' and 'assign3' because no "
+      "device type supports both of those nodes and the other "
+      "nodes colocated with them."));
 }
 
 TEST_F(PlacerTest, TestColocationAndReferenceConnections) {
@@ -888,7 +889,7 @@ TEST_F(PlacerTest, TestEmptyDeviceSet) {
 
   Status s = Place(&g, &empty);
   EXPECT_TRUE(
-      StringPiece(s.error_message()).contains("No devices are registered"));
+      str_util::StrContains(s.error_message(), "No devices are registered"));
 }
 
 // Test that placement fails when the requested device forces an
@@ -913,16 +914,17 @@ TEST_F(PlacerTest, TestHeterogeneousDeviceSetFailure) {
   heterogeneous.AddDevice(cpu.get());
   Status s = Place(&g, &heterogeneous);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("colocated with a group of nodes that required "
+  EXPECT_TRUE(
+      str_util::StrContains(s.error_message(),
+                            "colocated with a group of nodes that required "
                             "incompatible device"));
 
   // The error message should contain information that indicates which
   // op types have which registered device types.
-  EXPECT_TRUE(StringPiece(s.error_message()).contains("VariableGPU: FakeGPU"))
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), "VariableGPU: FakeGPU"))
       << s;
   EXPECT_TRUE(
-      StringPiece(s.error_message()).contains("TestAssign: FakeGPU FakeCPU"))
+      str_util::StrContains(s.error_message(), "TestAssign: FakeGPU FakeCPU"))
       << s;
 }
 
@@ -937,7 +939,7 @@ TEST_F(PlacerTest, TestUnknownDevice) {
 
   Status s = Place(&g);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(StringPiece(s.error_message()).contains("/job:foo"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), "/job:foo"));
 }
 
 // Test that placement fails when the combination of partial
@@ -952,7 +954,7 @@ TEST_F(PlacerTest, TestUnknownMergedDevice) {
 
   Status s = Place(&g);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(StringPiece(s.error_message()).contains("/job:foo"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), "/job:foo"));
 }
 
 // Test that placement fails when the previously-assigned device for a
@@ -969,9 +971,9 @@ TEST_F(PlacerTest, TestUnknownAssignedDevice) {
 
   Status s = Place(&g);
   EXPECT_EQ(error::INTERNAL, s.code());
-  EXPECT_TRUE(
-      StringPiece(s.error_message())
-          .contains("Assigned device '/job:foo' does not match any device"));
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(),
+      "Assigned device '/job:foo' does not match any device"));
 }
 
 // Test that placement fails when an op with no registered kernels is
@@ -986,12 +988,11 @@ TEST_F(PlacerTest, TestNoKernelsRegistered) {
 
   Status s = Place(&g);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(),
+      "No OpKernel was registered to support Op 'VariableNoKernels'"));
   EXPECT_TRUE(
-      StringPiece(s.error_message())
-          .contains(
-              "No OpKernel was registered to support Op 'VariableNoKernels'"));
-  EXPECT_TRUE(
-      StringPiece(s.error_message()).contains("<no registered kernels>"));
+      str_util::StrContains(s.error_message(), "<no registered kernels>"));
 }
 
 // Test that placement fails when a kernel is registered but no known
@@ -1011,10 +1012,10 @@ TEST_F(PlacerTest, TestNoDevicesRegistered) {
 
   Status s = Place(&g, &cpu_only);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("No OpKernel was registered to support "
-                            "Op 'VariableGPU'"));
-  EXPECT_TRUE(StringPiece(s.error_message()).contains("device='FakeGPU'"));
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(),
+      "No OpKernel was registered to support Op 'VariableGPU'"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), "device='FakeGPU'"));
 }
 
 // Test that placement fails when a requested device is malformed.
@@ -1028,8 +1029,8 @@ TEST_F(PlacerTest, TestMalformedDeviceSpecification) {
 
   Status s = Place(&g);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("Malformed device specification '/foo:bar'"));
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(), "Malformed device specification '/foo:bar'"));
 }
 
 // Test that placement fails when a previously-assigned device is malformed.
@@ -1045,8 +1046,8 @@ TEST_F(PlacerTest, TestMalformedAssignedDevice) {
 
   Status s = Place(&g);
   EXPECT_EQ(error::INTERNAL, s.code());
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("Malformed assigned device '/foo:bar'"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(),
+                                    "Malformed assigned device '/foo:bar'"));
 }
 
 // Test that placement fails when a device was previously assigned to
@@ -1063,9 +1064,8 @@ TEST_F(PlacerTest, TestNonUniqueAssignedDevice) {
 
   Status s = Place(&g);
   EXPECT_EQ(error::INTERNAL, s.code());
-  EXPECT_TRUE(
-      StringPiece(s.error_message())
-          .contains("Assigned device '/job:a' does not match any device"));
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(), "Assigned device '/job:a' does not match any device"));
 }
 
 // Test that ops request to be placed on non-existent devices will be relocated
@@ -1099,7 +1099,7 @@ TEST_F(PlacerTest, TestNonexistentGpuNoAllowSoftPlacement) {
   SessionOptions options;
   Status s = Place(&g, &options);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(StringPiece(s.error_message()).contains("/device:fakegpu:11"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), "/device:fakegpu:11"));
 }
 
 // Test that placement fails when a node requests an explicit device that is not
@@ -1116,10 +1116,10 @@ TEST_F(PlacerTest, TestUnsupportedDeviceNoAllowSoftPlacement) {
   SessionOptions options;
   Status s = Place(&g, &options);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(StringPiece(s.error_message()).contains("/device:fakecpu:0"));
-  EXPECT_TRUE(
-      StringPiece(s.error_message())
-          .contains("no supported kernel for fakecpu devices is available"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), "/device:fakecpu:0"));
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(),
+      "no supported kernel for fakecpu devices is available"));
 }
 
 // Test that placement fails when a node requests an explicit device that is not
@@ -1137,9 +1137,9 @@ TEST_F(PlacerTest, TestNonExistentDevice) {
   Status s = Place(&g, &options);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
   LOG(WARNING) << s.error_message();
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("was explicitly assigned to /job:foo/replica:17 "
-                            "but available devices"));
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(),
+      "was explicitly assigned to /job:foo/replica:17 but available devices"));
 }
 
 TEST_F(PlacerTest, TestUnsupportedDeviceAllowSoftPlacement) {
@@ -1205,8 +1205,8 @@ TEST_F(PlacerTest, TestUnsatisfiableConstraintWithReferenceConnections) {
 
   Status s = Place(&g);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("Cannot colocate nodes 'var' and 'assign'"));
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(), "Cannot colocate nodes 'var' and 'assign'"));
 }
 
 // Test that a generator node follows its consumers (where there are several
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.h b/tensorflow/core/common_runtime/process_function_library_runtime.h
index d69e8bc2a049e9d71ca4ef0298dfe0dc058f2c45..c7b8259f7872d623e112a5e0b21a3851461ba803 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.h
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.h
@@ -155,7 +155,10 @@ class ProcessFunctionLibraryRuntime {
 
     string target_device() { return target_device_; }
 
-    FunctionLibraryRuntime::LocalHandle local_handle() { return local_handle_; }
+    FunctionLibraryRuntime::LocalHandle local_handle() {
+      mutex_lock l(mu_);
+      return local_handle_;
+    }
 
     // Initializes the FunctionData object by potentially making an Initialize
     // call to the DistributedFunctionLibraryRuntime.
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
index 2da67b084a04067d56f66dfca208287aa04d7b46..4fbf2abc6714bb27abb76b7f7b791868d0b3bdb7 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
+++ b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/public/version.h"
@@ -132,7 +133,7 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test {
                    });
     done2.WaitForNotification();
     EXPECT_TRUE(errors::IsNotFound(status));
-    EXPECT_TRUE(StringPiece(status.error_message()).contains("not found."));
+    EXPECT_TRUE(str_util::StrContains(status.error_message(), "not found."));
 
     return Status::OK();
   }
diff --git a/tensorflow/core/common_runtime/process_util.cc b/tensorflow/core/common_runtime/process_util.cc
index d738fef7be5b04f166651644542b2eadbe38715d..7ff360ee2677de033a2a99b4656d5f2044521efd 100644
--- a/tensorflow/core/common_runtime/process_util.cc
+++ b/tensorflow/core/common_runtime/process_util.cc
@@ -15,6 +15,9 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/process_util.h"
 
+#ifdef INTEL_MKL
+#include <omp.h>
+#endif
 #include <string.h>
 
 #include "tensorflow/core/lib/core/threadpool.h"
@@ -46,6 +49,34 @@ thread::ThreadPool* ComputePool(const SessionOptions& options) {
   return compute_pool;
 }
 
+int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) {
+  const int32 inter_op = options.config.inter_op_parallelism_threads();
+  if (inter_op != 0) return inter_op;
+#ifdef INTEL_MKL
+  // MKL library executes ops in parallel using OMP threads
+  // Set inter_op conservatively to avoid thread oversubscription that could 
+  // lead to severe perf degradations and OMP resource exhaustion
+  const int mkl_intra_op = omp_get_max_threads();
+  CHECK_GE(mkl_intra_op, 1);
+  const int32 mkl_inter_op = std::max(
+          (port::NumSchedulableCPUs() + mkl_intra_op - 1) / mkl_intra_op, 2);
+  VLOG(0) << "Creating new thread pool with default inter op setting: "
+          << mkl_inter_op
+          << ". Tune using inter_op_parallelism_threads for best performance.";
+  return mkl_inter_op;
+#else
+  // Default to using the number of cores available in the process.
+  return port::NumSchedulableCPUs();
+#endif
+}
+
+thread::ThreadPool* NewThreadPoolFromSessionOptions(
+    const SessionOptions& options) {
+  const int32 num_threads = NumInterOpThreadsFromSessionOptions(options);
+  VLOG(1) << "Direct session inter op parallelism threads: " << num_threads;
+  return new thread::ThreadPool(options.env, "Compute", num_threads);
+}
+
 void SchedClosure(std::function<void()> closure) {
   if (port::Tracing::IsActive()) {
     const uint64 id = port::Tracing::UniqueId();
diff --git a/tensorflow/core/common_runtime/process_util.h b/tensorflow/core/common_runtime/process_util.h
index fc3a262fe1c14856819361f29ea9066193181695..5d9266671617320eea4cea60de1ebd7210f3b674 100644
--- a/tensorflow/core/common_runtime/process_util.h
+++ b/tensorflow/core/common_runtime/process_util.h
@@ -30,6 +30,13 @@ namespace tensorflow {
 // using 'options'.  Caller does not take ownership over threadpool.
 thread::ThreadPool* ComputePool(const SessionOptions& options);
 
+// Returns number of inter op threads.
+int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options);
+
+// Creates a thread pool with number of inter op threads.
+thread::ThreadPool* NewThreadPoolFromSessionOptions(
+    const SessionOptions& options);
+
 // Schedule "closure" in the default thread queue.
 void SchedClosure(std::function<void()> closure);
 
diff --git a/tensorflow/core/common_runtime/process_util_test.cc b/tensorflow/core/common_runtime/process_util_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..46672ac92eef1c1da608720f17a30417f92d04bd
--- /dev/null
+++ b/tensorflow/core/common_runtime/process_util_test.cc
@@ -0,0 +1,38 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/process_util.h"
+
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+TEST(ProcessUtilTest, NumThreads) {
+  SessionOptions opts;
+  opts.config.set_inter_op_parallelism_threads(10);
+  EXPECT_EQ(10, NumInterOpThreadsFromSessionOptions(opts));
+}
+
+TEST(ProcessUtilTest, ThreadPool) {
+  SessionOptions opts;
+  opts.config.set_inter_op_parallelism_threads(10);
+
+  thread::ThreadPool* pool = NewThreadPoolFromSessionOptions(opts);
+  EXPECT_EQ(10, pool->NumThreads());
+  delete pool;
+}
+
+}  // anonymous namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/scoped_allocator.cc b/tensorflow/core/common_runtime/scoped_allocator.cc
index 31e7a5e3e224d381a9d6ca21396acb5523a528a2..a26672b79dab87d66ac98a8436cc7e2df7473677 100644
--- a/tensorflow/core/common_runtime/scoped_allocator.cc
+++ b/tensorflow/core/common_runtime/scoped_allocator.cc
@@ -75,7 +75,8 @@ void* ScopedAllocator::AllocateRaw(int32 field_index, size_t num_bytes) {
   if (num_bytes != f.bytes) {
     LOG(ERROR) << "ScopedAllocator " << name_ << " got request for "
                << num_bytes << " bytes from field " << field_index
-               << " which has precalculated size " << f.bytes;
+               << " which has precalculated size " << f.bytes << " and offset "
+               << f.offset;
     return nullptr;
   }
 
diff --git a/tensorflow/core/common_runtime/scoped_allocator_mgr.cc b/tensorflow/core/common_runtime/scoped_allocator_mgr.cc
index d0d05c6d1bbff317a145be0b03961631cdcfd803..be79cc4507124fbbef8104f87773045c07005ef6 100644
--- a/tensorflow/core/common_runtime/scoped_allocator_mgr.cc
+++ b/tensorflow/core/common_runtime/scoped_allocator_mgr.cc
@@ -22,7 +22,7 @@ namespace tensorflow {
 Status ScopedAllocatorContainer::AddScopedAllocator(
     const Tensor& backing_tensor, int32 scope_id, const string& scope_name,
     const gtl::ArraySlice<ScopedAllocator::Field>& fields,
-    int32 expected_call_count, ScopedAllocator** sa_ptr) {
+    int32 expected_call_count) {
   VLOG(1) << "AddScopedAllocator " << mgr_->device_name()
           << " step_id_=" << step_id_ << " scope_id=" << scope_id;
   mutex_lock l(mu_);
@@ -41,17 +41,17 @@ Status ScopedAllocatorContainer::AddScopedAllocator(
     }
   }
   VLOG(2) << " container " << this << " step_id " << step_id_;
-  *sa_ptr = new ScopedAllocator(backing_tensor, scope_id, scope_name, fields,
-                                expected_call_count, this);
-  allocators_[scope_id] = ScopedAllocatorContainer::SAField(
-      ScopedAllocator::kBackingIndex, *sa_ptr);
+  ScopedAllocator* sa = new ScopedAllocator(
+      backing_tensor, scope_id, scope_name, fields, expected_call_count, this);
+  allocators_[scope_id] =
+      ScopedAllocatorContainer::SAField(ScopedAllocator::kBackingIndex, sa);
   VLOG(2) << "#fields " << fields.size();
   for (int i = 0; i < fields.size(); ++i) {
     const ScopedAllocator::Field& f = fields[i];
     VLOG(2) << "Adding instance with for " << mgr_->device_name()
             << " scope_id=" << f.scope_id;
     allocators_[f.scope_id] = ScopedAllocatorContainer::SAField(
-        i, new ScopedAllocatorInstance(*sa_ptr, i));
+        i, new ScopedAllocatorInstance(sa, i));
   }
   return Status::OK();
 }
@@ -103,7 +103,7 @@ ScopedAllocatorContainer::~ScopedAllocatorContainer() {
   // In normal execution the table should be empty and all of its
   // contents deleted via Drop.  When when a step ends early
   // (e.g. through abnormal termination) we need to clean up
-  // explicitly.  So long as graph exection of the associated step has
+  // explicitly.  So long as graph execution of the associated step has
   // completey terminated this should be safe.
   for (auto& it : allocators_) {
     if (it.second.field_index == ScopedAllocator::kBackingIndex) {
@@ -154,23 +154,26 @@ Status ScopedAllocatorMgr::AddScopedAllocator(
     const Tensor& backing_tensor, int64 step_id, int32 scope_id,
     const string& scope_name,
     const gtl::ArraySlice<ScopedAllocator::Field>& fields,
-    int32 expected_call_count, ScopedAllocator** sa_ptr) {
+    int32 expected_call_count) {
   ScopedAllocatorContainer* sac = GetContainer(step_id);
   return sac->AddScopedAllocator(backing_tensor, scope_id, scope_name, fields,
-                                 expected_call_count, sa_ptr);
+                                 expected_call_count);
 }
 
 void ScopedAllocatorMgr::PopulateFields(
-    int32 scope_id, const gtl::ArraySlice<TensorShape>& shapes, DataType dtype,
-    std::vector<ScopedAllocator::Field>* fields) {
+    int32 scope_id, const gtl::ArraySlice<TensorShape>& shapes,
+    const DataType dtype, std::vector<ScopedAllocator::Field>* fields) {
   const int32 num_fields = static_cast<int32>(shapes.size());
   fields->resize(num_fields);
   size_t offset = 0;
   for (int32 i = 0; i < num_fields; ++i) {
-    size_t bytes = shapes[i].num_elements() * sizeof(dtype);
+    size_t bytes = shapes[i].num_elements() * DataTypeSize(dtype);
     (*fields)[i].scope_id = scope_id + 1 + i;
     (*fields)[i].bytes = bytes;
     (*fields)[i].offset = offset;
+    VLOG(1) << "field=" << i << " scope_id=" << (*fields)[i].scope_id
+            << " bytes=" << (*fields)[i].bytes
+            << " offset=" << (*fields)[i].offset;
     offset += bytes;
     size_t overshoot = offset % Allocator::kAllocatorAlignment;
     if (overshoot > 0) {
diff --git a/tensorflow/core/common_runtime/scoped_allocator_mgr.h b/tensorflow/core/common_runtime/scoped_allocator_mgr.h
index 4d5bc23dd9cb096c4a893af65672a035d344a04c..effc5f2d775336621a783d83b7dd5eece6d42292 100644
--- a/tensorflow/core/common_runtime/scoped_allocator_mgr.h
+++ b/tensorflow/core/common_runtime/scoped_allocator_mgr.h
@@ -34,7 +34,7 @@ class ScopedAllocatorContainer : public core::RefCounted {
   Status AddScopedAllocator(
       const Tensor& backing_tensor, int32 scope_id, const string& scope_name,
       const gtl::ArraySlice<ScopedAllocator::Field>& fields,
-      int32 expected_call_count, ScopedAllocator** sa_ptr);
+      int32 expected_call_count);
 
   ScopedAllocatorInstance* GetInstance(int32 scope_id);
   ScopedAllocator* GetAllocator(int32 scope_id);
@@ -83,7 +83,7 @@ class ScopedAllocatorMgr {
       const Tensor& backing_tensor, int64 step_id, int32 scope_id,
       const string& scope_name,
       const gtl::ArraySlice<ScopedAllocator::Field>& fields,
-      int32 expected_call_count, ScopedAllocator** sa_ptr);
+      int32 expected_call_count);
 
   void Cleanup(int64 step_id);
 
@@ -91,7 +91,7 @@ class ScopedAllocatorMgr {
   // consecutive scope_id values following that of the base ScopedAllocator.
   static void PopulateFields(int32 scope_id,
                              const gtl::ArraySlice<TensorShape>& shapes,
-                             DataType dtype,
+                             const DataType dtype,
                              std::vector<ScopedAllocator::Field>* fields);
 
   const string& device_name() const { return device_name_; }
diff --git a/tensorflow/core/common_runtime/scoped_allocator_mgr_test.cc b/tensorflow/core/common_runtime/scoped_allocator_mgr_test.cc
index 81cb3e7979d4e52c7239c942441375ea1c5146ad..38e07e47f24d6808e858721f8e7832668a556164 100644
--- a/tensorflow/core/common_runtime/scoped_allocator_mgr_test.cc
+++ b/tensorflow/core/common_runtime/scoped_allocator_mgr_test.cc
@@ -25,7 +25,7 @@ namespace {
 
 class ScopedAllocatorMgrTest : public ::testing::Test {
  public:
-  ScopedAllocatorMgrTest() : sam_("CPU0"), sa_(nullptr) {}
+  ScopedAllocatorMgrTest() : sam_("CPU0") {}
 
   void InitTensor() {
     backing_tensor_ = Tensor(cpu_allocator(), DT_FLOAT, backing_tensor_shape_);
@@ -42,7 +42,7 @@ class ScopedAllocatorMgrTest : public ::testing::Test {
             << " expected_use_count " << expected_use_count;
     return sam_.AddScopedAllocator(backing_tensor_, step_id_, scope_id,
                                    "tensor_shape_599", fields_,
-                                   expected_use_count, &sa_);
+                                   expected_use_count);
   }
 
   Status PrepScopedAllocatorMgr(int expected_use_count) {
@@ -87,7 +87,6 @@ class ScopedAllocatorMgrTest : public ::testing::Test {
   std::vector<TensorShape> fields_shapes_;
   std::vector<ScopedAllocator::Field> fields_;
   ScopedAllocatorMgr sam_;
-  ScopedAllocator* sa_;
   const int step_id_ = 101;
   const int scope_id_ = 599;
   std::vector<ScopedAllocatorInstance*> sa_instances_;
@@ -138,9 +137,9 @@ TEST_F(ScopedAllocatorMgrTest, ContainerAddAllocator) {
 
   // Cleanup the instances by invoking allocate and deallocate.
   void* ptr0 =
-      sa_instances_[0]->AllocateRaw(0 /* alignment */, 512 * sizeof(DT_FLOAT));
+      sa_instances_[0]->AllocateRaw(0 /* alignment */, 512 * sizeof(float));
   void* ptr1 =
-      sa_instances_[1]->AllocateRaw(0 /* alignment */, 512 * sizeof(DT_FLOAT));
+      sa_instances_[1]->AllocateRaw(0 /* alignment */, 512 * sizeof(float));
   sa_instances_[0]->DeallocateRaw(ptr0);
   sa_instances_[1]->DeallocateRaw(ptr1);
 }
@@ -153,7 +152,6 @@ TEST_F(ScopedAllocatorMgrTest, AllocatorSuccess) {
   fields_shapes_ = std::vector<TensorShape>({{512}, {3, 3}, {2, 256}});
   Status s = PrepScopedAllocatorMgr(3);
   other = sac->GetAllocator(scope_id_);
-  EXPECT_EQ(other, sa_);
 
   ScopedAllocatorInstance* inst0 = sac->GetInstance(scope_id_ + 1);
   char* ptr0 = static_cast<char*>(inst0->AllocateRaw(0, 512 * sizeof(float)));
@@ -187,8 +185,7 @@ TEST_F(ScopedAllocatorMgrTest, AllocatorInitFail) {
   fields_.resize(1);
   fields_[0].scope_id = scope_id_ + 1;
   fields_[0].offset = 0;
-  fields_[0].bytes =
-      backing_tensor_shape_.num_elements() * 2 * sizeof(DT_FLOAT);
+  fields_[0].bytes = backing_tensor_shape_.num_elements() * 2 * sizeof(float);
   // fields[0].offset + fields[0].bytes is larger than the size of the backing
   // tensor, so this check should fail
   EXPECT_DEATH(Status s = AddScopedAllocator(1, scope_id_), "");
@@ -208,20 +205,20 @@ TEST_F(ScopedAllocatorMgrTest, AllocatorFail) {
   // so we need to explicitly delete the instances to avoid a memleak.
   SaveInstances(fields_shapes_.size());
 
-  char* ptr0 = static_cast<char*>(
-      sa_instances_[0]->AllocateRaw(0, 512 * sizeof(DT_FLOAT)));
+  char* ptr0 =
+      static_cast<char*>(sa_instances_[0]->AllocateRaw(0, 512 * sizeof(float)));
   VLOG(2) << "Should fail because we deallocate ptr="
           << static_cast<void*>(ptr0 + 8) << " which we never allocated.";
   EXPECT_DEATH(sa_instances_[0]->DeallocateRaw(ptr0 + 8), "");
   VLOG(2) << "Should fail because we allocate smaller than the size of the "
           << "field.";
-  EXPECT_EQ(nullptr, sa_instances_[1]->AllocateRaw(0, 256 * sizeof(DT_FLOAT)));
+  EXPECT_EQ(nullptr, sa_instances_[1]->AllocateRaw(0, 256 * sizeof(float)));
   VLOG(2) << "Should fail because we allocate larger than the size of the "
           << "field.";
-  EXPECT_EQ(nullptr, sa_instances_[1]->AllocateRaw(0, 1024 * sizeof(DT_FLOAT)));
-  void* ptr1 = sa_instances_[1]->AllocateRaw(0, 512 * sizeof(DT_FLOAT));
+  EXPECT_EQ(nullptr, sa_instances_[1]->AllocateRaw(0, 1024 * sizeof(float)));
+  void* ptr1 = sa_instances_[1]->AllocateRaw(0, 512 * sizeof(float));
   VLOG(2) << "Should fail because we exceed expected_use_count.";
-  EXPECT_EQ(nullptr, sa_instances_[0]->AllocateRaw(0, 512 * sizeof(DT_FLOAT)));
+  EXPECT_EQ(nullptr, sa_instances_[0]->AllocateRaw(0, 512 * sizeof(float)));
   sa_instances_[0]->DeallocateRaw(ptr0);
   sa_instances_[1]->DeallocateRaw(ptr1);
 }
diff --git a/tensorflow/core/common_runtime/session_test.cc b/tensorflow/core/common_runtime/session_test.cc
index a074154450694e2135e07f345082393773d97084..feaf29c7bb528c6019da3ae273681997173fd372 100644
--- a/tensorflow/core/common_runtime/session_test.cc
+++ b/tensorflow/core/common_runtime/session_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/public/session.h"
 
 #include "tensorflow/core/common_runtime/session_factory.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session_options.h"
 
@@ -31,10 +32,9 @@ TEST(SessionTest, InvalidTargetReturnsNull) {
   Session* session;
   Status s = tensorflow::NewSession(options, &session);
   EXPECT_EQ(s.code(), error::NOT_FOUND);
-  EXPECT_TRUE(
-      StringPiece(s.error_message())
-          .contains(
-              "No session factory registered for the given session options"));
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(),
+      "No session factory registered for the given session options"));
 }
 
 // Register a fake session factory to test error handling paths in
@@ -44,7 +44,7 @@ class FakeSessionFactory : public SessionFactory {
   FakeSessionFactory() {}
 
   bool AcceptsOptions(const SessionOptions& options) override {
-    return StringPiece(options.target).starts_with("fake");
+    return str_util::StartsWith(options.target, "fake");
   }
 
   Session* NewSession(const SessionOptions& options) override {
@@ -68,9 +68,9 @@ TEST(SessionTest, MultipleFactoriesForTarget) {
   Status s = tensorflow::NewSession(options, &session);
   EXPECT_EQ(s.code(), error::INTERNAL);
   EXPECT_TRUE(
-      StringPiece(s.error_message()).contains("Multiple session factories"));
-  EXPECT_TRUE(StringPiece(s.error_message()).contains("FAKE_SESSION_1"));
-  EXPECT_TRUE(StringPiece(s.error_message()).contains("FAKE_SESSION_2"));
+      str_util::StrContains(s.error_message(), "Multiple session factories"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), "FAKE_SESSION_1"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), "FAKE_SESSION_2"));
 }
 
 }  // namespace
diff --git a/tensorflow/core/common_runtime/shape_refiner.cc b/tensorflow/core/common_runtime/shape_refiner.cc
index cef50be3b1566de9f05b14783212f90da3107fc6..1b7e3138ee5073f48829ff55cba0108bd69785fc 100644
--- a/tensorflow/core/common_runtime/shape_refiner.cc
+++ b/tensorflow/core/common_runtime/shape_refiner.cc
@@ -351,6 +351,11 @@ Status ShapeRefiner::UpdateNode(const Node* node, bool relax, bool* refined) {
         }
       }
     }
+    if (node_context->requested_input_tensor_as_partial_shape(dst_input)) {
+      // The input value may have changed. Since we have no way to know if
+      // that's indeed the case, err on the safe side.
+      *refined = true;
+    }
 
     // Also propagate handle shape and dtype of edges which are carrying
     // resource handles.
diff --git a/tensorflow/core/common_runtime/shape_refiner_test.cc b/tensorflow/core/common_runtime/shape_refiner_test.cc
index adf5a9afff2ebc6848db8811506ebd4a031df2bb..f48638afc0f602e4b0c1376f7e5732f3c637d025 100644
--- a/tensorflow/core/common_runtime/shape_refiner_test.cc
+++ b/tensorflow/core/common_runtime/shape_refiner_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/graph/testlib.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/version.h"
 
@@ -143,8 +144,8 @@ TEST_F(ShapeRefinerTest, BadShapes) {
   // an error.
   Status s = m.AddNode(mm.node());
   ASSERT_FALSE(s.ok());
-  ASSERT_TRUE(StringPiece(s.error_message())
-                  .contains("Dimensions must be equal, but are 1 and 2"));
+  ASSERT_TRUE(str_util::StrContains(
+      s.error_message(), "Dimensions must be equal, but are 1 and 2"));
 }
 
 TEST_F(ShapeRefinerTest, SetShape) {
@@ -1032,8 +1033,8 @@ TEST_F(ShapeRefinerTest, ConstantValueAsShape_PackInvalidInput) {
     TF_ASSERT_OK(m.AddNode(input.node()));
   }
   TF_ASSERT_OK(m.AddNode(pack.node()));
-  EXPECT_TRUE(
-      StringPiece(m.AddNode(result).error_message()).contains("but is rank 2"));
+  EXPECT_TRUE(str_util::StrContains(m.AddNode(result).error_message(),
+                                    "but is rank 2"));
 }
 
 TEST_F(ShapeRefinerTest, ConstantValueAsShape_Concat) {
diff --git a/tensorflow/core/common_runtime/threadpool_device.cc b/tensorflow/core/common_runtime/threadpool_device.cc
index 5aa01376ab047e7613ba7403bb32859a83a09f5a..6d8de6a3c06a84d50c22f6337632ef89cc77e2c8 100644
--- a/tensorflow/core/common_runtime/threadpool_device.cc
+++ b/tensorflow/core/common_runtime/threadpool_device.cc
@@ -16,6 +16,8 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/threadpool_device.h"
 
 #include "tensorflow/core/common_runtime/local_device.h"
+#include "tensorflow/core/common_runtime/scoped_allocator.h"
+#include "tensorflow/core/common_runtime/scoped_allocator_mgr.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/allocator_registry.h"
 #include "tensorflow/core/framework/device_base.h"
@@ -40,7 +42,8 @@ ThreadPoolDevice::ThreadPoolDevice(const SessionOptions& options,
                                    Allocator* allocator)
     : LocalDevice(options, Device::BuildDeviceAttributes(
                                name, DEVICE_CPU, memory_limit, locality)),
-      allocator_(allocator) {}
+      allocator_(allocator),
+      scoped_allocator_mgr_(new ScopedAllocatorMgr(name)) {}
 
 ThreadPoolDevice::~ThreadPoolDevice() {}
 
@@ -65,6 +68,17 @@ Allocator* ThreadPoolDevice::GetAllocator(AllocatorAttributes attr) {
   return allocator_;
 }
 
+Allocator* ThreadPoolDevice::GetScopedAllocator(AllocatorAttributes attr,
+                                                int64 step_id) {
+  if (attr.scope_id > 0) {
+    return scoped_allocator_mgr_->GetContainer(step_id)->GetInstance(
+        attr.scope_id);
+  }
+  LOG(FATAL) << "Unexpected call to ThreadPoolDevice::GetScopedAllocator "
+             << "attr.scope_id = " << attr.scope_id;
+  return allocator_;
+}
+
 Status ThreadPoolDevice::MakeTensorFromProto(
     const TensorProto& tensor_proto, const AllocatorAttributes alloc_attrs,
     Tensor* tensor) {
diff --git a/tensorflow/core/common_runtime/threadpool_device.h b/tensorflow/core/common_runtime/threadpool_device.h
index 37cb745a0aa89b9aeae2c289d347faac2ae177dd..afc5d15ebc39883f3d24c91b42d86c46576883c0 100644
--- a/tensorflow/core/common_runtime/threadpool_device.h
+++ b/tensorflow/core/common_runtime/threadpool_device.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMMON_RUNTIME_THREADPOOL_DEVICE_H_
-#define TENSORFLOW_COMMON_RUNTIME_THREADPOOL_DEVICE_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_THREADPOOL_DEVICE_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_THREADPOOL_DEVICE_H_
 
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/local_device.h"
@@ -31,6 +31,11 @@ class ThreadPoolDevice : public LocalDevice {
 
   void Compute(OpKernel* op_kernel, OpKernelContext* context) override;
   Allocator* GetAllocator(AllocatorAttributes attr) override;
+  Allocator* GetScopedAllocator(AllocatorAttributes attr,
+                                int64 step_id) override;
+  ScopedAllocatorMgr* GetScopedAllocatorMgr() const override {
+    return scoped_allocator_mgr_.get();
+  }
   Status MakeTensorFromProto(const TensorProto& tensor_proto,
                              const AllocatorAttributes alloc_attrs,
                              Tensor* tensor) override;
@@ -39,8 +44,9 @@ class ThreadPoolDevice : public LocalDevice {
 
  private:
   Allocator* allocator_;  // Not owned
+  std::unique_ptr<ScopedAllocatorMgr> scoped_allocator_mgr_;
 };
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_COMMON_RUNTIME_THREADPOOL_DEVICE_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_THREADPOOL_DEVICE_H_
diff --git a/tensorflow/core/debug/BUILD b/tensorflow/core/debug/BUILD
index f6fe9edb022dce29286190e9948f385b933c5a07..5fab740e920519abe8ec109615b75555593ec4c8 100644
--- a/tensorflow/core/debug/BUILD
+++ b/tensorflow/core/debug/BUILD
@@ -339,18 +339,3 @@ cc_library(
 #     ],
 #     visibility = ["//visibility:public"],
 # )
-
-# -----------------------------------------------------------------------------
-# Google-internal targets.  These must be at the end for syncrepo.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/core/distributed_runtime/BUILD b/tensorflow/core/distributed_runtime/BUILD
index 434626bd2da57ce4c4895017c0bb0abef58c6f44..b07cb8cdcb3198a5ca5d63816ce501ef615039f8 100644
--- a/tensorflow/core/distributed_runtime/BUILD
+++ b/tensorflow/core/distributed_runtime/BUILD
@@ -7,18 +7,6 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 filegroup(
     name = "c_srcs",
     data = glob([
diff --git a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
index 049eec347c672de8e12f44eda9a6bebccb68043c..bafd9bfc68a3bd79492ec220257a0c145e535455 100644
--- a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
+++ b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
@@ -144,9 +144,9 @@ BaseRemoteRendezvous::~BaseRemoteRendezvous() {
 // Returns true if "device_name" is a valid full name of local device
 // of the "worker".  This helper is purely based on the worker name
 // and device name and does no lookups in the worker->device_mgr.
-static bool IsLocalDevice(const string& worker_name,
+static bool IsLocalDevice(const StringPiece worker_name,
                           const StringPiece device_name) {
-  return device_name.starts_with(worker_name);
+  return str_util::StartsWith(device_name, worker_name);
 }
 
 Status BaseRemoteRendezvous::Initialize(WorkerSession* session) {
diff --git a/tensorflow/core/distributed_runtime/rpc/BUILD b/tensorflow/core/distributed_runtime/rpc/BUILD
index e9d5390c63a59fc4987103f7ecaf9d6f276bc2db..9c655bfa312488e2bb435ea7c10a3cede2ab3bf2 100644
--- a/tensorflow/core/distributed_runtime/rpc/BUILD
+++ b/tensorflow/core/distributed_runtime/rpc/BUILD
@@ -5,18 +5,6 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 filegroup(
     name = "c_srcs",
     data = glob([
@@ -259,6 +247,7 @@ cc_library(
     hdrs = ["grpc_serialization_traits.h"],
     deps = [
         "@grpc//:grpc++_unsecure",
+        "@grpc//:grpc_unsecure",
     ],
 )
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h b/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h
index 730124c25e9a3e8d102a9dd39e4c4a17f2ce39d1..e7f5fb0c6ae24caa3ffe5039d5daddb771c4858d 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "grpc++/impl/codegen/proto_utils.h"
 #include "grpc++/support/slice.h"
+#include "grpc/grpc.h"
 
 namespace grpc {
 
@@ -30,13 +31,13 @@ class GrpcBufferWriter final
  public:
   explicit GrpcBufferWriter(grpc_byte_buffer** bp, int block_size)
       : block_size_(block_size), byte_count_(0), have_backup_(false) {
-    *bp = g_core_codegen_interface->grpc_raw_byte_buffer_create(NULL, 0);
+    *bp = grpc_raw_byte_buffer_create(NULL, 0);
     slice_buffer_ = &(*bp)->data.raw.slice_buffer;
   }
 
   ~GrpcBufferWriter() override {
     if (have_backup_) {
-      g_core_codegen_interface->grpc_slice_unref(backup_slice_);
+      grpc_slice_unref(backup_slice_);
     }
   }
 
@@ -45,24 +46,24 @@ class GrpcBufferWriter final
       slice_ = backup_slice_;
       have_backup_ = false;
     } else {
-      slice_ = g_core_codegen_interface->grpc_slice_malloc(block_size_);
+      slice_ = grpc_slice_malloc(block_size_);
     }
     *data = GRPC_SLICE_START_PTR(slice_);
     // On win x64, int is only 32bit
     GPR_CODEGEN_ASSERT(GRPC_SLICE_LENGTH(slice_) <= INT_MAX);
     byte_count_ += * size = (int)GRPC_SLICE_LENGTH(slice_);
-    g_core_codegen_interface->grpc_slice_buffer_add(slice_buffer_, slice_);
+    grpc_slice_buffer_add(slice_buffer_, slice_);
     return true;
   }
 
   void BackUp(int count) override {
-    g_core_codegen_interface->grpc_slice_buffer_pop(slice_buffer_);
+    grpc_slice_buffer_pop(slice_buffer_);
     if (count == block_size_) {
       backup_slice_ = slice_;
     } else {
-      backup_slice_ = g_core_codegen_interface->grpc_slice_split_tail(
-          &slice_, GRPC_SLICE_LENGTH(slice_) - count);
-      g_core_codegen_interface->grpc_slice_buffer_add(slice_buffer_, slice_);
+      backup_slice_ =
+          grpc_slice_split_tail(&slice_, GRPC_SLICE_LENGTH(slice_) - count);
+      grpc_slice_buffer_add(slice_buffer_, slice_);
     }
     // It's dangerous to keep an inlined grpc_slice as the backup slice, since
     // on a following Next() call, a reference will be returned to this slice
@@ -85,29 +86,12 @@ class GrpcBufferWriter final
 
 class GrpcBufferReader final
     : public ::grpc::protobuf::io::ZeroCopyInputStream {
-  typedef void (CoreCodegenInterface::*OldReaderInitAPI)(
-      grpc_byte_buffer_reader* reader, grpc_byte_buffer* buffer);
-  typedef int (CoreCodegenInterface::*NewReaderInitAPI)(
-      grpc_byte_buffer_reader* reader, grpc_byte_buffer* buffer);
-  void ReaderInit(OldReaderInitAPI ptr, grpc_byte_buffer_reader* reader,
-                  grpc_byte_buffer* buffer) {
-    (g_core_codegen_interface->*ptr)(reader, buffer);
-  }
-  void ReaderInit(NewReaderInitAPI ptr, grpc_byte_buffer_reader* reader,
-                  grpc_byte_buffer* buffer) {
-    int result = (g_core_codegen_interface->*ptr)(reader, buffer);
-    (void)result;
-  }
-
  public:
   explicit GrpcBufferReader(grpc_byte_buffer* buffer)
       : byte_count_(0), backup_count_(0) {
-    ReaderInit(&CoreCodegenInterface::grpc_byte_buffer_reader_init, &reader_,
-               buffer);
-  }
-  ~GrpcBufferReader() override {
-    g_core_codegen_interface->grpc_byte_buffer_reader_destroy(&reader_);
+    (void)grpc_byte_buffer_reader_init(&reader_, buffer);
   }
+  ~GrpcBufferReader() override { grpc_byte_buffer_reader_destroy(&reader_); }
 
   bool Next(const void** data, int* size) override {
     if (backup_count_ > 0) {
@@ -118,11 +102,10 @@ class GrpcBufferReader final
       backup_count_ = 0;
       return true;
     }
-    if (!g_core_codegen_interface->grpc_byte_buffer_reader_next(&reader_,
-                                                                &slice_)) {
+    if (!grpc_byte_buffer_reader_next(&reader_, &slice_)) {
       return false;
     }
-    g_core_codegen_interface->grpc_slice_unref(slice_);
+    grpc_slice_unref(slice_);
     *data = GRPC_SLICE_START_PTR(slice_);
     // On win x64, int is only 32bit
     GPR_CODEGEN_ASSERT(GRPC_SLICE_LENGTH(slice_) <= INT_MAX);
@@ -176,18 +159,18 @@ class UnlimitedSizeProtoSerializationTraits {
       return Status(StatusCode::INTERNAL, "Message length was negative");
     } else if (byte_size <=
                tensorflow_helper::kGrpcBufferWriterMaxBufferLength) {
-      grpc_slice slice = g_core_codegen_interface->grpc_slice_malloc(byte_size);
+      grpc_slice slice = grpc_slice_malloc(byte_size);
       GPR_CODEGEN_ASSERT(
           GRPC_SLICE_END_PTR(slice) ==
           msg.SerializeWithCachedSizesToArray(GRPC_SLICE_START_PTR(slice)));
-      *bp = g_core_codegen_interface->grpc_raw_byte_buffer_create(&slice, 1);
-      g_core_codegen_interface->grpc_slice_unref(slice);
-      return g_core_codegen_interface->ok();
+      *bp = grpc_raw_byte_buffer_create(&slice, 1);
+      grpc_slice_unref(slice);
+      return Status::OK;
     } else {
       tensorflow_helper::GrpcBufferWriter writer(
           bp, tensorflow_helper::kGrpcBufferWriterMaxBufferLength);
       return msg.SerializeToZeroCopyStream(&writer)
-                 ? g_core_codegen_interface->ok()
+                 ? Status::OK
                  : Status(StatusCode::INTERNAL, "Failed to serialize message");
     }
   }
@@ -197,7 +180,7 @@ class UnlimitedSizeProtoSerializationTraits {
     if (buffer == nullptr) {
       return Status(StatusCode::INTERNAL, "No payload");
     }
-    Status result = g_core_codegen_interface->ok();
+    Status result = Status::OK;
     {
       tensorflow_helper::GrpcBufferReader reader(buffer);
       ::grpc::protobuf::io::CodedInputStream decoder(&reader);
@@ -214,7 +197,7 @@ class UnlimitedSizeProtoSerializationTraits {
         result = Status(StatusCode::INTERNAL, "Did not read entire message");
       }
     }
-    g_core_codegen_interface->grpc_byte_buffer_destroy(buffer);
+    grpc_byte_buffer_destroy(buffer);
     return result;
   }
 };
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_session.cc b/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
index 120a33f17b0d1f81e50dfbc844f56e3d85def096..3e79a406831fbaee2fa51348463cd425bfd9614e 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/protobuf/master.pb.h"
 
@@ -402,7 +403,7 @@ Status GrpcSession::Reset(const SessionOptions& options,
 class GrpcSessionFactory : public SessionFactory {
  public:
   bool AcceptsOptions(const SessionOptions& options) override {
-    return StringPiece(options.target).starts_with(kSchemePrefix);
+    return str_util::StartsWith(options.target, kSchemePrefix);
   }
 
   Session* NewSession(const SessionOptions& options) override {
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_testlib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_testlib.cc
index c237f2dce43b9391c340736f45166c9adc2a5b78..89f83f9f24d570d96704ea0b2d09da13147b1d6c 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_testlib.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_testlib.cc
@@ -57,7 +57,7 @@ Status TestCluster::MakeTestCluster(const SessionOptions& options, int n,
          tf_jobs, "--tf_job=localhost", strings::StrCat("--tf_task=", i),
          strings::StrCat("--num_cpus=", num_cpus),
          strings::StrCat("--num_gpus=", num_gpus)});
-    ret->subprocesses_.emplace_back(testing::CreateSubProcess(argv));
+    ret->subprocesses_.emplace_back(CreateSubProcess(argv));
     bool success = ret->subprocesses_[i]->Start();
     if (!success) {
       return errors::Internal("Could not start subprocess");
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_testlib.h b/tensorflow/core/distributed_runtime/rpc/grpc_testlib.h
index 4b3a03b1d708744bded25ff4d320979bb7eb38b2..d5baaae353a99b2681ae5e0873a4cef7161845f3 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_testlib.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_testlib.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/framework/device_attributes.pb.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/subprocess.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/public/session_options.h"
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
index 1a5e2edfb240198c50d3b5d00bec1127fceff725..2a2f7e3ffbef10f9f2997fc554f010d3f8689ca2 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
@@ -88,7 +88,7 @@ class SerializationTraits<tensorflow::TensorResponse>
     if (buffer == nullptr) {
       return Status(StatusCode::INTERNAL, "No payload");
     }
-    Status result = g_core_codegen_interface->ok();
+    Status result = Status::OK;
     if (result.ok()) {
       ::tensorflow::GrpcByteSource source(buffer);
       auto s = msg->ParseFrom(&source);
@@ -98,7 +98,7 @@ class SerializationTraits<tensorflow::TensorResponse>
                             "TensorResponse parse error", s.ToString()));
       }
     }
-    g_core_codegen_interface->grpc_byte_buffer_destroy(buffer);
+    grpc_byte_buffer_destroy(buffer);
     return result;
   }
 };
diff --git a/tensorflow/core/framework/allocator.cc b/tensorflow/core/framework/allocator.cc
index a382b8be95f143898a8f52f887b9396f3823372b..6182f95f285dd0cf38cca77165a4c2fd001a4b44 100644
--- a/tensorflow/core/framework/allocator.cc
+++ b/tensorflow/core/framework/allocator.cc
@@ -61,6 +61,26 @@ static bool cpu_allocator_collect_stats = false;
 // If true, cpu allocator collects full stats.
 static bool cpu_allocator_collect_full_stats = false;
 
+// Individual allocations large than this amount will trigger a warning.
+static const double kLargeAllocationWarningThreshold = 0.1;
+
+// If cpu_allocator_collect_stats is true, warn when the total allocated memory
+// exceeds this threshold.
+static const double kTotalAllocationWarningThreshold = 0.5;
+
+// Cache first invocation to port::AvailableRam, as it can be expensive.
+static int64_t LargeAllocationWarningBytes() {
+  static int64_t value = static_cast<int64>(port::AvailableRam() *
+                                            kLargeAllocationWarningThreshold);
+  return value;
+}
+
+static int64_t TotalAllocationWarningBytes() {
+  static int64_t value = static_cast<int64>(port::AvailableRam() *
+                                            kTotalAllocationWarningThreshold);
+  return value;
+}
+
 void EnableCPUAllocatorStats(bool enable) {
   cpu_allocator_collect_stats = enable;
 }
@@ -70,7 +90,8 @@ void EnableCPUAllocatorFullStats(bool enable) {
 
 class CPUAllocator : public VisitableAllocator {
  public:
-  CPUAllocator() : allocation_begun_(false) {}
+  CPUAllocator()
+      : total_allocation_warning_triggered_(false), allocation_begun_(false) {}
 
   ~CPUAllocator() override {}
 
@@ -81,6 +102,12 @@ class CPUAllocator : public VisitableAllocator {
       allocation_begun_ = true;
     }
 
+    if (num_bytes > LargeAllocationWarningBytes()) {
+      LOG(WARNING) << "Allocation of " << num_bytes << " exceeds "
+                   << 100 * kLargeAllocationWarningThreshold
+                   << "% of system memory.";
+    }
+
     void* p = port::AlignedMalloc(num_bytes, alignment);
     if (cpu_allocator_collect_stats) {
       const std::size_t alloc_size = port::MallocExtension_GetAllocatedSize(p);
@@ -91,6 +118,14 @@ class CPUAllocator : public VisitableAllocator {
           std::max<int64>(stats_.max_bytes_in_use, stats_.bytes_in_use);
       stats_.max_alloc_size =
           std::max<int64>(stats_.max_alloc_size, alloc_size);
+
+      if (stats_.bytes_in_use > TotalAllocationWarningBytes() &&
+          !total_allocation_warning_triggered_) {
+        LOG(WARNING) << "Total allocated memory " << stats_.bytes_in_use
+                     << "exceeds " << 100 * kTotalAllocationWarningThreshold
+                     << "% of system memory";
+        total_allocation_warning_triggered_ = true;
+      }
     }
 
     // visit each Visitor in alloc_visitors_
@@ -162,6 +197,7 @@ class CPUAllocator : public VisitableAllocator {
  private:
   mutex mu_;
   AllocatorStats stats_ GUARDED_BY(mu_);
+  bool total_allocation_warning_triggered_ GUARDED_BY(mu_);
 
   // visitor_mutex_ protects write access to alloc_visitors_ and free_visitors_.
   // While write access is mutually exclusive, reads may happen concurrently.
diff --git a/tensorflow/core/framework/allocator.h b/tensorflow/core/framework/allocator.h
index 3ce1b612464291eceb6e08d9b0f2deca70cda27a..2c87156dca61188a3a8deabf9ad483c9180ccd55 100644
--- a/tensorflow/core/framework/allocator.h
+++ b/tensorflow/core/framework/allocator.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_ALLOCATOR_H_
-#define TENSORFLOW_FRAMEWORK_ALLOCATOR_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_H_
+#define TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_H_
 
 #include <stdlib.h>
 
@@ -359,7 +359,12 @@ struct AllocatorAttributes {
   bool nic_compatible() const { return value & (0x1 << 1); }
   void set_gpu_compatible(bool v) { value |= (static_cast<int>(v) << 2); }
   bool gpu_compatible() const { return value & (0x1 << 2); }
-  void Merge(AllocatorAttributes other) { value |= other.value; }
+  void Merge(AllocatorAttributes other) {
+    value |= other.value;
+    scope_id = (scope_id > 0 && other.scope_id == 0)
+                   ? scope_id
+                   : ((scope_id == 0) ? other.scope_id : 0);
+  }
   // Returns true if the fields set in *this is a subset of or equal to
   // those set in other.
   bool IsEqualOrLessRestrictiveThan(const AllocatorAttributes& other) const {
@@ -371,6 +376,9 @@ struct AllocatorAttributes {
   // upper 8 bits in device-specific ways, and ops implemented for those
   // devices are responsible for setting those 8 bits appropriately.
   uint32 value = 0;
+  // EXPERIMENTAL: If this is greater than zero, then allocation is delegated to
+  // a named special-purpose allocator on the same device.
+  int32 scope_id = 0;
 };
 
 // Returns a trivial implementation of Allocator which uses the system
@@ -396,4 +404,4 @@ class SubAllocator {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_FRAMEWORK_ALLOCATOR_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_H_
diff --git a/tensorflow/core/framework/attr_value_util.cc b/tensorflow/core/framework/attr_value_util.cc
index ebb56d525e52e7351e4159dce44349ce0649921c..87c1ddd15df4f89e29b1d073f4380e65dae531f9 100644
--- a/tensorflow/core/framework/attr_value_util.cc
+++ b/tensorflow/core/framework/attr_value_util.cc
@@ -186,7 +186,7 @@ Status AttrValueHasType(const AttrValue& attr_value, StringPiece type) {
   // check if has_list is false and some other field in attr_value is
   // set to flag the error.  This test can be made more strict once
   // support for GraphDef versions <= 4 is dropped.
-  if (StringPiece(type).starts_with("list(") && !attr_value.has_list()) {
+  if (str_util::StartsWith(type, "list(") && !attr_value.has_list()) {
     if (num_set) {
       return errors::InvalidArgument(
           "AttrValue missing value with expected type '", type, "'");
@@ -197,7 +197,7 @@ Status AttrValueHasType(const AttrValue& attr_value, StringPiece type) {
   }
 
   // Okay to have an empty list, but not to be missing a non-list value.
-  if (num_set == 0 && !StringPiece(type).starts_with("list(")) {
+  if (num_set == 0 && !str_util::StartsWith(type, "list(")) {
     return errors::InvalidArgument(
         "AttrValue missing value with expected type '", type, "'");
   }
@@ -241,29 +241,29 @@ Status AttrValueHasType(const AttrValue& attr_value, StringPiece type) {
 bool ParseAttrValue(StringPiece type, StringPiece text, AttrValue* out) {
   // Parse type.
   string field_name;
-  bool is_list = type.Consume("list(");
-  if (type.Consume("string")) {
+  bool is_list = str_util::ConsumePrefix(&type, "list(");
+  if (str_util::ConsumePrefix(&type, "string")) {
     field_name = "s";
-  } else if (type.Consume("int")) {
+  } else if (str_util::ConsumePrefix(&type, "int")) {
     field_name = "i";
-  } else if (type.Consume("float")) {
+  } else if (str_util::ConsumePrefix(&type, "float")) {
     field_name = "f";
-  } else if (type.Consume("bool")) {
+  } else if (str_util::ConsumePrefix(&type, "bool")) {
     field_name = "b";
-  } else if (type.Consume("type")) {
+  } else if (str_util::ConsumePrefix(&type, "type")) {
     field_name = "type";
-  } else if (type.Consume("shape")) {
+  } else if (str_util::ConsumePrefix(&type, "shape")) {
     field_name = "shape";
-  } else if (type.Consume("tensor")) {
+  } else if (str_util::ConsumePrefix(&type, "tensor")) {
     field_name = "tensor";
-  } else if (type.Consume("func")) {
+  } else if (str_util::ConsumePrefix(&type, "func")) {
     field_name = "func";
-  } else if (type.Consume("placeholder")) {
+  } else if (str_util::ConsumePrefix(&type, "placeholder")) {
     field_name = "placeholder";
   } else {
     return false;
   }
-  if (is_list && !type.Consume(")")) {
+  if (is_list && !str_util::ConsumePrefix(&type, ")")) {
     return false;
   }
 
diff --git a/tensorflow/core/framework/bfloat16_test.cc b/tensorflow/core/framework/bfloat16_test.cc
index 17e6209f8e5ad5240dfc8ca1def75c178da45c27..206396a25ab784e93daa227bcf79fe608f5df706 100644
--- a/tensorflow/core/framework/bfloat16_test.cc
+++ b/tensorflow/core/framework/bfloat16_test.cc
@@ -37,19 +37,27 @@ float BinaryToFloat(uint32_t sign, uint32_t exponent, uint32_t high_mantissa,
 
 struct Bfloat16TestParam {
   float input;
-  float expected;
+  float expected_truncation;
+  float expected_rounding;
 };
 
 class Bfloat16Test : public ::testing::Test,
                      public ::testing::WithParamInterface<Bfloat16TestParam> {};
 
 TEST_P(Bfloat16Test, TruncateTest) {
-  bfloat16 a(GetParam().input);
+  bfloat16 truncated(GetParam().input);
   if (std::isnan(GetParam().input)) {
-    EXPECT_TRUE(std::isnan(float(a)) || std::isinf(float(a)));
+    EXPECT_TRUE(std::isnan(float(truncated)) || std::isinf(float(truncated)));
     return;
   }
-  EXPECT_EQ(GetParam().expected, float(a));
+  EXPECT_EQ(GetParam().expected_truncation, float(truncated));
+
+  bfloat16 rounded = bfloat16::round_to_bfloat16((GetParam().input));
+  if (std::isnan(GetParam().input)) {
+    EXPECT_TRUE(std::isnan(float(rounded)) || std::isinf(float(rounded)));
+    return;
+  }
+  EXPECT_EQ(GetParam().expected_rounding, float(rounded));
 }
 
 INSTANTIATE_TEST_CASE_P(
@@ -57,37 +65,48 @@ INSTANTIATE_TEST_CASE_P(
     ::testing::Values(
         Bfloat16TestParam{
             BinaryToFloat(0, 0b10000000, 0b1001000, 0b1111010111000011),
-            BinaryToFloat(0, 0b10000000, 0b1001000, 0b0000000000000000)},
+            BinaryToFloat(0, 0b10000000, 0b1001000, 0b0000000000000000),
+            BinaryToFloat(0, 0b10000000, 0b1001001, 0b0000000000000000)},
         Bfloat16TestParam{
             BinaryToFloat(1, 0b10000000, 0b1001000, 0b1111010111000011),
-            BinaryToFloat(1, 0b10000000, 0b1001000, 0b0000000000000000)},
+            BinaryToFloat(1, 0b10000000, 0b1001000, 0b0000000000000000),
+            BinaryToFloat(1, 0b10000000, 0b1001001, 0b0000000000000000)},
         Bfloat16TestParam{
             BinaryToFloat(0, 0b10000000, 0b1001000, 0b1000000000000000),
+            BinaryToFloat(0, 0b10000000, 0b1001000, 0b0000000000000000),
             BinaryToFloat(0, 0b10000000, 0b1001000, 0b0000000000000000)},
         Bfloat16TestParam{
             BinaryToFloat(0, 0b11111111, 0b0000000, 0b0000000000000001),
-            BinaryToFloat(0, 0b11111111, 0b0000000, 0b0000000000000000)},
+            BinaryToFloat(0, 0b11111111, 0b0000000, 0b0000000000000000),
+            BinaryToFloat(0, 0b11111111, 0b1000000, 0b0000000000000000)},
         Bfloat16TestParam{
             BinaryToFloat(0, 0b11111111, 0b1111111, 0b1111111111111111),
-            BinaryToFloat(0, 0b11111111, 0b1111111, 0b0000000000000000)},
+            BinaryToFloat(0, 0b11111111, 0b1111111, 0b0000000000000000),
+            BinaryToFloat(0, 0b11111111, 0b1000000, 0b0000000000000000)},
         Bfloat16TestParam{
             BinaryToFloat(1, 0b10000000, 0b1001000, 0b1100000000000000),
-            BinaryToFloat(1, 0b10000000, 0b1001000, 0b0000000000000000)},
+            BinaryToFloat(1, 0b10000000, 0b1001000, 0b0000000000000000),
+            BinaryToFloat(1, 0b10000000, 0b1001001, 0b0000000000000000)},
         Bfloat16TestParam{
+            BinaryToFloat(0, 0b10000000, 0b1001000, 0b0000000000000000),
             BinaryToFloat(0, 0b10000000, 0b1001000, 0b0000000000000000),
             BinaryToFloat(0, 0b10000000, 0b1001000, 0b0000000000000000)},
         Bfloat16TestParam{
             BinaryToFloat(0, 0b10000000, 0b1001000, 0b0100000000000000),
+            BinaryToFloat(0, 0b10000000, 0b1001000, 0b0000000000000000),
             BinaryToFloat(0, 0b10000000, 0b1001000, 0b0000000000000000)},
         Bfloat16TestParam{
             BinaryToFloat(0, 0b10000000, 0b1001000, 0b1000000000000000),
+            BinaryToFloat(0, 0b10000000, 0b1001000, 0b0000000000000000),
             BinaryToFloat(0, 0b10000000, 0b1001000, 0b0000000000000000)},
         Bfloat16TestParam{
             BinaryToFloat(0, 0b00000000, 0b1001000, 0b1000000000000000),
+            BinaryToFloat(0, 0b00000000, 0b1001000, 0b0000000000000000),
             BinaryToFloat(0, 0b00000000, 0b1001000, 0b0000000000000000)},
         Bfloat16TestParam{
             BinaryToFloat(0, 0b00000000, 0b1111111, 0b1100000000000000),
-            BinaryToFloat(0, 0b00000000, 0b1111111, 0b0000000000000000)}));
+            BinaryToFloat(0, 0b00000000, 0b1111111, 0b0000000000000000),
+            BinaryToFloat(0, 0b00000001, 0b0000000, 0b0000000000000000)}));
 
 TEST(Bfloat16Test, Conversion) {
   float a[100];
diff --git a/tensorflow/core/framework/collective.cc b/tensorflow/core/framework/collective.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a26f2c2f315df749db2cd5995c4fac981abb73b2
--- /dev/null
+++ b/tensorflow/core/framework/collective.cc
@@ -0,0 +1,120 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/collective.h"
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/hash/hash.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+
+namespace tensorflow {
+
+string CollGroupParams::ToString() const {
+  return strings::StrCat("CollGroupParams {group_key=", group_key,
+                         " group_size=", group_size,
+                         " device_type=", device_type.type_string(),
+                         " num_tasks=", num_tasks, "}");
+}
+
+CollInstanceParams& CollInstanceParams::operator=(
+    const CollInstanceParams& other) {
+  if (this != &other) {
+    instance_key = other.instance_key;
+    type = other.type;
+    data_type = other.data_type;
+    shape = other.shape;
+    device_names.clear();
+    device_names.assign(other.device_names.begin(), other.device_names.end());
+    task_names.assign(other.task_names.begin(), other.task_names.end());
+    impl_details.subdiv_offsets.assign(
+        other.impl_details.subdiv_offsets.begin(),
+        other.impl_details.subdiv_offsets.end());
+    impl_details.subdiv_permutations.clear();
+    for (auto p : other.impl_details.subdiv_permutations) {
+      impl_details.subdiv_permutations.push_back(
+          std::vector<int>(p.begin(), p.end()));
+    }
+    impl_details.subdiv_source_rank.assign(
+        other.impl_details.subdiv_source_rank.begin(),
+        other.impl_details.subdiv_source_rank.end());
+  }
+  return *this;
+}
+
+string CollInstanceParams::ToString() const {
+  string v = strings::StrCat("CollInstanceParams { instance_key=", instance_key,
+                             " type=", type, " data_type=", data_type,
+                             " shape=", shape.DebugString(), " devices {");
+  for (const auto& d : device_names) {
+    strings::StrAppend(&v, d, ",");
+  }
+  strings::StrAppend(&v, "} task_names={");
+  for (const auto& n : task_names) {
+    strings::StrAppend(&v, n, ", ");
+  }
+  strings::StrAppend(&v, "}, subdiv_offsets={");
+  for (const auto& d : impl_details.subdiv_offsets) {
+    strings::StrAppend(&v, d, ",");
+  }
+  strings::StrAppend(&v, "}, subdiv_perms={");
+  for (const auto& p : impl_details.subdiv_permutations) {
+    strings::StrAppend(&v, "{");
+    for (const auto& i : p) {
+      strings::StrAppend(&v, i, ",");
+    }
+    strings::StrAppend(&v, "}");  // one subdiv
+  }
+  strings::StrAppend(&v, "}");  // all subdivs
+  return v;
+}
+
+string CollTaskParams::ToString() const {
+  string v = strings::StrCat("CollTaskParams {is_local={");
+  for (const auto& b : is_local) {
+    strings::StrAppend(&v, static_cast<int>(b), ",");
+  }
+  strings::StrAppend(&v, "}}");
+  return v;
+}
+
+string CollectiveParams::ToString() const {
+  string v = strings::StrCat("CollectiveParams ", name, " {", group.ToString());
+  strings::StrAppend(&v, " ", instance.ToString());
+  strings::StrAppend(&v, " ", task.ToString());
+  strings::StrAppend(&v, " default_rank=", default_rank,
+                     " is_source=", is_source, " subdiv_rank={");
+  for (const auto& r : subdiv_rank) {
+    strings::StrAppend(&v, r, ",");
+  }
+  if (!subdiv_source_rank.empty()) {
+    strings::StrAppend(&v, " subdiv_rank={");
+    for (const auto& r : subdiv_source_rank) {
+      strings::StrAppend(&v, r, ",");
+    }
+    strings::StrAppend(&v, "}");
+  }
+  strings::StrAppend(&v, "}}");
+  return v;
+}
+
+/*static*/ OpKernelContext::Params* CollectiveExecutor::CtxParams(
+    OpKernelContext* ctx) {
+  return ctx->params_;
+}
+
+/*static*/
+int64 CollectiveExecutor::kInvalidId = -1;
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/framework/collective.h b/tensorflow/core/framework/collective.h
new file mode 100644
index 0000000000000000000000000000000000000000..362d345133aa292ac1755c0bc8c0ab04d10efab6
--- /dev/null
+++ b/tensorflow/core/framework/collective.h
@@ -0,0 +1,308 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_FRAMEWORK_COLLECTIVE_EXECUTOR_H_
+#define TENSORFLOW_FRAMEWORK_COLLECTIVE_EXECUTOR_H_
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/core/refcount.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+class BufRendezvous;
+class CancellationManager;
+class CompleteGroupRequest;
+class CompleteGroupResponse;
+class CompleteInstanceRequest;
+class CompleteInstanceResponse;
+class DeviceLocality;
+class GetStepSequenceRequest;
+class GetStepSequenceResponse;
+class Op;
+class Tensor;
+
+// Types of supported collective operations.
+enum CollectiveType {
+  REDUCTION_COLLECTIVE = 0,
+  BROADCAST_COLLECTIVE,
+  UNDEFINED_COLLECTIVE,
+};
+
+// Data common to all members of a device group.
+// All members share the same device set but its order is
+// particular to an instance so it is stored there.
+struct CollGroupParams {
+  int32 group_key;
+  int32 group_size;
+  DeviceType device_type;
+  int32 num_tasks;  // number of distinct tasks in group
+  string ToString() const;
+  CollGroupParams() : device_type(DEVICE_CPU) {}
+};
+
+// The best implementation of a collective op depends on many factors
+// including the number of devices involved, the topology of
+// interconnects between them and the sizes of inputs.  This structure
+// is used in generating and representing data movement choreography
+// for each specific algorithm, hence it does not have a single, fixed
+// interpretation.  On first execution the runtime will update this
+// structure with decisions that will guide all subsequent executions.
+struct CollImplDetails {
+  std::vector<std::vector<int>> subdiv_permutations;
+  std::vector<int> subdiv_offsets;
+  // broadcast only: rank of source in each subdiv
+  std::vector<int> subdiv_source_rank;
+};
+
+// Data common to all members of a collective instance.
+struct CollInstanceParams {
+  int32 instance_key;  // Identifies all participating graph nodes.
+  CollectiveType type;
+  DataType data_type;
+  TensorShape shape;
+  // Fully qualified name of device for each member, in default rank order.
+  std::vector<string> device_names;
+  // Task name prefix of corresponding device name.
+  std::vector<string> task_names;
+  CollImplDetails impl_details;
+  string ToString() const;
+  CollInstanceParams& operator=(const struct CollInstanceParams& other);
+};
+
+// Data common to all instance members in the same task.
+struct CollTaskParams {
+  // True for devices that are local to the process, i.e. no RPC needed.
+  std::vector<bool> is_local;
+  string ToString() const;
+};
+
+// Unique to a single CollectiveOp node.
+struct CollectiveParams {
+  CollGroupParams group;
+  CollInstanceParams instance;
+  CollTaskParams task;
+
+  string name;       // node name used only for log or error messages
+  int default_rank;  // index of this op within device_names
+  bool is_source;    // broadcast only
+  // Rank of this device in each subdivision permutation.
+  std::vector<int> subdiv_rank;
+  std::vector<int> subdiv_source_rank;
+  const Tensor* in_tensor;             // kernel input
+  Tensor* out_tensor;                  // kernel output
+  std::unique_ptr<OpKernel> merge_op;  // reduction only
+  std::unique_ptr<OpKernel> final_op;  // reduction only
+  OpKernelContext* op_context;
+  string ToString() const;
+};
+
+class CollectiveExecutor;
+
+// Interface that provides resolution of device localities.
+class DeviceResolverInterface {
+ public:
+  virtual ~DeviceResolverInterface() {}
+
+  // Collects DeviceLocality protobufs from all of the devices identified
+  // in 'col_params'.
+  virtual void GetDeviceLocalitiesAsync(const CollInstanceParams& inst_params,
+                                        std::vector<DeviceLocality>* localities,
+                                        const StatusCallback& done) = 0;
+
+  // Populate *locality with the DeviceLocality of the specified
+  // device.
+  virtual void GetLocalityAsync(const string& device, const string& task,
+                                DeviceLocality* locality,
+                                const StatusCallback& done) = 0;
+
+  // Clear the cache of device data belonging
+  // to the specified task.
+  virtual void ClearTask(const string& task) = 0;
+};
+
+// Interface that provides resolution of shared CollectiveParams fields.
+class ParamResolverInterface {
+ public:
+  virtual ~ParamResolverInterface() {}
+
+  // Called by each collective op at first execution in order to fill out
+  // the CollectiveParams structure with data gathered from the full
+  // (maybe distributed) collection of peer nodes.
+  virtual void CompleteParamsAsync(const string& device, CollectiveParams* cp,
+                                   CancellationManager* cancel_mgr,
+                                   const StatusCallback& done) = 0;
+
+  // Used within a distributed implementation to discover/verify
+  // data shared across a device group.
+  virtual void CompleteGroupAsync(const CompleteGroupRequest* request,
+                                  CompleteGroupResponse* response,
+                                  CancellationManager* cancel_mgr,
+                                  const StatusCallback& done) = 0;
+
+  // Used within a distributed implementation to discover/verify data
+  // shared across an instance group.
+  virtual void CompleteInstanceAsync(const CompleteInstanceRequest* request,
+                                     CompleteInstanceResponse* response,
+                                     CancellationManager* cancel_mgr,
+                                     const StatusCallback& done) = 0;
+};
+
+// Graphs which utilize Collective Ops in a common instance must
+// execute with identical step_ids even if they are disjoint graphs
+// run by otherwise independent tasks.  This interface supplies
+// coordinated step_ids to use in such cases.
+class StepSequenceInterface {
+ public:
+  virtual ~StepSequenceInterface() {}
+
+  // Used with a distributed implementation to coordinate step_id
+  // sequences across tasks.
+  virtual void GetStepSequenceAsync(const GetStepSequenceRequest* request,
+                                    GetStepSequenceResponse* response,
+                                    const StatusCallback& done) = 0;
+
+  // Refresh the local per-graph_key step_id sequence from collective
+  // group leader, if applicable.
+  virtual void RefreshStepIdSequenceAsync(int64 graph_key,
+                                          const StatusCallback& done) = 0;
+
+  // Returns the the step_id that should be used for initiating a new execution
+  // on the specified graph. May return the same step_id multiple times if
+  // RetireStepId or RefreshStepIdReservation is not called.
+  virtual int64 NextStepId(int64 graph_key) = 0;
+
+  // Reports that execution of the given step has completed successfully.
+  // Should be called immediately after a step completes with OK status,
+  // prior to calling NextStepId().  If the step fails, don't call.
+  virtual void RetireStepId(int64 graph_key, int64 step_id) = 0;
+};
+
+// Interface that provides access to per-step CollectiveExecutor
+// instances and various distributed resolution capabilities.
+class CollectiveExecutorMgrInterface : public StepSequenceInterface {
+ public:
+  virtual ~CollectiveExecutorMgrInterface() {}
+
+  // Returns the step-specific CollectiveExecutor, creating if one does not
+  // already exist.  The caller assumes ownership of one Ref on the object.
+  virtual CollectiveExecutor* FindOrCreate(int64 step_id) = 0;
+
+  // If there is a CollectiveExecutor for step_id, remove it from the
+  // table.
+  virtual void Cleanup(int64 step_id) = 0;
+
+  virtual ParamResolverInterface* GetParamResolver() const = 0;
+
+  virtual DeviceResolverInterface* GetDeviceResolver() const = 0;
+};
+
+// Interface that a Collective Op implementation uses to exchange data
+// with peers.  Note that data exchange is currently limited to types
+// for which DMAHelper::CanUseDMA() returns true, i.e.  dense numeric
+// types.
+class PeerAccessInterface {
+ public:
+  virtual ~PeerAccessInterface() {}
+
+  virtual void RecvFromPeer(const string& peer_device, const string& peer_task,
+                            bool peer_is_local, const string& key,
+                            Device* to_device, DeviceContext* to_device_ctx,
+                            const AllocatorAttributes& to_alloc_attr,
+                            Tensor* to_tensor,
+                            const DeviceLocality& client_locality,
+                            const StatusCallback& done) = 0;
+
+  virtual void PostToPeer(const string& peer_device, const string& peer_task,
+                          const string& key, Device* from_device,
+                          DeviceContext* from_device_ctx,
+                          const AllocatorAttributes& from_alloc_attr,
+                          const Tensor* from_tensor,
+                          const DeviceLocality& client_locality,
+                          const StatusCallback& done) = 0;
+};
+
+class PerStepCollectiveRemoteAccess;
+
+// A step-specific object that can execute a collective operation completely
+// described by a CollectiveParams object.
+class CollectiveExecutor : public PeerAccessInterface, public core::RefCounted {
+ public:
+  virtual void StartAbort(const Status& s) {}
+
+  virtual void ExecuteAsync(OpKernelContext* ctx,
+                            const CollectiveParams& col_params,
+                            const string& exec_key, StatusCallback done) {
+    done(errors::Internal(
+        "A collective Op has been called in a context in which "
+        "a CollectiveExecutor has not been provided."));
+  }
+
+  virtual void CompleteParamsAsync(const string& device, CollectiveParams* cp,
+                                   CancellationManager* cancel_mgr,
+                                   StatusCallback done) {
+    cem_->GetParamResolver()->CompleteParamsAsync(device, cp, cancel_mgr, done);
+  }
+
+  virtual PerStepCollectiveRemoteAccess* remote_access() { return nullptr; }
+
+  // Used to designate an invalid group or instance key.
+  static int64 kInvalidId;
+
+  // Lexically scoped handle for Ref.
+  class Handle {
+   public:
+    explicit Handle(CollectiveExecutor* ce, bool inherit_ref) : ce_(ce) {
+      if (!inherit_ref) ce->Ref();
+    }
+    ~Handle() { ce_->Unref(); }
+    CollectiveExecutor* get() const { return ce_; }
+
+   private:
+    CollectiveExecutor* ce_;
+  };
+
+ protected:
+  explicit CollectiveExecutor(CollectiveExecutorMgrInterface* cem)
+      : cem_(cem) {}
+
+  // For use only by derived classes
+  static OpKernelContext::Params* CtxParams(OpKernelContext* ctx);
+  CollectiveExecutorMgrInterface* cem_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(CollectiveExecutor);
+};
+
+// Interface of a helper object that provices a CollectiveExecutor with
+// all of the remote access it needs.
+class CollectiveRemoteAccess : public PeerAccessInterface,
+                               public DeviceResolverInterface {
+ public:
+  virtual ~CollectiveRemoteAccess() {}
+};
+
+// A per-step version of CollectiveRemoteAccess that cleans up outstanding
+// communications in case step execution is abandoned.
+class PerStepCollectiveRemoteAccess : public CollectiveRemoteAccess {
+ public:
+  virtual ~PerStepCollectiveRemoteAccess() {}
+  virtual void StartAbort(const Status& s) = 0;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_FRAMEWORK_COLLECTIVE_EXECUTOR_H_
diff --git a/tensorflow/core/framework/common_shape_fns.cc b/tensorflow/core/framework/common_shape_fns.cc
index 623248b6ce6adff8ed323acad7dae300742f8eba..72eeda7a43eb3255fcdec803771649f0e10bd823 100644
--- a/tensorflow/core/framework/common_shape_fns.cc
+++ b/tensorflow/core/framework/common_shape_fns.cc
@@ -504,8 +504,8 @@ Status Conv3DShape(shape_inference::InferenceContext* c) {
     input_shape =
         c->MakeShape({{dim('N'), dim('0'), dim('1'), dim('2'), dim('C')}});
     stride_planes = strides[2];
-    stride_cols = strides[3];
-    stride_rows = strides[4];
+    stride_rows = strides[3];
+    stride_cols = strides[4];
   } else {
     stride_planes = strides[1];
     stride_rows = strides[2];
@@ -1210,7 +1210,7 @@ Status ConcatV2Shape(InferenceContext* c) {
                            c->num_inputs() - 1 /* dim_index */);
 }
 
-Status BroadcastBinaryOpShapeFn(InferenceContext* c) {
+Status BroadcastBinaryOpOutputShapeFn(InferenceContext* c, int output_index) {
   ShapeHandle shape_x = c->input(0);
   ShapeHandle shape_y = c->input(1);
   if (!c->RankKnown(shape_x) || !c->RankKnown(shape_y)) {
@@ -1272,7 +1272,7 @@ Status BroadcastBinaryOpShapeFn(InferenceContext* c) {
     }
   }
 
-  c->set_output(0, c->MakeShape(dims));
+  c->set_output(output_index, c->MakeShape(dims));
   return Status::OK();
 }
 
diff --git a/tensorflow/core/framework/common_shape_fns.h b/tensorflow/core/framework/common_shape_fns.h
index 293c40e04d6ad9b57aabfda678216b1805a006f4..789746b4037fbf9f11d34c425272bfc44d8623be 100644
--- a/tensorflow/core/framework/common_shape_fns.h
+++ b/tensorflow/core/framework/common_shape_fns.h
@@ -265,9 +265,15 @@ Status ConcatShape(shape_inference::InferenceContext* c,
 // Shape function for concat operations.
 Status ConcatV2Shape(shape_inference::InferenceContext* c);
 
+// Shape function for binary operators that broadcast their inputs
+// and with output to output_index.
+Status BroadcastBinaryOpOutputShapeFn(InferenceContext* c, int output_index);
+
 // Shape function for binary operators that broadcast their inputs.
 // Tested by ops/math_ops_test.cc.
-Status BroadcastBinaryOpShapeFn(InferenceContext* c);
+inline Status BroadcastBinaryOpShapeFn(InferenceContext* c) {
+  return BroadcastBinaryOpOutputShapeFn(c, 0);
+}
 
 // Shape function for random operations.
 Status RandomShape(shape_inference::InferenceContext* c);
diff --git a/tensorflow/core/framework/common_shape_fns_test.cc b/tensorflow/core/framework/common_shape_fns_test.cc
index 5f3e5ad45731750bfd73181c41cd029f23aab55f..13d429b89519cf7cc696a251030a26a0d8ff67f4 100644
--- a/tensorflow/core/framework/common_shape_fns_test.cc
+++ b/tensorflow/core/framework/common_shape_fns_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/shape_inference_testutil.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -140,9 +141,8 @@ TEST(CommonShapeFnsTest, MatMulShapeTest) {
                        {}, {}, {});
     auto s = MatMulShape(&c);
     EXPECT_FALSE(s.ok());
-    EXPECT_TRUE(
-        StringPiece(s.ToString())
-            .contains("Invalid argument: Shape must be rank 2 but is rank 1"));
+    EXPECT_TRUE(str_util::StrContains(
+        s.ToString(), "Invalid argument: Shape must be rank 2 but is rank 1"));
   }
 
   {
@@ -161,10 +161,9 @@ TEST(CommonShapeFnsTest, MatMulShapeTest) {
                        {S({2, 5}), S({3, 4})}, {}, {}, {});
     auto s = MatMulShape(&c);
     EXPECT_FALSE(s.ok());
-    EXPECT_TRUE(
-        StringPiece(s.ToString())
-            .contains(
-                "Invalid argument: Dimensions must be equal, but are 5 and 3"));
+    EXPECT_TRUE(str_util::StrContains(
+        s.ToString(),
+        "Invalid argument: Dimensions must be equal, but are 5 and 3"));
   }
 
   {
@@ -173,9 +172,8 @@ TEST(CommonShapeFnsTest, MatMulShapeTest) {
                        {S({2, 5, 3}), S({3, 5, 4})}, {}, {}, {});
     auto s = MatMulShape(&c);
     EXPECT_FALSE(s.ok());
-    EXPECT_TRUE(
-        StringPiece(s.ToString())
-            .contains("Invalid argument: Shape must be rank 2 but is rank 3"));
+    EXPECT_TRUE(str_util::StrContains(
+        s.ToString(), "Invalid argument: Shape must be rank 2 but is rank 3"));
   }
 
   {
diff --git a/tensorflow/core/framework/dataset.h b/tensorflow/core/framework/dataset.h
index cfe23d1ffe47cc10ca9a171ed731f26245c5c0b0..9e7ffe6c0be1ed1d8b9fa8d4be50b9b516c034c6 100644
--- a/tensorflow/core/framework/dataset.h
+++ b/tensorflow/core/framework/dataset.h
@@ -201,7 +201,7 @@ class GraphDefBuilderWrapper {
   // Also looks up the `op_def->name` in the global
   // `WhitelistedStatefulOpRegistry`.
   bool IsOpWhitelisted(const OpDef* op_def) const {
-    return (StringPiece(op_def->name()).ends_with("Dataset") &&
+    return (str_util::EndsWith(op_def->name(), "Dataset") &&
             op_def->output_arg_size() == 1 &&
             op_def->output_arg(0).type() == DT_VARIANT) ||
            dataset::WhitelistedStatefulOpRegistry::Global()->Contains(
diff --git a/tensorflow/core/framework/device_base.h b/tensorflow/core/framework/device_base.h
index fb6d5c69e135c0263845cf71b93ac53bb2a359ed..8473b228d3dbba4611d5bc3173d758701ac2cb58 100644
--- a/tensorflow/core/framework/device_base.h
+++ b/tensorflow/core/framework/device_base.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_DEVICE_BASE_H_
-#define TENSORFLOW_FRAMEWORK_DEVICE_BASE_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_DEVICE_BASE_H_
+#define TENSORFLOW_CORE_FRAMEWORK_DEVICE_BASE_H_
 
 #include <memory>
 #include <string>
@@ -48,6 +48,7 @@ class Env;
 class EventMgr;
 class OpKernelContext;
 class ResourceMgr;
+class ScopedAllocatorMgr;
 class TensorProto;
 
 namespace thread {
@@ -179,6 +180,16 @@ class DeviceBase {
     return GetAllocator(attr);
   }
 
+  // Return an Allocator prepared for use in particular places by graph
+  // optimization
+  virtual Allocator* GetScopedAllocator(AllocatorAttributes attr,
+                                        int64 step_id) {
+    LOG(FATAL) << "Device does not implement GetScopedAllocator()";
+    return nullptr;
+  }
+
+  virtual ScopedAllocatorMgr* GetScopedAllocatorMgr() const { return nullptr; }
+
   virtual const Eigen::ThreadPoolDevice* eigen_cpu_device() {
     CHECK(eigen_cpu_device_ != nullptr);
     return eigen_cpu_device_;
@@ -243,4 +254,4 @@ class DeviceBase {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_FRAMEWORK_DEVICE_BASE_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_DEVICE_BASE_H_
diff --git a/tensorflow/core/framework/function.cc b/tensorflow/core/framework/function.cc
index 3e7b89d4ebc91df42ee81c1c9fe67c68e755f736..bdc1af9fdaeb2e2ba7605b6f67ea55fa0bb7977a 100644
--- a/tensorflow/core/framework/function.cc
+++ b/tensorflow/core/framework/function.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/util/equal_graph_def.h"
 
 namespace tensorflow {
@@ -278,7 +279,7 @@ class FunctionInstantiationHelper {
       auto it = index_.lower_bound(node_name);
       while (it != index_.end() && it->first <= node_colon_bound) {
         if (it->first == node_name ||
-            tensorflow::StringPiece(it->first).starts_with(node_colon)) {
+            tensorflow::str_util::StartsWith(it->first, node_colon)) {
           nid = it->second.nid;
           break;
         }
@@ -502,7 +503,7 @@ string Print(const NodeDef& n) {
   std::vector<StringPiece> dat;
   std::vector<string> dep;
   for (StringPiece s : n.input()) {
-    if (s.Consume("^")) {
+    if (str_util::ConsumePrefix(&s, "^")) {
       dep.push_back(s.ToString());
     } else {
       dat.push_back(s);
diff --git a/tensorflow/core/framework/function_test.cc b/tensorflow/core/framework/function_test.cc
index 23685e9c536a67ca33fbabdab438e7192c8a47fc..44e1383719c9c903f956fca0b1ba93ec5df4adb4 100644
--- a/tensorflow/core/framework/function_test.cc
+++ b/tensorflow/core/framework/function_test.cc
@@ -496,7 +496,7 @@ MySelect(x:float) -> (z:float) {
 }
 
 static void HasError(const Status& s, const string& substr) {
-  EXPECT_TRUE(StringPiece(s.ToString()).contains(substr))
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), substr))
       << ">>" << s << "<<, expected substring >>" << substr << "<<";
 }
 
diff --git a/tensorflow/core/framework/graph_def_util.cc b/tensorflow/core/framework/graph_def_util.cc
index 896cb3cd7ffe45f2d528761403cfa4aaed902d96..f7539d37be08ce1235f35dcc0a8fd0bfcb12b434 100644
--- a/tensorflow/core/framework/graph_def_util.cc
+++ b/tensorflow/core/framework/graph_def_util.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/framework/versions.pb_text.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 
 namespace tensorflow {
@@ -94,7 +95,7 @@ static Status RemoveNewDefaultAttrsFromNodeDef(
   std::vector<string> to_remove;
   for (const auto& attr : node_def->attr()) {
     // If the attr is not in consumer_op_def and doesn't start with '_'...
-    if (!StringPiece(attr.first).starts_with("_") &&
+    if (!str_util::StartsWith(attr.first, "_") &&
         FindAttr(attr.first, *consumer_op_def) == nullptr) {
       const OpDef::AttrDef* producer_attr_def =
           FindAttr(attr.first, *producer_op_def);
diff --git a/tensorflow/core/framework/node_def_builder_test.cc b/tensorflow/core/framework/node_def_builder_test.cc
index e836873f667a6971b2c12d44860e5436a04cb93c..cc583df348b8d4d5416e428698fe1a49c29f3637 100644
--- a/tensorflow/core/framework/node_def_builder_test.cc
+++ b/tensorflow/core/framework/node_def_builder_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_def_builder.h"
 #include "tensorflow/core/framework/op_def_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -82,7 +83,7 @@ class NodeDefBuilderTest : public ::testing::Test {
     EXPECT_FALSE(status.ok()) << SummarizeNodeDef(node_def);
     if (status.ok()) return;
     for (const string& message : messages) {
-      EXPECT_TRUE(StringPiece(status.error_message()).contains(message))
+      EXPECT_TRUE(str_util::StrContains(status.error_message(), message))
           << status << ", " << message;
     }
   }
@@ -103,7 +104,7 @@ class NodeDefBuilderTest : public ::testing::Test {
     }
     EXPECT_FALSE(status.ok()) << SummarizeNodeDef(node_def);
     if (status.ok()) return;
-    EXPECT_TRUE(StringPiece(status.error_message()).contains(message))
+    EXPECT_TRUE(str_util::StrContains(status.error_message(), message))
         << "Actual error: " << status.error_message()
         << "\nDoes not contain: " << message;
   }
diff --git a/tensorflow/core/framework/node_def_util.cc b/tensorflow/core/framework/node_def_util.cc
index 95fb3863144e8150d78f5d21722f6bc102c451ea..bad92ca9b3d8c981a5dc56485d218179190e83d0 100644
--- a/tensorflow/core/framework/node_def_util.cc
+++ b/tensorflow/core/framework/node_def_util.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/strings/scanner.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/protobuf.h"
 
@@ -131,7 +132,7 @@ Status AttrSlice::Find(StringPiece attr_name,
   // Skip AttachDef for internal attrs since it is a little bit
   // expensive and it is common for them to correctly not be included
   // in a NodeDef.
-  if (!attr_name.starts_with("_") && ndef_ != nullptr) {
+  if (!str_util::StartsWith(attr_name, "_") && ndef_ != nullptr) {
     s = AttachDef(s, *ndef_);
   }
   return s;
@@ -399,7 +400,7 @@ Status ValidateNodeDef(const NodeDef& node_def, const OpDef& op_def) {
   size_t num_inputs = 0;
   // TODO(josh11b): Unify the input field validation.
   for (const string& input : node_def.input()) {
-    if (StringPiece(input).starts_with("^")) {
+    if (str_util::StartsWith(input, "^")) {
       seen_control = true;
       if (input.find(':') != string::npos) {
         return errors::InvalidArgument(
@@ -425,7 +426,7 @@ Status ValidateNodeDef(const NodeDef& node_def, const OpDef& op_def) {
   }
   for (const auto& attr : node_def.attr()) {
     // Allow internal optional attributes with names starting with "_".
-    if (StringPiece(attr.first).starts_with("_")) {
+    if (str_util::StartsWith(attr.first, "_")) {
       continue;
     }
     auto iter = op_attrs.find(attr.first);
diff --git a/tensorflow/core/framework/node_def_util_test.cc b/tensorflow/core/framework/node_def_util_test.cc
index ae3a93eafeefb2be3a85e546c085691a72caf2e1..2a49425dba9edeacf71b0ba41b78c082809ab2ae 100644
--- a/tensorflow/core/framework/node_def_util_test.cc
+++ b/tensorflow/core/framework/node_def_util_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_def_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -65,7 +66,7 @@ void ExpectFailure(const NodeDef& bad, const OpDef& op_def,
       << "; OpDef: " << SummarizeOpDef(op_def);
 
   LOG(INFO) << "Message: " << status.error_message();
-  EXPECT_TRUE(StringPiece(status.ToString()).contains(message))
+  EXPECT_TRUE(str_util::StrContains(status.ToString(), message))
       << "NodeDef: " << SummarizeNodeDef(bad)
       << "; OpDef: " << SummarizeOpDef(op_def) << "\nActual error: " << status
       << "\nDoes not contain: " << message;
@@ -265,7 +266,7 @@ void ExpectInvalidSyntax(const NodeDef& bad, const string& message) {
   EXPECT_TRUE(errors::IsInvalidArgument(status))
       << status << "; NodeDef: " << SummarizeNodeDef(bad);
 
-  EXPECT_TRUE(StringPiece(status.ToString()).contains(message))
+  EXPECT_TRUE(str_util::StrContains(StringPiece(status.ToString()), message))
       << "NodeDef: " << SummarizeNodeDef(bad) << ", " << status << ", "
       << message;
 }
diff --git a/tensorflow/core/framework/op.cc b/tensorflow/core/framework/op.cc
index fc5467b3c86934908c3f1261c79659c6a0469350..5f68c59fe9ae084569d16a38b99294b02a5c5bd8 100644
--- a/tensorflow/core/framework/op.cc
+++ b/tensorflow/core/framework/op.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/host_info.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mutex.h"
@@ -142,7 +143,7 @@ void OpRegistry::Export(bool include_internal, OpList* ops) const {
   out->Reserve(sorted.size());
 
   for (const auto& item : sorted) {
-    if (include_internal || !StringPiece(item.first).starts_with("_")) {
+    if (include_internal || !str_util::StartsWith(item.first, "_")) {
       *out->Add() = item.second->op_def;
     }
   }
diff --git a/tensorflow/core/framework/op_compatibility_test.cc b/tensorflow/core/framework/op_compatibility_test.cc
index b57bdcb841592578de4a2026d70b0e91bae66b02..c782480f1fa859715c46785faa22d01675c3c16e 100644
--- a/tensorflow/core/framework/op_compatibility_test.cc
+++ b/tensorflow/core/framework/op_compatibility_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -96,7 +97,7 @@ class OpCompatibilityTest : public OpsTestBase {
       ADD_FAILURE() << SummarizeOpDef(old_op_def) << " vs. "
                     << SummarizeOpDef(new_op_def);
     } else {
-      EXPECT_TRUE(StringPiece(status.error_message()).contains(error))
+      EXPECT_TRUE(str_util::StrContains(status.error_message(), error))
           << status << " does not contain " << error;
     }
   }
@@ -118,7 +119,7 @@ class OpCompatibilityTest : public OpsTestBase {
       ADD_FAILURE() << SummarizeNodeDef(*node_def());
     } else {
       EXPECT_TRUE(
-          StringPiece(status.error_message()).contains(validation_error))
+          str_util::StrContains(status.error_message(), validation_error))
           << status << " does not contain " << validation_error;
     }
 
@@ -179,7 +180,7 @@ class OpCompatibilityTest : public OpsTestBase {
                     << SummarizeOpDef(*new_op_def);
     } else {
       EXPECT_TRUE(
-          StringPiece(status.error_message()).contains(compatibility_error))
+          str_util::StrContains(status.error_message(), compatibility_error))
           << status << " does not contain " << compatibility_error;
     }
   }
diff --git a/tensorflow/core/framework/op_def.proto b/tensorflow/core/framework/op_def.proto
index ba545a19949e5574086756dc2092033341be4b30..ca0e5e7133af61a4c8e15be7cf8df903eeb648b0 100644
--- a/tensorflow/core/framework/op_def.proto
+++ b/tensorflow/core/framework/op_def.proto
@@ -126,6 +126,12 @@ message OpDef {
   // -------------------------------------------------------------------------
   // Optimization constraints.
 
+  // Ops are marked as stateful if their behavior depends on some state beyond
+  // their input tensors (e.g. variable reading op) or if they have
+  // a side-effect (e.g. printing or asserting ops). Equivalently, stateless ops
+  // must always produce the same output for the same input and have
+  // no side-effects.
+  //
   // By default Ops may be moved between devices.  Stateful ops should
   // either not be moved, or should only be moved if that state can also
   // be moved (e.g. via some sort of save / restore).
diff --git a/tensorflow/core/framework/op_def_builder.cc b/tensorflow/core/framework/op_def_builder.cc
index 962bc11ccbd2b9abdd4ce26dc3e75c45862cdc74..403bd0b5e22a314309ad0994a879d588c124fe54 100644
--- a/tensorflow/core/framework/op_def_builder.cc
+++ b/tensorflow/core/framework/op_def_builder.cc
@@ -112,9 +112,11 @@ bool ConsumeAttrNumber(StringPiece* sp, int64* out) {
 
 bool ConsumeCompoundAttrType(StringPiece* sp, StringPiece* out) {
   auto capture_begin = sp->begin();
-  if (sp->Consume("numbertype") || sp->Consume("numerictype") ||
-      sp->Consume("quantizedtype") || sp->Consume("realnumbertype") ||
-      sp->Consume("realnumberictype")) {
+  if (str_util::ConsumePrefix(sp, "numbertype") ||
+      str_util::ConsumePrefix(sp, "numerictype") ||
+      str_util::ConsumePrefix(sp, "quantizedtype") ||
+      str_util::ConsumePrefix(sp, "realnumbertype") ||
+      str_util::ConsumePrefix(sp, "realnumberictype")) {
     *out = StringPiece(capture_begin, sp->begin() - capture_begin);
     return true;
   }
@@ -155,32 +157,32 @@ void FinalizeAttr(StringPiece spec, OpDef* op_def,
   bool is_list = ConsumeListPrefix(&spec);
   string type;
   StringPiece type_string;  // Used if type == "type"
-  if (spec.Consume("string")) {
+  if (str_util::ConsumePrefix(&spec, "string")) {
     type = "string";
-  } else if (spec.Consume("int")) {
+  } else if (str_util::ConsumePrefix(&spec, "int")) {
     type = "int";
-  } else if (spec.Consume("float")) {
+  } else if (str_util::ConsumePrefix(&spec, "float")) {
     type = "float";
-  } else if (spec.Consume("bool")) {
+  } else if (str_util::ConsumePrefix(&spec, "bool")) {
     type = "bool";
-  } else if (spec.Consume("type")) {
+  } else if (str_util::ConsumePrefix(&spec, "type")) {
     type = "type";
-  } else if (spec.Consume("shape")) {
+  } else if (str_util::ConsumePrefix(&spec, "shape")) {
     type = "shape";
-  } else if (spec.Consume("tensor")) {
+  } else if (str_util::ConsumePrefix(&spec, "tensor")) {
     type = "tensor";
-  } else if (spec.Consume("func")) {
+  } else if (str_util::ConsumePrefix(&spec, "func")) {
     type = "func";
   } else if (ConsumeCompoundAttrType(&spec, &type_string)) {
     type = "type";
     AttrValue* allowed = attr->mutable_allowed_values();
     VERIFY(ProcessCompoundType(type_string, allowed),
            "Expected to see a compound type, saw: ", type_string);
-  } else if (spec.Consume("{")) {
+  } else if (str_util::ConsumePrefix(&spec, "{")) {
     // e.g. "{ int32, float, bool }" or "{ \"foo\", \"bar\" }"
     AttrValue* allowed = attr->mutable_allowed_values();
     str_util::RemoveLeadingWhitespace(&spec);
-    if (spec.starts_with("\"") || spec.starts_with("'")) {
+    if (str_util::StartsWith(spec, "\"") || str_util::StartsWith(spec, "'")) {
       type = "string";  // "{ \"foo\", \"bar\" }" or "{ 'foo', 'bar' }"
       while (true) {
         StringPiece escaped_string;
@@ -193,11 +195,12 @@ void FinalizeAttr(StringPiece spec, OpDef* op_def,
                "Trouble unescaping \"", escaped_string,
                "\", got error: ", error);
         allowed->mutable_list()->add_s(unescaped);
-        if (spec.Consume(",")) {
+        if (str_util::ConsumePrefix(&spec, ",")) {
           str_util::RemoveLeadingWhitespace(&spec);
-          if (spec.Consume("}")) break;  // Allow ending with ", }".
+          if (str_util::ConsumePrefix(&spec, "}"))
+            break;  // Allow ending with ", }".
         } else {
-          VERIFY(spec.Consume("}"),
+          VERIFY(str_util::ConsumePrefix(&spec, "}"),
                  "Expected , or } after strings in list, not: '", spec, "'");
           break;
         }
@@ -215,11 +218,12 @@ void FinalizeAttr(StringPiece spec, OpDef* op_def,
                  "Unrecognized type string '", type_string, "'");
           allowed->mutable_list()->add_type(dt);
         }
-        if (spec.Consume(",")) {
+        if (str_util::ConsumePrefix(&spec, ",")) {
           str_util::RemoveLeadingWhitespace(&spec);
-          if (spec.Consume("}")) break;  // Allow ending with ", }".
+          if (str_util::ConsumePrefix(&spec, "}"))
+            break;  // Allow ending with ", }".
         } else {
-          VERIFY(spec.Consume("}"),
+          VERIFY(str_util::ConsumePrefix(&spec, "}"),
                  "Expected , or } after types in list, not: '", spec, "'");
           break;
         }
@@ -232,7 +236,8 @@ void FinalizeAttr(StringPiece spec, OpDef* op_def,
 
   // Write the type into *attr.
   if (is_list) {
-    VERIFY(spec.Consume(")"), "Expected ) to close 'list(', not: '", spec, "'");
+    VERIFY(str_util::ConsumePrefix(&spec, ")"),
+           "Expected ) to close 'list(', not: '", spec, "'");
     str_util::RemoveLeadingWhitespace(&spec);
     attr->set_type(strings::StrCat("list(", type, ")"));
   } else {
@@ -240,7 +245,7 @@ void FinalizeAttr(StringPiece spec, OpDef* op_def,
   }
 
   // Read optional minimum constraint at the end.
-  if ((is_list || type == "int") && spec.Consume(">=")) {
+  if ((is_list || type == "int") && str_util::ConsumePrefix(&spec, ">=")) {
     int64 min_limit = -999;
     VERIFY(ConsumeAttrNumber(&spec, &min_limit),
            "Could not parse integer lower limit after '>=', found '", spec,
@@ -250,7 +255,7 @@ void FinalizeAttr(StringPiece spec, OpDef* op_def,
   }
 
   // Parse default value, if present.
-  if (spec.Consume("=")) {
+  if (str_util::ConsumePrefix(&spec, "=")) {
     str_util::RemoveLeadingWhitespace(&spec);
     VERIFY(ParseAttrValue(attr->type(), spec, attr->mutable_default_value()),
            "Could not parse default value '", spec, "'");
diff --git a/tensorflow/core/framework/op_def_util.cc b/tensorflow/core/framework/op_def_util.cc
index c80802aad3a6b84572096f726d90133ac5536526..9be0dc69d2c190274b3f8d473df170f3b4ed3660 100644
--- a/tensorflow/core/framework/op_def_util.cc
+++ b/tensorflow/core/framework/op_def_util.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/lib/strings/proto_serialization.h"
 #include "tensorflow/core/lib/strings/scanner.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/protobuf.h"
@@ -239,7 +240,7 @@ static Status ValidateArg(const OpDef::ArgDef& arg, const OpDef& op_def,
 Status ValidateOpDef(const OpDef& op_def) {
   using ::tensorflow::strings::Scanner;
 
-  if (!StringPiece(op_def.name()).starts_with("_")) {
+  if (!str_util::StartsWith(op_def.name(), "_")) {
     VALIDATE(Scanner(op_def.name())
                  .One(Scanner::UPPERLETTER)
                  .Any(Scanner::LETTER_DIGIT)
@@ -259,11 +260,11 @@ Status ValidateOpDef(const OpDef& op_def) {
 
     // Validate type
     StringPiece type(attr.type());
-    bool is_list = type.Consume("list(");
+    bool is_list = str_util::ConsumePrefix(&type, "list(");
     bool found = false;
     for (StringPiece valid : {"string", "int", "float", "bool", "type", "shape",
                               "tensor", "func"}) {
-      if (type.Consume(valid)) {
+      if (str_util::ConsumePrefix(&type, valid)) {
         found = true;
         break;
       }
@@ -271,8 +272,9 @@ Status ValidateOpDef(const OpDef& op_def) {
     VALIDATE(found, "Unrecognized type '", type, "' in attr '", attr.name(),
              "'");
     if (is_list) {
-      VALIDATE(type.Consume(")"), "'list(' is missing ')' in attr ",
-               attr.name(), "'s type ", attr.type());
+      VALIDATE(str_util::ConsumePrefix(&type, ")"),
+               "'list(' is missing ')' in attr ", attr.name(), "'s type ",
+               attr.type());
     }
     VALIDATE(type.empty(), "Extra '", type, "' at the end of attr ",
              attr.name(), "'s type ", attr.type());
diff --git a/tensorflow/core/framework/op_def_util_test.cc b/tensorflow/core/framework/op_def_util_test.cc
index 2b9812d4fcbc145540155959b19dd37cf902c1a2..4514d92e387b9de90b767d0a775272469006cf04 100644
--- a/tensorflow/core/framework/op_def_util_test.cc
+++ b/tensorflow/core/framework/op_def_util_test.cc
@@ -57,7 +57,7 @@ class ValidateOpDefTest : public ::testing::Test {
     EXPECT_FALSE(status.ok()) << "Did not see error with: " << message;
     if (!status.ok()) {
       LOG(INFO) << "message: " << status;
-      EXPECT_TRUE(StringPiece(status.ToString()).contains(message))
+      EXPECT_TRUE(str_util::StrContains(status.ToString(), message))
           << "Actual: " << status << "\nExpected to contain: " << message;
     }
   }
diff --git a/tensorflow/core/framework/op_gen_lib.cc b/tensorflow/core/framework/op_gen_lib.cc
index 5f2eb9d99ab11f9862bd277d93af61c05e2517f4..7f23272871abe96dfa2fd7240bfc82015178bda6 100644
--- a/tensorflow/core/framework/op_gen_lib.cc
+++ b/tensorflow/core/framework/op_gen_lib.cc
@@ -50,10 +50,10 @@ string WordWrap(StringPiece prefix, StringPiece str, int width) {
     StringPiece to_append = str.substr(0, space);
     str.remove_prefix(space + 1);
     // Remove spaces at break.
-    while (to_append.ends_with(" ")) {
+    while (str_util::EndsWith(to_append, " ")) {
       to_append.remove_suffix(1);
     }
-    while (str.Consume(" ")) {
+    while (str_util::ConsumePrefix(&str, " ")) {
     }
 
     // Go on to the next line.
@@ -65,8 +65,9 @@ string WordWrap(StringPiece prefix, StringPiece str, int width) {
 }
 
 bool ConsumeEquals(StringPiece* description) {
-  if (description->Consume("=")) {
-    while (description->Consume(" ")) {  // Also remove spaces after "=".
+  if (str_util::ConsumePrefix(description, "=")) {
+    while (str_util::ConsumePrefix(description,
+                                   " ")) {  // Also remove spaces after "=".
     }
     return true;
   }
@@ -98,7 +99,7 @@ static bool StartsWithFieldName(StringPiece line,
                                 const std::vector<string>& multi_line_fields) {
   StringPiece up_to_colon;
   if (!SplitAt(':', &line, &up_to_colon)) return false;
-  while (up_to_colon.Consume(" "))
+  while (str_util::ConsumePrefix(&up_to_colon, " "))
     ;  // Remove leading spaces.
   for (const auto& field : multi_line_fields) {
     if (up_to_colon == field) {
@@ -119,9 +120,9 @@ static bool ConvertLine(StringPiece line,
   StringPiece up_to_colon;
   StringPiece after_colon = line;
   SplitAt(':', &after_colon, &up_to_colon);
-  while (after_colon.Consume(" "))
+  while (str_util::ConsumePrefix(&after_colon, " "))
     ;  // Remove leading spaces.
-  if (!after_colon.Consume("\"")) {
+  if (!str_util::ConsumePrefix(&after_colon, "\"")) {
     // We only convert string fields, so don't convert this line.
     return false;
   }
@@ -181,9 +182,9 @@ string PBTxtToMultiline(StringPiece pbtxt,
 static bool FindMultiline(StringPiece line, size_t colon, string* end) {
   if (colon == StringPiece::npos) return false;
   line.remove_prefix(colon + 1);
-  while (line.Consume(" ")) {
+  while (str_util::ConsumePrefix(&line, " ")) {
   }
-  if (line.Consume("<<")) {
+  if (str_util::ConsumePrefix(&line, "<<")) {
     *end = line.ToString();
     return true;
   }
@@ -228,7 +229,7 @@ string PBTxtFromMultiline(StringPiece multiline_pbtxt) {
     string suffix;
     while (!multiline_pbtxt.empty()) {
       SplitAt('\n', &multiline_pbtxt, &line);
-      if (line.Consume(end)) break;
+      if (str_util::ConsumePrefix(&line, end)) break;
       if (first) {
         first = false;
       } else {
diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc
index 8654437059ca449432e6381b9eb3c4ba15e56f48..cfde1e8ea33b46f84a1fb185d5e2dc45e116deec 100644
--- a/tensorflow/core/framework/op_kernel.cc
+++ b/tensorflow/core/framework/op_kernel.cc
@@ -282,8 +282,13 @@ OpKernelContext::~OpKernelContext() {
 }
 
 Allocator* OpKernelContext::get_allocator(AllocatorAttributes attr) {
-  Allocator* allocator =
-      params_->device->GetStepAllocator(attr, resource_manager());
+  Allocator* allocator = nullptr;
+  if (attr.scope_id > 0) {
+    allocator = params_->device->GetScopedAllocator(attr, step_id());
+    CHECK(allocator);
+  } else {
+    allocator = params_->device->GetStepAllocator(attr, resource_manager());
+  }
   if (track_allocations()) {
     mutex_lock lock(mu_);
     for (const auto& wrapped : wrapped_allocators_) {
@@ -360,7 +365,7 @@ Status OpKernelContext::input_ref_mutex(StringPiece name, mutex** out_mutex) {
 
 const Tensor& OpKernelContext::input(int index) {
   DCHECK_GE(index, 0);
-  DCHECK_LT(index, num_inputs());
+  DCHECK_LT(index, num_inputs()) << " name: " << op_kernel().name();
   DCHECK(!input_is_ref(index));
   const Tensor& tensor = *((*params_->inputs)[index].tensor);
   record_tensor_reference(tensor);
@@ -415,8 +420,8 @@ bool OpKernelContext::forward_input_to_output_with_shape(
                                ? AllocatorAttributes()
                                : output_alloc_attr(output_index);
   std::unique_ptr<Tensor> new_tensor = forward_input(
-      input_index, expected_output_dtype(output_index), output_shape,
-      output_memory_type(output_index), output_attr);
+      input_index, output_index, expected_output_dtype(output_index),
+      output_shape, output_memory_type(output_index), output_attr);
   if (new_tensor != nullptr) {
     // Transfer ownership to the output slot in OpKernelContext.
     outputs_[output_index] = TensorValue(new_tensor.release());
@@ -456,35 +461,66 @@ Status OpKernelContext::forward_input_to_output_with_shape(
 }
 
 std::unique_ptr<Tensor> OpKernelContext::forward_input(
-    int input_index, DataType output_dtype, const TensorShape& output_shape,
-    MemoryType output_memory_type, const AllocatorAttributes& output_attr) {
+    int input_index, int output_index, DataType output_dtype,
+    const TensorShape& output_shape, MemoryType output_memory_type,
+    const AllocatorAttributes& output_attr) {
   DCHECK_GE(input_index, 0);
   DCHECK_LT(input_index, num_inputs());
   const TensorValue& input = (*params_->inputs)[input_index];
-  // Check that input tensor exists, is not a ref, and has no other consumers.
-  if (input.tensor == nullptr || input.is_ref() || !input->RefCountIsOne()) {
+  // Check whether at graph construction time this output was marked
+  // either for no forwarding or with a reservation for this input.
+  // If it's reserved for this input we'll skip the refcount and
+  // AllocatorAttribute checks.
+  // TODO(tucker): Maybe we should skip all of the checks?
+  bool never_forward =
+      (params_->forward_from_array != nullptr && output_index >= 0 &&
+       params_->forward_from_array[output_index] == Params::kNeverForward);
+  if (never_forward) return nullptr;
+  bool forward_expected =
+      (params_->forward_from_array != nullptr && output_index >= 0 &&
+       params_->forward_from_array[output_index] == input_index);
+  if (!forward_expected && params_->forward_from_array != nullptr) {
+    // Check for possibly conflicting forward.
+    for (int i = 0; i < num_outputs(); ++i) {
+      if (params_->forward_from_array[i] == input_index) {
+        // This input is reserved for output i.
+        return nullptr;
+      }
+    }
+  }
+  // Check that input tensor exists and is not a ref.
+  if (input.tensor == nullptr || input.is_ref()) {
+    CHECK(!forward_expected);
     return nullptr;
   }
   // Check that input type matches.
   if (input_dtype(input_index) != output_dtype) {
+    CHECK(!forward_expected);
     return nullptr;
   }
   // Check that the input and output sizes are compatible.
   if (input.tensor->shape().num_elements() != output_shape.num_elements()) {
+    CHECK(!forward_expected);
     return nullptr;
   }
   // Check that input and output memory types match, i.e.
   // that they either both live in host or both live in device memory.
   if (input_memory_type(input_index) != output_memory_type) {
+    CHECK(!forward_expected);
     return nullptr;
   }
-  // Check that output allocator attributes are not more restrictive than
-  // input allocator attributes.
-  const auto input_attr = params_->input_alloc_attrs == nullptr
-                              ? AllocatorAttributes()
-                              : input_alloc_attr(input_index);
-  if (!output_attr.IsEqualOrLessRestrictiveThan(input_attr)) {
-    return nullptr;
+  if (!forward_expected) {
+    if (!input->RefCountIsOne()) {
+      return nullptr;
+    }
+    // Check that output allocator attributes are not more restrictive than
+    // input allocator attributes.
+    const auto input_attr = params_->input_alloc_attrs == nullptr
+                                ? AllocatorAttributes()
+                                : input_alloc_attr(input_index);
+    if (!output_attr.IsEqualOrLessRestrictiveThan(input_attr)) {
+      return nullptr;
+    }
   }
   // TODO(rmlarsen): Use MakeUnique here. There is already a copy in
   // tensorflow/compiler/xla/ptr_util.h. Perhaps this should be part of
@@ -500,7 +536,8 @@ Status OpKernelContext::forward_input_or_allocate_temp(
     Tensor* out_temp) {
   for (int input_index : candidate_input_indices) {
     std::unique_ptr<Tensor> new_tensor =
-        forward_input(input_index, type, shape, DEVICE_MEMORY, allocator_attr);
+        forward_input(input_index, Params::kNoReservation /*output_index*/,
+                      type, shape, DEVICE_MEMORY, allocator_attr);
     if (new_tensor != nullptr) {
       *out_temp = std::move(*new_tensor);
       return Status::OK();
@@ -590,6 +627,14 @@ Status OpKernelContext::allocate_output(int index, const TensorShape& shape,
                                         Tensor** output) {
   DCHECK_GE(index, 0);
   DCHECK_LT(index, num_outputs());
+  bool forward_expected =
+      (params_->forward_from_array != nullptr && index >= 0 &&
+       params_->forward_from_array[index] >= 0);
+  if (forward_expected) {
+    return errors::Internal(
+        "Explicit allocate_output call where input forwarding required.  Try "
+        "turning off the ScopedAllocator optimizer.");
+  }
   AllocatorAttributes attr = output_alloc_attr(index);
   return allocate_output(index, shape, output, attr);
 }
diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h
index 5ccd45efc980393aa02582595dde873be7426e26..67943377b9f5cd2dcb5f9dc347011db22fac1726 100644
--- a/tensorflow/core/framework/op_kernel.h
+++ b/tensorflow/core/framework/op_kernel.h
@@ -64,10 +64,11 @@ class AsyncOpKernel;
 class CallFrameInterface;
 class FunctionLibraryRuntime;
 class OpKernelConstruction;  // declared below
-class OpKernelContext;       // declared below
+class OpKernelContext;       // declared below,
 class OpRegistryInterface;
 class ResourceMgr;
 class ScopedStepContainer;
+class CollectiveExecutor;
 class StepStatsCollector;
 
 class OpKernel {
@@ -532,6 +533,10 @@ class OpKernelContext {
     // computations running on other devices.
     Rendezvous* rendezvous = nullptr;
 
+    // Mechanism for executing a collective op that needs to coordinate
+    // with parallel instances runing on other devices.
+    CollectiveExecutor* collective_executor = nullptr;
+
     // The session state for this op.
     SessionState* session_state = nullptr;
 
@@ -565,6 +570,12 @@ class OpKernelContext {
 
     // TensorSliceReaderCache support.
     checkpoint::TensorSliceReaderCacheWrapper* slice_reader_cache = nullptr;
+
+    // Support for forwarding reservations (used by ScopedAllocator).
+    static const int kNeverForward = -2;
+    static const int kNoReservation = -1;
+    // Values in [0,...) represent reservations for the indexed output.
+    const int* forward_from_array = nullptr;
   };
 
   // params must outlive the OpKernelContext.
@@ -707,14 +718,31 @@ class OpKernelContext {
   //     input[input_index] are compatible with those given in dtype, shape,
   //     memory_type, and attr,
   //   * refcount on the underlying buffer is one.
+  //   * Either there is no forwarding reservation for either input_index
+  //     or output_index or the specified input is reserved for the specified
+  //     output. More precisely:
+  //
+  //     These cases mean neither input nor output has a reservation:
+  //        forward_from_array = nullptr
+  //     OR (input_index is not in forward_from_array AND
+  //         (output_index == kNoReservation OR
+  //          forward_from_array[output_index] == kNoReservation))
+  //
+  //     This case means that input_index is reserved for output_index:
+  //        forward_from_array[output_index] == input_index
+  //
+  //     This case means the output is reserved to always be allocated,
+  //     never assigned a forwarded input:
+  //        forward_from_array[output_index] == kNeverForward
+  //
   // Otherwise returns nullptr.
   // NOTE: For Cuda kernels that read inputs using the __ldg() intrinsic,
   // forwarding is only safe if there are no reads via __ldg() after writes
   // to the same address.
   std::unique_ptr<Tensor> forward_input(
-      int input_index, DataType dtype, const TensorShape& shape,
-      MemoryType memory_type,
-      const AllocatorAttributes& attr) TF_MUST_USE_RESULT;
+      int input_index, int output_index, DataType output_dtype,
+      const TensorShape& output_shape, MemoryType output_memory_type,
+      const AllocatorAttributes& output_attr) TF_MUST_USE_RESULT;
 
   // Tries to forward one of the inputs given in input_indices to
   // output[output_index]. If none of the given inputs can be forwarded, calls
@@ -934,6 +962,10 @@ class OpKernelContext {
   // Rendezvous Send() and Recv().
   Rendezvous* rendezvous() const { return params_->rendezvous; }
 
+  CollectiveExecutor* collective_executor() const {
+    return params_->collective_executor;
+  }
+
   // An op kernel can access the session state it belongs to.
   SessionState* session_state() const { return params_->session_state; }
 
@@ -1101,7 +1133,8 @@ class OpKernelContext {
   void NotifyUseOfPersistentTensor(const Tensor& tensor);
 
   Status status_;
-  Params* params_;    // not owned
+  friend class CollectiveExecutor;  // for access to params_
+  Params* params_;                  // not owned
   mutable mutex mu_;  // mutable so const accessors can acquire the lock
   gtl::InlinedVector<WrappedAllocator, 4> wrapped_allocators_ GUARDED_BY(mu_);
   gtl::InlinedVector<TensorValue, 4> outputs_;
diff --git a/tensorflow/core/framework/op_kernel_test.cc b/tensorflow/core/framework/op_kernel_test.cc
index b53b877f28d2c80e969fb418aa316ad96c6e2eaa..bcd409e5c54b7d63137dd9d236d21bb3ec7b4f56 100644
--- a/tensorflow/core/framework/op_kernel_test.cc
+++ b/tensorflow/core/framework/op_kernel_test.cc
@@ -546,9 +546,9 @@ TEST_F(OpKernelBuilderTest, BuilderTypeListAttr) {
                                             {"T|list(type)|[DT_FLOAT]"}));
 
   ExpectFailure("BuildTypeListAttr", DEVICE_CPU, {}, error::INVALID_ARGUMENT);
-  EXPECT_TRUE(
-      StringPiece(GetKernelClassName("BuildTypeListAttr", DEVICE_CPU, {}))
-          .contains("Invalid argument: "));
+  EXPECT_TRUE(str_util::StrContains(
+      GetKernelClassName("BuildTypeListAttr", DEVICE_CPU, {}),
+      "Invalid argument: "));
 
   ExpectFailure("BuildTypeListAttr", DEVICE_CPU, {"T|int|7"},
                 error::INVALID_ARGUMENT);
@@ -565,8 +565,8 @@ TEST_F(OpKernelBuilderTest, DuplicateKernel) {
   DeviceTypeVector devs;
   Status status = SupportedDeviceTypesForNode(DeviceTypes(), ndef, &devs);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("Multiple OpKernel registrations match NodeDef"));
+  EXPECT_TRUE(str_util::StrContains(
+      status.error_message(), "Multiple OpKernel registrations match NodeDef"));
 
   ExpectFailure("DuplicateKernel", DEVICE_CPU, {}, error::INVALID_ARGUMENT);
 }
@@ -585,8 +585,8 @@ TEST_F(OpKernelBuilderTest, DuplicateKernelForT) {
   DeviceTypeVector devs;
   Status status = SupportedDeviceTypesForNode(DeviceTypes(), ndef, &devs);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("Multiple OpKernel registrations match NodeDef"));
+  EXPECT_TRUE(str_util::StrContains(
+      status.error_message(), "Multiple OpKernel registrations match NodeDef"));
 
   ExpectFailure("DuplicateKernelForT", DEVICE_CPU, {"T|type|DT_FLOAT"},
                 error::INVALID_ARGUMENT);
@@ -606,8 +606,9 @@ TEST_F(OpKernelBuilderTest, BadConstraint) {
   DeviceTypeVector devs;
   Status status = SupportedDeviceTypesForNode(DeviceTypes(), ndef, &devs);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("OpKernel 'BadConstraint' has constraint on attr "
+  EXPECT_TRUE(
+      str_util::StrContains(status.error_message(),
+                            "OpKernel 'BadConstraint' has constraint on attr "
                             "'T' not in NodeDef"));
 
   ExpectFailure("BadConstraint", DEVICE_CPU, {"dtype|type|DT_FLOAT"},
diff --git a/tensorflow/core/framework/resource_mgr.h b/tensorflow/core/framework/resource_mgr.h
index 9a458431e7c1b038c3177b2aa58e21dfa3e4e837..c84ea3b034cc20329b20af111f6b08ceebbfb80b 100644
--- a/tensorflow/core/framework/resource_mgr.h
+++ b/tensorflow/core/framework/resource_mgr.h
@@ -319,14 +319,13 @@ class IsResourceInitialized : public OpKernel {
 // specified type. The type will be a part of the generated op name.
 // TODO(apassos): figure out how to get non-cpu-allocated tensors to work
 // through constant folding so this doesn't have to be marked as stateful.
-#define REGISTER_RESOURCE_HANDLE_OP(Type)                   \
-  REGISTER_OP(#Type "HandleOp")                             \
-      .Attr("container: string = ''")                       \
-      .Attr("shared_name: string = ''")                     \
-      .Output("resource: resource")                         \
-      .SetIsStateful()                                      \
-      .SetShapeFn(tensorflow::shape_inference::ScalarShape) \
-      .Doc("Creates a handle to a " #Type)
+#define REGISTER_RESOURCE_HANDLE_OP(Type) \
+  REGISTER_OP(#Type "HandleOp")           \
+      .Attr("container: string = ''")     \
+      .Attr("shared_name: string = ''")   \
+      .Output("resource: resource")       \
+      .SetIsStateful()                    \
+      .SetShapeFn(tensorflow::shape_inference::ScalarShape)
 
 // Utility op kernel to produce a handle to a resource of type T.
 template <typename T>
diff --git a/tensorflow/core/framework/resource_mgr_test.cc b/tensorflow/core/framework/resource_mgr_test.cc
index 07272e2374cbf4fb46c5b8da5df73ef4d6858c62..798220d4c35c502df61c93b78ccd100d7c4b5ad5 100644
--- a/tensorflow/core/framework/resource_mgr_test.cc
+++ b/tensorflow/core/framework/resource_mgr_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -71,7 +72,7 @@ string LookupOrCreate(ResourceMgr* rm, const string& container,
 }
 
 static void HasError(const Status& s, const string& substr) {
-  EXPECT_TRUE(StringPiece(s.ToString()).contains(substr))
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), substr))
       << s << ", expected substring " << substr;
 }
 
diff --git a/tensorflow/core/framework/shape_inference.cc b/tensorflow/core/framework/shape_inference.cc
index 641681973a1004f15163217684001c96592731d8..54ecaa5dd431d435fe04948223d565802d525be0 100644
--- a/tensorflow/core/framework/shape_inference.cc
+++ b/tensorflow/core/framework/shape_inference.cc
@@ -298,13 +298,23 @@ bool InferenceContext::FullyDefined(ShapeHandle s) {
 DimensionHandle InferenceContext::NumElements(ShapeHandle s) {
   const auto rank = Rank(s);
   if (rank == kUnknownRank) return UnknownDim();
+  bool found_unknown = false;
   int64 size = 1;
   for (int i = 0; i < rank; ++i) {
     int64 dim_val = Value(Dim(s, i));
-    if (dim_val == kUnknownDim) return UnknownDim();
-    size *= dim_val;
+    if (dim_val == kUnknownDim) {
+      found_unknown = true;
+    } else if (dim_val == 0) {
+      return MakeDim(0);
+    } else {
+      size *= dim_val;
+    }
+  }
+  if (found_unknown) {
+    return UnknownDim();
+  } else {
+    return MakeDim(size);
   }
-  return MakeDim(size);
 }
 
 string InferenceContext::DebugString(ShapeHandle s) {
diff --git a/tensorflow/core/framework/shape_inference.h b/tensorflow/core/framework/shape_inference.h
index e3cc848a169bd848b8f3617d552938ba1ced3663..accc587000767554f87a195e0ea33640cd696244 100644
--- a/tensorflow/core/framework/shape_inference.h
+++ b/tensorflow/core/framework/shape_inference.h
@@ -317,6 +317,7 @@ class InferenceContext {
     input_tensors_as_shapes_ = input_tensors_as_shapes;
   }
 
+  ShapeHandle output(int64 idx) const { return outputs_[idx]; }
   void set_output(int idx, ShapeHandle shape) { outputs_[idx] = shape; }
   Status set_output(StringPiece output_name,
                     const std::vector<ShapeHandle>& shapes);
diff --git a/tensorflow/core/framework/shape_inference_test.cc b/tensorflow/core/framework/shape_inference_test.cc
index f48a7b9c47df3cfa93434ccf585dda8c5a29a2ba..da103bfec97b3b487f94c1dfd5de21bcca4717ca 100644
--- a/tensorflow/core/framework/shape_inference_test.cc
+++ b/tensorflow/core/framework/shape_inference_test.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -152,10 +153,9 @@ TEST_F(ShapeInferenceTest, Run) {
     };
     Status s = c.Run(fn);
     // Extra error message is attached when Run fails.
-    EXPECT_TRUE(StringPiece(s.ToString())
-                    .contains("Shape must be at most rank 0 but "
-                              "is rank 1 for 'foo' (op: "
-                              "'foo_op')"))
+    EXPECT_TRUE(str_util::StrContains(
+        s.ToString(),
+        "Shape must be at most rank 0 but is rank 1 for 'foo' (op: 'foo_op')"))
         << s;
   }
 }
@@ -367,10 +367,9 @@ TEST_F(ShapeInferenceTest, WithRankAtMost) {
 
   // WithRankAtMost on shape with known dimensionality.
   s1 = in1;
-  EXPECT_TRUE(
-      StringPiece(c.WithRankAtMost(in1, 2, &s1).ToString())
-          .contains(
-              "Invalid argument: Shape must be at most rank 2 but is rank 3"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.WithRankAtMost(in1, 2, &s1).ToString(),
+      "Invalid argument: Shape must be at most rank 2 but is rank 3"));
 
   EXPECT_FALSE(IsSet(s1));
   EXPECT_TRUE(c.WithRankAtMost(in1, 3, &s1).ok());
@@ -406,10 +405,9 @@ TEST_F(ShapeInferenceTest, WithRankAtLeast) {
 
   // WithRankAtLeast on shape with known dimensionality.
   s1 = in1;
-  EXPECT_TRUE(
-      StringPiece(c.WithRankAtLeast(in1, 4, &s1).ToString())
-          .contains(
-              "Invalid argument: Shape must be at least rank 4 but is rank 3"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.WithRankAtLeast(in1, 4, &s1).ToString(),
+      "Invalid argument: Shape must be at least rank 4 but is rank 3"));
 
   EXPECT_FALSE(IsSet(s1));
   EXPECT_TRUE(c.WithRankAtLeast(in1, 3, &s1).ok());
@@ -449,12 +447,14 @@ TEST_F(ShapeInferenceTest, WithValue) {
   // WithValue on dimension with known size.
   out1 = d0;
 
-  EXPECT_TRUE(StringPiece(c.WithValue(d0, 0, &out1).ToString())
-                  .contains("Invalid argument: Dimension must be 0 but is 1"));
+  EXPECT_TRUE(
+      str_util::StrContains(c.WithValue(d0, 0, &out1).ToString(),
+                            "Invalid argument: Dimension must be 0 but is 1"));
   EXPECT_FALSE(IsSet(out1));
   out1 = d0;
-  EXPECT_TRUE(StringPiece(c.WithValue(d0, 2, &out1).ToString())
-                  .contains("Invalid argument: Dimension must be 2 but is 1"));
+  EXPECT_TRUE(
+      str_util::StrContains(c.WithValue(d0, 2, &out1).ToString(),
+                            "Invalid argument: Dimension must be 2 but is 1"));
 
   EXPECT_FALSE(IsSet(out1));
   EXPECT_TRUE(c.WithValue(d0, 1, &out1).ok());
@@ -513,16 +513,14 @@ TEST_F(ShapeInferenceTest, MergeDim) {
   EXPECT_EQ(3, merged_dims.size());
 
   // Merging unequal values is an error.
-  EXPECT_TRUE(
-      StringPiece(c.Merge(d2, d1, &out).ToString())
-          .contains(
-              "Invalid argument: Dimensions must be equal, but are 2 and 1"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Merge(d2, d1, &out).ToString(),
+      "Invalid argument: Dimensions must be equal, but are 2 and 1"));
 
   EXPECT_FALSE(IsSet(out));
-  EXPECT_TRUE(
-      StringPiece(c.Merge(d1, d2, &out).ToString())
-          .contains(
-              "Invalid argument: Dimensions must be equal, but are 1 and 2"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Merge(d1, d2, &out).ToString(),
+      "Invalid argument: Dimensions must be equal, but are 1 and 2"));
 
   EXPECT_FALSE(IsSet(out));
 
@@ -729,26 +727,23 @@ TEST_F(ShapeInferenceTest, MergeShape) {
 
   // Incompatible merges give errors and set out to nullptr.
   out = s_unknown;
-  EXPECT_TRUE(
-      StringPiece(c.Merge(s_u_2, s_1_3, &out).ToString())
-          .contains(
-              "Invalid argument: Dimension 1 in both shapes must be equal, but "
-              "are 2 and 3"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Merge(s_u_2, s_1_3, &out).ToString(),
+      "Invalid argument: Dimension 1 in both shapes must be equal, but "
+      "are 2 and 3"));
 
   EXPECT_FALSE(IsSet(out));
   out = s_unknown;
-  EXPECT_TRUE(
-      StringPiece(c.Merge(s_1_3, s_u_2, &out).ToString())
-          .contains(
-              "Invalid argument: Dimension 1 in both shapes must be equal, but "
-              "are 3 and 2"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Merge(s_1_3, s_u_2, &out).ToString(),
+      "Invalid argument: Dimension 1 in both shapes must be equal, but "
+      "are 3 and 2"));
 
   EXPECT_FALSE(IsSet(out));
   out = s_unknown;
-  EXPECT_TRUE(
-      StringPiece(c.Merge(s_1, s_1_2, &out).ToString())
-          .contains(
-              "Invalid argument: Shapes must be equal rank, but are 1 and 2"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Merge(s_1, s_1_2, &out).ToString(),
+      "Invalid argument: Shapes must be equal rank, but are 1 and 2"));
 
   EXPECT_FALSE(IsSet(out));
 
@@ -795,22 +790,18 @@ TEST_F(ShapeInferenceTest, MergePrefix) {
   // Incompatible merges give errors and set outs to nullptr.
   s_out = s_unknown;
   s_prefix_out = s_unknown;
-  EXPECT_TRUE(
-      StringPiece(
-          c.MergePrefix(s_1_u_3, s_2_4, &s_out, &s_prefix_out).ToString())
-          .contains(
-              "Invalid argument: Dimensions must be equal, but are 1 and 2"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.MergePrefix(s_1_u_3, s_2_4, &s_out, &s_prefix_out).ToString(),
+      "Invalid argument: Dimensions must be equal, but are 1 and 2"));
 
   EXPECT_FALSE(IsSet(s_out));
   EXPECT_FALSE(IsSet(s_prefix_out));
 
   s_out = s_unknown;
   s_prefix_out = s_unknown;
-  EXPECT_TRUE(
-      StringPiece(
-          c.MergePrefix(s_2_4, s_1_u_3, &s_out, &s_prefix_out).ToString())
-          .contains(
-              "Invalid argument: Shape must be at least rank 3 but is rank 2"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.MergePrefix(s_2_4, s_1_u_3, &s_out, &s_prefix_out).ToString(),
+      "Invalid argument: Shape must be at least rank 3 but is rank 2"));
   EXPECT_FALSE(IsSet(s_out));
   EXPECT_FALSE(IsSet(s_prefix_out));
 }
@@ -868,24 +859,21 @@ TEST_F(ShapeInferenceTest, Subshape) {
 
   // Errors.
   out = unknown;
-  EXPECT_TRUE(StringPiece(c.Subshape(in0, 6, -3, &out).ToString())
-                  .contains("Invalid argument: Subshape must have computed "
-                            "start <= end, but is 5 "
-                            "and 2 (computed from start 6 and end -3 over "
-                            "shape with rank 5)"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Subshape(in0, 6, -3, &out).ToString(),
+      "Invalid argument: Subshape must have computed start <= end, but is 5 "
+      "and 2 (computed from start 6 and end -3 over shape with rank 5)"));
   EXPECT_FALSE(IsSet(out));
   out = unknown;
-  EXPECT_TRUE(StringPiece(c.Subshape(in0, -50, 100, &out).ToString())
-                  .contains("Invalid argument: Subshape start out of "
-                            "bounds: -50, for shape with "
-                            "rank 5"));
+  EXPECT_TRUE(str_util::StrContains(c.Subshape(in0, -50, 100, &out).ToString(),
+                                    "Invalid argument: Subshape start out of "
+                                    "bounds: -50, for shape with rank 5"));
 
   EXPECT_FALSE(IsSet(out));
   out = unknown;
-  EXPECT_TRUE(StringPiece(c.Subshape(in0, 0, -50, &out).ToString())
-                  .contains("Invalid argument: Subshape end out of bounds: "
-                            "-50, for shape with rank "
-                            "5"));
+  EXPECT_TRUE(str_util::StrContains(c.Subshape(in0, 0, -50, &out).ToString(),
+                                    "Invalid argument: Subshape end out of "
+                                    "bounds: -50, for shape with rank 5"));
 
   EXPECT_FALSE(IsSet(out));
 }
@@ -1094,27 +1082,26 @@ TEST_F(ShapeInferenceTest, MakeShapeFromShapeTensor) {
   EXPECT_EQ("[]", create(&t));
 
   t = ::tensorflow::test::AsTensor<float>({1, 2, 3});
-  EXPECT_TRUE(
-      StringPiece(create(&t))
-          .contains("Input tensor must be int32 or int64, but was float"));
+  EXPECT_TRUE(str_util::StrContains(
+      create(&t), "Input tensor must be int32 or int64, but was float"));
 
   t = ::tensorflow::test::AsScalar<int32>(1);
-  EXPECT_TRUE(StringPiece(create(&t))
-                  .contains("Input tensor must be rank 1, but was rank 0"));
+  EXPECT_TRUE(str_util::StrContains(
+      create(&t), "Input tensor must be rank 1, but was rank 0"));
 
   t = ::tensorflow::test::AsTensor<int32>({1, 2}, TensorShape{2, 1});
-  EXPECT_TRUE(StringPiece(create(&t))
-                  .contains("Input tensor must be rank 1, but was rank 2"));
+  EXPECT_TRUE(str_util::StrContains(
+      create(&t), "Input tensor must be rank 1, but was rank 2"));
 
   // Test negative values for the dims.
   t = ::tensorflow::test::AsTensor<int64>({3, -2, 1});
-  EXPECT_TRUE(StringPiece(create(&t))
-                  .contains("Invalid value in tensor used for shape: -2"));
+  EXPECT_TRUE(str_util::StrContains(
+      create(&t), "Invalid value in tensor used for shape: -2"));
 
   // Test negative values for the dims.
   t = ::tensorflow::test::AsTensor<int32>({3, -2, 1});
-  EXPECT_TRUE(StringPiece(create(&t))
-                  .contains("Invalid value in tensor used for shape: -2"));
+  EXPECT_TRUE(str_util::StrContains(
+      create(&t), "Invalid value in tensor used for shape: -2"));
 
   // Test when the input shape is wrong.
   {
@@ -1172,9 +1159,9 @@ TEST_F(ShapeInferenceTest, MakeShapeFromShapeProto) {
   EXPECT_TRUE(c.MakeShapeFromShapeProto(proto, &out).ok());
   EXPECT_EQ("?", c.DebugString(out));
   proto.add_dim()->set_size(0);
-  EXPECT_TRUE(
-      StringPiece(c.MakeShapeFromShapeProto(proto, &out).error_message())
-          .contains("An unknown shape must not have any dimensions set."));
+  EXPECT_TRUE(str_util::StrContains(
+      c.MakeShapeFromShapeProto(proto, &out).error_message(),
+      "An unknown shape must not have any dimensions set."));
   EXPECT_FALSE(IsSet(out));
 
   // With known rank.
@@ -1188,10 +1175,10 @@ TEST_F(ShapeInferenceTest, MakeShapeFromShapeProto) {
 
   // With invalid dimension value.
   proto.add_dim()->set_size(-2);
-  EXPECT_TRUE(
-      StringPiece(c.MakeShapeFromShapeProto(proto, &out).error_message())
-          .contains("Shape [0,?,1000,-2] has dimensions with values below -1 "
-                    "(where -1 means unknown)"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.MakeShapeFromShapeProto(proto, &out).error_message(),
+      "Shape [0,?,1000,-2] has dimensions with values below -1 "
+      "(where -1 means unknown)"));
 
   EXPECT_FALSE(IsSet(out));
 }
@@ -1257,9 +1244,10 @@ TEST_F(ShapeInferenceTest, MakeDimForScalarInput) {
   EXPECT_TRUE(c.MakeDimForScalarInput(0, &d).ok());
   EXPECT_EQ("20", c.DebugString(d));
 
-  EXPECT_TRUE(StringPiece(c.MakeDimForScalarInput(1, &d).error_message())
-                  .contains("Dimension size, given by scalar input 1, must "
-                            "be non-negative but is -1"));
+  EXPECT_TRUE(
+      str_util::StrContains(c.MakeDimForScalarInput(1, &d).error_message(),
+                            "Dimension size, given by scalar input 1, must be "
+                            "non-negative but is -1"));
 
   // Same tests, with int64 values.
   t1 = tensorflow::test::AsScalar<int64>(20);
@@ -1267,9 +1255,10 @@ TEST_F(ShapeInferenceTest, MakeDimForScalarInput) {
   EXPECT_TRUE(c.MakeDimForScalarInput(0, &d).ok());
   EXPECT_EQ("20", c.DebugString(d));
 
-  EXPECT_TRUE(StringPiece(c.MakeDimForScalarInput(1, &d).error_message())
-                  .contains("Dimension size, given by scalar input 1, must "
-                            "be non-negative but is -1"));
+  EXPECT_TRUE(
+      str_util::StrContains(c.MakeDimForScalarInput(1, &d).error_message(),
+                            "Dimension size, given by scalar input 1, must be "
+                            "non-negative but is -1"));
 }
 
 TEST_F(ShapeInferenceTest, GetAttr) {
@@ -1322,33 +1311,33 @@ TEST_F(ShapeInferenceTest, Divide) {
   EXPECT_TRUE(c.Divide(d_6, d_2, evenly_divisible, &out).ok());
   EXPECT_EQ("3", c.DebugString(out));
 
-  EXPECT_TRUE(
-      StringPiece(c.Divide(d_6, 5, evenly_divisible, &out).error_message())
-          .contains("Dimension size must be evenly divisible by 5 but is 6"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Divide(d_6, 5, evenly_divisible, &out).error_message(),
+      "Dimension size must be evenly divisible by 5 but is 6"));
 
-  EXPECT_TRUE(
-      StringPiece(c.Divide(d_6, 0, evenly_divisible, &out).error_message())
-          .contains("Divisor must be positive but is 0"));
-  EXPECT_TRUE(
-      StringPiece(c.Divide(d_6, d_0, evenly_divisible, &out).error_message())
-          .contains("Divisor must be positive but is 0"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Divide(d_6, 0, evenly_divisible, &out).error_message(),
+      "Divisor must be positive but is 0"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Divide(d_6, d_0, evenly_divisible, &out).error_message(),
+      "Divisor must be positive but is 0"));
 
-  EXPECT_TRUE(
-      StringPiece(c.Divide(d_6, -1, evenly_divisible, &out).error_message())
-          .contains("Divisor must be positive but is -1"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Divide(d_6, -1, evenly_divisible, &out).error_message(),
+      "Divisor must be positive but is -1"));
 
   // Repeat error cases above with evenly_divisible=false.
   evenly_divisible = false;
   EXPECT_TRUE(c.Divide(d_6, 5, evenly_divisible, &out).ok());
   EXPECT_EQ("1", c.DebugString(out));
 
-  EXPECT_TRUE(
-      StringPiece(c.Divide(d_6, 0, evenly_divisible, &out).error_message())
-          .contains("Divisor must be positive but is 0"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Divide(d_6, 0, evenly_divisible, &out).error_message(),
+      "Divisor must be positive but is 0"));
 
-  EXPECT_TRUE(
-      StringPiece(c.Divide(d_6, -1, evenly_divisible, &out).error_message())
-          .contains("Divisor must be positive but is -1"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Divide(d_6, -1, evenly_divisible, &out).error_message(),
+      "Divisor must be positive but is -1"));
 }
 
 TEST_F(ShapeInferenceTest, Add) {
@@ -1396,11 +1385,9 @@ TEST_F(ShapeInferenceTest, Add) {
   EXPECT_TRUE(c.Add(d_0, d_6, &out).ok());
   EXPECT_TRUE(SameHandle(out, d_6));
 
-  EXPECT_TRUE(
-      StringPiece(c.Add(d_6, std::numeric_limits<int64>::max() - 5, &out)
-                      .error_message())
-          .contains(
-              "Dimension size overflow from adding 6 and 9223372036854775802"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Add(d_6, std::numeric_limits<int64>::max() - 5, &out).error_message(),
+      "Dimension size overflow from adding 6 and 9223372036854775802"));
 }
 
 TEST_F(ShapeInferenceTest, Subtract) {
@@ -1448,9 +1435,9 @@ TEST_F(ShapeInferenceTest, Subtract) {
   EXPECT_TRUE(c.Subtract(d_6, d_0, &out).ok());
   EXPECT_TRUE(SameHandle(out, d_6));
 
-  EXPECT_TRUE(
-      StringPiece(c.Subtract(d_5, d_6, &out).error_message())
-          .contains("Negative dimension size caused by subtracting 6 from 5"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Subtract(d_5, d_6, &out).error_message(),
+      "Negative dimension size caused by subtracting 6 from 5"));
 }
 
 TEST_F(ShapeInferenceTest, Multiply) {
diff --git a/tensorflow/core/framework/shape_inference_testutil.cc b/tensorflow/core/framework/shape_inference_testutil.cc
index b4765ab0b2c41a1b510364d755984b6ae68dd07a..b54dd220ab919a640c9cd58e112459999762e4d1 100644
--- a/tensorflow/core/framework/shape_inference_testutil.cc
+++ b/tensorflow/core/framework/shape_inference_testutil.cc
@@ -100,7 +100,7 @@ Status ShapeInferenceTestutil::InferShapes(ShapeInferenceTestOp op,
       }
     }
 
-    if (expected.starts_with("in")) {
+    if (str_util::StartsWith(expected, "in")) {
       if (in_index == -1) {
         return Unknown(err_prefix,
                        " should have matched an input shape by "
@@ -135,7 +135,9 @@ Status ShapeInferenceTestutil::InferShapes(ShapeInferenceTestOp op,
     }
 
     // Verify the dimensions.
-    CHECK(expected.starts_with("[") && expected.ends_with("]")) << expected;
+    CHECK(str_util::StartsWith(expected, "[") &&
+          str_util::EndsWith(expected, "]"))
+        << expected;
     expected.remove_prefix(1);
     expected.remove_suffix(1);
 
@@ -176,7 +178,7 @@ Status ShapeInferenceTestutil::InferShapes(ShapeInferenceTestOp op,
           return Unknown(err_prefix, " expected to be unknown but was ",
                          c.Value(out_dim), err_suffix);
         }
-      } else if (expected_dim.starts_with("d")) {
+      } else if (str_util::StartsWith(expected_dim, "d")) {
         // Compare the dimension values.
         auto v = str_util::Split(expected_dim, '|');
         if (in_dim_idx.first == -1) {
diff --git a/tensorflow/core/framework/shape_inference_testutil.h b/tensorflow/core/framework/shape_inference_testutil.h
index 7977841482efa396c8e0797d8c80a40c11b4df56..2a99af7659d9be0dbab505fc7147e7fcc15d67c9 100644
--- a/tensorflow/core/framework/shape_inference_testutil.h
+++ b/tensorflow/core/framework/shape_inference_testutil.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/shape_inference.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/public/version.h"
 
@@ -83,17 +84,17 @@ class ShapeInferenceTestutil {
       "", ::tensorflow::shape_inference::ShapeInferenceTestutil::InferShapes( \
               op, i, o)                                                       \
               .error_message())
-#define INFER_ERROR(error_substring, op, i)                                 \
-  {                                                                         \
-    string error_message =                                                  \
-        ::tensorflow::shape_inference::ShapeInferenceTestutil::InferShapes( \
-            op, i, "e")                                                     \
-            .error_message();                                               \
-    const string& substring = error_substring;                              \
-    EXPECT_NE("", error_message);                                           \
-    EXPECT_TRUE(StringPiece(error_message).contains(substring))             \
-        << "Expected to see '" << substring << "' in '" << error_message    \
-        << "'";                                                             \
+#define INFER_ERROR(error_substring, op, i)                                    \
+  {                                                                            \
+    string error_message =                                                     \
+        ::tensorflow::shape_inference::ShapeInferenceTestutil::InferShapes(    \
+            op, i, "e")                                                        \
+            .error_message();                                                  \
+    const string& substring = error_substring;                                 \
+    EXPECT_NE("", error_message);                                              \
+    EXPECT_TRUE(::tensorflow::str_util::StrContains(error_message, substring)) \
+        << "Expected to see '" << substring << "' in '" << error_message       \
+        << "'";                                                                \
   }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/shape_inference_testutil_test.cc b/tensorflow/core/framework/shape_inference_testutil_test.cc
index 20a6807064bea96f41cbd6035327d7a6db2f73b8..a4405b502cb68444fd43ad21af5922e3bd42ec42 100644
--- a/tensorflow/core/framework/shape_inference_testutil_test.cc
+++ b/tensorflow/core/framework/shape_inference_testutil_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -25,10 +26,11 @@ namespace shape_inference {
 
 namespace {
 
-#define EXPECT_CONTAINS(str, substr)                                 \
-  do {                                                               \
-    string s = (str);                                                \
-    EXPECT_TRUE(StringPiece(s).contains(substr)) << "String: " << s; \
+#define EXPECT_CONTAINS(str, substr)                            \
+  do {                                                          \
+    string s = (str);                                           \
+    EXPECT_TRUE(::tensorflow::str_util::StrContains(s, substr)) \
+        << "String: " << s;                                     \
   } while (false)
 
 static OpShapeInferenceFn* global_fn_ptr = nullptr;
@@ -97,8 +99,8 @@ TEST(ShapeInferenceTestutilTest, Failures) {
   auto error_message = ShapeInferenceTestutil::InferShapes(
                            ShapeInferenceTestOp("NoSuchOp"), "", "")
                            .error_message();
-  EXPECT_TRUE(StringPiece(error_message)
-                  .starts_with("Op type not registered 'NoSuchOp'"));
+  EXPECT_TRUE(
+      str_util::StartsWith(error_message, "Op type not registered 'NoSuchOp'"));
 
   // Wrong shape error messages.
   EXPECT_CONTAINS(RunInferShapes(op, "[1];[2];[1]", "?", fn_copy_input_0),
diff --git a/tensorflow/core/framework/tensor.cc b/tensorflow/core/framework/tensor.cc
index 5d32b71628263fe89d6f54fd07b2fe18bbb55e53..e2111d60389d51702463f377602067ddc1bade08 100644
--- a/tensorflow/core/framework/tensor.cc
+++ b/tensorflow/core/framework/tensor.cc
@@ -884,6 +884,20 @@ bool Tensor::CanUseDMA() const {
 #undef CASE
 
 namespace {
+
+// StrCat and StrAppend don't support Eigen::half directly at the moment, and
+// we would like to keep them compatible with their absl counterparts, for ease
+// of migration. We could rely on errors::internal::PrepareForStrCat() but the
+// logic is so simple we can just replicate it here, where it is close to its
+// usage and easy to change later. And there's the extra benefit of not
+// accessing an 'internal' namespace.
+inline const strings::AlphaNum& PrintOneElement(const strings::AlphaNum& a) {
+  return a;
+}
+inline float PrintOneElement(const Eigen::half& h) {
+  return static_cast<float>(h);
+}
+
 // Print from left dim to right dim recursively.
 template <typename T>
 void PrintOneDim(int dim_index, const gtl::InlinedVector<int64, 4>& shape,
@@ -896,7 +910,7 @@ void PrintOneDim(int dim_index, const gtl::InlinedVector<int64, 4>& shape,
     for (int64 i = 0; i < element_count; i++) {
       if (*data_index >= limit) return;
       if (i > 0) strings::StrAppend(result, " ");
-      strings::StrAppend(result, data[(*data_index)++]);
+      strings::StrAppend(result, PrintOneElement(data[(*data_index)++]));
     }
     return;
   }
@@ -927,7 +941,7 @@ string SummarizeArray(int64 limit, int64 num_elts,
   if (shape.empty()) {
     for (int64 i = 0; i < limit; ++i) {
       if (i > 0) strings::StrAppend(&ret, " ");
-      strings::StrAppend(&ret, array[i]);
+      strings::StrAppend(&ret, PrintOneElement(array[i]));
     }
     if (num_elts > limit) strings::StrAppend(&ret, "...");
     return ret;
diff --git a/tensorflow/core/framework/tensor.h b/tensorflow/core/framework/tensor.h
index 9ae4bb5a2c54021ddc1030abd7538540e7dba10f..4d10f7efb5d3aa2549912b043424c2d41dff27c0 100644
--- a/tensorflow/core/framework/tensor.h
+++ b/tensorflow/core/framework/tensor.h
@@ -483,6 +483,8 @@ class Tensor {
   friend class TensorTestHelper;      // For access to set_shape
   friend class OpKernelContext;       // For access to RefCountIsOne().
   friend class ScopedAllocator;       // For access to buf_.
+  friend class XlaTensorBuffer;  // For access to the private constructor taking
+                                 // the buffer
   template <typename Device, typename T>
   friend class AssignVariableOp;  // For access to RefCountIsOne().
   template <typename Device, typename T>
diff --git a/tensorflow/core/framework/tensor_shape.h b/tensorflow/core/framework/tensor_shape.h
index fe2ba375aa0c5c50009b3155338cd8860070d47a..be7e740c335ced3ec6826e804090927962d57285 100644
--- a/tensorflow/core/framework/tensor_shape.h
+++ b/tensorflow/core/framework/tensor_shape.h
@@ -25,7 +25,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
-#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
@@ -271,6 +271,12 @@ class TensorShapeBase : public TensorShapeRep {
   friend Status MakeShapeHelper(const T*, int64, S*);
 };
 
+/// Outputs `TensorShapeBase` to `std::ostream`.
+template <typename Shape>
+std::ostream& operator<<(std::ostream& os, const TensorShapeBase<Shape>& tsb) {
+  return os << tsb.DebugString();
+}
+
 /// Represents the shape of a Tensor.
 ///
 /// A tensor's shape is denoted by its number of dimensions and a size for each
diff --git a/tensorflow/core/framework/tensor_shape_test.cc b/tensorflow/core/framework/tensor_shape_test.cc
index d7517bb311d517351f4dd2a59438780482485dff..6329aa6d8edf3795ed8018b7802661749683fe41 100644
--- a/tensorflow/core/framework/tensor_shape_test.cc
+++ b/tensorflow/core/framework/tensor_shape_test.cc
@@ -198,6 +198,13 @@ TEST(TensorShapeTest, DataType) {
   EXPECT_EQ(TensorShapeTestHelper::data_type(&s2), DT_INVALID);
 }
 
+TEST(TensorShapeTest, ostream) {
+  TensorShape s({10, 5, 4});
+  std::stringstream ss;
+  ss << s;
+  EXPECT_EQ(ss.str(), "[10,5,4]");
+}
+
 // -----------------------------------------------------------------------
 // An old implementation of TensorShape using a different representation,
 // preserved here in the unittest to allow us to have a randomized unittest
diff --git a/tensorflow/core/framework/types.cc b/tensorflow/core/framework/types.cc
index adf4e1bae307d81d91e7e597fc882caf4c87601f..2280114de5110630a0b64742e1f050e589d00bd0 100644
--- a/tensorflow/core/framework/types.cc
+++ b/tensorflow/core/framework/types.cc
@@ -114,7 +114,7 @@ string DataTypeString(DataType dtype) {
 }
 
 bool DataTypeFromString(StringPiece sp, DataType* dt) {
-  if (sp.ends_with("_ref")) {
+  if (str_util::EndsWith(sp, "_ref")) {
     sp.remove_suffix(4);
     DataType non_ref;
     if (DataTypeFromString(sp, &non_ref) && !IsRefType(non_ref)) {
diff --git a/tensorflow/core/framework/types_test.cc b/tensorflow/core/framework/types_test.cc
index 60f2b4135a68c4eed618e3efb07758fbab85fa07..16b069c70a7640b4859680a630920990dea087ce 100644
--- a/tensorflow/core/framework/types_test.cc
+++ b/tensorflow/core/framework/types_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 
 #include "tensorflow/core/framework/type_traits.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -140,9 +141,8 @@ TEST(TypesTest, ComplexTypes) {
 TEST(TypesTest, IntegerTypes) {
   for (auto dt : AllTypes()) {
     const string name = DataTypeString(dt);
-    const StringPiece n = name;
-    EXPECT_EQ(DataTypeIsInteger(dt),
-              n.starts_with("int") || n.starts_with("uint"))
+    EXPECT_EQ(DataTypeIsInteger(dt), str_util::StartsWith(name, "int") ||
+                                         str_util::StartsWith(name, "uint"))
         << "DataTypeInteger failed for " << name;
   }
 }
diff --git a/tensorflow/core/framework/variant_op_copy_test.cc b/tensorflow/core/framework/variant_op_copy_test.cc
index 85e014f80434d2a2de2851d2cb361f4b0a0c9433..60fa7bd55937b81555d18dab455640326d98a73d 100644
--- a/tensorflow/core/framework/variant_op_copy_test.cc
+++ b/tensorflow/core/framework/variant_op_copy_test.cc
@@ -34,6 +34,7 @@ limitations under the License.
 #include "tensorflow/core/framework/variant_tensor_data.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/util/port.h"
 
@@ -259,8 +260,8 @@ TEST(VariantOpCopyTest, CreateConstOnGPUFailsGracefully) {
   ClientSession session(root);
   std::vector<Tensor> outputs;
   Status s = session.Run({create_const}, &outputs);
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("GPU copy from non-DMA string tensor"))
+  EXPECT_TRUE(str_util::StrContains(s.error_message(),
+                                    "GPU copy from non-DMA string tensor"))
       << s.ToString();
 }
 
@@ -365,8 +366,9 @@ TEST(VariantOpCopyTest, CreateCopyCPUToGPUStringFailsSafely) {
   std::vector<Tensor> outputs;
   Status err = session.Run({create_op, identity}, &outputs);
   EXPECT_EQ(err.code(), errors::Code::INVALID_ARGUMENT);
-  EXPECT_TRUE(StringPiece(err.error_message())
-                  .contains("During Variant Host->Device Copy: non-DMA-copy "
+  EXPECT_TRUE(
+      str_util::StrContains(err.error_message(),
+                            "During Variant Host->Device Copy: non-DMA-copy "
                             "attempted of tensor type: string"))
       << err.error_message();
 }
diff --git a/tensorflow/core/framework/variant_op_registry_test.cc b/tensorflow/core/framework/variant_op_registry_test.cc
index 06ca211c762748b1dacd4eb9623ffd2d72762cca..7055e62c0e745f61e072914ff2af4d4ff582963a 100644
--- a/tensorflow/core/framework/variant_op_registry_test.cc
+++ b/tensorflow/core/framework/variant_op_registry_test.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include <memory>
+#include "tensorflow/core/lib/strings/str_util.h"
 
 #define EIGEN_USE_THREADS
 
@@ -130,7 +131,7 @@ TEST(VariantOpShapeRegistryTest, TestBasic) {
   Variant v = vv_early_exit;
   Status s0 = (*shape_fn)(v, &shape);
   EXPECT_FALSE(s0.ok());
-  EXPECT_TRUE(StringPiece(s0.error_message()).contains("early exit!"));
+  EXPECT_TRUE(str_util::StrContains(s0.error_message(), "early exit!"));
 
   VariantValue vv_ok{false /* early_exit */};
   v = vv_ok;
@@ -229,7 +230,7 @@ TEST(VariantOpZerosLikeRegistryTest, TestBasicCPU) {
                                         ZEROS_LIKE_VARIANT_UNARY_OP, v, &v_out);
   EXPECT_FALSE(s0.ok());
   EXPECT_TRUE(
-      StringPiece(s0.error_message()).contains("early exit zeros_like"));
+      str_util::StrContains(s0.error_message(), "early exit zeros_like"));
 
   VariantValue vv_ok{false /* early_exit */, 0 /* value */};
   v = vv_ok;
@@ -254,7 +255,7 @@ TEST(VariantOpUnaryOpRegistryTest, TestBasicGPU) {
                                         ZEROS_LIKE_VARIANT_UNARY_OP, v, &v_out);
   EXPECT_FALSE(s0.ok());
   EXPECT_TRUE(
-      StringPiece(s0.error_message()).contains("early exit zeros_like"));
+      str_util::StrContains(s0.error_message(), "early exit zeros_like"));
 
   VariantValue vv_ok{false /* early_exit */, 0 /* value */};
   v = vv_ok;
@@ -299,7 +300,7 @@ TEST(VariantOpAddRegistryTest, TestBasicCPU) {
   Status s0 = BinaryOpVariants<CPUDevice>(
       null_context_pointer, ADD_VARIANT_BINARY_OP, v_a, v_b, &v_out);
   EXPECT_FALSE(s0.ok());
-  EXPECT_TRUE(StringPiece(s0.error_message()).contains("early exit add"));
+  EXPECT_TRUE(str_util::StrContains(s0.error_message(), "early exit add"));
 
   VariantValue vv_ok{false /* early_exit */, 3 /* value */};
   v_a = vv_ok;
@@ -325,7 +326,7 @@ TEST(VariantOpAddRegistryTest, TestBasicGPU) {
   Status s0 = BinaryOpVariants<GPUDevice>(
       null_context_pointer, ADD_VARIANT_BINARY_OP, v_a, v_b, &v_out);
   EXPECT_FALSE(s0.ok());
-  EXPECT_TRUE(StringPiece(s0.error_message()).contains("early exit add"));
+  EXPECT_TRUE(str_util::StrContains(s0.error_message(), "early exit add"));
 
   VariantValue vv_ok{false /* early_exit */, 3 /* value */};
   v_a = vv_ok;
diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc
index a7af5e2312af716ef25cb35c8f247d6feccb6d9c..fb8a6c39e6786c9dbf3f14c68b8af66b01a20f29 100644
--- a/tensorflow/core/graph/graph.cc
+++ b/tensorflow/core/graph/graph.cc
@@ -567,6 +567,11 @@ void Graph::ToGraphDefSubRange(GraphDef* graph_def, int from_node_id) const {
         inputs[edge->dst_input()] = edge;
       }
     }
+    // Sort the control inputs for more predictable serialization.
+    std::sort(inputs.begin() + node->num_inputs(), inputs.end(),
+              [](const Edge* a, const Edge* b) -> bool {
+                return a->src()->name() < b->src()->name();
+              });
     node_def->clear_input();
     node_def->mutable_input()->Reserve(inputs.size());
 
diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h
index cbd58b051afde592731ddf2b2ed61854cdfac49e..f7ca7d0620f4d483d31eaad66dfb8ef10dcab027 100644
--- a/tensorflow/core/graph/graph.h
+++ b/tensorflow/core/graph/graph.h
@@ -124,7 +124,8 @@ class Node {
   // Inputs requested by the NodeDef.  For the actual inputs, use in_edges.
   const protobuf::RepeatedPtrField<string>& requested_inputs() const;
 
-  // Get the neighboring nodes via edges either in or out of this node.
+  // Get the neighboring nodes via edges either in or out of this node.  This
+  // includes control edges.
   gtl::iterator_range<NeighborIter> in_nodes() const;
   gtl::iterator_range<NeighborIter> out_nodes() const;
   const EdgeSet& in_edges() const { return in_edges_; }
diff --git a/tensorflow/core/graph/graph_constructor.cc b/tensorflow/core/graph/graph_constructor.cc
index 627309078ac51a25fe2924935c191ec1c4d2a534..f15e2ce9fa7b02fd07209d0784cd436b8e68f10b 100644
--- a/tensorflow/core/graph/graph_constructor.cc
+++ b/tensorflow/core/graph/graph_constructor.cc
@@ -37,6 +37,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/strings/scanner.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/public/version.h"
 
@@ -73,7 +74,7 @@ class GraphConstructor {
     Options(const ImportGraphDefOptions& in)  // NOLINT(runtime/explicit)
         : allow_internal_ops(false),
           expect_device_spec(false),
-          prefix(in.prefix.empty() || StringPiece(in.prefix).ends_with("/")
+          prefix(in.prefix.empty() || str_util::EndsWith(in.prefix, "/")
                      ? in.prefix
                      : in.prefix + "/"),
           uniquify_names(in.uniquify_names),
@@ -436,7 +437,7 @@ Status GraphConstructor::BuildNodeIndex() {
     bool in_control_dependence = false;
     for (int i = 0; i < node_def.input_size(); ++i) {
       StringPiece input_name = node_def.input(i);
-      if (!input_name.empty() && input_name.starts_with("^")) {
+      if (!input_name.empty() && str_util::StartsWith(input_name, "^")) {
         in_control_dependence = true;
       } else if (in_control_dependence) {
         return errors::InvalidArgument(
@@ -484,7 +485,7 @@ Status GraphConstructor::InitFromEdges() {
       bool has_loop_back_edge = false;
       for (int i = 0; i < node_def.input_size(); ++i) {
         StringPiece input_name(node_def.input(i));
-        if (input_name.starts_with("^")) {
+        if (str_util::StartsWith(input_name, "^")) {
           num_control_edges++;
         } else {
           TensorId id(ParseTensorName(input_name));
@@ -534,7 +535,7 @@ Status GraphConstructor::ValidateColocationConstraints(
   if (iter == node_def.attr().end()) return Status::OK();
   for (const string& c : iter->second.list().s()) {
     StringPiece s(c);
-    if (s.Consume(kColocationGroupPrefix) &&
+    if (str_util::ConsumePrefix(&s, kColocationGroupPrefix) &&
         gdef_nodes_.find(s) == gdef_nodes_.end()) {
       return errors::InvalidArgument(
           "Node '", node_def.name(),
@@ -568,13 +569,22 @@ Status GraphConstructor::ValidateShape(Node* node) {
   auto* ic = refiner_->GetContext(node);
   DCHECK(ic != nullptr)
       << "ShapeRefiner::AddNode() should have created the InferenceContext";
-  if (shape_attrs.size() != node->num_outputs()) {
+  if (shape_attrs.size() < node->num_outputs()) {
     return errors::InvalidArgument(
         "Node '", node->name(), "' has ", node->num_outputs(),
         " outputs but the ", kAttrName, " attribute specifies shapes for ",
         shape_attrs.size(), " outputs");
   }
-  for (int i = 0; i < shape_attrs.size(); ++i) {
+  // NOTE(skyewm): we don't raise an error here because some users depend on
+  // this behavior, even though it's unsafe.
+  // TODO(b/74619486): raise an error.
+  if (shape_attrs.size() > node->num_outputs()) {
+    LOG(WARNING) << "Node '" << node->name() << "' has " << node->num_outputs()
+                 << " outputs but the " << kAttrName
+                 << " attribute specifies shapes for " << shape_attrs.size()
+                 << " outputs. Output shapes may be inaccurate.";
+  }
+  for (int i = 0; i < node->num_outputs(); ++i) {
     const TensorShapeProto& p = shape_attrs[i];
     shape_inference::ShapeHandle h;
     Status s = ic->MakeShapeFromShapeProto(p, &h);
@@ -755,7 +765,7 @@ void GraphConstructor::AddPrefixToNodeDef(
     // Skip remapped inputs (which already exist in g_ and are not being
     // imported).
     if (input_already_exists[i]) continue;
-    if (input.Consume("^")) {
+    if (str_util::ConsumePrefix(&input, "^")) {
       node_def->set_input(i, strings::StrCat("^", prefix_, input));
     } else {
       node_def->set_input(i, strings::StrCat(prefix_, input));
@@ -767,7 +777,7 @@ void GraphConstructor::AddPrefixToNodeDef(
         node_def->mutable_attr()->at(kColocationAttrName).mutable_list();
     for (int i = 0; i < list->s_size(); ++i) {
       StringPiece v(list->s(i));
-      if (v.Consume(kColocationGroupPrefix)) {
+      if (str_util::ConsumePrefix(&v, kColocationGroupPrefix)) {
         list->set_s(i, strings::StrCat(kColocationGroupPrefix, prefix_, v));
       }
     }
@@ -810,7 +820,7 @@ void GraphConstructor::UpdateUniquifiedColocationNames() {
     bool updated = false;
     for (int i = 0; i < coloc_values.size(); ++i) {
       StringPiece val(coloc_values[i]);
-      if (val.Consume(kColocationGroupPrefix)) {
+      if (str_util::ConsumePrefix(&val, kColocationGroupPrefix)) {
         const auto& name_pair = uniquified_names_.find(val.ToString());
         if (name_pair == uniquified_names_.end()) continue;
         updated = true;
diff --git a/tensorflow/core/graph/graph_constructor_test.cc b/tensorflow/core/graph/graph_constructor_test.cc
index 963c1dc024b4265e14314c610399fc92331f053c..c18ccf6ce442655de6a31d28d07de619eb84d8b8 100644
--- a/tensorflow/core/graph/graph_constructor_test.cc
+++ b/tensorflow/core/graph/graph_constructor_test.cc
@@ -156,7 +156,9 @@ class GraphConstructorTest : public ::testing::Test {
       return "";
     }
     StringPiece loc(value[0]);
-    return loc.Consume(kColocationGroupPrefix) ? loc.ToString() : "";
+    return str_util::ConsumePrefix(&loc, kColocationGroupPrefix)
+               ? loc.ToString()
+               : "";
   }
 
   string GraphDebugString() const {
diff --git a/tensorflow/core/graph/graph_partition.cc b/tensorflow/core/graph/graph_partition.cc
index 17a174101b2be479bea834a407544b3a74dc08cf..877e4f1b44e005b310667f48dcc0bfd0d0a7e1d5 100644
--- a/tensorflow/core/graph/graph_partition.cc
+++ b/tensorflow/core/graph/graph_partition.cc
@@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/core/graph/tensor_id.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/hash/hash.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/util/device_name_utils.h"
 
@@ -372,7 +373,7 @@ string ControlLoopName(const string& name) {
 
 bool IsControlLoop(const Node* node) {
   const string& name = node->name();
-  return StringPiece(name).starts_with("_cloop");
+  return str_util::StartsWith(name, "_cloop");
 }
 
 // An enter node for control flow.
diff --git a/tensorflow/core/graph/graph_partition_test.cc b/tensorflow/core/graph/graph_partition_test.cc
index 6841f2914989b22d6aef91831ac6101b0ba6555f..83b24cafe2cb364b2afd5dcb6533bf662dc40a1b 100644
--- a/tensorflow/core/graph/graph_partition_test.cc
+++ b/tensorflow/core/graph/graph_partition_test.cc
@@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
@@ -120,7 +121,7 @@ void CheckLoopConstruction(const GraphDef& graph_def) {
       if (ndef.op() == "_Recv") {
         bool has_control = false;
         for (const string& input_name : ndef.input()) {
-          if (StringPiece(input_name).starts_with("^")) {
+          if (str_util::StartsWith(input_name, "^")) {
             has_control = true;
             break;
           }
@@ -128,7 +129,7 @@ void CheckLoopConstruction(const GraphDef& graph_def) {
         EXPECT_TRUE(has_control);
       }
       // Must have a control loop
-      if (StringPiece(ndef.name()).starts_with("_cloop")) {
+      if (str_util::StartsWith(ndef.name(), "_cloop")) {
         if (ndef.op() == "Enter") {
           has_control_enter = true;
         }
diff --git a/tensorflow/core/graph/graph_test.cc b/tensorflow/core/graph/graph_test.cc
index e2ce0ba046f26b69bdb8f427afeb480727977844..c8c2b225fea721bd19683fbdb805601bb9be494b 100644
--- a/tensorflow/core/graph/graph_test.cc
+++ b/tensorflow/core/graph/graph_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
@@ -408,7 +409,7 @@ TEST_F(GraphTest, NewName) {
   EXPECT_NE(a1, a2);
   EXPECT_NE(a1, b1);
   EXPECT_NE(a2, b1);
-  EXPECT_TRUE(StringPiece(a1).starts_with("A")) << a1;
+  EXPECT_TRUE(str_util::StartsWith(a1, "A")) << a1;
 }
 
 TEST_F(GraphTest, IsValidNode) {
diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index 1507b6eae26596686f279155c27925e0b5d80cf4..5368774f2d2b748c07cb7874083e211d42e6db16 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -3103,8 +3103,7 @@ void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr<Graph>* g,
   TensorProto proto;
   proto.set_dtype(dt);
   uint8 zero[8] = {0, 0, 0, 0, 0, 0, 0, 0};
-  proto.set_tensor_content(const_cast<const void*>(static_cast<void*>(&zero)),
-                           8);
+  proto.set_tensor_content(string(reinterpret_cast<char*>(&zero), 8));
   TensorShape dummy_shape({8});
   dummy_shape.AsProto(proto.mutable_tensor_shape());
   TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const")
@@ -3219,7 +3218,8 @@ int MklLayoutRewritePass::SetUpContiguousInputs(
     // For that let's first find filter node that is 2nd input (slot 1)
     // of BackpropInput.
     Node* filter_node = nullptr;
-    old_node->input_node(kConv2DBackpropInputFilterInputSlotIdx, &filter_node);
+    TF_CHECK_OK(old_node->input_node(kConv2DBackpropInputFilterInputSlotIdx,
+                                     &filter_node));
     CHECK_NOTNULL(filter_node);
 
     // Now check which nodes receive from filter_node. Filter feeds as
@@ -3399,8 +3399,7 @@ void MklLayoutRewritePass::GetDummyWorkspaceTensorNode(
   TensorProto proto;
   proto.set_dtype(dt);
   float zero[1] = {0};
-  proto.set_tensor_content(const_cast<const void*>(static_cast<void*>(&zero)),
-                           4);
+  proto.set_tensor_content(string(reinterpret_cast<char*>(&zero), 4));
   TensorShape dummy_shape({1});
   dummy_shape.AsProto(proto.mutable_tensor_shape());
   TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const")
@@ -3876,7 +3875,7 @@ Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr<Graph>* g,
 
   // Create node.
   Node* new_node;
-  nb.Finalize(&**g, &new_node);
+  TF_CHECK_OK(nb.Finalize(&**g, &new_node));
   CHECK_NOTNULL(new_node);
 
   // Incoming data edges from 'pred' node and 'succ' node to new 'new_node'
@@ -3987,7 +3986,7 @@ Status MklLayoutRewritePass::MergeConv2DBackpropFilterWithBiasAddGrad(
 
   // Create node.
   Node* new_node;
-  nb.Finalize(&**g, &new_node);
+  TF_CHECK_OK(nb.Finalize(&**g, &new_node));
   CHECK_NOTNULL(new_node);
 
   // Incoming data edges from BiasAddGrad node and Conv2DBackpropFilter node to
diff --git a/tensorflow/core/graph/quantize_training.cc b/tensorflow/core/graph/quantize_training.cc
index cb0fc8a1547a8498aa0bd089a2c9395119de2789..3b6e8cc2339a42285a68c6898c99b1ec4b585917 100644
--- a/tensorflow/core/graph/quantize_training.cc
+++ b/tensorflow/core/graph/quantize_training.cc
@@ -259,8 +259,14 @@ Status AddRestoreVariableSubgraphs(Graph* graph, Node* save_op,
   const string restore_op_name = strings::StrCat(name_prefix, "/RestoreV2");
   const string assign_op_name = strings::StrCat(name_prefix, "/Assign");
   for (Node* var : variables) {
-    string new_restore_op_name = graph->NewName(restore_op_name);
-    string new_assign_op_name = graph->NewName(assign_op_name);
+    // Add an extra prefix after calling graph->NewName because the "unique"
+    // name may conflict with names generated for Send nodes.
+    // TODO(b/77547936): fix this more generally and get rid of the extra prefix
+    // here.
+    string new_restore_op_name =
+        strings::StrCat(graph->NewName(restore_op_name), "_qt");
+    string new_assign_op_name =
+        strings::StrCat(graph->NewName(assign_op_name), "_qt");
     string tensor_names_op_name =
         strings::StrCat(new_restore_op_name, "/tensor_names");
     string shape_and_slices_op_name =
diff --git a/tensorflow/core/graph/quantize_training_test.cc b/tensorflow/core/graph/quantize_training_test.cc
index 2ad69dbd0c608fa79354c73e01167c3b02ff4fc2..e46f92bc24de9fc7d7923e4b9ebe0f04882beae4 100644
--- a/tensorflow/core/graph/quantize_training_test.cc
+++ b/tensorflow/core/graph/quantize_training_test.cc
@@ -32,6 +32,7 @@ limitations under the License.
 #include "tensorflow/core/graph/testlib.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session.h"
@@ -215,7 +216,7 @@ TEST_F(QuantizeTrainingTest, WithBackwardNodes_QuantizeAndDequantize) {
   Node* found_node;
   Status s = FindNode(g, strings::StrCat(d->name(), "/QuantizeAndDequantizeV2"),
                       &found_node);
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("not found")) << s;
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "not found")) << s;
 
   // Ensure that m1 and m2's inputs were quantized.
   TF_ASSERT_OK(
@@ -269,7 +270,7 @@ TEST_F(QuantizeTrainingTest, WithBackwardNodes_FakeQuant) {
   Node* found_node;
   Status s = FindNode(g, strings::StrCat(d->name(), "/FakeQuantWithMinMaxVars"),
                       &found_node);
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("not found")) << s;
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "not found")) << s;
 
   // Ensure that m1 and m2's inputs were quantized.
   TF_ASSERT_OK(
diff --git a/tensorflow/core/graph/subgraph.cc b/tensorflow/core/graph/subgraph.cc
index ca93d049d0983d0bba7827443d5ad2aa058a66b7..193cf88aed3da8c871f457c02d8dbb714b926737 100644
--- a/tensorflow/core/graph/subgraph.cc
+++ b/tensorflow/core/graph/subgraph.cc
@@ -28,13 +28,13 @@ limitations under the License.
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_constructor.h"
-#include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/graph/tensor_id.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
+namespace subgraph {
 
 // ----------------------------------------------------------------------------
 // Subgraph construction-related routines
@@ -44,6 +44,8 @@ namespace tensorflow {
 
 namespace {
 
+typedef std::unordered_map<StringPiece, Node*, StringPieceHasher> NameIndex;
+
 // Rewrite graph by replacing the output tensors specified in
 // "fed_outputs" with special feed nodes for each specified output
 // tensor, and removing any nodes that are now disconnected from the
@@ -53,59 +55,33 @@ namespace {
 // Return true on success.  On error, return false and sets *error to
 // an appropriate error message (and *g is left in an indeterminate
 // state).
-static Status FeedInputs(Graph* g, const DeviceAttributes& device_info,
-                         const gtl::ArraySlice<string>& fed_outputs,
-                         bool use_function_convention,
-                         subgraph::NameIndex* name_index,
-                         DataTypeVector* out_feed_types) {
+Status FeedInputs(
+    Graph* g, const std::vector<std::unique_ptr<PruneRewrite>>& feed_rewrites,
+    NameIndex* name_index, DataTypeVector* out_feed_types) {
   out_feed_types->clear();
-  out_feed_types->reserve(fed_outputs.size());
-  for (size_t i = 0; i < fed_outputs.size(); ++i) {
-    const string& t = fed_outputs[i];
+  out_feed_types->reserve(feed_rewrites.size());
+  for (size_t i = 0; i < feed_rewrites.size(); ++i) {
+    const string& t = feed_rewrites[i]->endpoint_name();
     TensorId id(ParseTensorName(t));
 
     auto iter = name_index->find(id.first);
     if (iter == name_index->end()) {
       return errors::NotFound("FeedInputs: unable to find feed output ", t);
     }
-    const Node* n = iter->second;
+    Node* n = iter->second;
     DCHECK_EQ(n->name(), id.first);
     if (id.second >= n->num_outputs()) {
       return errors::InvalidArgument(
           "FeedInputs: ", t, " should have output index < ", n->num_outputs());
     }
 
-    Node* recv_node;
-
-    if (!use_function_convention) {
-      TF_RETURN_IF_ERROR(
-          NodeBuilder(strings::StrCat("_recv_", id.first, "_", id.second),
-                      "_Recv")
-              .Attr("tensor_type", BaseType(n->output_type(id.second)))
-              .Attr("tensor_name", t)
-              .Attr("send_device", device_info.name())
-              .Attr("recv_device", device_info.name())
-              .Attr("send_device_incarnation",
-                    static_cast<int64>(device_info.incarnation()))
-              .Attr("client_terminated", true)
-              .Finalize(g, &recv_node));
-    } else {
-      // NOTE(mrry): We must include the index as part of the node
-      // name, because _Arg is a "stateful" kernel and therefore
-      // its name must uniquely identify a kernel instance across all
-      // graphs in the same session.
-      TF_RETURN_IF_ERROR(NodeBuilder(strings::StrCat("_arg_", id.first, "_",
-                                                     id.second, "_", i),
-                                     "_Arg")
-                             .Attr("T", BaseType(n->output_type(id.second)))
-                             .Attr("index", static_cast<int32>(i))
-                             .Finalize(g, &recv_node));
-    }
-    recv_node->set_assigned_device_name(device_info.name());
+    Node* feed_node;
+    TF_RETURN_IF_ERROR(
+        feed_rewrites[i]->AddNode(g, {n, id.second}, &feed_node));
 
     // Update name_index
-    (*name_index)[recv_node->name()] = recv_node;
-    g->AddControlEdge(g->source_node(), recv_node);
+    (*name_index)[feed_node->name()] = feed_node;
+    g->AddControlEdge(g->source_node(), feed_node);
 
     // Look through edges coming out of "n" for edges whose src_output() index
     // matches "output_index".  If found, replace the edges with a connection
@@ -119,7 +95,7 @@ static Status FeedInputs(Graph* g, const DeviceAttributes& device_info,
                   n->type_string() == "PlaceholderV2")) {
         // When feeding a Placeholder node, any outgoing control edges
         // will be replaced with a control edge from the replacement
-        // recv_node.
+        // feed_node.
         // TODO(josh11b,mrry): Come up with a more elegant way of addressing
         // the general version of this problem.
         to_remove.emplace_back(e);
@@ -128,10 +104,10 @@ static Status FeedInputs(Graph* g, const DeviceAttributes& device_info,
 
     for (const Edge* e : to_remove) {
       if (e->src_output() == id.second) {
-        g->AddEdge(recv_node, 0, e->dst(), e->dst_input());
+        g->AddEdge(feed_node, 0, e->dst(), e->dst_input());
       } else {
         CHECK_EQ(Graph::kControlSlot, e->src_output());
-        g->AddControlEdge(recv_node, e->dst());
+        g->AddControlEdge(feed_node, e->dst());
       }
       g->RemoveEdge(e);
     }
@@ -140,9 +116,61 @@ static Status FeedInputs(Graph* g, const DeviceAttributes& device_info,
   return Status::OK();
 }
 
-static bool AddNodeToTargets(const string& node_or_tensor_name,
-                             const subgraph::NameIndex& name_index,
-                             std::unordered_set<const Node*>* targets) {
+Status FetchOutputs(
+    Graph* g, const std::vector<std::unique_ptr<PruneRewrite>>& fetch_rewrites,
+    NameIndex* name_index, std::vector<Node*>* out_fetch_nodes,
+    DataTypeVector* out_fetch_types) {
+  out_fetch_nodes->clear();
+  out_fetch_nodes->reserve(fetch_rewrites.size());
+  for (size_t i = 0; i < fetch_rewrites.size(); ++i) {
+    const string& t = fetch_rewrites[i]->endpoint_name();
+
+    // Parse t into node_name and output_index.
+    TensorId id(ParseTensorName(t));
+
+    // Find node in graph with that name.
+    auto iter = name_index->find(id.first);
+    if (iter == name_index->end()) {
+      return errors::NotFound("FetchOutputs node ", t, ": not found");
+    }
+    Node* n = iter->second;
+    DCHECK_EQ(n->name(), id.first);
+    VLOG(2) << "Found fetch node for " << t;
+
+    // Validate output_index
+    if (n->num_outputs() == 0) {
+      return errors::InvalidArgument(
+          "Tried to fetch data for '", t,
+          "', which produces no output.  To run to a node but not fetch any "
+          "data, pass '",
+          t,
+          "' as an argument to the 'target_node_names' argument of the "
+          "Session::Run API.");
+    } else if (id.second >= n->num_outputs()) {
+      return errors::InvalidArgument("FetchOutputs ", t,
+                                     ": output index too large, must be < ",
+                                     n->num_outputs());
+    }
+
+    // Create the fetch Node and connect it up
+    Node* fetch_node;
+    TF_RETURN_IF_ERROR(
+        fetch_rewrites[i]->AddNode(g, {n, id.second}, &fetch_node));
+
+    // Update the index.
+    (*name_index)[fetch_node->name()] = fetch_node;
+
+    g->AddControlEdge(fetch_node, g->sink_node());
+    out_fetch_nodes->push_back(fetch_node);
+    out_fetch_types->push_back(BaseType(n->output_type(id.second)));
+  }
+
+  return Status::OK();
+}
+
+bool AddNodeToTargets(const string& node_or_tensor_name,
+                      const NameIndex& name_index,
+                      std::unordered_set<const Node*>* targets) {
   TensorId id = ParseTensorName(node_or_tensor_name);
   auto iter = name_index.find(id.first);
   if (iter == name_index.end()) {
@@ -154,9 +182,9 @@ static bool AddNodeToTargets(const string& node_or_tensor_name,
   return true;
 }
 
-static Status PruneForTargets(Graph* g, const subgraph::NameIndex& name_index,
-                              const std::vector<Node*>& fetch_nodes,
-                              const gtl::ArraySlice<string>& target_nodes) {
+Status PruneForTargets(Graph* g, const NameIndex& name_index,
+                       const std::vector<Node*>& fetch_nodes,
+                       const gtl::ArraySlice<string>& target_nodes) {
   string not_found;
   std::unordered_set<const Node*> targets;
   for (Node* n : fetch_nodes) {
@@ -183,108 +211,149 @@ static Status PruneForTargets(Graph* g, const subgraph::NameIndex& name_index,
 
 }  // namespace
 
-namespace subgraph {
+Status ArgFeedRewrite::AddNode(Graph* g, NodeBuilder::NodeOut feed_tensor,
+                               Node** out_node) {
+  // NOTE(mrry): We must include the index as part of the node
+  // name, because _Arg is a "stateful" kernel and therefore
+  // its name must uniquely identify a kernel instance across all
+  // graphs in the same session.
+  TF_RETURN_IF_ERROR(
+      NodeBuilder(strings::StrCat("_arg_", feed_tensor.node->name(), "_",
+                                  feed_tensor.index, "_", arg_index_),
+                  "_Arg")
+          .Attr("T", BaseType(feed_tensor.node->output_type(feed_tensor.index)))
+          .Attr("index", arg_index_)
+          .Finalize(g, out_node));
+  (*out_node)->set_assigned_device_name(device_info().name());
+  return Status::OK();
+}
 
-Status FetchOutputs(Graph* g, const DeviceAttributes& device_info,
-                    const gtl::ArraySlice<string>& fetch_outputs,
-                    bool use_function_convention, NameIndex* name_index,
-                    std::vector<Node*>* out_fetch_nodes,
-                    DataTypeVector* out_fetch_types) {
-  out_fetch_nodes->clear();
-  out_fetch_nodes->reserve(fetch_outputs.size());
-  for (size_t i = 0; i < fetch_outputs.size(); ++i) {
-    const string& t = fetch_outputs[i];
+Status RecvFeedRewrite::AddNode(Graph* g, NodeBuilder::NodeOut feed_tensor,
+                                Node** out_node) {
+  TF_RETURN_IF_ERROR(
+      NodeBuilder(strings::StrCat("_recv_", feed_tensor.node->name(), "_",
+                                  feed_tensor.index),
+                  "_Recv")
+          .Attr("tensor_type",
+                BaseType(feed_tensor.node->output_type(feed_tensor.index)))
+          .Attr("tensor_name", endpoint_name())
+          .Attr("send_device", device_info().name())
+          .Attr("recv_device", device_info().name())
+          .Attr("send_device_incarnation",
+                static_cast<int64>(device_info().incarnation()))
+          .Attr("client_terminated", true)
+          .Finalize(g, out_node));
+
+  (*out_node)->set_assigned_device_name(device_info().name());
+  return Status::OK();
+}
 
-    // Parse t into node_name and output_index.
-    TensorId id(ParseTensorName(t));
+Status RetvalFetchRewrite::AddNode(Graph* g, NodeBuilder::NodeOut fetch_tensor,
+                                   Node** out_node) {
+  // NOTE(mrry): We must include the index as part of the node
+  // name, because _Retval is a "stateful" kernel and therefore
+  // its name must uniquely identify a kernel instance across all
+  // graphs in the same session.
+  TF_RETURN_IF_ERROR(
+      NodeBuilder(strings::StrCat("_retval_", fetch_tensor.node->name(), "_",
+                                  fetch_tensor.index, "_", retval_index_),
+                  "_Retval")
+          .Input(fetch_tensor.node, fetch_tensor.index)
+          .Attr("T",
+                BaseType(fetch_tensor.node->output_type(fetch_tensor.index)))
+          .Attr("index", retval_index_)
+          .Finalize(g, out_node));
+  (*out_node)->set_assigned_device_name(device_info().name());
+  return Status::OK();
+}
 
-    // Find node in graph with that name.
-    auto iter = name_index->find(id.first);
-    if (iter == name_index->end()) {
-      return errors::NotFound("FetchOutputs node ", t, ": not found");
-    }
-    Node* n = iter->second;
-    DCHECK_EQ(n->name(), id.first);
-    VLOG(2) << "Found fetch node for " << t;
+Status SendFetchRewrite::AddNode(Graph* g, NodeBuilder::NodeOut fetch_tensor,
+                                 Node** out_node) {
+  TF_RETURN_IF_ERROR(
+      NodeBuilder(strings::StrCat("_send_", fetch_tensor.node->name(), "_",
+                                  fetch_tensor.index),
+                  "_Send")
+          .Input(fetch_tensor.node, fetch_tensor.index)
+          .Attr("tensor_name", endpoint_name())
+          .Attr("send_device", device_info().name())
+          .Attr("recv_device", device_info().name())
+          .Attr("send_device_incarnation",
+                static_cast<int64>(device_info().incarnation()))
+          .Attr("client_terminated", true)
+          .Finalize(g, out_node));
+  (*out_node)->set_assigned_device_name(device_info().name());
+  return Status::OK();
+}
 
-    // Validate output_index
-    if (n->num_outputs() == 0) {
-      return errors::InvalidArgument(
-          "Tried to fetch data for '", t,
-          "', which produces no output.  To run to a node but not fetch any "
-          "data, pass '",
-          t,
-          "' as an argument to the 'target_node_names' argument of the "
-          "Session::Run API.");
-    } else if (id.second >= n->num_outputs()) {
-      return errors::InvalidArgument("FetchOutputs ", t,
-                                     ": output index too large, must be < ",
-                                     n->num_outputs());
+Status RewriteGraphForExecution(
+    Graph* g, const gtl::ArraySlice<string>& fed_outputs,
+    const gtl::ArraySlice<string>& fetch_outputs,
+    const gtl::ArraySlice<string>& target_node_names,
+    const DeviceAttributes& device_info, bool use_function_convention,
+    RewriteGraphMetadata* out_metadata) {
+  std::vector<std::unique_ptr<PruneRewrite>> feed_rewrites;
+  feed_rewrites.reserve(fed_outputs.size());
+  if (use_function_convention) {
+    for (size_t i = 0; i < fed_outputs.size(); ++i) {
+      feed_rewrites.emplace_back(new ArgFeedRewrite(
+          &fed_outputs[i], &device_info, static_cast<int32>(i)));
     }
-
-    // Create the fetch Node and connect it up
-    Node* send_node;
-    if (!use_function_convention) {
-      TF_RETURN_IF_ERROR(
-          NodeBuilder(strings::StrCat("_send_", id.first, "_", id.second),
-                      "_Send")
-              .Input(n, id.second)
-              .Attr("tensor_name", t)
-              .Attr("send_device", device_info.name())
-              .Attr("recv_device", device_info.name())
-              .Attr("send_device_incarnation",
-                    static_cast<int64>(device_info.incarnation()))
-              .Attr("client_terminated", true)
-              .Finalize(g, &send_node));
-    } else {
-      // NOTE(mrry): We must include the index as part of the node
-      // name, because _Retval is a "stateful" kernel and therefore
-      // its name must uniquely identify a kernel instance across all
-      // graphs in the same session.
-      TF_RETURN_IF_ERROR(NodeBuilder(strings::StrCat("_retval_", id.first, "_",
-                                                     id.second, "_", i),
-                                     "_Retval")
-                             .Input(n, id.second)
-                             .Attr("T", BaseType(n->output_type(id.second)))
-                             .Attr("index", static_cast<int32>(i))
-                             .Finalize(g, &send_node));
+  } else {
+    for (const string& fed_output : fed_outputs) {
+      feed_rewrites.emplace_back(
+          new RecvFeedRewrite(&fed_output, &device_info));
     }
-    send_node->set_assigned_device_name(device_info.name());
-
-    // Update the index.
-    (*name_index)[send_node->name()] = send_node;
+  }
 
-    g->AddControlEdge(send_node, g->sink_node());
-    out_fetch_nodes->push_back(send_node);
-    out_fetch_types->push_back(BaseType(n->output_type(id.second)));
+  std::vector<std::unique_ptr<PruneRewrite>> fetch_rewrites;
+  fetch_rewrites.reserve(fetch_outputs.size());
+  if (use_function_convention) {
+    for (size_t i = 0; i < fetch_outputs.size(); ++i) {
+      fetch_rewrites.emplace_back(new RetvalFetchRewrite(
+          &fetch_outputs[i], &device_info, static_cast<int32>(i)));
+    }
+  } else {
+    for (const string& fetch_output : fetch_outputs) {
+      fetch_rewrites.emplace_back(
+          new SendFetchRewrite(&fetch_output, &device_info));
+    }
   }
 
-  return Status::OK();
+  return RewriteGraphForExecution(g, feed_rewrites, fetch_rewrites,
+                                  target_node_names, out_metadata);
+}
+
+namespace {
+template <typename StringContainer>
+std::vector<string> ConvertToVector(StringContainer field) {
+  return std::vector<string>(field.begin(), field.end());
 }
+}  // namespace
 
 Status RewriteGraphForExecution(
-    Graph* g, const gtl::ArraySlice<string>& fed_outputs,
-    const gtl::ArraySlice<string>& fetch_outputs,
+    Graph* g, const std::vector<std::unique_ptr<PruneRewrite>>& feed_rewrites,
+    const std::vector<std::unique_ptr<PruneRewrite>>& fetch_rewrites,
     const gtl::ArraySlice<string>& target_node_names,
-    const DeviceAttributes& device_info, bool use_function_convention,
     RewriteGraphMetadata* out_metadata) {
-  if (fetch_outputs.empty() && target_node_names.empty()) {
+  if (fetch_rewrites.empty() && target_node_names.empty()) {
     return errors::InvalidArgument(
         "Must specify at least one target to fetch or execute.");
   }
 
   std::unordered_set<string> endpoints;
-  for (const string& endpoint_name : fed_outputs) {
-    auto result = endpoints.insert(endpoint_name);
+  for (const auto& feed_rewrite : feed_rewrites) {
+    auto result = endpoints.insert(feed_rewrite->endpoint_name());
     if (!result.second) {
-      return errors::InvalidArgument("Endpoint \"", endpoint_name,
+      return errors::InvalidArgument("Endpoint \"",
+                                     feed_rewrite->endpoint_name(),
                                      "\" fed more than once.");
     }
   }
 
-  for (const auto& fetch : fetch_outputs) {
-    if (endpoints.count(fetch) > 0) {
-      return errors::InvalidArgument(fetch, " is both fed and fetched.");
+  for (const auto& fetch_rewrite : fetch_rewrites) {
+    if (endpoints.count(fetch_rewrite->endpoint_name()) > 0) {
+      return errors::InvalidArgument(fetch_rewrite->endpoint_name(),
+                                     " is both fed and fetched.");
     }
   }
 
@@ -297,19 +366,17 @@ Status RewriteGraphForExecution(
   }
 
   // Add the feeds.  This may replace nodes in the graph, including the nodes
-  // currently listed in "fetch_nodes".  We pass "name_index" so the index is
+  // currently listed in "fetch_rewrites".  We pass "name_index" so the index is
   // kept up to date.
-  if (!fed_outputs.empty()) {
-    TF_RETURN_IF_ERROR(FeedInputs(g, device_info, fed_outputs,
-                                  use_function_convention, &name_index,
-                                  &out_metadata->feed_types));
+  if (!feed_rewrites.empty()) {
+    TF_RETURN_IF_ERROR(
+        FeedInputs(g, feed_rewrites, &name_index, &out_metadata->feed_types));
   }
 
   // Add the fetch nodes, also updating "name_index".
   std::vector<Node*> fetch_nodes;
-  if (!fetch_outputs.empty()) {
-    TF_RETURN_IF_ERROR(FetchOutputs(g, device_info, fetch_outputs,
-                                    use_function_convention, &name_index,
+  if (!fetch_rewrites.empty()) {
+    TF_RETURN_IF_ERROR(FetchOutputs(g, fetch_rewrites, &name_index,
                                     &fetch_nodes, &out_metadata->fetch_types));
   }
 
@@ -323,25 +390,6 @@ Status RewriteGraphForExecution(
   return Status::OK();
 }
 
-namespace {
-template <typename StringContainer>
-std::vector<string> ConvertToVector(StringContainer field) {
-  return std::vector<string>(field.begin(), field.end());
-}
-}  // namespace
-
-Status RewriteGraphForExecution(Graph* g,
-                                const CallableOptions& callable_options,
-                                const DeviceAttributes& device_info,
-                                bool use_function_convention,
-                                RewriteGraphMetadata* out_metadata) {
-  return RewriteGraphForExecution(g, ConvertToVector(callable_options.feed()),
-                                  ConvertToVector(callable_options.fetch()),
-                                  ConvertToVector(callable_options.target()),
-                                  device_info, use_function_convention,
-                                  out_metadata);
-}
-
 }  // namespace subgraph
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/graph/subgraph.h b/tensorflow/core/graph/subgraph.h
index 0dc59582f426c216baa09714e427371c1db6bd3a..ba35846d937bfeeeab825be2a2897aa6f3a195b7 100644
--- a/tensorflow/core/graph/subgraph.h
+++ b/tensorflow/core/graph/subgraph.h
@@ -20,6 +20,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/device_attributes.pb.h"
 #include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/protobuf/config.pb.h"
@@ -39,6 +40,37 @@ struct RewriteGraphMetadata {
   DataTypeVector fetch_types;
 };
 
+// Describes the action to take on a particular tensor endpoint (described by
+// a "<node_name>:<output_index>" pair) when pruning the graph.
+//
+// The `AddNode()` method must be overridden to describe this action. The method
+// will be invoked once during `RewriteGraphForExecution()` with tensor endpoint
+// named by `endpoint_name`, and it may either create a single new node, or fail
+// with an error if the resulting graph would be invalid.
+class PruneRewrite {
+ public:
+  // `endpoint_name` and `device_info` must outlive this object.
+  PruneRewrite(const string* endpoint_name, const DeviceAttributes* device_info)
+      : endpoint_name_(endpoint_name), device_info_(device_info) {}
+  virtual ~PruneRewrite() {}
+
+  // Creates a new node whose output replaces the given `tensor` in graph `g`.
+  // The node will be assigned to the device named in `device_info`.
+  virtual Status AddNode(Graph* g, NodeBuilder::NodeOut tensor,
+                         Node** out_node) = 0;
+
+  // Returns the name of the tensor to which this rewrite applies.
+  const string& endpoint_name() { return *endpoint_name_; }
+
+ protected:
+  // The device on which the new node will be created.
+  const DeviceAttributes& device_info() { return *device_info_; }
+
+ private:
+  const string* const endpoint_name_;          // Not owned.
+  const DeviceAttributes* const device_info_;  // Not owned.
+};
+
 // Rewrite the graph structure of "*g" to deal with feeding node
 // outputs, fetching node outputs, and only running a subset of the
 // graph.  "fed_outputs" and "fetch_outputs" are both lists of
@@ -49,7 +81,7 @@ struct RewriteGraphMetadata {
 // In the resulting graph "*g", output edges in "fed_outputs" have
 // been redirected to special "_recv" nodes introduced into the graph.
 // If these fed nodes are not needed in order to compute the effects
-// of the nodes in "targets_nodes" and "fetch_outputs", then these may
+// of the nodes in "target_node_names" and "fetch_outputs", then these may
 // be omitted from the graph.
 //
 // In the resulting graph "*g", additional "_send" nodes are connected
@@ -71,25 +103,61 @@ Status RewriteGraphForExecution(
     const gtl::ArraySlice<string>& target_node_names,
     const DeviceAttributes& device_info, bool use_function_convention,
     RewriteGraphMetadata* out_metadata);
-Status RewriteGraphForExecution(Graph* g,
-                                const CallableOptions& callable_options,
-                                const DeviceAttributes& device_info,
-                                bool use_function_convention,
-                                RewriteGraphMetadata* out_metadata);
-
-typedef std::unordered_map<StringPiece, Node*, StringPieceHasher> NameIndex;
-
-// Augment "*g" by adding special "fetch" nodes that connect to the
-// tensor outputs specified in "fetch_outputs" to retrieve the output
-// of the tensors.  The new nodes added are set up to execute on
-// "client_device_name", and are returned in "*fetch_nodes".
-//
-// Return OK on success.  On error, return false and sets *error to
-// an appropriate error message (and *g is left in an indeterminate
-// state).
-Status FetchOutputs(Graph* g, const DeviceAttributes& device_info,
-                    const gtl::ArraySlice<string>& fetch_outputs,
-                    NameIndex* name_index, std::vector<Node*>* fetch_nodes);
+
+// A more general version of the above function that supports
+// customizable rewriting actions for each fed and fetched tensor.
+Status RewriteGraphForExecution(
+    Graph* g, const std::vector<std::unique_ptr<PruneRewrite>>& feed_rewrites,
+    const std::vector<std::unique_ptr<PruneRewrite>>& fetch_rewrites,
+    const gtl::ArraySlice<string>& target_node_names,
+    RewriteGraphMetadata* out_metadata);
+
+/////////////////////////////////////////////////////////
+// Custom rewrite actions for fed and fetched tensors. //
+/////////////////////////////////////////////////////////
+
+// A rewrite action that adds an _Arg node for a fed tensor.
+class ArgFeedRewrite : public PruneRewrite {
+ public:
+  ArgFeedRewrite(const string* endpoint_name,
+                 const DeviceAttributes* device_info, int32 arg_index)
+      : PruneRewrite(endpoint_name, device_info), arg_index_(arg_index) {}
+  Status AddNode(Graph* g, NodeBuilder::NodeOut feed_tensor,
+                 Node** out_node) override;
+
+ private:
+  const int32 arg_index_;
+};
+
+// A rewrite action that adds a client-terminated _Recv node for a fed tensor.
+class RecvFeedRewrite : public PruneRewrite {
+ public:
+  using PruneRewrite::PruneRewrite;
+  Status AddNode(Graph* g, NodeBuilder::NodeOut feed_tensor,
+                 Node** out_node) override;
+};
+
+// A rewrite action that adds a _Retval node for a fetched tensor.
+class RetvalFetchRewrite : public PruneRewrite {
+ public:
+  RetvalFetchRewrite(const string* endpoint_name,
+                     const DeviceAttributes* device_info, int32 retval_index)
+      : PruneRewrite(endpoint_name, device_info), retval_index_(retval_index) {}
+  Status AddNode(Graph* g, NodeBuilder::NodeOut fetch_tensor,
+                 Node** out_node) override;
+
+ private:
+  const int32 retval_index_;
+};
+
+// A rewrite action that adds a client-terminated _Send node for a
+// fetched tensor.
+class SendFetchRewrite : public PruneRewrite {
+ public:
+  using PruneRewrite::PruneRewrite;
+  Status AddNode(Graph* g, NodeBuilder::NodeOut fetch_tensor,
+                 Node** out_node) override;
+};
 
 }  // namespace subgraph
 }  // namespace tensorflow
diff --git a/tensorflow/core/graph/subgraph_test.cc b/tensorflow/core/graph/subgraph_test.cc
index 7219d9812f3e4a01cffa4b6b17d38781f7d5e2b0..6c014a8d44388eaeff80fb0850ac1575d3ec023a 100644
--- a/tensorflow/core/graph/subgraph_test.cc
+++ b/tensorflow/core/graph/subgraph_test.cc
@@ -312,8 +312,8 @@ TEST_F(SubgraphTest, ChainOfFools) {
   EXPECT_TRUE(HasEdge("e", 0, "_send_e_0", 0));
 }
 
-static bool HasSubstr(const string& base, const string& substr) {
-  bool ok = StringPiece(base).contains(substr);
+static bool HasSubstr(StringPiece base, StringPiece substr) {
+  bool ok = str_util::StrContains(base, substr);
   EXPECT_TRUE(ok) << base << ", expected substring " << substr;
   return ok;
 }
diff --git a/tensorflow/core/graph/tensor_id.cc b/tensorflow/core/graph/tensor_id.cc
index 089ea5e527ab18322af01b6e80154cd759b9e980..8af1936d64e503d0cdcf10b7a492847b494c8664 100644
--- a/tensorflow/core/graph/tensor_id.cc
+++ b/tensorflow/core/graph/tensor_id.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <string>
 
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 
 namespace tensorflow {
 
@@ -45,7 +46,7 @@ TensorId ParseTensorName(StringPiece name) {
   if (p > base && *p == ':' && mul > 1) {
     id.first = StringPiece(base, p - base);
     id.second = index;
-  } else if (name.starts_with("^")) {
+  } else if (str_util::StartsWith(name, "^")) {
     // Control edge
     id.first = StringPiece(base + 1);
     id.second = Graph::kControlSlot;
diff --git a/tensorflow/core/graph/validate_test.cc b/tensorflow/core/graph/validate_test.cc
index cb6d107cadc153930d99ea13cad985dd60c8b393..d58cdc3c5baf02f89cff52ef0396816cb00b48a3 100644
--- a/tensorflow/core/graph/validate_test.cc
+++ b/tensorflow/core/graph/validate_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -60,7 +61,7 @@ TEST(ValidateGraphDefTest, GraphWithUnspecifiedDefaultAttr) {
   CHECK(parser.MergeFromString(graph_def_str, &graph_def)) << graph_def_str;
   Status s = graph::ValidateGraphDef(graph_def, *OpRegistry::Global());
   EXPECT_FALSE(s.ok());
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("NodeDef missing attr"));
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "NodeDef missing attr"));
 
   // Add the defaults.
   TF_ASSERT_OK(AddDefaultAttrsToGraphDef(&graph_def, *OpRegistry::Global(), 0));
@@ -83,7 +84,7 @@ TEST(ValidateGraphDefTest, GraphWithUnspecifiedRequiredAttr) {
   CHECK(parser.MergeFromString(graph_def_str, &graph_def)) << graph_def_str;
   Status s = graph::ValidateGraphDef(graph_def, *OpRegistry::Global());
   EXPECT_FALSE(s.ok());
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("NodeDef missing attr"));
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "NodeDef missing attr"));
 
   // Add the defaults.
   TF_ASSERT_OK(AddDefaultAttrsToGraphDef(&graph_def, *OpRegistry::Global(), 0));
@@ -91,7 +92,7 @@ TEST(ValidateGraphDefTest, GraphWithUnspecifiedRequiredAttr) {
   // Validation should still fail.
   s = graph::ValidateGraphDef(graph_def, *OpRegistry::Global());
   EXPECT_FALSE(s.ok());
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("NodeDef missing attr"));
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "NodeDef missing attr"));
 }
 
 TEST(ValidateGraphDefAgainstOpListTest, GraphWithOpOnlyInOpList) {
diff --git a/tensorflow/core/grappler/BUILD b/tensorflow/core/grappler/BUILD
index 2ca9b720ee127b892c06230efb3517f5afabea45..9dcc6765f5b356438c325f84c4891d70e0089efd 100644
--- a/tensorflow/core/grappler/BUILD
+++ b/tensorflow/core/grappler/BUILD
@@ -3,18 +3,6 @@ licenses(["notice"])  # Apache 2.0
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_library")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 cc_library(
     name = "op_types",
     srcs = ["op_types.cc"],
diff --git a/tensorflow/core/grappler/clusters/BUILD b/tensorflow/core/grappler/clusters/BUILD
index b653f902e857ce804f797a016ebde551bf3b6695..9ecf5a6cf789fed2c44508e5b53d352b73e1fdea 100644
--- a/tensorflow/core/grappler/clusters/BUILD
+++ b/tensorflow/core/grappler/clusters/BUILD
@@ -8,18 +8,6 @@ load(
     "tf_cuda_tests_tags",
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 config_setting(
     name = "xsmm",
     licenses = ["notice"],
diff --git a/tensorflow/core/grappler/clusters/cluster.cc b/tensorflow/core/grappler/clusters/cluster.cc
index 39bfca244ed2d40544dd2a17a019dadbe50f6d29..8d8c6084ec9743dea4b45820a6d4a5b2d938979b 100644
--- a/tensorflow/core/grappler/clusters/cluster.cc
+++ b/tensorflow/core/grappler/clusters/cluster.cc
@@ -62,6 +62,10 @@ void Cluster::DisableOptimizer(bool disable) {
         options_.config.mutable_graph_options()->mutable_rewrite_options();
     rewriter_config->set_layout_optimizer(RewriterConfig::OFF);
     rewriter_config->set_disable_model_pruning(true);
+    rewriter_config->set_function_optimization(RewriterConfig::OFF);
+    rewriter_config->set_arithmetic_optimization(RewriterConfig::OFF);
+    rewriter_config->set_loop_optimization(RewriterConfig::OFF);
+    rewriter_config->set_dependency_optimization(RewriterConfig::OFF);
     rewriter_config->set_constant_folding(RewriterConfig::OFF);
     rewriter_config->set_memory_optimization(RewriterConfig::NO_MEM_OPT);
     rewriter_config->mutable_auto_parallel()->set_enable(false);
diff --git a/tensorflow/core/grappler/clusters/utils.cc b/tensorflow/core/grappler/clusters/utils.cc
index b54b34959a53b56022a449ca286ff0ba823f2aa5..50d6e6468faac01de1a06b0a9dea5aa0aa7f3dd2 100644
--- a/tensorflow/core/grappler/clusters/utils.cc
+++ b/tensorflow/core/grappler/clusters/utils.cc
@@ -54,7 +54,7 @@ DeviceProperties GetLocalCPUInfo() {
 
   int64 free_mem = port::AvailableRam();
   if (free_mem < INT64_MAX) {
-    device.set_memory_size(free_mem * 1024);
+    device.set_memory_size(free_mem);
   }
 
   (*device.mutable_environment())["cpu_instruction_set"] =
diff --git a/tensorflow/core/grappler/clusters/virtual_cluster.cc b/tensorflow/core/grappler/clusters/virtual_cluster.cc
index ae70c9860823dae1a85ba20e00afe15b218cd2b4..abfa7bc48e6e2484acee59a1e2b0c2bfc4e60fb7 100644
--- a/tensorflow/core/grappler/clusters/virtual_cluster.cc
+++ b/tensorflow/core/grappler/clusters/virtual_cluster.cc
@@ -66,6 +66,7 @@ Status VirtualCluster::Run(const GraphDef& graph,
   }
 
   Costs node_costs;
+  int node_id = 0;
   do {
     OpContext op_context = scheduler.GetCurrNode();
     node_costs = node_estimator_->PredictCosts(op_context);
@@ -73,6 +74,7 @@ Status VirtualCluster::Run(const GraphDef& graph,
       CostGraphDef::Node* cost_node =
           metadata->mutable_cost_graph()->add_node();
       const string& op_name = op_context.name;
+      cost_node->set_id(node_id++);
       cost_node->set_name(op_name);
       cost_node->set_device(op_context.device_name);
       cost_node->set_compute_cost(
diff --git a/tensorflow/core/grappler/costs/BUILD b/tensorflow/core/grappler/costs/BUILD
index 5336df1f51dbb5dd5f48857a088ece1b1a04dbb5..33949319d5f050100d6b58e7ee324370e4232bec 100644
--- a/tensorflow/core/grappler/costs/BUILD
+++ b/tensorflow/core/grappler/costs/BUILD
@@ -6,18 +6,6 @@ load(
     "tf_protos_grappler",
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 filegroup(
     name = "graph_properties_testdata",
     srcs = glob([
@@ -55,6 +43,7 @@ cc_library(
         ":utils",
         "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index 817247e3794ca3e165b2f0445ab164938577336f..8fe154dbf3c7d634ec9266b86135f721b25edcc9 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/grappler/costs/utils.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -251,8 +252,7 @@ typename DisjointSet<Handle>::Rep* DisjointSet<Handle>::Find(Handle value) {
 }
 
 bool IsQueue(const Node& node) {
-  StringPiece type(node.type_string());
-  return type.ends_with("QueueV2");
+  return str_util::EndsWith(node.type_string(), "QueueV2");
 }
 
 // Returns true if the node is an Enter op AND its input is a Queue.
@@ -926,7 +926,6 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) {
   ShapeRefiner shape_refiner(graph.versions(), graph.op_registry());
   shape_refiner.set_require_shape_inference_fns(false);
   shape_refiner.set_disable_constant_propagation(true);
-  shape_refiner.set_function_library_for_shape_inference(&function_library);
   ImportGraphDefOptions options;
   // Graph optimization happens at the late stage of graph execution,
   // when colocation constraints are already validated previously and
@@ -1012,6 +1011,7 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) {
     }
     // Skip any information that comes from fed nodes.
     if (fed_ports.find(node->name()) != fed_ports.end()) {
+      VLOG(2) << "Skipping feed node shape: " << node->name();
       continue;
     }
     for (const auto& merged_shapes : node_ctx->MergedShapes()) {
diff --git a/tensorflow/core/grappler/costs/graph_properties.h b/tensorflow/core/grappler/costs/graph_properties.h
index 5aa496207231cbac6a0c07fe01efccf65f67bc78..8ff572fe4f826a9346d6822b2c84b51065986be0 100644
--- a/tensorflow/core/grappler/costs/graph_properties.h
+++ b/tensorflow/core/grappler/costs/graph_properties.h
@@ -29,9 +29,12 @@ namespace grappler {
 class SymbolicShapeRefiner;
 class TopoQueue;
 
-// A TensorFlow model to optimize.
-// Models are represented by the combination of a graph, one of more fetch
-// nodes, and potentially a set of nodes to feed.
+// Infer OpInfo::TensorProperties for graph nodes inputs/outputs.
+//
+// Typical use case, is to infer tensor properties from a graph, before doing
+// optimization pass. Nodes modified during optimization pass have to be
+// invalidated, to prevent further incorrect optimizations based on wrong shape
+// and data type properties.
 class GraphProperties {
  public:
   explicit GraphProperties(const GrapplerItem& item) : item_(item) {}
@@ -64,15 +67,12 @@ class GraphProperties {
       const string& node_name) const;
   const std::vector<OpInfo::TensorProperties>& GetOutputProperties(
       const string& node_name) const;
+  // Invalidate input/output properties for nodes modified during graph
+  // optimization pass, to prevent potential optimizations, based on incorrect
+  // shape information.
   void ClearInputProperties(const string& node_name);
   void ClearOutputProperties(const string& node_name);
 
-  static void FillTensorPropertiesFromContext(
-      const shape_inference::ShapeHandle&, const DataType&,
-      shape_inference::InferenceContext*,
-      std::unordered_map<const shape_inference::Dimension*, int>* dim_ids,
-      OpInfo::TensorProperties*);
-
  private:
   // Merges shapes <shapes_and_types>, determined from an EnqueueV2 node, into
   // <*queue_shapes_and_types>.
diff --git a/tensorflow/core/grappler/costs/graph_properties_test.cc b/tensorflow/core/grappler/costs/graph_properties_test.cc
index 284d9d409bb4d9439cf007e1692838667caff26a..db4dae96de44cba70221fde551e3f997e4db93cc 100644
--- a/tensorflow/core/grappler/costs/graph_properties_test.cc
+++ b/tensorflow/core/grappler/costs/graph_properties_test.cc
@@ -742,6 +742,8 @@ TEST_F(GraphPropertiesTest, InferRestoreOpShape_WithTwoNodesShareSameOutput) {
   EXPECT_EQ("float: [128,256]", PropToString(prop));
 }
 
+#if 0
+// Disabled for now since this doesnt' seem to work when functions are instantiated inside while loops. It's also unclear whether it's correct when the same function is instantiated twice.
 TEST_F(GraphPropertiesTest, FunctionStaticShapeInference) {
   // Test graph produced in python using:
   /*
@@ -775,6 +777,7 @@ TEST_F(GraphPropertiesTest, FunctionStaticShapeInference) {
   EXPECT_TRUE(shape.IsFullyDefined());
   EXPECT_FALSE(shape.unknown_rank());
 }
+#endif
 
 TEST_F(GraphPropertiesTest, SymbolicShapes) {
   // Build a simple graph with placeholders of unknown dimensions. These
diff --git a/tensorflow/core/grappler/costs/measuring_cost_estimator.cc b/tensorflow/core/grappler/costs/measuring_cost_estimator.cc
index ea4320687af366ccdd82e46cf28adf4ee9c100c0..833205ac6f12a73d96c93455bb355ee511d6700a 100644
--- a/tensorflow/core/grappler/costs/measuring_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/measuring_cost_estimator.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <limits>
 
 #include "tensorflow/core/framework/cost_graph.pb.h"
+#include "tensorflow/core/framework/step_stats.pb.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
 #include "tensorflow/core/grappler/costs/robust_stats.h"
 #include "tensorflow/core/grappler/grappler_item.h"
@@ -52,6 +53,8 @@ Status MeasuringCostEstimator::Initialize(const GrapplerItem& item) {
 Status MeasuringCostEstimator::PredictCosts(const GraphDef& optimized_graph,
                                             CostGraphDef* cost_graph,
                                             Costs* costs) const {
+  const bool running_simulation = (cluster_->type() == "virtual");
+
   std::vector<double> times(measurement_steps_);
   BlockingCounter barrier(measurement_steps_);
 
@@ -80,9 +83,23 @@ Status MeasuringCostEstimator::PredictCosts(const GraphDef& optimized_graph,
     }
 
     const Costs::MicroSeconds finish = Env::Default()->NowMicros();
-    const double time = (finish - start).count() * 1e3;
-    times[step] = time;
-
+    if (running_simulation) {
+      // When running simulation, return the estimated runtime, not the time it
+      // takes to run the simulation.
+      double time = 0.0;
+      for (const DeviceStepStats& stepstats :
+           metadata.step_stats().dev_stats()) {
+        for (const NodeExecStats& node_stats : stepstats.node_stats()) {
+          const double completion_time =
+              node_stats.all_end_rel_micros() + node_stats.all_start_micros();
+          time = std::max(time, completion_time * 1e3);
+        }
+      }
+      times[step] = time;
+    } else {
+      const double time = (finish - start).count() * 1e3;
+      times[step] = time;
+    }
     if (cost_graph && (step + 1 == measurement_steps_)) {
       metadata.mutable_cost_graph()->Swap(cost_graph);
     }
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
index 29ef317e46f13bd64847fd898fcb2eb9fee67f1c..14e46ecdd97f5e516b1f8ed2da43ecbcaba27faf 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/grappler/clusters/utils.h"
 
 namespace tensorflow {
@@ -46,6 +47,15 @@ constexpr char kShape[] = "Shape";
 constexpr char kSize[] = "Size";
 constexpr char kStopGradient[] = "StopGradient";
 constexpr char kPreventGradient[] = "PreventGradient";
+constexpr char kGather[] = "Gather";
+constexpr char kGatherV2[] = "GatherV2";
+constexpr char kSlice[] = "Slice";
+constexpr char kMaxPool[] = "MaxPool";
+constexpr char kMaxPoolGrad[] = "MaxPoolGrad";
+constexpr char kAvgPool[] = "AvgPool";
+constexpr char kAvgPoolGrad[] = "AvgPoolGrad";
+constexpr char kFusedBatchNorm[] = "FusedBatchNorm";
+constexpr char kFusedBatchNormGrad[] = "FusedBatchNormGrad";
 
 static const Costs::Duration kMinComputeTime(1);
 
@@ -67,14 +77,39 @@ Padding GetPadding(const OpInfo& op_features) {
   return Padding::SAME;  // Default padding.
 }
 
+bool IsTraining(const OpInfo& op_info) {
+  if (op_info.attr().find("is_training") != op_info.attr().end() &&
+      op_info.attr().at("is_training").b()) {
+    return true;
+  }
+  return false;
+}
+
+// TODO(dyoon): support non-4D tensors in the c ost functions of convolution
+// related ops (Conv, Pool, BatchNorm, and their backprops) and the related
+// helper functions.
 std::vector<int64> GetStrides(const OpInfo& op_features) {
   if (op_features.attr().find("strides") != op_features.attr().end()) {
     const auto strides = op_features.attr().at("strides").list().i();
+    CHECK(strides.size() == 4) << "Attr strides is not a length-4 vector: "
+                               << op_features.DebugString();
     return {strides[0], strides[1], strides[2], strides[3]};
   }
   return {1, 1, 1, 1};
 }
 
+std::vector<int64> GetKernelSize(const OpInfo& op_info) {
+  if (op_info.attr().find("ksize") != op_info.attr().end()) {
+    const auto ksize = op_info.attr().at("ksize").list().i();
+    CHECK(ksize.size() == 4)
+        << "Attr ksize is not a length-4 vector: " << op_info.DebugString();
+    return {ksize[0], ksize[1], ksize[2], ksize[3]};
+  }
+  // Note that FusedBatchNorm doesn't have ksize attr, but GetKernelSize returns
+  // {1, 1, 1, 1} in that case.
+  return {1, 1, 1, 1};
+}
+
 int64 GetOutputSize(const int64 input, const int64 filter, const int64 stride,
                     const Padding& padding) {
   // Logic for calculating output shape is from GetWindowedOutputSizeVerbose()
@@ -167,6 +202,10 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
 
       {kNoOp, wrap(&OpLevelCostEstimator::PredictNoOp)},
 
+      {kGather, wrap(&OpLevelCostEstimator::PredictGatherOrSlice)},
+      {kGatherV2, wrap(&OpLevelCostEstimator::PredictGatherOrSlice)},
+      {kSlice, wrap(&OpLevelCostEstimator::PredictGatherOrSlice)},
+
       {kPlaceholder, wrap(&OpLevelCostEstimator::PredictIdentity)},
       {kIdentity, wrap(&OpLevelCostEstimator::PredictIdentity)},
       {kRefIdentity, wrap(&OpLevelCostEstimator::PredictIdentity)},
@@ -182,109 +221,86 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
 
       {kRank, wrap(&OpLevelCostEstimator::PredictMetadata)},
       {kShape, wrap(&OpLevelCostEstimator::PredictMetadata)},
-      {kSize, wrap(&OpLevelCostEstimator::PredictMetadata)}};
-
-  elementwise_ops_ = {
-      // Unary ops alphabetically sorted
-      {"Acos", Eigen::internal::functor_traits<
-                   Eigen::internal::scalar_acos_op<float>>::Cost},
-      {"Asin", Eigen::internal::functor_traits<
-                   Eigen::internal::scalar_asin_op<float>>::Cost},
-      {"Atan", Eigen::internal::functor_traits<
-                   Eigen::internal::scalar_atan_op<float>>::Cost},
-      {"Atan2", Eigen::internal::functor_traits<
-                    Eigen::internal::scalar_quotient_op<float>>::Cost +
-                    Eigen::internal::functor_traits<
-                        Eigen::internal::scalar_atan_op<float>>::Cost},
-      {"Ceil", Eigen::internal::functor_traits<
-                   Eigen::internal::scalar_ceil_op<float>>::Cost},
-      {"Cos", Eigen::internal::functor_traits<
-                  Eigen::internal::scalar_cos_op<float>>::Cost},
-      {"Erf", 1},
-      {"Erfc", 1},
-      {"Exp", Eigen::internal::functor_traits<
-                  Eigen::internal::scalar_exp_op<float>>::Cost},
-      {"Expm1", Eigen::internal::functor_traits<
-                    Eigen::internal::scalar_expm1_op<float>>::Cost},
-      {"Floor", Eigen::internal::functor_traits<
-                    Eigen::internal::scalar_floor_op<float>>::Cost},
-      {"Inv", Eigen::internal::functor_traits<
-                  Eigen::internal::scalar_inverse_op<float>>::Cost},
-      {"InvGrad", 1},
-      {"Lgamma", 1},
-      {"Log", Eigen::internal::functor_traits<
-                  Eigen::internal::scalar_log_op<float>>::Cost},
-      {"Log1p", Eigen::internal::functor_traits<
-                    Eigen::internal::scalar_log1p_op<float>>::Cost},
-      {"Neg", Eigen::internal::functor_traits<
-                  Eigen::internal::scalar_opposite_op<float>>::Cost},
-      {"Reciprocal", Eigen::internal::functor_traits<
-                         Eigen::internal::scalar_inverse_op<float>>::Cost},
-      {"Rint", 1},
-      {"Round", Eigen::internal::functor_traits<
-                    Eigen::internal::scalar_round_op<float>>::Cost},
-      {"Rsqrt", Eigen::internal::functor_traits<
-                    Eigen::internal::scalar_rsqrt_op<float>>::Cost},
-      {"Sqrt", Eigen::internal::functor_traits<
-                   Eigen::internal::scalar_sqrt_op<float>>::Cost},
-      {"Square", Eigen::internal::functor_traits<
-                     Eigen::internal::scalar_square_op<float>>::Cost},
-      {"Tanh", Eigen::internal::functor_traits<
-                   Eigen::internal::scalar_tanh_op<float>>::Cost},
-      {"Relu", Eigen::internal::functor_traits<
-                   Eigen::internal::scalar_max_op<float>>::Cost},
-      {"Sigmoid", Eigen::internal::functor_traits<
-                      Eigen::internal::scalar_sigmoid_op<float>>::Cost},
-      {"Sign", Eigen::internal::functor_traits<
-                   Eigen::internal::scalar_sign_op<float>>::Cost},
-      {"Sin", Eigen::internal::functor_traits<
-                  Eigen::internal::scalar_sin_op<float>>::Cost},
-      {"Tan", Eigen::internal::functor_traits<
-                  Eigen::internal::scalar_tan_op<float>>::Cost},
-      // Binary ops alphabetically sorted
-      {"Add", Eigen::internal::functor_traits<
-                  Eigen::internal::scalar_sum_op<float>>::Cost},
-      {"ApproximateEqual", 1},
-      {"BiasAdd", Eigen::internal::functor_traits<
-                      Eigen::internal::scalar_sum_op<float>>::Cost},
-      {"Div", Eigen::internal::functor_traits<
-                  Eigen::internal::scalar_quotient_op<float>>::Cost},
-      {"Equal", 1},
-      {"FloorDiv", Eigen::internal::functor_traits<
-                       Eigen::internal::scalar_quotient_op<float>>::Cost},
-      {"FloorMod", Eigen::internal::functor_traits<
-                       Eigen::internal::scalar_mod_op<float>>::Cost},
-      {"Greater", 1},
-      {"GreaterEqual", 1},
-      {"Less", 1},
-      {"LessEqual", 1},
-      {"LogicalAnd", Eigen::internal::functor_traits<
-                         Eigen::internal::scalar_boolean_and_op>::Cost},
-      {"LogicalNot", 1},
-      {"LogicalOr", Eigen::internal::functor_traits<
-                        Eigen::internal::scalar_boolean_or_op>::Cost},
-      {"Maximum", Eigen::internal::functor_traits<
-                      Eigen::internal::scalar_max_op<float>>::Cost},
-      {"Minimum", Eigen::internal::functor_traits<
-                      Eigen::internal::scalar_min_op<float>>::Cost},
-      {"Mod", Eigen::internal::functor_traits<
-                  Eigen::internal::scalar_mod_op<float>>::Cost},
-      {"Mul", Eigen::internal::functor_traits<
-                  Eigen::internal::scalar_product_op<float>>::Cost},
-      {"NotEqual", 1},
-      {"QuantizedAdd", Eigen::internal::functor_traits<
-                           Eigen::internal::scalar_sum_op<float>>::Cost},
-      {"QuantizedMul", Eigen::internal::functor_traits<
-                           Eigen::internal::scalar_product_op<float>>::Cost},
-      {"RealDiv", Eigen::internal::functor_traits<
-                      Eigen::internal::scalar_quotient_op<float>>::Cost},
-      {"SquareDifference", 1},
-      {"Sub", Eigen::internal::functor_traits<
-                  Eigen::internal::scalar_difference_op<float>>::Cost},
-      {"TruncateDiv", Eigen::internal::functor_traits<
-                          Eigen::internal::scalar_quotient_op<float>>::Cost},
-      {"TruncateMod", Eigen::internal::functor_traits<
-                          Eigen::internal::scalar_mod_op<float>>::Cost}};
+      {kSize, wrap(&OpLevelCostEstimator::PredictMetadata)},
+      {kMaxPool, wrap(&OpLevelCostEstimator::PredictMaxPool)},
+      {kMaxPoolGrad, wrap(&OpLevelCostEstimator::PredictMaxPoolGrad)},
+      {kAvgPool, wrap(&OpLevelCostEstimator::PredictAvgPool)},
+      {kAvgPoolGrad, wrap(&OpLevelCostEstimator::PredictAvgPoolGrad)},
+      {kFusedBatchNorm, wrap(&OpLevelCostEstimator::PredictFusedBatchNorm)},
+      {kFusedBatchNormGrad,
+       wrap(&OpLevelCostEstimator::PredictFusedBatchNormGrad)},
+  };
+
+#define EIGEN_COST(X) Eigen::internal::functor_traits<Eigen::internal::X>::Cost
+
+  // Quantize = apply min and max bounds, multiply by scale factor and round.
+  const int quantize_v2_cost =
+      EIGEN_COST(scalar_product_op<float>) + EIGEN_COST(scalar_max_op<float>) +
+      EIGEN_COST(scalar_min_op<float>) + EIGEN_COST(scalar_round_op<float>);
+
+  elementwise_ops_ = {// Unary ops alphabetically sorted
+                      {"Acos", EIGEN_COST(scalar_acos_op<float>)},
+                      {"Asin", EIGEN_COST(scalar_asin_op<float>)},
+                      {"Atan", EIGEN_COST(scalar_atan_op<float>)},
+                      {"Atan2", EIGEN_COST(scalar_quotient_op<float>) +
+                                    EIGEN_COST(scalar_atan_op<float>)},
+                      {"Ceil", EIGEN_COST(scalar_ceil_op<float>)},
+                      {"Cos", EIGEN_COST(scalar_cos_op<float>)},
+                      {"Dequantize", EIGEN_COST(scalar_product_op<float>)},
+                      {"Erf", 1},
+                      {"Erfc", 1},
+                      {"Exp", EIGEN_COST(scalar_exp_op<float>)},
+                      {"Expm1", EIGEN_COST(scalar_expm1_op<float>)},
+                      {"Floor", EIGEN_COST(scalar_floor_op<float>)},
+                      {"Inv", EIGEN_COST(scalar_inverse_op<float>)},
+                      {"InvGrad", 1},
+                      {"Lgamma", 1},
+                      {"Log", EIGEN_COST(scalar_log_op<float>)},
+                      {"Log1p", EIGEN_COST(scalar_log1p_op<float>)},
+                      {"Neg", EIGEN_COST(scalar_opposite_op<float>)},
+                      {"QuantizeV2", quantize_v2_cost},
+                      {"Reciprocal", EIGEN_COST(scalar_inverse_op<float>)},
+                      {"Rint", 1},
+                      {"Round", EIGEN_COST(scalar_round_op<float>)},
+                      {"Rsqrt", EIGEN_COST(scalar_rsqrt_op<float>)},
+                      {"Sqrt", EIGEN_COST(scalar_sqrt_op<float>)},
+                      {"Square", EIGEN_COST(scalar_square_op<float>)},
+                      {"Tanh", EIGEN_COST(scalar_tanh_op<float>)},
+                      {"Relu", EIGEN_COST(scalar_max_op<float>)},
+                      {"Sigmoid", EIGEN_COST(scalar_sigmoid_op<float>)},
+                      {"Sign", EIGEN_COST(scalar_sign_op<float>)},
+                      {"Sin", EIGEN_COST(scalar_sin_op<float>)},
+                      {"Tan", EIGEN_COST(scalar_tan_op<float>)},
+                      // Binary ops alphabetically sorted
+                      {"Add", EIGEN_COST(scalar_sum_op<float>)},
+                      {"ApproximateEqual", 1},
+                      {"BiasAdd", EIGEN_COST(scalar_sum_op<float>)},
+                      {"Div", EIGEN_COST(scalar_quotient_op<float>)},
+                      {"Equal", 1},
+                      {"FloorDiv", EIGEN_COST(scalar_quotient_op<float>)},
+                      {"FloorMod", EIGEN_COST(scalar_mod_op<float>)},
+                      {"Greater", 1},
+                      {"GreaterEqual", 1},
+                      {"Less", 1},
+                      {"LessEqual", 1},
+                      {"LogicalAnd", EIGEN_COST(scalar_boolean_and_op)},
+                      {"LogicalNot", 1},
+                      {"LogicalOr", EIGEN_COST(scalar_boolean_or_op)},
+                      {"Maximum", EIGEN_COST(scalar_max_op<float>)},
+                      {"Minimum", EIGEN_COST(scalar_min_op<float>)},
+                      {"Mod", EIGEN_COST(scalar_mod_op<float>)},
+                      {"Mul", EIGEN_COST(scalar_product_op<float>)},
+                      {"NotEqual", 1},
+                      {"QuantizedAdd", EIGEN_COST(scalar_sum_op<float>)},
+                      {"QuantizedMul", EIGEN_COST(scalar_product_op<float>)},
+                      {"RealDiv", EIGEN_COST(scalar_quotient_op<float>)},
+                      {"ReluGrad", EIGEN_COST(scalar_max_op<float>)},
+                      {"SquareDifference", 1},
+                      {"Sub", EIGEN_COST(scalar_difference_op<float>)},
+                      {"TruncateDiv", EIGEN_COST(scalar_quotient_op<float>)},
+                      {"TruncateMod", EIGEN_COST(scalar_mod_op<float>)}};
+
+#undef EIGEN_COST
 
   // By default, use sum of memory_time and compute_time for execution_time.
   compute_memory_overlap_ = false;
@@ -411,28 +427,33 @@ Costs OpLevelCostEstimator::PredictCostOfAnUnknownOp(
 }
 
 Costs OpLevelCostEstimator::PredictOpCountBasedCost(
-    double operations, const OpInfo& op_features) const {
-  DeviceInfo device_perf = GetDeviceInfo(op_features.device());
-  if (device_perf.gigaops <= 0 || device_perf.gb_per_sec <= 0) {
-    VLOG(1) << "BAD DEVICE. Op:" << op_features.op()
-            << " device type:" << op_features.device().type()
-            << " device model:" << op_features.device().model();
-  }
+    double operations, const OpInfo& op_info) const {
+  bool unknown_shapes = false;
+  const double input_size = CalculateInputSize(op_info, &unknown_shapes);
+  const double output_size = CalculateOutputSize(op_info, &unknown_shapes);
+  const double total_io_bytes = input_size + output_size;
+  Costs costs = PredictOpCountBasedCost(operations, total_io_bytes, op_info);
+  costs.inaccurate = unknown_shapes;
+  costs.max_memory = output_size;
+  return costs;
+}
 
-  Costs::NanoSeconds compute_cost(std::ceil(operations / device_perf.gigaops));
-  VLOG(1) << "Op:" << op_features.op() << " GOps:" << operations / 1e9
-          << " Execution Time (ns):" << compute_cost.count();
+Costs OpLevelCostEstimator::PredictOpCountBasedCost(
+    double operations, double total_io_bytes, const OpInfo& op_info) const {
+  const DeviceInfo device_info = GetDeviceInfo(op_info.device());
+  if (device_info.gigaops <= 0 || device_info.gb_per_sec <= 0) {
+    VLOG(1) << "BAD DEVICE. Op:" << op_info.op()
+            << " device type:" << op_info.device().type()
+            << " device model:" << op_info.device().model();
+  }
 
-  bool found_unknown_shapes = false;
-  const double total_input_size =
-      CalculateInputSize(op_features, &found_unknown_shapes);
-  const double total_output_size =
-      CalculateOutputSize(op_features, &found_unknown_shapes);
-  const double total_io_size = total_input_size + total_output_size;
+  Costs::NanoSeconds compute_cost(std::ceil(operations / device_info.gigaops));
+  VLOG(1) << "Op:" << op_info.op() << " GOps:" << operations / 1e9
+          << " Compute Time (ns):" << compute_cost.count();
 
   Costs::NanoSeconds memory_cost(
-      std::ceil(total_io_size / device_perf.gb_per_sec));
-  VLOG(1) << "Op:" << op_features.op() << " Size (KB):" << (total_io_size) / 1e3
+      std::ceil(total_io_bytes / device_info.gb_per_sec));
+  VLOG(1) << "Op:" << op_info.op() << " Size (KB):" << (total_io_bytes) / 1e3
           << " Memory Time (ns):" << memory_cost.count();
 
   Costs costs;
@@ -443,8 +464,6 @@ Costs OpLevelCostEstimator::PredictOpCountBasedCost(
   } else {
     costs.execution_time = compute_cost + memory_cost;
   }
-  costs.inaccurate = found_unknown_shapes;
-  costs.max_memory = total_output_size;
   return costs;
 }
 
@@ -795,6 +814,7 @@ int64 OpLevelCostEstimator::CountConv2DBackpropInputOperations(
   }
   if (!shape_found) {
     // Set the minimum filter size that's feasible.
+    input_shape.Clear();
     for (int i = 0; i < 4; ++i) {
       input_shape.add_dim()->set_size(1);
     }
@@ -837,6 +857,7 @@ int64 OpLevelCostEstimator::CountConv2DBackpropFilterOperations(
   }
   if (!shape_found) {
     // Set the minimum filter size that's feasible.
+    filter_shape.Clear();
     for (int i = 0; i < 4; ++i) {
       filter_shape.add_dim()->set_size(1);
     }
@@ -867,7 +888,7 @@ int64 OpLevelCostEstimator::CountConv2DBackpropFilterOperations(
 
 int64 OpLevelCostEstimator::CalculateTensorElementCount(
     const OpInfo::TensorProperties& tensor, bool* found_unknown_shapes) const {
-  VLOG(2) << "   with " << tensor.dtype() << " tensor of shape "
+  VLOG(2) << "   with " << DataTypeString(tensor.dtype()) << " tensor of shape "
           << tensor.shape().DebugString();
   int64 tensor_size = 1;
   int num_dims = std::max(1, tensor.shape().dim_size());
@@ -1028,5 +1049,331 @@ Costs OpLevelCostEstimator::PredictMetadata(const OpContext& op_context) const {
   return costs;
 }
 
+Costs OpLevelCostEstimator::PredictGatherOrSlice(
+    const OpContext& op_context) const {
+  // Gather & Slice ops can have a very large input, but only access a small
+  // part of it. For these op the size of the output determines the memory cost.
+  const auto& op_info = op_context.op_info;
+
+  const int inputs_needed = op_info.op() == "Slice" ? 3 : 2;
+  if (op_info.outputs_size() == 0 || op_info.inputs_size() < inputs_needed) {
+    Costs costs = Costs::ZeroCosts();
+    costs.inaccurate = true;
+    return costs;
+  }
+
+  bool unknown_shapes = false;
+
+  // Each output element is a copy of some element from input.
+  // For roofline estimate we assume each copy has a unit cost.
+  const int64 op_count =
+      CalculateTensorElementCount(op_info.outputs(0), &unknown_shapes);
+
+  const double output_size = CalculateOutputSize(op_info, &unknown_shapes);
+  double input_size = output_size;
+  if (op_info.op() == "Slice") {
+    // Add 'begin' & 'size' tensors sizes.
+    input_size +=
+        CalculateTensorElementCount(op_info.inputs(1), &unknown_shapes) +
+        CalculateTensorElementCount(op_info.inputs(2), &unknown_shapes);
+  } else {
+    // Assuming this is "Gather" or "GatherV2" op, add 'indices' size.
+    input_size +=
+        CalculateTensorElementCount(op_info.inputs(1), &unknown_shapes);
+  }
+
+  const double total_io = input_size + output_size;
+  Costs costs = PredictOpCountBasedCost(op_count, total_io, op_info);
+  costs.inaccurate = unknown_shapes;
+  costs.max_memory = output_size;
+
+  return costs;
+}
+
+/* static */
+OpLevelCostEstimator::ConvolutionDimensions
+OpLevelCostEstimator::OpDimensionsFromInputs(
+    const TensorShapeProto& original_image_shape, const OpInfo& op_info,
+    bool* found_unknown_shapes) {
+  VLOG(2) << "op features: " << op_info.DebugString();
+  VLOG(2) << "Original image shape: " << original_image_shape.DebugString();
+  auto image_shape =
+      MaybeGetMinimumShape(original_image_shape, 4, found_unknown_shapes);
+  VLOG(2) << "Image shape: " << image_shape.DebugString();
+
+  int x_index, y_index, channel_index;
+  const string& data_format = GetDataFormat(op_info);
+  if (data_format == "NCHW") {
+    x_index = 2;
+    y_index = 3;
+    channel_index = 1;
+  } else {
+    x_index = 1;
+    y_index = 2;
+    channel_index = 3;
+  }
+  int64 batch = image_shape.dim(0).size();
+  int64 ix = image_shape.dim(x_index).size();
+  int64 iy = image_shape.dim(y_index).size();
+  int64 iz = image_shape.dim(channel_index).size();
+
+  // Note that FusedBatchNorm doesn't have ksize attr, but GetKernelSize returns
+  // {1, 1, 1, 1} in that case.
+  std::vector<int64> ksize = GetKernelSize(op_info);
+  int64 kx = ksize[x_index];
+  int64 ky = ksize[y_index];
+
+  std::vector<int64> strides = GetStrides(op_info);
+  int64 sx = strides[x_index];
+  int64 sy = strides[y_index];
+  const auto padding = GetPadding(op_info);
+
+  int64 ox = GetOutputSize(ix, kx, sx, padding);
+  int64 oy = GetOutputSize(iy, ky, sy, padding);
+  int64 oz = iz;
+
+  OpLevelCostEstimator::ConvolutionDimensions conv_dims = {
+      batch, ix, iy, iz, kx, ky, oz, ox, oy, sx, sy, padding};
+  return conv_dims;
+}
+
+Costs OpLevelCostEstimator::PredictMaxPool(const OpContext& op_context) const {
+  bool found_unknown_shapes = false;
+  const auto& op_info = op_context.op_info;
+  // x: op_info.inputs(0)
+  ConvolutionDimensions dims = OpDimensionsFromInputs(
+      op_info.inputs(0).shape(), op_info, &found_unknown_shapes);
+  // kx * ky - 1 comparisons per output (kx * xy > 1)
+  // or 1 copy per output (kx * k1 = 1).
+  int per_output_ops = dims.kx * dims.ky == 1 ? 1 : dims.kx * dims.ky - 1;
+  int64 ops = dims.batch * dims.ox * dims.oy * dims.oz * per_output_ops;
+
+  double total_input_size = 0;
+  if (dims.ky >= dims.sy) {
+    total_input_size =
+        CalculateTensorSize(op_info.inputs(0), &found_unknown_shapes);
+  } else {  // dims.ky < dims.sy
+    // Vertical stride is larger than vertical kernel; assuming row-major
+    // format, skip unnecessary rows (or read every kx rows per sy rows, as the
+    // others are not used for output).
+    const auto data_size = DataTypeSize(BaseType(op_info.inputs(0).dtype()));
+    total_input_size =
+        data_size * dims.batch * dims.ix * dims.ky * dims.oy * dims.iz;
+  }
+  const double total_output_size =
+      CalculateOutputSize(op_info, &found_unknown_shapes);
+
+  Costs costs = PredictOpCountBasedCost(
+      ops, total_input_size + total_output_size, op_info);
+  costs.inaccurate = found_unknown_shapes;
+  costs.max_memory = total_output_size;
+  return costs;
+}
+
+Costs OpLevelCostEstimator::PredictMaxPoolGrad(
+    const OpContext& op_context) const {
+  bool found_unknown_shapes = false;
+  const auto& op_info = op_context.op_info;
+  // x: op_info.inputs(0)
+  // y: op_info.inputs(1)
+  // y_grad: op_info.inputs(2)
+  ConvolutionDimensions dims = OpDimensionsFromInputs(
+      op_info.inputs(0).shape(), op_info, &found_unknown_shapes);
+
+  int64 ops = 0;
+  if (dims.kx == 1 && dims.ky == 1) {
+    // 1x1 window. No need to know which input was max.
+    ops = dims.batch * dims.ix * dims.iy * dims.iz;
+  } else if (dims.kx <= dims.sx && dims.ky <= dims.sy) {
+    // Non-overlapping window: re-run maxpool, then assign zero or y_grad.
+    ops = dims.batch * dims.iz *
+          (dims.ox * dims.oy * (dims.kx * dims.ky - 1) + dims.ix * dims.iy);
+  } else {
+    // Overlapping window: initialize with zeros, re-run maxpool, then
+    // accumulate y_gad to proper x_grad locations.
+    ops = dims.batch * dims.iz *
+          (dims.ox * dims.oy * (dims.kx * dims.ky - 1) + dims.ix * dims.iy * 2);
+  }
+
+  // Just read x and y_grad; no need to read y as we assume MaxPoolGrad re-run
+  // MaxPool internally.
+  double total_input_size =
+      CalculateTensorSize(op_info.inputs(0), &found_unknown_shapes);
+  total_input_size +=
+      CalculateTensorSize(op_info.inputs(2), &found_unknown_shapes);
+  // Write x_grad; size equal to x.
+  const double total_output_size =
+      CalculateTensorSize(op_info.inputs(0), &found_unknown_shapes);
+
+  Costs costs = PredictOpCountBasedCost(
+      ops, total_input_size + total_output_size, op_info);
+  costs.inaccurate = found_unknown_shapes;
+  costs.max_memory = total_output_size;
+  return costs;
+}
+
+Costs OpLevelCostEstimator::PredictAvgPool(const OpContext& op_context) const {
+  bool found_unknown_shapes = false;
+  const auto& op_info = op_context.op_info;
+  // x: op_info.inputs(0)
+  ConvolutionDimensions dims = OpDimensionsFromInputs(
+      op_info.inputs(0).shape(), op_info, &found_unknown_shapes);
+
+  // kx * ky - 1 additions and 1 multiplication per output.
+  int64 ops = dims.batch * dims.ox * dims.oy * dims.oz * dims.kx * dims.ky;
+
+  double total_input_size = 0;
+  if (dims.ky >= dims.sy) {
+    total_input_size =
+        CalculateTensorSize(op_info.inputs(0), &found_unknown_shapes);
+  } else {  // dims.ky < dims.sy
+    // vertical stride is larger than vertical kernel; assuming row-major
+    // format, skip unnecessary rows (or read every kx rows per sy rows, as the
+    // others are not used for output).
+    const auto data_size = DataTypeSize(BaseType(op_info.inputs(0).dtype()));
+    total_input_size =
+        data_size * dims.batch * dims.ix * dims.ky * dims.oy * dims.iz;
+  }
+  const double total_output_size =
+      CalculateOutputSize(op_info, &found_unknown_shapes);
+
+  Costs costs = PredictOpCountBasedCost(
+      ops, total_input_size + total_output_size, op_info);
+  costs.inaccurate = found_unknown_shapes;
+  costs.max_memory = total_output_size;
+  return costs;
+}
+
+Costs OpLevelCostEstimator::PredictAvgPoolGrad(
+    const OpContext& op_context) const {
+  bool found_unknown_shapes = false;
+  const auto& op_info = op_context.op_info;
+  // x's shape: op_info.inputs(0)
+  // y_grad: op_info.inputs(1)
+
+  // Extract x_shape from op_info.inputs(0).value() or op_info.outputs(0).
+  bool shape_found = false;
+  TensorShapeProto x_shape;
+  if (op_info.inputs_size() >= 1 && op_info.inputs(0).has_value()) {
+    const TensorProto& value = op_info.inputs(0).value();
+    shape_found = GetTensorShapeProtoFromTensorProto(value, &x_shape);
+  }
+  if (!shape_found && op_info.outputs_size() > 0) {
+    x_shape = op_info.outputs(0).shape();
+    shape_found = true;
+  }
+  if (!shape_found) {
+    // Set the minimum shape that's feasible.
+    x_shape.Clear();
+    for (int i = 0; i < 4; ++i) {
+      x_shape.add_dim()->set_size(1);
+    }
+    found_unknown_shapes = true;
+  }
+
+  ConvolutionDimensions dims =
+      OpDimensionsFromInputs(x_shape, op_info, &found_unknown_shapes);
+
+  int64 ops = 0;
+  if (dims.kx <= dims.sx && dims.ky <= dims.sy) {
+    // Non-overlapping window.
+    ops = dims.batch * dims.iz * (dims.ix * dims.iy + dims.ox * dims.oy);
+  } else {
+    // Overlapping window.
+    ops = dims.batch * dims.iz *
+          (dims.ix * dims.iy + dims.ox * dims.oy * (dims.kx * dims.ky + 1));
+  }
+
+  const double total_input_size =
+      CalculateInputSize(op_info, &found_unknown_shapes);
+  const double total_output_size =
+      CalculateOutputSize(op_info, &found_unknown_shapes);
+
+  Costs costs = PredictOpCountBasedCost(
+      ops, total_input_size + total_output_size, op_info);
+  costs.inaccurate = found_unknown_shapes;
+  costs.max_memory = total_output_size;
+  return costs;
+}
+
+Costs OpLevelCostEstimator::PredictFusedBatchNorm(
+    const OpContext& op_context) const {
+  bool found_unknown_shapes = false;
+  const auto& op_info = op_context.op_info;
+  // x: op_info.inputs(0)
+  // scale: op_info.inputs(1)
+  // offset: op_info.inputs(2)
+  // mean: op_info.inputs(3)  --> only for inference
+  // variance: op_info.inputs(4) --> only for inference
+  ConvolutionDimensions dims = OpDimensionsFromInputs(
+      op_info.inputs(0).shape(), op_info, &found_unknown_shapes);
+  const bool is_training = IsTraining(op_info);
+
+  int64 ops = 0;
+  const auto rsqrt_cost = Eigen::internal::functor_traits<
+      Eigen::internal::scalar_rsqrt_op<float>>::Cost;
+  if (is_training) {
+    ops = dims.iz * (dims.batch * dims.ix * dims.iy * 4 + 6 + rsqrt_cost);
+  } else {
+    ops = dims.batch * dims.ix * dims.iy * dims.iz * 2;
+  }
+
+  const double size_nhwc =
+      CalculateTensorSize(op_info.inputs(0), &found_unknown_shapes);
+  const double size_c =
+      CalculateTensorSize(op_info.inputs(1), &found_unknown_shapes);
+  double total_input_size = 0.0;
+  double total_internal_read_size = 0.0;
+  double total_output_size = 0.0;
+  if (is_training) {
+    total_input_size = size_nhwc + size_c * 2;
+    total_output_size = size_nhwc + size_c * 4;
+    total_internal_read_size = size_nhwc;
+  } else {
+    total_input_size = size_nhwc + size_c * 4;
+    total_output_size = size_nhwc;
+  }
+
+  Costs costs = PredictOpCountBasedCost(
+      ops, total_input_size + total_output_size + total_internal_read_size,
+      op_info);
+  costs.inaccurate = found_unknown_shapes;
+  costs.max_memory = total_output_size;
+  return costs;
+}
+
+Costs OpLevelCostEstimator::PredictFusedBatchNormGrad(
+    const OpContext& op_context) const {
+  bool found_unknown_shapes = false;
+  const auto& op_info = op_context.op_info;
+  // y_backprop: op_info.inputs(0)
+  // x: op_info.inputs(1)
+  // scale: op_info.inputs(2)
+  // mean: op_info.inputs(3)
+  // variance or inverse of variance: op_info.inputs(4)
+  ConvolutionDimensions dims = OpDimensionsFromInputs(
+      op_info.inputs(1).shape(), op_info, &found_unknown_shapes);
+
+  int64 ops = 0;
+  const auto rsqrt_cost = Eigen::internal::functor_traits<
+      Eigen::internal::scalar_rsqrt_op<float>>::Cost;
+  ops = dims.iz * (dims.batch * dims.ix * dims.iy * 11 + 5 + rsqrt_cost);
+
+  const double size_nhwc =
+      CalculateTensorSize(op_info.inputs(1), &found_unknown_shapes);
+  const double size_c =
+      CalculateTensorSize(op_info.inputs(2), &found_unknown_shapes);
+  double total_input_size = size_nhwc * 2 + size_c * 2;
+  double total_internal_read_size = size_nhwc;
+  double total_output_size = size_nhwc * 1 + size_c * 2;
+
+  Costs costs = PredictOpCountBasedCost(
+      ops, total_input_size + total_output_size + total_internal_read_size,
+      op_info);
+  costs.inaccurate = found_unknown_shapes;
+  costs.max_memory = total_output_size;
+  return costs;
+}
+
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.h b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
index 7bb530fe31a9f70d168ae16783fac7d487e5f12d..fcbecbb6dc4d556c9ae3d2e1d0e5dee265f4f974 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.h
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
@@ -51,10 +51,15 @@ class OpLevelCostEstimator {
   // Predict cost of an op for which no accurate estimator is defined.
   Costs PredictCostOfAnUnknownOp(const OpContext& op_context) const;
 
-  // Naive cost estimate based on operations divided by device ops/sec,
-  // and input/output tensor sizes.
-  Costs PredictOpCountBasedCost(double operations,
-                                const OpInfo& op_features) const;
+  // Naive cost estimate based on the given operations count and total
+  // input/output tensor sizes of the given op_info combined.
+  Costs PredictOpCountBasedCost(double operations, const OpInfo& op_info) const;
+
+  // Naive cost estimate based on the given operations count and the given total
+  // io size in bytes. Sizes of op_info inputs and outputs are not taken into
+  // consideration.
+  Costs PredictOpCountBasedCost(double operations, double total_io_bytes,
+                                const OpInfo& op_info) const;
 
   // This family of routines counts the number of operations to perform the
   // specified TensorFlow Op.
@@ -125,7 +130,7 @@ class OpLevelCostEstimator {
   // implementation just divides the operations to
   // perform the op (from the "Count" routines,
   // above) by the device peak operations per
-  // second. Override to supply a better estimate.
+  // second.
   // Implementation of costs other than
   // execution_time is optional, depending on the
   // device.
@@ -139,6 +144,13 @@ class OpLevelCostEstimator {
   Costs PredictVariable(const OpContext& op_context) const;
   Costs PredictBatchMatMul(const OpContext& op_context) const;
   Costs PredictMetadata(const OpContext& op_context) const;
+  Costs PredictGatherOrSlice(const OpContext& op_context) const;
+  Costs PredictMaxPool(const OpContext& op_context) const;
+  Costs PredictMaxPoolGrad(const OpContext& op_context) const;
+  Costs PredictAvgPool(const OpContext& op_context) const;
+  Costs PredictAvgPoolGrad(const OpContext& op_context) const;
+  Costs PredictFusedBatchNorm(const OpContext& op_context) const;
+  Costs PredictFusedBatchNormGrad(const OpContext& op_context) const;
 
   // Utility function for safe division. Returns 0
   // if rhs is 0 or negative.
@@ -150,9 +162,15 @@ class OpLevelCostEstimator {
     }
   }
 
+  // For convolution and its grad ops.
   static ConvolutionDimensions ConvolutionDimensionsFromInputs(
       const TensorShapeProto& original_image_shape,
-      const TensorShapeProto& original_filter_shape, const OpInfo& op_features,
+      const TensorShapeProto& original_filter_shape, const OpInfo& op_info,
+      bool* found_unknown_shapes);
+
+  // For Pooling, FusedBatchNorm, and their grad ops.
+  static ConvolutionDimensions OpDimensionsFromInputs(
+      const TensorShapeProto& original_image_shape, const OpInfo& op_info,
       bool* found_unknown_shapes);
 
  protected:
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
index 4790b9bab2c7d67e7a29d45aaf9f964c470c63df..d797a8a8c1943133f7c92f01eb9a61ae0d1e3b4f 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
@@ -14,6 +14,8 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/costs/op_level_cost_estimator.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
@@ -55,28 +57,11 @@ OpContext DescribeMatMul(int m, int n, int l, int k) {
   return op_context;
 }
 
-// Returns an OpInfo for MatMul with unknown input shapes.
-OpContext DescribeMatMulUnknownShape() {
-  OpContext op_context;
-  SetCpuDevice(&op_context.op_info);
-  op_context.op_info.set_op("MatMul");
-
-  auto input = op_context.op_info.add_inputs();
-  auto shape = input->mutable_shape();
-  shape->set_unknown_rank(true);
-
-  input = op_context.op_info.add_inputs();
-  shape = input->mutable_shape();
-  shape->set_unknown_rank(true);
-
-  return op_context;
-}
-
 // Wrangles the minimum number of proto fields to set up an input of
 // arbitrary rank and type.
 void DescribeArbitraryRankInput(const std::vector<int>& dims, DataType dtype,
-                                OpInfo* op_features) {
-  auto input = op_features->add_inputs();
+                                OpInfo* op_info) {
+  auto input = op_info->add_inputs();
   input->set_dtype(dtype);
   auto shape = input->mutable_shape();
   for (auto d : dims) {
@@ -84,6 +69,18 @@ void DescribeArbitraryRankInput(const std::vector<int>& dims, DataType dtype,
   }
 }
 
+// Wrangles the minimum number of proto fields to set up an output of
+// arbitrary rank and type.
+void DescribeArbitraryRankOutput(const std::vector<int>& dims, DataType dtype,
+                                 OpInfo* op_info) {
+  auto output = op_info->add_outputs();
+  output->set_dtype(dtype);
+  auto shape = output->mutable_shape();
+  for (auto d : dims) {
+    shape->add_dim()->set_size(d);
+  }
+}
+
 // Returns an OpInfo for a BatchMatMul
 OpContext DescribeBatchMatMul(const std::vector<int>& dims_a,
                               const std::vector<int>& dims_b) {
@@ -174,6 +171,166 @@ OpContext DescribeBiasAdd(int size1, int size2) {
   return op_context;
 }
 
+int GetOutputSize(const int x, const int k, const int s,
+                  const string& padding) {
+  if (padding == "SAME") {
+    return (x + s - 1) / s;
+  } else {
+    return (x - k + s) / s;
+  }
+}
+
+std::vector<int> GetPoolingOutputSize(const std::vector<int>& input,
+                                      const std::vector<int>& ksize,
+                                      const std::vector<int>& strides,
+                                      const string& data_format,
+                                      const string& padding) {
+  // h, w, and c indices: default with NHWC.
+  int h_index = 1;
+  int w_index = 2;
+  int c_index = 3;
+  if (data_format == "NCHW") {
+    h_index = 2;
+    w_index = 3;
+    c_index = 1;
+  }
+  // Extract parameters.
+  int n = input[0];
+  int h = input[h_index];
+  int w = input[w_index];
+  int c = input[c_index];
+  int sx = strides[h_index];
+  int sy = strides[w_index];
+  int kx = ksize[h_index];
+  int ky = ksize[w_index];
+
+  // Output activation size: default with VALID padding.
+  int ho = GetOutputSize(h, kx, sx, padding);
+  int wo = GetOutputSize(w, ky, sy, padding);
+
+  std::vector<int> output;
+  if (data_format == "NHWC") {
+    output = {n, ho, wo, c};
+  } else {
+    output = {n, c, ho, wo};
+  }
+  return output;
+}
+
+// Helper functions for testing GetTensorShapeProtoFromTensorProto().
+void GetTensorProto(const DataType dtype, const std::vector<int64>& shape,
+                    const std::vector<int64> values, const bool tensor_content,
+                    TensorProto* tensor_proto) {
+  tensor_proto->Clear();
+  TensorProto temp_tensor_proto;
+  temp_tensor_proto.set_dtype(dtype);
+  for (const auto& x : shape) {
+    temp_tensor_proto.mutable_tensor_shape()->add_dim()->set_size(x);
+  }
+  for (const auto& x : values) {
+    if (dtype == DT_INT64) {
+      temp_tensor_proto.add_int64_val(x);
+    } else if (dtype == DT_INT32 || dtype == DT_INT16 || dtype == DT_INT8 ||
+               dtype == DT_UINT8) {
+      temp_tensor_proto.add_int_val(x);
+    } else if (dtype == DT_UINT32) {
+      temp_tensor_proto.add_uint32_val(x);
+    } else if (dtype == DT_UINT64) {
+      temp_tensor_proto.add_uint64_val(x);
+    } else {
+      CHECK(false) << "Unsupported dtype: " << dtype;
+    }
+  }
+  Tensor tensor(dtype);
+  CHECK(tensor.FromProto(temp_tensor_proto));
+  if (tensor_content) {
+    tensor.AsProtoTensorContent(tensor_proto);
+  } else {
+    tensor.AsProtoField(tensor_proto);
+  }
+}
+
+OpContext DescribePoolingOp(const string& op_name, const std::vector<int>& x,
+                            const std::vector<int>& ksize,
+                            const std::vector<int>& strides,
+                            const string& data_format, const string& padding) {
+  OpContext op_context;
+  auto& op_info = op_context.op_info;
+  SetCpuDevice(&op_info);
+  op_info.set_op(op_name);
+
+  const std::vector<int> y =
+      GetPoolingOutputSize(x, ksize, strides, data_format, padding);
+  if (op_name == "AvgPool" || op_name == "MaxPool") {
+    // input: x, output: y.
+    DescribeTensor4D(x[0], x[1], x[2], x[3], op_info.add_inputs());
+    DescribeTensor4D(y[0], y[1], y[2], y[3], op_info.add_outputs());
+  } else if (op_name == "AvgPoolGrad") {
+    // input: x's shape, y_grad, output: x_grad.
+    DescribeArbitraryRankInput({4}, DT_INT32, &op_info);
+    auto* tensor_proto = op_info.mutable_inputs(0)->mutable_value();
+    GetTensorProto(DT_INT32, {4}, {x[0], x[1], x[2], x[3]},
+                   /*tensor_content=*/false, tensor_proto);
+    DescribeTensor4D(y[0], y[1], y[2], y[3], op_info.add_inputs());
+    DescribeTensor4D(x[0], x[1], x[2], x[3], op_info.add_outputs());
+  } else if (op_name == "MaxPoolGrad") {
+    // input: x, y, y_grad, output: x_grad.
+    DescribeTensor4D(x[0], x[1], x[2], x[3], op_info.add_inputs());
+    DescribeTensor4D(y[0], y[1], y[2], y[3], op_info.add_inputs());
+    DescribeTensor4D(y[0], y[1], y[2], y[3], op_info.add_inputs());
+    DescribeTensor4D(x[0], x[1], x[2], x[3], op_info.add_outputs());
+  }
+  auto* attr = op_info.mutable_attr();
+  SetAttrValue(data_format, &(*attr)["data_format"]);
+  SetAttrValue(padding, &(*attr)["padding"]);
+  SetAttrValue(strides, &(*attr)["strides"]);
+  SetAttrValue(ksize, &(*attr)["ksize"]);
+  return op_context;
+}
+
+OpContext DescribeFusedBatchNorm(const bool is_training, const bool is_grad,
+                                 const std::vector<int>& x,
+                                 const string& data_format) {
+  // First, get MaxPool op info with unit stride and unit window.
+  OpContext op_context = DescribePoolingOp("MaxPool", x, {1, 1, 1, 1},
+                                           {1, 1, 1, 1}, data_format, "SAME");
+  auto& op_info = op_context.op_info;
+  // Override op name.
+  if (is_grad) {
+    op_info.set_op("FusedBatchNormGrad");
+  } else {
+    op_info.set_op("FusedBatchNorm");
+  }
+
+  // Add additional input output tensors.
+  if (is_grad) {
+    DescribeTensor4D(x[0], x[1], x[2], x[3], op_info.add_inputs());
+  }
+  int num_1d_inputs = is_grad ? 3 : 4;
+  for (int i = 0; i < num_1d_inputs; i++) {
+    auto* tensor = op_info.add_inputs();
+    auto* shape = tensor->mutable_shape();
+    shape->add_dim()->set_size(x[3]);
+    tensor->set_dtype(DT_FLOAT);
+  }
+  for (int i = 0; i < 4; i++) {
+    auto* tensor = op_info.add_outputs();
+    auto* shape = tensor->mutable_shape();
+    shape->add_dim()->set_size(x[3]);
+    tensor->set_dtype(DT_FLOAT);
+  }
+
+  // Delete unnecessary attr.
+  auto* attr = op_context.op_info.mutable_attr();
+  attr->erase("ksize");
+  attr->erase("strides");
+  attr->erase("padding");
+
+  // Additional attrs for FusedBatchNorm.
+  SetAttrValue(is_training, &(*attr)["is_training"]);
+
+  return op_context;
+}
 }  // namespace
 
 class OpLevelCostEstimatorTest : public ::testing::Test {
@@ -197,9 +354,104 @@ class OpLevelCostEstimatorTest : public ::testing::Test {
     estimator_.compute_memory_overlap_ = value;
   }
 
+  void ValidateOpDimensionsFromImputs(const int n, const int h, const int w,
+                                      const int c, const int kx, const int ky,
+                                      const int sx, const int sy,
+                                      const string& data_format,
+                                      const string& padding) {
+    OpContext op_context;
+    int ho;
+    int wo;
+    if (data_format == "NHWC") {
+      op_context = DescribePoolingOp("MaxPool", {n, h, w, c}, {1, kx, ky, 1},
+                                     {1, sx, sy, 1}, "NHWC", padding);
+      ho = op_context.op_info.outputs(0).shape().dim(1).size();
+      wo = op_context.op_info.outputs(0).shape().dim(2).size();
+    } else {
+      op_context = DescribePoolingOp("MaxPool", {n, c, h, w}, {1, 1, kx, ky},
+                                     {1, 1, sx, sy}, "NCHW", padding);
+      ho = op_context.op_info.outputs(0).shape().dim(2).size();
+      wo = op_context.op_info.outputs(0).shape().dim(3).size();
+    }
+
+    bool found_unknown_shapes;
+    auto dims = OpLevelCostEstimator::OpDimensionsFromInputs(
+        op_context.op_info.inputs(0).shape(), op_context.op_info,
+        &found_unknown_shapes);
+    Padding padding_enum;
+    if (padding == "VALID") {
+      padding_enum = Padding::VALID;
+    } else {
+      padding_enum = Padding::SAME;
+    }
+    EXPECT_EQ(n, dims.batch);
+    EXPECT_EQ(h, dims.ix);
+    EXPECT_EQ(w, dims.iy);
+    EXPECT_EQ(c, dims.iz);
+    EXPECT_EQ(kx, dims.kx);
+    EXPECT_EQ(ky, dims.ky);
+    EXPECT_EQ(sx, dims.sx);
+    EXPECT_EQ(sy, dims.sy);
+    EXPECT_EQ(ho, dims.ox);
+    EXPECT_EQ(wo, dims.oy);
+    EXPECT_EQ(c, dims.oz);
+    EXPECT_EQ(padding_enum, dims.padding);
+  }
+
   OpLevelCostEstimator estimator_;
 };
 
+TEST_F(OpLevelCostEstimatorTest, TestGatherCosts) {
+  OpContext op_context;
+  SetCpuDevice(&op_context.op_info);
+  op_context.op_info.set_op("Gather");
+
+  // Huge first input shouldn't affect Gather execution and memory costs.
+  DescribeArbitraryRankInput({10000000, 10}, DT_FLOAT, &op_context.op_info);
+  DescribeArbitraryRankInput({16}, DT_INT64, &op_context.op_info);
+  DescribeArbitraryRankOutput({16, 10}, DT_FLOAT, &op_context.op_info);
+
+  auto cost = estimator_.PredictCosts(op_context);
+  EXPECT_EQ(Costs::Duration(130), cost.memory_time);
+  EXPECT_EQ(Costs::Duration(16), cost.compute_time);
+  EXPECT_EQ(Costs::Duration(146), cost.execution_time);
+  EXPECT_FALSE(cost.inaccurate);
+}
+
+TEST_F(OpLevelCostEstimatorTest, TestGatherCostsWithoutOutput) {
+  OpContext op_context;
+  SetCpuDevice(&op_context.op_info);
+  op_context.op_info.set_op("Gather");
+
+  // Huge first input shouldn't affect Gather execution and memory costs.
+  DescribeArbitraryRankInput({10000000, 10}, DT_FLOAT, &op_context.op_info);
+  DescribeArbitraryRankInput({16}, DT_INT64, &op_context.op_info);
+
+  auto cost = estimator_.PredictCosts(op_context);
+  EXPECT_EQ(Costs::Duration(0), cost.memory_time);
+  EXPECT_EQ(Costs::Duration(0), cost.compute_time);
+  EXPECT_EQ(Costs::Duration(0), cost.execution_time);
+  EXPECT_TRUE(cost.inaccurate);
+}
+
+TEST_F(OpLevelCostEstimatorTest, TestSliceCosts) {
+  OpContext op_context;
+  SetCpuDevice(&op_context.op_info);
+  op_context.op_info.set_op("Slice");
+
+  // Huge first input shouldn't affect Slice execution and memory costs.
+  DescribeArbitraryRankInput({10000000, 10}, DT_FLOAT, &op_context.op_info);
+  DescribeArbitraryRankInput({2}, DT_INT64, &op_context.op_info);
+  DescribeArbitraryRankInput({2}, DT_INT64, &op_context.op_info);
+  DescribeArbitraryRankOutput({10, 10}, DT_FLOAT, &op_context.op_info);
+
+  auto cost = estimator_.PredictCosts(op_context);
+  EXPECT_EQ(Costs::Duration(81), cost.memory_time);
+  EXPECT_EQ(Costs::Duration(10), cost.compute_time);
+  EXPECT_EQ(Costs::Duration(91), cost.execution_time);
+  EXPECT_FALSE(cost.inaccurate);
+}
+
 TEST_F(OpLevelCostEstimatorTest, BiasAddExecutionTime) {
   auto cost = PredictCosts(DescribeBiasAdd(1000, 10));
   EXPECT_EQ(Costs::Duration(8400), cost.memory_time);
@@ -307,39 +559,6 @@ TEST_F(OpLevelCostEstimatorTest, BatchMatMul) {
   EXPECT_NE(matmul_inaccurate, batch_matmul_inaccurate);
 }
 
-// Helper functions for testing GetTensorShapeProtoFromTensorProto().
-void GetTensorProto(const DataType dtype, const std::vector<int64>& shape,
-                    const std::vector<int64> values, const bool tensor_content,
-                    TensorProto* tensor_proto) {
-  tensor_proto->Clear();
-  TensorProto temp_tensor_proto;
-  temp_tensor_proto.set_dtype(dtype);
-  for (const auto& x : shape) {
-    temp_tensor_proto.mutable_tensor_shape()->add_dim()->set_size(x);
-  }
-  for (const auto& x : values) {
-    if (dtype == DT_INT64) {
-      temp_tensor_proto.add_int64_val(x);
-    } else if (dtype == DT_INT32 || dtype == DT_INT16 || dtype == DT_INT8 ||
-               dtype == DT_UINT8) {
-      temp_tensor_proto.add_int_val(x);
-    } else if (dtype == DT_UINT32) {
-      temp_tensor_proto.add_uint32_val(x);
-    } else if (dtype == DT_UINT64) {
-      temp_tensor_proto.add_uint64_val(x);
-    } else {
-      CHECK(false) << "Unsupported dtype: " << dtype;
-    }
-  }
-  Tensor tensor(dtype);
-  CHECK(tensor.FromProto(temp_tensor_proto));
-  if (tensor_content) {
-    tensor.AsProtoTensorContent(tensor_proto);
-  } else {
-    tensor.AsProtoField(tensor_proto);
-  }
-}
-
 void ExpectTensorShape(const std::vector<int64>& expected,
                        const TensorShapeProto& tensor_shape_proto) {
   TensorShape tensor_shape_expected(expected);
@@ -354,7 +573,7 @@ TEST_F(OpLevelCostEstimatorTest, GetTensorShapeProtoFromTensorProto) {
   TensorProto tensor_proto;
   TensorShapeProto tensor_shape_proto;
 
-  // Dimention larger than max value; should fail while converting to Tensor
+  // Dimension larger than max value; should fail while converting to Tensor
   // class.
   tensor_proto.mutable_tensor_shape()->add_dim()->set_size(255);
   EXPECT_FALSE(
@@ -410,5 +629,226 @@ TEST_F(OpLevelCostEstimatorTest, GetTensorShapeProtoFromTensorProto) {
   }
 }
 
+TEST_F(OpLevelCostEstimatorTest, OpDimensionsFromInputs) {
+  std::vector<string> paddings = {"VALID", "SAME"};
+  std::vector<string> formats = {"NHWC", "NCHW"};
+  for (const auto& p : paddings) {
+    for (const auto& f : formats) {
+      // n, h, w, c, kx, ky, sx, sy, data_format, padding.
+      ValidateOpDimensionsFromImputs(10, 20, 20, 100, 3, 3, 2, 2, f, p);
+      ValidateOpDimensionsFromImputs(10, 20, 20, 100, 1, 1, 3, 3, f, p);
+      ValidateOpDimensionsFromImputs(10, 200, 200, 100, 5, 5, 3, 3, f, p);
+      ValidateOpDimensionsFromImputs(10, 14, 14, 3840, 3, 3, 2, 2, f, p);
+    }
+  }
+}
+
+TEST_F(OpLevelCostEstimatorTest, PredictMaxPool) {
+  auto predict_max_pool = [this](const int n, const int in, const int c,
+                                 const int k, const int s,
+                                 const string& padding) -> Costs {
+    OpContext op_context = DescribePoolingOp(
+        "MaxPool", {n, in, in, c}, {1, k, k, 1}, {1, s, s, 1}, "NHWC", padding);
+    return estimator_.PredictCosts(op_context);
+  };
+
+  {
+    // Typical 3xz3 window with 2x2 stride.
+    auto costs = predict_max_pool(10, 20, 384, 3, 2, "SAME");
+    EXPECT_EQ(Costs::Duration(1075200), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(307200), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(768000), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+  {
+    // 1x1 window with 2x2 stride: used for shortcut in resnet-50.
+    auto costs = predict_max_pool(10, 20, 384, 1, 2, "SAME");
+    EXPECT_EQ(Costs::Duration(499200), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(38400), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(460800), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+  {
+    // 2x2 window with 3x3 stride.
+    auto costs = predict_max_pool(10, 20, 384, 2, 3, "VALID");
+    EXPECT_EQ(Costs::Duration(561792), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(56448), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(505344), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+}
+
+TEST_F(OpLevelCostEstimatorTest, PredictMaxPoolGrad) {
+  auto predict_max_pool_grad = [this](const int n, const int in, const int c,
+                                      const int k, const int s,
+                                      const string& padding) -> Costs {
+    OpContext op_context =
+        DescribePoolingOp("MaxPoolGrad", {n, in, in, c}, {1, k, k, 1},
+                          {1, s, s, 1}, "NHWC", padding);
+    return estimator_.PredictCosts(op_context);
+  };
+
+  {
+    // Typical 3xz3 window with 2x2 stride.
+    auto costs = predict_max_pool_grad(10, 20, 384, 3, 2, "SAME");
+    EXPECT_EQ(Costs::Duration(1996800), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(614400), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(1382400), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+  {
+    // 1x1 window with 2x2 stride: used for shortcut in resnet-50.
+    auto costs = predict_max_pool_grad(10, 20, 384, 1, 2, "SAME");
+    EXPECT_EQ(Costs::Duration(1536000), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(153600), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(1382400), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+  {
+    // 2x2 window with 3x3 stride.
+    auto costs = predict_max_pool_grad(10, 20, 384, 2, 3, "VALID");
+    EXPECT_EQ(Costs::Duration(1514112), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(210048), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(1304064), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+}
+
+TEST_F(OpLevelCostEstimatorTest, PredictAvgPool) {
+  auto predict_avg_pool = [this](const int n, const int in, const int c,
+                                 const int k, const int s,
+                                 const string& padding) -> Costs {
+    OpContext op_context = DescribePoolingOp(
+        "AvgPool", {n, in, in, c}, {1, k, k, 1}, {1, s, s, 1}, "NHWC", padding);
+    return estimator_.PredictCosts(op_context);
+  };
+
+  {
+    // Typical 3xz3 window with 2x2 stride.
+    auto costs = predict_avg_pool(10, 20, 384, 3, 2, "SAME");
+    EXPECT_EQ(Costs::Duration(1113600), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(345600), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(768000), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+  {
+    // 1x1 window with 2x2 stride: used for shortcut in resnet-50.
+    auto costs = predict_avg_pool(10, 20, 384, 1, 2, "SAME");
+    EXPECT_EQ(Costs::Duration(499200), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(38400), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(460800), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+  {
+    // 2x2 window with 3x3 stride.
+    auto costs = predict_avg_pool(10, 20, 384, 2, 3, "VALID");
+    EXPECT_EQ(Costs::Duration(580608), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(75264), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(505344), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+}
+
+TEST_F(OpLevelCostEstimatorTest, PredictAvgPoolGrad) {
+  auto predict_avg_pool_grad = [this](const int n, const int in, const int c,
+                                      const int k, const int s,
+                                      const string& padding) -> Costs {
+    OpContext op_context =
+        DescribePoolingOp("AvgPoolGrad", {n, in, in, c}, {1, k, k, 1},
+                          {1, s, s, 1}, "NHWC", padding);
+    return estimator_.PredictCosts(op_context);
+  };
+
+  {
+    // Typical 3xz3 window with 2x2 stride.
+    auto costs = predict_avg_pool_grad(10, 20, 384, 3, 2, "SAME");
+    EXPECT_EQ(Costs::Duration(1305602), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(537600), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(768002), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+  {
+    // 1x1 window with 2x2 stride: used for shortcut in resnet-50.
+    auto costs = predict_avg_pool_grad(10, 20, 384, 1, 2, "SAME");
+    EXPECT_EQ(Costs::Duration(960002), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(192000), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(768002), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+  {
+    // 2x2 window with 3x3 stride.
+    auto costs = predict_avg_pool_grad(10, 20, 384, 2, 3, "VALID");
+    EXPECT_EQ(Costs::Duration(862082), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(172416), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(689666), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+}
+
+TEST_F(OpLevelCostEstimatorTest, PredictFusedBatchNorm) {
+  auto predict_fused_bn = [this](const int n, const int in, const int c,
+                                 const bool is_training) -> Costs {
+    OpContext op_context = DescribeFusedBatchNorm(
+        is_training, /*is_grad=*/false, {n, in, in, c}, "NHWC");
+    return estimator_.PredictCosts(op_context);
+  };
+
+  {
+    auto costs = predict_fused_bn(10, 20, 96, /*is_training=*/true);
+    EXPECT_EQ(Costs::Duration(614737), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(153706), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(461031), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+
+  {
+    auto costs = predict_fused_bn(10, 20, 32, /*is_training=*/true);
+    EXPECT_EQ(Costs::Duration(204913), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(51236), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(153677), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+
+  {
+    auto costs = predict_fused_bn(10, 20, 96, /*is_training=*/false);
+    EXPECT_EQ(Costs::Duration(384154), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(76800), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(307354), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+
+  {
+    auto costs = predict_fused_bn(10, 20, 32, /*is_training=*/false);
+    EXPECT_EQ(Costs::Duration(128052), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(25600), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(102452), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+}
+
+TEST_F(OpLevelCostEstimatorTest, PredictFusedBatchNormGrad) {
+  auto predict_fused_bn_grad = [this](const int n, const int in,
+                                      const int c) -> Costs {
+    OpContext op_context = DescribeFusedBatchNorm(
+        /*is_training=*/false, /*is_grad=*/true, {n, in, in, c}, "NHWC");
+    return estimator_.PredictCosts(op_context);
+  };
+
+  {
+    auto costs = predict_fused_bn_grad(10, 20, 96);
+    EXPECT_EQ(Costs::Duration(1037050), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(422496), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(614554), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+
+  {
+    auto costs = predict_fused_bn_grad(128, 7, 384);
+    EXPECT_EQ(Costs::Duration(6503809), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(2649677), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(3854132), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+}
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/utils.cc b/tensorflow/core/grappler/costs/utils.cc
index 076945d5c626b9609448e339fcbd96de3e9d137f..f318e3911c2654412fc315e3a0356181e2b49333 100644
--- a/tensorflow/core/grappler/costs/utils.cc
+++ b/tensorflow/core/grappler/costs/utils.cc
@@ -212,8 +212,8 @@ DeviceProperties GetDeviceInfo(const string& device_str) {
       CudaGpuId cuda_gpu_id;
       Status s = GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id);
       if (!s.ok()) {
-        LOG(ERROR) << s;
-        return unknown;
+        // We are probably running simulation without linking cuda libraries.
+        cuda_gpu_id = CudaGpuId(parsed.id);
       }
       return GetLocalGPUInfo(cuda_gpu_id);
     } else if (parsed.type == "CPU") {
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.cc b/tensorflow/core/grappler/costs/virtual_scheduler.cc
index 3ac3ae0f8f835226bbc3ec5d6cec6cb890a6998f..0e5c654acfa59a3c1cba5d853dcf9fb87554bbaf 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.cc
@@ -44,6 +44,8 @@ Costs CombineCosts(const Costs& left, const Costs& right) {
 
   Costs result = left;
   result.execution_time += right.execution_time;
+  result.compute_time += right.compute_time;
+  result.memory_time += right.memory_time;
   if (right.inaccurate) {
     result.inaccurate = true;
   }
@@ -841,6 +843,8 @@ bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
 Costs VirtualScheduler::Summary() const {
   // Print out basic execution summary.
   VLOG(1) << "Expected execution time: " << graph_costs_.execution_time.count();
+  VLOG(1) << "Expected compute time: " << graph_costs_.compute_time.count();
+  VLOG(1) << "Expected memory time: " << graph_costs_.memory_time.count();
   VLOG(1) << "Expected max memory: " << graph_costs_.max_memory;
   VLOG(1) << "Expected max per-op buffers: " << graph_costs_.max_per_op_buffers;
   VLOG(1) << "Expected max per-op streaming buffers: "
diff --git a/tensorflow/core/grappler/grappler_item_builder.cc b/tensorflow/core/grappler/grappler_item_builder.cc
index d7b300321a5e6ad2271370902ca0d26012c0ff10..288587ce9b357d0056de428f5abc653cc4b91ea2 100644
--- a/tensorflow/core/grappler/grappler_item_builder.cc
+++ b/tensorflow/core/grappler/grappler_item_builder.cc
@@ -78,7 +78,7 @@ void InitializeTensor(DataType type, Tensor* tensor) {
 // correct optimizations.
 Status OptimizeGraph(const GraphDef& graph_def_arg, GraphDef* output_graph_def,
                      const ItemConfig& cfg) {
-  if (!cfg.apply_optimizations && !cfg.inline_functions) {
+  if (!cfg.apply_optimizations && !cfg.erase_noinline_attributes) {
     return Status::OK();
   }
 
@@ -88,7 +88,7 @@ Status OptimizeGraph(const GraphDef& graph_def_arg, GraphDef* output_graph_def,
   // Make a local copy of graph def, because we need to change some things.
   GraphDef graph_def(graph_def_arg);
 
-  if (cfg.inline_functions && cfg.erase_noinline_attributes) {
+  if (cfg.erase_noinline_attributes) {
     // TF optimizer doesn't inline functions with "_noinline" attribute,
     // so let's go over the function library and erase it.
     for (auto& func : *graph_def.mutable_library()->mutable_function()) {
@@ -113,7 +113,6 @@ Status OptimizeGraph(const GraphDef& graph_def_arg, GraphDef* output_graph_def,
   } else {
     optimizer_opts->set_opt_level(::tensorflow::OptimizerOptions_Level_L0);
   }
-  optimizer_opts->set_do_function_inlining(cfg.inline_functions);
 
   // Create the function library runtime.
   std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(
diff --git a/tensorflow/core/grappler/grappler_item_builder.h b/tensorflow/core/grappler/grappler_item_builder.h
index c877d911636d8620e7951a5d8279e426d109b2d3..6d181e49e67acaae116c5f5af9365dba238994e8 100644
--- a/tensorflow/core/grappler/grappler_item_builder.h
+++ b/tensorflow/core/grappler/grappler_item_builder.h
@@ -40,8 +40,6 @@ struct ItemConfig {
   int placeholder_unknown_output_shape_dim = -1;
   // If true, does L1 optimizations.
   bool apply_optimizations = false;
-  // If true, does inlining.
-  bool inline_functions = false;
   // If true, erases all "_noinline" attributes from user-defined functions.
   // Has no effect if "inline_functions" is disabled.
   bool erase_noinline_attributes = false;
diff --git a/tensorflow/core/grappler/grappler_item_builder_test.cc b/tensorflow/core/grappler/grappler_item_builder_test.cc
index 29488e4b7e2d187036d4ef7bf7eb29a0dea7fcbf..4b90bf3038df2900315aa32e32a6635d834e4403 100644
--- a/tensorflow/core/grappler/grappler_item_builder_test.cc
+++ b/tensorflow/core/grappler/grappler_item_builder_test.cc
@@ -35,96 +35,6 @@ namespace {
 
 class GrapplerItemBuilderTest : public ::testing::Test {};
 
-// Create a sample graph with a symbolic gradient for sum.
-void SampleSumSymbolicGradientGraphdef(
-    GraphDef *def, CollectionDef *fetches,
-    std::vector<string> *names_of_ops_of_inline) {
-  using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
-
-  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
-
-  auto dummy_variable = Variable(scope, {2, 2}, DT_FLOAT);
-  auto x = Const(scope, 1.0f);
-  auto y = Const(scope, 2);
-  auto z = Const(scope, 3.0f);
-  TF_ASSERT_OK(scope.status());
-
-  NameAttrList fn;
-  fn.set_name("Sum");
-  (*fn.mutable_attr())["T"].set_type(DT_FLOAT);
-  auto g0 = SymbolicGradient(scope, std::initializer_list<Input>{x, y, z},
-                             {DT_FLOAT, DT_INT32}, fn);
-
-  // TODO(bsteiner): we should rewrite the feed/fetch nodes to reflect the
-  // inlining that's done in the item builder
-  // fetches->mutable_node_list()->add_value(g0[0].name());
-  fetches->mutable_node_list()->add_value("SymbolicGradient/dx");
-  fetches->mutable_node_list()->add_value("SymbolicGradient/dy_reshaped");
-
-  TF_CHECK_OK(scope.ToGraphDef(def));
-
-  // Add names of the ops that replace the Mul symbolic gradient during
-  // inlining. This is for validation.
-  *names_of_ops_of_inline = {
-      "SymbolicGradient/dx",          "SymbolicGradient/tile_scaling",
-      "SymbolicGradient/dy_reshaped", "SymbolicGradient/y_shape",
-      "SymbolicGradient/x_shape",     "SymbolicGradient/stitch_idx0",
-      "SymbolicGradient/x_rank",      "SymbolicGradient/stitch_val1",
-      "SymbolicGradient/i_shape",     "SymbolicGradient/di",
-      "SymbolicGradient/zero",        "SymbolicGradient/one"};
-}
-
-std::unique_ptr<GrapplerItem> CreateGrapplerItem(const GraphDef &def,
-                                                 const CollectionDef &fetches) {
-  MetaGraphDef meta_def;
-  ItemConfig cfg;
-  cfg.inline_functions = true;
-  *meta_def.mutable_graph_def() = def;
-  (*meta_def.mutable_collection_def())["train_op"] = fetches;
-  return GrapplerItemFromMetaGraphDef("0", meta_def, cfg);
-}
-
-int CountSymbolicGradientOps(const std::unique_ptr<GrapplerItem> &item) {
-  int n_symb_grads = 0;
-  for (const auto &node : item->graph.node()) {
-    if (node.op() == FunctionLibraryDefinition::kGradientOp) {
-      n_symb_grads++;
-    }
-  }
-  return n_symb_grads;
-}
-
-int CountOpsWithNames(const std::unique_ptr<GrapplerItem> &item,
-                      const std::vector<string> &names) {
-  std::set<string> names_set(names.begin(), names.end());
-  int n_with_names = 0;
-  for (const auto &node : item->graph.node()) {
-    if (names_set.find(node.name()) != names_set.end()) {
-      n_with_names++;
-    }
-  }
-  return n_with_names;
-}
-
-TEST_F(GrapplerItemBuilderTest, SymbolicGradientInlining) {
-  // Create sample sum symbolic gradient graph.
-  GraphDef def;
-  CollectionDef fetches;
-  std::vector<string> ops_of_inline;
-  SampleSumSymbolicGradientGraphdef(&def, &fetches, &ops_of_inline);
-
-  // Create the inlined graph.
-  std::unique_ptr<GrapplerItem> with_inline = CreateGrapplerItem(def, fetches);
-
-  // For the inlined graph, there should be 0 symbolic gradient ops.
-  EXPECT_EQ(0, CountSymbolicGradientOps(with_inline));
-
-  // For the inlined graph, make sure all the required expanded op’s are in the
-  // graph.
-  EXPECT_EQ(ops_of_inline.size(),
-            CountOpsWithNames(with_inline, ops_of_inline));
-}
-
 TEST_F(GrapplerItemBuilderTest, AssetFilepathOverrideTest) {
   MetaGraphDef meta_graph;
 
@@ -273,7 +183,6 @@ TEST_F(GrapplerItemBuilderTest, GraphWithFunctions) {
   (*meta_graph.mutable_collection_def())["train_op"] = train_op;
 
   ItemConfig cfg;
-  cfg.inline_functions = false;
 
   std::unique_ptr<GrapplerItem> item =
       GrapplerItemFromMetaGraphDef("0", meta_graph, cfg);
@@ -294,7 +203,6 @@ TEST_F(GrapplerItemBuilderTest, GraphWithCustomOps) {
   (*meta_graph.mutable_collection_def())["train_op"] = train_op;
 
   ItemConfig cfg;
-  cfg.inline_functions = false;
 
   std::unique_ptr<GrapplerItem> item =
       GrapplerItemFromMetaGraphDef("0", meta_graph, cfg);
diff --git a/tensorflow/core/grappler/inputs/BUILD b/tensorflow/core/grappler/inputs/BUILD
index b683216590ede887d9c65003a23e712e0d612622..ffa204028cca828147810c99277fdcd9cb05f5ee 100644
--- a/tensorflow/core/grappler/inputs/BUILD
+++ b/tensorflow/core/grappler/inputs/BUILD
@@ -2,18 +2,6 @@ licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 cc_library(
     name = "utils",
     srcs = [
diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc
index 9c9600db5e33f4d8619d5fa555dfb3e1bb849cd3..a24d2dbd9f767e7973651f6c954f78c7d80d7978 100644
--- a/tensorflow/core/grappler/op_types.cc
+++ b/tensorflow/core/grappler/op_types.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -68,6 +69,10 @@ bool IsBitcast(const NodeDef& node) { return node.op() == "Bitcast"; }
 
 bool IsCast(const NodeDef& node) { return node.op() == "Cast"; }
 
+bool IsCheckNumerics(const NodeDef& node) {
+  return node.op() == "CheckNumerics";
+}
+
 bool IsComplex(const NodeDef& node) { return node.op() == "Complex"; }
 
 bool IsComplexAbs(const NodeDef& node) { return node.op() == "ComplexAbs"; }
@@ -212,6 +217,8 @@ bool IsMod(const NodeDef& node) { return node.op() == "Mod"; }
 
 bool IsMul(const NodeDef& node) { return node.op() == "Mul"; }
 
+bool IsNeg(const NodeDef& node) { return node.op() == "Neg"; }
+
 bool IsNoOp(const NodeDef& node) { return node.op() == "NoOp"; }
 
 bool IsNotEqual(const NodeDef& node) { return node.op() == "NotEqual"; }
@@ -307,6 +314,8 @@ bool IsSplitV(const NodeDef& node) { return node.op() == "SplitV"; }
 
 bool IsSqrtGrad(const NodeDef& node) { return node.op() == "SqrtGrad"; }
 
+bool IsSquare(const NodeDef& node) { return node.op() == "Square"; }
+
 bool IsSquaredDifference(const NodeDef& node) {
   return node.op() == "SquaredDifference";
 }
@@ -356,6 +365,8 @@ bool IsTruncateDiv(const NodeDef& node) { return node.op() == "TruncateDiv"; }
 
 bool IsTruncateMod(const NodeDef& node) { return node.op() == "TruncateMod"; }
 
+bool IsUnpack(const NodeDef& node) { return node.op() == "Unpack"; }
+
 bool IsVariable(const NodeDef& node) {
   const auto& op = node.op();
   return op == "Variable" || op == "VariableV2" || op == "AutoReloadVariable" ||
@@ -394,12 +405,27 @@ bool IsFreeOfSideEffect(const NodeDef& node) {
       return false;
     }
   }
+  return !ModifiesInputsInPlace(node);
+}
+
+bool ModifiesInputsInPlace(const NodeDef& node) {
   // Some nodes do in-place updates on regular tensor inputs.
-  if (GetBoolAttr(node, "in_place") || GetBoolAttr(node, "inplace") ||
-      StringPiece(op_name).starts_with("Inplace")) {
+  string op_name = node.op();
+
+  // Ops that modify resource variables effectively modify one of their inputs.
+  if (op_name == "AssignVariableOp" || op_name == "AssignAddVariableOp" ||
+      op_name == "AssignSubVariableOp" || op_name == "ResourceScatterUpdate" ||
+      op_name == "ResourceScatterAdd" || op_name == "ResourceScatterSub" ||
+      op_name == "ResourceScatterMul" || op_name == "ResourceScatterDiv" ||
+      op_name == "ResourceScatterMin" || op_name == "ResourceScatterMax") {
     return false;
   }
-  return true;
+
+  std::transform(op_name.begin(), op_name.end(), op_name.begin(), ::tolower);
+  if (str_util::StrContains(op_name, "inplace")) {
+    return true;
+  }
+  return GetBoolAttr(node, "in_place") || GetBoolAttr(node, "inplace");
 }
 
 bool ModifiesFrameInfo(const NodeDef& node) {
diff --git a/tensorflow/core/grappler/op_types.h b/tensorflow/core/grappler/op_types.h
index 41ba8bb01e59c82afb2f0a40788b7aeb82421251..8667f72c7ecd213d61c92edabc62610a7e7f1595 100644
--- a/tensorflow/core/grappler/op_types.h
+++ b/tensorflow/core/grappler/op_types.h
@@ -37,6 +37,7 @@ bool IsBiasAdd(const NodeDef& node);
 bool IsBiasAddGrad(const NodeDef& node);
 bool IsBitcast(const NodeDef& node);
 bool IsCast(const NodeDef& node);
+bool IsCheckNumerics(const NodeDef& node);
 bool IsComplex(const NodeDef& node);
 bool IsComplexAbs(const NodeDef& node);
 bool IsConj(const NodeDef& node);
@@ -89,6 +90,7 @@ bool IsNextIteration(const NodeDef& node);
 bool IsPack(const NodeDef& node);
 bool IsPad(const NodeDef& node);
 bool IsPack(const NodeDef& node);
+bool IsNeg(const NodeDef& node);
 bool IsNoOp(const NodeDef& node);
 bool IsNotEqual(const NodeDef& node);
 bool IsPlaceholder(const NodeDef& node);
@@ -120,6 +122,7 @@ bool IsSoftsignGrad(const NodeDef& node);
 bool IsSplit(const NodeDef& node);
 bool IsSplitV(const NodeDef& node);
 bool IsSqrtGrad(const NodeDef& node);
+bool IsSquare(const NodeDef& node);
 bool IsSquaredDifference(const NodeDef& node);
 bool IsSqueeze(const NodeDef& node);
 bool IsStackOp(const NodeDef& node);
@@ -137,6 +140,7 @@ bool IsTile(const NodeDef& node);
 bool IsTranspose(const NodeDef& node);
 bool IsTruncateDiv(const NodeDef& node);
 bool IsTruncateMod(const NodeDef& node);
+bool IsUnpack(const NodeDef& node);
 bool IsVariable(const NodeDef& node);
 bool IsZeta(const NodeDef& node);
 
@@ -153,8 +157,12 @@ bool IsCommutative(const NodeDef& node);
 bool IsPersistent(const NodeDef& node);
 
 bool IsFreeOfSideEffect(const NodeDef& node);
+
 bool ModifiesFrameInfo(const NodeDef& node);
 
+// Returns true if the op is known to write to one or more of its inputs.
+bool ModifiesInputsInPlace(const NodeDef& node);
+
 // Returns true if the op is an element-wise involution, i.e. if it is its
 // own inverse such that f(f(x)) == x.
 bool IsInvolution(const NodeDef& node);
diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index fe095a725a2fb1fdaf52cbb70432808e5cb1d36a..122fd48584f6170da7ff175f4a8b5ed2e436623f 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -1,20 +1,15 @@
 licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
-load("//tensorflow:tensorflow.bzl", "tf_cc_test_gpu")
+load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
 load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
+load("//tensorflow:tensorflow.bzl", "tf_cuda_only_cc_test")
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
+# Platform specific build config
+load(
+    "//tensorflow/core:platform/default/build_config.bzl",
+    "tf_protos_grappler",
 )
 
 cc_library(
@@ -38,7 +33,7 @@ cc_library(
     ],
 )
 
-tf_cc_test(
+tf_cuda_cc_test(
     name = "static_schedule_test",
     srcs = ["static_schedule_test.cc"],
     deps = [
@@ -73,7 +68,7 @@ cc_library(
     ],
 )
 
-tf_cc_test(
+tf_cuda_cc_test(
     name = "auto_parallel_test",
     srcs = ["auto_parallel_test.cc"],
     deps = [
@@ -141,6 +136,8 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":graph_optimizer",
+        "//tensorflow/core:core_cpu_base",
+        "//tensorflow/core:framework",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
@@ -149,13 +146,14 @@ cc_library(
     ],
 )
 
-tf_cc_test(
+tf_cuda_cc_test(
     name = "function_optimizer_test",
     srcs = ["function_optimizer_test.cc"],
     deps = [
         ":function_optimizer",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:cc_ops_internal",
+        "//tensorflow/cc:functional_ops",
         "//tensorflow/core:all_kernels",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
@@ -199,6 +197,37 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "graph_optimizer_stage",
+    srcs = ["graph_optimizer_stage.cc"],
+    hdrs = ["graph_optimizer_stage.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/costs:graph_properties",
+        "//tensorflow/core/grappler/utils:frame",
+    ],
+)
+
+tf_cuda_cc_test(
+    name = "graph_optimizer_stage_test",
+    size = "small",
+    srcs = ["graph_optimizer_stage_test.cc"],
+    deps = [
+        ":graph_optimizer_stage",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/costs:graph_properties",
+        "//tensorflow/core/grappler/utils:grappler_test",
+    ],
+)
+
 cc_library(
     name = "custom_graph_optimizer",
     hdrs = [
@@ -221,6 +250,8 @@ cc_library(
     deps = [
         ":constant_folding",
         ":graph_optimizer",
+        ":graph_optimizer_stage",
+        ":symbolic_shapes",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
@@ -230,10 +261,11 @@ cc_library(
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/costs:graph_properties",
         "//tensorflow/core/grappler/utils:frame",
+        "//tensorflow/core/grappler/utils:topological_sort",
     ],
 )
 
-tf_cc_test(
+tf_cuda_cc_test(
     name = "arithmetic_optimizer_test",
     size = "small",
     srcs = ["arithmetic_optimizer_test.cc"],
@@ -242,9 +274,15 @@ tf_cc_test(
         ":constant_folding",
         ":model_pruner",
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:cc_ops_internal",
+        "//tensorflow/core:all_kernels",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
@@ -274,7 +312,7 @@ cc_library(
     ],
 )
 
-tf_cc_test(
+tf_cuda_cc_test(
     name = "dependency_optimizer_test",
     size = "small",
     srcs = ["dependency_optimizer_test.cc"],
@@ -310,7 +348,7 @@ cc_library(
     ],
 )
 
-tf_cc_test(
+tf_cuda_cc_test(
     name = "model_pruner_test",
     srcs = ["model_pruner_test.cc"],
     deps = [
@@ -330,6 +368,7 @@ tf_kernel_library(
     srcs = [
         "gpu_swapping_kernels.cc",
     ],
+    visibility = ["//tensorflow:__subpackages__"],
     deps = [
         "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
@@ -342,6 +381,7 @@ cc_library(
     srcs = [
         "gpu_swapping_ops.cc",
     ],
+    visibility = ["//tensorflow:__subpackages__"],
     deps = [
         "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
@@ -375,17 +415,16 @@ cc_library(
         "//tensorflow/core/grappler/costs:graph_properties",
         "//tensorflow/core/grappler/utils:topological_sort",
         "//tensorflow/core/grappler/utils:traversal",
-    ] + if_cuda([
-        ":gpu_swapping_kernels",
-        ":gpu_swapping_ops",
-    ]),
+    ],
 )
 
-tf_cc_test_gpu(
+tf_cuda_only_cc_test(
     name = "memory_optimizer_test",
     srcs = ["memory_optimizer_test.cc"],
-    tags = ["no_cuda_on_cpu_tap"],
+    tags = ["no_cuda_on_cpu_tap"],  # Do not re-enable again without actually testing.
     deps = [
+        ":gpu_swapping_kernels",
+        ":gpu_swapping_ops",
         ":memory_optimizer",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/core:ops",
@@ -423,7 +462,7 @@ cc_library(
     ],
 )
 
-tf_cc_test(
+tf_cuda_cc_test(
     name = "layout_optimizer_test",
     srcs = ["layout_optimizer_test.cc"],
     deps = [
@@ -457,6 +496,7 @@ cc_library(
         ":constant_folding",
         ":custom_graph_optimizer",
         ":custom_graph_optimizer_registry",
+        ":debug_stripper",
         ":dependency_optimizer",
         ":function_optimizer",
         ":graph_optimizer",
@@ -468,11 +508,12 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler/utils:colocation",
         "//tensorflow/core/grappler/utils:topological_sort",
     ],
 )
 
-tf_cc_test(
+tf_cuda_cc_test(
     name = "meta_optimizer_test",
     srcs = ["meta_optimizer_test.cc"],
     deps = [
@@ -501,7 +542,7 @@ cc_library(
     ],
 )
 
-tf_cc_test(
+tf_cuda_cc_test(
     name = "custom_graph_optimizer_registry_test",
     size = "small",
     srcs = ["custom_graph_optimizer_registry_test.cc"],
@@ -536,15 +577,9 @@ cc_library(
     ],
 )
 
-tf_cc_test(
+tf_cuda_cc_test(
     name = "loop_optimizer_test",
-    size = "small",
     srcs = ["loop_optimizer_test.cc"],
-    tags = [
-        "manual",
-        "no_oss",  # b/74111495
-        "notap",
-    ],
     deps = [
         ":loop_optimizer",
         "//tensorflow/cc:cc_ops",
@@ -554,5 +589,65 @@ tf_cc_test(
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
+        "//tensorflow/core/grappler/utils:grappler_test",
+    ],
+)
+
+cc_library(
+    name = "symbolic_shapes",
+    srcs = ["symbolic_shapes.cc"],
+    hdrs = ["symbolic_shapes.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:protos_all_cc",
+    ] + tf_protos_grappler(),
+)
+
+tf_cc_test(
+    name = "symbolic_shapes_test",
+    srcs = ["symbolic_shapes_test.cc"],
+    deps = [
+        ":symbolic_shapes",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+cc_library(
+    name = "debug_stripper",
+    srcs = ["debug_stripper.cc"],
+    hdrs = [
+        "debug_stripper.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:op_types",
+        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/clusters:cluster",
+        "//tensorflow/core/grappler/optimizers:graph_optimizer",
+    ],
+)
+
+tf_cuda_cc_test(
+    name = "debug_stripper_test",
+    size = "small",
+    srcs = ["debug_stripper_test.cc"],
+    deps = [
+        ":debug_stripper",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:tensorflow",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler/utils:grappler_test",
     ],
 )
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index c0fcfaf428fc7c2d1fe84bfda84ce1095c45ec4c..59a5695af06c315961897321600e7b9346f56ee2 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/arithmetic_optimizer.h"
 
 #include <algorithm>
+#include <deque>
 #include <limits>
 #include <unordered_map>
 #include <unordered_set>
@@ -30,10 +31,13 @@ limitations under the License.
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
+#include "tensorflow/core/grappler/optimizers/graph_optimizer_stage.h"
+#include "tensorflow/core/grappler/optimizers/symbolic_shapes.h"
 #include "tensorflow/core/grappler/utils.h"
-#include "tensorflow/core/grappler/utils/frame.h"
+#include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/tensor_coding.h"
 #include "tensorflow/core/util/device_name_utils.h"
@@ -195,50 +199,23 @@ void SetSourceDataType(DataType dtype, NodeDef* node) {
 
 bool IsNumberType(DataType dtype) { return kNumberTypes.Contains(dtype); }
 
-const char kOutputShapesAttr[] = "_output_shapes";
-
-PartialTensorShape GetInputShape(const string& input, const NodeMap& node_map) {
-  int output_pos;
-  string node_name = ParseNodeName(input, &output_pos);
-  const NodeDef* input_node = node_map.GetNode(node_name);
-  auto attr = input_node->attr();
-  if (attr.find(kOutputShapesAttr) == attr.end()) {
-    return PartialTensorShape();  // unknown shape
-  } else {
-    return attr.at(kOutputShapesAttr).list().shape(output_pos);
-  }
-}
-
-bool ShapesEqual(const string& input_x, const string& input_y,
-                 const NodeMap& node_map) {
-  PartialTensorShape x_shape = GetInputShape(input_x, node_map);
-  PartialTensorShape y_shape = GetInputShape(input_y, node_map);
-  if (x_shape.unknown_rank() || y_shape.unknown_rank() ||
-      x_shape.dims() != y_shape.dims()) {
-    return false;
-  }
-  for (int i = 0; i < x_shape.dims(); ++i) {
-    if (x_shape.dim_size(i) == -1 || y_shape.dim_size(i) == -1 ||
-        x_shape.dim_size(i) != y_shape.dim_size(i)) {
-      return false;
-    }
-  }
-  return true;
-}
-
 // Returns whether `reshape` is an identity op. The tensor that `reshape`
 // reshapes is the `output_pos`-th output of node `input`.
 bool ReshapeIsIdentity(const NodeDef& reshape, const NodeDef& input,
-                       const int output_pos) {
-  if (!reshape.attr().count(kOutputShapesAttr) ||
-      !input.attr().count(kOutputShapesAttr)) {
+                       const int output_pos,
+                       const GraphProperties& graph_properties) {
+  const std::vector<OpInfo::TensorProperties>& reshape_props =
+      graph_properties.GetOutputProperties(reshape.name());
+  const std::vector<OpInfo::TensorProperties>& input_props =
+      graph_properties.GetOutputProperties(input.name());
+  if (reshape_props.empty() || input_props.empty() ||
+      input_props.size() <= output_pos) {
     return false;
   }
 
-  PartialTensorShape src_shape(
-      input.attr().at(kOutputShapesAttr).list().shape(output_pos));
-  PartialTensorShape dst_shape(
-      reshape.attr().at(kOutputShapesAttr).list().shape(0));
+  const PartialTensorShape& src_shape = input_props[output_pos].shape();
+  const PartialTensorShape& dst_shape = reshape_props[0].shape();
+
   if (src_shape.unknown_rank() || dst_shape.unknown_rank()) {
     return false;
   }
@@ -251,7 +228,8 @@ bool ReshapeIsIdentity(const NodeDef& reshape, const NodeDef& input,
   // sizes.
   auto num_unknown_dim_sizes = [](const PartialTensorShape& partial_shape) {
     auto dim_sizes = partial_shape.dim_sizes();
-    return std::count(dim_sizes.begin(), dim_sizes.end(), -1);
+    return std::count_if(dim_sizes.begin(), dim_sizes.end(),
+                         [](int dim) { return dim < 0; });
   };
   int src_num_unknown_dim_sizes = num_unknown_dim_sizes(src_shape);
   int dst_num_unknown_dim_sizes = num_unknown_dim_sizes(dst_shape);
@@ -284,246 +262,86 @@ NodeDef* GetTailOfValuePreservingChain(
                         is_value_preserving_non_branching);
 }
 
-// Context passed to each arithmetic optimizer stage. Optimizer stage is
-// responsible for updating the node map for all added or deleted nodes, to keep
-// it consistent with optimized graph.
+// Graph optimizer context extension specific to ArithmeticOptimizer
 struct ArithmeticOptimizerContext {
-  ArithmeticOptimizerContext(
-      const std::unordered_set<string>* nodes_to_preserve,
-      GraphDef* optimized_graph, NodeMap* node_map, FrameMap* frame_map,
-      SetVector<NodeDef*>* nodes_to_simplify)
-      : nodes_to_preserve(nodes_to_preserve),
-        optimized_graph(optimized_graph),
-        node_map(node_map),
-        frame_map(frame_map),
-        nodes_to_simplify(nodes_to_simplify) {}
-
-  const std::unordered_set<string>* nodes_to_preserve;
-  GraphDef* optimized_graph;
-  NodeMap* node_map;
-  FrameMap* frame_map;
+  explicit ArithmeticOptimizerContext(SetVector<NodeDef*>* nodes_to_simplify)
+      : nodes_to_simplify(nodes_to_simplify) {}
   SetVector<NodeDef*>* nodes_to_simplify;
 };
 
 // Base class for single arithmetic optimization: e.g. Bitcast optimization,
 // AddOps optimization, etc...
-// TODO(ezhulenev): extract this class to be reused by other multi-stage
-// graph optimizers (const_folding, dependency_optimizer, etc...)
-class ArithmeticOptimizerStage {
+class ArithmeticOptimizerStage : public GraphOptimizerStage<string> {
  public:
   explicit ArithmeticOptimizerStage(const string& name,
-                                    const ArithmeticOptimizerContext& ctx)
-      : name_(name), ctx_(ctx) {}
+                                    const GraphOptimizerContext& ctx,
+                                    const ArithmeticOptimizerContext ctx_ext)
+      : GraphOptimizerStage("ArithmeticOptimizer", name, ctx),
+        ctx_ext_(ctx_ext) {}
   virtual ~ArithmeticOptimizerStage() = default;
 
-  // Check if we should try to simplify node. Returning true doesn't
-  // guarantee that node will be simplified.
-  //
-  // Should implement just a basic sanity check, without any expensive graph
-  // traversals.
-  virtual bool IsSupported(const NodeDef* node) const = 0;
-
-  // Try to simplify the given node. If successfully simplified a given node,
-  // return a name of a new simplified version using output parameter.
-  //
-  // Consumers of an old node's outputs will be automatically re-wired to
-  // consume outputs of a new simplified node.
-  //
-  // Return error status only if some precondition is failed, or got an
-  // incorrect graph. In every other case return Status:OK(), even if didn't
-  // simplify anything.
-  //
-  // A simplified node will be always considered for further optimization and
-  // will be automatically added to the optimization queue. If a simplified node
-  // has the same name as original node it has to be explicitly added to the
-  // optimization queue for second pass.
-  virtual Status TrySimplify(const NodeDef* node,
-                             string* simplified_node_name) = 0;
-
- protected:
-  struct ScopedNodeName {
-    string scope;
-    string name;
-  };
-
-  const ScopedNodeName ParseScopedNodeName(const string& name) const {
-    auto pos = name.find_last_of("/");
-    if (pos == string::npos) {
-      return {"", name};
-    } else {
-      return {name.substr(0, pos), name.substr(pos + 1)};
-    }
-  }
-
-  // Prefix optimized node name with stage name and rewrite_rule
-  const string OptimizedNodeName(const string& rewrite_rule,
-                                 const ScopedNodeName& scoped_node_name) const {
-    return MakeOptimizedNodeName(strings::StrCat(name_, "_", rewrite_rule),
-                                 scoped_node_name);
-  }
-
-  // Prefix optimized node name with stage name and rewrite_rule
-  const string OptimizedNodeName(const string& rewrite_rule,
-                                 const ScopedNodeName& scoped_node_name,
-                                 const std::vector<string>& node_names) const {
-    return MakeOptimizedNodeName(strings::StrCat(name_, "_", rewrite_rule),
-                                 scoped_node_name, node_names);
-  }
-
-  // Prefix optimized node name with stage name
-  const string OptimizedNodeName(const ScopedNodeName& scoped_node_name) const {
-    return MakeOptimizedNodeName(name_, scoped_node_name);
-  }
-
-  // Prefix optimized node name with stage name
-  const string OptimizedNodeName(const ScopedNodeName& scoped_node_name,
-                                 const std::vector<string>& node_names) const {
-    return MakeOptimizedNodeName(name_, scoped_node_name, node_names);
-  }
-
   // Simplification graph rewrite can create additional nodes that are inputs
   // to final simplified node, they can be also added to the arithmetic
   // optimizer queue for further optimization.
   void AddToOptimizationQueue(NodeDef* node) {
-    ctx_.nodes_to_simplify->PushBack(node);
-  }
-
-  // Get a node by input name from a node map. Return a error if node was not
-  // found.
-  Status GetInputNode(const string& input, NodeDef** node) const {
-    string node_name = NodeName(input);
-    NodeDef* node_by_name = ctx_.node_map->GetNode(node_name);
-    if (node_by_name == nullptr) {
-      return errors::FailedPrecondition("Node ", node_name,
-                                        " doesn't exists in a node map");
-    }
-    *node = node_by_name;
-    return Status::OK();
-  }
-
-  // Get input shape from a node map. If node doesn't exists return unknown
-  // shape.
-  PartialTensorShape GetInputShape(const string& input) const {
-    int position;
-    string node_name = ParseNodeName(input, &position);
-    NodeDef* node;
-    Status node_status = GetInputNode(node_name, &node);
-    if (!node_status.ok()) {
-      return PartialTensorShape();  // unknown shape
-    }
-    auto attr = node->attr();
-    if (attr.find(kOutputShapesAttr) == attr.end()) {
-      return PartialTensorShape();  // unknown shape
-    } else {
-      return attr.at(kOutputShapesAttr).list().shape(position);
-    }
-  }
-
-  NodeDef* AddCopyNode(const string& name, const NodeDef* node_to_copy) {
-    CHECK(node_to_copy != nullptr);
-    CHECK(!ctx_.node_map->NodeExists(name))
-        << "Node " << name << " already exists in a graph";
-    NodeDef* new_node = ctx_.optimized_graph->add_node();
-    *new_node = *node_to_copy;
-    new_node->set_name(name);
-    ctx_.node_map->AddNode(name, new_node);
-    return new_node;
-  }
-
-  NodeDef* AddEmptyNode(const string& name) {
-    CHECK(!ctx_.node_map->NodeExists(name))
-        << "Node " << name << " already exists in a graph";
-    NodeDef* new_node = ctx_.optimized_graph->add_node();
-    new_node->set_name(name);
-    ctx_.node_map->AddNode(name, new_node);
-    return new_node;
+    ctx_ext_.nodes_to_simplify->PushBack(node);
   }
 
   // TODO(ezhulenev): remove this method from ArithmeticOptimizer when all
   // optimizations will be migrated to stages
-  void AddFrameControlDeps(const NodeDef* old_node,
-                           const std::vector<NodeDef*>& new_nodes,
-                           const string& source_for_ctrl_dep,
-                           const std::vector<NodeDef*>& sinks_for_control_dep) {
-    const auto frame_it = ctx_.frame_map->find(old_node);
-    if (frame_it != ctx_.frame_map->end()) {
-      for (auto node : new_nodes) {
-        ctx_.frame_map->emplace(node, frame_it->second);
-      }
-      if (!source_for_ctrl_dep.empty() && !sinks_for_control_dep.empty()) {
-        const string ctrl_dep = ConstantFolding::AddControlDependency(
-            source_for_ctrl_dep, ctx_.optimized_graph, ctx_.node_map);
-        for (auto node : sinks_for_control_dep) {
-          MaybeAddControlInput(ctrl_dep, node, ctx_.optimized_graph,
-                               ctx_.node_map);
+  void ForwardControlDependencies(
+      NodeDef* target_node, const std::vector<const NodeDef*>& src_nodes) {
+    for (const auto& src : src_nodes) {
+      for (int i = src->input_size() - 1; i >= 0; --i) {
+        if (IsControlInput(src->input(i))) {
+          *target_node->add_input() = src->input(i);
+          ctx_.node_map->AddOutput(NodeName(src->input(i)),
+                                   target_node->name());
+        } else {
+          break;
         }
       }
     }
   }
 
-  const string name_;
-  const ArithmeticOptimizerContext ctx_;
-
  private:
-  // Get a name for a new node obtained by optimizing a single node of the
-  // original graph. The optimized node is placed under the original node scope.
-  //
-  // Node name uniqueness is guaranteed by unique name of an original node in
-  // a same scope.
-  //
-  // Example: MakeOptimizedNodeName("AwesomeRewrite", "a/b/c/Add_1")
-  // Optimized name: "a/b/c/ArithmeticOptimizer/AwesomeRewrite_Add_1"
-  const string MakeOptimizedNodeName(
-      const string& prefix, const ScopedNodeName& scoped_node_name) const {
-    string node_name;
-    strings::StrAppend(&node_name, scoped_node_name.scope);
-    if (!node_name.empty()) strings::StrAppend(&node_name, "/");
-    strings::StrAppend(&node_name, kArithmeticOptimizer, "/", prefix, "_",
-                       scoped_node_name.name);
-    return node_name;
-  }
-
-  // Get a name for a new node obtained by optimizing multiple nodes of the
-  // original graph, starting from "root". The optimized node is placed under
-  // the original scope of a "root" node.
-  //
-  // Node name uniqueness is guaranteed by unique name of a "root" node in
-  // a same scope.
-  //
-  // Example:
-  //   MakeOptimizedNodeName("AwesomeRewrite", "a/b/Add_AB", ["x/y/Add_XY"])
-  // Optimized name:
-  //   "a/b/ArithmeticOptimizer/AwesomeRewrite_Add_AB_Add_XY"
-  const string MakeOptimizedNodeName(
-      const string& prefix, const ScopedNodeName& scoped_node_name,
-      const std::vector<string>& node_names) const {
-    string node_name = MakeOptimizedNodeName(prefix, scoped_node_name);
-    for (const string& optimized : node_names) {
-      auto scoped_node = ParseScopedNodeName(optimized);
-      strings::StrAppend(&node_name, "_", scoped_node.name);
-    }
-    return node_name;
-  }
+  // extened context required for ArithmeticOptimizer
+  const ArithmeticOptimizerContext ctx_ext_;
 };
 
 // Rewrite a tree of Add/AddN with a single AddN operation, consuming all the
 // original inputs of absorbed nodes.
 //
-// All nodes in a Add/AddN subgraph must have fully specified and identical
-// shape. All nodes must have the same device placement.
+// 1) All nodes must have the same device placement.
+//
+// 2) If All nodes in a Add/AddN subgraph have symbolically equal shape, tree is
+//    optimized to a single AddN node.
 //
-// Example:
 //                AddN_1
 //             /    |    \
-//          Add_1   z   Add_2       -> AddN(z, y, z, w, q, e)
+//          Add_1   z   Add_2       -> AddN(x, y, z, w, q, e)
 //          /  \        /  \
 //         x    y      w    Add_3
 //                          / \
 //                         q   e
+//
+// 3) If some nodes have different shape (it needs to be broadcastable to the
+//    shape of a "root), tree is optimized to AddNs for symbolically equal
+//    shapes, and a tree of Add ops, that minimize broadcasts.
+//
+//                AddN_1                                 Add
+//             /    |    \                              /  \
+//          Add_1   z   Add_2       ->               Add    w
+//          /  \        /  \                        /   \
+//         x    y      w    Add_3      AddN(x, y, q, e)  z
+//                          / \
+//                         q   e
 class AddOpsRewriteStage : public ArithmeticOptimizerStage {
  public:
-  explicit AddOpsRewriteStage(const ArithmeticOptimizerContext& ctx)
-      : ArithmeticOptimizerStage("AddOpsRewrite", ctx), rewritten_nodes_() {}
+  explicit AddOpsRewriteStage(const GraphOptimizerContext& ctx,
+                              const ArithmeticOptimizerContext& ctx_ext)
+      : ArithmeticOptimizerStage("AddOpsRewrite", ctx, ctx_ext),
+        rewritten_nodes_() {}
 
   ~AddOpsRewriteStage() override = default;
 
@@ -533,25 +351,20 @@ class AddOpsRewriteStage : public ArithmeticOptimizerStage {
     if (!IsRewritable(node)) {
       return false;
     }
-    // and must have fully defined shape
-    // TODO(ezhulenev): support partially defined shapes, when we can prove that
-    // unknown dimensions in the rewritten subgraph are the same.
-    PartialTensorShape shape = GetInputShape(node->name());
-    if (!shape.IsFullyDefined()) {
-      return false;
-    }
-    // and must have inputs of fully defined shape identical to the output
-    // TODO(ezhulenev): relax this condition to support equal unknown dimensions
-    return HasAllInputsOfIdenticalShape(*node, shape);
+
+    // shape must be symbolically defined and all inputs compatible with it
+    OpInfo::TensorProperties properties;
+    Status has_properties = GetTensorProperties(node->name(), &properties);
+    return has_properties.ok() && ShapeIsSymbolicallyDefined(properties) &&
+           HasAllInputsOfBroadcastableShape(*node, properties);
   }
 
-  Status TrySimplify(const NodeDef* node,
-                     string* simplified_node_name) override {
+  Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
     CHECK(IsSupported(node));
     AddOpsGroup group;
     TF_RETURN_IF_ERROR(CreateAddOpsGroup(node, &group));
 
-    if (!group.absorbed_nodes.empty() && !IsRewritten(group)) {
+    if (!group.absorbed_nodes.empty()) {
       *simplified_node_name = RewriteAddOpsGroup(group);
     }
 
@@ -559,6 +372,14 @@ class AddOpsRewriteStage : public ArithmeticOptimizerStage {
   }
 
  private:
+  // Input name with a statically inferred shape from GraphProperties
+  struct InputAndShape {
+    InputAndShape(const string& input, const TensorShapeProto& shape)
+        : input(input), shape(shape) {}
+    string input;
+    TensorShapeProto shape;
+  };
+
   // Holds together an add ops subgraph that we want to rewrite together.
   //
   // For the graph above the AddOpsGroup will be:
@@ -567,23 +388,26 @@ class AddOpsRewriteStage : public ArithmeticOptimizerStage {
   //   input_nodes: [x, y, z, w, q, e]
   struct AddOpsGroup {
     const NodeDef* root_node;
-    PartialTensorShape root_shape;
+    TensorShapeProto root_shape;
     // Add/AddN operations below the root level that were absorbed by this group
     std::vector<NodeDef*> absorbed_nodes;
-    // Inputs of absorbed nodes that will be forwarded to rewritten AddN node
-    std::vector<string> inputs;
+    // Inputs of absorbed nodes that will be forwarded to optimized AddN ops
+    std::vector<InputAndShape> inputs;
   };
 
-  // Check if all inputs are fully defined and identical to expected shape
-  bool HasAllInputsOfIdenticalShape(const NodeDef& node,
-                                    const PartialTensorShape& shape) const {
+  // Check if all inputs can be broadcasted to the same shape
+  bool HasAllInputsOfBroadcastableShape(
+      const NodeDef& node, const OpInfo::TensorProperties& properties) const {
     const AddOpsRewriteStage* self = this;
-    return std::all_of(node.input().begin(), node.input().end(),
-                       [self, &shape](const string& input) {
-                         auto input_shape = self->GetInputShape(input);
-                         return input_shape.IsFullyDefined() &&
-                                input_shape.IsIdenticalTo(shape);
-                       });
+    return std::all_of(
+        node.input().begin(), node.input().end(),
+        [self, &properties](const string& input) {
+          OpInfo::TensorProperties input_properties;
+          Status has_input_properties =
+              self->GetTensorProperties(input, &input_properties);
+          return has_input_properties.ok() &&
+                 ShapesBroadcastable(properties, input_properties);
+        });
   }
 
   // TODO(ezhulenev): use GraphRewriter?
@@ -614,27 +438,25 @@ class AddOpsRewriteStage : public ArithmeticOptimizerStage {
     if (!node_status.ok()) {
       return false;
     }
-
-    PartialTensorShape shape = GetInputShape(name);
-    CHECK(shape.IsIdenticalTo(group.root_shape))
-        << "Cannot absorb a node of incompatible shape";
-
     // check basic preconditions
     if (!IsRewritable(node)) {
       return false;
     }
-    // with a single output consumer (presumably if we reach this node from
+    // with a single output data consumer (presumably if we reach this node from
     // previously absorbed or a root node, it means that this node is not used
     // as an input to any other op, outside of the group)
-    if (ctx_.node_map->GetOutputs(node->name()).size() != 1) {
+    if (NumNonControlDataOutputs(*node, *ctx_.node_map) != 1) {
       return false;
     }
     // must be on the same device as a root node
     if (node->device() != group.root_node->device()) {
       return false;
     }
-    // All input shapes must be fully defined and equal to the node shape
-    return HasAllInputsOfIdenticalShape(*node, shape);
+    // All input shapes must be broadcastable to the node shape
+    OpInfo::TensorProperties properties;
+    Status has_properties = GetTensorProperties(name, &properties);
+    return has_properties.ok() &&
+           HasAllInputsOfBroadcastableShape(*node, properties);
   }
 
   // Node requirements both for a root node and an absorbed node
@@ -653,26 +475,31 @@ class AddOpsRewriteStage : public ArithmeticOptimizerStage {
     if (rewritten_nodes_.find(node->name()) != rewritten_nodes_.end()) {
       return false;
     }
+    // it must not be created by this stage at any of previous optimization runs
+    if (StringPiece(node->name()).contains(stage_name_)) {
+      return false;
+    }
     // should not drive or be driven by control dependency
     // TODO(ezhulenev): relax this condition for root node
     return !(IsDrivenByControlDependency(*node) ||
              DrivesControlDependency(*node));
   }
 
-  // Check that optimized group node name doesn't exists. It might happen if
-  // graph optimized multiple times without pruning beween invocations.
-  bool IsRewritten(const AddOpsGroup& group) const {
-    return ctx_.node_map->NodeExists(AddOpsGroupName(group));
-  }
-
   // Create an AddOpsGroup with a root in a given node
   Status CreateAddOpsGroup(const NodeDef* root_node, AddOpsGroup* group) {
+    OpInfo::TensorProperties root_node_output_properties;
+    TF_RETURN_IF_ERROR(
+        GetTensorProperties(root_node->name(), &root_node_output_properties));
+
     group->root_node = root_node;
-    group->root_shape = GetInputShape(root_node->name());
+    group->root_shape = root_node_output_properties.shape();
 
     group->absorbed_nodes.reserve(root_node->input_size());
     for (int i = 0; i < root_node->input_size(); ++i) {
-      TF_RETURN_IF_ERROR(AbsorbInputByAddOpsGroup(root_node->input(i), group));
+      const string& input_i = root_node->input(i);
+      if (!IsControlInput(input_i)) {
+        TF_RETURN_IF_ERROR(AbsorbInputByAddOpsGroup(input_i, group));
+      }
     }
 
     return Status::OK();
@@ -685,68 +512,159 @@ class AddOpsRewriteStage : public ArithmeticOptimizerStage {
     if (IsAbsorbableByAddOpsGroup(input, *group)) {
       group->absorbed_nodes.push_back(node);
       for (int i = 0; i < node->input_size(); ++i) {
-        TF_RETURN_IF_ERROR(AbsorbInputByAddOpsGroup(node->input(i), group));
+        const string& input_i = node->input(i);
+        if (!IsControlInput(input)) {
+          TF_RETURN_IF_ERROR(AbsorbInputByAddOpsGroup(input_i, group));
+        }
       }
     } else {
       // If node can't be absorbed, add it to AddOpsGroup input
-      group->inputs.push_back(input);
+      OpInfo::TensorProperties properties;
+      TF_RETURN_IF_ERROR(GetTensorProperties(input, &properties));
+      group->inputs.emplace_back(input, properties.shape());
     }
     return Status::OK();
   }
 
-  // New node for AddOpsGroup is added to the same scope as a root_node. All
-  // absorbed nodes are stripped of their scope, and only names are used in a
-  // new node name.
-  //
-  // Example: AddOpsGroup(root="a/b/c/Add_2", absorbed=["d/Add_1", "e/Add"])
-  //          node_name="a/b/c/AddOpsGroup_Add_2_Add_1_Add
-  string AddOpsGroupName(const AddOpsGroup& group) const {
-    CHECK_NOTNULL(group.root_node);
-
-    auto root = ParseScopedNodeName(group.root_node->name());
+  // Rewrite an add ops group into a single AddN if all input shapes are
+  // symbolically equal. If not, create AddN for equal shapes first, and then
+  // build an Add tree, minimizing the cost of broadcasts.
+  string RewriteAddOpsGroup(const AddOpsGroup& group) {
+    // all new nodes will be placed under the scope of a root node
+    auto root_scope_and_name = ParseNodeScopeAndName(group.root_node->name());
+
+    auto shape_sig = [](const TensorShapeProto& shape) {
+      string name = strings::StrCat("r:", shape.dim_size(), ":d");
+      for (int i = 0; i < shape.dim_size(); ++i)
+        strings::StrAppend(&name, ":", shape.dim(i).size());
+      return name;
+    };
+
+    // Find what shapes are present in the inputs of absorbed nodes
+    std::unordered_map<string, std::vector<InputAndShape>> shape_sig_to_inputs;
+    for (const auto& input : group.inputs) {
+      shape_sig_to_inputs[shape_sig(input.shape)].push_back(input);
+    }
 
-    std::vector<string> absorbed_node_names(group.absorbed_nodes.size());
-    std::transform(group.absorbed_nodes.begin(), group.absorbed_nodes.end(),
-                   absorbed_node_names.begin(),
-                   [](const NodeDef* node) { return node->name(); });
+    // Collect all the shapes from representative elements
+    std::vector<TensorShapeProto> shapes;
+    shapes.reserve(shape_sig_to_inputs.size());
+    for (const auto& el : shape_sig_to_inputs)
+      shapes.push_back(el.second[0].shape);
+
+    // If all inputs have the same shape, rewrite whole group with a single AddN
+    if (shapes.size() == 1) {
+      string node_name = OptimizedNodeName(root_scope_and_name);
+      AddInputsOfSymbolicallyEqualShape(*group.root_node, node_name,
+                                        group.inputs);
+      // keep track of nodes that were created or absorbed as a part of rewrite
+      rewritten_nodes_.insert(node_name);
+      return node_name;
+    }
 
-    return OptimizedNodeName(root, absorbed_node_names);
-  }
+    // For inputs of different shapes:
+    // 1. Rewrite inputs of the same shape using AddN (leaf nodes)
+    // 2. Build a tree of Add nodes, minimizing cost of broadcast
+    std::sort(shapes.begin(), shapes.end(),
+              [](const TensorShapeProto& left, const TensorShapeProto& right) {
+                return CompareSymbolicallyShapedTensorSizes(left, right);
+              });
+
+    // optimized name for leaf AddN nodes
+    auto leaf_node_name = [&root_scope_and_name, this](int i) {
+      return OptimizedNodeName(root_scope_and_name,
+                               strings::StrCat("Leaf_", i));
+    };
+    // optimized name for internal nodes of a tree built up from AddN leaves
+    auto internal_node_name = [&root_scope_and_name, this](int i) {
+      return OptimizedNodeName(root_scope_and_name,
+                               strings::StrCat("Internal_", i));
+    };
+
+    // Add/AddN nodes that must be added to the tree
+    std::deque<InputAndShape> add_ops;
+
+    // Prepare leaf AddN nodes for inputs of equal shape
+    for (int i = 0; i < shapes.size(); ++i) {
+      const auto node_name = leaf_node_name(i);
+      const auto& inputs = shape_sig_to_inputs[shape_sig(shapes[i])];
+      add_ops.push_back(AddInputsOfSymbolicallyEqualShape(*group.root_node,
+                                                          node_name, inputs));
+    }
 
-  // Create a new node for a AddOpsGroup and return it's name.
-  string RewriteAddOpsGroup(const AddOpsGroup& group) {
-    CHECK_GT(group.absorbed_nodes.size(), 0)
-        << "AddOpsGroup must have non empty absorbed nodes";
+    // Build up a tree of Add ops
+    int internal_nodes = 0;
+    do {
+      const InputAndShape lhs = add_ops.front();
+      add_ops.pop_front();
+      const InputAndShape rhs = add_ops.front();
+      add_ops.pop_front();
+      string name = add_ops.empty() ? OptimizedNodeName(root_scope_and_name)
+                                    : internal_node_name(internal_nodes++);
+      InputAndShape add = AddAggregatedInputs(*group.root_node, name, lhs, rhs);
+      add_ops.push_front(add);
+    } while (add_ops.size() > 1);
+
+    InputAndShape optimized_root_node = add_ops.front();
+    return optimized_root_node.input;
+  }
+
+  // Add 'AddN' node to aggregate inputs of symbolically equal shape
+  InputAndShape AddInputsOfSymbolicallyEqualShape(
+      const NodeDef& root_node, const string& node_name,
+      const std::vector<InputAndShape>& inputs) {
+    CHECK(!inputs.empty()) << "Inputs must be non-empty";
+
+    // Do not create redundant AddN nodes
+    if (inputs.size() == 1) {
+      return inputs[0];
+    }
 
-    // name for a new node constructed from AddOpsGroup
-    string node_name = AddOpsGroupName(group);
+    // get shape from representative element
+    auto shape = inputs[0].shape;
 
     // copy attributes from a root node
-    DataType dtype = group.root_node->attr().at("T").type();
+    DataType dtype = root_node.attr().at("T").type();
 
     // add new AddN node
-    NodeDef* added_node = AddEmptyNode(node_name);
-    added_node->set_op("AddN");
-    added_node->set_device(group.root_node->device());
-    (*added_node->mutable_attr())["T"].set_type(dtype);
-    (*added_node->mutable_attr())["N"].set_i(group.inputs.size());
-
-    // all inputs of absorbed nodes are added to the new node
-    for (const string& input : group.inputs) {
-      ctx_.node_map->AddOutput(input, node_name);
-      added_node->add_input(input);
+    NodeDef* node = AddEmptyNode(node_name);
+    node->set_op("AddN");
+    node->set_device(root_node.device());
+    (*node->mutable_attr())["T"].set_type(dtype);
+    (*node->mutable_attr())["N"].set_i(inputs.size());
+
+    for (const auto& inputAndShape : inputs) {
+      ctx_.node_map->AddOutput(inputAndShape.input, node_name);
+      node->add_input(inputAndShape.input);
     }
 
-    VLOG(1) << "Absorbed " << group.absorbed_nodes.size()
-            << " Add/AddN nodes from the graph";
-
-    // keep track of nodes that were created or absorbed as a part of rewrite
     rewritten_nodes_.insert(node_name);
-    for (const NodeDef* absorbed : group.absorbed_nodes) {
-      rewritten_nodes_.insert(absorbed->name());
-    }
+    return InputAndShape(node_name, shape);
+  }
+
+  // Add a single 'Add' node to sum two inputs
+  InputAndShape AddAggregatedInputs(const NodeDef& root_node,
+                                    const string& node_name,
+                                    const InputAndShape& left,
+                                    const InputAndShape& right) {
+    // copy attributes from a root node
+    DataType dtype = root_node.attr().at("T").type();
+
+    // add new Add node
+    NodeDef* node = AddEmptyNode(node_name);
+    node->set_op("Add");
+    node->set_device(root_node.device());
+    (*node->mutable_attr())["T"].set_type(dtype);
+
+    ctx_.node_map->AddOutput(left.input, node_name);
+    ctx_.node_map->AddOutput(right.input, node_name);
+
+    node->add_input(left.input);
+    node->add_input(right.input);
 
-    return node_name;
+    rewritten_nodes_.insert(node_name);
+    return InputAndShape(
+        node_name, TensorShapeProto());  // shape is not important at this point
   }
 
   // keep nodes that were added or absorbed as a part of AddOpsGroup rewrite
@@ -765,8 +683,9 @@ class AddOpsRewriteStage : public ArithmeticOptimizerStage {
 class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
  public:
   explicit HoistCommonFactorOutOfAggregation(
-      const ArithmeticOptimizerContext& ctx)
-      : ArithmeticOptimizerStage("HoistCommonFactor", ctx) {}
+      const GraphOptimizerContext& ctx,
+      const ArithmeticOptimizerContext& ctx_ext)
+      : ArithmeticOptimizerStage("HoistCommonFactor", ctx, ctx_ext) {}
   ~HoistCommonFactorOutOfAggregation() override = default;
 
   bool IsSupported(const NodeDef* node) const override {
@@ -774,12 +693,12 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
            !IsRewritten(node);
   }
 
-  Status TrySimplify(const NodeDef* node,
-                     string* simplified_node_name) override {
+  Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
     CHECK(IsSupported(node));
 
     std::set<string> common_factors;
-    TF_RETURN_IF_ERROR(GetCommonFactors(node, &common_factors));
+    std::vector<string> ctrl_deps;
+    TF_RETURN_IF_ERROR(GetCommonFactors(node, &common_factors, &ctrl_deps));
 
     if (common_factors.size() == 1) {
       const string& common_factor = *common_factors.begin();
@@ -811,9 +730,11 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
           new_add_node->set_input(i, unique_factors[i]);
         }
 
-        // Add frame dependencies that the original node might have had.
-        AddFrameControlDeps(node, {new_add_node, new_mul_node}, common_factor,
-                            {new_add_node});
+        // Add control deps on add node
+        for (const string& ctrl_dep : ctrl_deps) {
+          *new_add_node->add_input() = ctrl_dep;
+          ctx_.node_map->AddOutput(NodeName(ctrl_dep), new_add_node->name());
+        }
 
         // optimize new inner aggregation node
         AddToOptimizationQueue(new_add_node);
@@ -828,25 +749,27 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
  private:
   // Get a name for new outer Mul node
   string OuterMulNodeName(const NodeDef* node) const {
-    auto scoped_node = ParseScopedNodeName(node->name());
-    return OptimizedNodeName("Mul", scoped_node);
+    auto scope_and_name = ParseNodeScopeAndName(node->name());
+    return OptimizedNodeName(scope_and_name, "Mul");
   }
 
   // Get a name new inner Add node
   string InnerAddNodeName(const NodeDef* node) const {
-    auto scoped_node = ParseScopedNodeName(node->name());
-    return OptimizedNodeName("Add", scoped_node);
+    auto scope_and_name = ParseNodeScopeAndName(node->name());
+    return OptimizedNodeName(scope_and_name, "Add");
   }
 
   // Determine the set of common factors if the input nodes are all Mul nodes.
-  Status GetCommonFactors(const NodeDef* node,
-                          std::set<string>* common_factors) const {
+  Status GetCommonFactors(const NodeDef* node, std::set<string>* common_factors,
+                          std::vector<string>* ctrl_deps) const {
     CHECK(common_factors->empty());
 
     for (int i = 0; i < node->input_size(); ++i) {
       if (i > 0 && common_factors->empty()) break;
-      if (IsControlInput(node->input(i))) break;
-
+      if (IsControlInput(node->input(i))) {
+        ctrl_deps->push_back(node->input(i));
+        continue;
+      }
       NodeDef* input;
       TF_RETURN_IF_ERROR(GetInputNode(node->input(i), &input));
 
@@ -866,6 +789,9 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
             std::inserter(intersection, intersection.begin()));
         std::swap(*common_factors, intersection);
       }
+      for (int i = 2; i < input->input_size(); ++i) {
+        ctrl_deps->push_back(input->input(i));
+      }
     }
     return Status::OK();
   }
@@ -891,8 +817,11 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
           mul_node->input(0) == common_factor ? 1 : 0;
       unique_factors->push_back(mul_node->input(unique_factor_index));
       if (i > 0 && !IsAdd(*node)) {
-        *shapes_match = ShapesEqual(unique_factors->front(),
-                                    unique_factors->back(), *ctx_.node_map);
+        OpInfo::TensorProperties lhs;
+        OpInfo::TensorProperties rhs;
+        TF_RETURN_IF_ERROR(GetTensorProperties(unique_factors->front(), &lhs));
+        TF_RETURN_IF_ERROR(GetTensorProperties(unique_factors->back(), &rhs));
+        *shapes_match = ShapesSymbolicallyEqual(lhs, rhs);
       }
     }
     return Status::OK();
@@ -910,55 +839,67 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
 };
 
 // Removes inverse transpose nodes
-class RemoveInverseTranspose : public ArithmeticOptimizerStage {
+class RemoveIdentityTranspose : public ArithmeticOptimizerStage {
  public:
-  explicit RemoveInverseTranspose(const ArithmeticOptimizerContext& ctx)
-      : ArithmeticOptimizerStage("RemoveInverseTranspose", ctx) {}
-  ~RemoveInverseTranspose() override = default;
+  explicit RemoveIdentityTranspose(const GraphOptimizerContext& ctx,
+                                   const ArithmeticOptimizerContext& ctx_ext)
+      : ArithmeticOptimizerStage("RemoveIdentityTranspose", ctx, ctx_ext) {}
+  ~RemoveIdentityTranspose() override = default;
 
   bool IsSupported(const NodeDef* node) const override {
     return IsTranspose(*node) || IsConjugateTranspose(*node);
   }
 
-  Status TrySimplify(const NodeDef* node,
-                     string* simplified_node_name) override {
+  // TODO(rmlarsen): Forward control dependencies on the bypassed
+  // transpose nodes.
+  Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
     CHECK(IsSupported(node));
 
     NodeDef* input;
     TF_RETURN_IF_ERROR(GetInputNode(node->input(0), &input));
+    NodeDef* node_perm;
+    TF_RETURN_IF_ERROR(GetInputNode(node->input(1), &node_perm));
+    std::vector<int64> node_perm_values;
+    TF_RETURN_IF_ERROR(GetPermutation(*node_perm, &node_perm_values));
 
     if (input->op() == node->op()) {
-      NodeDef* node_perm;
+      // Remove pairs of transposes that cancel each other.
       NodeDef* input_perm;
-
-      TF_RETURN_IF_ERROR(GetInputNode(node->input(1), &node_perm));
       TF_RETURN_IF_ERROR(GetInputNode(input->input(1), &input_perm));
-
-      // Try 32-bit indices.
-      std::vector<int> node_perm_values;
-      std::vector<int> input_perm_values;
-      if (ValuesFromConstNode(*node_perm, &node_perm_values) &&
-          ValuesFromConstNode(*input_perm, &input_perm_values) &&
-          AreInversePermutations(node_perm_values, input_perm_values)) {
+      std::vector<int64> input_perm_values;
+      TF_RETURN_IF_ERROR(GetPermutation(*input_perm, &input_perm_values));
+      if (AreInversePermutations(node_perm_values, input_perm_values)) {
         *simplified_node_name = input->input(0);
       }
-      // Try 64-bit indices.
-      std::vector<int64> node_perm_values64;
-      std::vector<int64> input_perm_values64;
-      if (ValuesFromConstNode(*node_perm, &node_perm_values64) &&
-          ValuesFromConstNode(*input_perm, &input_perm_values64) &&
-          AreInversePermutations(node_perm_values64, input_perm_values64)) {
-        *simplified_node_name = input->input(0);
+    } else {
+      // Remove simple identity transposes.
+      if (IsIdentityPermutation(node_perm_values)) {
+        *simplified_node_name = node->input(0);
       }
     }
-
     return Status::OK();
   }
 
  private:
-  template <typename T>
-  bool AreInversePermutations(const std::vector<T>& a,
-                              const std::vector<T>& b) {
+  Status GetPermutation(const NodeDef& node_perm,
+                        std::vector<int64>* perm64) const {
+    std::vector<int> perm32;
+    if (ValuesFromConstNode(node_perm, &perm32)) {
+      perm64->reserve(perm32.size());
+      for (int val : perm32) {
+        perm64->push_back(static_cast<int64>(val));
+      }
+      return Status::OK();
+    }
+    if (ValuesFromConstNode(node_perm, perm64)) {
+      return Status::OK();
+    }
+    return errors::InvalidArgument("Couldn't extract permutation from ",
+                                   node_perm.name());
+  }
+
+  bool AreInversePermutations(const std::vector<int64>& a,
+                              const std::vector<int64>& b) {
     if (a.size() != b.size()) {
       return false;
     }
@@ -969,6 +910,15 @@ class RemoveInverseTranspose : public ArithmeticOptimizerStage {
     }
     return true;
   }
+
+  bool IsIdentityPermutation(const std::vector<int64>& perm) {
+    for (int64 i = 0; i < perm.size(); ++i) {
+      if (i != perm[i]) {
+        return false;
+      }
+    }
+    return true;
+  }
 };
 
 // Remove redundant Bitcasts.
@@ -976,16 +926,17 @@ class RemoveInverseTranspose : public ArithmeticOptimizerStage {
 // 2) Rewrite Bitcast(Bitcast(x, type1), type2) => Bitcast(x, type2)
 class RemoveRedundantBitcastStage : public ArithmeticOptimizerStage {
  public:
-  explicit RemoveRedundantBitcastStage(const ArithmeticOptimizerContext& ctx)
-      : ArithmeticOptimizerStage("RemoveRedundantBitcast", ctx) {}
+  explicit RemoveRedundantBitcastStage(
+      const GraphOptimizerContext& ctx,
+      const ArithmeticOptimizerContext& ctx_ext)
+      : ArithmeticOptimizerStage("RemoveRedundantBitcast", ctx, ctx_ext) {}
   ~RemoveRedundantBitcastStage() override = default;
 
   bool IsSupported(const NodeDef* node) const override {
     return IsBitcast(*node);
   }
 
-  Status TrySimplify(const NodeDef* node,
-                     string* simplified_node_name) override {
+  Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
     CHECK(IsSupported(node));
 
     // Bypass Bitcast whose source type and destination type are equal.
@@ -1016,14 +967,14 @@ class RemoveRedundantBitcastStage : public ArithmeticOptimizerStage {
 // Remove Casts whose source type and destination type are equal.
 class RemoveRedundantCastStage : public ArithmeticOptimizerStage {
  public:
-  explicit RemoveRedundantCastStage(const ArithmeticOptimizerContext& ctx)
-      : ArithmeticOptimizerStage("RemoveRedundantCast", ctx) {}
+  explicit RemoveRedundantCastStage(const GraphOptimizerContext& ctx,
+                                    const ArithmeticOptimizerContext& ctx_ext)
+      : ArithmeticOptimizerStage("RemoveRedundantCast", ctx, ctx_ext) {}
   ~RemoveRedundantCastStage() override = default;
 
   bool IsSupported(const NodeDef* node) const override { return IsCast(*node); }
 
-  Status TrySimplify(const NodeDef* node,
-                     string* simplified_node_name) override {
+  Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
     CHECK(IsSupported(node));
     // Bypass Cast whose source type and destination type are equal.
     if (GetSourceDataType(*node) == GetDestinationDataType(*node)) {
@@ -1033,6 +984,58 @@ class RemoveRedundantCastStage : public ArithmeticOptimizerStage {
   }
 };
 
+class RemoveNegationStage : public ArithmeticOptimizerStage {
+ public:
+  explicit RemoveNegationStage(const GraphOptimizerContext& ctx,
+                               const ArithmeticOptimizerContext& ctx_ext)
+      : ArithmeticOptimizerStage("RemoveNegation", ctx, ctx_ext) {}
+  ~RemoveNegationStage() override = default;
+
+  bool IsSupported(const NodeDef* node) const override {
+    return IsAdd(*node) || IsSub(*node);
+  }
+
+  Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
+    const string node_name = node->name();
+    NodeDef* x;
+    NodeDef* y;
+    TF_RETURN_IF_ERROR(GetInputNode(node->input(0), &x));
+    TF_RETURN_IF_ERROR(GetInputNode(node->input(1), &y));
+    bool updated = false;
+    if (IsAdd(*node)) {
+      if (IsNeg(*x)) {
+        // (-a) + b = b - a
+        node->set_op("Sub");
+        node->mutable_input()->SwapElements(0, 1);
+        node->set_input(1, x->input(0));
+        node->add_input(AsControlDependency(x->name()));
+        ctx_.node_map->AddOutput(NodeName(x->input(0)), node_name);
+        updated = true;
+      } else if (IsNeg(*y)) {
+        // a + (-b) = a - b
+        node->set_op("Sub");
+        node->set_input(1, y->input(0));
+        node->add_input(AsControlDependency(y->name()));
+        ctx_.node_map->AddOutput(NodeName(y->input(0)), node_name);
+        updated = true;
+      }
+    } else if (IsSub(*node)) {
+      if (IsNeg(*y)) {
+        // a - (-b) = a + b
+        node->set_op("Add");
+        node->set_input(1, y->input(0));
+        node->add_input(AsControlDependency(y->name()));
+        ctx_.node_map->AddOutput(NodeName(y->input(0)), node_name);
+        updated = true;
+      }
+    }
+    if (updated) {
+      AddToOptimizationQueue(node);
+    }
+    return Status::OK();
+  }
+};
+
 }  // namespace
 
 class UniqueNodes {
@@ -1120,6 +1123,9 @@ bool UniqueNodes::SameNode(const NodeDef& node1, const NodeDef& node2) const {
   }
 
   // Compare attributes.
+  if (node1.attr().size() != node2.attr().size()) {
+    return false;
+  }
   for (const auto& attr1 : node1.attr()) {
     auto it = node2.attr().find(attr1.first);
     if (it == node2.attr().end()) {
@@ -1165,6 +1171,25 @@ bool ArithmeticOptimizer::OptimizedNodeExists(const NodeDef& node,
   return node_map_->NodeExists(OptimizedNodeName(node, suffix));
 }
 
+namespace {
+
+bool FeedsInPlaceOp(const SimpleGraphView& graph_view, const NodeDef& node) {
+  const std::unordered_set<string> op_types_to_traverse = {
+      node.op(),    "Identity", "IdentityN", "Reshape",
+      "ExpandDims", "Enter",    "Switch",    "Merge"};
+  int node_idx = graph_view.index(node.name());
+  std::set<int> node_fanout;
+  graph_view.DepthFirstSearch(op_types_to_traverse, node_idx, &node_fanout);
+  for (int fanout : node_fanout) {
+    if (ModifiesInputsInPlace(graph_view.graph()->node(fanout))) {
+      return true;
+    }
+  }
+  return false;
+}
+
+}  // namespace
+
 bool ArithmeticOptimizer::CanDedup(const NodeDef& node) const {
   if (nodes_to_preserve_.find(node.name()) != nodes_to_preserve_.end()) {
     return false;
@@ -1184,6 +1209,11 @@ bool ArithmeticOptimizer::CanDedup(const NodeDef& node) const {
 
 void ArithmeticOptimizer::DedupComputations() {
   bool stop = true;
+  SimpleGraphView graph_view;
+  if (!graph_view.Initialize(*optimized_graph_).ok()) {
+    LOG(WARNING) << "Failed to build SimpleGraphView.";
+    return;
+  }
   std::set<int> duplicates;
   do {
     stop = true;
@@ -1200,19 +1230,28 @@ void ArithmeticOptimizer::DedupComputations() {
       if (rep == node) {
         continue;
       }
+      // If either node feeds an inplace op, deduping them may cause data races.
+      // For example: If we dedup nodes initializing two independent inplace
+      // accumulations, they will write to the same buffer, clobbering each
+      // other's results.
+      if (FeedsInPlaceOp(graph_view, *rep) ||
+          FeedsInPlaceOp(graph_view, *node)) {
+        continue;
+      }
       const std::set<NodeDef*>& fanouts = node_map_->GetOutputs(node->name());
       for (NodeDef* fanout : fanouts) {
-        for (string& name : *fanout->mutable_input()) {
+        for (int i = 0; i < fanout->input_size(); ++i) {
+          string* name = fanout->mutable_input(i);
           int position;
-          const string nodename = ParseNodeName(name, &position);
+          const string nodename = ParseNodeName(*name, &position);
           if (nodename == node->name()) {
             // Update name in-place.
             if (position > 0) {
-              name = StrCat(rep->name(), ":", position);
+              *name = StrCat(rep->name(), ":", position);
             } else if (position == 0) {
-              name = rep->name();
+              *name = rep->name();
             } else {
-              name = StrCat("^", rep->name());
+              *name = StrCat("^", rep->name());
             }
             node_map_->AddOutput(rep->name(), fanout->name());
           }
@@ -1238,20 +1277,15 @@ void ArithmeticOptimizer::DedupComputations() {
   }
 }
 
-void ArithmeticOptimizer::AddFrameControlDeps(
-    const NodeDef* old_node, const std::vector<NodeDef*>& new_nodes,
-    const string& source_for_ctrl_dep,
-    const std::vector<NodeDef*>& sinks_for_control_dep) {
-  const auto frame_it = frame_map_.find(old_node);
-  if (frame_it != frame_map_.end()) {
-    for (auto node : new_nodes) {
-      frame_map_.emplace(node, frame_it->second);
-    }
-    if (!source_for_ctrl_dep.empty() && !sinks_for_control_dep.empty()) {
-      const string ctrl_dep = ConstantFolding::AddControlDependency(
-          source_for_ctrl_dep, optimized_graph_, node_map_.get());
-      for (auto node : sinks_for_control_dep) {
-        MaybeAddControlInput(ctrl_dep, node, optimized_graph_, node_map_.get());
+void ArithmeticOptimizer::ForwardControlDependencies(
+    NodeDef* target_node, const std::vector<const NodeDef*>& src_nodes) {
+  for (const auto& src : src_nodes) {
+    for (int i = src->input_size() - 1; i >= 0; --i) {
+      if (IsControlInput(src->input(i))) {
+        *target_node->add_input() = src->input(i);
+        node_map_->AddOutput(NodeName(src->input(i)), target_node->name());
+      } else {
+        break;
       }
     }
   }
@@ -1307,19 +1341,18 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
     int output_pos = 0;
     string input_node_name = ParseNodeName(node->input(0), &output_pos);
     const NodeDef* input = node_map_->GetNode(input_node_name);
-    if (input->op() == "Reshape") {
+    if (input->op() == "Reshape" && !HasControlInputs(*input)) {
       reshape->set_input(0, input->input(0));
       node_map_->UpdateInput(reshape->name(), input->name(), input->input(0));
       nodes_to_simplify->PushBack(reshape);
       return reshape->name();
     }
 
-    // If the reshape is a no-op, forward its input to its consumers. This is
-    // considered aggressive, because users may state that the placeholder
-    // outputs tensors of shape [M, N] while feeding it with tensors of shape
-    // [M*N] (or worse). The reshape nodes are then necessary to update the
-    // tensor metadata to the required shape.
-    if (ReshapeIsIdentity(*reshape, *input, output_pos)) {
+    // If the reshape is a no-op, forward its input to its consumers, unless it
+    // anchors a control dependency since we want to make sure that control
+    // dependency is triggered.
+    if (ReshapeIsIdentity(*reshape, *input, output_pos, *graph_properties_) &&
+        !HasControlInputs(*reshape)) {
       return reshape->input(0);
     }
   }
@@ -1347,8 +1380,8 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
     // with image.type than with dst_type.
     if (DeviceNameUtils::SplitDeviceName(transpose->device(), &dontcare,
                                          &device) &&
-        (StringPiece(device).contains(DEVICE_CPU) ||
-         StringPiece(device).contains(DEVICE_GPU))) {
+        (str_util::StrContains(device, DEVICE_CPU) ||
+         str_util::StrContains(device, DEVICE_GPU))) {
       const NodeDef* cast = node_map_->GetNode(transpose->input(0));
       if (cast->op() == "Cast") {
         const NodeDef* input = node_map_->GetNode(cast->input(0));
@@ -1372,10 +1405,7 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
           node_map_->AddOutput(new_transpose->name(), new_cast->name());
 
           nodes_to_simplify->PushBack(new_transpose);
-          //  Add frame dependencies that the original node might have had.
-          AddFrameControlDeps(node, {new_transpose, new_cast},
-                              new_transpose->input(0), {new_transpose});
-
+          ForwardControlDependencies(new_transpose, {cast, node});
           return new_cast->name();
         }
       }
@@ -1449,7 +1479,7 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
             node_map_->AddOutput(weights->name(), scaled_weights->name());
             scaled_weights->add_input(mul->input(1));
             node_map_->AddOutput(scale->name(), scaled_weights->name());
-            AddFrameControlDeps(node, {scaled_weights}, "", {});
+            ForwardControlDependencies(scaled_weights, {source});
 
             // Update `conv`'s weights to `scaled_weights`.
             conv->set_input(1, scaled_weights->name());
@@ -1485,7 +1515,7 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
   }
 
   if (IsAggregate(*node) && NumNonControlInputs(*node) > 0) {
-    // Discard aggregate nodes with a single input.
+    // Discard aggregate nodes with a single input and no control dependencies.
     if (node->input_size() == 1) {
       return node->input(0);
     }
@@ -1531,6 +1561,8 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
         return "";
       }
       new_const_node->set_device(node->device());
+      MaybeAddControlInput(NodeName(node->input(0)), new_const_node,
+                           optimized_graph_, node_map_.get());
       nodes_to_simplify->PushBack(new_const_node);
 
       // 2. Replace the aggregate node with Mul(Const(N), x).
@@ -1543,9 +1575,7 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
       new_mul_node->add_input(node->input(0));
       node_map_->AddOutput(node->input(0), new_mul_node->name());
 
-      CopyControlInputs(*node, new_mul_node, optimized_graph_, node_map_.get());
-      AddFrameControlDeps(node, {new_const_node, new_mul_node}, node->input(0),
-                          {new_const_node});
+      ForwardControlDependencies(new_mul_node, {node});
       return new_mul_node->name();
     }
   }
@@ -1578,7 +1608,6 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
         FlipBooleanAttr(attr_a, new_op);
         new_op->set_input(0, a->input(0));
         node_map_->UpdateInput(new_op->name(), a->name(), a->input(0));
-        AddFrameControlDeps(node, {new_op}, a->input(0), {new_op});
       }
       if (b_is_foldable) {
         const string attr_b =
@@ -1586,10 +1615,15 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
         FlipBooleanAttr(attr_b, new_op);
         new_op->set_input(1, b->input(0));
         node_map_->UpdateInput(new_op->name(), b->name(), b->input(0));
-        if (!a_is_foldable) {
-          AddFrameControlDeps(node, {new_op}, b->input(0), {new_op});
-        }
       }
+      std::vector<const NodeDef*> deps_to_forward({node});
+      if (a_is_foldable) {
+        deps_to_forward.push_back(a);
+      }
+      if (b_is_foldable) {
+        deps_to_forward.push_back(b);
+      }
+      ForwardControlDependencies(new_op, deps_to_forward);
     }
   }
 
@@ -1611,7 +1645,7 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
                                                        : "Transpose");
       new_op->set_input(0, input->input(0));
       node_map_->UpdateInput(new_op->name(), node->name(), input->input(0));
-      AddFrameControlDeps(node, {new_op}, "", {});
+      ForwardControlDependencies(new_op, {node, input});
       return new_op->name();
     }
   }
@@ -1626,60 +1660,47 @@ Status ArithmeticOptimizer::SimplifyArithmeticOps() {
     nodes_to_simplify.PushBack(optimized_graph_->mutable_node(i));
   }
 
-  const ArithmeticOptimizerContext ctx(&nodes_to_preserve_, optimized_graph_,
-                                       node_map_.get(), &frame_map_,
-                                       &nodes_to_simplify);
-
-  std::vector<std::unique_ptr<ArithmeticOptimizerStage>> stages;
-
-  if (options_.combine_add_to_addn) {
-    stages.push_back(
-        std::unique_ptr<ArithmeticOptimizerStage>(new AddOpsRewriteStage(ctx)));
-  }
-  if (options_.hoist_common_factor_out_of_aggregation) {
-    stages.push_back(std::unique_ptr<ArithmeticOptimizerStage>(
-        new HoistCommonFactorOutOfAggregation(ctx)));
-  }
-  if (options_.remove_inverse_transpose) {
-    stages.push_back(std::unique_ptr<ArithmeticOptimizerStage>(
-        new RemoveInverseTranspose(ctx)));
-  }
-  if (options_.remove_redundant_bitcast) {
-    stages.push_back(std::unique_ptr<ArithmeticOptimizerStage>(
-        new RemoveRedundantBitcastStage(ctx)));
-  }
-  if (options_.remove_redundant_cast) {
-    stages.push_back(std::unique_ptr<ArithmeticOptimizerStage>(
-        new RemoveRedundantCastStage(ctx)));
-  }
-
-  VLOG(1) << "Simplify arithmetic ops using " << stages.size()
+  const GraphOptimizerContext ctx(&nodes_to_preserve_, optimized_graph_,
+                                  graph_properties_.get(), node_map_.get());
+  const ArithmeticOptimizerContext ctx_ext(&nodes_to_simplify);
+
+  // Stop pipeline after first stage returning non-empty simplified tensor name.
+  const auto stop = [](const string& result) { return !result.empty(); };
+  GraphOptimizerStagePipeline<string> pipeline(stop);
+
+  if (options_.combine_add_to_addn)
+    pipeline.AddStage<AddOpsRewriteStage>(ctx, ctx_ext);
+  if (options_.hoist_common_factor_out_of_aggregation)
+    pipeline.AddStage<HoistCommonFactorOutOfAggregation>(ctx, ctx_ext);
+  if (options_.remove_identity_transpose)
+    pipeline.AddStage<RemoveIdentityTranspose>(ctx, ctx_ext);
+  if (options_.remove_redundant_bitcast)
+    pipeline.AddStage<RemoveRedundantBitcastStage>(ctx, ctx_ext);
+  if (options_.remove_redundant_cast)
+    pipeline.AddStage<RemoveRedundantCastStage>(ctx, ctx_ext);
+  if (options_.remove_negation)
+    pipeline.AddStage<RemoveNegationStage>(ctx, ctx_ext);
+
+  VLOG(1) << "Simplify arithmetic ops using " << pipeline.NumStages()
           << " arithmetic optimization stages";
 
   while (!nodes_to_simplify.Empty()) {
-    const NodeDef* node = nodes_to_simplify.PopBack();
+    NodeDef* node = nodes_to_simplify.PopBack();
 
     // TODO(ezhulenev): move all rewrites into separate stages
-    string simplified_tensor =
-        TrySimplifyAndReplaceUses(node, &nodes_to_simplify);
+    string simplified_tensor = "";
+    if (options_.enable_try_simplify_and_replace) {
+      simplified_tensor = TrySimplifyAndReplaceUses(node, &nodes_to_simplify);
+    }
 
     // if it was not simplified try to run it through all configured stages
-    if (simplified_tensor.empty()) {
-      for (auto& stage : stages) {
-        if (stage->IsSupported(node)) {
-          TF_RETURN_IF_ERROR(stage->TrySimplify(node, &simplified_tensor));
-          if (!simplified_tensor.empty()) {
-            break;
-          }
-        }
+    if (!stop(simplified_tensor)) {
+      bool optimized = pipeline.PassThroughAllStages(node, &simplified_tensor);
+      if (!optimized) {
+        continue;
       }
     }
 
-    // if it's still empty go to the next Node
-    if (simplified_tensor.empty()) {
-      continue;
-    }
-
     // re-wire consumers of an old node to the new one
     if (NodeName(simplified_tensor) != node->name()) {
       // Always consider simplified_tensor for further optimizations.
@@ -1722,31 +1743,28 @@ Status ArithmeticOptimizer::SimplifyArithmeticOps() {
 Status ArithmeticOptimizer::Optimize(Cluster* /*cluster*/,
                                      const GrapplerItem& item,
                                      GraphDef* optimized_graph) {
-  optimized_graph_ = optimized_graph;
-  *optimized_graph_ = item.graph;
+  GrapplerItem optimized_item(item);
+  optimized_graph_ = &optimized_item.graph;
 
   // Set up helper data structures.
   nodes_to_preserve_ = item.NodesToPreserve();
   fetch_nodes_known_ = !item.fetch.empty();
   node_map_.reset(new NodeMap(optimized_graph_));
-  int num_frames;
-  TF_RETURN_IF_ERROR(IdentifyFramesWithNodeMap(*optimized_graph_, *node_map_,
-                                               &frame_map_, &num_frames));
+
+  DedupComputations();
+
+  // Perform topological sort on the graph in order to help AddOpsRewrite to
+  // optimize larger subgraphs starting from the roots with more inputs.
+  TF_RETURN_IF_ERROR(TopologicalSort(optimized_graph_));
+
   // Shapes are only needed in aggressive mode.
   graph_properties_.reset(new GraphProperties(item));
   TF_RETURN_IF_ERROR(graph_properties_->InferStatically(false));
-  // TODO(ezhulenev): Use GraphProperties to lookup tensor shapes directly
-  TF_RETURN_IF_ERROR(graph_properties_->AnnotateOutputShapes(optimized_graph_));
 
   // Perform the optimizations.
-  DedupComputations();
   TF_RETURN_IF_ERROR(SimplifyArithmeticOps());
 
-  // Clear output shapes.
-  for (int i = 0; i < optimized_graph->node_size(); ++i) {
-    optimized_graph_->mutable_node(i)->mutable_attr()->erase(kOutputShapesAttr);
-  }
-
+  optimized_graph->Swap(optimized_graph_);
   return Status::OK();
 }
 
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
index d5a7af5ba6888c9d65d7086efdb7c1c300e66869..7e81ed0a1f8547a352bd433bab99ab01fdb73b48 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
@@ -20,7 +20,6 @@ limitations under the License.
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 #include "tensorflow/core/grappler/utils.h"
-#include "tensorflow/core/grappler/utils/frame.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
 
 namespace tensorflow {
@@ -55,17 +54,27 @@ class ArithmeticOptimizer : public GraphOptimizer {
 
   // Granular control for arithmetic optimizer stages
   struct ArithmeticOptimizerOptions {
-    bool combine_add_to_addn = true;
+    // TODO(ezhulenev): flag do disable TrySimplifyAndReplaceUses in tests.
+    // Remove when all optimizers will be migrated to separate stages.
+    bool enable_try_simplify_and_replace = true;
+    bool combine_add_to_addn = false;
     bool hoist_common_factor_out_of_aggregation = true;
-    bool remove_inverse_transpose = true;
+    bool remove_identity_transpose = true;
     bool remove_redundant_bitcast = true;
     bool remove_redundant_cast = true;
+    bool remove_negation = true;
 
     // Choose which arithmetic optimizer stages will be enabled for a given
     // optimization level by default.
     static ArithmeticOptimizerOptions Default(
         RewriterConfig::Toggle opt_level) {
-      return ArithmeticOptimizerOptions();
+      ArithmeticOptimizerOptions options;
+      // TODO(ezhulenev): enable combine_add_to_addn by default after 1.8
+      // release cut
+      if (opt_level == RewriterConfig::AGGRESSIVE) {
+        options.combine_add_to_addn = true;
+      }
+      return options;
     }
   };
 
@@ -90,13 +99,9 @@ class ArithmeticOptimizer : public GraphOptimizer {
   // Dedup redundant nodes in the graph.
   void DedupComputations();
 
-  // Fix frame dependencies by adding control dependencies from old_input to
-  // nodes in new_nodes_for_control_dep, and update frame_map for all nodes in
-  // new_nodes.
-  void AddFrameControlDeps(const NodeDef* old_node,
-                           const std::vector<NodeDef*>& new_nodes,
-                           const string& source_for_ctrl_dep,
-                           const std::vector<NodeDef*>& sinks_for_control_dep);
+  // Forward the control dependencies anchored on src_nodes to the target_nodes.
+  void ForwardControlDependencies(NodeDef* target_node,
+                                  const std::vector<const NodeDef*>& src_nodes);
 
   // Runs peep-hole optimizations on `optimized_graph`, e.g., removing inverse
   // transposes.
@@ -125,7 +130,6 @@ class ArithmeticOptimizer : public GraphOptimizer {
   bool fetch_nodes_known_ = false;
   std::unordered_set<string> nodes_to_preserve_;
   std::unique_ptr<NodeMap> node_map_;
-  FrameMap frame_map_;
   std::unique_ptr<GraphProperties> graph_properties_;
   GraphDef* optimized_graph_ = nullptr;  // Not owned.
 };
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
index e1f47625c12a12241cf2ef9e984a29bb7a8f1d00..e117341ba362ade4c23262477dbe2d95a4d78f6f 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/arithmetic_optimizer.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
@@ -89,9 +90,10 @@ class ArithmeticOptimizerTest : public GrapplerTest {
   // should explicitly enable required optimization for tests isolation
   void DisableAllStages(ArithmeticOptimizer* optimizer) {
     ArithmeticOptimizer::ArithmeticOptimizerOptions options;
+    options.enable_try_simplify_and_replace = false;
     options.combine_add_to_addn = false;
     options.hoist_common_factor_out_of_aggregation = false;
-    options.remove_inverse_transpose = false;
+    options.remove_identity_transpose = false;
     options.remove_redundant_bitcast = false;
     options.remove_redundant_cast = false;
     optimizer->options_ = options;
@@ -111,9 +113,9 @@ class ArithmeticOptimizerTest : public GrapplerTest {
     optimizer->options_.hoist_common_factor_out_of_aggregation = true;
   }
 
-  void EnableOnlyRemoveInverseTranspose(ArithmeticOptimizer* optimizer) {
+  void EnableOnlyRemoveIdentityTranspose(ArithmeticOptimizer* optimizer) {
     DisableAllStages(optimizer);
-    optimizer->options_.remove_inverse_transpose = true;
+    optimizer->options_.remove_identity_transpose = true;
   }
 
   void EnableOnlyRemoveRedundantBitcast(ArithmeticOptimizer* optimizer) {
@@ -125,6 +127,11 @@ class ArithmeticOptimizerTest : public GrapplerTest {
     DisableAllStages(optimizer);
     optimizer->options_.remove_redundant_cast = true;
   }
+
+  void EnableOnlyRemoveNegation(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.remove_negation = true;
+  }
 };
 
 TEST_F(ArithmeticOptimizerTest, NoOp) {
@@ -149,23 +156,26 @@ TEST_F(ArithmeticOptimizerTest, OpDedupping) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   item.fetch = {"div"};
 
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+  EXPECT_EQ(1, tensors_expected.size());
+
   ArithmeticOptimizer optimizer;
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
-  // Run the optimizer twice to make sure the rewrite is idempotent.
-  item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
-
+  OptimizeTwice(&optimizer, &item, &output);
+  NodeMap node_map(&output);
   EXPECT_EQ(2, output.node_size());
-  const NodeDef& new_c1 = output.node(0);
-  EXPECT_EQ("c1", new_c1.name());
-  const NodeDef& new_div = output.node(1);
-  EXPECT_EQ("div", new_div.name());
-  EXPECT_EQ(2, new_div.input_size());
-  EXPECT_EQ("c1", new_div.input(0));
-  EXPECT_EQ("c1", new_div.input(1));
+  const NodeDef* new_c1 = node_map.GetNode("c1");
+  ASSERT_NE(new_c1, nullptr);
+
+  const NodeDef* new_div = node_map.GetNode("div");
+  ASSERT_NE(new_div, nullptr);
+  EXPECT_EQ(2, new_div->input_size());
+  EXPECT_EQ("c1", new_div->input(0));
+  EXPECT_EQ("c1", new_div->input(1));
+
+  auto tensors = EvaluateNodes(output, item.fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<double>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, OpDeduppingAssertAndCheckNumerics) {
@@ -182,23 +192,30 @@ TEST_F(ArithmeticOptimizerTest, OpDeduppingAssertAndCheckNumerics) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   item.fetch = {"div"};
+  Tensor bool_t(DT_BOOL, TensorShape({}));
+  bool_t.scalar<bool>().setConstant(true);
+  auto tensors_expected =
+      EvaluateNodes(item.graph, item.fetch, {{"Placeholder", bool_t}});
+  EXPECT_EQ(1, tensors_expected.size());
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
-  // Run the optimizer twice to make sure the rewrite is idempotent.
-  item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+
+  OptimizeTwice(&optimizer, &item, &output);
+  NodeMap node_map(&output);
 
   EXPECT_EQ(5, output.node_size());
-  const NodeDef& new_div = output.node(3);
-  EXPECT_EQ(4, new_div.input_size());
-  EXPECT_EQ("check1", new_div.input(0));
-  EXPECT_EQ("check1", new_div.input(1));
-  EXPECT_EQ("^assert1", new_div.input(2));
-  EXPECT_EQ("^assert1", new_div.input(3));
+  const NodeDef* new_div = node_map.GetNode("div");
+  ASSERT_NE(new_div, nullptr);
+  EXPECT_EQ(4, new_div->input_size());
+  EXPECT_EQ("check1", new_div->input(0));
+  EXPECT_EQ("check1", new_div->input(1));
+  EXPECT_EQ("^assert1", new_div->input(2));
+  EXPECT_EQ("^assert1", new_div->input(3));
+
+  auto tensors = EvaluateNodes(output, item.fetch, {{"Placeholder", bool_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<double>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, OpDedupCommutative) {
@@ -210,32 +227,34 @@ TEST_F(ArithmeticOptimizerTest, OpDedupCommutative) {
   Output div1 = ops::Div(s.WithOpName("div1"), mul1, mul2);
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-  item.fetch = {"div"};
+  item.fetch = {"div1"};
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+  EXPECT_EQ(1, tensors_expected.size());
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
-  // Run the optimizer twice to make sure the rewrite is idempotent.
-  item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  OptimizeTwice(&optimizer, &item, &output);
+  NodeMap node_map(&output);
 
   EXPECT_EQ(4, output.node_size());
-  const NodeDef& new_c1 = output.node(0);
-  EXPECT_EQ("c1", new_c1.name());
-  const NodeDef& new_c2 = output.node(1);
-  EXPECT_EQ("c2", new_c2.name());
-  const NodeDef& new_mul1 = output.node(2);
-  EXPECT_EQ("mul1", new_mul1.name());
-  EXPECT_EQ(2, new_mul1.input_size());
-  EXPECT_EQ("c1", new_mul1.input(0));
-  EXPECT_EQ("c2", new_mul1.input(1));
-  const NodeDef& new_div1 = output.node(3);
-  EXPECT_EQ("div1", new_div1.name());
-  EXPECT_EQ(2, new_div1.input_size());
-  EXPECT_EQ("mul1", new_div1.input(0));
-  EXPECT_EQ("mul1", new_div1.input(1));
+  const NodeDef* new_c1 = node_map.GetNode("c1");
+  ASSERT_NE(new_c1, nullptr);
+  const NodeDef* new_c2 = node_map.GetNode("c2");
+  ASSERT_NE(new_c2, nullptr);
+  const NodeDef* new_mul1 = node_map.GetNode("mul1");
+  ASSERT_NE(new_mul1, nullptr);
+  EXPECT_EQ(2, new_mul1->input_size());
+  EXPECT_EQ("c1", new_mul1->input(0));
+  EXPECT_EQ("c2", new_mul1->input(1));
+  const NodeDef* new_div1 = node_map.GetNode("div1");
+  ASSERT_NE(new_div1, nullptr);
+  EXPECT_EQ(2, new_div1->input_size());
+  EXPECT_EQ("mul1", new_div1->input(0));
+  EXPECT_EQ("mul1", new_div1->input(1));
+
+  auto tensors = EvaluateNodes(output, item.fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, MulToSquare) {
@@ -246,6 +265,9 @@ TEST_F(ArithmeticOptimizerTest, MulToSquare) {
   Output id = ops::Identity(s.WithOpName("id"), mul);
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  std::vector<string> fetch = {"id"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
@@ -260,6 +282,10 @@ TEST_F(ArithmeticOptimizerTest, MulToSquare) {
   EXPECT_EQ(2, output.node(4).input_size());
   EXPECT_EQ("c", output.node(4).input(0));
   EXPECT_EQ("^d", output.node(4).input(1));
+
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsReal) {
@@ -272,6 +298,9 @@ TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsReal) {
   Output id = ops::Identity(s.WithOpName("id"), recip2);
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  std::vector<string> fetch = {"id"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
@@ -282,6 +311,10 @@ TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsReal) {
   EXPECT_EQ("c", output.node(1).input(0));
   EXPECT_EQ("c", output.node(3).input(0));
   EXPECT_EQ("c", output.node(5).input(0));
+
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsWithChain) {
@@ -294,6 +327,9 @@ TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsWithChain) {
   Output id2 = ops::Identity(s.WithOpName("id2"), recip2);
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  std::vector<string> fetch = {"id2"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
@@ -307,6 +343,10 @@ TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsWithChain) {
   EXPECT_EQ(6, output.node_size());
   EXPECT_EQ("squeeze", output.node(5).input(0));
   EXPECT_EQ("c", output.node(2).input(0));
+
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsWithControlChain) {
@@ -321,6 +361,10 @@ TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsWithControlChain) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
+  std::vector<string> fetch = {"id2"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
+
   ArithmeticOptimizer optimizer;
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
@@ -338,6 +382,10 @@ TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsWithControlChain) {
       EXPECT_EQ(original.input(j), optimized.input(j));
     }
   }
+
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, TrivialSumsSimple) {
@@ -349,28 +397,35 @@ TEST_F(ArithmeticOptimizerTest, TrivialSumsSimple) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
+  std::vector<string> fetch = {"id"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
+
   ArithmeticOptimizer optimizer;
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
-  // Run the optimizer twice to make sure the rewrite is idempotent.
-  item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  OptimizeTwice(&optimizer, &item, &output);
+  NodeMap node_map(&output);
 
   EXPECT_EQ(5, output.node_size());
-  const NodeDef& new_const = output.node(3);
-  EXPECT_EQ(OptimizedName("add_const"), new_const.name());
-  EXPECT_EQ("^x", new_const.input(0));
+
+  const NodeDef* new_const = node_map.GetNode(OptimizedName("add_const"));
+  ASSERT_NE(new_const, nullptr);
+  EXPECT_EQ("^x", new_const->input(0));
   EXPECT_EQ(std::string("\0\0\0@", 4),
-            new_const.attr().at("value").tensor().tensor_content());
-  const NodeDef& new_mul = output.node(4);
-  EXPECT_EQ(OptimizedName("add_mul"), new_mul.name());
-  EXPECT_EQ(OptimizedName("add_const"), new_mul.input(0));
-  EXPECT_EQ("x", new_mul.input(1));
-  const NodeDef& new_id = output.node(2);
-  EXPECT_EQ("id", new_id.name());
-  EXPECT_EQ(OptimizedName("add_mul"), new_id.input(0));
+            new_const->attr().at("value").tensor().tensor_content());
+
+  const NodeDef* new_mul = node_map.GetNode(OptimizedName("add_mul"));
+  ASSERT_NE(new_mul, nullptr);
+  EXPECT_EQ(OptimizedName("add_const"), new_mul->input(0));
+  EXPECT_EQ("x", new_mul->input(1));
+
+  const NodeDef* new_id = node_map.GetNode("id");
+  ASSERT_NE(new_id, nullptr);
+  EXPECT_EQ(OptimizedName("add_mul"), new_id->input(0));
+
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, TrivialSumsSimpleWithControlDep) {
@@ -383,29 +438,36 @@ TEST_F(ArithmeticOptimizerTest, TrivialSumsSimpleWithControlDep) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
+  std::vector<string> fetch = {"id"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
+
   ArithmeticOptimizer optimizer;
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
-  // Run the optimizer twice to make sure the rewrite is idempotent.
-  item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  OptimizeTwice(&optimizer, &item, &output);
+  NodeMap node_map(&output);
 
   EXPECT_EQ(6, output.node_size());
-  const NodeDef& new_const = output.node(4);
-  EXPECT_EQ(OptimizedName("add_const"), new_const.name());
-  EXPECT_EQ("^x", new_const.input(0));
+
+  const NodeDef* new_const = node_map.GetNode(OptimizedName("add_const"));
+  ASSERT_NE(new_const, nullptr);
+  EXPECT_EQ("^x", new_const->input(0));
   EXPECT_EQ(std::string("\0\0\0@", 4),
-            new_const.attr().at("value").tensor().tensor_content());
-  const NodeDef& new_mul = output.node(5);
-  EXPECT_EQ(OptimizedName("add_mul"), new_mul.name());
-  EXPECT_EQ(OptimizedName("add_const"), new_mul.input(0));
-  EXPECT_EQ("x", new_mul.input(1));
-  EXPECT_EQ("^y", new_mul.input(2));
-  const NodeDef& new_id = output.node(3);
-  EXPECT_EQ("id", new_id.name());
-  EXPECT_EQ(OptimizedName("add_mul"), new_id.input(0));
+            new_const->attr().at("value").tensor().tensor_content());
+
+  const NodeDef* new_mul = node_map.GetNode(OptimizedName("add_mul"));
+  ASSERT_NE(new_mul, nullptr);
+  EXPECT_EQ(OptimizedName("add_const"), new_mul->input(0));
+  EXPECT_EQ("x", new_mul->input(1));
+  EXPECT_EQ("^y", new_mul->input(2));
+
+  const NodeDef* new_id = node_map.GetNode("id");
+  ASSERT_NE(new_id, nullptr);
+  EXPECT_EQ(OptimizedName("add_mul"), new_id->input(0));
+
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, TrivialSumsRepeatedAdd) {
@@ -421,6 +483,7 @@ TEST_F(ArithmeticOptimizerTest, TrivialSumsRepeatedAdd) {
 
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
   const std::vector<string> devices{
       "/device:CPU:0", "/device:GPU:0", "/device:CPU:0", "/device:GPU:1",
       "/device:CPU:0", "/device:CPU:0", "/device:CPU:0",
@@ -445,48 +508,45 @@ TEST_F(ArithmeticOptimizerTest, TrivialSumsRepeatedAdd) {
   EXPECT_EQ(17, output.node_size());
 
   const NodeDef* id_node = node_map.GetNode("id");
-  ASSERT_TRUE(id_node != nullptr);
+  ASSERT_NE(id_node, nullptr);
   EXPECT_EQ(1, id_node->input_size());
   EXPECT_EQ(HoistMulName("Add_6"), id_node->input(0));
 
   const NodeDef* mul_node = node_map.GetNode(HoistMulName("Add_6"));
-  ASSERT_TRUE(mul_node != nullptr);
+  ASSERT_NE(mul_node, nullptr);
   EXPECT_EQ(2, mul_node->input_size());
   EXPECT_EQ("Placeholder", mul_node->input(0));
   EXPECT_EQ(HoistAddName("Add_6"), mul_node->input(1));
 
   const NodeDef* add_6_node = node_map.GetNode(HoistAddName("Add_6"));
-  ASSERT_TRUE(add_6_node != nullptr);
-  EXPECT_EQ(3, add_6_node->input_size());
+  ASSERT_NE(add_6_node, nullptr);
+  EXPECT_EQ(2, add_6_node->input_size());
   EXPECT_EQ(HoistAddName("Add_4"), add_6_node->input(0));
   EXPECT_EQ(HoistAddName("Add_5"), add_6_node->input(1));
-  EXPECT_EQ("^Placeholder", add_6_node->input(2));
 
   const NodeDef* add_4_node = node_map.GetNode(HoistAddName("Add_4"));
-  ASSERT_TRUE(add_4_node != nullptr);
+  ASSERT_NE(add_4_node, nullptr);
   EXPECT_EQ("Add", add_4_node->op());
-  EXPECT_EQ(3, add_4_node->input_size());
+  EXPECT_EQ(2, add_4_node->input_size());
   EXPECT_EQ(OptimizedName("Add_const"), add_4_node->input(0));
   EXPECT_EQ(OptimizedName("Add_1_const"), add_4_node->input(1));
-  EXPECT_EQ("^Placeholder", add_4_node->input(2));
 
   const NodeDef* add_5_node = node_map.GetNode(HoistAddName("Add_5"));
-  ASSERT_TRUE(add_5_node != nullptr);
+  ASSERT_NE(add_5_node, nullptr);
   EXPECT_EQ("Add", add_5_node->op());
-  EXPECT_EQ(3, add_5_node->input_size());
+  EXPECT_EQ(2, add_5_node->input_size());
   EXPECT_EQ(OptimizedName("Add_const"), add_5_node->input(0));
   EXPECT_EQ(OptimizedName("Add_1_const"), add_5_node->input(1));
-  EXPECT_EQ("^Placeholder", add_5_node->input(2));
 
   const NodeDef* add_const_node = node_map.GetNode(OptimizedName("Add_const"));
-  ASSERT_TRUE(add_const_node != nullptr);
+  ASSERT_NE(add_const_node, nullptr);
   EXPECT_EQ("Const", add_const_node->op());
   EXPECT_EQ(1, add_const_node->input_size());
   EXPECT_EQ("^Placeholder", add_const_node->input(0));
 
   const NodeDef* add_1_const_node =
       node_map.GetNode(OptimizedName("Add_1_const"));
-  ASSERT_TRUE(add_1_const_node != nullptr);
+  ASSERT_NE(add_1_const_node, nullptr);
   EXPECT_EQ("Const", add_1_const_node->op());
   EXPECT_EQ(1, add_1_const_node->input_size());
   EXPECT_EQ("^Placeholder", add_1_const_node->input(0));
@@ -512,7 +572,8 @@ TEST_F(ArithmeticOptimizerTest, HoistFactor) {
       GrapplerItem item;
       item.fetch = {"id"};
       TF_CHECK_OK(s.ToGraphDef(&item.graph));
-
+      auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+      EXPECT_EQ(1, tensors_expected.size());
       ArithmeticOptimizer optimizer;
       EnableOnlyHoistCommonFactor(&optimizer);
 
@@ -537,55 +598,63 @@ TEST_F(ArithmeticOptimizerTest, HoistFactor) {
         EXPECT_EQ(9, output.node_size());
 
         const NodeDef* new_add_node = node_map.GetNode(HoistAddName("add"));
-        ASSERT_TRUE(new_add_node != nullptr) << "Hoisted Add node not found";
+        ASSERT_NE(new_add_node, nullptr) << "Hoisted Add node not found";
         EXPECT_EQ("y1", new_add_node->input(0));
         EXPECT_EQ("y2", new_add_node->input(1));
 
         const NodeDef* new_mul_node = node_map.GetNode(HoistMulName("add"));
-        ASSERT_TRUE(new_mul_node != nullptr) << "Hoisted Mul node not found";
+        ASSERT_NE(new_mul_node, nullptr) << "Hoisted Mul node not found";
         EXPECT_EQ("x", new_mul_node->input(0));
         EXPECT_EQ(new_add_node->name(), new_mul_node->input(1));
 
         const NodeDef* id_node = node_map.GetNode("id");
-        ASSERT_TRUE(id_node != nullptr) << "Id node not found";
+        ASSERT_NE(id_node, nullptr) << "Id node not found";
         EXPECT_EQ("id", id_node->name());
         EXPECT_EQ(HoistMulName("add"), id_node->input(0));
       }
+      auto tensors = EvaluateNodes(output, item.fetch);
+      EXPECT_EQ(1, tensors.size());
+      test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
     }
   }
 }
 
 TEST_F(ArithmeticOptimizerTest, FuseConjAndTranspose) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  Output re = ops::Const(s.WithOpName("re"), {1.0, 2.0, 3.0, 4.0}, {2, 2});
-  Output im = ops::Const(s.WithOpName("im"), {5.0, 6.0, 7.0, 8.0}, {2, 2});
+  Output re = ops::Const(s.WithOpName("re"), {1.0f, 2.0f, 3.0f, 4.0f}, {2, 2});
+  Output im = ops::Const(s.WithOpName("im"), {5.0f, 6.0f, 7.0f, 8.0f}, {2, 2});
   Output z = ops::Complex(s.WithOpName("z"), re, im);
   Output perm = ops::Const(s.WithOpName("perm"), {1, 0}, {2});
   Output conj = ops::Conj(s.WithOpName("conj"), z);
   Output transp = ops::Transpose(s.WithOpName("trans"), conj, perm);
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-
+  std::vector<string> fetch = {"trans"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
   ArithmeticOptimizer optimizer;
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
-  // Run the optimizer twice to make sure the rewrite is idempotent.
-  item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  OptimizeTwice(&optimizer, &item, &output);
+  NodeMap node_map(&output);
 
   EXPECT_EQ(7, output.node_size());
-  EXPECT_EQ(OptimizedName("trans_fused"), output.node(6).name());
-  EXPECT_EQ("ConjugateTranspose", output.node(6).op());
-  EXPECT_EQ("z", output.node(6).input(0));
-  EXPECT_EQ("perm", output.node(6).input(1));
+
+  const NodeDef* trans_fused_node =
+      node_map.GetNode(OptimizedName("trans_fused"));
+  ASSERT_NE(trans_fused_node, nullptr);
+  EXPECT_EQ("ConjugateTranspose", trans_fused_node->op());
+  EXPECT_EQ("z", trans_fused_node->input(0));
+  EXPECT_EQ("perm", trans_fused_node->input(1));
+
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<complex64>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(ArithmeticOptimizerTest, FuseConjAndConjugateTranspose) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  Output re = ops::Const(s.WithOpName("re"), {1.0, 2.0, 3.0, 4.0}, {2, 2});
-  Output im = ops::Const(s.WithOpName("im"), {5.0, 6.0, 7.0, 8.0}, {2, 2});
+  Output re = ops::Const(s.WithOpName("re"), {1.0f, 2.0f, 3.0f, 4.0f}, {2, 2});
+  Output im = ops::Const(s.WithOpName("im"), {5.0f, 6.0f, 7.0f, 8.0f}, {2, 2});
   Output z = ops::Complex(s.WithOpName("z"), re, im);
   Output perm = ops::Const(s.WithOpName("perm"), {1, 0}, {2});
   Output conj = ops::Conj(s.WithOpName("conj"), z);
@@ -593,44 +662,56 @@ TEST_F(ArithmeticOptimizerTest, FuseConjAndConjugateTranspose) {
       ops::ConjugateTranspose(s.WithOpName("conjugate_trans"), conj, perm);
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  std::vector<string> fetch = {"conjugate_trans"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  OptimizeTwice(&optimizer, &item, &output);
+  NodeMap node_map(&output);
 
   EXPECT_EQ(7, output.node_size());
-  EXPECT_EQ(OptimizedName("conjugate_trans_fused"), output.node(6).name());
-  EXPECT_EQ("Transpose", output.node(6).op());
-  EXPECT_EQ("z", output.node(6).input(0));
-  EXPECT_EQ("perm", output.node(6).input(1));
+
+  const NodeDef* conjugate_trans_fused_node =
+      node_map.GetNode(OptimizedName("conjugate_trans_fused"));
+  EXPECT_EQ("Transpose", conjugate_trans_fused_node->op());
+  EXPECT_EQ("z", conjugate_trans_fused_node->input(0));
+  EXPECT_EQ("perm", conjugate_trans_fused_node->input(1));
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<complex64>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(ArithmeticOptimizerTest, FuseTransposeAndConj) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  Output re = ops::Const(s.WithOpName("re"), {1.0, 2.0, 3.0, 4.0}, {2, 2});
-  Output im = ops::Const(s.WithOpName("im"), {5.0, 6.0, 7.0, 8.0}, {2, 2});
+  Output re = ops::Const(s.WithOpName("re"), {1.0f, 2.0f, 3.0f, 4.0f}, {2, 2});
+  Output im = ops::Const(s.WithOpName("im"), {5.0f, 6.0f, 7.0f, 8.0f}, {2, 2});
   Output z = ops::Complex(s.WithOpName("z"), re, im);
   Output perm = ops::Const(s.WithOpName("perm"), {1, 0}, {2});
   Output trans = ops::Transpose(s.WithOpName("trans"), z, perm);
   Output conj = ops::Conj(s.WithOpName("conj"), trans);
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  std::vector<string> fetch = {"conj"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
-  // Run the optimizer twice to make sure the rewrite is idempotent.
-  item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  OptimizeTwice(&optimizer, &item, &output);
+  NodeMap node_map(&output);
 
   EXPECT_EQ(7, output.node_size());
-  EXPECT_EQ(OptimizedName("conj_fused"), output.node(6).name());
-  EXPECT_EQ("ConjugateTranspose", output.node(6).op());
-  EXPECT_EQ("z", output.node(6).input(0));
-  EXPECT_EQ("perm", output.node(6).input(1));
+
+  const NodeDef* conj_fused_node =
+      node_map.GetNode(OptimizedName("conj_fused"));
+  EXPECT_EQ("ConjugateTranspose", conj_fused_node->op());
+  EXPECT_EQ("z", conj_fused_node->input(0));
+  EXPECT_EQ("perm", conj_fused_node->input(1));
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<complex64>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(ArithmeticOptimizerTest, FoldTransposeIntoMatMul) {
@@ -652,27 +733,32 @@ TEST_F(ArithmeticOptimizerTest, FoldTransposeIntoMatMul) {
     }
     GrapplerItem item;
     TF_CHECK_OK(s.ToGraphDef(&item.graph));
+    std::vector<string> fetch = {"matmul"};
+    auto tensors_expected = EvaluateNodes(item.graph, fetch);
+    EXPECT_EQ(1, tensors_expected.size());
 
     ArithmeticOptimizer optimizer;
     GraphDef output;
-    Status status = optimizer.Optimize(nullptr, item, &output);
-    TF_EXPECT_OK(status);
-    // Run the optimizer twice to make sure the rewrite is idempotent.
-    item.graph.Swap(&output);
-    status = optimizer.Optimize(nullptr, item, &output);
-    TF_EXPECT_OK(status);
+    OptimizeTwice(&optimizer, &item, &output);
+    NodeMap node_map(&output);
 
     EXPECT_EQ(7, output.node_size());
-    EXPECT_EQ(OptimizedName("matmul_fused"), output.node(6).name());
-    EXPECT_EQ("a", output.node(6).input(0));
-    EXPECT_EQ("b", output.node(6).input(1));
+
+    const NodeDef* matmul_fused_node =
+        node_map.GetNode(OptimizedName("matmul_fused"));
+    ASSERT_NE(matmul_fused_node, nullptr);
+    EXPECT_EQ("a", matmul_fused_node->input(0));
+    EXPECT_EQ("b", matmul_fused_node->input(1));
     if (matmul_type == "BatchMatMul") {
-      EXPECT_TRUE(output.node(6).attr().at("adj_x").b());
-      EXPECT_TRUE(output.node(6).attr().at("adj_y").b());
+      EXPECT_TRUE(matmul_fused_node->attr().at("adj_x").b());
+      EXPECT_TRUE(matmul_fused_node->attr().at("adj_y").b());
     } else {
-      EXPECT_TRUE(output.node(6).attr().at("transpose_a").b());
-      EXPECT_TRUE(output.node(6).attr().at("transpose_b").b());
+      EXPECT_TRUE(matmul_fused_node->attr().at("transpose_a").b());
+      EXPECT_TRUE(matmul_fused_node->attr().at("transpose_b").b());
     }
+    auto tensors = EvaluateNodes(output, fetch);
+    EXPECT_EQ(1, tensors.size());
+    test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
   }
 }
 
@@ -694,6 +780,9 @@ TEST_F(ArithmeticOptimizerTest, FoldConjugateTransposeIntoBatchMatMul) {
   Output matmul = ops::BatchMatMul(s.WithOpName("matmul"), trans_a, trans_b);
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  std::vector<string> fetch = {"matmul"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
@@ -706,6 +795,9 @@ TEST_F(ArithmeticOptimizerTest, FoldConjugateTransposeIntoBatchMatMul) {
   EXPECT_EQ("b", output.node(10).input(1));
   EXPECT_TRUE(output.node(10).attr().at("adj_x").b());
   EXPECT_TRUE(output.node(10).attr().at("adj_y").b());
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<complex64>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, IdentityReshape) {
@@ -726,7 +818,10 @@ TEST_F(ArithmeticOptimizerTest, IdentityReshape) {
   GrapplerItem item;
   item.fetch = {"outputs"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-
+  auto x_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 3, 28, 28}));
+  auto tensors_expected =
+      EvaluateNodes(item.graph, item.fetch, {{"Placeholder", x_t}});
+  EXPECT_EQ(1, tensors_expected.size());
   GraphDef output;
   TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
 
@@ -734,6 +829,9 @@ TEST_F(ArithmeticOptimizerTest, IdentityReshape) {
   TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
 
   EXPECT_EQ(0, CountOpNodes(output, "Reshape"));
+  auto tensors = EvaluateNodes(output, item.fetch, {{"Placeholder", x_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, NotIdentityReshape) {
@@ -748,7 +846,10 @@ TEST_F(ArithmeticOptimizerTest, NotIdentityReshape) {
   GrapplerItem item;
   item.fetch = {"outputs"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-
+  auto x_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({8, 3, 28, 28}));
+  item.feed = {{"Placeholder", x_t}};
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors_expected.size());
   GraphDef output;
   TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
 
@@ -756,6 +857,9 @@ TEST_F(ArithmeticOptimizerTest, NotIdentityReshape) {
   TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
 
   EXPECT_EQ(1, CountOpNodes(output, "Reshape"));
+  auto tensors = EvaluateNodes(output, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, NotIdentityReshapeTooManyUnknownDimSizes) {
@@ -768,7 +872,6 @@ TEST_F(ArithmeticOptimizerTest, NotIdentityReshapeTooManyUnknownDimSizes) {
   GrapplerItem item;
   item.fetch = {"outputs"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-
   GraphDef output;
   TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
 
@@ -799,7 +902,10 @@ TEST_F(ArithmeticOptimizerTest, CombineReshapes) {
   GrapplerItem item;
   item.fetch = {"outputs"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-
+  auto x_t = GenerateRandomTensor<DT_INT8>(TensorShape({8, 3, 28, 28, 4}));
+  item.feed = {{"nchw_vect_c", x_t}};
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors_expected.size());
   GraphDef output;
   TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
 
@@ -807,6 +913,9 @@ TEST_F(ArithmeticOptimizerTest, CombineReshapes) {
   TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
 
   EXPECT_EQ(1, CountOpNodes(output, "Reshape"));
+  auto tensors = EvaluateNodes(output, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<int8>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(ArithmeticOptimizerTest, ReorderTransposeCast) {
@@ -875,7 +984,7 @@ TEST_F(ArithmeticOptimizerTest, NoReorderTransposeCast) {
   EXPECT_EQ(1, num_transposes);
 }
 
-TEST_F(ArithmeticOptimizerTest, RemoveInverseTransposes) {
+TEST_F(ArithmeticOptimizerTest, RemoveIdentityTransposes) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output inputs_shape =
       ops::Const(s.WithOpName("inputs_shape"), {8, 3, 28, 28}, {4});
@@ -883,18 +992,21 @@ TEST_F(ArithmeticOptimizerTest, RemoveInverseTransposes) {
       ops::RandomUniform(s.WithOpName("inputs"), inputs_shape, DT_FLOAT);
   Output perm1 = ops::Const(s.WithOpName("perm1"), {0, 2, 3, 1}, {4});
   Output perm2 = ops::Const(s.WithOpName("perm2"), {0, 3, 1, 2}, {4});
+  Output perm3 = ops::Const(s.WithOpName("perm2"), {0, 1, 2, 3}, {4});
   Output transpose1 = ops::Transpose(s.WithOpName("transpose1"), inputs, perm1);
   Output transpose2 =
       ops::Transpose(s.WithOpName("transpose2"), transpose1, perm2);
-  Output outputs = ops::Identity(s.WithOpName("outputs"), transpose2);
+  Output transpose3 = ops::Transpose(s.WithOpName("transpose3"), inputs, perm3);
+  Output id1 = ops::Identity(s.WithOpName("id1"), transpose2);
+  Output id2 = ops::Identity(s.WithOpName("id2"), transpose3);
 
   GrapplerItem item;
-  item.fetch = {"outputs"};
+  item.fetch = {"id1", "id2"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
   GraphDef output;
   ArithmeticOptimizer optimizer;
-  EnableOnlyRemoveInverseTranspose(&optimizer);
+  EnableOnlyRemoveIdentityTranspose(&optimizer);
   OptimizeAndPrune(&optimizer, &item, &output);
 
   std::set<string> nodes_after_optimization;
@@ -902,10 +1014,10 @@ TEST_F(ArithmeticOptimizerTest, RemoveInverseTransposes) {
     nodes_after_optimization.insert(node.name());
   }
   EXPECT_EQ(nodes_after_optimization,
-            std::set<string>({"inputs_shape", "inputs", "outputs"}));
+            std::set<string>({"id1", "id2", "inputs_shape", "inputs"}));
 }
 
-TEST_F(ArithmeticOptimizerTest, RemoveInverseTransposesMultipleOutputs) {
+TEST_F(ArithmeticOptimizerTest, RemoveIdentityTransposesMultipleOutputs) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output inputs_shape =
       ops::Const(s.WithOpName("inputs_shape"), {8, 9, 28, 28}, {4});
@@ -926,7 +1038,7 @@ TEST_F(ArithmeticOptimizerTest, RemoveInverseTransposesMultipleOutputs) {
 
   GraphDef output;
   ArithmeticOptimizer optimizer;
-  EnableOnlyRemoveInverseTranspose(&optimizer);
+  EnableOnlyRemoveIdentityTranspose(&optimizer);
   OptimizeAndPrune(&optimizer, &item, &output);
 
   for (const NodeDef& node : output.node()) {
@@ -954,7 +1066,7 @@ TEST_F(ArithmeticOptimizerTest, RemoveTransposesWithControlDependency) {
 
   GraphDef output;
   ArithmeticOptimizer optimizer;
-  EnableOnlyRemoveInverseTranspose(&optimizer);
+  EnableOnlyRemoveIdentityTranspose(&optimizer);
   OptimizeAndPrune(&optimizer, &item, &output);
 
   NodeMap node_map(&output);
@@ -982,7 +1094,7 @@ TEST_F(ArithmeticOptimizerTest, NotRemoveTransposes) {
 
   GraphDef output;
   ArithmeticOptimizer optimizer;
-  EnableOnlyRemoveInverseTranspose(&optimizer);
+  EnableOnlyRemoveIdentityTranspose(&optimizer);
   OptimizeAndPrune(&optimizer, &item, &output);
 
   EXPECT_EQ(6, output.node_size());
@@ -1270,7 +1382,7 @@ TEST_F(ArithmeticOptimizerTest, RemoveRedundantCast) {
   EXPECT_TRUE(IsNodesDirectlyConnected(node_map, "inputs", "outputs"));
 }
 
-TEST_F(ArithmeticOptimizerTest, AddOpsRewriteCollapseAddsOfIdenticalShape) {
+TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_AddOpsOfIdenticalShape) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   tensorflow::Scope sx = s.NewSubScope("x");
   tensorflow::Scope sy = s.NewSubScope("y");
@@ -1306,8 +1418,8 @@ TEST_F(ArithmeticOptimizerTest, AddOpsRewriteCollapseAddsOfIdenticalShape) {
 
   // check add tree was replaced with AddN
   const NodeDef* collapsed_add =
-      node_map.GetNode("y/ArithmeticOptimizer/AddOpsRewrite_Add_abc_Add_ab");
-  ASSERT_TRUE(collapsed_add != nullptr);
+      node_map.GetNode("y/ArithmeticOptimizer/AddOpsRewrite_Add_abc");
+  ASSERT_NE(collapsed_add, nullptr);
 
   EXPECT_EQ("AddN", collapsed_add->op());
   EXPECT_EQ(3, collapsed_add->input_size());
@@ -1317,12 +1429,12 @@ TEST_F(ArithmeticOptimizerTest, AddOpsRewriteCollapseAddsOfIdenticalShape) {
 
   // check output was re-wired to new node
   const NodeDef* updated_outputs = node_map.GetNode("outputs");
-  ASSERT_TRUE(updated_outputs != nullptr);
+  ASSERT_NE(updated_outputs, nullptr);
 
   EXPECT_EQ(collapsed_add->name(), updated_outputs->input(0));
 }
 
-TEST_F(ArithmeticOptimizerTest, AddOpsRewriteMultiplePasses) {
+TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_MultiplePasses) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
 
   auto a = ops::Variable(s.WithOpName("a"), {2, 2}, DT_FLOAT);
@@ -1365,8 +1477,8 @@ TEST_F(ArithmeticOptimizerTest, AddOpsRewriteMultiplePasses) {
 
   // check left Add subtree replaced with AddN
   const NodeDef* collapsed_left =
-      node_map.GetNode("ArithmeticOptimizer/AddOpsRewrite_Add_abc_Add_ab");
-  ASSERT_TRUE(collapsed_left != nullptr);
+      node_map.GetNode("ArithmeticOptimizer/AddOpsRewrite_Add_abc");
+  ASSERT_NE(collapsed_left, nullptr);
 
   EXPECT_EQ("AddN", collapsed_left->op());
   EXPECT_EQ(3, collapsed_left->input_size());
@@ -1376,8 +1488,8 @@ TEST_F(ArithmeticOptimizerTest, AddOpsRewriteMultiplePasses) {
 
   // check right Add subtree replaced with AddN
   const NodeDef* collapsed_right =
-      node_map.GetNode("ArithmeticOptimizer/AddOpsRewrite_Add_xyz_Add_xy");
-  ASSERT_TRUE(collapsed_right != nullptr);
+      node_map.GetNode("ArithmeticOptimizer/AddOpsRewrite_Add_xyz");
+  ASSERT_NE(collapsed_right, nullptr);
 
   EXPECT_EQ("AddN", collapsed_right->op());
   EXPECT_EQ(3, collapsed_right->input_size());
@@ -1387,7 +1499,7 @@ TEST_F(ArithmeticOptimizerTest, AddOpsRewriteMultiplePasses) {
 
   // check that Mul inputs re-wired to new Nodes
   const NodeDef* updated_mul = node_map.GetNode("Mul");
-  ASSERT_TRUE(updated_mul != nullptr);
+  ASSERT_NE(updated_mul, nullptr);
 
   EXPECT_EQ("Mul", updated_mul->op());
   EXPECT_EQ(2, updated_mul->input_size());
@@ -1395,7 +1507,7 @@ TEST_F(ArithmeticOptimizerTest, AddOpsRewriteMultiplePasses) {
   EXPECT_EQ(collapsed_right->name(), updated_mul->input(1));
 }
 
-TEST_F(ArithmeticOptimizerTest, AddOpsRewriteAddInputThroughMultiplePaths) {
+TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_AddInputMultipleTimes) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
 
   auto a = ops::Variable(s.WithOpName("a"), {2, 2}, DT_FLOAT);
@@ -1428,9 +1540,9 @@ TEST_F(ArithmeticOptimizerTest, AddOpsRewriteAddInputThroughMultiplePaths) {
   NodeMap node_map(&output);
 
   // check Add tree replaced with AddN
-  const NodeDef* collapsed_add = node_map.GetNode(
-      "ArithmeticOptimizer/AddOpsRewrite_Add_all_Add_ab_Add_bc");
-  ASSERT_TRUE(collapsed_add != nullptr);
+  const NodeDef* collapsed_add =
+      node_map.GetNode("ArithmeticOptimizer/AddOpsRewrite_Add_all");
+  ASSERT_NE(collapsed_add, nullptr);
 
   EXPECT_EQ("AddN", collapsed_add->op());
   EXPECT_EQ(4, collapsed_add->input_size());
@@ -1440,5 +1552,294 @@ TEST_F(ArithmeticOptimizerTest, AddOpsRewriteAddInputThroughMultiplePaths) {
   EXPECT_EQ("c", collapsed_add->input(3));
 }
 
+TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_AddOpsOfSymbolicallyEqualShape) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  // unknown input shape propagated symbolically through the graph
+  auto input = ops::Variable(s.WithOpName("input"), {-1, 2}, DT_FLOAT);
+
+  // [a, b, c] have symbolically equal shapes
+  auto a = ops::Sqrt(s.WithOpName("a"), input);
+  auto b = ops::Square(s.WithOpName("b"), input);
+  auto c = ops::Round(s.WithOpName("c"), input);
+
+  // [add_ab, add_abc] shape must be inferred from inputs
+  auto add_ab = ops::Add(s.WithOpName("Add_ab"), a, b);
+  auto add_abc = ops::Add(s.WithOpName("Add_abc"), add_ab, c);
+
+  auto outputs = ops::Identity(s.WithOpName("outputs"), add_abc);
+
+  GrapplerItem item;
+  item.fetch = {"outputs"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyAddToAddNCombining(&optimizer);
+
+  OptimizeAndPrune(&optimizer, &item, &output);
+
+  // We expect the following rewrite(s) to occur:
+  //
+  //     +
+  //    / \
+  //   +   c      -->    AddN(a, b, c)
+  //  / \
+  // a   b
+  EXPECT_EQ(6, output.node_size());
+
+  NodeMap node_map(&output);
+
+  // check add tree was replaced with AddN
+  const NodeDef* collapsed_add =
+      node_map.GetNode("ArithmeticOptimizer/AddOpsRewrite_Add_abc");
+  ASSERT_NE(collapsed_add, nullptr);
+  EXPECT_EQ("AddN", collapsed_add->op());
+  EXPECT_EQ(3, collapsed_add->input_size());
+  EXPECT_EQ("a", collapsed_add->input(0));
+  EXPECT_EQ("b", collapsed_add->input(1));
+  EXPECT_EQ("c", collapsed_add->input(2));
+
+  // check output was re-wired to new node
+  const NodeDef* updated_outputs = node_map.GetNode("outputs");
+  ASSERT_NE(updated_outputs, nullptr);
+  EXPECT_EQ(collapsed_add->name(), updated_outputs->input(0));
+}
+
+TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_MinimizeBCast) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  auto a = ops::Variable(s.WithOpName("a"), {32}, DT_FLOAT);
+  auto b = ops::Variable(s.WithOpName("b"), {32, 32}, DT_FLOAT);
+  auto c = ops::Variable(s.WithOpName("c"), {32, 32, 32}, DT_FLOAT);
+  auto add_ab = ops::Add(s.WithOpName("Add_ab"), a, b);
+  auto add_abc = ops::Add(s.WithOpName("Add_abc"), add_ab, c);
+
+  auto x = ops::Variable(s.WithOpName("x"), {32}, DT_FLOAT);
+  auto y = ops::Variable(s.WithOpName("y"), {32, 32}, DT_FLOAT);
+  auto z = ops::Variable(s.WithOpName("z"), {32, 32, 32}, DT_FLOAT);
+  auto add_xy = ops::Add(s.WithOpName("Add_xy"), x, y);
+  auto add_xyz = ops::Add(s.WithOpName("Add_xyz"), add_xy, z);
+
+  auto add_all = ops::Add(s.WithOpName("AddAll"), add_abc, add_xyz);
+  auto outputs = ops::Identity(s.WithOpName("outputs"), add_all);
+
+  GrapplerItem item;
+  item.fetch = {"outputs"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyAddToAddNCombining(&optimizer);
+
+  OptimizeAndPrune(&optimizer, &item, &output);
+
+  // We expect the following rewrite(s) to occur:
+  //  1) [a, x], [b, y], [c, z] - aggregate same shapes first
+  //  2) Build an aggregation tree minimizing cost of broadcast
+  //
+  //         +                              +
+  //      /     \                       /       \
+  //     +       +                     +       AddN(c, z)
+  //    / \     / \                 /     \
+  //   +   c   x   + -->    AddN(a, x)  AddN(b, y)
+  //  / \         / \
+  // a   b       y   z
+  EXPECT_EQ(12, output.node_size());
+  NodeMap node_map(&output);
+
+  // expected names of outer and inner nodes
+  string outer_add_name = "ArithmeticOptimizer/AddOpsRewrite_AddAll";
+  string outer_0_add_name =
+      "ArithmeticOptimizer/AddOpsRewrite_Internal_0_AddAll";
+  string inner_0_add_name = "ArithmeticOptimizer/AddOpsRewrite_Leaf_0_AddAll";
+  string inner_1_add_name = "ArithmeticOptimizer/AddOpsRewrite_Leaf_1_AddAll";
+  string inner_2_add_name = "ArithmeticOptimizer/AddOpsRewrite_Leaf_2_AddAll";
+
+  // Add [a, x] first
+  const NodeDef* add_ax_node = node_map.GetNode(inner_0_add_name);
+  ASSERT_NE(add_ax_node, nullptr);
+  EXPECT_EQ("AddN", add_ax_node->op());
+  EXPECT_EQ(2, add_ax_node->input_size());
+  EXPECT_EQ("a", add_ax_node->input(0));
+  EXPECT_EQ("x", add_ax_node->input(1));
+
+  // Then add [b, y]
+  const NodeDef* add_by_node = node_map.GetNode(inner_1_add_name);
+  ASSERT_NE(add_by_node, nullptr);
+  EXPECT_EQ("AddN", add_by_node->op());
+  EXPECT_EQ(2, add_by_node->input_size());
+  EXPECT_EQ("b", add_by_node->input(0));
+  EXPECT_EQ("y", add_by_node->input(1));
+
+  // Then add [c, z]
+  const NodeDef* add_cz_node = node_map.GetNode(inner_2_add_name);
+  ASSERT_NE(add_cz_node, nullptr);
+  EXPECT_EQ("AddN", add_cz_node->op());
+  EXPECT_EQ(2, add_cz_node->input_size());
+  EXPECT_EQ("c", add_cz_node->input(0));
+  EXPECT_EQ("z", add_cz_node->input(1));
+
+  // Then add results together starting from smaller shapes [a, x] + [b, y]
+  const NodeDef* outer_0_node = node_map.GetNode(outer_0_add_name);
+  ASSERT_NE(outer_0_node, nullptr);
+  EXPECT_EQ("Add", outer_0_node->op());
+  EXPECT_EQ(2, outer_0_node->input_size());
+  EXPECT_EQ(inner_0_add_name, outer_0_node->input(0));
+  EXPECT_EQ(inner_1_add_name, outer_0_node->input(1));
+
+  // And finally top level Add node
+  const NodeDef* outer_node = node_map.GetNode(outer_add_name);
+  ASSERT_NE(outer_node, nullptr);
+  EXPECT_EQ("Add", outer_node->op());
+  EXPECT_EQ(2, outer_node->input_size());
+  EXPECT_EQ(outer_0_add_name, outer_node->input(0));
+  EXPECT_EQ(inner_2_add_name, outer_node->input(1));
+
+  // And outputs reading new top level Add node
+  const NodeDef* updated_outputs = node_map.GetNode("outputs");
+  ASSERT_NE(updated_outputs, nullptr);
+  EXPECT_EQ(outer_add_name, updated_outputs->input(0));
+}
+
+TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_MinimizeBCastWithSymbolicShapes) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  // We have a small input with one unknown dimension
+  auto small = ops::Variable(s.WithOpName("small"), {-1, 1, 1}, DT_FLOAT);
+
+  // And second input which is larger, but has the same unknown dimension
+  // device spec prevents this node from rewriting
+  auto d = "/job:do_not_rewrite_me";
+  auto v = ops::Variable(s.WithOpName("v"), {1, 32, 32}, DT_FLOAT);
+  auto large = ops::Add(s.WithOpName("large").WithDevice(d), small, v);
+
+  // [a, c] have {?, 1, 1} shape, [b] has {?, 32, 32}
+  auto a = ops::Sqrt(s.WithOpName("a"), small);
+  auto b = ops::Square(s.WithOpName("b"), large);
+  auto c = ops::Round(s.WithOpName("c"), small);
+
+  // [add_ab, add_abc] shape must be inferred from inputs
+  auto add_ab = ops::Add(s.WithOpName("Add_ab"), a, b);
+  auto add_abc = ops::Add(s.WithOpName("Add_abc"), add_ab, c);
+
+  auto outputs = ops::Identity(s.WithOpName("outputs"), add_abc);
+
+  GrapplerItem item;
+  item.fetch = {"outputs"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyAddToAddNCombining(&optimizer);
+  OptimizeAndPrune(&optimizer, &item, &output);
+
+  // We expect the following rewrite(s) to occur: it's much cheaper to add small
+  // tensors, and do the broadcast just once
+  //
+  //     +                  +
+  //    / \                / \
+  //   +   c      -->     +   b
+  //  / \                / \
+  // a   b              a   c
+  EXPECT_EQ(9, output.node_size());
+  NodeMap node_map(&output);
+
+  // expected names of outer and inner nodes
+  string outer_add_name = "ArithmeticOptimizer/AddOpsRewrite_Add_abc";
+  string inner_add_name = "ArithmeticOptimizer/AddOpsRewrite_Leaf_0_Add_abc";
+
+  // outer Add node
+  const NodeDef* outer_add = node_map.GetNode(outer_add_name);
+  ASSERT_NE(outer_add, nullptr);
+  EXPECT_EQ("Add", outer_add->op());
+  EXPECT_EQ(inner_add_name, outer_add->input(0));
+  EXPECT_EQ("b", outer_add->input(1));
+
+  // inner AddN node
+  const NodeDef* inner_add = node_map.GetNode(inner_add_name);
+  ASSERT_NE(inner_add, nullptr);
+  EXPECT_EQ(2, inner_add->input_size());
+  EXPECT_EQ("a", inner_add->input(0));
+  EXPECT_EQ("c", inner_add->input(1));
+
+  // check output was re-wired to new node
+  const NodeDef* updated_outputs = node_map.GetNode("outputs");
+  ASSERT_NE(updated_outputs, nullptr);
+  EXPECT_EQ(outer_add_name, updated_outputs->input(0));
+}
+
+TEST_F(ArithmeticOptimizerTest, RemoveNegation) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto x = ops::Variable(s.WithOpName("x"), {2, 2}, DT_FLOAT);
+  auto y = ops::Variable(s.WithOpName("y"), {2, 2}, DT_FLOAT);
+  Output neg_x = ops::Neg(s.WithOpName("Neg_x"), x);
+  Output neg_y = ops::Neg(s.WithOpName("Neg_y"), y);
+  Output add_x_y = ops::Add(s.WithOpName("Add_x_y"), x, y);
+  Output add_negx_y = ops::Add(s.WithOpName("Add_negx_y"), neg_x, y);
+  Output add_x_negy = ops::Add(s.WithOpName("Add_x_negy"), x, neg_y);
+  Output add_negx_negy = ops::Add(s.WithOpName("Add_negx_negy"), neg_x, neg_y);
+  Output sub_x_y = ops::Sub(s.WithOpName("Sub_x_y"), x, y);
+  Output sub_negx_y = ops::Sub(s.WithOpName("Sub_negx_y"), neg_x, y);
+  Output sub_x_negy = ops::Sub(s.WithOpName("Sub_x_negy"), x, neg_y);
+  Output sub_negx_negy = ops::Sub(s.WithOpName("Sub_negx_negy"), neg_x, neg_y);
+  auto add_all = ops::AddN(s.WithOpName("add_all"),
+                           {add_x_y, add_negx_y, add_x_negy, add_negx_negy,
+                            sub_x_y, sub_negx_y, sub_x_negy, sub_negx_negy});
+
+  GrapplerItem item;
+  item.fetch = {"add_all"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyRemoveNegation(&optimizer);
+  OptimizeAndPrune(&optimizer, &item, &output);
+
+  EXPECT_EQ(item.graph.node_size(), output.node_size());
+  int found = 0;
+  for (int i = 0; i < output.node_size(); ++i) {
+    const NodeDef& node = output.node(i);
+    if (node.name() == "Add_negx_y") {
+      ++found;
+      EXPECT_EQ("Sub", node.op());
+      EXPECT_EQ(3, node.input_size());
+      EXPECT_EQ("y", node.input(0));
+      EXPECT_EQ("x", node.input(1));
+      EXPECT_EQ("^Neg_x", node.input(2));
+    } else if (node.name() == "Add_x_negy") {
+      ++found;
+      EXPECT_EQ("Sub", node.op());
+      EXPECT_EQ(3, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("y", node.input(1));
+      EXPECT_EQ("^Neg_y", node.input(2));
+    } else if (node.name() == "Add_negx_negy") {
+      ++found;
+      EXPECT_EQ("Sub", node.op());
+      EXPECT_EQ(3, node.input_size());
+      EXPECT_EQ("Neg_y", node.input(0));
+      EXPECT_EQ("x", node.input(1));
+      EXPECT_EQ("^Neg_x", node.input(2));
+    } else if (node.name() == "Sub_x_negy") {
+      ++found;
+      EXPECT_EQ("Add", node.op());
+      EXPECT_EQ(3, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("y", node.input(1));
+      EXPECT_EQ("^Neg_y", node.input(2));
+    } else if (node.name() == "Sub_negx_negy") {
+      ++found;
+      EXPECT_EQ("Sub", node.op());
+      EXPECT_EQ(4, node.input_size());
+      EXPECT_EQ("y", node.input(0));
+      EXPECT_EQ("x", node.input(1));
+      EXPECT_EQ("^Neg_y", node.input(2));
+      EXPECT_EQ("^Neg_x", node.input(3));
+    }
+  }
+  EXPECT_EQ(5, found);
+}
+
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index 77ccd4ffc8c460d09e6ad8db50351928c4f681fb..d941a0b3f9d38519216dff22cb1854b3094e7e45 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -109,33 +109,18 @@ class DeviceSimple : public DeviceBase {
 };
 
 template <typename T>
-bool AllValuesAre(const TensorProto& tensor, const T& value) {
-  // TensorProto represents the content of the tensor in either <type>_val or
-  // tensor_content.
-  typename checkpoint::SaveTypeTraits<T>::RepeatedField* tensor_values =
-      checkpoint::MutableTensorProtoData<T>(const_cast<TensorProto*>(&tensor));
-  if (!tensor_values->empty()) {
-    for (const T& tensor_value : *tensor_values) {
-      if (tensor_value != value) {
-        return false;
-      }
-    }
-    return true;
+bool AllValuesAre(const TensorProto& proto, const T& value) {
+  Tensor tensor;
+  if (!tensor.FromProto(proto)) {
+    return false;
   }
-  const auto tensor_content_size = tensor.tensor_content().size();
-  if (tensor_content_size > 0) {
-    CHECK_EQ(0, tensor_content_size % sizeof(T));
-    std::vector<T> raw_values(tensor_content_size / sizeof(T));
-    port::CopyToArray(tensor.tensor_content(),
-                      reinterpret_cast<char*>(raw_values.data()));
-    for (int i = 0; i < tensor_content_size / sizeof(T); ++i) {
-      if (raw_values[i] != value) {
-        return false;
-      }
+  auto values = tensor.flat<T>();
+  for (int i = 0; i < tensor.NumElements(); ++i) {
+    if (values(i) != value) {
+      return false;
     }
-    return true;
   }
-  return false;
+  return true;
 }
 
 // Add new_input as a control input to node if it does not already depend on it.
@@ -498,6 +483,11 @@ Status ConstantFolding::MaterializeBroadcastGradientArgs(
     return Status::OK();
   }
 
+  // Don't optimize this again if it was already optimized and folded.
+  if (OptimizedNodeExists(node, "-folded-1") ||
+      OptimizedNodeExists(node, "-folded-2")) {
+    return Status::OK();
+  }
   int64 min_id = 0;
   BCast::Vec shape1;
   if (!ExtractShape(*shape_node1, properties, &shape1, &min_id)) {
@@ -562,6 +552,7 @@ Status ConstantFolding::MaterializeBroadcastGradientArgs(
 
   const DataType type = node.attr().at("T").type();
   NodeDef* out[2];
+  bool created_const = false;
   for (int j = 0; j < 2; ++j) {
     int reduction_indices = reduce_dims[j].size();
     Tensor value(type, TensorShape({reduction_indices}));
@@ -585,17 +576,20 @@ Status ConstantFolding::MaterializeBroadcastGradientArgs(
           AddControlDependency(node.name(), graph_, node_map_.get());
       *out[j]->add_input() = ctrl_dep;
       node_map_->AddOutput(NodeName(ctrl_dep), const_name);
+      created_const = true;
     }
   }
 
-  const std::set<NodeDef*> outputs = node_map_->GetOutputs(node.name());
-  for (NodeDef* output : outputs) {
-    for (int k = 0; k < output->input_size(); ++k) {
-      int port;
-      string node_name = ParseNodeName(output->input(k), &port);
-      if (node_name == node.name() && port >= 0 && port < 2 && out[port]) {
-        *output->mutable_input(k) = out[port]->name();
-        node_map_->UpdateInput(output->name(), node_name, out[port]->name());
+  if (created_const) {
+    const std::set<NodeDef*> outputs = node_map_->GetOutputs(node.name());
+    for (NodeDef* output : outputs) {
+      for (int k = 0; k < output->input_size(); ++k) {
+        int port;
+        string node_name = ParseNodeName(output->input(k), &port);
+        if (node_name == node.name() && port >= 0 && port < 2 && out[port]) {
+          *output->mutable_input(k) = out[port]->name();
+          node_map_->UpdateInput(output->name(), node_name, out[port]->name());
+        }
       }
     }
   }
@@ -753,10 +747,6 @@ bool ConstantFolding::IsFoldable(const NodeDef& node) const {
   if (op.find("Quantized") != string::npos || op.find("Sparse") == 0) {
     return false;
   }
-  if (node.attr().count("_XlaCompile") > 0 &&
-      node.attr().at("_XlaCompile").b()) {
-    return false;
-  }
 
   const OpDef* op_def = nullptr;
   Status status = OpRegistry::Global()->LookUpOpDef(node.op(), &op_def);
@@ -783,7 +773,7 @@ bool ConstantFolding::IsFoldable(const NodeDef& node) const {
   // the case of a merge node that propagate the first inputs that becomes
   // available, and therefore only requires a single constant input to be
   // foldable.
-  bool has_constant_input = false;
+  bool merge_has_constant_input = false;
   const bool is_merge = IsMerge(node);
   for (const auto& input : node.input()) {
     if (IsControlInput(input)) {
@@ -794,21 +784,20 @@ bool ConstantFolding::IsFoldable(const NodeDef& node) const {
       return false;
     }
     bool is_const = IsReallyConstant(*input_node);
-    if (!is_const && !is_merge) {
-      return false;
-    }
-    // Don't fold strings constants for now since this causes problems with
-    // checkpointing.
-    if (is_const && input_node->attr().at("dtype").type() == DT_STRING) {
+    if (is_const) {
+      // Don't fold strings constants for now since this causes problems with
+      // checkpointing.
+      if (input_node->attr().at("dtype").type() == DT_STRING) {
+        return false;
+      }
+      // Special case: If a Merge node has at least one constant input that
+      // does not depend on a control input, we can fold it.
+      merge_has_constant_input |= !HasControlInputs(*input_node);
+    } else if (!is_merge) {
       return false;
     }
-    has_constant_input |= is_const;
   }
-  if (is_merge) {
-    return has_constant_input;
-  }
-
-  return true;
+  return !is_merge || merge_has_constant_input;
 }
 
 namespace {
@@ -825,17 +814,23 @@ Status CreateConstantTensorAttrValue(DataType type, double value,
   t->set_dtype(type);
   *t->mutable_tensor_shape() = shape;
   switch (type) {
-    SET_TENSOR_VAL_CASE(DT_FLOAT, float, float);
-    SET_TENSOR_VAL_CASE(DT_DOUBLE, double, double);
-    SET_TENSOR_VAL_CASE(DT_INT64, int64, int64);
-    SET_TENSOR_VAL_CASE(DT_UINT64, int64, int64);
-    SET_TENSOR_VAL_CASE(DT_INT32, int32, int);
-    SET_TENSOR_VAL_CASE(DT_UINT32, int32, int);
-    SET_TENSOR_VAL_CASE(DT_INT16, int32, int);
-    SET_TENSOR_VAL_CASE(DT_UINT16, int32, int);
-    SET_TENSOR_VAL_CASE(DT_INT8, int32, int);
-    SET_TENSOR_VAL_CASE(DT_UINT8, int32, int);
-    SET_TENSOR_VAL_CASE(DT_BOOL, bool, bool);
+    case DT_HALF:
+      t->add_half_val(static_cast<Eigen::half>(value).x);
+      break;
+    case DT_BFLOAT16:
+      t->add_half_val(static_cast<bfloat16>(value).value);
+      break;
+      SET_TENSOR_VAL_CASE(DT_FLOAT, float, float);
+      SET_TENSOR_VAL_CASE(DT_DOUBLE, double, double);
+      SET_TENSOR_VAL_CASE(DT_INT64, int64, int64);
+      SET_TENSOR_VAL_CASE(DT_UINT64, int64, int64);
+      SET_TENSOR_VAL_CASE(DT_INT32, int32, int);
+      SET_TENSOR_VAL_CASE(DT_UINT32, int32, int);
+      SET_TENSOR_VAL_CASE(DT_INT16, int32, int);
+      SET_TENSOR_VAL_CASE(DT_UINT16, int32, int);
+      SET_TENSOR_VAL_CASE(DT_INT8, int32, int);
+      SET_TENSOR_VAL_CASE(DT_UINT8, int32, int);
+      SET_TENSOR_VAL_CASE(DT_BOOL, bool, bool);
     default:
       return errors::InvalidArgument("Unsupported type: ", type);
   }
@@ -1246,7 +1241,8 @@ Status ConstantFolding::FoldGraph(GraphDef* output) {
     Status s = FoldNode(node, output);
     processed_nodes.insert(node->name());
     if (!s.ok()) {
-      VLOG(1) << "Failed to fold node " << node->name() << ": " << s;
+      VLOG(1) << "Failed to fold node " << node->DebugString()
+              << "\nError message: " << s;
     } else {
       for (auto& output : fanout) {
         if (IsFoldable(*output)) {
@@ -1388,8 +1384,8 @@ bool ConstantFolding::IsOnes(const NodeDef& node) const {
   }
   const auto dtype = node.attr().at("dtype").type();
   switch (dtype) {
-    // TODO(rmlarsen): Make DT_HALF case compile.
-    //    IS_ONES_CASE(DT_HALF);
+    IS_ONES_CASE(DT_HALF);
+    IS_ONES_CASE(DT_BFLOAT16);
     IS_ONES_CASE(DT_FLOAT);
     IS_ONES_CASE(DT_DOUBLE);
     IS_ONES_CASE(DT_COMPLEX64);
@@ -1423,8 +1419,8 @@ bool ConstantFolding::IsZeros(const NodeDef& node) const {
   }
   const auto dtype = node.attr().at("dtype").type();
   switch (dtype) {
-    // TODO(rmlarsen): Make DT_HALF case compile.
-    //    IS_ZEROS_CASE(DT_HALF);
+    IS_ZEROS_CASE(DT_HALF);
+    IS_ZEROS_CASE(DT_BFLOAT16);
     IS_ZEROS_CASE(DT_FLOAT);
     IS_ZEROS_CASE(DT_DOUBLE);
     IS_ZEROS_CASE(DT_COMPLEX64);
@@ -1511,9 +1507,8 @@ void ConstantFolding::ReplaceSubtractionFromZeroByNegation(NodeDef* node,
 }
 
 Status ConstantFolding::ReplaceOperationWithConstant(
-    double value, const TensorShapeProto& shape, NodeDef* node,
-    GraphDef* graph) {
-  AttrValue dtype_attr = node->attr().at("T");
+    double value, const AttrValue& dtype_attr, const TensorShapeProto& shape,
+    NodeDef* node, GraphDef* graph) {
   AttrValue tensor_attr;
   TF_RETURN_IF_ERROR(CreateConstantTensorAttrValue(dtype_attr.type(), value,
                                                    shape, &tensor_attr));
@@ -1535,15 +1530,26 @@ Status ConstantFolding::ReplaceOperationWithConstant(
   return Status::OK();
 }
 
-Status ConstantFolding::SimplifyGraph(GraphDef* output,
+Status ConstantFolding::SimplifyGraph(GraphDef* optimized_graph,
                                       GraphProperties* properties,
                                       bool use_shape_info) {
   const bool is_aggressive = opt_level_ == RewriterConfig::AGGRESSIVE;
-  for (int i = 0; i < output->node_size(); ++i) {
-    NodeDef* node = output->mutable_node(i);
+  for (int i = 0; i < optimized_graph->node_size(); ++i) {
+    NodeDef* node = optimized_graph->mutable_node(i);
+
+    if (IsSplit(*node) && node->attr().at("num_split").i() == 1) {
+      ReplaceOperationWithIdentity(1, node, optimized_graph);
+      continue;
+    }
+
+    if (IsSplitV(*node) && node->attr().at("num_split").i() == 1) {
+      ReplaceOperationWithIdentity(0, node, optimized_graph);
+      continue;
+    }
 
     // Remove Shuffle or Reverse op over scalar values.
     if (use_shape_info &&
+        !properties->GetInputProperties(node->name()).empty() &&
         (IsShuffle(*node) || IsReverse(*node) || IsTranspose(*node))) {
       const auto& shape =
           properties->GetInputProperties(node->name())[0].shape();
@@ -1554,7 +1560,7 @@ Status ConstantFolding::SimplifyGraph(GraphDef* output,
         replaceable &= shape.dim(j).size() == 1;
       }
       if (replaceable) {
-        ReplaceOperationWithIdentity(0, node, output);
+        ReplaceOperationWithIdentity(0, node, optimized_graph);
         continue;
       }
     }
@@ -1595,7 +1601,7 @@ Status ConstantFolding::SimplifyGraph(GraphDef* output,
           }
         }
         if (replaceable) {
-          ReplaceOperationWithIdentity(0, node, output);
+          ReplaceOperationWithIdentity(0, node, optimized_graph);
           continue;
         }
       }
@@ -1624,7 +1630,7 @@ Status ConstantFolding::SimplifyGraph(GraphDef* output,
           }
         }
         if (replaceable) {
-          ReplaceOperationWithIdentity(0, node, output);
+          ReplaceOperationWithIdentity(0, node, optimized_graph);
           continue;
         }
       }
@@ -1648,7 +1654,7 @@ Status ConstantFolding::SimplifyGraph(GraphDef* output,
           replaceable &= flatten(j) == 0;
         }
         if (replaceable) {
-          ReplaceOperationWithIdentity(0, node, output);
+          ReplaceOperationWithIdentity(0, node, optimized_graph);
           continue;
         }
       }
@@ -1668,7 +1674,7 @@ Status ConstantFolding::SimplifyGraph(GraphDef* output,
         replaceable &= shape.dim(j).size() > 1;
       }
       if (replaceable) {
-        ReplaceOperationWithIdentity(0, node, output);
+        ReplaceOperationWithIdentity(0, node, optimized_graph);
         continue;
       }
     }
@@ -1677,7 +1683,7 @@ Status ConstantFolding::SimplifyGraph(GraphDef* output,
         !OptimizedNodeExists(*node, "_const_axis")) {
       // Create constant axis node.
       Tensor axis_t(DT_INT32, TensorShape({}));
-      NodeDef* axis_node = output->add_node();
+      NodeDef* axis_node = optimized_graph->add_node();
       axis_node->set_name(OptimizedNodeName(*node, "_const_axis"));
       const int axis = node->attr().at("axis").i();
       if (!SetTensorValue(DT_INT32, axis, &axis_t).ok() ||
@@ -1685,7 +1691,6 @@ Status ConstantFolding::SimplifyGraph(GraphDef* output,
                .ok()) {
         continue;
       }
-      VLOG(1) << "*** Rewriting trivial Pack node: " << node->DebugString();
       // Add a control dependency to make sure axis_node is in the right frame.
       const string ctrl_dep = ConstantFolding::AddControlDependency(
           node->input(0), graph_, node_map_.get());
@@ -1703,6 +1708,55 @@ Status ConstantFolding::SimplifyGraph(GraphDef* output,
       if (node->input_size() > 2) {
         node->mutable_input()->SwapElements(1, node->input_size() - 1);
       }
+      graph_modified_ = true;
+      continue;
+    }
+
+    // Move constants past Enter.
+    if (IsEnter(*node) && node->input_size() > 0) {
+      if (node->attr().count("is_constant") == 0 ||
+          !node->attr().at("is_constant").b()) {
+        continue;
+      }
+      const string& node_name = node->name();
+      const NodeDef* input = node_map_->GetNode(node->input(0));
+      if (input != nullptr && IsReallyConstant(*input) &&
+          !OptimizedNodeExists(*input, "_enter")) {
+        auto fanouts = node_map_->GetOutputs(node_name);
+        // Find non-constant nodes that consume the output of *node.
+        std::vector<NodeDef*> consumers;
+        for (NodeDef* fanout : fanouts) {
+          if (!IsConstant(*fanout)) {
+            for (int i = 0; i < fanout->input_size(); ++i) {
+              if (fanout->input(i) == node_name) {
+                consumers.push_back(fanout);
+                break;
+              }
+            }
+          }
+        }
+        if (!consumers.empty()) {
+          NodeDef* new_node = optimized_graph->add_node();
+          *new_node = *input;
+          new_node->set_name(OptimizedNodeName(*input, "_enter"));
+          new_node->set_device(node->device());
+          new_node->clear_input();
+          new_node->add_input(AsControlDependency(node_name));
+          node_map_->AddNode(new_node->name(), new_node);
+          node_map_->AddOutput(node_name, new_node->name());
+          for (NodeDef* consumer : consumers) {
+            for (int i = 0; i < consumer->input_size(); ++i) {
+              if (NodeName(consumer->input(i)) == node_name) {
+                node_map_->UpdateInput(consumer->name(), node_name,
+                                       new_node->name());
+                consumer->set_input(i, new_node->name());
+              }
+            }
+          }
+          graph_modified_ = true;
+          continue;
+        }
+      }
     }
 
     // Switch(x, x) will always feed false to its false branch and true to
@@ -1754,7 +1808,7 @@ Status ConstantFolding::SimplifyGraph(GraphDef* output,
                     return n1->name() < n2->name();
                   });
         // Create constant false & true nodes.
-        NodeDef* false_node = output->add_node();
+        NodeDef* false_node = optimized_graph->add_node();
         false_node->set_name(OptimizedNodeName(*node, "_const_false"));
         if (!CreateNodeDef(false_node->name(), TensorValue(&false_t),
                            false_node)
@@ -1763,7 +1817,7 @@ Status ConstantFolding::SimplifyGraph(GraphDef* output,
         }
         false_node->set_device(node->device());
 
-        NodeDef* true_node = output->add_node();
+        NodeDef* true_node = optimized_graph->add_node();
         true_node->set_name(OptimizedNodeName(*node, "_const_true"));
         if (!CreateNodeDef(true_node->name(), TensorValue(&true_t), true_node)
                  .ok()) {
@@ -1776,10 +1830,10 @@ Status ConstantFolding::SimplifyGraph(GraphDef* output,
         const string false_port = node->name();
         const string true_port = strings::StrCat(node->name(), ":1");
         const string false_ctrl_dep =
-            AddControlDependency(false_port, output, node_map_.get());
+            AddControlDependency(false_port, optimized_graph, node_map_.get());
         false_node->add_input(false_ctrl_dep);
         const string true_ctrl_dep =
-            AddControlDependency(true_port, output, node_map_.get());
+            AddControlDependency(true_port, optimized_graph, node_map_.get());
         true_node->add_input(true_ctrl_dep);
 
         node_map_->AddNode(false_node->name(), false_node);
@@ -1861,13 +1915,13 @@ Status ConstantFolding::SimplifyGraph(GraphDef* output,
       if (y_matches_output_shape &&
           ((is_mul && x_is_one) || (is_add && x_is_zero))) {
         // 1 * y = y or 0 + y = y.
-        ReplaceOperationWithSnapshot(1, node, output);
+        ReplaceOperationWithSnapshot(1, node, optimized_graph);
         continue;
       }
 
       if (y_matches_output_shape && (is_sub && x_is_zero)) {
         // Replace 0 - y with Neg(y).
-        ReplaceSubtractionFromZeroByNegation(node, output);
+        ReplaceSubtractionFromZeroByNegation(node, optimized_graph);
         continue;
       }
 
@@ -1875,7 +1929,7 @@ Status ConstantFolding::SimplifyGraph(GraphDef* output,
       if (y_matches_output_shape && is_any_div && x_is_one) {
         DataType type = node->attr().at("T").type();
         if (DataTypeIsFloating(type) || DataTypeIsComplex(type)) {
-          ReplaceDivisionOfOnesByReciprocal(node, output);
+          ReplaceDivisionOfOnesByReciprocal(node, optimized_graph);
           continue;
         }
       }
@@ -1888,7 +1942,7 @@ Status ConstantFolding::SimplifyGraph(GraphDef* output,
       if (x_matches_output_shape && (((is_mul || is_any_div) && y_is_one) ||
                                      ((is_add || is_sub) && y_is_zero))) {
         // x * 1 = x or x / 1 = x or x +/- 0 = x
-        ReplaceOperationWithSnapshot(0, node, output);
+        ReplaceOperationWithSnapshot(0, node, optimized_graph);
         continue;
       }
 
@@ -1901,18 +1955,24 @@ Status ConstantFolding::SimplifyGraph(GraphDef* output,
           (is_mul || is_matmul || optimize_zeros_divided_by_y)) {
         const PartialTensorShape shp(output_shape);
         if (shp.IsFullyDefined()) {
-          TF_RETURN_IF_ERROR(
-              ReplaceOperationWithConstant(0, output_shape, node, output));
+          AttrValue dtype_attr;
+          if (node->op() == "SparseMatMul") {
+            dtype_attr.set_type(DT_FLOAT);
+          } else {
+            dtype_attr = node->attr().at("T");
+          }
+          TF_RETURN_IF_ERROR(ReplaceOperationWithConstant(
+              0, dtype_attr, output_shape, node, optimized_graph));
           continue;
         }
         // Even if an input shape is only partially known, we may known that it
         // matches the output shape and thus forward the corresponding zero
         // input.
         if ((is_mul || is_any_div) && x_is_zero && x_matches_output_shape) {
-          ReplaceOperationWithIdentity(0, node, output);
+          ReplaceOperationWithIdentity(0, node, optimized_graph);
           continue;
         } else if (is_mul && y_is_zero && y_matches_output_shape) {
-          ReplaceOperationWithIdentity(1, node, output);
+          ReplaceOperationWithIdentity(1, node, optimized_graph);
           continue;
         }
       }
@@ -1937,7 +1997,7 @@ Status ConstantFolding::SimplifyGraph(GraphDef* output,
         continue;
       }
       // Insert new reciprocal op and change node from Div to Mul.
-      NodeDef* reciprocal_node = output->add_node();
+      NodeDef* reciprocal_node = optimized_graph->add_node();
       reciprocal_node->set_name(OptimizedNodeName(*node, "_recip"));
       reciprocal_node->set_op("Reciprocal");
       reciprocal_node->set_device(node->device());
@@ -1950,6 +2010,7 @@ Status ConstantFolding::SimplifyGraph(GraphDef* output,
       node_map_->UpdateOutput(node->name(), const_input,
                               reciprocal_node->name());
       graph_modified_ = true;
+      continue;
     }
 
     // Consider the transformation
@@ -2042,6 +2103,7 @@ Status ConstantFolding::SimplifyGraph(GraphDef* output,
     if (IsIdentityN(*node) && NumNonControlInputs(*node) > 0) {
       const std::set<NodeDef*>& tmp = node_map_->GetOutputs(node->name());
       const std::vector<NodeDef*> consumers(tmp.begin(), tmp.end());
+      bool updated_graph = false;
       for (int input_idx = 0; input_idx < node->input_size(); ++input_idx) {
         const string& input = node->input(input_idx);
         if (IsControlInput(input)) {
@@ -2072,7 +2134,7 @@ Status ConstantFolding::SimplifyGraph(GraphDef* output,
               if (input_node_name == node->name() && output_idx == input_idx) {
                 consumer->set_input(consumer_input_idx, input);
                 // We will keep the input from IdentityN through a control
-                // dependendy, so we only need to add the consumer as an output
+                // dependency, so we only need to add the consumer as an output
                 // for the constant input node.
                 node_map_->AddOutput(NodeName(input), consumer->name());
                 add_dep = true;
@@ -2080,12 +2142,18 @@ Status ConstantFolding::SimplifyGraph(GraphDef* output,
             }
             if (add_dep) {
               consumer->add_input(AsControlDependency(node->name()));
+              updated_graph = true;
             }
           }
         }
       }
-      for (NodeDef* consumer : consumers) {
-        DedupControlInputs(consumer);
+
+      if (updated_graph) {
+        for (NodeDef* consumer : consumers) {
+          DedupControlInputs(consumer);
+        }
+        graph_modified_ = true;
+        continue;
       }
     }
 
@@ -2126,7 +2194,7 @@ Status ConstantFolding::SimplifyGraph(GraphDef* output,
       if (1 < const_inputs.size() &&
           const_inputs.size() < num_non_control_inputs &&
           !node_map_->NodeExists(new_node_name)) {
-        NodeDef* added_node = output->add_node();
+        NodeDef* added_node = optimized_graph->add_node();
         *added_node = *node;
         // Always use AddN for the constant node, since AccumulateNV2 is a fake
         // node that cannot be constant folded, since it does not have a kernel.
@@ -2230,7 +2298,7 @@ Status ConstantFolding::SimplifyGraph(GraphDef* output,
         if (node_map_->NodeExists(new_node_name)) {
           break;
         }
-        NodeDef* added_node = output->add_node();
+        NodeDef* added_node = optimized_graph->add_node();
         *added_node = *node;
         added_node->set_name(new_node_name);
         node_map_->AddNode(added_node->name(), added_node);
@@ -2278,7 +2346,7 @@ Status ConstantFolding::SimplifyGraph(GraphDef* output,
 
 Status ConstantFolding::RunOptimizationPass(Cluster* cluster,
                                             const GrapplerItem& item,
-                                            GraphDef* output) {
+                                            GraphDef* optimized_graph) {
   node_map_.reset(new NodeMap(graph_));
   nodes_whitelist_.clear();
   // Fold fetch nodes iff it has a single fanout. Note that if a fetch node
@@ -2307,20 +2375,20 @@ Status ConstantFolding::RunOptimizationPass(Cluster* cluster,
     TF_RETURN_IF_ERROR(MaterializeShapes(properties));
     TF_RETURN_IF_ERROR(MaterializeConstants(properties));
   }
-  TF_RETURN_IF_ERROR(FoldGraph(output));
-  node_map_.reset(new NodeMap(output));
-  TF_RETURN_IF_ERROR(SimplifyGraph(output, &properties, can_use_shape_info));
+  TF_RETURN_IF_ERROR(FoldGraph(optimized_graph));
+  node_map_.reset(new NodeMap(optimized_graph));
+  TF_RETURN_IF_ERROR(
+      SimplifyGraph(optimized_graph, &properties, can_use_shape_info));
 
   return Status::OK();
 }
 
 Status ConstantFolding::Optimize(Cluster* cluster, const GrapplerItem& item,
-                                 GraphDef* output) {
+                                 GraphDef* optimized_graph) {
   // TensorFlow flushes denormals to zero and rounds to nearest, so we do
   // the same here.
   port::ScopedFlushDenormal flush;
   port::ScopedSetRound round(FE_TONEAREST);
-
   nodes_to_preserve_ = item.NodesToPreserve();
   for (const auto& feed : item.feed) {
     feed_nodes_.insert(NodeName(feed.first));
@@ -2332,20 +2400,20 @@ Status ConstantFolding::Optimize(Cluster* cluster, const GrapplerItem& item,
   }
 
   has_fetch_ = !item.fetch.empty();
-
   GrapplerItem item_to_optimize = item;
-  *output = item.graph;
+  *optimized_graph = item.graph;
   int64 node_count;
   do {
     graph_modified_ = false;
-    item_to_optimize.graph.Swap(output);
+    item_to_optimize.graph.Swap(optimized_graph);
     graph_ = &item_to_optimize.graph;
-    *output = GraphDef();
+    *optimized_graph = GraphDef();
     node_count = graph_->node_size();
-    TF_RETURN_IF_ERROR(RunOptimizationPass(cluster, item_to_optimize, output));
-  } while (graph_modified_ || output->node_size() != node_count);
-  *output->mutable_library() = item.graph.library();
-  *output->mutable_versions() = item.graph.versions();
+    TF_RETURN_IF_ERROR(
+        RunOptimizationPass(cluster, item_to_optimize, optimized_graph));
+  } while (graph_modified_ || optimized_graph->node_size() != node_count);
+  *optimized_graph->mutable_library() = item.graph.library();
+  *optimized_graph->mutable_versions() = item.graph.versions();
 
   return Status::OK();
 }
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.h b/tensorflow/core/grappler/optimizers/constant_folding.h
index 13ecfcd281913777dd47efca0b7b4a15f4ced7b6..f8a9e90d62111e516496450edb873d0e32e40464 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.h
+++ b/tensorflow/core/grappler/optimizers/constant_folding.h
@@ -38,7 +38,7 @@ class ConstantFolding : public GraphOptimizer {
   static string AddControlDependency(const string& input_name, GraphDef* graph,
                                      NodeMap* node_map);
 
-  ConstantFolding(DeviceBase* cpu_device);
+  explicit ConstantFolding(DeviceBase* cpu_device);
   ConstantFolding(RewriterConfig::Toggle opt_level, DeviceBase* cpu_device);
 
   ~ConstantFolding() override {}
@@ -83,7 +83,7 @@ class ConstantFolding : public GraphOptimizer {
   void ReplaceOperationWithSnapshot(int input_to_forward, NodeDef* node,
                                     GraphDef* graph);
   void ReplaceSubtractionFromZeroByNegation(NodeDef* node, GraphDef* graph);
-  Status ReplaceOperationWithConstant(double value,
+  Status ReplaceOperationWithConstant(double value, const AttrValue& dtype_attr,
                                       const TensorShapeProto& shape,
                                       NodeDef* node, GraphDef* graph);
   void ReplaceDivisionOfOnesByReciprocal(NodeDef* node, GraphDef* graph);
diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index 9050ccb05387a77ad341c213fef48dfb76c0c736..71ee81dfde9be2b7e4f5d3c11e3adc2e8a36f9b0 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -22,13 +22,66 @@ limitations under the License.
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/grappler_test.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 
 namespace tensorflow {
 namespace grappler {
 namespace {
 
-class ConstantFoldingTest : public GrapplerTest {};
+class ConstantFoldingTest : public GrapplerTest {
+ protected:
+  template <DataType DTYPE>
+  void SimpleNeutralElementTest() {
+    typedef typename EnumToDataType<DTYPE>::Type T;
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output x = ops::Placeholder(s.WithOpName("x"), DTYPE,
+                                ops::Placeholder::Shape(TensorShape({2, 2})));
+    Tensor zeros_t(DTYPE, TensorShape({2, 2}));
+    Tensor ones_t(DTYPE, TensorShape({2, 2}));
+    Tensor x_t(DTYPE, TensorShape({2, 2}));
+    for (int i = 0; i < 4; ++i) {
+      zeros_t.flat<T>()(i) = T(0);
+      ones_t.flat<T>()(i) = T(1);
+      x_t.flat<T>()(i) = T(i + 1);
+    }
+    Output zeros = ops::Const(s.WithOpName("zeros"), zeros_t);
+    Output ones = ops::Const(s.WithOpName("ones"), ones_t);
+    Output mul1 = ops::Mul(s.WithOpName("mul1"), x, zeros);
+    Output mul2 = ops::Mul(s.WithOpName("mul2"), x, ones);
+
+    GrapplerItem item;
+    TF_CHECK_OK(s.ToGraphDef(&item.graph));
+    item.fetch = {"mul1", "mul2"};
+    ConstantFolding optimizer(nullptr /* cpu_device */);
+    GraphDef output;
+    Status status = optimizer.Optimize(nullptr, item, &output);
+    TF_EXPECT_OK(status);
+    LOG(INFO) << output.DebugString();
+    EXPECT_EQ(5, output.node_size());
+    for (int i = 0; i < output.node_size(); ++i) {
+      const NodeDef& node = output.node(i);
+      const string& name = node.name();
+      if (name == "mul1") {
+        EXPECT_EQ("Const", node.op());
+        EXPECT_EQ("^x", node.input(0));
+        EXPECT_EQ("^zeros", node.input(1));
+      } else if (name == "mul2") {
+        EXPECT_EQ("Snapshot", node.op());
+        EXPECT_EQ("x", node.input(0));
+        EXPECT_EQ("^ones", node.input(1));
+      }
+    }
+    auto tensors_expected =
+        EvaluateNodes(item.graph, {"mul1", "mul2"}, {{"x", x_t}});
+    auto tensors = EvaluateNodes(output, {"mul1", "mul2"}, {{"x", x_t}});
+    EXPECT_EQ(2, tensors_expected.size());
+    EXPECT_EQ(2, tensors.size());
+    for (int i = 0; i < 2; ++i) {
+      test::ExpectTensorEqual<T>(tensors_expected[i], tensors[i]);
+    }
+  }
+};
 
 TEST_F(ConstantFoldingTest, SimpleFolding) {
   // Build a simple graph with a few trivially prunable ops.
@@ -43,9 +96,9 @@ TEST_F(ConstantFoldingTest, SimpleFolding) {
   item.fetch.push_back("d");
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
-  ConstantFolding fold(nullptr /* cpu_device */);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   EXPECT_EQ(1, output.node_size());
@@ -89,9 +142,9 @@ TEST_F(ConstantFoldingTest, AddTree) {
   item.fetch = {"add_parent", "mul_parent", "addmul_parent"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
-  ConstantFolding fold(nullptr /* cpu_device */);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   // We expect the following rewrite(s) to occur:
@@ -319,9 +372,31 @@ TEST_F(ConstantFoldingTest, NeutralElement) {
         EXPECT_EQ(2, t.tensor_shape().dim(1).size());
       }
     }
+    auto a_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 2}));
+    auto b_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 3}));
+    auto x_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+    auto y_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+    auto bias_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2}));
+
+    auto tensors_expected = EvaluateNodes(
+        item.graph, item.fetch,
+        {{"x", x_t}, {"y", y_t}, {"a", a_t}, {"b", b_t}, {"bias", bias_t}});
+    EXPECT_EQ(item.fetch.size(), tensors_expected.size());
+    auto tensors = EvaluateNodes(
+        output, item.fetch,
+        {{"x", x_t}, {"y", y_t}, {"a", a_t}, {"b", b_t}, {"bias", bias_t}});
+    EXPECT_EQ(item.fetch.size(), tensors.size());
+    for (int i = 0; i < item.fetch.size(); ++i) {
+      test::ExpectTensorNear<float>(tensors_expected[i], tensors[i], 1e-6);
+    }
   }
 }
 
+TEST_F(ConstantFoldingTest, NeutralElement_ShortFloats) {
+  SimpleNeutralElementTest<DT_HALF>();
+  SimpleNeutralElementTest<DT_BFLOAT16>();
+}
+
 TEST_F(ConstantFoldingTest, StrengthReduce_Reciprocal) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output cf_half = ops::Const(s.WithOpName("cf_half"), 0.5f, {1});
@@ -525,9 +600,9 @@ TEST_F(ConstantFoldingTest, CreateConstNodes) {
 
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-  ConstantFolding fold(nullptr /* cpu_device */);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   EXPECT_EQ(24, output.node_size());
@@ -574,9 +649,9 @@ TEST_F(ConstantFoldingTest, FoldingNodeWithTwoOutputs) {
   item.fetch.push_back("f");
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
-  ConstantFolding fold(nullptr /* cpu_device */);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   EXPECT_EQ(2, output.node_size());
@@ -614,10 +689,11 @@ TEST_F(ConstantFoldingTest, ControlDependencies) {
   GrapplerItem item;
   item.fetch.push_back("e");
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
-
-  ConstantFolding fold(nullptr /* cpu_device */);
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+  EXPECT_EQ(1, tensors_expected.size());
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   std::vector<string> expected_nodes = {"dflt", "p1", "p2", "e"};
@@ -641,6 +717,9 @@ TEST_F(ConstantFoldingTest, ControlDependencies) {
     }
   }
   EXPECT_EQ(1, found);
+  auto tensors = EvaluateNodes(output, item.fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<int>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(ConstantFoldingTest, ControlDependenciesEmptyFetch) {
@@ -658,9 +737,9 @@ TEST_F(ConstantFoldingTest, ControlDependenciesEmptyFetch) {
   GrapplerItem item;
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding fold(nullptr /* cpu_device */);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   std::vector<string> expected_nodes = {"dflt", "p1", "p2", "c",
@@ -714,10 +793,11 @@ TEST_F(ConstantFoldingTest, ControlDependenciesDeduplicate) {
   GrapplerItem item;
   item.fetch.push_back("i2");
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
-
-  ConstantFolding fold(nullptr /* cpu_device */);
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+  EXPECT_EQ(1, tensors_expected.size());
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   std::vector<string> expected_nodes = {"dflt", "p1", "p2", "i2"};
@@ -733,6 +813,9 @@ TEST_F(ConstantFoldingTest, ControlDependenciesDeduplicate) {
       EXPECT_EQ("^p2", node.input(1));
     }
   }
+  auto tensors = EvaluateNodes(output, item.fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<int>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(ConstantFoldingTest, VariableNumberOfOutputs) {
@@ -789,9 +872,9 @@ TEST_F(ConstantFoldingTest, VariableNumberOfOutputs) {
   }
 
   item.fetch = outputs;
-  ConstantFolding fold(nullptr /* cpu_device */);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   int constant_folded = 0;
@@ -827,9 +910,9 @@ TEST_F(ConstantFoldingTest, ShapeMaterialization) {
   item.fetch.push_back("p2");
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding fold(nullptr /* cpu_device */);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   int found = 0;
@@ -866,9 +949,9 @@ TEST_F(ConstantFoldingTest, ShapeMaterializationEmptyFetch) {
   GrapplerItem item;
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding fold(nullptr /* cpu_device */);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   int found = 0;
@@ -920,9 +1003,9 @@ TEST_F(ConstantFoldingTest, ShapeMaterializationShapeN) {
   GrapplerItem item;
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding fold(nullptr /* cpu_device */);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
   int found = 0;
   for (const auto& node : output.node()) {
@@ -978,9 +1061,9 @@ TEST_F(ConstantFoldingTest, ShapeMaterializationShapeN_MultipleOutputs) {
   item.fetch.push_back("ia");
   item.fetch.push_back("ib");
 
-  ConstantFolding fold(nullptr /* cpu_device */);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   int found = 0;
@@ -1039,9 +1122,9 @@ TEST_F(ConstantFoldingTest, SwitchNodesEmptyFetch) {
   GrapplerItem item;
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding fold(nullptr /* cpu_device */);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   std::set<string> present_nodes = {"v_in",     "v_ctrl",
@@ -1117,9 +1200,9 @@ TEST_F(ConstantFoldingTest, SwitchNodes) {
 
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding fold(nullptr /* cpu_device */);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
   std::set<string> present_nodes = {"v_in",     "v_ctrl",
                                     "switch",   "i",
@@ -1173,6 +1256,10 @@ TEST_F(ConstantFoldingTest, MergeNodes) {
   ops::Merge m1(scope.WithOpName("m1"), {x, const1, const2});
   ops::Merge m2(scope.WithOpName("m2"), {const1, const3});
   ops::Merge m3(scope.WithOpName("m3"), {x, y});
+  // m4 is not foldable because the only constant input
+  // has a control input, so we cannot know if it will be
+  // triggered.
+  ops::Merge m4(scope.WithOpName("m4"), {x, const1});
 
   ops::Identity out1(scope.WithOpName("out1"), m1.output);
   ops::Identity idx1(scope.WithOpName("idx1"), m1.value_index);
@@ -1180,16 +1267,19 @@ TEST_F(ConstantFoldingTest, MergeNodes) {
   ops::Identity idx2(scope.WithOpName("idx2"), m2.value_index);
   ops::Identity out3(scope.WithOpName("out3"), m3.output);
   ops::Identity idx3(scope.WithOpName("idx3"), m3.value_index);
+  ops::Identity out4(scope.WithOpName("out4"), m4.output);
+  ops::Identity idx4(scope.WithOpName("idx4"), m4.value_index);
 
   GrapplerItem item;
-  item.fetch = {"out1", "idx1", "out2", "idx2", "out3", "idx3"};
+  item.fetch = {"out1", "idx1", "out2", "idx2", "out3", "idx3", "out4", "idx4"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding fold(nullptr /* cpu_device */);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
+  EXPECT_EQ(19, output.node_size());
   int found_nodes = 0;
   for (const auto& node : output.node()) {
     if (node.name() == "out1") {
@@ -1226,10 +1316,18 @@ TEST_F(ConstantFoldingTest, MergeNodes) {
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("m3:1", node.input(0));
       ++found_nodes;
+    } else if (node.name() == "out4") {
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("m4", node.input(0));
+      ++found_nodes;
+    } else if (node.name() == "idx4") {
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("m4:1", node.input(0));
+      ++found_nodes;
     }
   }
   // Make sure the graph contains all the nodes we're expecting.
-  EXPECT_EQ(6, found_nodes);
+  EXPECT_EQ(8, found_nodes);
 
   std::vector<string> fetch = {"out1", "idx1"};
   auto tensors = EvaluateNodes(output, fetch);
@@ -1244,6 +1342,82 @@ TEST_F(ConstantFoldingTest, MergeNodes) {
   EXPECT_EQ(2, out_idx.flat<int32>()(0));
 }
 
+TEST_F(ConstantFoldingTest, SplitRemoval) {
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+
+  Output in1 =
+      ops::Variable(scope.WithOpName("in1"), TensorShape({2}), DT_FLOAT);
+  Output in2 =
+      ops::Variable(scope.WithOpName("in2"), TensorShape({4}), DT_FLOAT);
+  auto split_dim = ops::Const(scope.WithOpName("split_dim"), {0}, {});
+  ops::Split s1(scope.WithOpName("s1"), split_dim, in1, 1);
+  ops::Split s2(scope.WithOpName("s2"), split_dim, in2, 2);
+
+  ops::Add out(scope.WithOpName("out"), s1[0], s2[0]);
+
+  GrapplerItem item;
+  item.fetch = {"out"};
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+
+  ConstantFolding optimizer(nullptr /* cpu_device */);
+  GraphDef got;
+  Status status = optimizer.Optimize(nullptr, item, &got);
+  TF_EXPECT_OK(status);
+
+  GraphDef want;
+  AddNode("in1", "VariableV2", {}, {}, &want);
+  AddNode("in2", "VariableV2", {}, {}, &want);
+  AddNode("split_dim", "Const", {}, {}, &want);
+  AddNode("s1", "Identity", {"in1", AsControlDependency("split_dim")}, {},
+          &want);
+  AddNode("s2", "Split", {"in2", "split_dim"}, {}, &want);
+  AddNode("out", "Add", {"s1", "s2"}, {}, &want);
+
+  CompareGraphs(want, got);
+}
+
+TEST_F(ConstantFoldingTest, SplitVRemoval) {
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+
+  Output in1 =
+      ops::Variable(scope.WithOpName("in1"), TensorShape({2}), DT_FLOAT);
+  Output in2 =
+      ops::Variable(scope.WithOpName("in2"), TensorShape({5}), DT_FLOAT);
+  auto split_dim = ops::Const(scope.WithOpName("split_dim"), {0}, {});
+  auto size_splits1 = ops::Const(scope.WithOpName("size_splits1"), {2}, {1});
+  auto size_splits2 = ops::Const(scope.WithOpName("size_splits2"), {2, 3}, {2});
+  ops::SplitV s1(scope.WithOpName("s1"), in1, size_splits1, split_dim, 1);
+  ops::SplitV s2(scope.WithOpName("s2"), in2, size_splits2, split_dim, 2);
+
+  LOG(INFO) << s1.output.size();
+  LOG(INFO) << s2.output.size();
+  ops::Add out(scope.WithOpName("out"), s1[0], s2[0]);
+
+  GrapplerItem item;
+  item.fetch = {"out"};
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+
+  ConstantFolding optimizer(nullptr /* cpu_device */);
+  GraphDef got;
+  Status status = optimizer.Optimize(nullptr, item, &got);
+  TF_EXPECT_OK(status);
+
+  GraphDef want;
+  AddNode("in1", "VariableV2", {}, {}, &want);
+  AddNode("in2", "VariableV2", {}, {}, &want);
+  AddNode("split_dim", "Const", {}, {}, &want);
+  AddNode("size_splits1", "Const", {}, {}, &want);
+  AddNode("size_splits2", "Const", {}, {}, &want);
+  AddNode("s1", "Identity",
+          {"in1", AsControlDependency("size_splits1"),
+           AsControlDependency("split_dim")},
+          {}, &want);
+  AddNode("s2", "SplitV", {"in2", "size_splits2", "split_dim"}, {}, &want);
+  AddNode("out", "Add", {"s1", "s2"}, {}, &want);
+
+  CompareGraphs(want, got);
+}
+
 TEST_F(ConstantFoldingTest, ShuffleReverseOnScalarRemoval) {
   tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
 
@@ -1262,18 +1436,18 @@ TEST_F(ConstantFoldingTest, ShuffleReverseOnScalarRemoval) {
   item.fetch = {"out1", "out2"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding fold(nullptr /* cpu_device */);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef got;
-  Status status = fold.Optimize(nullptr, item, &got);
+  Status status = optimizer.Optimize(nullptr, item, &got);
   TF_EXPECT_OK(status);
 
   GraphDef want;
-  AddNode("in1", "VariableV2", {}, &want);
-  AddNode("in2", "VariableV2", {}, &want);
-  AddNode("s1", "Identity", {"in1"}, &want);
-  AddNode("s2", "Identity", {"in2", AsControlDependency("in1")}, &want);
-  AddNode("out1", "Add", {"s1", "s2"}, &want);
-  AddNode("out2", "Identity", {"s2"}, &want);
+  AddNode("in1", "VariableV2", {}, {}, &want);
+  AddNode("in2", "VariableV2", {}, {}, &want);
+  AddNode("s1", "Identity", {"in1"}, {}, &want);
+  AddNode("s2", "Identity", {"in2", AsControlDependency("in1")}, {}, &want);
+  AddNode("out1", "Add", {"s1", "s2"}, {}, &want);
+  AddNode("out2", "Identity", {"s2"}, {}, &want);
 
   CompareGraphs(want, got);
 }
@@ -1295,21 +1469,21 @@ TEST_F(ConstantFoldingTest, SliceWithSameDimensionRemoval) {
     item.fetch = {"out"};
     TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-    ConstantFolding fold(nullptr /* cpu_device */);
+    ConstantFolding optimizer(nullptr /* cpu_device */);
     GraphDef got;
-    Status status = fold.Optimize(nullptr, item, &got);
+    Status status = optimizer.Optimize(nullptr, item, &got);
     TF_EXPECT_OK(status);
 
     GraphDef want;
-    AddNode("in1", "VariableV2", {}, &want);
-    AddNode("in2", "VariableV2", {}, &want);
-    AddNode("begin", "Const", {}, &want);
-    AddNode("size", "Const", {}, &want);
+    AddNode("in1", "VariableV2", {}, {}, &want);
+    AddNode("in2", "VariableV2", {}, {}, &want);
+    AddNode("begin", "Const", {}, {}, &want);
+    AddNode("size", "Const", {}, {}, &want);
     AddNode("s1", "Identity",
             {"in1", AsControlDependency("begin"), AsControlDependency("size")},
-            &want);
-    AddNode("s2", "Slice", {"in2", "begin", "size"}, &want);
-    AddNode("out", "Add", {"s1", "s2"}, &want);
+            {}, &want);
+    AddNode("s2", "Slice", {"in2", "begin", "size"}, {}, &want);
+    AddNode("out", "Add", {"s1", "s2"}, {}, &want);
 
     CompareGraphs(want, got);
   }
@@ -1332,22 +1506,22 @@ TEST_F(ConstantFoldingTest, SliceWithSameDimensionRemoval) {
     item.fetch = {"out"};
     TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-    ConstantFolding fold(nullptr /* cpu_device */);
+    ConstantFolding optimizer(nullptr /* cpu_device */);
     GraphDef got;
-    Status status = fold.Optimize(nullptr, item, &got);
+    Status status = optimizer.Optimize(nullptr, item, &got);
     TF_EXPECT_OK(status);
 
     GraphDef want;
-    AddNode("in1", "VariableV2", {}, &want);
-    AddNode("in2", "VariableV2", {}, &want);
-    AddNode("begin1", "Const", {}, &want);
-    AddNode("begin2", "Const", {}, &want);
-    AddNode("size", "Const", {}, &want);
+    AddNode("in1", "VariableV2", {}, {}, &want);
+    AddNode("in2", "VariableV2", {}, {}, &want);
+    AddNode("begin1", "Const", {}, {}, &want);
+    AddNode("begin2", "Const", {}, {}, &want);
+    AddNode("size", "Const", {}, {}, &want);
     AddNode("s1", "Identity",
             {"in1", AsControlDependency("begin1"), AsControlDependency("size")},
-            &want);
-    AddNode("s2", "Slice", {"in2", "begin2", "size"}, &want);
-    AddNode("out", "Add", {"s1", "s2"}, &want);
+            {}, &want);
+    AddNode("s2", "Slice", {"in2", "begin2", "size"}, {}, &want);
+    AddNode("out", "Add", {"s1", "s2"}, {}, &want);
 
     CompareGraphs(want, got);
   }
@@ -1370,19 +1544,20 @@ TEST_F(ConstantFoldingTest, TileWithMultipliesBeingOne) {
   item.fetch = {"out"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding fold(nullptr /* cpu_device */);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef got;
-  Status status = fold.Optimize(nullptr, item, &got);
+  Status status = optimizer.Optimize(nullptr, item, &got);
   TF_EXPECT_OK(status);
 
   GraphDef want;
-  AddNode("in1", "VariableV2", {}, &want);
-  AddNode("in2", "VariableV2", {}, &want);
-  AddNode("multiplies1", "Const", {}, &want);
-  AddNode("multiplies2", "Const", {}, &want);
-  AddNode("t1", "Identity", {"in1", AsControlDependency("multiplies1")}, &want);
-  AddNode("t2", "Tile", {"in2", "multiplies2"}, &want);
-  AddNode("out", "Add", {"t1", "t2"}, &want);
+  AddNode("in1", "VariableV2", {}, {}, &want);
+  AddNode("in2", "VariableV2", {}, {}, &want);
+  AddNode("multiplies1", "Const", {}, {}, &want);
+  AddNode("multiplies2", "Const", {}, {}, &want);
+  AddNode("t1", "Identity", {"in1", AsControlDependency("multiplies1")}, {},
+          &want);
+  AddNode("t2", "Tile", {"in2", "multiplies2"}, {}, &want);
+  AddNode("out", "Add", {"t1", "t2"}, {}, &want);
 
   CompareGraphs(want, got);
 }
@@ -1408,23 +1583,23 @@ TEST_F(ConstantFoldingTest, PaddingWithZeroSize) {
   item.fetch = {"out"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding fold(nullptr /* cpu_device */);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef got;
-  Status status = fold.Optimize(nullptr, item, &got);
+  Status status = optimizer.Optimize(nullptr, item, &got);
   TF_EXPECT_OK(status);
 
   GraphDef want;
-  AddNode("in1", "VariableV2", {}, &want);
-  AddNode("in2", "VariableV2", {}, &want);
-  AddNode("paddings1", "Const", {}, &want);
-  AddNode("paddings2", "Const", {}, &want);
-  AddNode("c1", "Const", {}, &want);
-  AddNode("c2", "Const", {}, &want);
+  AddNode("in1", "VariableV2", {}, {}, &want);
+  AddNode("in2", "VariableV2", {}, {}, &want);
+  AddNode("paddings1", "Const", {}, {}, &want);
+  AddNode("paddings2", "Const", {}, {}, &want);
+  AddNode("c1", "Const", {}, {}, &want);
+  AddNode("c2", "Const", {}, {}, &want);
   AddNode("p1", "Identity",
           {"in1", AsControlDependency("paddings1"), AsControlDependency("c1")},
-          &want);
-  AddNode("p2", "PadV2", {"in2", "paddings2", "c2"}, &want);
-  AddNode("out", "Add", {"p1", "p2"}, &want);
+          {}, &want);
+  AddNode("p2", "PadV2", {"in2", "paddings2", "c2"}, {}, &want);
+  AddNode("out", "Add", {"p1", "p2"}, {}, &want);
 
   CompareGraphs(want, got);
 }
@@ -1444,17 +1619,17 @@ TEST_F(ConstantFoldingTest, SqueezeWithAllDimesionsGreaterThanOne) {
   item.fetch = {"out"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding fold(nullptr /* cpu_device */);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef got;
-  Status status = fold.Optimize(nullptr, item, &got);
+  Status status = optimizer.Optimize(nullptr, item, &got);
   TF_EXPECT_OK(status);
 
   GraphDef want;
-  AddNode("in1", "VariableV2", {}, &want);
-  AddNode("in2", "VariableV2", {}, &want);
-  AddNode("s1", "Identity", {"in1"}, &want);
-  AddNode("s2", "Squeeze", {"in2"}, &want);
-  AddNode("out", "Add", {"s1", "s2"}, &want);
+  AddNode("in1", "VariableV2", {}, {}, &want);
+  AddNode("in2", "VariableV2", {}, {}, &want);
+  AddNode("s1", "Identity", {"in1"}, {}, &want);
+  AddNode("s2", "Squeeze", {"in2"}, {}, &want);
+  AddNode("out", "Add", {"s1", "s2"}, {}, &want);
 
   CompareGraphs(want, got);
 }
@@ -1475,9 +1650,9 @@ TEST_F(ConstantFoldingTest, NoOpReduction) {
   item.fetch.push_back("s");
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding fold(nullptr /* cpu_device */);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   bool found = false;
@@ -1534,9 +1709,9 @@ TEST_F(ConstantFoldingTest, NoOpReshape) {
   item.fetch = {"s1", "s2", "s3", "s4"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding fold(nullptr /* cpu_device */);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   int found = 0;
@@ -1581,9 +1756,9 @@ TEST_F(ConstantFoldingTest, Packing) {
   GrapplerItem item;
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding fold(nullptr /* cpu_device */);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   // Make sure that the representation of the folded constant is space
@@ -1616,14 +1791,14 @@ TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
-  ConstantFolding fold(nullptr /* cpu_device */);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   // Run a second time to make sure the optimization is idempotent.
   item.graph.Swap(&output);
-  status = fold.Optimize(nullptr, item, &output);
+  status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   int found = 0;
@@ -1663,6 +1838,79 @@ TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs) {
   EXPECT_EQ(6, found);
 }
 
+TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs_InfiniteLoop) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output a =
+      ops::Placeholder(s.WithOpName("a"), DT_FLOAT,
+                       ops::Placeholder::Shape(PartialTensorShape({2, 2})));
+  Output b = ops::Square(s.WithOpName("b"), a);
+  Output c = ops::Mul(s.WithOpName("c"), a, b);
+  Output d = ops::Shape(s.WithOpName("d"), a);
+  Output e = ops::Shape(s.WithOpName("e"), b);
+
+  auto f = ops::internal::BroadcastGradientArgs(s.WithOpName("f"), d, e);
+  Output o1 = ops::Identity(s.WithOpName("o1"), f.r0);
+  Output o2 = ops::Identity(s.WithOpName("o2"), f.r1);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  ConstantFolding optimizer(nullptr /* cpu_device */);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  // Run a second time to make sure the optimization is idempotent.
+  item.graph.Swap(&output);
+  status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(11, output.node_size());
+  int found = 0;
+  for (const auto& node : output.node()) {
+    if (node.name() == "ConstantFolding/f-folded-1") {
+      ++found;
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("^a", node.input(0));
+      EXPECT_EQ("^b", node.input(1));
+    } else if (node.name() == "d") {
+      ++found;
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("^a", node.input(0));
+    } else if (node.name() == "e") {
+      ++found;
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("^b", node.input(0));
+    } else if (node.name() == "o1") {
+      ++found;
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("ConstantFolding/f-bcastargs-0", node.input(0));
+    } else if (node.name() == "o2") {
+      ++found;
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("ConstantFolding/f-bcastargs-1", node.input(0));
+    } else if (node.name() == "ConstantFolding/f-bcastargs-0") {
+      ++found;
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("^ConstantFolding/f-folded-1", node.input(0));
+      EXPECT_EQ(0, TensorShape(node.attr().at("value").tensor().tensor_shape())
+                       .num_elements());
+    } else if (node.name() == "ConstantFolding/f-bcastargs-1") {
+      ++found;
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("^ConstantFolding/f-folded-1", node.input(0));
+      EXPECT_EQ(0, TensorShape(node.attr().at("value").tensor().tensor_shape())
+                       .num_elements());
+    }
+  }
+  EXPECT_EQ(7, found);
+}
+
 TEST_F(ConstantFoldingTest, MaterializeReductionIndices) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output input =
@@ -1677,14 +1925,14 @@ TEST_F(ConstantFoldingTest, MaterializeReductionIndices) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   item.fetch.push_back("reshape");
 
-  ConstantFolding fold(nullptr /* cpu_device */);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   // Run a second time to make sure the optimization is idempotent.
   item.graph.Swap(&output);
-  status = fold.Optimize(nullptr, item, &output);
+  status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   int found = 0;
@@ -1717,9 +1965,9 @@ TEST_F(ConstantFoldingTest, LargeConstant) {
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
   item.fetch.push_back("out");
 
-  ConstantFolding fold(nullptr /* cpu_device */);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   // Make sure the diag node hasn't been folded, since it would use too much
@@ -1741,6 +1989,12 @@ TEST_F(ConstantFoldingTest, LargeConstant) {
   EXPECT_EQ(2, found);
 
   EXPECT_GT(1024 * 1024, output.ByteSizeLong());
+
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+  EXPECT_EQ(1, tensors_expected.size());
+  auto tensors = EvaluateNodes(output, item.fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(ConstantFoldingTest, SwitchIdenticalInputs) {
@@ -1756,9 +2010,9 @@ TEST_F(ConstantFoldingTest, SwitchIdenticalInputs) {
   item.fetch.push_back("id_true");
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
-  ConstantFolding fold(nullptr /* cpu_device */);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   EXPECT_EQ(6, output.node_size());
@@ -1880,7 +2134,7 @@ TEST_F(ConstantFoldingTest, PartialFolding_AssociativeAndCommutative) {
         EXPECT_EQ("ConstantFolding/acc6_partial_split_2", node.input(1));
         EXPECT_EQ("y", node.input(2));
       }
-      if (StringPiece(node.name()).starts_with("ConstantFolding/")) {
+      if (str_util::StartsWith(node.name(), "ConstantFolding/")) {
         EXPECT_EQ("Const", node.op());
       }
     }
@@ -1921,6 +2175,8 @@ TEST_F(ConstantFoldingTest, PartialFolding_Concat) {
   item.fetch = {"concat0", "concat1", "concat2", "concat3", "concat4",
                 "concat5", "concat6", "concat7", "concat8", "concat9"};
 
+  auto tensors_expected = EvaluateNodes(item.graph, {"concat0"});
+  EXPECT_EQ(1, tensors_expected.size());
   ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
@@ -1963,16 +2219,14 @@ TEST_F(ConstantFoldingTest, PartialFolding_Concat) {
       EXPECT_EQ("x", node.input(1));
       EXPECT_EQ("y", node.input(2));
       EXPECT_EQ("axis", node.input(3));
-    } else if (StringPiece(node.name()).starts_with("ConstantFolding/")) {
+    } else if (str_util::StartsWith(node.name(), "ConstantFolding/")) {
       EXPECT_EQ("Const", node.op());
     } else {
       EXPECT_EQ(item.graph.node(i).DebugString(), node.DebugString());
     }
   }
 
-  auto tensors_expected = EvaluateNodes(item.graph, {"concat0"});
   auto tensors = EvaluateNodes(output, {"concat0"});
-  EXPECT_EQ(1, tensors_expected.size());
   EXPECT_EQ(1, tensors.size());
   test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
@@ -1984,8 +2238,8 @@ TEST_F(ConstantFoldingTest, PartialFolding_IdentityN) {
   Output c1 = ops::Const(scope.WithOpName("c1"), 1.0f, {2, 2});
   Output c2 = ops::Const(scope.WithOpName("c2"), 2.0f, {2, 2});
   auto id_n = ops::IdentityN(scope.WithOpName("id_n"), {c1, x, c2});
-  auto id0 = ops::Identity(scope.WithOpName("id0"), id_n[1]);
-  auto id1 = ops::Identity(scope.WithOpName("id1"), id_n[0]);
+  auto id0 = ops::Identity(scope.WithOpName("id0"), id_n[0]);
+  auto id1 = ops::Identity(scope.WithOpName("id1"), id_n[1]);
   auto add0 = ops::Add(scope.WithOpName("add0"), id_n[0], id_n[1]);
   auto add1 = ops::Add(scope.WithOpName("add1"), id_n[0], id_n[2]);
 
@@ -1996,38 +2250,44 @@ TEST_F(ConstantFoldingTest, PartialFolding_IdentityN) {
   item.fetch.push_back("add0");
   item.fetch.push_back("add1");
 
-  ConstantFolding fold(nullptr /* cpu_device */);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
-
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  LOG(INFO) << output.DebugString();
   TF_EXPECT_OK(status);
   EXPECT_EQ(8, output.node_size());
-  // id_n should remain unchanged.
-  EXPECT_EQ("id_n", output.node(3).name());
-  EXPECT_EQ(3, output.node(3).input_size());
-  EXPECT_EQ("c1", output.node(3).input(0));
-  EXPECT_EQ("x", output.node(3).input(1));
-  EXPECT_EQ("c2", output.node(3).input(2));
-  // id0 is unchanged.
-  EXPECT_EQ("id0", output.node(4).name());
-  EXPECT_EQ(1, output.node(4).input_size());
-  // id1 should have the constant input forwarded to it,
-  // and a control dependency from id_n.
-  EXPECT_EQ("id1", output.node(5).name());
-  EXPECT_EQ(2, output.node(5).input_size());
-  EXPECT_EQ("c1", output.node(5).input(0));
-  EXPECT_EQ("^id_n", output.node(5).input(1));
-
-  EXPECT_EQ("add0", output.node(6).name());
-  EXPECT_EQ(2, output.node(6).input_size());
-  EXPECT_EQ("c1", output.node(6).input(0));
-  EXPECT_EQ("id_n:1", output.node(6).input(1));
-
-  EXPECT_EQ("add1", output.node(7).name());
-  EXPECT_EQ(3, output.node(7).input_size());
-  EXPECT_EQ("c1", output.node(7).input(0));
-  EXPECT_EQ("c2", output.node(7).input(1));
-  EXPECT_EQ("^id_n", output.node(7).input(2));
+  for (const auto& node : output.node()) {
+    // id_n should remain unchanged.
+    if (node.name() == "id_n") {
+      EXPECT_EQ(3, node.input_size());
+      EXPECT_EQ("c1", node.input(0));
+      EXPECT_EQ("x", node.input(1));
+      EXPECT_EQ("c2", node.input(2));
+    }
+    // id0 should be constant folded, and a control dependency from id_n.
+    if (node.name() == "id0") {
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("^id_n", node.input(0));
+    }
+    // id1 is unchanged.
+    if ("id1" == node.name()) {
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("id_n:1", node.input(0));
+    }
+
+    if ("add0" == node.name()) {
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("c1", node.input(0));
+      EXPECT_EQ("id_n:1", node.input(1));
+    }
+    // add1 should bo constant folded and have a control dependency from id_n.
+    if ("add1" == node.name()) {
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("^id_n", node.input(0));
+    }
+  }
 }
 
 TEST_F(ConstantFoldingTest, TrivialPack) {
@@ -2043,11 +2303,10 @@ TEST_F(ConstantFoldingTest, TrivialPack) {
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
   item.fetch.push_back("stack");
 
-  ConstantFolding fold(nullptr /* cpu_device */);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
-  LOG(INFO) << output.DebugString();
   EXPECT_EQ(5, output.node_size());
   for (const auto& node : output.node()) {
     if (node.name() == "stack") {
@@ -2072,6 +2331,77 @@ TEST_F(ConstantFoldingTest, TrivialPack) {
   EXPECT_EQ(tensors_expected[0].shape(), tensors[0].shape());
 }
 
+TEST_F(ConstantFoldingTest, Enter) {
+  GrapplerItem item;
+  AttrValue frame_name;
+  frame_name.set_s("foo");
+  AttrValue is_constant_true;
+  is_constant_true.set_b(true);
+  AttrValue is_constant_false;
+  is_constant_false.set_b(false);
+  AttrValue type;
+  type.set_type(DT_FLOAT);
+  AttrValue value;
+  Tensor value_tensor(DT_FLOAT, TensorShape({}));
+  value_tensor.flat<float>()(0) = 1;
+  value_tensor.AsProtoTensorContent(value.mutable_tensor());
+
+  GraphDef& graph = item.graph;
+  AddNode("x", "Placeholder", {}, {{"T", type}}, &graph);
+  AddNode("c1", "Const", {"^x"}, {{"value", value}, {"dtype", type}}, &graph);
+  AddNode("enter1", "Enter", {"x"},
+          {{"T", type},
+           {"frame_name", frame_name},
+           {"is_constant", is_constant_true}},
+          &graph);
+  AddNode("enter2", "Enter", {"c1"},
+          {{"T", type},
+           {"frame_name", frame_name},
+           {"is_constant", is_constant_true}},
+          &graph);
+  AddNode("enter3", "Enter", {"c1"},
+          {{"T", type},
+           {"frame_name", frame_name},
+           {"is_constant", is_constant_false}},
+          &graph);
+  AddNode("id1", "Identity", {"enter1"}, {{"T", type}}, &graph);
+  AddNode("id2", "Identity", {"enter2"}, {{"T", type}}, &graph);
+  AddNode("id3", "Identity", {"enter2"}, {{"T", type}}, &graph);
+  AddNode("id4", "Identity", {"enter3"}, {{"T", type}}, &graph);
+  item.fetch.push_back("id1");
+  item.fetch.push_back("id2");
+  item.fetch.push_back("id3");
+  item.fetch.push_back("id4");
+
+  ConstantFolding optimizer(nullptr /* cpu_device */);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+  // Run the optimizer twice to make sure the rewrite is idempotent.
+  item.graph.Swap(&output);
+  status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(9, output.node_size());
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "id1") {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("enter1", node.input(0));
+    }
+    if (node.name() == "id2" || node.name() == "id3") {
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("^enter2", node.input(0));
+    }
+    if (node.name() == "id4") {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("enter3", node.input(0));
+    }
+  }
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/debug_stripper.cc b/tensorflow/core/grappler/optimizers/debug_stripper.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8bd10171f15f80d3bb85d63c8f62067992c4f37e
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/debug_stripper.cc
@@ -0,0 +1,58 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/debug_stripper.h"
+
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/clusters/cluster.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/op_types.h"
+#include "tensorflow/core/grappler/utils.h"
+
+namespace tensorflow {
+namespace grappler {
+
+Status DebugStripper::Optimize(Cluster* cluster, const GrapplerItem& item,
+                               GraphDef* output) {
+  *output = item.graph;
+  for (NodeDef& node : *output->mutable_node()) {
+    if (IsAssert(node)) {
+      // Convert this node into a no-op.
+      node.set_op("NoOp");
+      node.clear_attr();
+      // Convert all its inputs into control dependency, which will then
+      // be optimized away by dependency optimizer.
+      for (string& inp : *node.mutable_input()) {
+        if (!IsControlInput(inp)) {
+          inp = AsControlDependency(inp);
+        }
+      }
+    } else if (IsCheckNumerics(node)) {
+      // Replace with Identity op which will be pruned later.
+      node.set_op("Identity");
+      node.mutable_attr()->erase("message");
+    }
+  }
+  return Status::OK();
+}
+
+void DebugStripper::Feedback(Cluster* cluster, const GrapplerItem& item,
+                             const GraphDef& optimize_output, double result) {
+  // Takes no feedback.
+}
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/debug_stripper.h b/tensorflow/core/grappler/optimizers/debug_stripper.h
new file mode 100644
index 0000000000000000000000000000000000000000..1fe25aa1c38c1c33c235b78c1a3763631d4c74d4
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/debug_stripper.h
@@ -0,0 +1,43 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DEBUG_STRIPPER_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DEBUG_STRIPPER_H_
+
+#include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// DebugStripper strips off debug-related nodes (e.g.
+// Assert, CheckNumerics, Print) from the graph.
+class DebugStripper : public GraphOptimizer {
+ public:
+  DebugStripper() {}
+  ~DebugStripper() override {}
+
+  string name() const override { return "debug_stripper"; };
+
+  Status Optimize(Cluster* cluster, const GrapplerItem& item,
+                  GraphDef* output) override;
+
+  void Feedback(Cluster* cluster, const GrapplerItem& item,
+                const GraphDef& optimize_output, double result) override;
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DEBUG_STRIPPER_H_
diff --git a/tensorflow/core/grappler/optimizers/debug_stripper_test.cc b/tensorflow/core/grappler/optimizers/debug_stripper_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3f11febc64dbd55aaaebcdf8d1763517a966264b
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/debug_stripper_test.cc
@@ -0,0 +1,169 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/debug_stripper.h"
+
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/utils/grappler_test.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+class DebugStripperTest : public GrapplerTest {};
+
+TEST_F(DebugStripperTest, OutputEqualToInput) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x = ops::Placeholder(s, DT_FLOAT, ops::Placeholder::Shape({}));
+  Output y = ops::Placeholder(s, DT_FLOAT, ops::Placeholder::Shape({}));
+  Output add = ops::Add(s, x, y);
+  Output result = ops::Identity(s, add);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  DebugStripper optimizer;
+  GraphDef output;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+  CompareGraphs(item.graph, output);
+}
+
+TEST_F(DebugStripperTest, StripAssertFromGraph) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x = ops::Placeholder(s.WithOpName("x"), DT_FLOAT,
+                              ops::Placeholder::Shape({}));
+  Output y = ops::Placeholder(s.WithOpName("y"), DT_FLOAT,
+                              ops::Placeholder::Shape({}));
+  auto greaterequal = ops::GreaterEqual(s.WithOpName("GreaterEqual"), x, y);
+  auto assert = ops::Assert(s.WithOpName("Assert"), greaterequal, {x, y});
+  Output add = ops::Add(
+      s.WithOpName("z").WithControlDependencies({assert.operation}), x, y);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  DebugStripper optimizer;
+  GraphDef output;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  int count = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "x") {
+      count++;
+      EXPECT_EQ("Placeholder", node.op());
+      EXPECT_EQ(0, node.input_size());
+    } else if (node.name() == "y") {
+      count++;
+      EXPECT_EQ("Placeholder", node.op());
+      EXPECT_EQ(0, node.input_size());
+    } else if (node.name() == "GreaterEqual") {
+      count++;
+      EXPECT_EQ("GreaterEqual", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("y", node.input(1));
+    } else if (node.name() == "Assert") {
+      count++;
+      EXPECT_EQ("NoOp", node.op());
+      EXPECT_EQ(3, node.input_size());
+      EXPECT_EQ("^GreaterEqual", node.input(0));
+      EXPECT_EQ("^x", node.input(1));
+      EXPECT_EQ("^y", node.input(2));
+    } else if (node.name() == "z") {
+      count++;
+      EXPECT_EQ("Add", node.op());
+      EXPECT_EQ(3, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("y", node.input(1));
+      EXPECT_EQ("^Assert", node.input(2));
+    }
+  }
+  EXPECT_EQ(5, count);
+
+  Tensor x_t(DT_FLOAT, TensorShape({}));
+  Tensor y_t(DT_FLOAT, TensorShape({}));
+  x_t.flat<float>()(0) = 1.0f;
+  y_t.flat<float>()(0) = 0.5f;
+  std::vector<Tensor> expected =
+      EvaluateNodes(item.graph, {"z"}, {{"x", x_t}, {"y", y_t}});
+  std::vector<Tensor> optimized =
+      EvaluateNodes(output, {"z"}, {{"x", x_t}, {"y", y_t}});
+  test::ExpectTensorEqual<float>(expected[0], optimized[0]);
+}
+
+TEST_F(DebugStripperTest, StripCheckNumericsFromGraph) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x = ops::Placeholder(s.WithOpName("x"), DT_FLOAT,
+                              ops::Placeholder::Shape({}));
+  Output y = ops::Placeholder(s.WithOpName("y"), DT_FLOAT,
+                              ops::Placeholder::Shape({}));
+  auto check1 = ops::CheckNumerics(s.WithOpName("CheckNumerics1"), x, "foo");
+  auto check2 = ops::CheckNumerics(s.WithOpName("CheckNumerics2"), y, "foo");
+  Output add = ops::Add(s.WithOpName("z"), check1, check2);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  DebugStripper optimizer;
+  GraphDef output;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  int count = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "x") {
+      count++;
+      EXPECT_EQ("Placeholder", node.op());
+      EXPECT_EQ(0, node.input_size());
+    } else if (node.name() == "y") {
+      count++;
+      EXPECT_EQ("Placeholder", node.op());
+      EXPECT_EQ(0, node.input_size());
+    } else if (node.name() == "CheckNumerics1") {
+      count++;
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ(1, node.attr_size());
+    } else if (node.name() == "CheckNumerics2") {
+      count++;
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("y", node.input(0));
+      EXPECT_EQ(1, node.attr_size());
+    } else if (node.name() == "z") {
+      count++;
+      EXPECT_EQ("Add", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("CheckNumerics1", node.input(0));
+      EXPECT_EQ("CheckNumerics2", node.input(1));
+    }
+  }
+  EXPECT_EQ(5, count);
+
+  Tensor x_t(DT_FLOAT, TensorShape({}));
+  Tensor y_t(DT_FLOAT, TensorShape({}));
+  x_t.flat<float>()(0) = 1.0f;
+  y_t.flat<float>()(0) = 0.5f;
+  std::vector<Tensor> expected =
+      EvaluateNodes(item.graph, {"z"}, {{"x", x_t}, {"y", y_t}});
+  std::vector<Tensor> optimized =
+      EvaluateNodes(output, {"z"}, {{"x", x_t}, {"y", y_t}});
+  test::ExpectTensorEqual<float>(expected[0], optimized[0]);
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
index 63bc19630de98a651e193558de8e99e11051e37c..ed9bce439c6d6a5c09e3af53718fa49e191549ab 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/util/device_name_utils.h"
 
@@ -87,7 +88,7 @@ bool DependencyOptimizer::SafeToRemoveIdentity(const NodeDef& node) {
     // Don't turn Identity nodes following Switch into NoOp or remove them
     // if it requires anchoring a control dependencies the Switch node, which
     // is not valid.
-    if (StringPiece(node.name()).starts_with(kConstantFoldingCtrl)) {
+    if (str_util::StartsWith(node.name(), kConstantFoldingCtrl)) {
       // TODO(rmlarsen): Try to remove this artificial contraint.
       return false;
     }
@@ -298,6 +299,15 @@ void DependencyOptimizer::OptimizeNode(int node_idx,
       input_nodes.push_back(input_node);
     }
 
+    // Make sure that we don't increase the number of edges that cross
+    // device boundaries.
+    if ((num_inputs == 1 && num_outputs > 1 &&
+         input_nodes[0]->device() != node->device()) ||
+        (num_inputs > 1 && num_outputs == 1 &&
+         output_nodes[0]->device() != node->device())) {
+      return;
+    }
+
     // TODO(rmlarsen): Not all device crossings are equally expensive.
     // Assign a cost to each based on device affinity and compute a
     // cost before and after.
@@ -316,6 +326,8 @@ void DependencyOptimizer::OptimizeNode(int node_idx,
       // unless they only have consumers on the same device as themselves.
       return;
     }
+
+    // Make sure we do not increase the number of device crossings.
     const int num_cross_before = num_cross_in + num_cross_out;
     int num_cross_after = 0;
     for (NodeDef* input_node : input_nodes) {
@@ -325,7 +337,6 @@ void DependencyOptimizer::OptimizeNode(int node_idx,
       }
     }
     if (num_cross_after > num_cross_before) {
-      // Avoid increasing the number of device crossings.
       return;
     }
 
@@ -518,10 +529,6 @@ Status DependencyOptimizer::TransitiveReduction() {
       if (longest_distance[target] > 1) {
         const int input_slot = control_output.second;
         control_edges_to_remove[target].emplace(input_slot, source);
-        //        VLOG(1) << "Removing edge from:\n"
-        //                << optimized_graph_->node(source).DebugString() <<
-        //                "\n\nto:\n\n"
-        //                << optimized_graph_->node(target).DebugString();
       }
     }
   }
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.h b/tensorflow/core/grappler/optimizers/dependency_optimizer.h
index 61ed15479370614bc79c15b450039f0cbf30908d..b4db98125aa740b5d261e8f9ad0ea5bfd8102877 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.h
@@ -29,9 +29,8 @@ namespace grappler {
 // optimizations, such as removing nodes that are effectively noops.
 class DependencyOptimizer : public GraphOptimizer {
  public:
-  DependencyOptimizer() : opt_level_(RewriterConfig::ON) {}
-  explicit DependencyOptimizer(RewriterConfig::Toggle opt_level)
-      : opt_level_(opt_level) {}
+  DependencyOptimizer() {}
+  explicit DependencyOptimizer(RewriterConfig::Toggle opt_level) {}
   ~DependencyOptimizer() override {}
 
   string name() const override { return "dependency_optimizer"; };
@@ -63,7 +62,6 @@ class DependencyOptimizer : public GraphOptimizer {
   // Main driver of dependency optimizations.
   Status OptimizeDependencies();
 
-  RewriterConfig::Toggle opt_level_;
   bool fetch_nodes_known_;
   std::unordered_set<string> nodes_to_preserve_;
   std::unique_ptr<NodeMap> node_map_;
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
index cc1e142041c36fb645267c13b306d86639b2541e..6a297da52d075ea9bdae4584b7646ee44b950012 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
@@ -285,6 +285,38 @@ TEST_F(DependencyOptimizerTest, RemoveNoOps_DeviceBoundaries) {
   VerifyGraphsEqual(item.graph, output, __FUNCTION__);
 }
 
+TEST_F(DependencyOptimizerTest, RemoveIdentityOps_DeviceBoundaries) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x = ops::RandomUniform(s.WithOpName("x").WithDevice("/CPU:0"), {1, 2},
+                                DT_FLOAT);
+  Output y = ops::RandomUniform(s.WithOpName("y").WithDevice("/CPU:0"), {1, 2},
+                                DT_FLOAT);
+  // Identity with a single input- and two output dependencies.
+  auto id_a = ops::Identity(s.WithOpName("id_a").WithDevice("/CPU:1"), x);
+  // Identity with a two input- and a single output dependency.
+  auto id_b = ops::Identity(
+      s.WithOpName("id_b").WithControlDependencies(y).WithDevice("/CPU:0"), x);
+
+  Output id =
+      ops::Identity(s.WithControlDependencies(id_a).WithDevice("/CPU:1"), id_b);
+  Output id_1 = ops::Identity(s.WithDevice("/CPU:1"), id_a);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch.push_back("Identity");
+  item.fetch.push_back("Identity_1");
+
+  DependencyOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  // The optimization should be disabled to prevent increasing the number of
+  // nodes crossing device boundaries.
+  TF_CHECK_OK(TopologicalSort(&item.graph));
+  VerifyGraphsEqual(item.graph, output, __FUNCTION__);
+}
+
 TEST_F(DependencyOptimizerTest, RemoveNoOps_SingleInputOrOutput) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output x = ops::RandomUniform(s.WithOpName("x"), {1, 2}, DT_FLOAT);
@@ -646,6 +678,50 @@ TEST_F(DependencyOptimizerTest, Identity_DeviceCrossing_ConsumerOnSameDevice) {
   }
 }
 
+TEST_F(DependencyOptimizerTest, RemoveGreaterEqualWithNoOp) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x = ops::Placeholder(s.WithOpName("x"), DT_FLOAT,
+                              ops::Placeholder::Shape({}));
+  Output y = ops::Placeholder(s.WithOpName("y"), DT_FLOAT,
+                              ops::Placeholder::Shape({}));
+  auto greaterequal = ops::GreaterEqual(s.WithOpName("GreaterEqual"), x, y);
+  auto noop =
+      ops::NoOp(s.WithOpName("NoOp").WithControlDependencies(greaterequal));
+  Output add = ops::Add(
+      s.WithOpName("z").WithControlDependencies({noop.operation}), x, y);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  DependencyOptimizer optimizer;
+  GraphDef output;
+  item.fetch.push_back("z");
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  int count = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "x") {
+      count++;
+      EXPECT_EQ("Placeholder", node.op());
+      EXPECT_EQ(0, node.input_size());
+    } else if (node.name() == "y") {
+      count++;
+      EXPECT_EQ("Placeholder", node.op());
+      EXPECT_EQ(0, node.input_size());
+    } else if (node.name() == "GreaterEqual") {
+      count++;
+    } else if (node.name() == "NoOp") {
+      count++;
+    } else if (node.name() == "z") {
+      count++;
+      EXPECT_EQ("Add", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("y", node.input(1));
+    }
+  }
+  EXPECT_EQ(3, count);
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc
index 87160f6b83871cde332d643f81d5886be141fa87..f1da469a6c990c8363e387a4bf615e76a742961a 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc
@@ -15,10 +15,16 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/function_optimizer.h"
 #include <unordered_map>
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/process_function_library_runtime.h"
+#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/function.pb.h"
+#include "tensorflow/core/framework/graph_def_util.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/framework/versions.pb.h"
+#include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
@@ -26,16 +32,129 @@ limitations under the License.
 
 namespace tensorflow {
 namespace grappler {
+namespace {
+
+class FunctionInliningContext {
+ public:
+  explicit FunctionInliningContext(const GrapplerItem& item)
+      : library_(&item.graph.library()), functions_(InliningCandidates(item)) {}
+
+  const FunctionDefLibrary& Library() const { return *library_; }
+
+  bool HasInlinedFunctions() const { return !functions_.empty(); }
+
+  // Find inlining candidate by name. Return nullptr if not found.
+  const FunctionDef* FindInlinedFunction(const string& name) const {
+    auto it = functions_.find(name);
+    if (it != functions_.end()) {
+      return it->second;
+    } else {
+      return nullptr;
+    }
+  }
+
+ private:
+  std::unordered_map<string, const FunctionDef*> InliningCandidates(
+      const GrapplerItem& item) const {
+    std::unordered_map<string, const FunctionDef*> functions;
+    for (const FunctionDef& func : item.graph.library().function()) {
+      // Don't inline functions marked as noinline
+      if (func.attr().count("_noinline") != 0) {
+        continue;
+      }
+      // Don't touch anything marked XLA to prevent XLA failures further down
+      // the road.
+      if (func.attr().count("_XlaCompile") > 0 &&
+          func.attr().at("_XlaCompile").b()) {
+        continue;
+      }
+      // Can't create IdentityN nodes with no input or output: skip these
+      // functions for now.
+      if (func.signature().input_arg_size() == 0 ||
+          func.signature().output_arg_size() == 0) {
+        continue;
+      }
+      functions[func.signature().name()] = &func;
+    }
+    return functions;
+  }
+
+  const FunctionDefLibrary* library_;
+  std::unordered_map<string, const FunctionDef*> functions_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(FunctionInliningContext);
+};
+
+// Copy input/output argument type to the type_list. Return error if argument
+// type is not explicitly defined, and not specified in function attributes.
+Status CopyArgType(const NodeDef& func_node,
+                   const std::unordered_map<string, AttrValue>& func_attr,
+                   const string& arg_kind, const OpDef::ArgDef& arg,
+                   AttrValue::ListValue* type_list) {
+  if (arg.type() != DT_INVALID) {
+    type_list->add_type(arg.type());
+  } else {
+    auto it = func_attr.find(arg.type_attr());
+    if (it == func_attr.end() || it->second.type() == DT_INVALID) {
+      return errors::InvalidArgument(
+          "Invalid ", arg_kind, " argument ", arg.name(), " for function ",
+          func_node.op(), " instantiated by ", func_node.name());
+    }
+    type_list->add_type(it->second.type());
+  }
+  return Status::OK();
+}
+
+// Add an IdentityN op to hook the function inputs to: this ensures that
+// they're all evaluated before the evaluation of the function body starts.
+Status HookInlinedFunctionInputs(
+    const NodeDef& func_node, const FunctionDef& func,
+    const std::unordered_map<string, AttrValue>& func_attr, NodeDef* inputs) {
+  inputs->set_name(strings::StrCat(func_node.name(), "/", "inlined_inputs"));
+  inputs->set_op("IdentityN");
+  inputs->set_device(func_node.device());
+  *inputs->mutable_input() = func_node.input();
+  AttrValue::ListValue* type_list =
+      (*inputs->mutable_attr())["T"].mutable_list();
+  for (const OpDef::ArgDef& arg : func.signature().input_arg()) {
+    TF_RETURN_IF_ERROR(
+        CopyArgType(func_node, func_attr, "input", arg, type_list));
+  }
+  return Status::OK();
+}
+
+// Add an IdentityN op to hook the function outputs to: this ensures that the
+// function body is fully evaluated before its fanout gets scheduled.
+Status HookInlinedFunctionOutputs(
+    const NodeDef& func_node, const FunctionDef& func,
+    const std::unordered_map<string, AttrValue>& func_attr,
+    const gtl::ArraySlice<string> fetch, NodeDef* outputs) {
+  outputs->set_name(func_node.name());
+  outputs->set_op("IdentityN");
+  outputs->set_device(func_node.device());
+  AttrValue::ListValue* type_list =
+      (*outputs->mutable_attr())["T"].mutable_list();
+  for (int i = 0; i < func.signature().output_arg_size(); ++i) {
+    const OpDef::ArgDef& arg = func.signature().output_arg(i);
+    TF_RETURN_IF_ERROR(
+        CopyArgType(func_node, func_attr, "output", arg, type_list));
+    // Use the fetch names since they take into account the output mapping.
+    outputs->add_input(strings::StrCat(func_node.name(), "/", fetch[i]));
+  }
+  return Status::OK();
+}
+
+Status InlineFunction(const NodeDef& func_node, const FunctionDef& func,
+                      const FunctionInliningContext& ctx,
+                      GraphDef* optimized_graph) {
+  const std::unordered_map<string, AttrValue> func_attr(
+      func_node.attr().begin(), func_node.attr().end());
 
-Status InlineFunction(const NodeDef& node, const FunctionDef& func,
-                      const FunctionDefLibrary& library, GraphDef* graph) {
-  const std::unordered_map<string, AttrValue> attr(node.attr().begin(),
-                                                   node.attr().end());
   std::unique_ptr<GrapplerItem> item =
-      GrapplerItemFromFunctionDef(func, attr, library);
+      GrapplerItemFromFunctionDef(func, func_attr, ctx.Library());
   if (!item) {
-    return errors::InvalidArgument("Failed to inline function ", node.op(),
-                                   " instantiated by ", node.name());
+    return errors::InvalidArgument("Failed to inline function ", func_node.op(),
+                                   " instantiated by ", func_node.name());
   }
 
   std::unordered_map<string, int> input_nodes;
@@ -44,43 +163,25 @@ Status InlineFunction(const NodeDef& node, const FunctionDef& func,
     input_nodes[arg.name()] = i;
   }
 
-  // Add an IdentityN op to hook the function inputs to: this ensures that
-  // they're all evaluated before the evaluation of the function body starts.
-  NodeDef* func_inputs = graph->add_node();
-  func_inputs->set_name(strings::StrCat(node.name(), "/", "inlined_inputs"));
-  func_inputs->set_op("IdentityN");
-  func_inputs->set_device(node.device());
-  *func_inputs->mutable_input() = node.input();
-  AttrValue::ListValue* type_list =
-      (*func_inputs->mutable_attr())["T"].mutable_list();
-  for (const OpDef::ArgDef& arg : func.signature().input_arg()) {
-    if (arg.type() != DT_INVALID) {
-      type_list->add_type(arg.type());
-    } else {
-      auto it = attr.find(arg.type_attr());
-      if (it == attr.end()) {
-        return errors::InvalidArgument("Invalid input argument ", arg.name(),
-                                       " for function ", node.op(),
-                                       " instantiated by ", node.name());
-      }
-      type_list->add_type(it->second.type());
-    }
-  }
+  // Hook inlined function inputs to IdentityN node
+  NodeDef* func_inputs = optimized_graph->add_node();
+  TF_RETURN_IF_ERROR(
+      HookInlinedFunctionInputs(func_node, func, func_attr, func_inputs));
 
   for (NodeDef& func_body_node : *item->graph.mutable_node()) {
     if (input_nodes.find(func_body_node.name()) != input_nodes.end()) {
+      CHECK_EQ(0, func_body_node.input_size());
       // Turn input placeholders into identity nodes
       if (IsPlaceholder(func_body_node)) {
         func_body_node.set_op("Identity");
       }
-      CHECK_EQ(0, func_body_node.input_size());
       int input_id = input_nodes[func_body_node.name()];
       func_body_node.add_input(
           strings::StrCat(func_inputs->name(), ":", input_id));
     } else {
       // Update the input names if any.
       for (string& input : *func_body_node.mutable_input()) {
-        input = AddPrefixToNodeName(input, node.name());
+        input = AddPrefixToNodeName(input, /*prefix=*/func_node.name());
       }
       // If the node has no input, make hook it up to the func_inputs node to
       // ensure it runs in the same frame as the other nodes of the function
@@ -92,78 +193,204 @@ Status InlineFunction(const NodeDef& node, const FunctionDef& func,
 
     // Add the node name as a prefix to avoid collisions after inlining
     func_body_node.set_name(
-        strings::StrCat(node.name(), "/", func_body_node.name()));
+        strings::StrCat(func_node.name(), "/", func_body_node.name()));
 
     // Make sure the node is placed
-    func_body_node.set_device(node.device());
+    func_body_node.set_device(func_node.device());
 
-    // Move the node to the main graph
-    graph->add_node()->Swap(&func_body_node);
+    // Check if a body node is itself a function
+    const FunctionDef* func_body_node_func =
+        ctx.FindInlinedFunction(func_body_node.op());
+    if (func_body_node_func != nullptr) {
+      // Recursively inline function calls
+      TF_RETURN_IF_ERROR(InlineFunction(func_body_node, *func_body_node_func,
+                                        ctx, optimized_graph));
+    } else {
+      // Move the node to the main graph
+      optimized_graph->add_node()->Swap(&func_body_node);
+    }
   }
 
-  // Add an IdentityN op to hook the function outputs to: this ensures that the
-  // function body is fully evaluated before its fanout gets scheduled.
-  NodeDef* func_outputs = graph->add_node();
-  func_outputs->set_name(node.name());
-  func_outputs->set_op("IdentityN");
-  func_outputs->set_device(node.device());
-  type_list = (*func_outputs->mutable_attr())["T"].mutable_list();
-  for (int i = 0; i < func.signature().output_arg_size(); ++i) {
-    const OpDef::ArgDef& arg = func.signature().output_arg(i);
-    if (arg.type() != DT_INVALID) {
-      type_list->add_type(arg.type());
+  // Hook inlined function outputs to IdentityN node
+  NodeDef* func_outputs = optimized_graph->add_node();
+  TF_RETURN_IF_ERROR(HookInlinedFunctionOutputs(func_node, func, func_attr,
+                                                item->fetch, func_outputs));
+
+  return Status::OK();
+}
+
+class FakeCPUDevice : public Device {
+ public:
+  FakeCPUDevice(Env* env, const DeviceAttributes& attr) : Device(env, attr) {}
+  Status Sync() override { return Status::OK(); }
+};
+
+class SymbolicGradientEnv {
+ public:
+  SymbolicGradientEnv(int graph_version, const FunctionDefLibrary& library)
+      : graph_version_(graph_version), library_(library) {}
+
+  FunctionLibraryDefinition* function_library() {
+    InitializeIfNeeded();
+    return fld_.get();
+  }
+  FunctionLibraryRuntime* function_library_runtime() {
+    InitializeIfNeeded();
+    return flr_;
+  }
+
+ private:
+  // This initialization is expensive. Do it lazily to avoid paying for it
+  // unless it's needed.
+  void InitializeIfNeeded() {
+    if (flr_) {
+      return;
+    }
+    Env* env = Env::Default();
+    DeviceAttributes attr;
+    attr.set_name("/device:CPU:0");
+    attr.set_device_type("CPU");
+    FakeCPUDevice* dev = new FakeCPUDevice(env, attr);
+    std::vector<Device*> devices;
+    devices.push_back(dev);
+    dvc_mgr_.reset(new DeviceMgr(devices));
+    fld_.reset(new FunctionLibraryDefinition(OpRegistry::Global(), library_));
+    OptimizerOptions optimizer_opts;
+    optimizer_opts.set_do_function_inlining(true);
+    pflr_.reset(new ProcessFunctionLibraryRuntime(
+        dvc_mgr_.get(), env, graph_version_, fld_.get(), optimizer_opts));
+    flr_ = pflr_->GetFLR(dev->name());
+  }
+
+  const int graph_version_;
+  const FunctionDefLibrary& library_;
+  std::unique_ptr<DeviceMgr> dvc_mgr_;
+  std::unique_ptr<FunctionLibraryDefinition> fld_;
+  std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_;
+  FunctionLibraryRuntime* flr_ = nullptr;
+};
+
+Status InlineSymbolicGradient(const NodeDef& node, SymbolicGradientEnv* env,
+                              GraphDef* inlined_graph) {
+  GraphDef graph_def;
+
+  // Create a node to anchor the gradient inputs
+  NodeDef* inlined_input = graph_def.add_node();
+  inlined_input->set_name("FunctionInputs");
+  inlined_input->set_op("IdentityN");
+  AttrValue::ListValue* type_list =
+      (*inlined_input->mutable_attr())["T"].mutable_list();
+  for (const auto& type : node.attr().at("Tin").list().type()) {
+    type_list->add_type(static_cast<DataType>(type));
+  }
+
+  // Add the gradient node
+  NodeDef* inlined = graph_def.add_node();
+  *inlined = node;
+  inlined->clear_input();
+  for (int i = 0; i < node.attr().at("Tin").list().type_size(); ++i) {
+    inlined->add_input(strings::StrCat(inlined_input->name(), ":", i));
+  }
+
+  // Create a node to anchor the gradient outputs
+  NodeDef* inlined_output = graph_def.add_node();
+  inlined_output->set_name("FunctionOutputs");
+  inlined_output->set_op("IdentityN");
+  type_list = (*inlined_output->mutable_attr())["T"].mutable_list();
+  for (const auto& type : node.attr().at("Tout").list().type()) {
+    type_list->add_type(static_cast<DataType>(type));
+  }
+  for (int i = 0; i < node.attr().at("Tout").list().type_size(); ++i) {
+    inlined_output->add_input(strings::StrCat(inlined->name(), ":", i));
+  }
+
+  // Convert the graphdef to a graph
+  GraphConstructorOptions graph_ctor_opts;
+  graph_ctor_opts.allow_internal_ops = true;
+  graph_ctor_opts.expect_device_spec = false;
+  Graph graph(env->function_library());
+  TF_RETURN_IF_ERROR(
+      ConvertGraphDefToGraph(graph_ctor_opts, graph_def, &graph));
+
+  // Recursively inline the functions until there is nothing more to inline. We
+  // should at least expand one function.
+  int counter = 0;
+  while (counter < 50 &&
+         ExpandInlineFunctions(env->function_library_runtime(), &graph)) {
+    ++counter;
+  }
+
+  GraphDef inlined_graph_def;
+  graph.ToGraphDef(&inlined_graph_def);
+
+  // Add the default values of attributes to the nodes that have been inlined.
+  TF_RETURN_IF_ERROR(AddDefaultAttrsToGraphDef(&inlined_graph_def,
+                                               *graph.op_registry(), 0, true));
+
+  // Add the inlined nodes to the graph
+  for (NodeDef& inlined_node : *inlined_graph_def.mutable_node()) {
+    if (inlined_node.name() == "FunctionOutputs") {
+      inlined_node.set_name(node.name());
+      for (int i = 0; i < inlined_node.input_size(); ++i) {
+        inlined_node.set_input(
+            i, AddPrefixToNodeName(inlined_node.input(i), node.name()));
+      }
+    } else if (inlined_node.name() == "FunctionInputs") {
+      inlined_node.set_name(
+          AddPrefixToNodeName(inlined_node.name(), node.name()));
+      inlined_node.clear_input();
+      for (int i = 0; i < node.input_size(); ++i) {
+        inlined_node.add_input(node.input(i));
+      }
     } else {
-      auto it = attr.find(arg.type_attr());
-      if (it == attr.end()) {
-        return errors::InvalidArgument("Invalid output argument ", arg.name(),
-                                       " for function ", node.op(),
-                                       " instantiated by ", node.name());
+      inlined_node.set_name(
+          AddPrefixToNodeName(inlined_node.name(), node.name()));
+      for (int i = 0; i < inlined_node.input_size(); ++i) {
+        inlined_node.set_input(
+            i, AddPrefixToNodeName(inlined_node.input(i), node.name()));
+      }
+      // If the node has no input, hook it up to the function input node to make
+      // sure it runs in the same frame as the other nodes of the function body.
+      if (inlined_node.input_size() == 0) {
+        *inlined_node.add_input() = AsControlDependency(
+            AddPrefixToNodeName("FunctionInputs", node.name()));
       }
-      type_list->add_type(it->second.type());
     }
-    // Use the fetch names since they take into account the output mapping.
-    func_outputs->add_input(strings::StrCat(node.name(), "/", item->fetch[i]));
+    inlined_node.set_device(node.device());
+    inlined_graph->add_node()->Swap(&inlined_node);
   }
 
   return Status::OK();
 }
 
+}  // namespace
+
 Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
                                    GraphDef* optimized_graph) {
-  std::unordered_map<string, const FunctionDef*> functions;
-  for (const FunctionDef& func : item.graph.library().function()) {
-    // Don't inline functions marked as noinline
-    if (func.attr().count("_noinline") != 0) {
-      continue;
-    }
-    // Don't touch anything marked XLA to prevent XLA failures further down the
-    // road.
-    if (func.attr().count("_XlaCompile") != 0) {
-      continue;
-    }
-    // Can't create IdentityN nodes with no input or output: skip these
-    // functions for now.
-    if (func.signature().input_arg_size() == 0 ||
-        func.signature().output_arg_size() == 0) {
-      continue;
-    }
-    functions[func.signature().name()] = &func;
-  }
+  FunctionInliningContext function_inlining_ctx(item);
 
-  // Nothing to do.
-  if (functions.empty()) {
+  // Nothing to do here.
+  if (!function_inlining_ctx.HasInlinedFunctions()) {
     *optimized_graph = item.graph;
     return Status::OK();
   }
 
-  // Inline functions when possible.
+  SymbolicGradientEnv env(item.graph.versions().producer(),
+                          item.graph.library());
+
   for (const NodeDef& node : item.graph.node()) {
-    auto it = functions.find(node.op());
-    if (it == functions.end()) {
-      *optimized_graph->add_node() = node;
+    if (node.op() == "SymbolicGradient") {
+      TF_RETURN_IF_ERROR(InlineSymbolicGradient(node, &env, optimized_graph));
+      continue;
+    }
+
+    const FunctionDef* func =
+        function_inlining_ctx.FindInlinedFunction(node.op());
+    if (func != nullptr) {
+      TF_RETURN_IF_ERROR(
+          InlineFunction(node, *func, function_inlining_ctx, optimized_graph));
     } else {
-      TF_RETURN_IF_ERROR(InlineFunction(node, *it->second, item.graph.library(),
-                                        optimized_graph));
+      *optimized_graph->add_node() = node;
     }
   }
 
@@ -171,8 +398,8 @@ Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   // inlined based on the context in which they're instantiated.
 
   // TODO(bsteiner): trim the library to remove unused function definitions
-  *optimized_graph->mutable_library() = item.graph.library();
   *optimized_graph->mutable_versions() = item.graph.versions();
+  *optimized_graph->mutable_library() = item.graph.library();
 
   return Status::OK();
 }
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.h b/tensorflow/core/grappler/optimizers/function_optimizer.h
index 5c80226e9dbf57908f8942f31051761f743265b8..41444e467364f83e7627477a7651203100e47d8a 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_GRAPPLER_OPTIMIZERS_FUNCTION_OPTIMIZER_H_
 
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
+#include "tensorflow/core/protobuf/rewriter_config.pb.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -25,7 +26,7 @@ namespace grappler {
 // operations to make the overall graph more efficient.
 class FunctionOptimizer : public GraphOptimizer {
  public:
-  FunctionOptimizer() {}
+  FunctionOptimizer(RewriterConfig::Toggle opt_level) {}
   ~FunctionOptimizer() override {}
 
   string name() const override { return "function_optimizer"; };
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
index bafcdf4923004df7855138b2669e687c882d194d..c804d75756c6734addcbc6f7c6d836b1f1812e55 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
@@ -14,6 +14,8 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/optimizers/function_optimizer.h"
+#include "tensorflow/cc/ops/functional_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/grappler/grappler_item.h"
@@ -24,7 +26,22 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
-class FunctionOptimizerTest : public GrapplerTest {};
+constexpr char kDevice[] = "/device:CPU:0";
+
+class FunctionOptimizerTest : public GrapplerTest {
+ protected:
+  Tensor MakeScalarTensor(float value) {
+    Tensor tensor(DT_FLOAT, {});
+    tensor.scalar<float>()() = value;
+    return tensor;
+  }
+
+  Tensor MakeScalarTensor(int value) {
+    Tensor tensor(DT_INT32, {});
+    tensor.scalar<int>()() = value;
+    return tensor;
+  }
+};
 
 TEST_F(FunctionOptimizerTest, SimpleFunction) {
   // Build a graph to compute y = XTimesTwo(x)
@@ -40,7 +57,7 @@ TEST_F(FunctionOptimizerTest, SimpleFunction) {
           test::function::XTimesTwo(),
       });
 
-  FunctionOptimizer optimizer;
+  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -92,9 +109,8 @@ TEST_F(FunctionOptimizerTest, SimpleFunction) {
   }
   EXPECT_EQ(7, count);
 
+  Tensor pi = MakeScalarTensor(3.14f);
   item.fetch = {"z"};
-  Tensor pi(DT_FLOAT, {});
-  pi.flat<float>()(0) = 3.14f;
   item.feed.emplace_back("x", pi);
   auto tensors_expected = EvaluateFetchNodes(item);
   GrapplerItem optimized(item, std::move(output));
@@ -133,7 +149,7 @@ TEST_F(FunctionOptimizerTest, FixedTypeFunction) {
           x_times_two,
       });
 
-  FunctionOptimizer optimizer;
+  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -181,9 +197,8 @@ TEST_F(FunctionOptimizerTest, FixedTypeFunction) {
   }
   EXPECT_EQ(6, count);
 
+  Tensor pi = MakeScalarTensor(3.14f);
   item.fetch = {"z"};
-  Tensor pi(DT_FLOAT, {});
-  pi.flat<float>()(0) = 3.14f;
   item.feed.emplace_back("x", pi);
   auto tensors_expected = EvaluateFetchNodes(item);
   GrapplerItem optimized(item, std::move(output));
@@ -219,7 +234,7 @@ TEST_F(FunctionOptimizerTest, FunctionWithOutputMapping) {
           func,
       });
 
-  FunctionOptimizer optimizer;
+  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -266,9 +281,8 @@ TEST_F(FunctionOptimizerTest, FunctionWithOutputMapping) {
   }
   EXPECT_EQ(6, count);
 
+  Tensor pi = MakeScalarTensor(3.14f);
   item.fetch = {"z"};
-  Tensor pi(DT_FLOAT, {});
-  pi.flat<float>()(0) = 3.14f;
   item.feed.emplace_back("x", pi);
   auto tensors_expected = EvaluateFetchNodes(item);
   GrapplerItem optimized(item, std::move(output));
@@ -317,24 +331,17 @@ TEST_F(FunctionOptimizerTest, FunctionWithInputForwarding) {
           func,
       });
 
-  FunctionOptimizer optimizer;
+  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   item.fetch = {"z0", "z1", "z2"};
-  Tensor in(DT_FLOAT, {});
-  in.flat<float>()(0) = 3.14f;
-  item.feed.emplace_back("x0", in);
-  in.flat<float>()(0) = 2.7f;
-  item.feed.emplace_back("x1", in);
-  in.flat<float>()(0) = 1.0f;
-  item.feed.emplace_back("x2", in);
-  in.flat<float>()(0) = -1.0f;
-  item.feed.emplace_back("x4", in);
-  Tensor in_int(DT_INT32, {});
-  in_int.flat<int>()(0) = 1234;
-  item.feed.emplace_back("x3", in_int);
+  item.feed.emplace_back("x0", MakeScalarTensor(3.14f));
+  item.feed.emplace_back("x1", MakeScalarTensor(2.7f));
+  item.feed.emplace_back("x2", MakeScalarTensor(1.0f));
+  item.feed.emplace_back("x4", MakeScalarTensor(-1.0f));
+  item.feed.emplace_back("x3", MakeScalarTensor(1234));
   auto tensors_expected = EvaluateFetchNodes(item);
   GrapplerItem optimized(item, std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
@@ -368,7 +375,7 @@ TEST_F(FunctionOptimizerTest, FunctionWithoutInput) {
           func,
       });
 
-  FunctionOptimizer optimizer;
+  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -377,6 +384,243 @@ TEST_F(FunctionOptimizerTest, FunctionWithoutInput) {
   EXPECT_EQ(item.graph.DebugString(), output.DebugString());
 }
 
+TEST_F(FunctionOptimizerTest, InlineFunctionWithNestedFunctionCall) {
+  // Define square via function library:
+  //   MySquare(x) = MyMul(x, x)
+
+  FunctionDef mul_func = FunctionDefHelper::Create(
+      "MyMul", {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"output"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "output:z:0"}});
+
+  FunctionDef square_func = FunctionDefHelper::Create(
+      "MySquare", {"x:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"output"}, "MyMul", {"x", "x"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "output:z:0"}});
+
+  GrapplerItem item;
+  item.graph = test::function::GDef(
+      {test::function::NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}},
+                            kDevice),
+       test::function::NDef("square", "MySquare", {"a"}, {{"T", DT_FLOAT}},
+                            kDevice),
+       test::function::NDef("outputs", "Identity", {"square:0"},
+                            {{"T", DT_FLOAT}}, kDevice)},
+      // FunctionLib
+      {mul_func, square_func});
+
+  GraphDef output;
+  FunctionOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  int count = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "square/inlined_inputs" && count++) {
+      EXPECT_EQ("IdentityN", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("a", node.input(0));
+    } else if (node.name() == "square/x" && count++) {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("square/inlined_inputs:0", node.input(0));
+    } else if (node.name() == "square/output/inlined_inputs" && count++) {
+      EXPECT_EQ("IdentityN", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("square/x", node.input(0));
+      EXPECT_EQ("square/x", node.input(1));
+    } else if (node.name() == "square/output/x" && count++) {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("square/output/inlined_inputs:0", node.input(0));
+    } else if (node.name() == "square/output/y" && count++) {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("square/output/inlined_inputs:1", node.input(0));
+    } else if (node.name() == "square/output/output" && count++) {
+      EXPECT_EQ("Mul", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("square/output/x", node.input(0));
+      EXPECT_EQ("square/output/y", node.input(1));
+    } else if (node.name() == "square/output" && count++) {
+      EXPECT_EQ("IdentityN", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("square/output/output:0", node.input(0));
+    } else if (node.name() == "square" && count++) {
+      EXPECT_EQ("IdentityN", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("square/output:0", node.input(0));
+    } else if (node.name() == "outputs" && count++) {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("square:0", node.input(0));
+    }
+  }
+  EXPECT_EQ(9, count);
+
+  item.fetch = {"outputs"};
+  item.feed.emplace_back("a", MakeScalarTensor(2.0f));
+  auto tensors_expected = EvaluateFetchNodes(item);
+
+  GrapplerItem optimized(item, std::move(output));
+  auto tensors = EvaluateFetchNodes(optimized);
+
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+}
+
+TEST_F(FunctionOptimizerTest, SymbolicGradients) {
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+
+  FunctionDef func = FunctionDefHelper::Define(
+      "TestFunc", {"x:float", "y:float"}, {"l:float"}, {},
+      {
+          {{"z"}, "Add", {"x", "y"}, {{"T", DT_FLOAT}}},
+          FunctionDefHelper::Const("zero", 0),
+          FunctionDefHelper::Const("one", 1),
+          {{"r"}, "Rank", {"z"}, {{"T", DT_FLOAT}}},
+          {{"indices"}, "Range", {"zero", "r", "one"}},
+          {{"l"}, "Sum", {"z", "indices"}, {{"T", DT_FLOAT}}},
+      });
+
+  auto x = ops::Const(scope, 1.0f);
+  auto y = ops::Const(scope, 2.0f);
+  auto dl = ops::Const(scope, 3.0f);
+
+  NameAttrList fn;
+  fn.set_name("TestFunc");
+  (*fn.mutable_attr())["T"].set_type(DT_FLOAT);
+  auto g0 = ops::SymbolicGradient(scope, std::initializer_list<Input>{x, y, dl},
+                                  {DT_FLOAT, DT_FLOAT}, fn);
+  auto out1 = ops::Identity(scope.WithOpName("out1"), g0.output[0]);
+  auto out2 = ops::Identity(scope.WithOpName("out2"), g0.output[1]);
+
+  GrapplerItem item;
+  TF_EXPECT_OK(scope.ToGraphDef(&item.graph));
+  *item.graph.mutable_library()->add_function() = func;
+
+  FunctionOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  std::vector<Tensor> expected =
+      EvaluateNodes(item.graph, {"out1", "out2"}, {});
+  std::vector<Tensor> optimized = EvaluateNodes(output, {"out1", "out2"}, {});
+  test::ExpectTensorEqual<float>(expected[0], optimized[0]);
+  test::ExpectTensorEqual<float>(expected[1], optimized[1]);
+}
+
+TEST_F(FunctionOptimizerTest, SymbolicGradientsIdentity) {
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+
+  FunctionDef func = FunctionDefHelper::Create(
+      // Name
+      "Identity_func",
+      // Args
+      {"in: float"},
+      // Return values
+      {"out: float"},
+      // Attr def
+      {},
+      // Nodes
+      {{{"Identity"}, "Identity", {"in"}, {{"T", DT_FLOAT}}}},
+      // Mapping
+      {{"out", "Identity:output:0"}});
+
+  auto x = ops::Const(scope, 1.0f, {3, 5, 7});
+  auto z = ops::Const(scope, 3.0f, {3, 5, 7});
+
+  NameAttrList fn;
+  fn.set_name("Identity_func");
+  auto g0 = ops::SymbolicGradient(scope, std::initializer_list<Input>{x, z},
+                                  {DT_FLOAT}, fn);
+  auto out = ops::Identity(scope.WithOpName("out"), g0.output[0]);
+
+  GrapplerItem item;
+  TF_EXPECT_OK(scope.ToGraphDef(&item.graph));
+  *item.graph.mutable_library()->add_function() = func;
+
+  FunctionOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(13, output.node_size());
+  EXPECT_EQ("Const", output.node(0).name());
+  EXPECT_EQ("Const_1", output.node(1).name());
+  EXPECT_EQ("SymbolicGradient/FunctionInputs", output.node(2).name());
+  EXPECT_EQ("SymbolicGradient", output.node(3).name());
+  EXPECT_EQ("SymbolicGradient/SymbolicGradient/Identity",
+            output.node(4).name());
+  EXPECT_EQ("SymbolicGradient/Func/_0", output.node(5).name());
+  EXPECT_EQ("SymbolicGradient/Func/_1", output.node(6).name());
+  EXPECT_EQ("SymbolicGradient/Func/_2", output.node(7).name());
+  EXPECT_EQ("SymbolicGradient/SymbolicGradient/Func/_1/dx",
+            output.node(8).name());
+  EXPECT_EQ("SymbolicGradient/Func/_3", output.node(9).name());
+  EXPECT_EQ("SymbolicGradient/Func/_4", output.node(10).name());
+  EXPECT_EQ("SymbolicGradient/Func/_5", output.node(11).name());
+  EXPECT_EQ("out", output.node(12).name());
+  for (int i = 2; i < 4; ++i) {
+    EXPECT_EQ("IdentityN", output.node(i).op());
+  }
+  for (int i = 4; i < 11; ++i) {
+    EXPECT_EQ("Identity", output.node(i).op());
+  }
+
+  std::vector<Tensor> expected = EvaluateNodes(item.graph, {"out"}, {});
+  std::vector<Tensor> optimized = EvaluateNodes(output, {"out"}, {});
+  test::ExpectTensorEqual<float>(expected[0], optimized[0]);
+}
+
+TEST_F(FunctionOptimizerTest, SymbolicGradientsNoInlineFunc) {
+  FunctionDef func = FunctionDefHelper::Define(
+      "TestFunc", {"x:float", "y:float"}, {"l:float"}, {},
+      {
+          {{"z"}, "Add", {"x", "y"}, {{"T", DT_FLOAT}}},
+          FunctionDefHelper::Const("zero", 0),
+          FunctionDefHelper::Const("one", 1),
+          {{"r"}, "Rank", {"z"}, {{"T", DT_FLOAT}}},
+          {{"indices"}, "Range", {"zero", "r", "one"}},
+          {{"l"}, "Sum", {"z", "indices"}, {{"T", DT_FLOAT}}},
+      });
+  (*func.mutable_attr())["_noinline"].set_b(true);
+
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+  auto x = ops::Const(scope, 1.0f);
+  auto y = ops::Const(scope, 2.0f);
+  auto dl = ops::Const(scope, 3.0f);
+
+  NameAttrList fn;
+  fn.set_name("TestFunc");
+  (*fn.mutable_attr())["T"].set_type(DT_FLOAT);
+  auto g0 = ops::SymbolicGradient(scope, std::initializer_list<Input>{x, y, dl},
+                                  {DT_FLOAT, DT_FLOAT}, fn);
+  auto out1 = ops::Identity(scope.WithOpName("out1"), g0.output[0]);
+  auto out2 = ops::Identity(scope.WithOpName("out2"), g0.output[1]);
+
+  GrapplerItem item;
+  TF_EXPECT_OK(scope.ToGraphDef(&item.graph));
+  *item.graph.mutable_library()->add_function() = func;
+
+  FunctionOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  // The optimizer should succeed but the graphs should be the same.
+  TF_EXPECT_OK(status);
+  CompareGraphs(item.graph, output);
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.cc b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7044705adee7c1be52b04e6556066546b17f944f
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.cc
@@ -0,0 +1,120 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/graph_optimizer_stage.h"
+
+namespace tensorflow {
+namespace grappler {
+
+const NodeScopeAndName ParseNodeScopeAndName(const string& node_name) {
+  auto pos = node_name.find_last_of("/");
+  if (pos == string::npos) {
+    return {"", node_name};
+  } else {
+    return {node_name.substr(0, pos), node_name.substr(pos + 1)};
+  }
+};
+
+Status GetInputNode(const GraphOptimizerContext& ctx, const string& input,
+                    NodeDef** node) {
+  string node_name = NodeName(input);
+  NodeDef* node_by_name = ctx.node_map->GetNode(node_name);
+  if (node_by_name == nullptr) {
+    return errors::FailedPrecondition("Node ", node_name,
+                                      " doesn't exists in a node map");
+  }
+  *node = node_by_name;
+  return Status::OK();
+}
+
+Status GetTensorProperties(const GraphOptimizerContext& ctx,
+                           const string& tensor,
+                           OpInfo::TensorProperties* properties) {
+  int port;
+  string tensor_node_name = ParseNodeName(tensor, &port);
+  if (port < 0) {
+    return errors::InvalidArgument(
+        "Can't get tensor properties of control dependency ", tensor);
+  }
+
+  const auto& output_properties =
+      ctx.graph_properties->GetOutputProperties(tensor_node_name);
+  auto num_outputs = output_properties.size();
+
+  if (num_outputs == 0 || port > num_outputs - 1) {
+    return errors::InvalidArgument(
+        "Node ", tensor_node_name,
+        " is missing output properties at position :", port,
+        " (num_outputs=", num_outputs, ")");
+  }
+
+  properties->CopyFrom(output_properties[port]);
+  return Status::OK();
+}
+
+NodeDef* AddCopyNode(const GraphOptimizerContext& ctx, const string& name,
+                     const NodeDef* node_to_copy) {
+  CHECK(node_to_copy != nullptr);
+  CHECK(!ctx.node_map->NodeExists(name))
+      << "Node " << name << " already exists in a graph";
+  NodeDef* new_node = ctx.optimized_graph->add_node();
+  *new_node = *node_to_copy;
+  new_node->set_name(name);
+  ctx.node_map->AddNode(name, new_node);
+  return new_node;
+}
+
+NodeDef* AddEmptyNode(const GraphOptimizerContext& ctx, const string& name) {
+  CHECK(!ctx.node_map->NodeExists(name))
+      << "Node " << name << " already exists in a graph";
+  NodeDef* new_node = ctx.optimized_graph->add_node();
+  new_node->set_name(name);
+  ctx.node_map->AddNode(name, new_node);
+  return new_node;
+}
+
+const string MakeOptimizedNodeName(const NodeScopeAndName& node,
+                                   const string& sub_scope,
+                                   const string& prefix) {
+  CHECK(!sub_scope.empty() || !prefix.empty())
+      << "Either optimized node name prefix or sub-scope must be non-empty";
+  string optimized_node_name;
+  if (!node.scope.empty()) {
+    strings::StrAppend(&optimized_node_name, node.scope, "/");
+  }
+  if (!sub_scope.empty()) {
+    strings::StrAppend(&optimized_node_name, sub_scope, "/");
+  }
+  if (!prefix.empty()) {
+    strings::StrAppend(&optimized_node_name, prefix, "_");
+  }
+  strings::StrAppend(&optimized_node_name, node.name);
+  return optimized_node_name;
+}
+
+const string MakeOptimizedNodeName(const NodeScopeAndName& root,
+                                   const std::vector<string> node_names,
+                                   const string& sub_scope,
+                                   const string& prefix) {
+  string optimized_node_name = MakeOptimizedNodeName(root, sub_scope, prefix);
+  for (const string& node_name : node_names) {
+    auto name_and_scope = ParseNodeScopeAndName(node_name);
+    strings::StrAppend(&optimized_node_name, "_", name_and_scope.name);
+  }
+  return optimized_node_name;
+}
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
new file mode 100644
index 0000000000000000000000000000000000000000..7ed047486111992dd61cc116319da91f0f93ac64
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
@@ -0,0 +1,240 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_GRAPPLER_OPTIMIZERS_OPTIMIZER_STAGE_H_
+#define TENSORFLOW_GRAPPLER_OPTIMIZERS_OPTIMIZER_STAGE_H_
+
+#include <unordered_map>
+#include <unordered_set>
+#include "tensorflow/core/grappler/costs/graph_properties.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/utils.h"
+
+namespace tensorflow {
+namespace grappler {
+
+struct NodeScopeAndName {
+  string scope;
+  string name;
+};
+
+// Parse scope and name: "a/b/c/Add_1" -> {"a/b/c", "Add_1"}
+const NodeScopeAndName ParseNodeScopeAndName(const string& node_name);
+
+// Context owned by GraphOptimizer, and passed to every stage at construction
+// time. Each optimizer stage is responsible for updating it according to the
+// changes it made to the graph.
+//
+// If an optimizer needs access to some helper class that is not present in this
+// context, consider creating an extension context, specific to that
+// optimizer (see example of ArithmeticOptimizerContext). GraphOptimizerContext
+// should only have members that are useful to almost all optimizers.
+struct GraphOptimizerContext {
+  GraphOptimizerContext(const std::unordered_set<string>* nodes_to_preserve,
+                        GraphDef* optimized_graph,
+                        GraphProperties* graph_properties, NodeMap* node_map)
+      : nodes_to_preserve(nodes_to_preserve),
+        optimized_graph(optimized_graph),
+        graph_properties(graph_properties),
+        node_map(node_map) {}
+
+  const std::unordered_set<string>* nodes_to_preserve;
+  GraphDef* optimized_graph;
+  GraphProperties* graph_properties;
+  NodeMap* node_map;
+};
+
+Status GetInputNode(const GraphOptimizerContext& ctx, const string& input,
+                    NodeDef** node);
+Status GetTensorProperties(const GraphOptimizerContext& ctx,
+                           const string& tensor,
+                           OpInfo::TensorProperties* properties);
+
+NodeDef* AddCopyNode(const GraphOptimizerContext& ctx, const string& name,
+                     const NodeDef* node_to_copy);
+NodeDef* AddEmptyNode(const GraphOptimizerContext& ctx, const string& name);
+
+// WARNING:
+// Optimizer stage must try to re-use original nodes of a graph and
+// make all updates in place. This helps to make robust node placement
+// decisions. Create new nodes only if there is a reason for that.
+
+// Make a name for a new node obtained by optimizing a single node of the
+// original graph. The optimized node is placed under the original node scope.
+//
+// Node name uniqueness is guaranteed by unique name of an original node in
+// a same scope.
+//
+// Empty sub_scope or prefix ignored. At least one of them must be non-empty.
+//
+// Example: a/b/c/Add -> a/b/c/${sub_scope}/${prefix}_Add.
+const string MakeOptimizedNodeName(const NodeScopeAndName& node,
+                                   const string& sub_scope,
+                                   const string& prefix);
+// Make a name for a new node obtained by optimizing multiple nodes of the
+// original graph, starting from "root". The optimized node is placed under
+// the original scope of a "root" node.
+//
+// Example: [a/b/c/Add, x/y/z/Mul] -> a/b/c/${sub_scope}/${prefix}_Add_Mul
+const string MakeOptimizedNodeName(const NodeScopeAndName& root,
+                                   const std::vector<string> node_names,
+                                   const string& sub_scope,
+                                   const string& prefix);
+
+// Base class for multi-stage GraphOptimizers (ArithmeticOptimizer, etc...).
+//
+// If a graph optimizer consists of large number of small independent
+// rewrites, each of them should be implemented as a separate stage.
+//
+// * Result:
+// Each graph optimizer choose what result is reported by each stage
+// (e.g. each stage can fill in the name of optimized nodes, or have more
+// complex result).
+template <typename Result>
+class GraphOptimizerStage {
+ public:
+  explicit GraphOptimizerStage(const string& optimizer_name,
+                               const string& stage_name,
+                               const GraphOptimizerContext& ctx)
+      : optimizer_name_(optimizer_name), stage_name_(stage_name), ctx_(ctx) {}
+  virtual ~GraphOptimizerStage() = default;
+
+  const string& stage_name() const { return stage_name_; }
+  const string& optimizer_name() const { return optimizer_name_; }
+
+  // Check if we should try to simplify node. Returning true doesn't
+  // guarantee that node will be simplified.
+  //
+  // Should implement just a basic sanity check, without any expensive graph
+  // traversals.
+  virtual bool IsSupported(const NodeDef* node) const = 0;
+
+  // Try to simplify the given node.
+  //
+  // Return error status only if some precondition is failed, or got an
+  // incorrect graph. In every other case return Status:OK(), even if didn't
+  // simplify anything.
+  //
+  // Report result using output argument. Each GraphOptimizer can choose it's
+  // own Result type.
+  // TODO(ezhulenev): if it will appear that Result output parameter is not
+  // sufficiently useful (used with a reason by most optimizers), get rid of it,
+  // and remove template parameter.
+  virtual Status TrySimplify(NodeDef* node, Result* result) = 0;
+
+  // Get a name for a new node, created by this stage, based on one or multiple
+  // nodes of an original graph.
+  const string OptimizedNodeName(const NodeScopeAndName& node) const {
+    return MakeOptimizedNodeName(node, optimizer_name_, stage_name_);
+  }
+  const string OptimizedNodeName(const NodeScopeAndName& root,
+                                 const std::vector<string>& nodes) const {
+    return MakeOptimizedNodeName(root, nodes, optimizer_name_, stage_name_);
+  }
+  const string OptimizedNodeName(const NodeScopeAndName& node,
+                                 const string& rewrite_rule) const {
+    const string prefix = strings::StrCat(stage_name_, "_", rewrite_rule);
+    return MakeOptimizedNodeName(node, optimizer_name_, prefix);
+  }
+
+  // Get a node by input name from a node map. Return an error if node was not
+  // found.
+  Status GetInputNode(const string& input, NodeDef** node) const {
+    return ::tensorflow::grappler::GetInputNode(ctx_, input, node);
+  }
+  // Lookup tensor properties by name. Tensor name might have non-zero port
+  // number. Return an error if tensor node doesn't exists in a graph, or it
+  // doesn't have properties defined for requested port.
+  Status GetTensorProperties(const string& tensor,
+                             OpInfo::TensorProperties* properties) const {
+    return ::tensorflow::grappler::GetTensorProperties(ctx_, tensor,
+                                                       properties);
+  }
+
+  NodeDef* AddCopyNode(const string& name, const NodeDef* node_to_copy) {
+    return ::tensorflow::grappler::AddCopyNode(ctx_, name, node_to_copy);
+  }
+  NodeDef* AddEmptyNode(const string& name) {
+    return ::tensorflow::grappler::AddEmptyNode(ctx_, name);
+  }
+
+ protected:  // Data members
+  const string optimizer_name_;
+  const string stage_name_;
+  const GraphOptimizerContext ctx_;
+};
+
+template <typename Result>
+class GraphOptimizerStagePipeline {
+ public:
+  // Break predicate specifies if a pipeline should stop early, and not pass
+  // a node to the next registered optimizer stage, typically that should be the
+  // case when a stage successfully optimized a node, and it wants to yield
+  // control to the optimizer.
+  explicit GraphOptimizerStagePipeline(
+      const std::function<bool(const Result&)> break_predicate)
+      : break_predicate_(break_predicate) {}
+
+  // Add a stage to the pipeline. It should be called with the arguments for the
+  // stage constructor:
+  //
+  //   pipeline.AddStage<FooStage>(constructor_arg1, constructor_arg2);
+  //
+  // Returns a reference to the added stage.
+  template <typename T, typename... Args>
+  T& AddStage(Args&&... args) {
+    auto stage = new T(std::forward<Args>(args)...);
+    stages_.push_back(std::unique_ptr<T>(stage));
+    return *stage;
+  }
+
+  // Pass a node through all registered optimizer stages, until break predicate
+  // is true.
+  //
+  // Return true, if pipeline exited after a break predicate was evaluated as
+  // 'true', which typically means that a node was optimized by one of the
+  // registered stages.
+  //
+  // Return false, if node was not optimized by any of registered stages.
+  bool PassThroughAllStages(NodeDef* node, Result* result) {
+    for (auto& stage : stages_) {
+      if (stage->IsSupported(node)) {
+        const Status stage_status = stage->TrySimplify(node, result);
+        // Each stage must be "error safe" (just like exception safe). In
+        // case of any error it must leave optimized graph unmodified.
+        if (!stage_status.ok()) {
+          LOG(WARNING) << "Failed to run optimizer " << stage->optimizer_name()
+                       << ", stage " << stage->stage_name()
+                       << ". Error: " << stage_status.error_message();
+        }
+        if (break_predicate_(*result)) return true;
+      }
+    }
+    return false;
+  }
+
+  std::size_t NumStages() { return stages_.size(); }
+
+ private:
+  std::vector<std::unique_ptr<GraphOptimizerStage<Result>>> stages_;
+  std::function<bool(const Result&)> break_predicate_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(GraphOptimizerStagePipeline);
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_GRAPPLER_OPTIMIZERS_OPTIMIZER_STAGE_H_
diff --git a/tensorflow/core/grappler/optimizers/graph_optimizer_stage_test.cc b/tensorflow/core/grappler/optimizers/graph_optimizer_stage_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3f5ab87a5a372a0dc954aa5a9ae57241635d5594
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/graph_optimizer_stage_test.cc
@@ -0,0 +1,166 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/graph_optimizer_stage.h"
+
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/grappler/costs/graph_properties.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+class GraphOptimizerStageTest : public ::testing::Test {};
+
+struct FakeResult {};
+
+// NoOp optimizer stage that supports all the node types and does nothing
+class FakeOptimizerStage : public GraphOptimizerStage<FakeResult> {
+ public:
+  explicit FakeOptimizerStage(const string& optimizer_name,
+                              const string& stage_name,
+                              const GraphOptimizerContext& ctx)
+      : GraphOptimizerStage(optimizer_name, stage_name, ctx) {}
+  ~FakeOptimizerStage() override = default;
+
+  bool IsSupported(const NodeDef* node) const override { return true; }
+  Status TrySimplify(NodeDef* node, FakeResult* result) override {
+    return Status::OK();
+  }
+};
+
+TEST_F(GraphOptimizerStageTest, ParseNodeNameAndScope_InRoot) {
+  const auto scope_and_name = ParseNodeScopeAndName("Add");
+  EXPECT_EQ("", scope_and_name.scope);
+  EXPECT_EQ("Add", scope_and_name.name);
+}
+
+TEST_F(GraphOptimizerStageTest, ParseNodeNameAndScope_InScope) {
+  const auto scope_and_name = ParseNodeScopeAndName("a/b/c/Add");
+  EXPECT_EQ("a/b/c", scope_and_name.scope);
+  EXPECT_EQ("Add", scope_and_name.name);
+}
+
+TEST_F(GraphOptimizerStageTest, OptimizedNodeName) {
+  GraphOptimizerContext ctx(/*nodes_to_preserve*/ nullptr,
+                            /*optimized_graph*/ nullptr,
+                            /*graph_properties*/ nullptr,
+                            /*node_name*/ nullptr);
+  FakeOptimizerStage stage("my_opt", "my_stg", ctx);
+
+  const auto node = ParseNodeScopeAndName("a/b/c/Add");
+
+  // Without rewrite rule
+  EXPECT_EQ("a/b/c/my_opt/my_stg_Add", stage.OptimizedNodeName(node));
+  EXPECT_EQ(
+      "a/b/c/my_opt/my_stg_Add_Mul_Sqrt",
+      stage.OptimizedNodeName(node, std::vector<string>({"Mul", "Sqrt"})));
+
+  // With rewrite rule
+  const string rewrite = "my_rewrite";
+  EXPECT_EQ("a/b/c/my_opt/my_stg_my_rewrite_Add",
+            stage.OptimizedNodeName(node, rewrite));
+}
+
+TEST_F(GraphOptimizerStageTest, GetInputNodeAndProperties) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  auto a = ops::Variable(s.WithOpName("a"), {2, 2}, DT_FLOAT);
+  auto b = ops::Variable(s.WithOpName("b"), {2, 2}, DT_FLOAT);
+  auto add = ops::Add(s.WithOpName("Add"), a, b);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphProperties properties(item);
+  TF_CHECK_OK(properties.InferStatically(/*assume_valid_feeds*/ false));
+
+  NodeMap node_map(&item.graph);
+
+  GraphOptimizerContext ctx(/*nodes_to_preserve*/ nullptr,
+                            /*optimized_graph*/ &item.graph,
+                            /*graph_properties*/ &properties,
+                            /*node_name*/ &node_map);
+  FakeOptimizerStage stage("my_opt", "my_stg", ctx);
+
+  NodeDef* add_node;
+  TF_CHECK_OK(stage.GetInputNode("Add", &add_node));
+  EXPECT_EQ("a", add_node->input(0));
+  EXPECT_EQ("b", add_node->input(1));
+
+  OpInfo::TensorProperties add_properties;
+  TF_CHECK_OK(stage.GetTensorProperties("Add", &add_properties));
+  EXPECT_EQ(DT_FLOAT, add_properties.dtype());
+
+  OpInfo::TensorProperties a_properties;
+  TF_CHECK_OK(stage.GetTensorProperties("a:0", &a_properties));
+  EXPECT_EQ(DT_FLOAT_REF, a_properties.dtype());
+
+  OpInfo::TensorProperties b_properties;
+  TF_CHECK_OK(stage.GetTensorProperties("b:0", &b_properties));
+  EXPECT_EQ(DT_FLOAT_REF, b_properties.dtype());
+}
+
+TEST_F(GraphOptimizerStageTest, AddNodes) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  auto a = ops::Variable(s.WithOpName("a"), {2, 2}, DT_FLOAT);
+  auto b = ops::Variable(s.WithOpName("b"), {2, 2}, DT_FLOAT);
+  auto add = ops::Add(s.WithOpName("Add"), a, b);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphProperties properties(item);
+  TF_CHECK_OK(properties.InferStatically(/*assume_valid_feeds*/ false));
+
+  NodeMap node_map(&item.graph);
+
+  GraphOptimizerContext ctx(/*nodes_to_preserve*/ nullptr,
+                            /*optimized_graph*/ &item.graph,
+                            /*graph_properties*/ &properties,
+                            /*node_name*/ &node_map);
+  FakeOptimizerStage stage("my_opt", "my_stg", ctx);
+
+  NodeDef* add_node;
+  TF_CHECK_OK(stage.GetInputNode("Add", &add_node));
+
+  // Add a new copy node
+  NodeDef* add_node_copy = stage.AddCopyNode("Add_1", add_node);
+  EXPECT_EQ("Add_1", add_node_copy->name());
+  EXPECT_EQ("Add", add_node_copy->op());
+  EXPECT_EQ("a", add_node_copy->input(0));
+  EXPECT_EQ("b", add_node_copy->input(1));
+
+  // It must be available for by-name lookup
+  NodeDef* add_node_copy_by_name;
+  TF_CHECK_OK(stage.GetInputNode("Add_1", &add_node_copy_by_name));
+  EXPECT_EQ(add_node_copy, add_node_copy_by_name);
+
+  // Add new empty node
+  NodeDef* empty_node = stage.AddEmptyNode("Add_2");
+  EXPECT_EQ("Add_2", empty_node->name());
+
+  // It must be available for by-name lookup
+  NodeDef* empty_node_by_name;
+  TF_CHECK_OK(stage.GetInputNode("Add_2", &empty_node_by_name));
+  EXPECT_EQ(empty_node, empty_node_by_name);
+}
+
+}  // namespace
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.cc b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
index e4af71c40ace855f3c58846b47903bedc81c35d0..308eecd4205d0d6efd6aecc9f8ca18e958342faa 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
@@ -301,10 +301,6 @@ bool IsComparisonOp(const NodeDef& node) {
   return is_compare;
 }
 
-bool IsLogicalOp(const NodeDef& node) {
-  return IsLogicalAnd(node) || IsLogicalNot(node) || IsLogicalOr(node);
-}
-
 bool IsReduceOp(const NodeDef& node) {
   return IsSum(node) || IsMean(node) || IsProd(node) || IsMax(node) ||
          IsMin(node) || IsAll(node) || IsAny(node);
@@ -551,8 +547,8 @@ class NodeProcessor : public GraphProcessor {
     string device;
     string not_used;
     if (DeviceNameUtils::SplitDeviceName(device_name, &not_used, &device) &&
-        (StringPiece(str_util::Lowercase(device)))
-            .contains(str_util::Lowercase(DEVICE_GPU))) {
+        str_util::StrContains(str_util::Lowercase(device),
+                              str_util::Lowercase(DEVICE_GPU))) {
       return true;
     }
     return false;
@@ -2123,6 +2119,10 @@ Status LayoutOptimizer::Tune(const GrapplerItem& item,
 
 Status LayoutOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
                                  GraphDef* output) {
+  if (cluster == nullptr) {
+    return errors::InvalidArgument("cluster == nullptr");
+  }
+
   if (GetNumGPUs(*cluster) < 1) {
     // LayoutOptimizer is currently only tuned for GPU.
     *output = item.graph;
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer.cc b/tensorflow/core/grappler/optimizers/loop_optimizer.cc
index 8f13c4a7022165bf2e809b12e11781cd689fa5c1..a063dc33816e25c560a385e188203c9ad9bfe4cd 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer.cc
@@ -46,8 +46,9 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
-std::vector<int> GetStackPushNodesToConvert(const SimpleGraphView& graph_view,
-                                            int stack_node_idx) {
+std::vector<int> GetStackPushNodesToConvert(
+    const SimpleGraphView& graph_view,
+    const std::unordered_set<string>& nodes_to_preserve, int stack_node_idx) {
   VLOG(1) << "Stack node: " << graph_view.graph()->node(stack_node_idx).name();
   const std::unordered_set<string> op_types_to_traverse(
       {"Stack", "StackV2", "Enter", "RefEnter", "Switch", "RefSwitch",
@@ -65,7 +66,9 @@ std::vector<int> GetStackPushNodesToConvert(const SimpleGraphView& graph_view,
                    op_types_to_traverse.end()) {
       continue;
     } else if (!IsStackPopOp(fanout_node) ||
-               !graph_view.outputs(fanout_idx).empty()) {
+               (!graph_view.outputs(fanout_idx).empty() ||
+                nodes_to_preserve.find(fanout_node.name()) !=
+                    nodes_to_preserve.end())) {
       // The node is either a stack pop with consumers or something unexpected
       // so we leave the graph alone.
       nodes_to_convert.clear();
@@ -75,15 +78,17 @@ std::vector<int> GetStackPushNodesToConvert(const SimpleGraphView& graph_view,
   return nodes_to_convert;
 }
 
-Status RemoveStackOps(const GraphDef& graph, GraphDef* optimized_graph) {
+Status RemoveStackOps(const GrapplerItem& item, GraphDef* optimized_graph) {
+  const std::unordered_set<string> nodes_to_preserve = item.NodesToPreserve();
+  const GraphDef& graph = item.graph;
   *optimized_graph = graph;
   NodeMap node_map(optimized_graph);
   SimpleGraphView graph_view;
   TF_RETURN_IF_ERROR(graph_view.Initialize(graph));
   for (int node_idx = 0; node_idx < graph.node_size(); ++node_idx) {
     if (IsStackOp(graph.node(node_idx))) {
-      for (int push_node_idx :
-           GetStackPushNodesToConvert(graph_view, node_idx)) {
+      for (int push_node_idx : GetStackPushNodesToConvert(
+               graph_view, nodes_to_preserve, node_idx)) {
         // We found push nodes without corresponding pops. Convert them to
         // Identity passing the data through and add a control dependency from
         // the op supplying the stack handle.
@@ -363,7 +368,7 @@ Status LoopOptimizer::FindInvariantNodes(NodeDef* node) {
     bool is_invariant = true;
     for (const auto& input : consumer->input()) {
       if (!IsControlInput(input)) {
-        const auto& name = NodeName(input);
+        const string name = NodeName(input);
         auto* producer = node_map_->GetNode(name);
         if (!invariant_nodes_.count(producer)) {
           if (IsConstant(*producer)) {
@@ -464,16 +469,19 @@ Status LoopOptimizer::LoopInvariantNodeMotion() {
 
 Status LoopOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
                                GraphDef* optimized_graph) {
-  TF_RETURN_IF_ERROR(RemoveStackOps(item.graph, optimized_graph));
-  optimized_graph_ = optimized_graph;
 
-  // Set up helper data structures.
-  node_map_.reset(new NodeMap(optimized_graph_));
-  int num_frames;
-  TF_RETURN_IF_ERROR(IdentifyFramesWithNodeMap(*optimized_graph_, *node_map_,
-                                               &frame_map_, &num_frames));
+  TF_RETURN_IF_ERROR(RemoveStackOps(item, optimized_graph));
+
+  if (opt_level_ == RewriterConfig::AGGRESSIVE) {
+    optimized_graph_ = optimized_graph;
+    // Set up helper data structures.
+    node_map_.reset(new NodeMap(optimized_graph_));
+    int num_frames;
+    TF_RETURN_IF_ERROR(IdentifyFramesWithNodeMap(*optimized_graph_, *node_map_,
+                                                 &frame_map_, &num_frames));
+    TF_RETURN_IF_ERROR(LoopInvariantNodeMotion());
+  }
 
-  TF_RETURN_IF_ERROR(LoopInvariantNodeMotion());
   return Status::OK();
 }
 
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc b/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc
index 0bd202a2abd87b4df4a45f51bc6b2bb09feddafb..a0bd3351976ccbeddd8778281dbdc0c07bbd6455 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/grappler_test.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -26,77 +27,61 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
-class LoopOptimizerTest : public ::testing::Test {
+class LoopOptimizerTest : public GrapplerTest {
  protected:
-  static NodeDef CreateNode(const string& name,
-                            const std::vector<string>& inputs) {
-    return CreateNode(name, "Identity", "", false, 0, inputs);
-  }
-  static NodeDef CreateNode(const string& name, const string& op,
-                            const std::vector<string>& inputs) {
-    return CreateNode(name, op, "", false, 0, inputs);
+  // These helpers always sets T=DT_FLOAT.
+  void AddEnterNode(const string& name, const string& frame,
+                    const bool is_constant, const int piterations,
+                    const std::vector<string>& inputs, GraphDef* graph) const {
+    std::vector<std::pair<string, AttrValue>> attributes;
+    AttrValue type;
+    type.set_type(DT_FLOAT);
+    attributes.emplace_back("T", type);
+    AttrValue frame_name;
+    frame_name.set_s(frame);
+    attributes.emplace_back("frame_name", frame_name);
+    AttrValue is_const;
+    is_const.set_b(is_constant);
+    attributes.emplace_back("is_constant", is_const);
+    AttrValue parallel_iterations;
+    parallel_iterations.set_i(piterations);
+    attributes.emplace_back("parallel_iterations", parallel_iterations);
+    AddNode(name, "Enter", inputs, attributes, graph);
   }
-  static NodeDef CreateNode(const string& name, const string& op,
-                            const string& frame,
-                            const bool is_constant,
-                            const int piterations,
-                            const std::vector<string>& inputs) {
-    NodeDef node;
-    node.set_name(name);
-    if (!op.empty()) {
-      node.set_op(op);
-    }
-    if (!frame.empty()) {
-      AttrValue frame_name;
-      frame_name.set_s(frame);
-      node.mutable_attr()->insert({"frame_name", frame_name});
-    }
-    if (op == "Enter") {
-      AttrValue is_const;
-      is_const.set_b(is_constant);
-      node.mutable_attr()->insert({"is_constant", is_const});
-      AttrValue parallel_iterations;
-      parallel_iterations.set_i(piterations);
-      node.mutable_attr()->insert(
-          {"parallel_iterations", parallel_iterations});
-    }
+
+  void AddSimpleNode(const string& name, const string& op,
+                     const std::vector<string>& inputs, GraphDef* graph) const {
+    std::vector<std::pair<string, AttrValue>> attributes;
     AttrValue type;
     type.set_type(DT_FLOAT);
-    node.mutable_attr()->insert({"T", type});
-    for (const string& input : inputs) {
-      node.add_input(input);
-    }
-    return node;
+    attributes.emplace_back("T", type);
+    AddNode(name, op, inputs, attributes, graph);
   }
 };
 
 TEST_F(LoopOptimizerTest, Basic) {
   GraphDef graph;
-  *graph.add_node() = CreateNode("0", {});
-  *graph.add_node() = CreateNode(
-      "InvariantEnter", "Enter", "while/while_context", true, 1, {"0"});
-  *graph.add_node() = CreateNode(
-      "InvariantAdd", "Add", {"InvariantEnter", "InvariantEnter"});
-  *graph.add_node() = CreateNode(
-      "VariantAdd", "Add", {"InvariantAdd", "Identity"});
-  *graph.add_node() = CreateNode(
-      "VariantEnter", "Enter", "while/while_context", false, 1, {"0"});
-  *graph.add_node() = CreateNode(
-      "Merge", "Merge", {"VariantEnter", "NextIteration"});
-  *graph.add_node() = CreateNode("Less/y", "Const", {"^Identity"});
-  *graph.add_node() = CreateNode("Less", "Less", {"VariantAdd", "less/y"});
-  *graph.add_node() = CreateNode("LoopCond", "LoopCond", {"Less"});
-  *graph.add_node() = CreateNode("Switch", "Switch", {"Merge", "LoopCond"});
-  *graph.add_node() = CreateNode("Identity", {"Switch:1"});
-  *graph.add_node() = CreateNode(
-      "NextIteration", "NextIteration", {"VariantAdd"});
-  *graph.add_node() = CreateNode("Exit", "Exit", {"Switch"});
-  *graph.add_node() = CreateNode("1", {"Exit"});
+  AddSimpleNode("In", "Identity", {}, &graph);
+  AddEnterNode("InvariantEnter", "while/while_context", true, 1, {"In"},
+               &graph);
+  AddSimpleNode("InvariantAdd", "Add", {"InvariantEnter", "InvariantEnter"},
+                &graph);
+  AddSimpleNode("VariantAdd", "Add", {"InvariantAdd", "Identity"}, &graph);
+  AddEnterNode("VariantEnter", "while/while_context", false, 1, {"In"}, &graph);
+  AddSimpleNode("Merge", "Merge", {"VariantEnter", "NextIteration"}, &graph);
+  AddSimpleNode("Less/y", "Const", {"^Identity"}, &graph);
+  AddSimpleNode("Less", "Less", {"VariantAdd", "Less/y"}, &graph);
+  AddSimpleNode("LoopCond", "LoopCond", {"Less"}, &graph);
+  AddSimpleNode("Switch", "Switch", {"Merge", "LoopCond"}, &graph);
+  AddSimpleNode("Identity", "Identity", {"Switch:1"}, &graph);
+  AddSimpleNode("NextIteration", "NextIteration", {"VariantAdd"}, &graph);
+  AddSimpleNode("Exit", "Exit", {"Switch"}, &graph);
+  AddSimpleNode("Out", "Identity", {"Exit"}, &graph);
 
   GrapplerItem item;
   item.graph = graph;
 
-  LoopOptimizer optimizer;
+  LoopOptimizer optimizer(RewriterConfig::AGGRESSIVE);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -123,32 +108,27 @@ TEST_F(LoopOptimizerTest, Basic) {
 
 TEST_F(LoopOptimizerTest, Const) {
   GraphDef graph;
-  *graph.add_node() = CreateNode("0", {});
-  *graph.add_node() = CreateNode(
-      "InvariantEnter", "Enter", "while/while_context", true, 1, {"0"});
-  *graph.add_node() = CreateNode("Const", "Const", {"^Identity"});
-  *graph.add_node() = CreateNode(
-      "InvariantAdd", "Add", {"InvariantEnter", "Const"});
-  *graph.add_node() = CreateNode(
-      "VariantAdd", "Add", {"InvariantAdd", "Identity"});
-  *graph.add_node() = CreateNode(
-      "VariantEnter", "Enter", "while/while_context", false, 1, {"0"});
-  *graph.add_node() = CreateNode(
-      "Merge", "Merge", {"VariantEnter", "NextIteration"});
-  *graph.add_node() = CreateNode("Less/y", "Const", {"^Identity"});
-  *graph.add_node() = CreateNode("Less", "Less", {"VariantAdd", "less/y"});
-  *graph.add_node() = CreateNode("LoopCond", "LoopCond", {"Less"});
-  *graph.add_node() = CreateNode("Switch", "Switch", {"Merge", "LoopCond"});
-  *graph.add_node() = CreateNode("Identity", {"Switch:1"});
-  *graph.add_node() = CreateNode(
-      "NextIteration", "NextIteration", {"VariantAdd"});
-  *graph.add_node() = CreateNode("Exit", "Exit", {"Switch"});
-  *graph.add_node() = CreateNode("1", {"Exit"});
+  AddSimpleNode("In", "Identity", {}, &graph);
+  AddEnterNode("InvariantEnter", "while/while_context", true, 1, {"In"},
+               &graph);
+  AddSimpleNode("Const", "Const", {"^Identity"}, &graph);
+  AddSimpleNode("InvariantAdd", "Add", {"InvariantEnter", "Const"}, &graph);
+  AddSimpleNode("VariantAdd", "Add", {"InvariantAdd", "Identity"}, &graph);
+  AddEnterNode("VariantEnter", "while/while_context", false, 1, {"In"}, &graph);
+  AddSimpleNode("Merge", "Merge", {"VariantEnter", "NextIteration"}, &graph);
+  AddSimpleNode("Less/y", "Const", {"^Identity"}, &graph);
+  AddSimpleNode("Less", "Less", {"VariantAdd", "Less/y"}, &graph);
+  AddSimpleNode("LoopCond", "LoopCond", {"Less"}, &graph);
+  AddSimpleNode("Switch", "Switch", {"Merge", "LoopCond"}, &graph);
+  AddSimpleNode("Identity", "Identity", {"Switch:1"}, &graph);
+  AddSimpleNode("NextIteration", "NextIteration", {"VariantAdd"}, &graph);
+  AddSimpleNode("Exit", "Exit", {"Switch"}, &graph);
+  AddSimpleNode("Out", "Identity", {"Exit"}, &graph);
 
   GrapplerItem item;
   item.graph = graph;
 
-  LoopOptimizer optimizer;
+  LoopOptimizer optimizer(RewriterConfig::AGGRESSIVE);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -174,32 +154,28 @@ TEST_F(LoopOptimizerTest, Const) {
 
 TEST_F(LoopOptimizerTest, ControlOutput) {
   GraphDef graph;
-  *graph.add_node() = CreateNode("0", {});
-  *graph.add_node() = CreateNode(
-      "InvariantEnter", "Enter", "while/while_context", true, 1, {"0"});
-  *graph.add_node() = CreateNode(
-      "InvariantAdd", "Add", {"InvariantEnter", "InvariantEnter"});
-  *graph.add_node() = CreateNode(
-      "VariantAdd", "Add", {"InvariantAdd", "Identity"});
-  *graph.add_node() = CreateNode(
-      "VariantEnter", "Enter", "while/while_context", false, 1, {"0"});
-  *graph.add_node() = CreateNode(
-      "Merge", "Merge", {"VariantEnter", "NextIteration"});
-  *graph.add_node() = CreateNode("Less/y", "Const", {"^Identity"});
-  *graph.add_node() = CreateNode(
-      "Less", "Less", {"VariantAdd", "less/y", "^InvariantAdd"});
-  *graph.add_node() = CreateNode("LoopCond", "LoopCond", {"Less"});
-  *graph.add_node() = CreateNode("Switch", "Switch", {"Merge", "LoopCond"});
-  *graph.add_node() = CreateNode("Identity", {"Switch:1"});
-  *graph.add_node() = CreateNode(
-      "NextIteration", "NextIteration", {"VariantAdd"});
-  *graph.add_node() = CreateNode("Exit", "Exit", {"Switch"});
-  *graph.add_node() = CreateNode("1", {"Exit"});
+  AddSimpleNode("In", "Identity", {}, &graph);
+  AddEnterNode("InvariantEnter", "while/while_context", true, 1, {"In"},
+               &graph);
+  AddSimpleNode("InvariantAdd", "Add", {"InvariantEnter", "InvariantEnter"},
+                &graph);
+  AddSimpleNode("VariantAdd", "Add", {"InvariantAdd", "Identity"}, &graph);
+  AddEnterNode("VariantEnter", "while/while_context", false, 1, {"In"}, &graph);
+  AddSimpleNode("Merge", "Merge", {"VariantEnter", "NextIteration"}, &graph);
+  AddSimpleNode("Less/y", "Const", {"^Identity"}, &graph);
+  AddSimpleNode("Less", "Less", {"VariantAdd", "Less/y", "^InvariantAdd"},
+                &graph);
+  AddSimpleNode("LoopCond", "LoopCond", {"Less"}, &graph);
+  AddSimpleNode("Switch", "Switch", {"Merge", "LoopCond"}, &graph);
+  AddSimpleNode("Identity", "Identity", {"Switch:1"}, &graph);
+  AddSimpleNode("NextIteration", "NextIteration", {"VariantAdd"}, &graph);
+  AddSimpleNode("Exit", "Exit", {"Switch"}, &graph);
+  AddSimpleNode("Out", "Identity", {"Exit"}, &graph);
 
   GrapplerItem item;
   item.graph = graph;
 
-  LoopOptimizer optimizer;
+  LoopOptimizer optimizer(RewriterConfig::AGGRESSIVE);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -223,52 +199,43 @@ TEST_F(LoopOptimizerTest, ControlOutput) {
 
 TEST_F(LoopOptimizerTest, NestedLoop1) {
   GraphDef graph;
-  *graph.add_node() = CreateNode("0", {});
-  *graph.add_node() = CreateNode(
-      "InvariantEnter", "Enter", "while/while_context", true, 1, {"0"});
-  *graph.add_node() = CreateNode(
-      "InvariantAdd", "Add", {"InvariantEnter", "InvariantEnter"});
-  *graph.add_node() = CreateNode(
-      "VariantAdd", "Add", {"InvariantAdd", "Identity"});
-  *graph.add_node() = CreateNode(
-      "VariantEnter", "Enter", "while/while_context", false, 1, {"0"});
-  *graph.add_node() = CreateNode(
-      "Merge", "Merge", {"VariantEnter", "NextIteration"});
-  *graph.add_node() = CreateNode("Less/y", "Const", {"^Identity"});
-  *graph.add_node() = CreateNode("Less", "Less", {"Exit2", "less/y"});
-  *graph.add_node() = CreateNode("LoopCond", "LoopCond", {"Less"});
-  *graph.add_node() = CreateNode("Switch", "Switch", {"Merge", "LoopCond"});
-  *graph.add_node() = CreateNode("Identity", {"Switch:1"});
-  *graph.add_node() = CreateNode(
-      "NextIteration", "NextIteration", {"Exit2"});
-  *graph.add_node() = CreateNode("Exit", "Exit", {"Switch"});
-  *graph.add_node() = CreateNode("1", {"Exit"});
-
-  *graph.add_node() = CreateNode(
-      "InvariantEnter2", "Enter", "while/while/while_context", true, 1,
-      {"VariantAdd"});
-  *graph.add_node() = CreateNode(
-      "InvariantAdd2", "Add", {"InvariantEnter2", "InvariantEnter2"});
-  *graph.add_node() = CreateNode(
-      "VariantAdd2", "Add", {"InvariantAdd2", "Identity2"});
-  *graph.add_node() = CreateNode(
-      "VariantEnter2", "Enter", "while/while/while_context", false, 1,
-      {"VariantEnter"});
-  *graph.add_node() = CreateNode(
-      "Merge2", "Merge", {"VariantEnter2", "NextIteration2"});
-  *graph.add_node() = CreateNode("Less2/y", "Const", {"^Identity2"});
-  *graph.add_node() = CreateNode("Less2", "Less", {"VariantAdd2", "less2/y"});
-  *graph.add_node() = CreateNode("LoopCond2", "LoopCond", {"Less2"});
-  *graph.add_node() = CreateNode("Switch2", "Switch", {"Merge2", "LoopCond2"});
-  *graph.add_node() = CreateNode("Identity2", {"Switch2:1"});
-  *graph.add_node() = CreateNode(
-      "NextIteration2", "NextIteration", {"VariantAdd2"});
-  *graph.add_node() = CreateNode("Exit2", "Exit", {"Switch2"});
+  AddSimpleNode("In", "Identity", {}, &graph);
+  AddEnterNode("InvariantEnter", "while/while_context", true, 1, {"In"},
+               &graph);
+  AddSimpleNode("InvariantAdd", "Add", {"InvariantEnter", "InvariantEnter"},
+                &graph);
+  AddSimpleNode("VariantAdd", "Add", {"InvariantAdd", "Identity"}, &graph);
+  AddEnterNode("VariantEnter", "while/while_context", false, 1, {"In"}, &graph);
+  AddSimpleNode("Merge", "Merge", {"VariantEnter", "NextIteration"}, &graph);
+  AddSimpleNode("Less/y", "Const", {"^Identity"}, &graph);
+  AddSimpleNode("Less", "Less", {"Exit2", "Less/y"}, &graph);
+  AddSimpleNode("LoopCond", "LoopCond", {"Less"}, &graph);
+  AddSimpleNode("Switch", "Switch", {"Merge", "LoopCond"}, &graph);
+  AddSimpleNode("Identity", "Identity", {"Switch:1"}, &graph);
+  AddSimpleNode("NextIteration", "NextIteration", {"Exit2"}, &graph);
+  AddSimpleNode("Exit", "Exit", {"Switch"}, &graph);
+  AddSimpleNode("Out", "Identity", {"Exit"}, &graph);
+
+  AddEnterNode("InvariantEnter2", "while/while/while_context", true, 1,
+               {"VariantAdd"}, &graph);
+  AddSimpleNode("InvariantAdd2", "Add", {"InvariantEnter2", "InvariantEnter2"},
+                &graph);
+  AddSimpleNode("VariantAdd2", "Add", {"InvariantAdd2", "Identity2"}, &graph);
+  AddEnterNode("VariantEnter2", "while/while/while_context", false, 1,
+               {"VariantEnter"}, &graph);
+  AddSimpleNode("Merge2", "Merge", {"VariantEnter2", "NextIteration2"}, &graph);
+  AddSimpleNode("Less2/y", "Const", {"^Identity2"}, &graph);
+  AddSimpleNode("Less2", "Less", {"VariantAdd2", "Less2/y"}, &graph);
+  AddSimpleNode("LoopCond2", "LoopCond", {"Less2"}, &graph);
+  AddSimpleNode("Switch2", "Switch", {"Merge2", "LoopCond2"}, &graph);
+  AddSimpleNode("Identity2", "Identity", {"Switch2:1"}, &graph);
+  AddSimpleNode("NextIteration2", "NextIteration", {"VariantAdd2"}, &graph);
+  AddSimpleNode("Exit2", "Exit", {"Switch2"}, &graph);
 
   GrapplerItem item;
   item.graph = graph;
 
-  LoopOptimizer optimizer;
+  LoopOptimizer optimizer(RewriterConfig::AGGRESSIVE);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -299,52 +266,43 @@ TEST_F(LoopOptimizerTest, NestedLoop1) {
 
 TEST_F(LoopOptimizerTest, NestedLoop2) {
   GraphDef graph;
-  *graph.add_node() = CreateNode("0", {});
-  *graph.add_node() = CreateNode(
-      "InvariantEnter", "Enter", "while/while_context", true, 1, {"0"});
-  *graph.add_node() = CreateNode(
-      "InvariantAdd", "Add", {"InvariantEnter", "InvariantEnter"});
-  *graph.add_node() = CreateNode(
-      "VariantAdd", "Add", {"InvariantAdd", "Identity"});
-  *graph.add_node() = CreateNode(
-      "VariantEnter", "Enter", "while/while_context", false, 1, {"0"});
-  *graph.add_node() = CreateNode(
-      "Merge", "Merge", {"VariantEnter", "NextIteration"});
-  *graph.add_node() = CreateNode("Less/y", "Const", {"^Identity"});
-  *graph.add_node() = CreateNode("Less", "Less", {"Exit2", "less/y"});
-  *graph.add_node() = CreateNode("LoopCond", "LoopCond", {"Less"});
-  *graph.add_node() = CreateNode("Switch", "Switch", {"Merge", "LoopCond"});
-  *graph.add_node() = CreateNode("Identity", {"Switch:1"});
-  *graph.add_node() = CreateNode(
-      "NextIteration", "NextIteration", {"Exit2"});
-  *graph.add_node() = CreateNode("Exit", "Exit", {"Switch"});
-  *graph.add_node() = CreateNode("1", {"Exit"});
-
-  *graph.add_node() = CreateNode(
-      "InvariantEnter2", "Enter", "while/while/while_context", true, 1,
-      {"InvariantAdd"});
-  *graph.add_node() = CreateNode(
-      "InvariantAdd2", "Add", {"InvariantEnter2", "InvariantEnter2"});
-  *graph.add_node() = CreateNode(
-      "VariantAdd2", "Add", {"InvariantAdd2", "Identity2"});
-  *graph.add_node() = CreateNode(
-      "VariantEnter2", "Enter", "while/while/while_context", false, 1,
-      {"VariantEnter"});
-  *graph.add_node() = CreateNode(
-      "Merge2", "Merge", {"VariantEnter2", "NextIteration2"});
-  *graph.add_node() = CreateNode("Less2/y", "Const", {"^Identity2"});
-  *graph.add_node() = CreateNode("Less2", "Less", {"VariantAdd2", "less2/y"});
-  *graph.add_node() = CreateNode("LoopCond2", "LoopCond", {"Less2"});
-  *graph.add_node() = CreateNode("Switch2", "Switch", {"Merge2", "LoopCond2"});
-  *graph.add_node() = CreateNode("Identity2", {"Switch2:1"});
-  *graph.add_node() = CreateNode(
-      "NextIteration2", "NextIteration", {"VariantAdd2"});
-  *graph.add_node() = CreateNode("Exit2", "Exit", {"Switch2"});
+  AddSimpleNode("In", "Identity", {}, &graph);
+  AddEnterNode("InvariantEnter", "while/while_context", true, 1, {"In"},
+               &graph);
+  AddSimpleNode("InvariantAdd", "Add", {"InvariantEnter", "InvariantEnter"},
+                &graph);
+  AddSimpleNode("VariantAdd", "Add", {"InvariantAdd", "Identity"}, &graph);
+  AddEnterNode("VariantEnter", "while/while_context", false, 1, {"In"}, &graph);
+  AddSimpleNode("Merge", "Merge", {"VariantEnter", "NextIteration"}, &graph);
+  AddSimpleNode("Less/y", "Const", {"^Identity"}, &graph);
+  AddSimpleNode("Less", "Less", {"Exit2", "Less/y"}, &graph);
+  AddSimpleNode("LoopCond", "LoopCond", {"Less"}, &graph);
+  AddSimpleNode("Switch", "Switch", {"Merge", "LoopCond"}, &graph);
+  AddSimpleNode("Identity", "Identity", {"Switch:1"}, &graph);
+  AddSimpleNode("NextIteration", "NextIteration", {"Exit2"}, &graph);
+  AddSimpleNode("Exit", "Exit", {"Switch"}, &graph);
+  AddSimpleNode("Out", "Identity", {"Exit"}, &graph);
+
+  AddEnterNode("InvariantEnter2", "while/while/while_context", true, 1,
+               {"InvariantAdd"}, &graph);
+  AddSimpleNode("InvariantAdd2", "Add", {"InvariantEnter2", "InvariantEnter2"},
+                &graph);
+  AddSimpleNode("VariantAdd2", "Add", {"InvariantAdd2", "Identity2"}, &graph);
+  AddEnterNode("VariantEnter2", "while/while/while_context", false, 1,
+               {"VariantEnter"}, &graph);
+  AddSimpleNode("Merge2", "Merge", {"VariantEnter2", "NextIteration2"}, &graph);
+  AddSimpleNode("Less2/y", "Const", {"^Identity2"}, &graph);
+  AddSimpleNode("Less2", "Less", {"VariantAdd2", "Less2/y"}, &graph);
+  AddSimpleNode("LoopCond2", "LoopCond", {"Less2"}, &graph);
+  AddSimpleNode("Switch2", "Switch", {"Merge2", "LoopCond2"}, &graph);
+  AddSimpleNode("Identity2", "Identity", {"Switch2:1"}, &graph);
+  AddSimpleNode("NextIteration2", "NextIteration", {"VariantAdd2"}, &graph);
+  AddSimpleNode("Exit2", "Exit", {"Switch2"}, &graph);
 
   GrapplerItem item;
   item.graph = graph;
 
-  LoopOptimizer optimizer;
+  LoopOptimizer optimizer(RewriterConfig::AGGRESSIVE);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -371,53 +329,43 @@ TEST_F(LoopOptimizerTest, NestedLoop2) {
 
 TEST_F(LoopOptimizerTest, NestedLoopConst1) {
   GraphDef graph;
-  *graph.add_node() = CreateNode("0", {});
-  *graph.add_node() = CreateNode(
-      "InvariantEnter", "Enter", "while/while_context", true, 1, {"0"});
-  *graph.add_node() = CreateNode(
-      "InvariantAdd", "Add", {"InvariantEnter", "InvariantEnter"});
-  *graph.add_node() = CreateNode(
-      "VariantAdd", "Add", {"InvariantAdd", "Identity"});
-  *graph.add_node() = CreateNode(
-      "VariantEnter", "Enter", "while/while_context", false, 1, {"0"});
-  *graph.add_node() = CreateNode(
-      "Merge", "Merge", {"VariantEnter", "NextIteration"});
-  *graph.add_node() = CreateNode("Less/y", "Const", {"^Identity"});
-  *graph.add_node() = CreateNode("Less", "Less", {"Exit2", "less/y"});
-  *graph.add_node() = CreateNode("LoopCond", "LoopCond", {"Less"});
-  *graph.add_node() = CreateNode("Switch", "Switch", {"Merge", "LoopCond"});
-  *graph.add_node() = CreateNode("Identity", {"Switch:1"});
-  *graph.add_node() = CreateNode(
-      "NextIteration", "NextIteration", {"Exit2"});
-  *graph.add_node() = CreateNode("Exit", "Exit", {"Switch"});
-  *graph.add_node() = CreateNode("1", {"Exit"});
-
-  *graph.add_node() = CreateNode(
-      "InvariantEnter2", "Enter", "while/while/while_context", true, 1,
-      {"VariantAdd"});
-  *graph.add_node() = CreateNode("Const2", "Const", {"^Identity2"});
-  *graph.add_node() = CreateNode(
-      "InvariantAdd2", "Add", {"InvariantEnter2", "Const2"});
-  *graph.add_node() = CreateNode(
-      "VariantAdd2", "Add", {"InvariantAdd2", "Identity2"});
-  *graph.add_node() = CreateNode(
-      "VariantEnter2", "Enter", "while/while/while_context", false, 1,
-      {"VariantEnter"});
-  *graph.add_node() = CreateNode(
-      "Merge2", "Merge", {"VariantEnter2", "NextIteration2"});
-  *graph.add_node() = CreateNode("Less2/y", "Const", {"^Identity2"});
-  *graph.add_node() = CreateNode("Less2", "Less", {"VariantAdd2", "less2/y"});
-  *graph.add_node() = CreateNode("LoopCond2", "LoopCond", {"Less2"});
-  *graph.add_node() = CreateNode("Switch2", "Switch", {"Merge2", "LoopCond2"});
-  *graph.add_node() = CreateNode("Identity2", {"Switch2:1"});
-  *graph.add_node() = CreateNode(
-      "NextIteration2", "NextIteration", {"VariantAdd2"});
-  *graph.add_node() = CreateNode("Exit2", "Exit", {"Switch2"});
+  AddSimpleNode("In", "Identity", {}, &graph);
+  AddEnterNode("InvariantEnter", "while/while_context", true, 1, {"In"},
+               &graph);
+  AddSimpleNode("InvariantAdd", "Add", {"InvariantEnter", "InvariantEnter"},
+                &graph);
+  AddSimpleNode("VariantAdd", "Add", {"InvariantAdd", "Identity"}, &graph);
+  AddEnterNode("VariantEnter", "while/while_context", false, 1, {"In"}, &graph);
+  AddSimpleNode("Merge", "Merge", {"VariantEnter", "NextIteration"}, &graph);
+  AddSimpleNode("Less/y", "Const", {"^Identity"}, &graph);
+  AddSimpleNode("Less", "Less", {"Exit2", "Less/y"}, &graph);
+  AddSimpleNode("LoopCond", "LoopCond", {"Less"}, &graph);
+  AddSimpleNode("Switch", "Switch", {"Merge", "LoopCond"}, &graph);
+  AddSimpleNode("Identity", "Identity", {"Switch:1"}, &graph);
+  AddSimpleNode("NextIteration", "NextIteration", {"Exit2"}, &graph);
+  AddSimpleNode("Exit", "Exit", {"Switch"}, &graph);
+  AddSimpleNode("Out", "Identity", {"Exit"}, &graph);
+
+  AddEnterNode("InvariantEnter2", "while/while/while_context", true, 1,
+               {"VariantAdd"}, &graph);
+  AddSimpleNode("Const2", "Const", {"^Identity2"}, &graph);
+  AddSimpleNode("InvariantAdd2", "Add", {"InvariantEnter2", "Const2"}, &graph);
+  AddSimpleNode("VariantAdd2", "Add", {"InvariantAdd2", "Identity2"}, &graph);
+  AddEnterNode("VariantEnter2", "while/while/while_context", false, 1,
+               {"VariantEnter"}, &graph);
+  AddSimpleNode("Merge2", "Merge", {"VariantEnter2", "NextIteration2"}, &graph);
+  AddSimpleNode("Less2/y", "Const", {"^Identity2"}, &graph);
+  AddSimpleNode("Less2", "Less", {"VariantAdd2", "Less2/y"}, &graph);
+  AddSimpleNode("LoopCond2", "LoopCond", {"Less2"}, &graph);
+  AddSimpleNode("Switch2", "Switch", {"Merge2", "LoopCond2"}, &graph);
+  AddSimpleNode("Identity2", "Identity", {"Switch2:1"}, &graph);
+  AddSimpleNode("NextIteration2", "NextIteration", {"VariantAdd2"}, &graph);
+  AddSimpleNode("Exit2", "Exit", {"Switch2"}, &graph);
 
   GrapplerItem item;
   item.graph = graph;
 
-  LoopOptimizer optimizer;
+  LoopOptimizer optimizer(RewriterConfig::AGGRESSIVE);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -445,53 +393,43 @@ TEST_F(LoopOptimizerTest, NestedLoopConst1) {
 
 TEST_F(LoopOptimizerTest, NestedLoopConst2) {
   GraphDef graph;
-  *graph.add_node() = CreateNode("0", {});
-  *graph.add_node() = CreateNode(
-      "InvariantEnter", "Enter", "while/while_context", true, 1, {"0"});
-  *graph.add_node() = CreateNode(
-      "InvariantAdd", "Add", {"InvariantEnter", "InvariantEnter"});
-  *graph.add_node() = CreateNode(
-      "VariantAdd", "Add", {"InvariantAdd", "Identity"});
-  *graph.add_node() = CreateNode(
-      "VariantEnter", "Enter", "while/while_context", false, 1, {"0"});
-  *graph.add_node() = CreateNode(
-      "Merge", "Merge", {"VariantEnter", "NextIteration"});
-  *graph.add_node() = CreateNode("Less/y", "Const", {"^Identity"});
-  *graph.add_node() = CreateNode("Less", "Less", {"Exit2", "less/y"});
-  *graph.add_node() = CreateNode("LoopCond", "LoopCond", {"Less"});
-  *graph.add_node() = CreateNode("Switch", "Switch", {"Merge", "LoopCond"});
-  *graph.add_node() = CreateNode("Identity", {"Switch:1"});
-  *graph.add_node() = CreateNode(
-      "NextIteration", "NextIteration", {"Exit2"});
-  *graph.add_node() = CreateNode("Exit", "Exit", {"Switch"});
-  *graph.add_node() = CreateNode("1", {"Exit"});
-
-  *graph.add_node() = CreateNode(
-      "InvariantEnter2", "Enter", "while/while/while_context", true, 1,
-      {"InvariantAdd"});
-  *graph.add_node() = CreateNode("Const2", "Const", {"^Identity2"});
-  *graph.add_node() = CreateNode(
-      "InvariantAdd2", "Add", {"InvariantEnter2", "Const2"});
-  *graph.add_node() = CreateNode(
-      "VariantAdd2", "Add", {"InvariantAdd2", "Identity2"});
-  *graph.add_node() = CreateNode(
-      "VariantEnter2", "Enter", "while/while/while_context", false, 1,
-      {"VariantEnter"});
-  *graph.add_node() = CreateNode(
-      "Merge2", "Merge", {"VariantEnter2", "NextIteration2"});
-  *graph.add_node() = CreateNode("Less2/y", "Const", {"^Identity2"});
-  *graph.add_node() = CreateNode("Less2", "Less", {"VariantAdd2", "less2/y"});
-  *graph.add_node() = CreateNode("LoopCond2", "LoopCond", {"Less2"});
-  *graph.add_node() = CreateNode("Switch2", "Switch", {"Merge2", "LoopCond2"});
-  *graph.add_node() = CreateNode("Identity2", {"Switch2:1"});
-  *graph.add_node() = CreateNode(
-      "NextIteration2", "NextIteration", {"VariantAdd2"});
-  *graph.add_node() = CreateNode("Exit2", "Exit", {"Switch2"});
+  AddSimpleNode("In", "Identity", {}, &graph);
+  AddEnterNode("InvariantEnter", "while/while_context", true, 1, {"In"},
+               &graph);
+  AddSimpleNode("InvariantAdd", "Add", {"InvariantEnter", "InvariantEnter"},
+                &graph);
+  AddSimpleNode("VariantAdd", "Add", {"InvariantAdd", "Identity"}, &graph);
+  AddEnterNode("VariantEnter", "while/while_context", false, 1, {"In"}, &graph);
+  AddSimpleNode("Merge", "Merge", {"VariantEnter", "NextIteration"}, &graph);
+  AddSimpleNode("Less/y", "Const", {"^Identity"}, &graph);
+  AddSimpleNode("Less", "Less", {"Exit2", "Less/y"}, &graph);
+  AddSimpleNode("LoopCond", "LoopCond", {"Less"}, &graph);
+  AddSimpleNode("Switch", "Switch", {"Merge", "LoopCond"}, &graph);
+  AddSimpleNode("Identity", "Identity", {"Switch:1"}, &graph);
+  AddSimpleNode("NextIteration", "NextIteration", {"Exit2"}, &graph);
+  AddSimpleNode("Exit", "Exit", {"Switch"}, &graph);
+  AddSimpleNode("Out", "Identity", {"Exit"}, &graph);
+
+  AddEnterNode("InvariantEnter2", "while/while/while_context", true, 1,
+               {"InvariantAdd"}, &graph);
+  AddSimpleNode("Const2", "Const", {"^Identity2"}, &graph);
+  AddSimpleNode("InvariantAdd2", "Add", {"InvariantEnter2", "Const2"}, &graph);
+  AddSimpleNode("VariantAdd2", "Add", {"InvariantAdd2", "Identity2"}, &graph);
+  AddEnterNode("VariantEnter2", "while/while/while_context", false, 1,
+               {"VariantEnter"}, &graph);
+  AddSimpleNode("Merge2", "Merge", {"VariantEnter2", "NextIteration2"}, &graph);
+  AddSimpleNode("Less2/y", "Const", {"^Identity2"}, &graph);
+  AddSimpleNode("Less2", "Less", {"VariantAdd2", "Less2/y"}, &graph);
+  AddSimpleNode("LoopCond2", "LoopCond", {"Less2"}, &graph);
+  AddSimpleNode("Switch2", "Switch", {"Merge2", "LoopCond2"}, &graph);
+  AddSimpleNode("Identity2", "Identity", {"Switch2:1"}, &graph);
+  AddSimpleNode("NextIteration2", "NextIteration", {"VariantAdd2"}, &graph);
+  AddSimpleNode("Exit2", "Exit", {"Switch2"}, &graph);
 
   GrapplerItem item;
   item.graph = graph;
 
-  LoopOptimizer optimizer;
+  LoopOptimizer optimizer(RewriterConfig::AGGRESSIVE);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -544,50 +482,27 @@ TEST_F(LoopOptimizerTest, NoOp) {
   VerifyGraphsEqual(item.graph, output, __FUNCTION__);
 }
 
-namespace {
-NodeDef* AddNode(const string& name, const string& op,
-                 const std::vector<string>& inputs,
-                 const std::vector<std::pair<string, AttrValue>>& attributes,
-                 GraphDef* graph) {
-  NodeDef* node = graph->add_node();
-  node->set_name(name);
-  node->set_op(op);
-  for (const string& input : inputs) {
-    node->add_input(input);
-  }
-  for (auto attr : attributes) {
-    (*node->mutable_attr())[attr.first] = attr.second;
-  }
-  return node;
-}
-}  // namespace
-
 TEST_F(LoopOptimizerTest, RemovePush_NoOp) {
   GrapplerItem item;
-  AttrValue frame_name;
-  frame_name.set_s("foo");
-  AttrValue type;
-  type.set_type(DT_RESOURCE);
   GraphDef& graph = item.graph;
-  AddNode("c", "Const", {}, {}, &graph);
+  AddSimpleNode("c", "Const", {}, &graph);
   // Stack with corresponding push/pop.
-  AddNode("stack1", "StackV2", {}, {}, &graph);
-  AddNode("push1", "StackPushV2", {"stack1", "c"}, {}, &graph);
-  AddNode("pop1", "StackPopV2", {"stack1"}, {}, &graph);
-  AddNode("id1", "Identity", {"pop1"}, {}, &graph);
+  AddSimpleNode("stack1", "StackV2", {}, &graph);
+  AddSimpleNode("push1", "StackPushV2", {"stack1", "c"}, &graph);
+  AddSimpleNode("pop1", "StackPopV2", {"stack1"}, &graph);
+  AddSimpleNode("id1", "Identity", {"pop1"}, &graph);
   // Stack with corresponding push/pop behind Enter.
-  AddNode("stack2", "StackV2", {}, {}, &graph);
-  AddNode("push_enter", "Enter", {"stack2"},
-          {{"T", type}, {"frame_name", frame_name}}, &graph);
-  AddNode("push2", "StackPushV2", {"push_enter", "c"}, {}, &graph);
-  AddNode("pop_enter", "Enter", {"stack2"},
-          {{"T", type}, {"frame_name", frame_name}}, &graph);
-  AddNode("pop2", "StackPopV2", {"pop_enter"}, {}, &graph);
-  AddNode("id2", "Identity", {"pop2"}, {}, &graph);
+  AddSimpleNode("stack2", "StackV2", {}, &graph);
+  AddEnterNode("enter2_c", "frame_name", false, 1, {"c"}, &graph);
+  AddEnterNode("enter2_stack2", "frame_name", false, 1, {"stack2"}, &graph);
+  AddSimpleNode("push2", "StackPushV2", {"enter2_stack2", "enter2_c"}, &graph);
+  AddSimpleNode("pop2", "StackPopV2", {"enter2_stack2"}, &graph);
+  AddSimpleNode("id2", "Identity", {"pop2"}, &graph);
   // Stack with unexpected op type in fanout of Stack.
-  AddNode("stack3", "StackV2", {}, {}, &graph);
-  AddNode("push3", "StackPushV2", {"stack3", "c"}, {}, &graph);
-  AddNode("stop", "StopGradient", {"stack3"}, {}, &graph);
+  AddSimpleNode("stack3", "StackV2", {}, &graph);
+  AddSimpleNode("push3", "StackPushV2", {"stack3", "c"}, &graph);
+  AddSimpleNode("stop", "StopGradient", {"stack3"}, &graph);
+
   LoopOptimizer optimizer;
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
@@ -598,29 +513,32 @@ TEST_F(LoopOptimizerTest, RemovePush_NoOp) {
 TEST_F(LoopOptimizerTest, RemovePushWithoutMatchingPop) {
   GrapplerItem item;
   GraphDef& graph = item.graph;
-  AttrValue frame_name;
-  frame_name.set_s("foo");
-  AttrValue type;
-  type.set_type(DT_RESOURCE);
-  AddNode("c", "Const", {}, {}, &graph);
+  AddSimpleNode("c", "Const", {}, &graph);
   // Push without Pop.
-  AddNode("stack1", "StackV2", {}, {}, &graph);
-  AddNode("push1", "StackPushV2", {"stack1", "c"}, {}, &graph);
+  AddSimpleNode("stack1", "StackV2", {}, &graph);
+  AddSimpleNode("push1", "StackPushV2", {"stack1", "c"}, &graph);
   // Push without Pop behind Enter.
-  AddNode("stack2", "StackV2", {}, {}, &graph);
-  AddNode("push_enter", "Enter", {"stack2"},
-          {{"T", type}, {"frame_name", frame_name}}, &graph);
-  AddNode("push2", "StackPushV2", {"push_enter", "c"}, {}, &graph);
+  AddSimpleNode("stack2", "StackV2", {}, &graph);
+  AddEnterNode("enter_c", "frame_name", false, 1, {"c"}, &graph);
+  AddEnterNode("enter_stack2", "frame_name", false, 1, {"stack2"}, &graph);
+  AddSimpleNode("push2", "StackPushV2", {"enter_stack2", "enter_c"}, &graph);
   // Pop without consumer.
-  AddNode("stack3", "StackV2", {}, {}, &graph);
-  AddNode("push3", "StackPushV2", {"stack3", "c"}, {}, &graph);
-  AddNode("pop3", "StackPopV2", {"stack3"}, {}, &graph);
+  AddSimpleNode("stack3", "StackV2", {}, &graph);
+  AddSimpleNode("push3", "StackPushV2", {"stack3", "c"}, &graph);
+  AddSimpleNode("pop3", "StackPopV2", {"stack3"}, &graph);
+  // Push for a Pop without consumer that is fetched should not be removed.
+  AddSimpleNode("stack4", "StackV2", {}, &graph);
+  AddSimpleNode("push4", "StackPushV2", {"stack4", "c"}, &graph);
+  AddSimpleNode("pop4", "StackPopV2", {"stack4"}, &graph);
+
+  item.fetch.push_back("pop4");
 
   LoopOptimizer optimizer;
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
-  EXPECT_EQ(9, output.node_size());
+
+  EXPECT_EQ(13, output.node_size());
   for (int i = 0; i < output.node_size(); ++i) {
     const NodeDef& node = output.node(i);
     if (node.name() == "push1") {
@@ -631,8 +549,8 @@ TEST_F(LoopOptimizerTest, RemovePushWithoutMatchingPop) {
     } else if (node.name() == "push2") {
       EXPECT_EQ("Identity", node.op());
       EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("c", node.input(0));
-      EXPECT_EQ("^push_enter", node.input(1));
+      EXPECT_EQ("enter_c", node.input(0));
+      EXPECT_EQ("^enter_stack2", node.input(1));
     } else if (node.name() == "push3") {
       EXPECT_EQ("Identity", node.op());
       EXPECT_EQ(2, node.input_size());
diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc b/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
index 9595936e9e6158045a13ebede95d63b9291ca434..a1f80802ddc2b3c959a74e010f6b45cb421864cf 100644
--- a/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
@@ -426,7 +426,7 @@ TEST_F(MemoryOptimizerTest, AccumulationRewrites) {
   EXPECT_EQ(4, count);
 
   std::vector<string> fetch = {"a", "b", "c", "e"};
-  auto tensors = EvaluateNodes(output, fetch);
+  auto tensors = EvaluateNodes(output, fetch, {});
   EXPECT_EQ(4, tensors.size());
 
   for (int i = 0; i < tensors[0].NumElements(); ++i) {
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index 3a764937fd05b816316b27c38ddf9163180908ef..5723e397abe2348bec82fb939ea8bfca1df72eb7 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/auto_parallel.h"
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
 #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
+#include "tensorflow/core/grappler/optimizers/debug_stripper.h"
 #include "tensorflow/core/grappler/optimizers/dependency_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/function_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
@@ -27,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/loop_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/memory_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/model_pruner.h"
+#include "tensorflow/core/grappler/utils/colocation.h"
 #include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/lib/core/status.h"
 
@@ -43,22 +45,21 @@ int64 NumEdges(const GraphDef& graph) {
 }
 
 string PrintSizesBeforeAfter(const GraphDef& before, const GraphDef& after) {
-  return strings::StrCat("Graph size before: ", before.node_size(), " nodes, ",
-                         NumEdges(before),
-                         " edges. Graph size after: ", after.node_size(),
-                         " nodes, ", NumEdges(after), " edges.");
+  return strings::StrCat("Graph size after: ", after.node_size(), " nodes (",
+                         after.node_size() - before.node_size(), "), ",
+                         NumEdges(after), " edges (",
+                         NumEdges(after) - NumEdges(before), ")");
 }
 }  // namespace
 
 std::unique_ptr<GraphOptimizer> MetaOptimizer::NewOptimizer(
     const string& optimizer) {
-  VLOG(1) << "Adding graph optimization pass: " << optimizer;
   std::unique_ptr<GraphOptimizer> graph_optimizer;
   if (optimizer == "pruning") {
     graph_optimizer.reset(new ModelPruner());
   }
   if (optimizer == "function") {
-    graph_optimizer.reset(new FunctionOptimizer());
+    graph_optimizer.reset(new FunctionOptimizer(cfg_.function_optimization()));
   }
   if (optimizer == "constfold") {
     graph_optimizer.reset(new ConstantFolding(cpu_device_));
@@ -84,6 +85,9 @@ std::unique_ptr<GraphOptimizer> MetaOptimizer::NewOptimizer(
     graph_optimizer.reset(
         new DependencyOptimizer(cfg_.dependency_optimization()));
   }
+  if (optimizer == "debug_stripper") {
+    graph_optimizer.reset(new DebugStripper());
+  }
   return graph_optimizer;
 }
 
@@ -95,8 +99,12 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
       optimizers.push_back(std::unique_ptr<GraphOptimizer>(new ModelPruner()));
     }
     if (cfg_.function_optimization() != RewriterConfig::OFF) {
+      optimizers.push_back(std::unique_ptr<GraphOptimizer>(
+          new FunctionOptimizer(cfg_.function_optimization())));
+    }
+    if (cfg_.debug_stripper() == RewriterConfig::ON) {
       optimizers.push_back(
-          std::unique_ptr<GraphOptimizer>(new FunctionOptimizer()));
+          std::unique_ptr<GraphOptimizer>(new DebugStripper()));
     }
     if (cfg_.constant_folding() != RewriterConfig::OFF) {
       optimizers.push_back(std::unique_ptr<GraphOptimizer>(
@@ -106,7 +114,7 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
       optimizers.push_back(std::unique_ptr<GraphOptimizer>(
           new ArithmeticOptimizer(cfg_.arithmetic_optimization())));
     }
-    if (cfg_.loop_optimization() == RewriterConfig::ON) {
+    if (cfg_.loop_optimization() != RewriterConfig::OFF) {
       optimizers.push_back(std::unique_ptr<GraphOptimizer>(
           new LoopOptimizer(cfg_.loop_optimization())));
     }
@@ -136,8 +144,9 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
     }
   } else {
     const std::set<string> available_optimizers = {
-        "pruning",      "function",   "constfold", "layout",    "memory",
-        "autoparallel", "arithmetic", "loop",      "dependency"};
+        "pruning",    "function",      "constfold",  "layout",
+        "memory",     "autoparallel",  "arithmetic", "loop",
+        "dependency", "debug_stripper"};
     std::vector<string> custom_optimizer_names;
     for (const auto& optimizer_name : cfg_.optimizers()) {
       if (available_optimizers.find(optimizer_name) !=
@@ -162,46 +171,58 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
     return Status::OK();
   }
 
+  // Some optimizers should be run only once.
+  const std::set<string> run_once_optimizers = {"layout"};
   bool already_optimized = false;
-  for (const auto& optimizer : optimizers) {
-    if (!already_optimized) {
-      Status status = optimizer->Optimize(cluster, item, optimized_graph);
-      string result;
-      if (!status.ok()) {
-        VLOG(1) << "Not able to apply optimizer " << optimizer->name()
-                << ". Return status: " << status.ToString();
-        result = status.ToString();
-      } else {
-        already_optimized = true;
-        result = strings::StrCat(
-            "OK. ", PrintSizesBeforeAfter(item.graph, *optimized_graph));
+  const int num_iterations =
+      cfg_.meta_optimizer_iterations() == RewriterConfig::DEFAULT_NUM_ITERS
+          ? 1
+          : cfg_.meta_optimizer_iterations();
+  for (int iteration = 0; iteration < num_iterations; ++iteration) {
+    VLOG(1) << "Starting optimization iteration " << iteration + 1;
+    for (const auto& optimizer : optimizers) {
+      if (iteration > 0 && run_once_optimizers.count(optimizer->name())) {
+        continue;
       }
-      result_.push_back(std::make_pair(optimizer->name(), result));
-      VLOG(1) << "Optimizer " << optimizer->name()
-              << " return status: " << result;
-    } else {
-      GrapplerItem optimized_item(item, std::move(*optimized_graph));
-      Status status =
-          optimizer->Optimize(cluster, optimized_item, optimized_graph);
-      string result;
-      if (!status.ok()) {
-        VLOG(1) << "Not able to apply optimizer " << optimizer->name()
-                << ". Return status: " << status.ToString();
-        optimized_graph->Swap(&optimized_item.graph);
-        result = status.ToString();
+      if (!already_optimized) {
+        Status status = optimizer->Optimize(cluster, item, optimized_graph);
+        string result;
+        if (!status.ok()) {
+          VLOG(1) << "Not able to apply optimizer " << optimizer->name()
+                  << ". Return status: " << status.ToString();
+          result = status.ToString();
+        } else {
+          already_optimized = true;
+          result = strings::StrCat(
+              "OK. ", PrintSizesBeforeAfter(item.graph, *optimized_graph));
+        }
+        result_.push_back(std::make_pair(optimizer->name(), result));
+        VLOG(1) << "Optimizer " << optimizer->name()
+                << " return status: " << result;
       } else {
-        result = strings::StrCat(
-            "OK. ",
-            PrintSizesBeforeAfter(optimized_item.graph, *optimized_graph));
+        GrapplerItem optimized_item(item, std::move(*optimized_graph));
+        Status status =
+            optimizer->Optimize(cluster, optimized_item, optimized_graph);
+        string result;
+        if (!status.ok()) {
+          VLOG(1) << "Not able to apply optimizer " << optimizer->name() << ": "
+                  << status.ToString();
+          optimized_graph->Swap(&optimized_item.graph);
+          result = status.ToString();
+        } else {
+          result = strings::StrCat(
+              optimizer->name(), ": ",
+              PrintSizesBeforeAfter(optimized_item.graph, *optimized_graph));
+        }
+        result_.push_back(std::make_pair(optimizer->name(), result));
+        VLOG(1) << result;
       }
-      result_.push_back(std::make_pair(optimizer->name(), result));
-      VLOG(1) << "Optimizer " << optimizer->name()
-              << " return status: " << result;
     }
   }
 
   if (already_optimized) {
     TF_RETURN_IF_ERROR(TopologicalSort(optimized_graph));
+    ReassignColocation(optimized_graph);
     // Make sure that the optimizers preserved the graph version and library.
     DCHECK_GE(optimized_graph->library().function_size(),
               item.graph.library().function_size());
@@ -234,10 +255,11 @@ bool MetaOptimizerEnabled(const RewriterConfig& cfg) {
          cfg.function_optimization() != RewriterConfig::OFF ||
          cfg.constant_folding() != RewriterConfig::OFF ||
          cfg.arithmetic_optimization() != RewriterConfig::OFF ||
-         cfg.loop_optimization() == RewriterConfig::ON ||
+         cfg.loop_optimization() != RewriterConfig::OFF ||
          cfg.dependency_optimization() != RewriterConfig::OFF ||
          cfg.auto_parallel().enable() ||
          cfg.memory_optimization() != RewriterConfig::NO_MEM_OPT ||
+         cfg.debug_stripper() == RewriterConfig::ON ||
          !cfg.optimizers().empty();
 }
 
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
index 536347d8348738e1755e920f3f08c2d4858cb256..d9a386b9be2cecbbd7562c38bc24626ff966fa46 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
@@ -72,6 +72,20 @@ TEST(MetaOptimizerTest, RunsCustomOptimizer) {
   EXPECT_TRUE(TestOptimizer::IsOptimized());
 }
 
+TEST(MetaOptimizerTest, RunOptimizersTwice) {
+  TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"});
+  GrapplerItem item;
+  CHECK(fake_input.NextItem(&item));
+
+  RewriterConfig rewriter_config;
+  rewriter_config.set_meta_optimizer_iterations(RewriterConfig::TWO);
+
+  MetaOptimizer optimizer(nullptr, rewriter_config);
+  GraphDef output;
+  const Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/static_schedule.cc b/tensorflow/core/grappler/optimizers/static_schedule.cc
index 450e85340796fdde9afdfebbd0eb9a724cb9440a..5206e9957dc75c13a03dfcb060b8b3b3dc732ad8 100644
--- a/tensorflow/core/grappler/optimizers/static_schedule.cc
+++ b/tensorflow/core/grappler/optimizers/static_schedule.cc
@@ -40,6 +40,12 @@ static Costs::NanoSeconds PredictExecutionTime(
     op_context.op_info.add_inputs()->Swap(&input);
   }
 
+  std::vector<OpInfo::TensorProperties> outputs =
+      properties.GetOutputProperties(node.name());
+  for (auto& output : outputs) {
+    op_context.op_info.add_outputs()->Swap(&output);
+  }
+
   DeviceProperties device = placer.get_device(node);
   op_context.op_info.mutable_device()->Swap(&device);
 
diff --git a/tensorflow/core/grappler/optimizers/static_schedule_test.cc b/tensorflow/core/grappler/optimizers/static_schedule_test.cc
index 08580d92842377c2dd999950b2e01bef01e2fee6..d632e460e7ccfc092945805a9a0b6b4b4c2215d1 100644
--- a/tensorflow/core/grappler/optimizers/static_schedule_test.cc
+++ b/tensorflow/core/grappler/optimizers/static_schedule_test.cc
@@ -64,17 +64,17 @@ TEST_F(StaticScheduleTest, BasicGraph) {
     if (time.first->name() == "Const/Const") {
       EXPECT_EQ(Costs::NanoSeconds(1), time.second);
     } else if (time.first->name() == "x") {
-      EXPECT_EQ(Costs::NanoSeconds(250001), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(1500001), time.second);
     } else if (time.first->name() == "Square") {
-      EXPECT_EQ(Costs::NanoSeconds(1500004), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(4000004), time.second);
     } else if (time.first->name() == "Square_1") {
-      EXPECT_EQ(Costs::NanoSeconds(2750007), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(6500007), time.second);
     } else if (time.first->name() == "Square_2") {
-      EXPECT_EQ(Costs::NanoSeconds(4000010), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(9000010), time.second);
     } else if (time.first->name() == "Square_3") {
-      EXPECT_EQ(Costs::NanoSeconds(5250013), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(11500013), time.second);
     } else if (time.first->name() == "y") {
-      EXPECT_EQ(Costs::NanoSeconds(6500013), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(14000013), time.second);
     }
   }
 }
@@ -110,13 +110,13 @@ TEST_F(StaticScheduleTest, BasicGraphWithCtrlDependencies) {
     if (time.first->name() == "a") {
       EXPECT_EQ(Costs::NanoSeconds(1), time.second);
     } else if (time.first->name() == "b") {
-      EXPECT_EQ(Costs::NanoSeconds(12500001), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(25000001), time.second);
     } else if (time.first->name() == "c") {
-      EXPECT_EQ(Costs::NanoSeconds(12500002), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(25000002), time.second);
     } else if (time.first->name() == "d") {
-      EXPECT_EQ(Costs::NanoSeconds(12500003), time.second);
-    } else if (time.first->name() == "e") {
       EXPECT_EQ(Costs::NanoSeconds(25000003), time.second);
+    } else if (time.first->name() == "e") {
+      EXPECT_EQ(Costs::NanoSeconds(50000003), time.second);
     }
   }
 }
@@ -142,17 +142,17 @@ TEST_F(StaticScheduleTest, RequiredTimes) {
 
   for (auto time : required_times) {
     if (time.first->name() == "Const/Const") {
-      EXPECT_EQ(Costs::NanoSeconds(-6500012), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(-14000012), time.second);
     } else if (time.first->name() == "x") {
-      EXPECT_EQ(Costs::NanoSeconds(-6250012), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(-12500012), time.second);
     } else if (time.first->name() == "Square") {
-      EXPECT_EQ(Costs::NanoSeconds(-5000009), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(-10000009), time.second);
     } else if (time.first->name() == "Square_1") {
-      EXPECT_EQ(Costs::NanoSeconds(-3750006), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(-7500006), time.second);
     } else if (time.first->name() == "Square_2") {
-      EXPECT_EQ(Costs::NanoSeconds(-2500003), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(-5000003), time.second);
     } else if (time.first->name() == "Square_3") {
-      EXPECT_EQ(Costs::NanoSeconds(-1250000), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(-2500000), time.second);
     } else if (time.first->name() == "y") {
       EXPECT_EQ(Costs::NanoSeconds(0), time.second);
     }
diff --git a/tensorflow/core/grappler/optimizers/symbolic_shapes.cc b/tensorflow/core/grappler/optimizers/symbolic_shapes.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cfca2dc0d38480240c9b158ecbb3cc718bfa1ad2
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/symbolic_shapes.cc
@@ -0,0 +1,177 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/symbolic_shapes.h"
+#include "tensorflow/core/util/bcast.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+BCast::Vec ShapeDims(const TensorShapeProto& shape) {
+  BCast::Vec dims;
+  dims.reserve(shape.dim_size());
+  for (int i = 0; i < shape.dim_size(); ++i)
+    dims.push_back(shape.dim(i).size());
+  return dims;
+}
+
+}  // namespace
+
+bool IsKnown(const TensorShapeProto::Dim& dim) { return dim.size() >= 0; }
+
+bool IsKnownSymbolically(const TensorShapeProto::Dim& dim) {
+  return dim.size() <= -2;
+}
+
+bool IsUnknown(const TensorShapeProto::Dim& dim) { return dim.size() == -1; }
+
+bool ShapeIsSymbolicallyDefined(const TensorShapeProto& shape) {
+  return !shape.unknown_rank() &&
+         std::all_of(
+             shape.dim().begin(), shape.dim().end(),
+             [](const TensorShapeProto::Dim& dim) { return !IsUnknown(dim); });
+}
+
+bool ShapeIsSymbolicallyDefined(const OpInfo::TensorProperties& properties) {
+  return ShapeIsSymbolicallyDefined(properties.shape());
+}
+
+bool ShapesSymbolicallyEqual(const TensorShapeProto& left,
+                             const TensorShapeProto& right) {
+  if (left.unknown_rank() || right.unknown_rank() ||
+      left.dim_size() != right.dim_size()) {
+    return false;
+  }
+  for (int i = 0; i < left.dim_size(); ++i) {
+    const auto& ldim = left.dim(i);
+    const auto& rdim = right.dim(i);
+    if (IsUnknown(ldim) || IsUnknown(rdim) || ldim.size() != rdim.size()) {
+      return false;
+    }
+  }
+  return true;
+}
+
+bool ShapesSymbolicallyEqual(const OpInfo::TensorProperties& left,
+                             const OpInfo::TensorProperties& right) {
+  return ShapesSymbolicallyEqual(left.shape(), right.shape());
+}
+
+bool ShapesBroadcastable(const TensorShapeProto& left,
+                         const TensorShapeProto& right) {
+  if (!ShapeIsSymbolicallyDefined(left) || !ShapeIsSymbolicallyDefined(right)) {
+    return false;
+  }
+  BCast bcast(ShapeDims(left), ShapeDims(right),
+              /*fewer_dims_optimization*/ false);
+  return bcast.IsValid();
+}
+
+bool ShapesBroadcastable(const OpInfo::TensorProperties& left,
+                         const OpInfo::TensorProperties& right) {
+  return ShapesBroadcastable(left.shape(), right.shape());
+}
+
+bool CompareSymbolicallyShapedTensorSizes(const TensorShapeProto& left,
+                                          const TensorShapeProto& right) {
+  // if one of the ranks is unknown, it's impossible to compare tensor sizes
+  if (left.unknown_rank() || right.unknown_rank()) {
+    return false;
+  }
+
+  // Tensor size, computed as a product of defined dimensions
+  int64 left_defined_size = 1;
+  int64 right_defined_size = 1;
+
+  // Keep how many times each unknown dimension appeared on the left and right
+  std::unordered_map<int64, int64> left_unknown_dims;
+  std::unordered_map<int64, int64> right_unknown_dims;
+
+  // Assign unique id to every unknown dimension (-1). We are going to
+  // assign positive ids, because negative values are already used by
+  // symbolic dimensions.
+  int64 unknown_dim_id = 1;
+
+  // For each shape dimension update "defined tensor size", if shape is defined,
+  // or increment a counter for unknown dim.
+  auto process_dimensions =
+      [&unknown_dim_id](const TensorShapeProto& shape, int64* defined_size,
+                        std::unordered_map<int64, int64>* unknown_dims) {
+        for (int i = 0; i < shape.dim_size(); ++i) {
+          const auto& dim = shape.dim(i);
+          int64 dim_size = dim.size();
+          if (dim_size > 0) {
+            *defined_size *= dim_size;
+          } else if (IsUnknown(dim)) {
+            ++(*unknown_dims)[unknown_dim_id++];
+          } else if (IsKnownSymbolically(dim)) {
+            ++(*unknown_dims)[dim_size];
+          }
+        }
+      };
+
+  process_dimensions(left, &left_defined_size, &left_unknown_dims);
+  process_dimensions(right, &right_defined_size, &right_unknown_dims);
+
+  // Compute a union of unknown dimension ids appeared in both shapes
+  std::set<int64> unknown_dims;
+  for (const auto& el : left_unknown_dims) unknown_dims.insert(el.first);
+  for (const auto& el : right_unknown_dims) unknown_dims.insert(el.first);
+
+  // Cancel unknown dimensions that appeared in both shapes
+  for (int64 unknown_dim : unknown_dims) {
+    int64 co_occurrence = std::min(left_unknown_dims[unknown_dim],
+                                   right_unknown_dims[unknown_dim]);
+    left_unknown_dims[unknown_dim] -= co_occurrence;
+    right_unknown_dims[unknown_dim] -= co_occurrence;
+  }
+
+  // Count unbalanced unknown dimensions
+  int64 left_unbalanced_unknown_dims = 0;
+  int64 right_unbalanced_unknown_dims = 0;
+  for (const auto& el : left_unknown_dims)
+    left_unbalanced_unknown_dims += el.second;
+  for (const auto& el : right_unknown_dims)
+    right_unbalanced_unknown_dims += el.second;
+
+  if (left_unbalanced_unknown_dims == 0 && right_unbalanced_unknown_dims == 0) {
+    // If unknown dimensions cancelled each other, compare tensor sizes
+    // represented by defined dimensions
+    return left_defined_size < right_defined_size;
+  }
+
+  if (left_defined_size <= right_defined_size &&
+      left_unbalanced_unknown_dims == 0 && right_unbalanced_unknown_dims > 0) {
+    // If size of a 'left" tensor computed from defined dimensions less or
+    // equal, and shape on the right has unbalanced unknown dimensions, we can
+    // guarantee that shape on the left is strictly smaller (assuming that
+    // unknown dimension size is larger than 1)
+    return true;
+  }
+
+  // In every other case, assuming that unknown dimensions can be arbitrary
+  // large in size, we can't guarantee any ordering
+  return false;
+}
+
+bool CompareSymbolicallyShapedTensorSizes(
+    const OpInfo::TensorProperties& left,
+    const OpInfo::TensorProperties& right) {
+  return CompareSymbolicallyShapedTensorSizes(left.shape(), right.shape());
+}
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/symbolic_shapes.h b/tensorflow/core/grappler/optimizers/symbolic_shapes.h
new file mode 100644
index 0000000000000000000000000000000000000000..eb79bab3141579132ea2e2d2afc5733f0013a0d5
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/symbolic_shapes.h
@@ -0,0 +1,60 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_SYMBOLIC_SHAPES_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_SYMBOLIC_SHAPES_H_
+
+#include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/grappler/costs/op_performance_data.pb.h"
+
+namespace tensorflow {
+namespace grappler {
+
+bool IsKnown(const TensorShapeProto::Dim& dim);
+bool IsKnownSymbolically(const TensorShapeProto::Dim& dim);
+bool IsUnknown(const TensorShapeProto::Dim& dim);
+
+// Shape is symbolically defined, if it has a known rank, and each dimension is
+// known (dim_size >= 0), or is a symbolic dimension size (dim_size <= -2).
+bool ShapeIsSymbolicallyDefined(const TensorShapeProto& shape);
+bool ShapeIsSymbolicallyDefined(const OpInfo::TensorProperties& properties);
+
+// Shapes are symbolically equal, if they have the same rank, they are known or
+// symbolically defined, and have matching dimensions.
+bool ShapesSymbolicallyEqual(const TensorShapeProto& left,
+                             const TensorShapeProto& right);
+bool ShapesSymbolicallyEqual(const OpInfo::TensorProperties& left,
+                             const OpInfo::TensorProperties& right);
+
+// Check if two shapes can be broadcasted to each other. Both shapes must be at
+// least symbolically defined, and the have valid BCast instance.
+bool ShapesBroadcastable(const TensorShapeProto& left,
+                         const TensorShapeProto& right);
+bool ShapesBroadcastable(const OpInfo::TensorProperties& left,
+                         const OpInfo::TensorProperties& right);
+
+// Return true if can prove, that tensor of size 'left' is smaller than tensor
+// of size 'right'. Return false if it's larger or equal, or it's impossible to
+// compare because of unknown dimensions, or mismatch in symbolic dimensions.
+bool CompareSymbolicallyShapedTensorSizes(const TensorShapeProto& left,
+                                          const TensorShapeProto& right);
+bool CompareSymbolicallyShapedTensorSizes(
+    const OpInfo::TensorProperties& left,
+    const OpInfo::TensorProperties& right);
+
+}  // namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_SYMBOLIC_SHAPES_H_
diff --git a/tensorflow/core/grappler/optimizers/symbolic_shapes_test.cc b/tensorflow/core/grappler/optimizers/symbolic_shapes_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5ef9f6592571062564d16e5fb282b1dd85d074ef
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/symbolic_shapes_test.cc
@@ -0,0 +1,95 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/symbolic_shapes.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+class SymbolicShapesTest : public ::testing::Test {
+ protected:
+  TensorShapeProto MakeUnknown() {
+    TensorShapeProto shape;
+    shape.set_unknown_rank(true);
+    return shape;
+  }
+
+  TensorShapeProto MakeShape(std::vector<int> dims) {
+    TensorShapeProto shape;
+    for (int dim_size : dims) {
+      TensorShapeProto::Dim dim;
+      dim.set_size(dim_size);
+      *shape.add_dim() = dim;
+    }
+    return shape;
+  }
+};
+
+bool operator<(const TensorShapeProto& lhs, const TensorShapeProto& rhs) {
+  return CompareSymbolicallyShapedTensorSizes(lhs, rhs);
+}
+
+TEST_F(SymbolicShapesTest, ShapeIsSymbolicallyDefined) {
+  EXPECT_FALSE(ShapeIsSymbolicallyDefined(MakeUnknown()));
+  EXPECT_FALSE(ShapeIsSymbolicallyDefined(MakeShape({-1, 2})));
+
+  EXPECT_TRUE(ShapeIsSymbolicallyDefined(MakeShape({1, 2})));
+  EXPECT_TRUE(ShapeIsSymbolicallyDefined(MakeShape({-2, 2})));
+}
+
+TEST_F(SymbolicShapesTest, ShapesSymbolicallyEqual) {
+  EXPECT_FALSE(ShapesSymbolicallyEqual(MakeUnknown(), MakeUnknown()));
+  EXPECT_FALSE(ShapesSymbolicallyEqual(MakeShape({-1, 2}), MakeShape({-1, 2})));
+  EXPECT_FALSE(ShapesSymbolicallyEqual(MakeShape({-2, 2}), MakeShape({-3, 2})));
+
+  EXPECT_TRUE(ShapesSymbolicallyEqual(MakeShape({1, 2}), MakeShape({1, 2})));
+  EXPECT_TRUE(ShapesSymbolicallyEqual(MakeShape({-2, 2}), MakeShape({-2, 2})));
+}
+
+TEST_F(SymbolicShapesTest, ShapesBroadcastable) {
+  EXPECT_FALSE(ShapesBroadcastable(MakeUnknown(), MakeUnknown()));
+  EXPECT_FALSE(ShapesBroadcastable(MakeShape({-2}), MakeShape({1, -3})));
+  EXPECT_FALSE(ShapesBroadcastable(MakeShape({-1, 2}), MakeShape({-1, 2})));
+  EXPECT_FALSE(ShapesBroadcastable(MakeShape({-2, 2}), MakeShape({-3, 2})));
+  EXPECT_FALSE(ShapesBroadcastable(MakeShape({-2, 4}), MakeShape({-2, 8})));
+
+  EXPECT_TRUE(ShapesBroadcastable(MakeShape({1, 2}), MakeShape({1, 2})));
+  EXPECT_TRUE(ShapesBroadcastable(MakeShape({-2, 2}), MakeShape({-2, 2})));
+  EXPECT_TRUE(ShapesBroadcastable(MakeShape({-2, 32}), MakeShape({-2, 1})));
+  EXPECT_TRUE(ShapesBroadcastable(MakeShape({-2, 1}), MakeShape({1, -2})));
+  EXPECT_TRUE(ShapesBroadcastable(MakeShape({-2, 1}), MakeShape({1, -3})));
+  EXPECT_TRUE(ShapesBroadcastable(MakeShape({-3}), MakeShape({-2, -3})));
+}
+
+TEST_F(SymbolicShapesTest, CompareSymbolicallyShapedTensorSizes) {
+  EXPECT_TRUE(MakeShape({1, 1, 32}) < MakeShape({32, 32}));
+  EXPECT_TRUE(MakeShape({1, 32, 32}) < MakeShape({2048}));
+  EXPECT_TRUE(MakeShape({1, -2, 32}) < MakeShape({-2, 32, 32}));
+  EXPECT_TRUE(MakeShape({1, 32, 32}) < MakeShape({-2, 32, 32}));
+  EXPECT_TRUE(MakeShape({1, 32, 32}) < MakeShape({-1, 32, 32}));
+  EXPECT_TRUE(MakeShape({1, -2, 32}) < MakeShape({-2, -2, 32}));
+
+  EXPECT_FALSE(MakeShape({1, -2, 32}) < MakeShape({-3, 32, 32}));
+  EXPECT_FALSE(MakeShape({1, -1, 32}) < MakeShape({1, -1, 32}));
+  EXPECT_FALSE(MakeShape({1, -1, 32}) < MakeShape({-1, -1, 32}));
+  EXPECT_FALSE(MakeShape({-1, -1, 32}) < MakeShape({1, -1, 32}));
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils.cc b/tensorflow/core/grappler/utils.cc
index eb1f882ff1d21034e233987fb778d295f00bec85..5893f286ed267f0600a40ef58eeff9f98b472e2e 100644
--- a/tensorflow/core/grappler/utils.cc
+++ b/tensorflow/core/grappler/utils.cc
@@ -33,13 +33,23 @@ namespace {
 template <typename T>
 bool SafeSetScalarTensorValue(double value, Tensor* tensor) {
   using RealType = typename Eigen::NumTraits<T>::Real;
-  if (value > std::numeric_limits<RealType>::max() ||
-      value < std::numeric_limits<RealType>::min()) {
+  if (value > static_cast<double>(std::numeric_limits<RealType>::max()) ||
+      value < static_cast<double>(std::numeric_limits<RealType>::min())) {
     return false;
   }
   tensor->flat<T>()(0) = static_cast<T>(value);
   return true;
 }
+
+// Is 'node' an operator that consumes only the shape of its input, not the
+// data itself?
+// TODO(ezhulenev): move to op_types.h. Requires to break circular dependency.
+// TODO(ezhulenev): what about Identity passing tensor to Shape consumer?
+bool IsShapeConsumer(const NodeDef& node) {
+  const string& op = node.op();
+  return op == "Shape" || op == "ShapeN" || op == "Rank" || op == "Size";
+}
+
 }  // namespace
 
 NodeMap::NodeMap(GraphDef* graph) {
@@ -245,6 +255,14 @@ int NumOutputs(const NodeDef& node, GraphDef* graph) {
   return num_outputs;
 }
 
+bool HasControlInputs(const NodeDef& node) {
+  int num_inputs = node.input_size();
+  if (num_inputs > 0 && IsControlInput(node.input(num_inputs - 1))) {
+    return true;
+  }
+  return false;
+}
+
 int NumNonControlInputs(const NodeDef& node) {
   int num_inputs = node.input_size();
   for (const string& input : node.input()) {
@@ -270,6 +288,22 @@ int NumNonControlOutputs(const NodeDef& node, const NodeMap& node_map) {
   return num_outputs;
 }
 
+int NumNonControlDataOutputs(const NodeDef& node, const NodeMap& node_map) {
+  int num_data_outputs = 0;
+  for (const NodeDef* output : node_map.GetOutputs(node.name())) {
+    if (IsShapeConsumer(*output)) continue;
+
+    for (int i = 0; i < output->input_size(); ++i) {
+      const string& input = output->input(i);
+      if (!IsControlInput(input) && NodeName(input) == node.name()) {
+        ++num_data_outputs;
+        break;
+      }
+    }
+  }
+  return num_data_outputs;
+}
+
 // Returns the data type in attribute `attr_name` of `node`. If that attribute
 // doesn't exist, returns DT_INVALID.
 DataType GetDataTypeFromAttr(const NodeDef& node, const string& attr_name) {
@@ -447,8 +481,8 @@ Status SetTensorValue(DataType dtype, int value, Tensor* tensor) {
         "Expected scalar tensor, got num_elements = ", tensor->NumElements());
   }
   switch (dtype) {
-    // TODO(rmlarsen): Handle DT_HALF.
-    //    HANDLE_CASE(DT_HALF);
+    HANDLE_CASE(DT_HALF);
+    HANDLE_CASE(DT_BFLOAT16);
     HANDLE_CASE(DT_BOOL);
     HANDLE_CASE(DT_FLOAT);
     HANDLE_CASE(DT_DOUBLE);
diff --git a/tensorflow/core/grappler/utils.h b/tensorflow/core/grappler/utils.h
index fbd38c1531e3945091fcb328633a750c6a71ce2e..11555d712abd1de538aa8526f1574f249f630cbf 100644
--- a/tensorflow/core/grappler/utils.h
+++ b/tensorflow/core/grappler/utils.h
@@ -138,12 +138,19 @@ string AsControlDependency(const string& node);
 // some of the outputs may be unconnected.
 int NumOutputs(const NodeDef& node, GraphDef* graph);
 
+// Returns true iff the node has at least one control input.
+bool HasControlInputs(const NodeDef& node);
+
 // Number of connected non-control inputs.
 int NumNonControlInputs(const NodeDef& node);
 
 // Number of connected non-control outputs.
 int NumNonControlOutputs(const NodeDef& node, const NodeMap& node_map);
 
+// Number of connected non-control data outputs (Ops that consume output tensor
+// data, not just it's shape).
+int NumNonControlDataOutputs(const NodeDef& node, const NodeMap& node_map);
+
 // Removes redundant control inputs from node.
 void DedupControlInputs(NodeDef* node);
 
diff --git a/tensorflow/core/grappler/utils/BUILD b/tensorflow/core/grappler/utils/BUILD
index 939031c44b57e930b80fc7897be8e9f5e7906688..7419c26dff58067856c5e5280edcecd77a41c6c7 100644
--- a/tensorflow/core/grappler/utils/BUILD
+++ b/tensorflow/core/grappler/utils/BUILD
@@ -2,18 +2,6 @@ licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 cc_library(
     name = "scc",
     srcs = ["scc.cc"],
@@ -193,3 +181,28 @@ tf_cc_test(
         "//tensorflow/core:testlib",
     ],
 )
+
+cc_library(
+    name = "colocation",
+    srcs = ["colocation.cc"],
+    hdrs = ["colocation.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:utils",
+    ],
+)
+
+tf_cc_test(
+    name = "colocation_test",
+    size = "small",
+    srcs = ["colocation_test.cc"],
+    deps = [
+        ":colocation",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
diff --git a/tensorflow/core/grappler/utils/colocation.cc b/tensorflow/core/grappler/utils/colocation.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0573e0a8309d7525733013d27befce53a0ecc44b
--- /dev/null
+++ b/tensorflow/core/grappler/utils/colocation.cc
@@ -0,0 +1,122 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/utils/colocation.h"
+
+#include <cstring>
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/utils.h"
+
+namespace tensorflow {
+namespace grappler {
+
+namespace {
+
+// Find root node of the colocation group.
+// The map is mapping from one node name to its parent. node_name is the
+// starting node to search. By iteratively following the path from child to
+// parent, we can find the root node for the colocation group that node_name
+// belongs to.
+string GetColocationGroupRoot(std::unordered_map<string, string>* map,
+                              const string& node_name) {
+  if (map->find(node_name) == map->end()) {
+    // If node_name is not in the map, we create a new root node which points
+    // to itself.
+    map->insert({node_name, node_name});
+    return node_name;
+  }
+  string cur = node_name;
+  while ((*map)[cur] != cur) {
+    // Backtracing the map until we reach the root node.
+    cur = (*map)[cur];
+  }
+  return cur;
+}
+
+// Merge two colocation groups into one.
+// left and right is the root node of two colocation groups respectively.
+void MergeColocationGroup(std::unordered_map<string, string>* map,
+                          const string& left, const string& right) {
+  // Do nothing if left or right node is not in the map.
+  if (map->find(left) == map->end() || map->find(right) == map->end()) {
+    return;
+  }
+  if (left != right) {
+    // Make the right node a child of the left node, which merges the two
+    // groups.
+    map->at(right) = left;
+  }
+}
+}  // namespace
+
+// Use of disjoint set algorithm to build the colocation groups from the input
+// graph. The core data structure in use is a hash map from one node to its
+// parent node. Whenever we see two nodes colocate with each other, we merge
+// their colocation groups together. After we traverse all colocation pairs
+// in the graph, we will have several disjoint sets. Then we pick the root node
+// of each disjoint set as the representative node, and let all other nodes in
+// the group colocate with the representative node.
+void ReassignColocation(GraphDef* graph) {
+  constexpr char kClassAttr[] = "_class";
+  constexpr char kColocPrefix[] = "loc:@";
+
+  // A hashmap that maps from a node name to its parent node name.
+  std::unordered_map<string, string> coloc_groups;
+  NodeMap node_map(graph);
+  for (const auto& node : graph->node()) {
+    auto iter = node.attr().find(kClassAttr);
+    if (iter != node.attr().end() && iter->second.has_list()) {
+      for (const auto& str : iter->second.list().s()) {
+        size_t pos = str.find(kColocPrefix);
+        if (pos == 0) {
+          // After we find a colocation, update the colocation groups.
+          string colocate_node = str.substr(pos + strlen(kColocPrefix));
+          MergeColocationGroup(
+              &coloc_groups, GetColocationGroupRoot(&coloc_groups, node.name()),
+              GetColocationGroupRoot(&coloc_groups, colocate_node));
+        }
+      }
+    }
+  }
+
+  // We use the root node of each colocation groups as its representative
+  // node. For each node in one group, colocate with the representative node
+  // if the node is in the graph.
+  for (const auto& pair : coloc_groups) {
+    if (pair.first != pair.second) {
+      // This is a child node.
+      NodeDef* node = node_map.GetNode(pair.first);
+      if (node) {
+        // Colocate this node with the root node.
+        AttrValue new_value;
+        new_value.mutable_list()->add_s(
+            kColocPrefix + GetColocationGroupRoot(&coloc_groups, pair.first));
+        node->mutable_attr()->erase(kClassAttr);
+        node->mutable_attr()->insert({kClassAttr, new_value});
+      }
+    } else {
+      // This is a root node. Clear the _class attribute.
+      NodeDef* node = node_map.GetNode(pair.first);
+      if (node) {  // root node should always exist in the graph as guaranteed
+                   // by order of merging. Just put check here to ensure safety.
+        node->mutable_attr()->erase(kClassAttr);
+      }
+    }
+  }
+}
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/colocation.h b/tensorflow/core/grappler/utils/colocation.h
new file mode 100644
index 0000000000000000000000000000000000000000..6062db6102c50853145b15dae08994e971cca83d
--- /dev/null
+++ b/tensorflow/core/grappler/utils/colocation.h
@@ -0,0 +1,39 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_UTILS_COLOCATION_H_
+#define TENSORFLOW_CORE_GRAPPLER_UTILS_COLOCATION_H_
+
+#include <unordered_map>
+#include "tensorflow/core/framework/graph.pb.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// Evaluates the colocation relation in the graph and rewrites the new
+// colocation relation in the graph. We scan the graph nodes sequentially, and
+// builds a disjoint-sets of nodes (within each disjoint-set the nodes are
+// colocated with each other). We then select the root node of each set as a
+// representative node, and then colocate each node within the set (should also
+// exist in graph) with the representative node.
+// Note that there is current one situation this function can't handle:
+// Node A colocates with X, node B colocates with Y, X colocates with Y but
+// X, Y are removed from graph. In this case we can't know A colocates with B.
+void ReassignColocation(GraphDef* graph);
+
+}  // namespace grappler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_UTILS_COLOCATION_H_
diff --git a/tensorflow/core/grappler/utils/colocation_test.cc b/tensorflow/core/grappler/utils/colocation_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6638364240fe4a85a9bb066b812b087fe407db6e
--- /dev/null
+++ b/tensorflow/core/grappler/utils/colocation_test.cc
@@ -0,0 +1,183 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/utils/colocation.h"
+
+#include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+
+class ColocationTest : public ::testing::Test {};
+
+bool VerifyNodeHasColocation(const NodeDef& ndef, const string& coloc) {
+  if (ndef.attr().empty()) {
+    return false;
+  }
+  if (ndef.attr().find("_class") == ndef.attr().end()) {
+    return false;
+  }
+  return ndef.attr().at("_class").list().s(0) == coloc;
+}
+
+TEST(ColocationTest, ReassignColocation_SingleNode) {
+  // Node A colocates with B, but node B is not in the graph.
+  //   A
+  //   |
+  //   |
+  //  [B]
+
+  NodeDef ndef;
+  const Status status =
+      NodeDefBuilder("A", "Const").Attr("_class", {"loc:@B"}).Finalize(&ndef);
+  TF_EXPECT_OK(status);
+  GraphDef gdef = test::function::GDef({ndef});
+
+  EXPECT_EQ(1, gdef.node_size());
+  EXPECT_EQ(1, gdef.node(0).attr_size());
+
+  ReassignColocation(&gdef);
+
+  // Validates that node A's colocation info is cleared.
+  EXPECT_EQ(1, gdef.node_size());
+  EXPECT_EQ(0, gdef.node(0).attr_size());
+}
+
+TEST(ColocationTest, ReassignColocation_MultiNode_SingleGroup) {
+  // Node A, B, C colocate with X. D colocates with C. E colocates with D.
+  // Node X is not in the graph.
+  //  A   B   C---D---E
+  //  |   |   |
+  //  |   |   |
+  //  +--[X]--+
+  // After re-assign of colocation, A, B, C, D should colocate with E.
+  // A   B   C   D
+  // |   |   |   |
+  // |   |   |   |
+  // +---+-E-+---+
+
+  NodeDef ndef_a, ndef_b, ndef_c, ndef_d, ndef_e;
+  Status status =
+      NodeDefBuilder("A", "Const").Attr("_class", {"loc:@X"}).Finalize(&ndef_a);
+  TF_EXPECT_OK(status);
+  status =
+      NodeDefBuilder("B", "Const").Attr("_class", {"loc:@X"}).Finalize(&ndef_b);
+  TF_EXPECT_OK(status);
+  status =
+      NodeDefBuilder("C", "Const").Attr("_class", {"loc:@X"}).Finalize(&ndef_c);
+  TF_EXPECT_OK(status);
+  status =
+      NodeDefBuilder("D", "Const").Attr("_class", {"loc:@C"}).Finalize(&ndef_d);
+  TF_EXPECT_OK(status);
+  status =
+      NodeDefBuilder("E", "Const").Attr("_class", {"loc:@D"}).Finalize(&ndef_e);
+  TF_EXPECT_OK(status);
+  GraphDef gdef =
+      test::function::GDef({ndef_a, ndef_b, ndef_c, ndef_d, ndef_e});
+
+  EXPECT_EQ(5, gdef.node_size());
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(0), "loc:@X"));  // A
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(1), "loc:@X"));  // B
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(2), "loc:@X"));  // C
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(3), "loc:@C"));  // D
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(4), "loc:@D"));  // E
+
+  ReassignColocation(&gdef);
+
+  EXPECT_EQ(5, gdef.node_size());
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(0), "loc:@E"));  // A
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(1), "loc:@E"));  // B
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(2), "loc:@E"));  // C
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(3), "loc:@E"));  // D
+  EXPECT_EQ(0, gdef.node(4).attr_size());                        // E
+}
+
+TEST(ColocationTest, ReassignColocation_MultiNode_MultiGroup) {
+  // Before re-assign:
+  // Node A, B, C colocate with X. D colocates with C. E colocates with D.
+  // Node U, V colocates with W. Node X, W are not in the graph:
+  //  A   B   C---D---E
+  //  |   |   |
+  //  |   |   |
+  //  +--[X]--+
+  //
+  //  U       V
+  //  |       |
+  //  |       |
+  //  +--[W]--+
+  //
+  // After re-assign:
+  // A, B, C, D should colocate with E. U should colocate with V.
+  // A   B   C   D
+  // |   |   |   |
+  // |   |   |   |
+  // +---+-E-+---+
+  //
+  // U
+  // |
+  // |
+  // V
+
+  NodeDef ndef_a, ndef_b, ndef_c, ndef_d, ndef_e, ndef_u, ndef_v;
+  Status status =
+      NodeDefBuilder("A", "Const").Attr("_class", {"loc:@X"}).Finalize(&ndef_a);
+  TF_EXPECT_OK(status);
+  status =
+      NodeDefBuilder("B", "Const").Attr("_class", {"loc:@X"}).Finalize(&ndef_b);
+  TF_EXPECT_OK(status);
+  status =
+      NodeDefBuilder("C", "Const").Attr("_class", {"loc:@X"}).Finalize(&ndef_c);
+  TF_EXPECT_OK(status);
+  status =
+      NodeDefBuilder("D", "Const").Attr("_class", {"loc:@C"}).Finalize(&ndef_d);
+  TF_EXPECT_OK(status);
+  status =
+      NodeDefBuilder("E", "Const").Attr("_class", {"loc:@D"}).Finalize(&ndef_e);
+  TF_EXPECT_OK(status);
+  status =
+      NodeDefBuilder("U", "Const").Attr("_class", {"loc:@W"}).Finalize(&ndef_u);
+  TF_EXPECT_OK(status);
+  status =
+      NodeDefBuilder("V", "Const").Attr("_class", {"loc:@W"}).Finalize(&ndef_v);
+  TF_EXPECT_OK(status);
+  GraphDef gdef = test::function::GDef(
+      {ndef_a, ndef_b, ndef_c, ndef_d, ndef_e, ndef_u, ndef_v});
+
+  EXPECT_EQ(7, gdef.node_size());
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(0), "loc:@X"));  // A
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(1), "loc:@X"));  // B
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(2), "loc:@X"));  // C
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(3), "loc:@C"));  // D
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(4), "loc:@D"));  // E
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(5), "loc:@W"));  // U
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(6), "loc:@W"));  // V
+
+  ReassignColocation(&gdef);
+
+  EXPECT_EQ(7, gdef.node_size());
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(0), "loc:@E"));  // A
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(1), "loc:@E"));  // B
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(2), "loc:@E"));  // C
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(3), "loc:@E"));  // D
+  EXPECT_EQ(0, gdef.node(4).attr_size());                        // E
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(5), "loc:@V"));  // U
+  EXPECT_EQ(0, gdef.node(6).attr_size());                        // V
+}
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/grappler_test.cc b/tensorflow/core/grappler/utils/grappler_test.cc
index 89c3aa82bf21ef089580b29dac3d9b8e3eada87a..910b0acaefbb10e1da24ab9ec5bfa95b1b5710d4 100644
--- a/tensorflow/core/grappler/utils/grappler_test.cc
+++ b/tensorflow/core/grappler/utils/grappler_test.cc
@@ -17,27 +17,49 @@ limitations under the License.
 #include <memory>
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/protobuf/rewriter_config.pb.h"
 #include "tensorflow/core/public/session.h"
 
 namespace tensorflow {
 namespace grappler {
 
+GrapplerTest::GrapplerTest() {
+  // Turn off all the automatic optimizations to ensure that we run the graph
+  // exactly as it is given to us. This ensures that we can compare the results
+  // before and after manual optimization, without any of the automatic
+  // optimizations interfering in the comparison.
+  RewriterConfig* cfg =
+      options_.config.mutable_graph_options()->mutable_rewrite_options();
+  cfg->set_constant_folding(RewriterConfig::OFF);
+  cfg->set_arithmetic_optimization(RewriterConfig::OFF);
+  cfg->set_dependency_optimization(RewriterConfig::OFF);
+  cfg->set_loop_optimization(RewriterConfig::OFF);
+  cfg->set_function_optimization(RewriterConfig::OFF);
+  cfg->set_layout_optimizer(RewriterConfig::OFF);
+  cfg->set_debug_stripper(RewriterConfig::OFF);
+}
+
+std::vector<Tensor> GrapplerTest::EvaluateNodes(
+    const GraphDef& graph, const std::vector<string>& node_names) const {
+  return EvaluateNodes(graph, node_names, {});
+}
+
 std::vector<Tensor> GrapplerTest::EvaluateNodes(
-    const GraphDef& graph, const std::vector<string>& node_names) {
-  SessionOptions options;
-  std::unique_ptr<tensorflow::Session> session(NewSession(options));
+    const GraphDef& graph, const std::vector<string>& node_names,
+    const std::vector<std::pair<string, Tensor>>& inputs) const {
+  std::unique_ptr<tensorflow::Session> session(NewSession(options_));
   TF_CHECK_OK(session->Create(graph));
   RunOptions run_options;
   std::vector<Tensor> output_tensors;
-  TF_CHECK_OK(session->Run(run_options, {}, node_names, node_names,
+  TF_CHECK_OK(session->Run(run_options, inputs, node_names, node_names,
                            &output_tensors, nullptr));
   TF_CHECK_OK(session->Close());
   return output_tensors;
 }
 
-std::vector<Tensor> GrapplerTest::EvaluateFetchNodes(const GrapplerItem& item) {
-  SessionOptions options;
-  std::unique_ptr<tensorflow::Session> session(NewSession(options));
+std::vector<Tensor> GrapplerTest::EvaluateFetchNodes(
+    const GrapplerItem& item) const {
+  std::unique_ptr<tensorflow::Session> session(NewSession(options_));
   TF_CHECK_OK(session->Create(item.graph));
   RunOptions run_options;
   if (!item.init_ops.empty()) {
@@ -52,17 +74,23 @@ std::vector<Tensor> GrapplerTest::EvaluateFetchNodes(const GrapplerItem& item) {
   return output_tensors;
 }
 
-void GrapplerTest::AddNode(const string& name, const string& op,
-                           const std::vector<string>& inputs, GraphDef* graph) {
-  auto* node = graph->add_node();
+NodeDef* GrapplerTest::AddNode(
+    const string& name, const string& op, const std::vector<string>& inputs,
+    const std::vector<std::pair<string, AttrValue>>& attributes,
+    GraphDef* graph) const {
+  NodeDef* node = graph->add_node();
   node->set_name(name);
   node->set_op(op);
-  for (const auto& input : inputs) {
+  for (const string& input : inputs) {
     node->add_input(input);
   }
+  for (auto attr : attributes) {
+    (*node->mutable_attr())[attr.first] = attr.second;
+  }
+  return node;
 }
 
-void GrapplerTest::CompareGraphs(GraphDef want, GraphDef got) {
+void GrapplerTest::CompareGraphs(GraphDef want, GraphDef got) const {
   auto comparator = [](const NodeDef& n1, const NodeDef& n2) -> bool {
     return n1.name() < n2.name();
   };
diff --git a/tensorflow/core/grappler/utils/grappler_test.h b/tensorflow/core/grappler/utils/grappler_test.h
index 3df6625d5ce45edfe2efecfff9e341c30b8772ed..e1394b9c35f2cd5ec55db19dc165352399faf568 100644
--- a/tensorflow/core/grappler/utils/grappler_test.h
+++ b/tensorflow/core/grappler/utils/grappler_test.h
@@ -18,26 +18,37 @@ limitations under the License.
 
 #include <vector>
 
+#include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
 
 namespace tensorflow {
 namespace grappler {
 
 class GrapplerTest : public ::testing::Test {
+ public:
+  GrapplerTest();
+
  protected:
-  std::vector<Tensor> EvaluateNodes(const GraphDef& graph,
-                                    const std::vector<string>& node_names);
+  std::vector<Tensor> EvaluateNodes(
+      const GraphDef& graph, const std::vector<string>& node_names) const;
+
+  std::vector<Tensor> EvaluateNodes(
+      const GraphDef& graph, const std::vector<string>& node_names,
+      const std::vector<std::pair<string, Tensor>>& inputs) const;
 
-  std::vector<Tensor> EvaluateFetchNodes(const GrapplerItem& item);
+  std::vector<Tensor> EvaluateFetchNodes(const GrapplerItem& item) const;
 
-  void AddNode(const string& name, const string& op,
-               const std::vector<string>& inputs, GraphDef* graph);
+  NodeDef* AddNode(const string& name, const string& op,
+                   const std::vector<string>& inputs,
+                   const std::vector<std::pair<string, AttrValue>>& attributes,
+                   GraphDef* graph) const;
 
-  void CompareGraphs(GraphDef want, GraphDef got);
+  void CompareGraphs(GraphDef want, GraphDef got) const;
 
   // Check if node 'src' is directly connected to the input($position) of 'dst'.
   bool IsNodesDirectlyConnected(const NodeMap& node_map, const string& src,
@@ -45,6 +56,18 @@ class GrapplerTest : public ::testing::Test {
 
   // Count nodes of the given op-type in a graph.
   int CountOpNodes(const GraphDef& graph, const string& op);
+
+  // Get a random tansor with given shape.
+  template <DataType DTYPE>
+  Tensor GenerateRandomTensor(const TensorShape& shape) const {
+    typedef typename EnumToDataType<DTYPE>::Type T;
+    Tensor tensor(DTYPE, shape);
+    tensor.flat<T>() = tensor.flat<T>().random();
+    return tensor;
+  }
+
+ private:
+  SessionOptions options_;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/utils_test.cc b/tensorflow/core/grappler/utils_test.cc
index eabce5b5ee7b037b7bc429abfa86ee8735bdbede..49a1996d25e78d17908b1eae04c9acbeb7e2c788 100644
--- a/tensorflow/core/grappler/utils_test.cc
+++ b/tensorflow/core/grappler/utils_test.cc
@@ -292,6 +292,47 @@ TEST_F(UtilsTest, DedupControlInputs) {
   EXPECT_EQ("gnu", foo.input(1));
 }
 
+TEST_F(UtilsTest, NumNonControlOutputs) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  //  *) Round node has control dependency edge from Add, which
+  //     is not on this scheme (ASCII graphics limitation).
+  //
+  //   *Round    [Sqrt, Shape]
+  //      |           |
+  //      |   ctrl    |
+  //     Mul ------> Add
+  //     / \         / \
+  //    x   y       a   b
+  auto x = ops::Variable(s.WithOpName("x"), {1, 2}, DT_FLOAT);
+  auto y = ops::Variable(s.WithOpName("y"), {1, 2}, DT_FLOAT);
+  auto a = ops::Variable(s.WithOpName("a"), {1, 2}, DT_FLOAT);
+  auto b = ops::Variable(s.WithOpName("b"), {1, 2}, DT_FLOAT);
+
+  auto mul = ops::Multiply(s.WithOpName("mul"), x, y);
+  auto add = ops::Add(s.WithOpName("add").WithControlDependencies(mul), a, b);
+
+  auto shape = ops::Shape(s.WithOpName("shape"), add);
+  auto sqrt = ops::Sqrt(s.WithOpName("sqrt"), add);
+
+  auto round =
+      ops::Round(s.WithOpName("round").WithControlDependencies(add), mul);
+
+  GraphDef graph;
+  TF_CHECK_OK(s.ToGraphDef(&graph));
+  NodeMap node_map(&graph);
+
+  const NodeDef* add_node = node_map.GetNode("add");
+  ASSERT_TRUE(add_node != nullptr);
+
+  // [a, b] are only non-control inputs
+  EXPECT_EQ(2, NumNonControlInputs(*add_node));
+  // [sqrt, shape] are non control outputs
+  EXPECT_EQ(2, NumNonControlOutputs(*add_node, node_map));
+  // sqrt is the only data output
+  EXPECT_EQ(1, NumNonControlDataOutputs(*add_node, node_map));
+}
+
 TEST_F(UtilsTest, DeleteNodes) {}
 
 }  // namespace
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 27a96217fd7cb58f0f789c2e4462142806bec3aa..d2a2cdd13d53cdf599e605dcbbf19c6717d23fc3 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -920,6 +920,22 @@ tf_kernel_library(
     ]) + ARRAY_DEPS,
 )
 
+tf_kernel_library(
+    name = "cudnn_rnn_kernels",
+    srcs = ["cudnn_rnn_ops.cc"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:cudnn_rnn_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:stream_executor",
+        "//tensorflow/core/kernels:bounds_check_lib",
+        "//third_party/eigen3",
+        "@farmhash_archive//:farmhash",
+    ],
+)
+
 tf_cc_test(
     name = "batch_norm_op_test",
     size = "small",
@@ -1666,6 +1682,43 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "scoped_allocator_ops",
+    prefix = "scoped_allocator_ops",
+    deps = [
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:scoped_allocator_ops_op_lib",
+    ],
+)
+
+tf_cuda_cc_test(
+    name = "scoped_allocator_ops_test",
+    srcs = ["scoped_allocator_ops_test.cc"],
+    linkstatic = tf_kernel_tests_linkstatic(),  #Required for benchmarking
+    deps = [
+        ":cwise_op",
+        ":dense_update_ops",
+        ":ops_testutil",
+        ":ops_util",
+        ":scoped_allocator_ops",
+        ":variable_ops",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:proto_text",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_kernel_library(
     name = "session_ops",
     prefix = "session_ops",
@@ -2475,13 +2528,13 @@ tf_kernel_library(
 tf_kernel_library(
     name = "self_adjoint_eig_op",
     prefix = "self_adjoint_eig_op",
-    deps = LINALG_DEPS,
+    deps = LINALG_DEPS + ["//tensorflow/core:lib_internal"],
 )
 
 tf_kernel_library(
     name = "self_adjoint_eig_v2_op",
     prefix = "self_adjoint_eig_v2_op",
-    deps = LINALG_DEPS + if_cuda([
+    deps = LINALG_DEPS + ["//tensorflow/core:lib_internal"] + if_cuda([
         ":cast_op",
         ":cwise_op",
     ]),
@@ -5042,6 +5095,7 @@ filegroup(
             # not used on Android. Those ops also do not compile if included,
             # unless we add the additional deps they need.
             "tf_record_reader_op.*",
+            "cudnn_rnn_ops.*",
             "lmdb_reader_op.*",
             "string_to_hash_bucket_op.*",
             "sdca_ops.*",
@@ -5231,6 +5285,7 @@ tf_cc_test(
     name = "quantization_utils_test",
     srcs = ["quantization_utils_test.cc"],
     deps = [
+        ":quantization_utils",
         ":quantized_ops",
         "//tensorflow/core:array_ops_op_lib",
         "//tensorflow/core:core_cpu",
@@ -5293,6 +5348,7 @@ tf_cc_test(
     deps = [
         ":ops_testutil",
         ":ops_util",
+        ":quantization_utils",
         ":quantized_ops",
         "//tensorflow/core:array_ops_op_lib",
         "//tensorflow/core:framework",
@@ -5354,6 +5410,7 @@ tf_cc_test(
         ":math",
         ":ops_testutil",
         ":ops_util",
+        ":quantization_utils",
         ":quantized_ops",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:client_session",
@@ -5376,6 +5433,7 @@ tf_cc_test(
     deps = [
         ":ops_testutil",
         ":ops_util",
+        ":quantization_utils",
         ":quantized_ops",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:client_session",
@@ -5440,6 +5498,7 @@ tf_cc_test(
     deps = [
         ":ops_testutil",
         ":ops_util",
+        ":quantization_utils",
         ":quantized_ops",
         "//tensorflow/core:array_ops_op_lib",
         "//tensorflow/core:framework",
@@ -5460,6 +5519,7 @@ tf_cc_test(
     deps = [
         ":ops_testutil",
         ":ops_util",
+        ":quantization_utils",
         ":quantized_ops",
         "//tensorflow/core:array_ops_op_lib",
         "//tensorflow/core:framework",
@@ -5499,6 +5559,7 @@ tf_cc_test(
     deps = [
         ":ops_testutil",
         ":ops_util",
+        ":quantization_utils",
         ":quantized_ops",
         "//tensorflow/core:array_ops_op_lib",
         "//tensorflow/core:framework",
@@ -5555,6 +5616,7 @@ tf_cc_test(
         ":math",
         ":ops_testutil",
         ":ops_util",
+        ":quantization_utils",
         ":quantized_ops",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:client_session",
@@ -5577,6 +5639,7 @@ tf_cc_test(
     deps = [
         ":ops_testutil",
         ":ops_util",
+        ":quantization_utils",
         ":quantized_ops",
         "//tensorflow/core:array_ops_op_lib",
         "//tensorflow/core:framework",
@@ -5613,6 +5676,7 @@ tf_cc_test(
     deps = [
         ":ops_testutil",
         ":ops_util",
+        ":quantization_utils",
         ":quantized_ops",
         "//tensorflow/core:array_ops_op_lib",
         "//tensorflow/core:core_cpu",
@@ -5634,6 +5698,7 @@ tf_cc_test(
     deps = [
         ":batch_norm_op",
         ":ops_testutil",
+        ":quantization_utils",
         ":quantized_ops",
         "//tensorflow/core:array_ops_op_lib",
         "//tensorflow/core:core_cpu_internal",
@@ -5894,6 +5959,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_ops_op_lib",
         "//third_party/mkl:intel_binary_blob",
+        "@mkl_dnn",
     ],
 )
 
@@ -5914,6 +5980,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_ops_op_lib",
         "//third_party/mkl:intel_binary_blob",
+        "@mkl_dnn",
     ],
 )
 
@@ -5945,6 +6012,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_ops_op_lib",
+        "//third_party/eigen3",
         "//third_party/mkl:intel_binary_blob",
         "@mkl_dnn",
     ],
@@ -5964,6 +6032,7 @@ tf_mkl_kernel_library(
     prefix = "mkl_aggregate_ops",
     deps = MATH_DEPS + [
         "//third_party/mkl:intel_binary_blob",
+        "@mkl_dnn",
     ],
 )
 
@@ -5981,6 +6050,7 @@ tf_mkl_kernel_library(
     prefix = "mkl_reshape_op",
     deps = ARRAY_DEPS + [
         "//third_party/mkl:intel_binary_blob",
+        "@mkl_dnn",
     ],
 )
 
@@ -6026,6 +6096,13 @@ cc_library(
     ],
 )
 
+tf_kernel_library(
+    name = "boosted_trees_ops",
+    deps = [
+        "//tensorflow/core/kernels/boosted_trees:boosted_trees_ops",
+    ],
+)
+
 cc_library(
     name = "captured_function",
     hdrs = ["captured_function.h"],
@@ -6077,18 +6154,6 @@ tf_kernel_library(
 # -----------------------------------------------------------------------------
 # Google-internal targets.  These must be at the end for syncrepo.
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 # Library to link with when compiling the cwise_op kernels directly,
 # e.g. for selective registration.
 # should not be linked by projects that also link the cwise_op library.
diff --git a/tensorflow/core/kernels/assign_op.h b/tensorflow/core/kernels/assign_op.h
index a312e8e8a420f7f909b20b28f84bf55597a58aba..2ed1628bf1a84bf8729a949ca1b6d66ce58bdcdc 100644
--- a/tensorflow/core/kernels/assign_op.h
+++ b/tensorflow/core/kernels/assign_op.h
@@ -77,7 +77,8 @@ class AssignOp : public OpKernel {
 
       // 1. Try to reuse the rhs.
       std::unique_ptr<Tensor> input_alias = context->forward_input(
-          1, old_lhs.dtype(), old_lhs.shape(), DEVICE_MEMORY, attr);
+          1, OpKernelContext::Params::kNoReservation /*output_index*/,
+          old_lhs.dtype(), old_lhs.shape(), DEVICE_MEMORY, attr);
       if (input_alias != nullptr) {
         // Transfer ownership to the ref.
         context->replace_ref_input(0, *input_alias.release(),
diff --git a/tensorflow/core/kernels/avgpooling_op.cc b/tensorflow/core/kernels/avgpooling_op.cc
index ec9cbc2a9b5d4c1ac6d91913fc015e139fa2a068..c581d1451f0e6740cbcf526e0fd8636ea925eb69 100644
--- a/tensorflow/core/kernels/avgpooling_op.cc
+++ b/tensorflow/core/kernels/avgpooling_op.cc
@@ -102,6 +102,9 @@ class AvgPoolingOp : public UnaryOp<T> {
   TensorFormat data_format_;
 };
 
+REGISTER_KERNEL_BUILDER(
+    Name("AvgPool").Device(DEVICE_CPU).TypeConstraint<double>("T"),
+    AvgPoolingOp<CPUDevice, double>);
 REGISTER_KERNEL_BUILDER(
     Name("AvgPool").Device(DEVICE_CPU).TypeConstraint<float>("T"),
     AvgPoolingOp<CPUDevice, float>);
@@ -189,6 +192,7 @@ namespace functor {
 
 DECLARE_GPU_SPEC(Eigen::half);
 DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(double);
 #undef DECLARE_GPU_SPEC
 }  // namespace functor
 
@@ -198,6 +202,9 @@ REGISTER_KERNEL_BUILDER(
 REGISTER_KERNEL_BUILDER(
     Name("AvgPool").Device(DEVICE_GPU).TypeConstraint<float>("T"),
     AvgPoolingOp<GPUDevice, float>);
+REGISTER_KERNEL_BUILDER(
+    Name("AvgPool").Device(DEVICE_GPU).TypeConstraint<double>("T"),
+    AvgPoolingOp<GPUDevice, double>);
 #endif  // GOOGLE_CUDA
 
 // The operation to compute AvgPool gradients.
@@ -423,6 +430,12 @@ class AvgPoolingGradOp<GPUDevice, T> : public OpKernel {
   TensorFormat data_format_;
 };
 
+REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<double>("T")
+                            .HostMemory("orig_input_shape")
+                            .Label("cudnn"),
+                        AvgPoolingGradOp<GPUDevice, double>);
 REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")
                             .Device(DEVICE_GPU)
                             .TypeConstraint<float>("T")
@@ -553,6 +566,11 @@ REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")
                             .TypeConstraint<float>("T")
                             .HostMemory("orig_input_shape"),
                         AvgPoolingGradOpCustomGPUKernel<float>);
+REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<double>("T")
+                            .HostMemory("orig_input_shape"),
+                        AvgPoolingGradOpCustomGPUKernel<double>);
 REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")
                             .Device(DEVICE_GPU)
                             .TypeConstraint<Eigen::half>("T")
diff --git a/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc b/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc
index 6537b42f1ed8856a5f701023eb5fc55ded278ec8..35511d5c313fb4b3794d00bd685ec4249580daa3 100644
--- a/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc
@@ -35,6 +35,7 @@ typedef Eigen::GpuDevice GPUDevice;
 
 DEFINE_GPU_KERNELS(Eigen::half)
 DEFINE_GPU_KERNELS(float)
+DEFINE_GPU_KERNELS(double)
 
 #undef DEFINE_GPU_KERNELS
 
@@ -99,6 +100,12 @@ bool RunAvePoolBackwardNHWC(const T* const top_diff, const int num,
   return d.ok();
 }
 
+template bool RunAvePoolBackwardNHWC(
+    const double* const top_diff, const int num, const int height,
+    const int width, const int channels, const int pooled_height,
+    const int pooled_width, const int kernel_h, const int kernel_w,
+    const int stride_h, const int stride_w, const int pad_t, const int pad_l,
+    double* const bottom_diff, const GPUDevice& d);
 template bool RunAvePoolBackwardNHWC(
     const float* const top_diff, const int num, const int height,
     const int width, const int channels, const int pooled_height,
diff --git a/tensorflow/core/kernels/batching_util/BUILD b/tensorflow/core/kernels/batching_util/BUILD
index 4397410a5cee839a70bde69f34ca72e31530565f..de05c647d6bfc80a0368ee3edba8f31bccff33f9 100644
--- a/tensorflow/core/kernels/batching_util/BUILD
+++ b/tensorflow/core/kernels/batching_util/BUILD
@@ -8,18 +8,6 @@ licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "**/google_*",
-        ],
-    ),
-)
-
 cc_library(
     name = "periodic_function_dynamic",
     srcs = ["periodic_function.cc"],
diff --git a/tensorflow/core/kernels/boosted_trees/BUILD b/tensorflow/core/kernels/boosted_trees/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..62327dfe1d044bd05966d420e557fc39edd84afd
--- /dev/null
+++ b/tensorflow/core/kernels/boosted_trees/BUILD
@@ -0,0 +1,89 @@
+# Description:
+#   OpKernels for boosted trees ops.
+
+package(
+    default_visibility = ["//tensorflow:internal"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
+load(
+    "//tensorflow/core:platform/default/build_config.bzl",
+    "tf_proto_library",
+)
+
+tf_proto_library(
+    name = "boosted_trees_proto",
+    srcs = ["boosted_trees.proto"],
+    cc_api_version = 2,
+    visibility = ["//visibility:public"],
+)
+
+tf_kernel_library(
+    name = "prediction_ops",
+    srcs = ["prediction_ops.cc"],
+    deps = [
+        ":resource_ops",
+        ":resources",
+        "//tensorflow/core:boosted_trees_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+cc_library(
+    name = "resources",
+    srcs = ["resources.cc"],
+    hdrs = ["resources.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_cc",
+    ],
+)
+
+tf_kernel_library(
+    name = "resource_ops",
+    srcs = ["resource_ops.cc"],
+    deps = [
+        ":resources",
+        "//tensorflow/core:boosted_trees_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_cc",
+    ],
+)
+
+tf_kernel_library(
+    name = "stats_ops",
+    srcs = ["stats_ops.cc"],
+    deps = [
+        "//tensorflow/core:boosted_trees_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_kernel_library(
+    name = "training_ops",
+    srcs = ["training_ops.cc"],
+    deps = [
+        ":resources",
+        "//tensorflow/core:boosted_trees_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_cc",
+    ],
+)
+
+tf_kernel_library(
+    name = "boosted_trees_ops",
+    deps = [
+        ":prediction_ops",
+        ":resource_ops",
+        ":stats_ops",
+        ":training_ops",
+    ],
+)
diff --git a/tensorflow/core/kernels/boosted_trees/boosted_trees.proto b/tensorflow/core/kernels/boosted_trees/boosted_trees.proto
new file mode 100644
index 0000000000000000000000000000000000000000..106ceedc00721f51468639a1c9e235728db8dbae
--- /dev/null
+++ b/tensorflow/core/kernels/boosted_trees/boosted_trees.proto
@@ -0,0 +1,113 @@
+syntax = "proto3";
+
+package tensorflow.boosted_trees;
+option cc_enable_arenas = true;
+option java_outer_classname = "BoostedTreesProtos";
+option java_multiple_files = true;
+option java_package = "org.tensorflow.framework";
+
+// Node describes a node in a tree.
+message Node {
+  oneof node {
+    Leaf leaf = 1;
+    BucketizedSplit bucketized_split = 2;
+  }
+  NodeMetadata metadata = 777;
+}
+
+// NodeMetadata encodes metadata associated with each node in a tree.
+message NodeMetadata {
+  // The gain associated with this node.
+  float gain = 1;
+
+  // The original leaf node before this node was split.
+  Leaf original_leaf = 2;
+}
+
+// Leaves can either hold dense or sparse information.
+message Leaf {
+  oneof leaf {
+    // See third_party/tensorflow/contrib/decision_trees/
+    // proto/generic_tree_model.proto
+    // for a description of how vector and sparse_vector might be used.
+    Vector vector = 1;
+    SparseVector sparse_vector = 2;
+  }
+  float scalar = 3;
+}
+
+message Vector {
+  repeated float value = 1;
+}
+
+message SparseVector {
+  repeated int32 index = 1;
+  repeated float value = 2;
+}
+
+message BucketizedSplit {
+  // Float feature column and split threshold describing
+  // the rule feature <= threshold.
+  int32 feature_id = 1;
+  int32 threshold = 2;
+
+  // Node children indexing into a contiguous
+  // vector of nodes starting from the root.
+  int32 left_id = 3;
+  int32 right_id = 4;
+}
+
+// Tree describes a list of connected nodes.
+// Node 0 must be the root and can carry any payload including a leaf
+// in the case of representing the bias.
+// Note that each node id is implicitly its index in the list of nodes.
+message Tree {
+  repeated Node nodes = 1;
+}
+
+message TreeMetadata {
+  // Number of layers grown for this tree.
+  int32 num_layers_grown = 2;
+
+  // Whether the tree is finalized in that no more layers can be grown.
+  bool is_finalized = 3;
+
+  // If tree was finalized and post pruning happened, it is possible that cache
+  // still refers to some nodes that were deleted or that the node ids changed
+  // (e.g. node id 5 became node id 2 due to pruning of the other branch).
+  // The mapping below allows us to understand where the old ids now map to and
+  // how the values should be adjusted due to post-pruning.
+  // The size of the list should be equal to the number of nodes in the tree
+  // before post-pruning happened.
+  // If the node was pruned, it will have new_node_id equal to the id of a node
+  // that this node was collapsed into. For a node that didn't get pruned, it is
+  // possible that its id still changed, so new_node_id will have the
+  // corresponding id in the pruned tree.
+  // If post-pruning didn't happen, or it did and it had no effect (e.g. no
+  // nodes got pruned), this list will be empty.
+  repeated PostPruneNodeUpdate post_pruned_nodes_meta = 4;
+
+  message PostPruneNodeUpdate {
+    int32 new_node_id = 1;
+    float logit_change = 2;
+  }
+}
+
+message GrowingMetadata {
+  // Number of trees that we have attempted to build. After pruning, these
+  // trees might have been removed.
+  int64 num_trees_attempted = 1;
+  // Number of layers that we have attempted to build. After pruning, these
+  // layers might have been removed.
+  int64 num_layers_attempted = 2;
+}
+
+// TreeEnsemble describes an ensemble of decision trees.
+message TreeEnsemble {
+  repeated Tree trees = 1;
+  repeated float tree_weights = 2;
+
+  repeated TreeMetadata tree_metadata = 3;
+  // Metadata that is used during the training.
+  GrowingMetadata growing_metadata = 4;
+}
diff --git a/tensorflow/core/kernels/boosted_trees/prediction_ops.cc b/tensorflow/core/kernels/boosted_trees/prediction_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b13a4505464cf74fef82eb0151f23cb6b56a434e
--- /dev/null
+++ b/tensorflow/core/kernels/boosted_trees/prediction_ops.cc
@@ -0,0 +1,263 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/boosted_trees/resources.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/refcount.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+namespace tensorflow {
+
+// The Op used during training time to get the predictions so far with the
+// current ensemble being built.
+// Expect some logits are cached from the previous step and passed through
+// to be reused.
+class BoostedTreesTrainingPredictOp : public OpKernel {
+ public:
+  explicit BoostedTreesTrainingPredictOp(OpKernelConstruction* const context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("num_bucketized_features",
+                                             &num_bucketized_features_));
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("logits_dimension", &logits_dimension_));
+    OP_REQUIRES(context, logits_dimension_ == 1,
+                errors::InvalidArgument(
+                    "Currently only one dimensional outputs are supported."));
+    OP_REQUIRES_OK(context, context->GetAttr("max_depth", &max_depth_));
+  }
+
+  void Compute(OpKernelContext* const context) override {
+    BoostedTreesEnsembleResource* resource;
+    // Get the resource.
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &resource));
+    // Release the reference to the resource once we're done using it.
+    core::ScopedUnref unref_me(resource);
+
+    // Get the inputs.
+    OpInputList bucketized_features_list;
+    OP_REQUIRES_OK(context, context->input_list("bucketized_features",
+                                                &bucketized_features_list));
+    std::vector<tensorflow::TTypes<int32>::ConstVec> batch_bucketized_features;
+    batch_bucketized_features.reserve(bucketized_features_list.size());
+    for (const Tensor& tensor : bucketized_features_list) {
+      batch_bucketized_features.emplace_back(tensor.vec<int32>());
+    }
+    const int batch_size = batch_bucketized_features[0].size();
+
+    const Tensor* cached_tree_ids_t;
+    OP_REQUIRES_OK(context,
+                   context->input("cached_tree_ids", &cached_tree_ids_t));
+    const auto cached_tree_ids = cached_tree_ids_t->vec<int32>();
+
+    const Tensor* cached_node_ids_t;
+    OP_REQUIRES_OK(context,
+                   context->input("cached_node_ids", &cached_node_ids_t));
+    const auto cached_node_ids = cached_node_ids_t->vec<int32>();
+
+    // Allocate outputs.
+    Tensor* output_partial_logits_t = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output("partial_logits",
+                                            {batch_size, logits_dimension_},
+                                            &output_partial_logits_t));
+    auto output_partial_logits = output_partial_logits_t->matrix<float>();
+
+    Tensor* output_tree_ids_t = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output("tree_ids", {batch_size},
+                                                     &output_tree_ids_t));
+    auto output_tree_ids = output_tree_ids_t->vec<int32>();
+
+    Tensor* output_node_ids_t = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output("node_ids", {batch_size},
+                                                     &output_node_ids_t));
+    auto output_node_ids = output_node_ids_t->vec<int32>();
+
+    // Indicate that the latest tree was used.
+    const int32 latest_tree = resource->num_trees() - 1;
+
+    if (latest_tree < 0) {
+      // Ensemble was empty. Nothing changes.
+      output_node_ids = cached_node_ids;
+      output_tree_ids = cached_tree_ids;
+      // All the predictions are zeros.
+      output_partial_logits.setZero();
+    } else {
+      output_tree_ids.setConstant(latest_tree);
+      auto do_work = [&resource, &batch_bucketized_features, &cached_tree_ids,
+                      &cached_node_ids, &output_partial_logits,
+                      &output_node_ids, batch_size,
+                      latest_tree](int32 start, int32 end) {
+        for (int32 i = start; i < end; ++i) {
+          int32 tree_id = cached_tree_ids(i);
+          int32 node_id = cached_node_ids(i);
+          float partial_tree_logit = 0.0;
+
+          // If the tree was pruned, returns the node id into which the
+          // current_node_id was pruned, as well the correction of the cached
+          // logit prediction.
+          resource->GetPostPruneCorrection(tree_id, node_id, &node_id,
+                                           &partial_tree_logit);
+
+          // Logic in the loop adds the cached node value again if it is a leaf.
+          // If it is not a leaf anymore we need to subtract the old node's
+          // value. The following logic handles both of these cases.
+          partial_tree_logit -= resource->node_value(tree_id, node_id);
+          float partial_all_logit = 0.0;
+          while (true) {
+            if (resource->is_leaf(tree_id, node_id)) {
+              partial_tree_logit += resource->node_value(tree_id, node_id);
+
+              // Tree is done
+              partial_all_logit +=
+                  resource->GetTreeWeight(tree_id) * partial_tree_logit;
+              partial_tree_logit = 0.0;
+              // Stop if it was the latest tree.
+              if (tree_id == latest_tree) {
+                break;
+              }
+              // Move onto other trees.
+              ++tree_id;
+              node_id = 0;
+            } else {
+              node_id = resource->next_node(tree_id, node_id, i,
+                                            batch_bucketized_features);
+            }
+          }
+          output_node_ids(i) = node_id;
+          output_partial_logits(i, 0) = partial_all_logit;
+        }
+      };
+      // Assume we will not go over more than one full tree. 4 is a magic
+      // number.
+      const int64 cost = 4 * max_depth_;
+      thread::ThreadPool* const worker_threads =
+          context->device()->tensorflow_cpu_worker_threads()->workers;
+      Shard(worker_threads->NumThreads(), worker_threads, batch_size,
+            /*cost_per_unit=*/cost, do_work);
+    }
+  }
+
+ private:
+  int32 logits_dimension_;         // the size of the output prediction vector.
+  int32 num_bucketized_features_;  // Indicates the number of features.
+  int32 max_depth_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("BoostedTreesTrainingPredict").Device(DEVICE_CPU),
+                        BoostedTreesTrainingPredictOp);
+
+// The Op to get the predictions at the evaluation/inference time.
+class BoostedTreesPredictOp : public OpKernel {
+ public:
+  explicit BoostedTreesPredictOp(OpKernelConstruction* const context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("num_bucketized_features",
+                                             &num_bucketized_features_));
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("logits_dimension", &logits_dimension_));
+    OP_REQUIRES(context, logits_dimension_ == 1,
+                errors::InvalidArgument(
+                    "Currently only one dimensional outputs are supported."));
+    OP_REQUIRES_OK(context, context->GetAttr("max_depth", &max_depth_));
+  }
+
+  void Compute(OpKernelContext* const context) override {
+    BoostedTreesEnsembleResource* resource;
+    // Get the resource.
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &resource));
+    // Release the reference to the resource once we're done using it.
+    core::ScopedUnref unref_me(resource);
+
+    // Get the inputs.
+    OpInputList bucketized_features_list;
+    OP_REQUIRES_OK(context, context->input_list("bucketized_features",
+                                                &bucketized_features_list));
+    std::vector<tensorflow::TTypes<int32>::ConstVec> batch_bucketized_features;
+    batch_bucketized_features.reserve(bucketized_features_list.size());
+    for (const Tensor& tensor : bucketized_features_list) {
+      batch_bucketized_features.emplace_back(tensor.vec<int32>());
+    }
+    const int batch_size = batch_bucketized_features[0].size();
+
+    // Allocate outputs.
+    Tensor* output_logits_t = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(
+                                "logits", {batch_size, logits_dimension_},
+                                &output_logits_t));
+    auto output_logits = output_logits_t->matrix<float>();
+
+    const int32 latest_tree = resource->num_trees() - 1;
+
+    auto do_work = [&resource, &batch_bucketized_features, &output_logits,
+                    batch_size, latest_tree](int32 start, int32 end) {
+      for (int32 i = start; i < end; ++i) {
+        float tree_logit = 0.0;
+        int32 tree_id = 0;
+        int32 node_id = 0;
+        while (true) {
+          if (resource->is_leaf(tree_id, node_id)) {
+            tree_logit += resource->GetTreeWeight(tree_id) *
+                          resource->node_value(tree_id, node_id);
+
+            // Stop if it was the latest tree.
+            if (tree_id == latest_tree) {
+              break;
+            }
+            // Move onto other trees.
+            ++tree_id;
+            node_id = 0;
+          } else {
+            node_id = resource->next_node(tree_id, node_id, i,
+                                          batch_bucketized_features);
+          }
+        }
+        output_logits(i, 0) = tree_logit;
+      }
+    };
+    const int64 cost = (latest_tree + 1) * max_depth_;
+    thread::ThreadPool* const worker_threads =
+        context->device()->tensorflow_cpu_worker_threads()->workers;
+    Shard(worker_threads->NumThreads(), worker_threads, batch_size,
+          /*cost_per_unit=*/cost, do_work);
+  }
+
+ private:
+  int32
+      logits_dimension_;  // Indicates the size of the output prediction vector.
+  int32 num_bucketized_features_;  // Indicates the number of features.
+  int32 max_depth_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("BoostedTreesPredict").Device(DEVICE_CPU),
+                        BoostedTreesPredictOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/boosted_trees/resource_ops.cc b/tensorflow/core/kernels/boosted_trees/resource_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f49242d8566f09d34088131b7f74ea4362a86860
--- /dev/null
+++ b/tensorflow/core/kernels/boosted_trees/resource_ops.cc
@@ -0,0 +1,189 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <string>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/kernels/boosted_trees/resources.h"
+
+namespace tensorflow {
+
+REGISTER_RESOURCE_HANDLE_KERNEL(BoostedTreesEnsembleResource);
+
+REGISTER_KERNEL_BUILDER(
+    Name("IsBoostedTreesEnsembleInitialized").Device(DEVICE_CPU),
+    IsResourceInitialized<BoostedTreesEnsembleResource>);
+
+// Creates a tree ensemble resource.
+class BoostedTreesCreateEnsembleOp : public OpKernel {
+ public:
+  explicit BoostedTreesCreateEnsembleOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    // Get the stamp token.
+    const Tensor* stamp_token_t;
+    OP_REQUIRES_OK(context, context->input("stamp_token", &stamp_token_t));
+    int64 stamp_token = stamp_token_t->scalar<int64>()();
+
+    // Get the tree ensemble proto.
+    const Tensor* tree_ensemble_serialized_t;
+    OP_REQUIRES_OK(context, context->input("tree_ensemble_serialized",
+                                           &tree_ensemble_serialized_t));
+    std::unique_ptr<BoostedTreesEnsembleResource> result(
+        new BoostedTreesEnsembleResource());
+    if (!result->InitFromSerialized(
+            tree_ensemble_serialized_t->scalar<string>()(), stamp_token)) {
+      result->Unref();
+      OP_REQUIRES(
+          context, false,
+          errors::InvalidArgument("Unable to parse tree ensemble proto."));
+    }
+
+    // Only create one, if one does not exist already. Report status for all
+    // other exceptions.
+    auto status =
+        CreateResource(context, HandleFromInput(context, 0), result.release());
+    if (status.code() != tensorflow::error::ALREADY_EXISTS) {
+      OP_REQUIRES_OK(context, status);
+    }
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("BoostedTreesCreateEnsemble").Device(DEVICE_CPU),
+                        BoostedTreesCreateEnsembleOp);
+
+// Op for retrieving some model states (needed for training).
+class BoostedTreesGetEnsembleStatesOp : public OpKernel {
+ public:
+  explicit BoostedTreesGetEnsembleStatesOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    // Looks up the resource.
+    BoostedTreesEnsembleResource* tree_ensemble_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &tree_ensemble_resource));
+    tf_shared_lock l(*tree_ensemble_resource->get_mutex());
+    core::ScopedUnref unref_me(tree_ensemble_resource);
+
+    // Sets the outputs.
+    const int num_trees = tree_ensemble_resource->num_trees();
+    const int num_finalized_trees =
+        (num_trees <= 0 ||
+         tree_ensemble_resource->IsTreeFinalized(num_trees - 1))
+            ? num_trees
+            : num_trees - 1;
+    const int num_attempted_layers =
+        tree_ensemble_resource->GetNumLayersAttempted();
+
+    // growing_metadata
+    Tensor* output_stamp_token_t = nullptr;
+    Tensor* output_num_trees_t = nullptr;
+    Tensor* output_num_finalized_trees_t = nullptr;
+    Tensor* output_num_attempted_layers_t = nullptr;
+
+    OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape(),
+                                                     &output_stamp_token_t));
+    OP_REQUIRES_OK(context, context->allocate_output(1, TensorShape(),
+                                                     &output_num_trees_t));
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(2, TensorShape(),
+                                            &output_num_finalized_trees_t));
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(3, TensorShape(),
+                                            &output_num_attempted_layers_t));
+
+    output_stamp_token_t->scalar<int64>()() = tree_ensemble_resource->stamp();
+    output_num_trees_t->scalar<int32>()() = num_trees;
+    output_num_finalized_trees_t->scalar<int32>()() = num_finalized_trees;
+    output_num_attempted_layers_t->scalar<int32>()() = num_attempted_layers;
+  }
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("BoostedTreesGetEnsembleStates").Device(DEVICE_CPU),
+    BoostedTreesGetEnsembleStatesOp);
+
+// Op for serializing a model.
+class BoostedTreesSerializeEnsembleOp : public OpKernel {
+ public:
+  explicit BoostedTreesSerializeEnsembleOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    BoostedTreesEnsembleResource* tree_ensemble_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &tree_ensemble_resource));
+    tf_shared_lock l(*tree_ensemble_resource->get_mutex());
+    core::ScopedUnref unref_me(tree_ensemble_resource);
+    Tensor* output_stamp_token_t = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape(),
+                                                     &output_stamp_token_t));
+    output_stamp_token_t->scalar<int64>()() = tree_ensemble_resource->stamp();
+    Tensor* output_proto_t = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(1, TensorShape(), &output_proto_t));
+    output_proto_t->scalar<string>()() =
+        tree_ensemble_resource->SerializeAsString();
+  }
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("BoostedTreesSerializeEnsemble").Device(DEVICE_CPU),
+    BoostedTreesSerializeEnsembleOp);
+
+// Op for deserializing a tree ensemble variable from a checkpoint.
+class BoostedTreesDeserializeEnsembleOp : public OpKernel {
+ public:
+  explicit BoostedTreesDeserializeEnsembleOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    BoostedTreesEnsembleResource* tree_ensemble_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &tree_ensemble_resource));
+    mutex_lock l(*tree_ensemble_resource->get_mutex());
+    core::ScopedUnref unref_me(tree_ensemble_resource);
+
+    // Get the stamp token.
+    const Tensor* stamp_token_t;
+    OP_REQUIRES_OK(context, context->input("stamp_token", &stamp_token_t));
+    int64 stamp_token = stamp_token_t->scalar<int64>()();
+
+    // Get the tree ensemble proto.
+    const Tensor* tree_ensemble_serialized_t;
+    OP_REQUIRES_OK(context, context->input("tree_ensemble_serialized",
+                                           &tree_ensemble_serialized_t));
+    // Deallocate all the previous objects on the resource.
+    tree_ensemble_resource->Reset();
+    OP_REQUIRES(
+        context,
+        tree_ensemble_resource->InitFromSerialized(
+            tree_ensemble_serialized_t->scalar<string>()(), stamp_token),
+        errors::InvalidArgument("Unable to parse tree ensemble proto."));
+  }
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("BoostedTreesDeserializeEnsemble").Device(DEVICE_CPU),
+    BoostedTreesDeserializeEnsembleOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/boosted_trees/resources.cc b/tensorflow/core/kernels/boosted_trees/resources.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2ea12c522c8bb73d64715e3a75c481ccf6d054dc
--- /dev/null
+++ b/tensorflow/core/kernels/boosted_trees/resources.cc
@@ -0,0 +1,301 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/boosted_trees/resources.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/kernels/boosted_trees/boosted_trees.pb.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+namespace tensorflow {
+
+int32 BoostedTreesEnsembleResource::next_node(
+    const int32 tree_id, const int32 node_id, const int32 index_in_batch,
+    const std::vector<TTypes<int32>::ConstVec>& bucketized_features) const {
+  DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+  DCHECK_LT(node_id, tree_ensemble_->trees(tree_id).nodes_size());
+  const auto& node = tree_ensemble_->trees(tree_id).nodes(node_id);
+  DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit);
+  const auto& split = node.bucketized_split();
+  if (bucketized_features[split.feature_id()](index_in_batch) <=
+      split.threshold()) {
+    return split.left_id();
+  } else {
+    return split.right_id();
+  }
+}
+
+float BoostedTreesEnsembleResource::node_value(const int32 tree_id,
+                                               const int32 node_id) const {
+  DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+  DCHECK_LT(node_id, tree_ensemble_->trees(tree_id).nodes_size());
+  const auto& node = tree_ensemble_->trees(tree_id).nodes(node_id);
+  if (node.node_case() == boosted_trees::Node::kLeaf) {
+    return node.leaf().scalar();
+  } else {
+    return node.metadata().original_leaf().scalar();
+  }
+}
+
+void BoostedTreesEnsembleResource::UpdateGrowingMetadata() const {
+  tree_ensemble_->mutable_growing_metadata()->set_num_layers_attempted(
+      tree_ensemble_->growing_metadata().num_layers_attempted() + 1);
+
+  const int n_trees = num_trees();
+
+  if (n_trees <= 0 ||
+      // Checks if we are building the first layer of the dummy empty tree
+      ((n_trees == 1 || IsTreeFinalized(n_trees - 2)) &&
+       (tree_ensemble_->trees(n_trees - 1).nodes_size() == 1))) {
+    tree_ensemble_->mutable_growing_metadata()->set_num_trees_attempted(
+        tree_ensemble_->growing_metadata().num_trees_attempted() + 1);
+  }
+}
+
+// Add a tree to the ensemble and returns a new tree_id.
+int32 BoostedTreesEnsembleResource::AddNewTree(const float weight) {
+  const int32 new_tree_id = tree_ensemble_->trees_size();
+  auto* node = tree_ensemble_->add_trees()->add_nodes();
+  node->mutable_leaf()->set_scalar(0.0);
+  tree_ensemble_->add_tree_weights(weight);
+  tree_ensemble_->add_tree_metadata();
+
+  return new_tree_id;
+}
+
+void BoostedTreesEnsembleResource::AddBucketizedSplitNode(
+    const int32 tree_id, const int32 node_id, const int32 feature_id,
+    const int32 threshold, const float gain, const float left_contrib,
+    const float right_contrib, int32* left_node_id, int32* right_node_id) {
+  auto* tree = tree_ensemble_->mutable_trees(tree_id);
+  auto* node = tree->mutable_nodes(node_id);
+  DCHECK_EQ(node->node_case(), boosted_trees::Node::kLeaf);
+  float prev_node_value = node->leaf().scalar();
+  *left_node_id = tree->nodes_size();
+  *right_node_id = *left_node_id + 1;
+  auto* left_node = tree->add_nodes();
+  auto* right_node = tree->add_nodes();
+  if (node_id != 0) {
+    // Save previous leaf value if it is not the first leaf in the tree.
+    node->mutable_metadata()->mutable_original_leaf()->Swap(
+        node->mutable_leaf());
+  }
+  node->mutable_metadata()->set_gain(gain);
+  auto* new_split = node->mutable_bucketized_split();
+  new_split->set_feature_id(feature_id);
+  new_split->set_threshold(threshold);
+  new_split->set_left_id(*left_node_id);
+  new_split->set_right_id(*right_node_id);
+  // TODO(npononareva): this is LAYER-BY-LAYER boosting; add WHOLE-TREE.
+  left_node->mutable_leaf()->set_scalar(prev_node_value + left_contrib);
+  right_node->mutable_leaf()->set_scalar(prev_node_value + right_contrib);
+}
+
+void BoostedTreesEnsembleResource::Reset() {
+  // Reset stamp.
+  set_stamp(-1);
+
+  // Clear tree ensemle.
+  arena_.Reset();
+  CHECK_EQ(0, arena_.SpaceAllocated());
+  tree_ensemble_ =
+      protobuf::Arena::CreateMessage<boosted_trees::TreeEnsemble>(&arena_);
+}
+
+void BoostedTreesEnsembleResource::PostPruneTree(const int32 current_tree) {
+  // No-op if tree is empty.
+  auto* tree = tree_ensemble_->mutable_trees(current_tree);
+  int32 num_nodes = tree->nodes_size();
+  if (num_nodes == 0) {
+    return;
+  }
+
+  std::vector<int32> nodes_to_delete;
+  // If a node was pruned, we need to save the change of the prediction from
+  // this node to its parent, as well as the parent id.
+  std::vector<std::pair<int32, float>> nodes_changes;
+  nodes_changes.reserve(num_nodes);
+  for (int32 i = 0; i < num_nodes; ++i) {
+    nodes_changes.emplace_back(i, 0.0);
+  }
+  // Prune the tree recursively starting from the root. Each node that has
+  // negative gain and only leaf children will be pruned recursively up from
+  // the bottom of the tree. This method returns the list of nodes pruned, and
+  // updates the nodes in the tree not to refer to those pruned nodes.
+  RecursivelyDoPostPrunePreparation(current_tree, 0, &nodes_to_delete,
+                                    &nodes_changes);
+
+  if (nodes_to_delete.empty()) {
+    // No pruning happened, and no post-processing needed.
+    return;
+  }
+
+  // Sort node ids so they are in asc order.
+  std::sort(nodes_to_delete.begin(), nodes_to_delete.end());
+
+  // We need to
+  // - update split left and right children ids with new indices
+  // - actually remove the nodes that need to be removed
+  // - save the information about pruned node so we could recover the
+  // predictions from cache. Build a map for old node index=>new node index.
+  // nodes_to_delete contains nodes who's indices should be skipped, in
+  // ascending order. Save the information about new indices into meta.
+  std::map<int32, int32> old_to_new_ids;
+  int32 new_index = 0;
+  int32 index_for_deleted = 0;
+  auto* post_prune_meta = tree_ensemble_->mutable_tree_metadata(current_tree)
+                              ->mutable_post_pruned_nodes_meta();
+
+  for (int32 i = 0; i < num_nodes; ++i) {
+    if (index_for_deleted < nodes_to_delete.size() &&
+        i == nodes_to_delete[index_for_deleted]) {
+      // Node i will get removed,
+      ++index_for_deleted;
+      // Update meta info that will allow us to use cached predictions from
+      // those nodes.
+      int32 new_id;
+      float logit_change;
+      CalculateParentAndLogitUpdate(i, nodes_changes, &new_id, &logit_change);
+      auto* meta = post_prune_meta->Add();
+      meta->set_new_node_id(old_to_new_ids[new_id]);
+      meta->set_logit_change(logit_change);
+    } else {
+      old_to_new_ids[i] = new_index++;
+      auto* meta = post_prune_meta->Add();
+      // Update meta info that will allow us to use cached predictions from
+      // those nodes.
+      meta->set_new_node_id(old_to_new_ids[i]);
+      meta->set_logit_change(0.0);
+    }
+  }
+  index_for_deleted = 0;
+  int32 i = 0;
+  protobuf::RepeatedPtrField<boosted_trees::Node> new_nodes;
+  new_nodes.Reserve(old_to_new_ids.size());
+  for (auto node : *(tree->mutable_nodes())) {
+    if (index_for_deleted < nodes_to_delete.size() &&
+        i == nodes_to_delete[index_for_deleted]) {
+      ++index_for_deleted;
+      ++i;
+      continue;
+    } else {
+      if (node.node_case() == boosted_trees::Node::kBucketizedSplit) {
+        node.mutable_bucketized_split()->set_left_id(
+            old_to_new_ids[node.bucketized_split().left_id()]);
+        node.mutable_bucketized_split()->set_right_id(
+            old_to_new_ids[node.bucketized_split().right_id()]);
+      }
+      *new_nodes.Add() = std::move(node);
+    }
+    ++i;
+  }
+  // Replace all the nodes in a tree with the ones we keep.
+  *tree->mutable_nodes() = std::move(new_nodes);
+
+  // Note that if the whole tree got pruned, we will end up with one node.
+  // We can't remove that tree because it will cause problems with cache.
+}
+
+void BoostedTreesEnsembleResource::GetPostPruneCorrection(
+    const int32 tree_id, const int32 initial_node_id, int32* current_node_id,
+    float* logit_update) const {
+  DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+  if (IsTreeFinalized(tree_id) && IsTreePostPruned(tree_id)) {
+    DCHECK_LT(
+        initial_node_id,
+        tree_ensemble_->tree_metadata(tree_id).post_pruned_nodes_meta_size());
+    const auto& meta =
+        tree_ensemble_->tree_metadata(tree_id).post_pruned_nodes_meta(
+            initial_node_id);
+    *current_node_id = meta.new_node_id();
+    *logit_update += meta.logit_change();
+  }
+}
+
+bool BoostedTreesEnsembleResource::IsTerminalSplitNode(
+    const int32 tree_id, const int32 node_id) const {
+  const auto& node = tree_ensemble_->trees(tree_id).nodes(node_id);
+  DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit);
+  const int32 left_id = node.bucketized_split().left_id();
+  const int32 right_id = node.bucketized_split().right_id();
+  return is_leaf(tree_id, left_id) && is_leaf(tree_id, right_id);
+}
+
+// For each pruned node, finds the leaf where it finally ended up and
+// calculates the total update from that pruned node prediction.
+void BoostedTreesEnsembleResource::CalculateParentAndLogitUpdate(
+    const int32 start_node_id,
+    const std::vector<std::pair<int32, float>>& nodes_change, int32* parent_id,
+    float* change) const {
+  *change = 0.0;
+  int32 node_id = start_node_id;
+  int32 parent = nodes_change[node_id].first;
+
+  while (parent != node_id) {
+    (*change) += nodes_change[node_id].second;
+    node_id = parent;
+    parent = nodes_change[node_id].first;
+  }
+  *parent_id = parent;
+}
+
+void BoostedTreesEnsembleResource::RecursivelyDoPostPrunePreparation(
+    const int32 tree_id, const int32 node_id,
+    std::vector<int32>* nodes_to_delete,
+    std::vector<std::pair<int32, float>>* nodes_meta) {
+  auto* node = tree_ensemble_->mutable_trees(tree_id)->mutable_nodes(node_id);
+  DCHECK_NE(node->node_case(), boosted_trees::Node::NODE_NOT_SET);
+  // Base case when we reach a leaf.
+  if (node->node_case() == boosted_trees::Node::kLeaf) {
+    return;
+  }
+
+  // Traverse node children first and recursively prune their sub-trees.
+  RecursivelyDoPostPrunePreparation(tree_id, node->bucketized_split().left_id(),
+                                    nodes_to_delete, nodes_meta);
+  RecursivelyDoPostPrunePreparation(tree_id,
+                                    node->bucketized_split().right_id(),
+                                    nodes_to_delete, nodes_meta);
+
+  // Two conditions must be satisfied to prune the node:
+  // 1- The split gain is negative.
+  // 2- After depth-first pruning, the node only has leaf children.
+  const auto& node_metadata = node->metadata();
+  if (node_metadata.gain() < 0 && IsTerminalSplitNode(tree_id, node_id)) {
+    const int32 left_id = node->bucketized_split().left_id();
+    const int32 right_id = node->bucketized_split().right_id();
+
+    // Save children that need to be deleted.
+    nodes_to_delete->push_back(left_id);
+    nodes_to_delete->push_back(right_id);
+
+    // Change node back into leaf.
+    *node->mutable_leaf() = node_metadata.original_leaf();
+    const float parent_value = node_value(tree_id, node_id);
+
+    // Save the old values of weights of children.
+    (*nodes_meta)[left_id].first = node_id;
+    (*nodes_meta)[left_id].second = parent_value - node_value(tree_id, left_id);
+
+    (*nodes_meta)[right_id].first = node_id;
+    (*nodes_meta)[right_id].second =
+        parent_value - node_value(tree_id, right_id);
+
+    // Clear gain for leaf node.
+    node->clear_metadata();
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/boosted_trees/resources.h b/tensorflow/core/kernels/boosted_trees/resources.h
new file mode 100644
index 0000000000000000000000000000000000000000..c82588b9507800a860e6fc7af4a51541f09cad5b
--- /dev/null
+++ b/tensorflow/core/kernels/boosted_trees/resources.h
@@ -0,0 +1,221 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_BOOSTED_TREES_RESOURCES_H_
+#define TENSORFLOW_CORE_KERNELS_BOOSTED_TREES_RESOURCES_H_
+
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/kernels/boosted_trees/boosted_trees.pb.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+namespace tensorflow {
+
+// A StampedResource is a resource that has a stamp token associated with it.
+// Before reading from or applying updates to the resource, the stamp should
+// be checked to verify that the update is not stale.
+class StampedResource : public ResourceBase {
+ public:
+  StampedResource() : stamp_(-1) {}
+
+  bool is_stamp_valid(int64 stamp) const { return stamp_ == stamp; }
+
+  int64 stamp() const { return stamp_; }
+  void set_stamp(int64 stamp) { stamp_ = stamp; }
+
+ private:
+  int64 stamp_;
+};
+
+// Keep a tree ensemble in memory for efficient evaluation and mutation.
+class BoostedTreesEnsembleResource : public StampedResource {
+ public:
+  // Constructor.
+  BoostedTreesEnsembleResource()
+      : tree_ensemble_(
+            protobuf::Arena::CreateMessage<boosted_trees::TreeEnsemble>(
+                &arena_)) {}
+
+  string DebugString() override {
+    return strings::StrCat("TreeEnsemble[size=", tree_ensemble_->trees_size(),
+                           "]");
+  }
+
+  bool InitFromSerialized(const string& serialized, const int64 stamp_token) {
+    CHECK_EQ(stamp(), -1) << "Must Reset before Init.";
+    if (ParseProtoUnlimited(tree_ensemble_, serialized)) {
+      set_stamp(stamp_token);
+      return true;
+    }
+    return false;
+  }
+
+  string SerializeAsString() const {
+    return tree_ensemble_->SerializeAsString();
+  }
+
+  int32 num_trees() const { return tree_ensemble_->trees_size(); }
+
+  // Find the next node to which the example (specified by index_in_batch)
+  // traverses down from the current node indicated by tree_id and node_id.
+  // Args:
+  //   tree_id: the index of the tree in the ensemble.
+  //   node_id: the index of the node within the tree.
+  //   index_in_batch: the index of the example within the batch (relevant to
+  //       the index of the row to read in each bucketized_features).
+  //   bucketized_features: vector of feature Vectors.
+  int32 next_node(
+      const int32 tree_id, const int32 node_id, const int32 index_in_batch,
+      const std::vector<TTypes<int32>::ConstVec>& bucketized_features) const;
+
+  float node_value(const int32 tree_id, const int32 node_id) const;
+
+  int32 GetNumLayersGrown(const int32 tree_id) const {
+    DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+    return tree_ensemble_->tree_metadata(tree_id).num_layers_grown();
+  }
+
+  void SetNumLayersGrown(const int32 tree_id, int32 new_num_layers) const {
+    DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+    tree_ensemble_->mutable_tree_metadata(tree_id)->set_num_layers_grown(
+        new_num_layers);
+  }
+
+  void UpdateGrowingMetadata() const;
+
+  int32 GetNumLayersAttempted() {
+    return tree_ensemble_->growing_metadata().num_layers_attempted();
+  }
+
+  bool is_leaf(const int32 tree_id, const int32 node_id) const {
+    DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+    DCHECK_LT(node_id, tree_ensemble_->trees(tree_id).nodes_size());
+    const auto& node = tree_ensemble_->trees(tree_id).nodes(node_id);
+    return node.node_case() == boosted_trees::Node::kLeaf;
+  }
+
+  int32 feature_id(const int32 tree_id, const int32 node_id) const {
+    const auto node = tree_ensemble_->trees(tree_id).nodes(node_id);
+    DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit);
+    return node.bucketized_split().feature_id();
+  }
+
+  int32 bucket_threshold(const int32 tree_id, const int32 node_id) const {
+    const auto node = tree_ensemble_->trees(tree_id).nodes(node_id);
+    DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit);
+    return node.bucketized_split().threshold();
+  }
+
+  int32 left_id(const int32 tree_id, const int32 node_id) const {
+    const auto node = tree_ensemble_->trees(tree_id).nodes(node_id);
+    DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit);
+    return node.bucketized_split().left_id();
+  }
+
+  int32 right_id(const int32 tree_id, const int32 node_id) const {
+    const auto node = tree_ensemble_->trees(tree_id).nodes(node_id);
+    DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit);
+    return node.bucketized_split().right_id();
+  }
+
+  // Add a tree to the ensemble and returns a new tree_id.
+  int32 AddNewTree(const float weight);
+
+  // Grows the tree by adding a split and leaves.
+  void AddBucketizedSplitNode(const int32 tree_id, const int32 node_id,
+                              const int32 feature_id, const int32 threshold,
+                              const float gain, const float left_contrib,
+                              const float right_contrib, int32* left_node_id,
+                              int32* right_node_id);
+
+  // Retrieves tree weights and returns as a vector.
+  // It involves a copy, so should be called only sparingly (like once per
+  // iteration, not per example).
+  std::vector<float> GetTreeWeights() const {
+    return {tree_ensemble_->tree_weights().begin(),
+            tree_ensemble_->tree_weights().end()};
+  }
+
+  float GetTreeWeight(const int32 tree_id) const {
+    return tree_ensemble_->tree_weights(tree_id);
+  }
+
+  float IsTreeFinalized(const int32 tree_id) const {
+    DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+    return tree_ensemble_->tree_metadata(tree_id).is_finalized();
+  }
+
+  float IsTreePostPruned(const int32 tree_id) const {
+    DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+    return tree_ensemble_->tree_metadata(tree_id)
+               .post_pruned_nodes_meta_size() > 0;
+  }
+
+  void SetIsFinalized(const int32 tree_id, const bool is_finalized) {
+    DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+    return tree_ensemble_->mutable_tree_metadata(tree_id)->set_is_finalized(
+        is_finalized);
+  }
+
+  // Sets the weight of i'th tree.
+  void SetTreeWeight(const int32 tree_id, const float weight) {
+    DCHECK_GE(tree_id, 0);
+    DCHECK_LT(tree_id, num_trees());
+    tree_ensemble_->set_tree_weights(tree_id, weight);
+  }
+
+  // Resets the resource and frees the protos in arena.
+  // Caller needs to hold the mutex lock while calling this.
+  virtual void Reset();
+
+  void PostPruneTree(const int32 current_tree);
+
+  // For a given node, returns the id in a pruned tree, as well as correction
+  // to the cached prediction that should be applied. If tree was not
+  // post-pruned, current_node_id will be equal to initial_node_id and logit
+  // update will be equal to zero.
+  void GetPostPruneCorrection(const int32 tree_id, const int32 initial_node_id,
+                              int32* current_node_id,
+                              float* logit_update) const;
+  mutex* get_mutex() { return &mu_; }
+
+ private:
+  // Helper method to check whether a node is a terminal node in that it
+  // only has leaf nodes as children.
+  bool IsTerminalSplitNode(const int32 tree_id, const int32 node_id) const;
+
+  // For each pruned node, finds the leaf where it finally ended up and
+  // calculates the total update from that pruned node prediction.
+  void CalculateParentAndLogitUpdate(
+      const int32 start_node_id,
+      const std::vector<std::pair<int32, float>>& nodes_change,
+      int32* parent_id, float* change) const;
+
+  // Helper method to collect the information to be used to prune some nodes in
+  // the tree.
+  void RecursivelyDoPostPrunePreparation(
+      const int32 tree_id, const int32 node_id,
+      std::vector<int32>* nodes_to_delete,
+      std::vector<std::pair<int32, float>>* nodes_meta);
+
+ protected:
+  protobuf::Arena arena_;
+  mutex mu_;
+  boosted_trees::TreeEnsemble* tree_ensemble_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_BOOSTED_TREES_RESOURCES_H_
diff --git a/tensorflow/core/kernels/boosted_trees/stats_ops.cc b/tensorflow/core/kernels/boosted_trees/stats_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..33fdab6a860358fab05abbb361bf004174e85658
--- /dev/null
+++ b/tensorflow/core/kernels/boosted_trees/stats_ops.cc
@@ -0,0 +1,296 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <vector>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+
+namespace tensorflow {
+
+namespace {
+const float kEps = 1e-15;
+}  // namespace
+
+class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
+ public:
+  explicit BoostedTreesCalculateBestGainsPerFeatureOp(
+      OpKernelConstruction* const context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("l1", &l1_));
+    OP_REQUIRES_OK(context, context->GetAttr("l2", &l2_));
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("tree_complexity", &tree_complexity_));
+    OP_REQUIRES_OK(context, context->GetAttr("max_splits", &max_splits_));
+    OP_REQUIRES_OK(context, context->GetAttr("num_features", &num_features_));
+  }
+
+  void Compute(OpKernelContext* const context) override {
+    // node_id_range
+    const Tensor* node_id_range_t;
+    OP_REQUIRES_OK(context, context->input("node_id_range", &node_id_range_t));
+    const auto node_id_range = node_id_range_t->vec<int32>();
+    int32 node_id_first = node_id_range(0);
+    int32 node_id_last = node_id_range(1);  // inclusive.
+    // stats_summary_list
+    OpInputList stats_summary_list;
+    OP_REQUIRES_OK(context, context->input_list("stats_summary_list",
+                                                &stats_summary_list));
+    const int64 num_buckets = stats_summary_list[0].dim_size(1);
+    std::vector<TTypes<float, 3>::ConstTensor> stats_summary;
+    stats_summary.reserve(stats_summary_list.size());
+    for (const auto& tensor : stats_summary_list) {
+      stats_summary.emplace_back(tensor.tensor<float, 3>());
+    }
+
+    // Allocate output lists of tensors:
+    OpOutputList output_node_ids_list;
+    OP_REQUIRES_OK(
+        context, context->output_list("node_ids_list", &output_node_ids_list));
+    OpOutputList output_gains_list;
+    OP_REQUIRES_OK(context,
+                   context->output_list("gains_list", &output_gains_list));
+    OpOutputList output_thresholds_list;
+    OP_REQUIRES_OK(context, context->output_list("thresholds_list",
+                                                 &output_thresholds_list));
+    OpOutputList output_left_node_contribs_list;
+    OP_REQUIRES_OK(context,
+                   context->output_list("left_node_contribs_list",
+                                        &output_left_node_contribs_list));
+    OpOutputList output_right_node_contribs_list;
+    OP_REQUIRES_OK(context,
+                   context->output_list("right_node_contribs_list",
+                                        &output_right_node_contribs_list));
+
+    // Get the best split info per node for each feature.
+    for (int feature_idx = 0; feature_idx < num_features_; ++feature_idx) {
+      std::vector<float> cum_grad;
+      std::vector<float> cum_hess;
+      cum_grad.reserve(num_buckets);
+      cum_hess.reserve(num_buckets);
+
+      std::vector<int32> output_node_ids;
+      std::vector<float> output_gains;
+      std::vector<int32> output_thresholds;
+      std::vector<float> output_left_node_contribs;
+      std::vector<float> output_right_node_contribs;
+      for (int node_id = node_id_first; node_id <= node_id_last; ++node_id) {
+        // Calculate gains.
+        cum_grad.clear();
+        cum_hess.clear();
+        float total_grad = 0.0;
+        float total_hess = 0.0;
+        for (int bucket = 0; bucket < num_buckets; ++bucket) {
+          // TODO(nponomareva): Consider multi-dimensional gradients/hessians.
+          total_grad += stats_summary[feature_idx](node_id, bucket, 0);
+          total_hess += stats_summary[feature_idx](node_id, bucket, 1);
+          cum_grad.push_back(total_grad);
+          cum_hess.push_back(total_hess);
+        }
+        float best_gain = std::numeric_limits<float>::lowest();
+        float best_bucket = 0;
+        float best_contrib_for_left = 0.0;
+        float best_contrib_for_right = 0.0;
+        // Parent gain.
+        float parent_gain;
+        float unused;
+        CalculateWeightsAndGains(total_grad, total_hess, &unused, &parent_gain);
+
+        for (int bucket = 0; bucket < num_buckets; ++bucket) {
+          const float cum_grad_bucket = cum_grad[bucket];
+          const float cum_hess_bucket = cum_hess[bucket];
+          // Left child.
+          float contrib_for_left;
+          float gain_for_left;
+          CalculateWeightsAndGains(cum_grad_bucket, cum_hess_bucket,
+                                   &contrib_for_left, &gain_for_left);
+          // Right child.
+          float contrib_for_right;
+          float gain_for_right;
+          CalculateWeightsAndGains(total_grad - cum_grad_bucket,
+                                   total_hess - cum_hess_bucket,
+                                   &contrib_for_right, &gain_for_right);
+
+          if (gain_for_left + gain_for_right > best_gain) {
+            best_gain = gain_for_left + gain_for_right;
+            best_bucket = bucket;
+            best_contrib_for_left = contrib_for_left;
+            best_contrib_for_right = contrib_for_right;
+          }
+        }  // for bucket
+        output_node_ids.push_back(node_id);
+        // Remove the parent gain for the parent node.
+        output_gains.push_back(best_gain - parent_gain);
+        output_thresholds.push_back(best_bucket);
+        output_left_node_contribs.push_back(best_contrib_for_left);
+        output_right_node_contribs.push_back(best_contrib_for_right);
+      }  // for node_id
+      const int num_nodes = output_node_ids.size();
+      // output_node_ids
+      Tensor* output_node_ids_t;
+      OP_REQUIRES_OK(context,
+                     output_node_ids_list.allocate(feature_idx, {num_nodes},
+                                                   &output_node_ids_t));
+      auto output_node_ids_vec = output_node_ids_t->vec<int32>();
+      // output_gains
+      Tensor* output_gains_t;
+      OP_REQUIRES_OK(context, output_gains_list.allocate(
+                                  feature_idx, {num_nodes}, &output_gains_t));
+      auto output_gains_vec = output_gains_t->vec<float>();
+      // output_thresholds
+      Tensor* output_thresholds_t;
+      OP_REQUIRES_OK(context,
+                     output_thresholds_list.allocate(feature_idx, {num_nodes},
+                                                     &output_thresholds_t));
+      auto output_thresholds_vec = output_thresholds_t->vec<int32>();
+      // output_left_node_contribs
+      Tensor* output_left_node_contribs_t;
+      OP_REQUIRES_OK(context, output_left_node_contribs_list.allocate(
+                                  feature_idx, {num_nodes, 1},
+                                  &output_left_node_contribs_t));
+      auto output_left_node_contribs_matrix =
+          output_left_node_contribs_t->matrix<float>();
+      // output_right_node_contribs
+      Tensor* output_right_node_contribs_t;
+      OP_REQUIRES_OK(context, output_right_node_contribs_list.allocate(
+                                  feature_idx, {num_nodes, 1},
+                                  &output_right_node_contribs_t));
+      auto output_right_node_contribs_matrix =
+          output_right_node_contribs_t->matrix<float>();
+      // Sets output tensors from vectors.
+      for (int i = 0; i < num_nodes; ++i) {
+        output_node_ids_vec(i) = output_node_ids[i];
+        // Adjust the gains to penalize by tree complexity.
+        output_gains_vec(i) = output_gains[i] - tree_complexity_;
+        output_thresholds_vec(i) = output_thresholds[i];
+        // Logits are 1-dimensional for now.
+        // TODO(nponomareva): Consider multi-dimensional logits.
+        output_left_node_contribs_matrix(i, 0) = output_left_node_contribs[i];
+        output_right_node_contribs_matrix(i, 0) = output_right_node_contribs[i];
+      }
+    }  // for f
+  }
+
+ private:
+  void CalculateWeightsAndGains(const float g, const float h, float* weight,
+                                float* gain) {
+    //
+    // The formula for weight is -(g+l1*sgn(w))/(H+l2), for gain it is
+    // (g+l1*sgn(w))^2/(h+l2).
+    // This is because for each leaf we optimize
+    // 1/2(h+l2)*w^2+g*w+l1*abs(w)
+    float g_with_l1 = g;
+    // Apply L1 regularization.
+    // 1) Assume w>0 => w=-(g+l1)/(h+l2)=> g+l1 < 0 => g < -l1
+    // 2) Assume w<0 => w=-(g-l1)/(h+l2)=> g-l1 > 0 => g > l1
+    // For g from (-l1, l1), thus there is no solution => set to 0.
+    if (l1_ > 0) {
+      if (g > l1_) {
+        g_with_l1 -= l1_;
+      } else if (g < -l1_) {
+        g_with_l1 += l1_;
+      } else {
+        *weight = 0.0;
+        *gain = 0.0;
+        return;
+      }
+    }
+    // Apply L2 regularization.
+    if (h + l2_ <= kEps) {
+      // Avoid division by 0 or infinitesimal.
+      *weight = 0;
+      *gain = 0;
+    } else {
+      *weight = -g_with_l1 / (h + l2_);
+      *gain = -g_with_l1 * (*weight);
+    }
+  }
+
+  float l1_;
+  float l2_;
+  float tree_complexity_;
+  int max_splits_;
+  int num_features_;
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("BoostedTreesCalculateBestGainsPerFeature").Device(DEVICE_CPU),
+    BoostedTreesCalculateBestGainsPerFeatureOp);
+
+class BoostedTreesMakeStatsSummaryOp : public OpKernel {
+ public:
+  explicit BoostedTreesMakeStatsSummaryOp(OpKernelConstruction* const context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("max_splits", &max_splits_));
+    OP_REQUIRES_OK(context, context->GetAttr("num_buckets", &num_buckets_));
+    OP_REQUIRES_OK(context, context->GetAttr("num_features", &num_features_));
+  }
+
+  void Compute(OpKernelContext* const context) override {
+    // node_ids
+    const Tensor* node_ids_t;
+    OP_REQUIRES_OK(context, context->input("node_ids", &node_ids_t));
+    const auto node_ids = node_ids_t->vec<int32>();
+    // gradients
+    const Tensor* gradients_t;
+    OP_REQUIRES_OK(context, context->input("gradients", &gradients_t));
+    const auto gradients = gradients_t->matrix<float>();
+    // hessians
+    const Tensor* hessians_t;
+    OP_REQUIRES_OK(context, context->input("hessians", &hessians_t));
+    const auto hessians = hessians_t->matrix<float>();
+    // bucketized_features
+    OpInputList bucketized_features_list;
+    OP_REQUIRES_OK(context, context->input_list("bucketized_features_list",
+                                                &bucketized_features_list));
+    std::vector<tensorflow::TTypes<int32>::ConstVec> bucketized_features;
+    bucketized_features.reserve(num_features_);
+    for (const Tensor& tensor : bucketized_features_list) {
+      bucketized_features.emplace_back(tensor.vec<int32>());
+    }
+
+    // Infer batch size.
+    const int64 batch_size = node_ids_t->dim_size(0);
+    // Allocate output stats tensor (Rank 4).
+    Tensor* output_stats_summary_t = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(
+                                "stats_summary",
+                                {num_features_, max_splits_, num_buckets_, 2},
+                                &output_stats_summary_t));
+    auto output_stats_summary = output_stats_summary_t->tensor<float, 4>();
+    output_stats_summary.setZero();
+
+    // Partition by node, and then bucketize.
+    for (int feature_idx = 0; feature_idx < num_features_; ++feature_idx) {
+      const auto& features = bucketized_features[feature_idx];
+      for (int i = 0; i < batch_size; ++i) {
+        const int32 node = node_ids(i);
+        const int32 bucket = features(i);
+        output_stats_summary(feature_idx, node, bucket, 0) += gradients(i, 0);
+        output_stats_summary(feature_idx, node, bucket, 1) += hessians(i, 0);
+      }
+    }
+  }
+
+ private:
+  int max_splits_;
+  int num_buckets_;
+  int num_features_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("BoostedTreesMakeStatsSummary").Device(DEVICE_CPU),
+                        BoostedTreesMakeStatsSummaryOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/boosted_trees/training_ops.cc b/tensorflow/core/kernels/boosted_trees/training_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b9ded4054aced4bcb27b0590a44e1f86f6b0a1c2
--- /dev/null
+++ b/tensorflow/core/kernels/boosted_trees/training_ops.cc
@@ -0,0 +1,219 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/boosted_trees/resources.h"
+
+namespace tensorflow {
+
+namespace {
+constexpr float kLayerByLayerTreeWeight = 1.0;
+
+// TODO(nponomareva, youngheek): consider using vector.
+struct SplitCandidate {
+  SplitCandidate() {}
+
+  // Index in the list of the feature ids.
+  int64 feature_idx;
+
+  // Index in the tensor of node_ids for the feature with idx feature_idx.
+  int64 candidate_idx;
+
+  float gain;
+};
+
+enum PruningMode { kNoPruning = 0, kPrePruning = 1, kPostPruning = 2 };
+
+}  // namespace
+
+class BoostedTreesUpdateEnsembleOp : public OpKernel {
+ public:
+  explicit BoostedTreesUpdateEnsembleOp(OpKernelConstruction* const context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("max_depth", &max_depth_));
+    OP_REQUIRES_OK(context, context->GetAttr("learning_rate", &learning_rate_));
+    OP_REQUIRES_OK(context, context->GetAttr("num_features", &num_features_));
+
+    int32 pruning_index;
+    OP_REQUIRES_OK(context, context->GetAttr("pruning_mode", &pruning_index));
+    pruning_mode_ = static_cast<PruningMode>(pruning_index);
+  }
+
+  void Compute(OpKernelContext* const context) override {
+    // Get decision tree ensemble.
+    BoostedTreesEnsembleResource* ensemble_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &ensemble_resource));
+    core::ScopedUnref unref_me(ensemble_resource);
+    mutex_lock l(*ensemble_resource->get_mutex());
+    // Increase the ensemble stamp.
+    ensemble_resource->set_stamp(ensemble_resource->stamp() + 1);
+
+    // Read node ids, gains, thresholds and node contribs.
+    OpInputList node_ids_list;
+    OpInputList gains_list;
+    OpInputList thresholds_list;
+    OpInputList left_node_contribs;
+    OpInputList right_node_contribs;
+    OP_REQUIRES_OK(context, context->input_list("node_ids", &node_ids_list));
+    OP_REQUIRES_OK(context, context->input_list("gains", &gains_list));
+    OP_REQUIRES_OK(context,
+                   context->input_list("thresholds", &thresholds_list));
+    OP_REQUIRES_OK(context, context->input_list("left_node_contribs",
+                                                &left_node_contribs));
+    OP_REQUIRES_OK(context, context->input_list("right_node_contribs",
+                                                &right_node_contribs));
+
+    const Tensor* feature_ids_t;
+    OP_REQUIRES_OK(context, context->input("feature_ids", &feature_ids_t));
+
+    auto feature_ids = feature_ids_t->vec<int32>();
+
+    // Find best splits for each active node.
+    std::map<int32, SplitCandidate> best_splits;
+    FindBestSplitsPerNode(context, node_ids_list, gains_list, &best_splits);
+
+    int32 current_tree =
+        UpdateGlobalAttemptsAndRetrieveGrowableTree(ensemble_resource);
+
+    // No-op if no new splits can be considered.
+    if (best_splits.empty()) {
+      LOG(WARNING) << "Not growing tree ensemble as no good splits were found.";
+      return;
+    }
+
+    const int32 new_num_layers =
+        ensemble_resource->GetNumLayersGrown(current_tree) + 1;
+    VLOG(1) << "Adding layer #" << new_num_layers - 1 << " to tree #"
+            << current_tree << " of ensemble of " << current_tree + 1
+            << " trees.";
+    bool split_happened = false;
+    // Add the splits to the tree.
+    for (auto& split_entry : best_splits) {
+      const int32 node_id = split_entry.first;
+      const SplitCandidate& candidate = split_entry.second;
+
+      const int64 feature_idx = candidate.feature_idx;
+      const int64 candidate_idx = candidate.candidate_idx;
+
+      const int32 feature_id = feature_ids(feature_idx);
+      const int32 threshold =
+          thresholds_list[feature_idx].vec<int32>()(candidate_idx);
+      const float gain = gains_list[feature_idx].vec<float>()(candidate_idx);
+
+      if (pruning_mode_ == kPrePruning) {
+        // Don't consider negative splits if we're pre-pruning the tree.
+        // Note that zero-gain splits are acceptable.
+        if (gain < 0) {
+          continue;
+        }
+      }
+      // For now assume that the weights vectors are one dimensional.
+      // TODO(nponomareva): change here for multiclass.
+      const float left_contrib =
+          learning_rate_ *
+          left_node_contribs[feature_idx].matrix<float>()(candidate_idx, 0);
+      const float right_contrib =
+          learning_rate_ *
+          right_node_contribs[feature_idx].matrix<float>()(candidate_idx, 0);
+
+      // unused.
+      int32 left_node_id;
+      int32 right_node_id;
+
+      ensemble_resource->AddBucketizedSplitNode(
+          current_tree, node_id, feature_id, threshold, gain, left_contrib,
+          right_contrib, &left_node_id, &right_node_id);
+      split_happened = true;
+    }
+    if (split_happened) {
+      // Update growable tree metadata.
+      ensemble_resource->SetNumLayersGrown(current_tree, new_num_layers);
+      // Finalize the tree if needed.
+      if (ensemble_resource->GetNumLayersGrown(current_tree) >= max_depth_) {
+        ensemble_resource->SetIsFinalized(current_tree, true);
+        if (pruning_mode_ == kPostPruning) {
+          ensemble_resource->PostPruneTree(current_tree);
+        }
+        if (ensemble_resource->num_trees() > 0) {
+          // Create a dummy new tree with an empty node.
+          ensemble_resource->AddNewTree(kLayerByLayerTreeWeight);
+        }
+      }
+    }
+  }
+
+ private:
+  int32 UpdateGlobalAttemptsAndRetrieveGrowableTree(
+      BoostedTreesEnsembleResource* const ensemble_resource) {
+    int32 num_trees = ensemble_resource->num_trees();
+    int32 current_tree = num_trees - 1;
+
+    // Increment global attempt stats.
+    ensemble_resource->UpdateGrowingMetadata();
+
+    // Note we don't set tree weight to be equal to learning rate, since we
+    // apply learning rate to leaf weights instead, when doing layer-by-layer
+    // boosting.
+    if (num_trees <= 0) {
+      // Create a new tree with a no-op leaf.
+      current_tree = ensemble_resource->AddNewTree(kLayerByLayerTreeWeight);
+    }
+    return current_tree;
+  }
+
+  // Helper method which effectively does a reduce over all split candidates
+  // and finds the best split for each node.
+  void FindBestSplitsPerNode(
+      OpKernelContext* const context, const OpInputList& node_ids_list,
+      const OpInputList& gains_list,
+      std::map<int32, SplitCandidate>* best_split_per_node) {
+    // Find best split per node going through every feature candidate.
+    for (int64 feature_idx = 0; feature_idx < num_features_; ++feature_idx) {
+      const auto& node_ids = node_ids_list[feature_idx].vec<int32>();
+      const auto& gains = gains_list[feature_idx].vec<float>();
+
+      for (size_t candidate_idx = 0; candidate_idx < node_ids.size();
+           ++candidate_idx) {
+        // Get current split candidate.
+        const auto& node_id = node_ids(candidate_idx);
+        const auto& gain = gains(candidate_idx);
+
+        auto best_split_it = best_split_per_node->find(node_id);
+        SplitCandidate candidate;
+        candidate.feature_idx = feature_idx;
+        candidate.candidate_idx = candidate_idx;
+        candidate.gain = gain;
+
+        if (best_split_it == best_split_per_node->end() ||
+            gain > best_split_it->second.gain) {
+          (*best_split_per_node)[node_id] = candidate;
+        }
+      }
+    }
+  }
+
+ private:
+  int32 num_features_;
+  float learning_rate_;
+  int32 max_depth_;
+  PruningMode pruning_mode_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("BoostedTreesUpdateEnsemble").Device(DEVICE_CPU),
+                        BoostedTreesUpdateEnsembleOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cast_op_impl.h b/tensorflow/core/kernels/cast_op_impl.h
index 3ae9f2ab4d9c102941927215441b4c02625387f0..382e5440e14954eec6e81fe7eabc2017706fe678 100644
--- a/tensorflow/core/kernels/cast_op_impl.h
+++ b/tensorflow/core/kernels/cast_op_impl.h
@@ -58,10 +58,14 @@ struct CastFunctor<Eigen::SyclDevice, O, I> {
   FN(arg0, arg1, std::complex<float>);       \
   FN(arg0, arg1, std::complex<double>)
 
-#define CURRY_TYPES3(FN, arg0, arg1)   \
-  CURRY_TYPES3_NO_HALF(FN, arg0, arg1) \
+#define CURRY_TYPES3_NO_BF16(FN, arg0, arg1) \
+  CURRY_TYPES3_NO_HALF(FN, arg0, arg1)       \
   FN(arg0, arg1, Eigen::half);
 
+#define CURRY_TYPES3(FN, arg0, arg1)   \
+  CURRY_TYPES3_NO_BF16(FN, arg0, arg1) \
+  FN(arg0, arg1, bfloat16);
+
 #define CAST_CASE(DEVICE, IN, OUT)                                         \
   if (DataTypeToEnum<OUT>::value == dst_dtype) {                           \
     return [](OpKernelContext* ctx, const Tensor& inp, Tensor* out) {      \
diff --git a/tensorflow/core/kernels/cast_op_impl_bfloat.cc b/tensorflow/core/kernels/cast_op_impl_bfloat.cc
index a06f815899016a9f1267fefb504a6ed6684a0101..bfa7ba0d4770e7a4ac1493482d90b166a4fcd3a2 100644
--- a/tensorflow/core/kernels/cast_op_impl_bfloat.cc
+++ b/tensorflow/core/kernels/cast_op_impl_bfloat.cc
@@ -24,17 +24,7 @@ typedef Eigen::GpuDevice GPUDevice;
 
 std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
 GetCpuCastFromBfloat(DataType dst_dtype) {
-  if (dst_dtype == DT_FLOAT) {
-    return [](OpKernelContext* ctx, const Tensor& inp, Tensor* out) {
-      int64 N = out->NumElements();
-      auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads();
-      auto work = [&inp, &out](int64 start, int64 end) {
-        BFloat16ToFloat(inp.flat<bfloat16>().data() + start,
-                        out->flat<float>().data() + start, end - start);
-      };
-      Shard(worker_threads->num_threads, worker_threads->workers, N, 2, work);
-    };
-  }
+  CURRY_TYPES3(CAST_CASE, CPUDevice, bfloat16);
   return nullptr;
 }
 
diff --git a/tensorflow/core/kernels/cast_op_impl_bool.cc b/tensorflow/core/kernels/cast_op_impl_bool.cc
index 5cd63f2458d2a01ec3d2c1621e6f86ecc6f5347f..c5c7394b43c92069aec4a46c9a712da1f606f6a8 100644
--- a/tensorflow/core/kernels/cast_op_impl_bool.cc
+++ b/tensorflow/core/kernels/cast_op_impl_bool.cc
@@ -29,7 +29,7 @@ GetCpuCastFromBool(DataType dst_dtype) {
 #if GOOGLE_CUDA
 std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
 GetGpuCastFromBool(DataType dst_dtype) {
-  CURRY_TYPES3(CAST_CASE, GPUDevice, bool);
+  CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, bool);
   return nullptr;
 }
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cast_op_impl_complex128.cc b/tensorflow/core/kernels/cast_op_impl_complex128.cc
index c428679d7c4983fe8b25461bc8bd431076fc9226..52899d58cdcff2df7fca07d223cc060ba080be82 100644
--- a/tensorflow/core/kernels/cast_op_impl_complex128.cc
+++ b/tensorflow/core/kernels/cast_op_impl_complex128.cc
@@ -29,7 +29,7 @@ GetCpuCastFromComplex128(DataType dst_dtype) {
 #if GOOGLE_CUDA
 std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
 GetGpuCastFromComplex128(DataType dst_dtype) {
-  CURRY_TYPES3(CAST_CASE, GPUDevice, std::complex<double>);
+  CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, std::complex<double>);
   return nullptr;
 }
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cast_op_impl_complex64.cc b/tensorflow/core/kernels/cast_op_impl_complex64.cc
index 07b46551b2ea5dbf214d167f9a9469fa658d8f4b..617bda53d5822f67186088c0251caf9c108e6a7d 100644
--- a/tensorflow/core/kernels/cast_op_impl_complex64.cc
+++ b/tensorflow/core/kernels/cast_op_impl_complex64.cc
@@ -29,7 +29,7 @@ GetCpuCastFromComplex64(DataType dst_dtype) {
 #if GOOGLE_CUDA
 std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
 GetGpuCastFromComplex64(DataType dst_dtype) {
-  CURRY_TYPES3(CAST_CASE, GPUDevice, std::complex<float>);
+  CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, std::complex<float>);
   return nullptr;
 }
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cast_op_impl_double.cc b/tensorflow/core/kernels/cast_op_impl_double.cc
index 1203f066a2db366d38ae99c5ff1e1f385979a8af..7dc485ddad275d6fcdcc54506fabce2a90819645 100644
--- a/tensorflow/core/kernels/cast_op_impl_double.cc
+++ b/tensorflow/core/kernels/cast_op_impl_double.cc
@@ -29,7 +29,7 @@ GetCpuCastFromDouble(DataType dst_dtype) {
 #if GOOGLE_CUDA
 std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
 GetGpuCastFromDouble(DataType dst_dtype) {
-  CURRY_TYPES3(CAST_CASE, GPUDevice, double);
+  CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, double);
   return nullptr;
 }
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cast_op_impl_float.cc b/tensorflow/core/kernels/cast_op_impl_float.cc
index 2ff9af21f2413b16e1d38a64594e0dcf89e14bcb..1c933914fde14987562b1d796ebf6621d7980b28 100644
--- a/tensorflow/core/kernels/cast_op_impl_float.cc
+++ b/tensorflow/core/kernels/cast_op_impl_float.cc
@@ -25,18 +25,6 @@ typedef Eigen::GpuDevice GPUDevice;
 std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
 GetCpuCastFromFloat(DataType dst_dtype) {
   CURRY_TYPES3(CAST_CASE, CPUDevice, float);
-  if (dst_dtype == DT_BFLOAT16) {
-    return [](OpKernelContext* ctx, const Tensor& inp, Tensor* out) {
-      int64 N = out->NumElements();
-      auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads();
-      auto work = [&inp, &out](int64 start, int64 end) {
-        FloatToBFloat16(inp.flat<float>().data() + start,
-                        out->flat<bfloat16>().data() + start, end - start);
-      };
-      Shard(worker_threads->num_threads, worker_threads->workers, N, 2, work);
-    };
-  }
-
   return nullptr;
 }
 
@@ -44,7 +32,6 @@ GetCpuCastFromFloat(DataType dst_dtype) {
 std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
 GetGpuCastFromFloat(DataType dst_dtype) {
   CURRY_TYPES3(CAST_CASE, GPUDevice, float);
-  CAST_CASE(GPUDevice, float, bfloat16);
   return nullptr;
 }
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cast_op_impl_half.cc b/tensorflow/core/kernels/cast_op_impl_half.cc
index e89d4646d71c5938543f626ffb1bca1f84b6b66f..ef4b94e3263054f46bbe5b7c5487cc7b30990995 100644
--- a/tensorflow/core/kernels/cast_op_impl_half.cc
+++ b/tensorflow/core/kernels/cast_op_impl_half.cc
@@ -29,7 +29,7 @@ GetCpuCastFromHalf(DataType dst_dtype) {
 #if GOOGLE_CUDA
 std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
 GetGpuCastFromHalf(DataType dst_dtype) {
-  CURRY_TYPES3(CAST_CASE, GPUDevice, Eigen::half);
+  CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, Eigen::half);
   return nullptr;
 }
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cast_op_impl_int16.cc b/tensorflow/core/kernels/cast_op_impl_int16.cc
index f12d852e957550e17521a4dc5b70d3807dacecd3..59360f744573803f44cf7c31d5acf836e580fdc4 100644
--- a/tensorflow/core/kernels/cast_op_impl_int16.cc
+++ b/tensorflow/core/kernels/cast_op_impl_int16.cc
@@ -29,7 +29,7 @@ GetCpuCastFromInt16(DataType dst_dtype) {
 #if GOOGLE_CUDA
 std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
 GetGpuCastFromInt16(DataType dst_dtype) {
-  CURRY_TYPES3(CAST_CASE, GPUDevice, int16);
+  CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, int16);
   return nullptr;
 }
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cast_op_impl_int32.cc b/tensorflow/core/kernels/cast_op_impl_int32.cc
index 2a4b27a12dcba71f904f980c13d82fbc0dfc3ae2..a867392fde1c4aa45960bd5a490c2664c0ba0005 100644
--- a/tensorflow/core/kernels/cast_op_impl_int32.cc
+++ b/tensorflow/core/kernels/cast_op_impl_int32.cc
@@ -29,7 +29,7 @@ GetCpuCastFromInt32(DataType dst_dtype) {
 #if GOOGLE_CUDA
 std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
 GetGpuCastFromInt32(DataType dst_dtype) {
-  CURRY_TYPES3(CAST_CASE, GPUDevice, int32);
+  CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, int32);
   return nullptr;
 }
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cast_op_impl_int64.cc b/tensorflow/core/kernels/cast_op_impl_int64.cc
index 065defabbba6c926640b01eb68e4c0b20a8e241d..467a8f6c89b35ea1f1c8da1327f6d204b7e876b0 100644
--- a/tensorflow/core/kernels/cast_op_impl_int64.cc
+++ b/tensorflow/core/kernels/cast_op_impl_int64.cc
@@ -29,7 +29,7 @@ GetCpuCastFromInt64(DataType dst_dtype) {
 #if GOOGLE_CUDA
 std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
 GetGpuCastFromInt64(DataType dst_dtype) {
-  CURRY_TYPES3(CAST_CASE, GPUDevice, int64);
+  CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, int64);
   return nullptr;
 }
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cast_op_impl_int8.cc b/tensorflow/core/kernels/cast_op_impl_int8.cc
index 8d678c47335f6f985c8e93bb72f61c95fbda834b..21002a4321be4474fdd6f60e61afce539d2ea177 100644
--- a/tensorflow/core/kernels/cast_op_impl_int8.cc
+++ b/tensorflow/core/kernels/cast_op_impl_int8.cc
@@ -29,7 +29,7 @@ GetCpuCastFromInt8(DataType dst_dtype) {
 #if GOOGLE_CUDA
 std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
 GetGpuCastFromInt8(DataType dst_dtype) {
-  CURRY_TYPES3(CAST_CASE, GPUDevice, int8);
+  CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, int8);
   return nullptr;
 }
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cast_op_impl_uint16.cc b/tensorflow/core/kernels/cast_op_impl_uint16.cc
index c917aaf7bde6eccf9acb556152ee854c897143fa..cd829bae2a90af6daecf1a6f67be96cdb1854140 100644
--- a/tensorflow/core/kernels/cast_op_impl_uint16.cc
+++ b/tensorflow/core/kernels/cast_op_impl_uint16.cc
@@ -29,7 +29,7 @@ GetCpuCastFromUint16(DataType dst_dtype) {
 #if GOOGLE_CUDA
 std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
 GetGpuCastFromUint16(DataType dst_dtype) {
-  CURRY_TYPES3(CAST_CASE, GPUDevice, uint16);
+  CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, uint16);
   return nullptr;
 }
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cast_op_impl_uint8.cc b/tensorflow/core/kernels/cast_op_impl_uint8.cc
index 377c8ca9536d69a49364e8f5f2b6ffb969ce234e..2d1a6f3a4edc72bfea53a2b95113d32e5b76c913 100644
--- a/tensorflow/core/kernels/cast_op_impl_uint8.cc
+++ b/tensorflow/core/kernels/cast_op_impl_uint8.cc
@@ -29,7 +29,7 @@ GetCpuCastFromUint8(DataType dst_dtype) {
 #if GOOGLE_CUDA
 std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
 GetGpuCastFromUint8(DataType dst_dtype) {
-  CURRY_TYPES3(CAST_CASE, GPUDevice, uint8);
+  CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, uint8);
   return nullptr;
 }
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cast_op_test.cc b/tensorflow/core/kernels/cast_op_test.cc
index 057e209a71903ad24e2d4f757e4d2a3bc4357a76..7da9d28a3daf175e3cf6f2a667ea1213f83ab003 100644
--- a/tensorflow/core/kernels/cast_op_test.cc
+++ b/tensorflow/core/kernels/cast_op_test.cc
@@ -75,7 +75,8 @@ class CastOpTest : public OpsTestBase {
   TEST_CAST(in, int64);         \
   TEST_CAST(in, half);          \
   TEST_CAST(in, float);         \
-  TEST_CAST(in, double)
+  TEST_CAST(in, double);        \
+  TEST_CAST(in, bfloat16);
 
 TEST_ALL_CASTS_FROM(uint8)
 TEST_ALL_CASTS_FROM(uint16)
@@ -85,6 +86,7 @@ TEST_ALL_CASTS_FROM(int64)
 TEST_ALL_CASTS_FROM(half)
 TEST_ALL_CASTS_FROM(float)
 TEST_ALL_CASTS_FROM(double)
+TEST_ALL_CASTS_FROM(bfloat16)
 
 #undef TEST_ALL_CASTS_FROM
 #undef TEST_CAST
diff --git a/tensorflow/core/kernels/concat_op.cc b/tensorflow/core/kernels/concat_op.cc
index 7011550f7e161c9727b8d31eff0917964b09044e..f16766315f2640ab7c42c077fc5156a3a825fbf9 100644
--- a/tensorflow/core/kernels/concat_op.cc
+++ b/tensorflow/core/kernels/concat_op.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include <limits>
 #include <vector>
 
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -28,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 namespace tensorflow {
 
@@ -53,17 +53,38 @@ class ConcatBaseOp : public OpKernel {
   void Compute(OpKernelContext* c) override {
     const Tensor* concat_dim_tensor;
     const char* axis_attribute_name =
-        AxisArgName == NAME_IS_AXIS
-            ? "axis"
-            : AxisArgName == NAME_IS_CONCAT_DIM ? "concat_dim" : "<invalid>";
+        AxisArgName == NAME_IS_AXIS ? "axis" : AxisArgName == NAME_IS_CONCAT_DIM
+                                                   ? "concat_dim"
+                                                   : "<invalid>";
     OP_REQUIRES_OK(c, c->input(axis_attribute_name, &concat_dim_tensor));
     OP_REQUIRES(c, IsLegacyScalar(concat_dim_tensor->shape()),
                 errors::InvalidArgument(
                     axis_attribute_name,
                     " tensor should be a scalar integer, but got shape ",
                     concat_dim_tensor->shape().DebugString()));
-    const int32 concat_dim =
-        internal::SubtleMustCopy(concat_dim_tensor->scalar<int32>()());
+    int64 concat_dim;
+    // In case of ConcatV2, "axis" could be int32 or int64
+    if (AxisArgName == NAME_IS_AXIS) {
+      OP_REQUIRES(
+          c, (concat_dim_tensor->dtype() == DT_INT32 ||
+              concat_dim_tensor->dtype() == DT_INT64),
+          errors::InvalidArgument(axis_attribute_name,
+                                  " tensor should be int32 or int64, but got ",
+                                  concat_dim_tensor->dtype()));
+    } else {
+      OP_REQUIRES(c, (concat_dim_tensor->dtype() == DT_INT32),
+                  errors::InvalidArgument(axis_attribute_name,
+                                          " tensor should be int32, but got ",
+                                          concat_dim_tensor->dtype()));
+    }
+    if (concat_dim_tensor->dtype() == DT_INT32) {
+      concat_dim =
+          internal::SubtleMustCopy(concat_dim_tensor->scalar<int32>()());
+    } else {
+      concat_dim =
+          internal::SubtleMustCopy(concat_dim_tensor->scalar<int64>()());
+    }
+
     OpInputList values;
     OP_REQUIRES_OK(c, c->input_list("values", &values));
     const int N = values.size();
@@ -154,17 +175,16 @@ using ConcatOp = ConcatBaseOp<Device, T, NAME_IS_CONCAT_DIM>;
 template <typename Device, typename T>
 using ConcatV2Op = ConcatBaseOp<Device, T, NAME_IS_AXIS>;
 
-#define REGISTER_CONCAT(type)                                \
-  REGISTER_KERNEL_BUILDER(Name("Concat")                     \
-                              .Device(DEVICE_CPU)            \
-                              .TypeConstraint<type>("T")     \
-                              .HostMemory("concat_dim"),     \
-                          ConcatOp<CPUDevice, type>)         \
-  REGISTER_KERNEL_BUILDER(Name("ConcatV2")                   \
-                              .Device(DEVICE_CPU)            \
-                              .TypeConstraint<type>("T")     \
-                              .TypeConstraint<int32>("Tidx") \
-                              .HostMemory("axis"),           \
+#define REGISTER_CONCAT(type)                            \
+  REGISTER_KERNEL_BUILDER(Name("Concat")                 \
+                              .Device(DEVICE_CPU)        \
+                              .TypeConstraint<type>("T") \
+                              .HostMemory("concat_dim"), \
+                          ConcatOp<CPUDevice, type>)     \
+  REGISTER_KERNEL_BUILDER(Name("ConcatV2")               \
+                              .Device(DEVICE_CPU)        \
+                              .TypeConstraint<type>("T") \
+                              .HostMemory("axis"),       \
                           ConcatV2Op<CPUDevice, type>)
 
 TF_CALL_POD_STRING_TYPES(REGISTER_CONCAT);
@@ -178,17 +198,16 @@ REGISTER_CONCAT(qint32);
 
 #if GOOGLE_CUDA
 
-#define REGISTER_GPU(type)                                   \
-  REGISTER_KERNEL_BUILDER(Name("Concat")                     \
-                              .Device(DEVICE_GPU)            \
-                              .TypeConstraint<type>("T")     \
-                              .HostMemory("concat_dim"),     \
-                          ConcatOp<GPUDevice, type>)         \
-  REGISTER_KERNEL_BUILDER(Name("ConcatV2")                   \
-                              .Device(DEVICE_GPU)            \
-                              .TypeConstraint<type>("T")     \
-                              .TypeConstraint<int32>("Tidx") \
-                              .HostMemory("axis"),           \
+#define REGISTER_GPU(type)                               \
+  REGISTER_KERNEL_BUILDER(Name("Concat")                 \
+                              .Device(DEVICE_GPU)        \
+                              .TypeConstraint<type>("T") \
+                              .HostMemory("concat_dim"), \
+                          ConcatOp<GPUDevice, type>)     \
+  REGISTER_KERNEL_BUILDER(Name("ConcatV2")               \
+                              .Device(DEVICE_GPU)        \
+                              .TypeConstraint<type>("T") \
+                              .HostMemory("axis"),       \
                           ConcatV2Op<GPUDevice, type>)
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
@@ -212,7 +231,6 @@ REGISTER_KERNEL_BUILDER(Name("Concat")
 REGISTER_KERNEL_BUILDER(Name("ConcatV2")
                             .Device(DEVICE_GPU)
                             .TypeConstraint<int32>("T")
-                            .TypeConstraint<int32>("Tidx")
                             .HostMemory("values")
                             .HostMemory("axis")
                             .HostMemory("output"),
@@ -221,17 +239,16 @@ REGISTER_KERNEL_BUILDER(Name("ConcatV2")
 #endif  // GOOGLE_CUDA
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL(type)                                  \
-  REGISTER_KERNEL_BUILDER(Name("Concat")                     \
-                              .Device(DEVICE_SYCL)           \
-                              .TypeConstraint<type>("T")     \
-                              .HostMemory("concat_dim"),     \
-                          ConcatOp<SYCLDevice, type>)        \
-  REGISTER_KERNEL_BUILDER(Name("ConcatV2")                   \
-                              .Device(DEVICE_SYCL)           \
-                              .TypeConstraint<type>("T")     \
-                              .TypeConstraint<int32>("Tidx") \
-                              .HostMemory("axis"),           \
+#define REGISTER_SYCL(type)                              \
+  REGISTER_KERNEL_BUILDER(Name("Concat")                 \
+                              .Device(DEVICE_SYCL)       \
+                              .TypeConstraint<type>("T") \
+                              .HostMemory("concat_dim"), \
+                          ConcatOp<SYCLDevice, type>)    \
+  REGISTER_KERNEL_BUILDER(Name("ConcatV2")               \
+                              .Device(DEVICE_SYCL)       \
+                              .TypeConstraint<type>("T") \
+                              .HostMemory("axis"),       \
                           ConcatV2Op<SYCLDevice, type>)
 
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL);
@@ -246,7 +263,6 @@ REGISTER_KERNEL_BUILDER(Name("Concat")
 REGISTER_KERNEL_BUILDER(Name("ConcatV2")
                             .Device(DEVICE_SYCL)
                             .TypeConstraint<int32>("T")
-                            .TypeConstraint<int32>("Tidx")
                             .HostMemory("values")
                             .HostMemory("axis")
                             .HostMemory("output"),
diff --git a/tensorflow/core/kernels/conv_grad_filter_ops.cc b/tensorflow/core/kernels/conv_grad_filter_ops.cc
index e6ae59529107e529a9ccf7c790da0da62c90c199..66ee474ca3f72c283b2a300e90f7377a68911b7b 100644
--- a/tensorflow/core/kernels/conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_filter_ops.cc
@@ -520,6 +520,7 @@ class Conv2DCustomBackpropFilterOp : public OpKernel {
 
 TF_CALL_half(REGISTER_CPU_KERNELS);
 TF_CALL_float(REGISTER_CPU_KERNELS);
+TF_CALL_double(REGISTER_CPU_KERNELS);
 #undef REGISTER_CPU_KERNELS
 
 // GPU definitions.
@@ -1017,11 +1018,17 @@ namespace functor {
       typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format); \
   extern template struct PadInput<GPUDevice, T, int, 4>;
 
+DECLARE_GPU_SPEC(double);
 DECLARE_GPU_SPEC(float);
 DECLARE_GPU_SPEC(Eigen::half);
 #undef DECLARE_GPU_SPEC
 }  // namespace functor
 
+REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter")
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<double>("T")
+                            .HostMemory("filter_sizes"),
+                        Conv2DSlowBackpropFilterOp<GPUDevice, double>);
 REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter")
                             .Device(DEVICE_GPU)
                             .TypeConstraint<float>("T")
diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc
index 15c55e4d9903b3bbd53e1b6e1c95571ef7834015..71ea0d5d720df3c8070bce81fc8608b438617220 100644
--- a/tensorflow/core/kernels/conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_input_ops.cc
@@ -592,6 +592,7 @@ class Conv2DCustomBackpropInputOp : public OpKernel {
 
 TF_CALL_half(REGISTER_CPU_KERNELS);
 TF_CALL_float(REGISTER_CPU_KERNELS);
+TF_CALL_double(REGISTER_CPU_KERNELS);
 #undef REGISTER_CPU_KERNELS
 
 // GPU definitions.
@@ -1090,11 +1091,17 @@ namespace functor {
       typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format); \
   extern template struct PadInput<GPUDevice, T, int, 4>;
 
+DECLARE_GPU_SPEC(double);
 DECLARE_GPU_SPEC(float);
 DECLARE_GPU_SPEC(Eigen::half);
 #undef DECLARE_GPU_SPEC
 }  // namespace functor
 
+REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<double>("T")
+                            .HostMemory("input_sizes"),
+                        Conv2DSlowBackpropInputOp<GPUDevice, double>);
 REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
                             .Device(DEVICE_GPU)
                             .TypeConstraint<float>("T")
diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
index 47f6907c04b4e48607e66b5c9601cd9030fa9001..88843e4da78a867ea5b7c30d6cb43855fdefdd13 100644
--- a/tensorflow/core/kernels/conv_ops.cc
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -446,10 +446,11 @@ class Conv2DOp : public BinaryOp<T> {
 #if !defined(USE_GEMM_FOR_CONV)
 TF_CALL_half(REGISTER_CPU);
 TF_CALL_float(REGISTER_CPU);
+TF_CALL_double(REGISTER_CPU);
 #endif  // USE_GEMM_FOR_CONV
 
 // To be used inside depthwise_conv_op.cc.
-template class LaunchConv2DOp<CPUDevice, float>;
+template struct LaunchConv2DOp<CPUDevice, float>;
 
 #if GOOGLE_CUDA
 int64 GetCudnnWorkspaceLimit(const string& envvar_in_mb,
@@ -810,6 +811,7 @@ namespace functor {
       typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format);     \
   extern template struct PadInput<GPUDevice, T, int, 4>
 
+DECLARE_GPU_SPEC(double);
 DECLARE_GPU_SPEC(float);
 DECLARE_GPU_SPEC(Eigen::half);
 #undef DECLARE_GPU_SPEC
@@ -822,6 +824,9 @@ REGISTER_KERNEL_BUILDER(
 REGISTER_KERNEL_BUILDER(
     Name("Conv2D").Device(DEVICE_GPU).TypeConstraint<float>("T"),
     Conv2DOp<GPUDevice, float>);
+REGISTER_KERNEL_BUILDER(
+    Name("Conv2D").Device(DEVICE_GPU).TypeConstraint<double>("T"),
+    Conv2DOp<GPUDevice, double>);
 
 // To be used inside depthwise_conv_op.cc.
 template class LaunchConv2DOp<GPUDevice, float>;
diff --git a/tensorflow/core/kernels/conv_ops_gpu_2.cu.cc b/tensorflow/core/kernels/conv_ops_gpu_2.cu.cc
index b5dd26a9e47578619945da21c85b4c5b40a55132..52859af950e3c346536acd246bafde830b405ee5 100644
--- a/tensorflow/core/kernels/conv_ops_gpu_2.cu.cc
+++ b/tensorflow/core/kernels/conv_ops_gpu_2.cu.cc
@@ -25,6 +25,9 @@ limitations under the License.
 namespace tensorflow {
 
 typedef Eigen::GpuDevice GPUDevice;
+template struct functor::InflatePadAndShuffle<GPUDevice, double, 4, int>;
+template struct functor::InflatePadAndShuffle<GPUDevice, double, 4,
+                                              Eigen::DenseIndex>;
 template struct functor::InflatePadAndShuffle<GPUDevice, float, 4, int>;
 template struct functor::InflatePadAndShuffle<GPUDevice, float, 4,
                                               Eigen::DenseIndex>;
diff --git a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
index a376534badc73065e3ec01972dde85da7bbdb0f8..2503b475dc10e631863e06b1e4d6931928fb4321 100644
--- a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
+++ b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
@@ -1039,9 +1039,11 @@ template struct functor::SwapDimension0And2InTensor3<GPUDevice, double2,
                                                      /*conjugate=*/true>;
 
 // For 2d ops.
+template struct functor::TransformFilter<GPUDevice, double, int, 4>;
 template struct functor::TransformFilter<GPUDevice, float, int, 4>;
 template struct functor::TransformFilter<GPUDevice, Eigen::half, int, 4>;
 
+template struct functor::ReverseTransformFilter<GPUDevice, double, 4>;
 template struct functor::ReverseTransformFilter<GPUDevice, float, 4>;
 template struct functor::ReverseTransformFilter<GPUDevice, Eigen::half, 4>;
 
@@ -1054,6 +1056,7 @@ template struct functor::NCHWToNHWC<GPUDevice, float, 4>;
 template struct functor::NCHWToNHWC<GPUDevice, Eigen::half, 4>;
 
 template struct functor::PadInput<GPUDevice, int, int, 4>;
+template struct functor::PadInput<GPUDevice, double, int, 4>;
 template struct functor::PadInput<GPUDevice, float, int, 4>;
 template struct functor::PadInput<GPUDevice, Eigen::half, int, 4>;
 
diff --git a/tensorflow/contrib/cudnn_rnn/kernels/cudnn_rnn_ops.cc b/tensorflow/core/kernels/cudnn_rnn_ops.cc
similarity index 100%
rename from tensorflow/contrib/cudnn_rnn/kernels/cudnn_rnn_ops.cc
rename to tensorflow/core/kernels/cudnn_rnn_ops.cc
diff --git a/tensorflow/core/kernels/cwise_op_div.cc b/tensorflow/core/kernels/cwise_op_div.cc
index c71c756e4461d4ed36628ea8a4f8a0922896302c..b12652f7fba4ea8a9bd4ec18b79469ad69e79902 100644
--- a/tensorflow/core/kernels/cwise_op_div.cc
+++ b/tensorflow/core/kernels/cwise_op_div.cc
@@ -16,14 +16,14 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER5(BinaryOp, CPU, "Div", functor::div, float, Eigen::half, double,
-          complex64, complex128);
+REGISTER6(BinaryOp, CPU, "Div", functor::div, float, Eigen::half, double,
+          bfloat16, complex64, complex128);
 REGISTER5(BinaryOp, CPU, "Div", functor::safe_div, uint8, uint16, int16, int32,
           int64);
 REGISTER5(BinaryOp, CPU, "TruncateDiv", functor::safe_div, uint8, uint16, int16,
           int32, int64);
-REGISTER5(BinaryOp, CPU, "RealDiv", functor::div, float, Eigen::half, double,
-          complex64, complex128);
+REGISTER6(BinaryOp, CPU, "RealDiv", functor::div, float, Eigen::half, double,
+          bfloat16, complex64, complex128);
 #if GOOGLE_CUDA
 REGISTER9(BinaryOp, GPU, "Div", functor::div, float, Eigen::half, double, uint8,
           uint16, int16, int64, complex64, complex128);
diff --git a/tensorflow/core/kernels/cwise_op_less.cc b/tensorflow/core/kernels/cwise_op_less.cc
index 00cdecdbd184b84b6601eda76dd5dfded5aa1e1b..575968126fa82d585fcda9490da5cd69332366c6 100644
--- a/tensorflow/core/kernels/cwise_op_less.cc
+++ b/tensorflow/core/kernels/cwise_op_less.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER8(BinaryOp, CPU, "Less", functor::less, float, Eigen::half, double,
-          int32, int64, uint8, int8, int16);
+REGISTER9(BinaryOp, CPU, "Less", functor::less, float, Eigen::half, double,
+          bfloat16, int32, int64, uint8, int8, int16);
 #if GOOGLE_CUDA
 REGISTER7(BinaryOp, GPU, "Less", functor::less, float, Eigen::half, double,
           int64, uint8, int8, int16);
diff --git a/tensorflow/core/kernels/cwise_op_less_equal.cc b/tensorflow/core/kernels/cwise_op_less_equal.cc
index 11806c5fc774dc3a37abc733127e4b6660f27f9c..499200d0546ccf1d9119b63a9e552908de3d1ae1 100644
--- a/tensorflow/core/kernels/cwise_op_less_equal.cc
+++ b/tensorflow/core/kernels/cwise_op_less_equal.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER8(BinaryOp, CPU, "LessEqual", functor::less_equal, float, Eigen::half,
-          double, int32, int64, uint8, int8, int16);
+REGISTER9(BinaryOp, CPU, "LessEqual", functor::less_equal, float, Eigen::half,
+          bfloat16, double, int32, int64, uint8, int8, int16);
 #if GOOGLE_CUDA
 REGISTER7(BinaryOp, GPU, "LessEqual", functor::less_equal, float, Eigen::half,
           double, int64, uint8, int8, int16);
diff --git a/tensorflow/core/kernels/cwise_op_log.cc b/tensorflow/core/kernels/cwise_op_log.cc
index 98936e0f960f1f407c2187746ca80d3db0a93412..5d17c890cfec77cd3f50ee649adf4af6e20b5ed7 100644
--- a/tensorflow/core/kernels/cwise_op_log.cc
+++ b/tensorflow/core/kernels/cwise_op_log.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER5(UnaryOp, CPU, "Log", functor::log, float, Eigen::half, double,
-          complex64, complex128);
+REGISTER6(UnaryOp, CPU, "Log", functor::log, float, Eigen::half, double,
+          bfloat16, complex64, complex128);
 
 #if GOOGLE_CUDA
 REGISTER3(UnaryOp, GPU, "Log", functor::log, float, Eigen::half, double);
diff --git a/tensorflow/core/kernels/cwise_op_minimum.cc b/tensorflow/core/kernels/cwise_op_minimum.cc
index dff83df828f076a076a8f220d04974344d8ffafc..9bc37003879f077288dfc058996e9b0b4162d16e 100644
--- a/tensorflow/core/kernels/cwise_op_minimum.cc
+++ b/tensorflow/core/kernels/cwise_op_minimum.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER5(BinaryOp, CPU, "Minimum", functor::minimum, float, Eigen::half,
-          double, int32, int64);
+REGISTER6(BinaryOp, CPU, "Minimum", functor::minimum, float, Eigen::half,
+          bfloat16, double, int32, int64);
 #if GOOGLE_CUDA
 REGISTER4(BinaryOp, GPU, "Minimum", functor::minimum, float, Eigen::half,
           double, int64);
diff --git a/tensorflow/core/kernels/cwise_op_sqrt.cc b/tensorflow/core/kernels/cwise_op_sqrt.cc
index 497756133d05249141823481e6ef43b73a84660b..205070761f13cbfe6b509eea2d6b36c2f0f37f04 100644
--- a/tensorflow/core/kernels/cwise_op_sqrt.cc
+++ b/tensorflow/core/kernels/cwise_op_sqrt.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER5(UnaryOp, CPU, "Sqrt", functor::sqrt, float, Eigen::half, double,
-          complex64, complex128);
+REGISTER6(UnaryOp, CPU, "Sqrt", functor::sqrt, float, Eigen::half, double,
+          bfloat16, complex64, complex128);
 
 #if GOOGLE_CUDA
 REGISTER3(UnaryOp, GPU, "Sqrt", functor::sqrt, float, Eigen::half, double);
@@ -27,8 +27,8 @@ REGISTER3(UnaryOp, GPU, "Sqrt", functor::sqrt, float, Eigen::half, double);
 REGISTER2(UnaryOp, SYCL, "Sqrt", functor::sqrt, float, double);
 #endif  // TENSORFLOW_USE_SYCL
 
-REGISTER5(SimpleBinaryOp, CPU, "SqrtGrad", functor::sqrt_grad, float,
-          Eigen::half, double, complex64, complex128);
+REGISTER6(SimpleBinaryOp, CPU, "SqrtGrad", functor::sqrt_grad, float,
+          Eigen::half, bfloat16, double, complex64, complex128);
 #if GOOGLE_CUDA
 REGISTER3(SimpleBinaryOp, GPU, "SqrtGrad", functor::sqrt_grad, float,
           Eigen::half, double);
diff --git a/tensorflow/core/kernels/cwise_ops.h b/tensorflow/core/kernels/cwise_ops.h
index 06918075a42648a3cf7135376d728fa466e7c469..a80905d1450cc38619bb27c2e27eda58b3cf169d 100644
--- a/tensorflow/core/kernels/cwise_ops.h
+++ b/tensorflow/core/kernels/cwise_ops.h
@@ -27,27 +27,6 @@ limitations under the License.
 #include "tensorflow/core/kernels/bounds_check.h"
 
 namespace Eigen {
-namespace numext {
-#if GOOGLE_CUDA
-template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::complex<float> exp(
-    const std::complex<float>& x) {
-  auto com = ::expf(x.real());
-  auto res_real = com * ::cosf(x.imag());
-  auto res_imag = com * ::sinf(x.imag());
-  return std::complex<float>(res_real, res_imag);
-}
-template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::complex<double> exp(
-    const std::complex<double>& x) {
-  auto com = ::exp(x.real());
-  auto res_real = com * ::cos(x.imag());
-  auto res_imag = com * ::sin(x.imag());
-  return std::complex<double>(res_real, res_imag);
-}
-#endif
-}  // namespace numext
-
 namespace internal {
 
 template <typename T>
diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD
index 01754ec21acd2196dd907747da45071022bcebc9..8c4f0218ee174328ec5c2484d08d639251047728 100644
--- a/tensorflow/core/kernels/data/BUILD
+++ b/tensorflow/core/kernels/data/BUILD
@@ -10,18 +10,7 @@ licenses(["notice"])  # Apache 2.0
 load(
     "//tensorflow:tensorflow.bzl",
     "tf_kernel_library",
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
+    "tf_cc_test",
 )
 
 cc_library(
@@ -295,11 +284,31 @@ tf_kernel_library(
     ],
 )
 
+cc_library(
+    name = "prefetch_autotuner",
+    srcs = ["prefetch_autotuner.cc"],
+    hdrs = ["prefetch_autotuner.h"],
+    deps = [
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "prefetch_autotuner_test",
+    srcs = ["prefetch_autotuner_test.cc"],
+    deps = [
+        ":prefetch_autotuner",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 tf_kernel_library(
     name = "prefetch_dataset_op",
     srcs = ["prefetch_dataset_op.cc"],
     deps = [
         ":dataset",
+        ":prefetch_autotuner",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
diff --git a/tensorflow/core/kernels/data/cache_dataset_ops.cc b/tensorflow/core/kernels/data/cache_dataset_ops.cc
index f0a2192826e051586e4999d729c24ed5495be0ea..4b4728dab68523aa54176bdce6222a7aa5f8e9d3 100644
--- a/tensorflow/core/kernels/data/cache_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/cache_dataset_ops.cc
@@ -308,6 +308,21 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
             input_impl_(params.dataset->input_->MakeIterator(params.prefix)),
             cache_(new std::vector<std::vector<Tensor>>) {}
 
+      ~MemoryWriterIterator() override {
+        mutex_lock l(mu_);
+        if (cache_) {
+          LOG(ERROR)
+              << "The calling iterator did not fully read the dataset we were "
+                 "attempting to cache. In order to avoid unexpected truncation "
+                 "of the sequence, the current [partially cached] sequence "
+                 "will be dropped. This can occur if you have a sequence "
+                 "similar to `dataset.cache().take(k).repeat()`. Instead, swap "
+                 "the order (i.e. `dataset.take(k).cache().repeat()`)";
+          mutex_lock l2(dataset()->mu_);
+          dataset()->writer_iterator_created_ = false;
+        }
+      }
+
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
                              bool* end_of_sequence) override {
@@ -318,7 +333,7 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
           // Guard on cache_ to not crash if GetNext is called a second time
           // after *end_of_sequence == true
           if (cache_) {
-            mutex_lock l2(dataset()->mu_);
+            mutex_lock l(dataset()->mu_);
             DCHECK(dataset()->writer_iterator_created_);
             DCHECK(!dataset()->cache_);
             cache_.swap(dataset()->cache_);
diff --git a/tensorflow/core/kernels/data/group_by_window_dataset_op.cc b/tensorflow/core/kernels/data/group_by_window_dataset_op.cc
index 834c06bb930d1c723c5b3f880dcc13a892bb44f7..46f43dd1b1dcd79e1fc1f8fadc24858f3f7eae9f 100644
--- a/tensorflow/core/kernels/data/group_by_window_dataset_op.cc
+++ b/tensorflow/core/kernels/data/group_by_window_dataset_op.cc
@@ -263,6 +263,11 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
                 }
                 const int64 window_size =
                     window_size_func_output[0].scalar<int64>()();
+                if (window_size <= 0) {
+                  return errors::InvalidArgument(
+                      "Window size must be greater than zero, but got ",
+                      window_size, ".");
+                }
                 window_sizes_[key] = window_size;
               }
 
diff --git a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
index 9ce263732f6e6c907dfdc89692455daa5dca86d1..aaf4dc734183968359b03819bfc04ae544c8877a 100644
--- a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
@@ -66,12 +66,16 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
                 errors::InvalidArgument(
                     "num_parallel_batches must be greater than zero."));
 
+    bool drop_remainder;
+    OP_REQUIRES_OK(ctx,
+                   ParseScalarArgument(ctx, "drop_remainder", &drop_remainder));
+
     std::unique_ptr<CapturedFunction> captured_func;
     OP_REQUIRES_OK(ctx, CapturedFunction::Create(
                             func_, std::move(other_arguments), &captured_func));
 
     *output = new Dataset(input, batch_size, num_parallel_batches,
-                          output_types_, output_shapes_,
+                          drop_remainder, output_types_, output_shapes_,
                           std::move(captured_func), &ctx->eigen_cpu_device());
   }
 
@@ -79,13 +83,15 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
   class Dataset : public DatasetBase {
    public:
     Dataset(const DatasetBase* input, int64 batch_size,
-            int64 num_parallel_batches, const DataTypeVector& output_types,
+            int64 num_parallel_batches, bool drop_remainder,
+            const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes,
             std::unique_ptr<CapturedFunction> captured_func,
             const Eigen::ThreadPoolDevice* device)
         : input_(input),
           batch_size_(batch_size),
           num_parallel_batches_(num_parallel_batches),
+          drop_remainder_(drop_remainder),
           output_types_(output_types),
           output_shapes_(output_shapes),
           captured_func_(std::move(captured_func)),
@@ -177,13 +183,21 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
           batch_results_[current_batch_index_].output.clear();
         } else {
           if (num_elements < dataset()->batch_size_) {
+            if (dataset()->drop_remainder_) {
+              // Deallocate tensors allocated for the output.
+              batch_results_[current_batch_index_].output.clear();
+              *end_of_sequence = true;
+              return Status::OK();
+            }
             const std::vector<Tensor>& output =
                 batch_results_[current_batch_index_].output;
             for (size_t i = 0; i < output.size(); ++i) {
               TensorShape component_shape(
                   batch_results_[current_batch_index_].output[i].shape());
               component_shape.set_dim(0, num_elements);
-              Tensor component(ctx->allocator({}), output[i].dtype(),
+              AllocatorAttributes attr;
+              attr.set_gpu_compatible(true);
+              Tensor component(ctx->allocator(attr), output[i].dtype(),
                                component_shape);
               TF_RETURN_IF_ERROR(
                   CopyPartialBatch(&component, output[i], num_elements));
@@ -255,7 +269,9 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         for (size_t i = 0; i < num_components; ++i) {
           TensorShape component_shape({dataset()->batch_size_});
           component_shape.AppendShape(return_values[i].shape());
-          Tensor component(ctx->allocator({}), return_values[i].dtype(),
+          AllocatorAttributes attr;
+          attr.set_gpu_compatible(true);
+          Tensor component(ctx->allocator(attr), return_values[i].dtype(),
                            component_shape);
           batch_result->output.emplace_back(std::move(component));
         }
@@ -388,6 +404,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
     const NameAttrList func_;
     const int64 batch_size_;
     const int64 num_parallel_batches_;
+    const bool drop_remainder_;
     const DataTypeVector output_types_;
     const std::vector<PartialTensorShape> output_shapes_;
     const std::unique_ptr<CapturedFunction> captured_func_;
diff --git a/tensorflow/core/kernels/data/prefetch_autotuner.cc b/tensorflow/core/kernels/data/prefetch_autotuner.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b3272f6bcde56029d878f3e61a7809594db86b24
--- /dev/null
+++ b/tensorflow/core/kernels/data/prefetch_autotuner.cc
@@ -0,0 +1,46 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/data/prefetch_autotuner.h"
+
+namespace tensorflow {
+
+PrefetchAutotuner::PrefetchAutotuner(int64 initial_buffer_size)
+    : buffer_limit_(initial_buffer_size) {
+  if (initial_buffer_size == kAutoTune) {
+    mode_ = Mode::kUpswing;
+    buffer_limit_ = 1;
+  }
+}
+
+void PrefetchAutotuner::RecordConsumption(size_t current_buffer_size) {
+  switch (mode_) {
+    case Mode::kDisabled:
+      return;
+    case Mode::kUpswing:
+      if (current_buffer_size == buffer_limit_) {
+        mode_ = Mode::kDownswing;
+      }
+      return;
+    case Mode::kDownswing:
+      if (current_buffer_size == 0) {
+        buffer_limit_ *= 2;  // Increase the buffer size.
+        mode_ = Mode::kUpswing;
+      }
+      return;
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/prefetch_autotuner.h b/tensorflow/core/kernels/data/prefetch_autotuner.h
new file mode 100644
index 0000000000000000000000000000000000000000..fa8a1840723ec8af1d1314af1b89bed7f120abc0
--- /dev/null
+++ b/tensorflow/core/kernels/data/prefetch_autotuner.h
@@ -0,0 +1,71 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_PREFETCH_AUTOTUNER_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_PREFETCH_AUTOTUNER_H_
+
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+// PrefetchAutotuner dynamically adjusts the buffer size of a prefetch iterator.
+//
+// PrefetchAutotuner attempts to find the minimum buffer size such that there is
+// always at least 1 element in the prefetch queue every time the downstream
+// iterator calls GetNext().
+//
+// One common failure mode of input pipelines is being throughput bound. No
+// amount of prefetching can address that performance mode. In order to guard
+// against this condition, PrefetchAutotuner will only increase the buffer_limit
+// if the prefetching thread is able to successfully fill the buffer at its
+// current size.
+//
+// Note: in the current implementation, we never decrease the buffer_limit().
+// This should change in the future!
+//
+// PrefetchAutotuner is NOT thread safe.
+class PrefetchAutotuner {
+ public:
+  static const int64 kAutoTune = -1;
+
+  explicit PrefetchAutotuner(int64 initial_buffer_size);
+
+  int64 buffer_limit() const { return buffer_limit_; }
+
+  void RecordConsumption(size_t current_buffer_size);
+  void RecordEmpty() { RecordConsumption(0); }
+
+ private:
+  // PrefetchAutotuner operates as a state machine.
+  enum class Mode {
+    // Disables the autotuning.
+    kDisabled,
+
+    // We have increased the size of the buffer, and will transition to
+    // kDownswing if we successfully fill the buffer.
+    kUpswing,
+
+    // We have successfully filled a buffer of this size. If we ever block the
+    // downstream iterator, we should increase the buffer size.
+    kDownswing,
+  };
+
+  int64 buffer_limit_;
+  Mode mode_ = Mode::kDisabled;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_PREFETCH_AUTOTUNER_H_
diff --git a/tensorflow/core/kernels/data/prefetch_autotuner_test.cc b/tensorflow/core/kernels/data/prefetch_autotuner_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2f573dfb3555b2466d84c6341eaa77e69414d103
--- /dev/null
+++ b/tensorflow/core/kernels/data/prefetch_autotuner_test.cc
@@ -0,0 +1,82 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/data/prefetch_autotuner.h"
+
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+TEST(PrefetchAutotuner, Disabled) {
+  PrefetchAutotuner t(2);
+  EXPECT_EQ(2, t.buffer_limit());
+  t.RecordConsumption(0);
+  t.RecordConsumption(2);
+  t.RecordConsumption(0);
+  t.RecordConsumption(2);
+  EXPECT_EQ(2, t.buffer_limit());
+}
+
+TEST(PrefetchAutotuner, Enabled) {
+  PrefetchAutotuner t(PrefetchAutotuner::kAutoTune);
+  EXPECT_EQ(1, t.buffer_limit());
+  t.RecordConsumption(0);  // Expect buffer limit to increase.
+  EXPECT_EQ(1, t.buffer_limit());
+  t.RecordConsumption(1);
+  EXPECT_EQ(1, t.buffer_limit());
+  t.RecordConsumption(0);  // Expect buffer limit to increase.
+  EXPECT_EQ(2, t.buffer_limit());
+  t.RecordConsumption(2);
+  EXPECT_EQ(2, t.buffer_limit());
+  t.RecordConsumption(1);
+  EXPECT_EQ(2, t.buffer_limit());
+  t.RecordConsumption(0);  // Expect buffer limit to increase.
+  EXPECT_EQ(4, t.buffer_limit());
+  t.RecordConsumption(4);
+  EXPECT_EQ(4, t.buffer_limit());
+  t.RecordConsumption(0);  // Expect buffer limit to increase.
+  EXPECT_EQ(8, t.buffer_limit());
+  t.RecordConsumption(0);  // Expect buffer limit to stay the same!
+  EXPECT_EQ(8, t.buffer_limit());
+  t.RecordConsumption(0);  // Expect buffer limit to stay the same!
+  EXPECT_EQ(8, t.buffer_limit());
+}
+
+TEST(PrefetchAutotuner, EnabledSteady) {
+  PrefetchAutotuner t(PrefetchAutotuner::kAutoTune);
+  EXPECT_EQ(1, t.buffer_limit());
+  t.RecordConsumption(0);  // Expect buffer limit to increase.
+  EXPECT_EQ(1, t.buffer_limit());
+  t.RecordConsumption(1);
+  EXPECT_EQ(1, t.buffer_limit());
+  t.RecordConsumption(0);  // Expect buffer limit to increase.
+  EXPECT_EQ(2, t.buffer_limit());
+  t.RecordConsumption(2);
+  EXPECT_EQ(2, t.buffer_limit());
+  t.RecordConsumption(0);  // Expect buffer limit to increase.
+  EXPECT_EQ(4, t.buffer_limit());
+
+  // Never reach zero again.
+  std::vector<size_t> consumption_values = {2, 3, 1, 4, 1, 2, 3, 1};
+  for (int i = 0; i < consumption_values.size(); ++i) {
+    t.RecordConsumption(consumption_values[i]);
+    EXPECT_EQ(4, t.buffer_limit())
+        << "Failed at index " << i << " with value: " << consumption_values[i];
+  }
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/prefetch_dataset_op.cc b/tensorflow/core/kernels/data/prefetch_dataset_op.cc
index 1c548a30d2c8e7f33db85000d0f480b3151d6ecf..536de81fd891f1849cd285d6be4ddefb79fd3386 100644
--- a/tensorflow/core/kernels/data/prefetch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/prefetch_dataset_op.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/kernels/data/prefetch_autotuner.h"
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 
 namespace tensorflow {
@@ -37,7 +38,8 @@ class PrefetchDatasetOp : public UnaryDatasetOpKernel {
     int64 buffer_size;
     OP_REQUIRES_OK(
         ctx, ParseScalarArgument<int64>(ctx, "buffer_size", &buffer_size));
-    OP_REQUIRES(ctx, buffer_size > 0,
+    OP_REQUIRES(ctx,
+                buffer_size > 0 || buffer_size == PrefetchAutotuner::kAutoTune,
                 errors::InvalidArgument("buffer_size must be > 0"));
 
     *output = new Dataset(ctx, input, buffer_size);
@@ -85,7 +87,8 @@ class PrefetchDatasetOp : public UnaryDatasetOpKernel {
      public:
       explicit Iterator(const Params& params)
           : DatasetIterator<Dataset>(params),
-            input_impl_(params.dataset->input_->MakeIterator(params.prefix)) {}
+            input_impl_(params.dataset->input_->MakeIterator(params.prefix)),
+            auto_tuner_(params.dataset->buffer_size_) {}
 
       ~Iterator() override {
         // Signal the prefetch thread to terminate it. We will then
@@ -113,6 +116,7 @@ class PrefetchDatasetOp : public UnaryDatasetOpKernel {
           // Wait until the next element in the buffer has been
           // produced, or we are shutting down.
           while (!cancelled_ && !prefetch_thread_finished_ && buffer_.empty()) {
+            auto_tuner_.RecordEmpty();
             cond_var_.wait(l);
           }
 
@@ -129,6 +133,7 @@ class PrefetchDatasetOp : public UnaryDatasetOpKernel {
             if (s.ok()) {
               *out_tensors = std::move(buffer_.front().value);
             }
+            auto_tuner_.RecordConsumption(buffer_.size());
             buffer_.pop_front();
             *end_of_sequence = false;
 
@@ -242,7 +247,8 @@ class PrefetchDatasetOp : public UnaryDatasetOpKernel {
           // 1. Wait for a slot in the buffer.
           {
             mutex_lock l(mu_);
-            while (!cancelled_ && buffer_.size() == dataset()->buffer_size_) {
+            while (!cancelled_ &&
+                   buffer_.size() == auto_tuner_.buffer_limit()) {
               cond_var_.wait(l);
             }
 
@@ -323,6 +329,7 @@ class PrefetchDatasetOp : public UnaryDatasetOpKernel {
       mutex parent_mu_ ACQUIRED_BEFORE(mu_);
       const std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(parent_mu_);
       condition_variable cond_var_;
+      PrefetchAutotuner auto_tuner_ GUARDED_BY(mu_);
       std::deque<BufferElement> buffer_ GUARDED_BY(mu_);
       std::unique_ptr<Thread> prefetch_thread_ GUARDED_BY(mu_);
       bool cancelled_ GUARDED_BY(mu_) = false;
diff --git a/tensorflow/core/kernels/data/sql/BUILD b/tensorflow/core/kernels/data/sql/BUILD
index f4698bdaf7ae9767e068e49dad61d2a3d9f739a8..dc591208752c52d3f53484f5a1c564666727bb16 100644
--- a/tensorflow/core/kernels/data/sql/BUILD
+++ b/tensorflow/core/kernels/data/sql/BUILD
@@ -7,18 +7,6 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        include = ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 cc_library(
     name = "sql",
     srcs = [
diff --git a/tensorflow/core/kernels/data/stats_aggregator_ops.cc b/tensorflow/core/kernels/data/stats_aggregator_ops.cc
index 5a2dd9c43dbcbf5250d4dcd4bd803ed4979999e0..17103627e0749d14215bf28fec2489b110308526 100644
--- a/tensorflow/core/kernels/data/stats_aggregator_ops.cc
+++ b/tensorflow/core/kernels/data/stats_aggregator_ops.cc
@@ -47,7 +47,7 @@ class StatsAggregatorImpl : public StatsAggregator {
       Summary::Value* value = out_summary->add_value();
       value->set_tag(name);
       histogram.EncodeToProto(value->mutable_histo(),
-                              true /* preserve_zero_buckets */);
+                              false /* doesn't preserve zero buckets */);
     }
   }
 
diff --git a/tensorflow/core/kernels/data_format_ops.cc b/tensorflow/core/kernels/data_format_ops.cc
index bea3af98ebcec0915e9e8ac7f8e52eed597153e5..4485152e96ec680ef10f1ef15608e4bbb6d09952 100644
--- a/tensorflow/core/kernels/data_format_ops.cc
+++ b/tensorflow/core/kernels/data_format_ops.cc
@@ -28,15 +28,6 @@ namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 
-namespace {
-inline functor::DataFormat FormatNameToEnum(const string& name) {
-  if (name == "NHWC") return functor::DataFormat::NHWC;
-  if (name == "NCHW") return functor::DataFormat::NCHW;
-  if (name == "HWNC") return functor::DataFormat::HWNC;
-  return functor::DataFormat::UNKNOWN;
-}
-}  // namespace
-
 template <typename Device, typename T>
 class DataFormatDimMapOp : public OpKernel {
  public:
@@ -46,25 +37,37 @@ class DataFormatDimMapOp : public OpKernel {
     OP_REQUIRES_OK(context, context->GetAttr("src_format", &src_format));
     string dst_format;
     OP_REQUIRES_OK(context, context->GetAttr("dst_format", &dst_format));
+    OP_REQUIRES(context, src_format.size() == 4,
+                errors::InvalidArgument(strings::StrCat(
+                    "Source format must of length 4, received src_format = ",
+                    src_format)));
     OP_REQUIRES(
-        context, src_format == "NHWC",
+        context, dst_format.size() == 4,
         errors::InvalidArgument(strings::StrCat(
-            "Current implementation doesn't support source data format ",
-            src_format)));
-    OP_REQUIRES(context, dst_format == "NCHW",
-                errors::InvalidArgument(strings::StrCat(
-                    "Current implementation doesn't support dst data format ",
-                    dst_format)));
+            "Destination format must of length 4, received dst_format = ",
+            dst_format)));
+    dst_idx_ = Tensor(DT_INT32, {static_cast<int64>(src_format.size())});
+    for (int i = 0; i < src_format.size(); ++i) {
+      for (int j = 0; j < dst_format.size(); ++j) {
+        if (dst_format[j] == src_format[i]) {
+          dst_idx_.vec<int>()(i) = j;
+          break;
+        }
+      }
+    }
   }
 
   void Compute(OpKernelContext* context) override {
     const Tensor& input = context->input(0);
-    Tensor* output = nullptr;
+    Tensor* output;
     OP_REQUIRES_OK(context,
                    context->allocate_output(0, input.shape(), &output));
     functor::DataFormatDimMap<Device, T>()(context->eigen_device<Device>(),
-                                           input.flat<T>(), output->flat<T>());
+                                           input.flat<T>(), output->flat<T>(),
+                                           dst_idx_.vec<int>());
   }
+
+  Tensor dst_idx_;
 };
 
 template <typename Device, typename T>
@@ -76,17 +79,8 @@ class DataFormatVecPermuteOp : public OpKernel {
     OP_REQUIRES_OK(context, context->GetAttr("src_format", &src_format));
     string dst_format;
     OP_REQUIRES_OK(context, context->GetAttr("dst_format", &dst_format));
-    OP_REQUIRES(context,
-                (src_format == "NHWC" && dst_format == "NCHW") ||
-                    (src_format == "NCHW" && dst_format == "NHWC") ||
-                    (src_format == "NHWC" && dst_format == "HWNC") ||
-                    (src_format == "HWNC" && dst_format == "NHWC"),
-                errors::InvalidArgument(strings::StrCat(
-                    "Current implementation only supports NHWC<->NCHW and "
-                    "NHWC<->HWNC conversion; got source format ",
-                    src_format, " and destination format ", dst_format)));
-    src_format_ = FormatNameToEnum(src_format);
-    dst_format_ = FormatNameToEnum(dst_format);
+    src_format_ = src_format;
+    dst_format_ = dst_format;
   }
 
   void Compute(OpKernelContext* context) override {
@@ -116,14 +110,34 @@ class DataFormatVecPermuteOp : public OpKernel {
     Tensor* output = nullptr;
     OP_REQUIRES_OK(context,
                    context->allocate_output(0, input.shape(), &output));
-    functor::DataFormatVecPermute<Device, T>()(
-        context->eigen_device<Device>(), input.flat<T>(), output->flat<T>(),
-        src_format_, dst_format_);
+    // Support 1D and 2D cases.
+    Eigen::DSizes<Eigen::DenseIndex, 8> dst_idx;
+    ComputeDstIndex(input.dims(), &dst_idx);
+
+    functor::DataFormatVecPermute<Device, T>()(context->eigen_device<Device>(),
+                                               input.flat<T>(),
+                                               output->flat<T>(), dst_idx);
   }
 
  private:
-  functor::DataFormat src_format_;
-  functor::DataFormat dst_format_;
+  // Finds out the destination index. Support 1D and 2D cases.
+  // Example: HWNC --> NHWC
+  // 1D: dst = [1, 2, 0, 3],
+  // 2D: dst = [2, 3, 4, 5, 0, 1, 6, 7]
+  void ComputeDstIndex(int num_dim, Eigen::DSizes<Eigen::DenseIndex, 8>* dst) {
+    for (int i = 0; i < src_format_.size(); ++i) {
+      for (int j = 0; j < dst_format_.size(); ++j) {
+        if (dst_format_[j] != src_format_[i]) continue;
+        // Found the dst index. Set output based on the number of dims.
+        for (int k = 0; k < num_dim; ++k) {
+          (*dst)[i * num_dim + k] = j * num_dim + k;
+        }
+      }
+    }
+  }
+
+  string src_format_;
+  string dst_format_;
 };
 
 #define REGISTER_KERNEL(T)                                                \
@@ -145,23 +159,23 @@ TF_CALL_int64(REGISTER_KERNEL);
 #if GOOGLE_CUDA
 // Forward declarations of the functor specializations for GPU.
 namespace functor {
-#define DECLARE_GPU_SPEC(T)                                \
-  template <>                                              \
-  void DataFormatDimMap<GPUDevice, T>::operator()(         \
-      const GPUDevice& d, typename TTypes<T>::ConstFlat x, \
-      typename TTypes<T>::Flat y);                         \
+#define DECLARE_GPU_SPEC(T)                                    \
+  template <>                                                  \
+  void DataFormatDimMap<GPUDevice, T>::operator()(             \
+      const GPUDevice& d, typename TTypes<T>::ConstFlat x,     \
+      typename TTypes<T>::Flat y, const TTypes<int>::Vec dst); \
   extern template struct DataFormatDimMap<GPUDevice, T>;
 #define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPEC(T);
 TF_CALL_int32(DECLARE_GPU_SPECS);
 TF_CALL_int64(DECLARE_GPU_SPECS);
 #undef DECLARE_GPU_SPEC
 
-#define DECLARE_GPU_SPEC(T)                                   \
-  template <>                                                 \
-  void DataFormatVecPermute<GPUDevice, T>::operator()(        \
-      const GPUDevice& d, typename TTypes<T>::ConstFlat x,    \
-      typename TTypes<T>::Vec y, const DataFormat src_format, \
-      const DataFormat dst_format);                           \
+#define DECLARE_GPU_SPEC(T)                                \
+  template <>                                              \
+  void DataFormatVecPermute<GPUDevice, T>::operator()(     \
+      const GPUDevice& d, typename TTypes<T>::ConstFlat x, \
+      typename TTypes<T>::Vec y,                           \
+      const Eigen::DSizes<Eigen::DenseIndex, 8>& dst_idx); \
   extern template struct DataFormatVecPermute<GPUDevice, T>;
 #define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPEC(T);
 TF_CALL_int32(DECLARE_GPU_SPECS);
diff --git a/tensorflow/core/kernels/data_format_ops.h b/tensorflow/core/kernels/data_format_ops.h
index d27415ed918f202c44d9689de5aeda9cd4359450..1ca144cb400ff828d334495b57572b67f60e28ef 100644
--- a/tensorflow/core/kernels/data_format_ops.h
+++ b/tensorflow/core/kernels/data_format_ops.h
@@ -23,89 +23,35 @@ limitations under the License.
 namespace tensorflow {
 namespace functor {
 
-enum class DataFormat {
-  UNKNOWN = 0,
-  NHWC,
-  NCHW,
-  HWNC,
-};
-
 // Functor used by DataFormatDimMapOP to do the computations.
 template <typename Device, typename T>
 struct DataFormatDimMap {
   void operator()(const Device& d, typename TTypes<T>::ConstFlat x,
-                  typename TTypes<T>::Flat y) {
+                  typename TTypes<T>::Flat y, const TTypes<int>::Vec dst) {
     auto zero = x.constant(0);
     auto one = x.constant(1);
-    auto three = x.constant(3);
+    auto two = x.constant(2);
+
+    auto f_zero = x.constant(dst(0));
+    auto f_one = x.constant(dst(1));
+    auto f_two = x.constant(dst(2));
+    auto f_three = x.constant(dst(3));
+
     auto four = x.constant(4);
     auto x_mod = (x + four) % 4;
-    auto is_zero = (x_mod == zero);
-    auto is_three = (x_mod == three);
-    y.device(d) = is_zero.select(zero, is_three.select(one, x_mod + one));
-  }
-};
 
-template <typename T>
-struct VecPermuteNHWCToNCHW {
-  Eigen::DSizes<Eigen::DenseIndex, 1> dimensions(
-      typename TTypes<T>::ConstFlat input) const {
-    Eigen::DSizes<Eigen::DenseIndex, 1> result;
-    result[0] = input.dimension(0);
-    return result;
-  }
-  template <typename Output, typename Device>
-  void eval(typename TTypes<T>::ConstFlat input, Output& output,
-            const Device& d) const {
-    if (input.size() == 8) {
-      output.template chip<0>(0).device(d) = input.template chip<0>(0);
-      output.template chip<0>(1).device(d) = input.template chip<0>(1);
-      output.template chip<0>(2).device(d) = input.template chip<0>(6);
-      output.template chip<0>(3).device(d) = input.template chip<0>(7);
-      output.template chip<0>(4).device(d) = input.template chip<0>(2);
-      output.template chip<0>(5).device(d) = input.template chip<0>(3);
-      output.template chip<0>(6).device(d) = input.template chip<0>(4);
-      output.template chip<0>(7).device(d) = input.template chip<0>(5);
-    } else {
-      output.template chip<0>(0).device(d) = input.template chip<0>(0);
-      output.template chip<0>(1).device(d) = input.template chip<0>(3);
-      output.template chip<0>(2).device(d) = input.template chip<0>(1);
-      output.template chip<0>(3).device(d) = input.template chip<0>(2);
-    }
-  }
-};
+    auto is_zero = (x_mod == zero);
+    auto is_one = (x_mod == one);
+    auto is_two = (x_mod == two);
 
-template <typename T>
-struct VecPermuteNCHWToNHWC {
-  Eigen::DSizes<Eigen::DenseIndex, 1> dimensions(
-      typename TTypes<T>::ConstFlat input) const {
-    Eigen::DSizes<Eigen::DenseIndex, 1> result;
-    result[0] = input.dimension(0);
-    return result;
-  }
-  template <typename Output, typename Device>
-  void eval(typename TTypes<T>::ConstFlat input, Output& output,
-            const Device& d) const {
-    if (input.size() == 8) {
-      output.template chip<0>(0).device(d) = input.template chip<0>(0);
-      output.template chip<0>(1).device(d) = input.template chip<0>(1);
-      output.template chip<0>(2).device(d) = input.template chip<0>(4);
-      output.template chip<0>(3).device(d) = input.template chip<0>(5);
-      output.template chip<0>(4).device(d) = input.template chip<0>(6);
-      output.template chip<0>(5).device(d) = input.template chip<0>(7);
-      output.template chip<0>(6).device(d) = input.template chip<0>(2);
-      output.template chip<0>(7).device(d) = input.template chip<0>(3);
-    } else {
-      output.template chip<0>(0).device(d) = input.template chip<0>(0);
-      output.template chip<0>(1).device(d) = input.template chip<0>(2);
-      output.template chip<0>(2).device(d) = input.template chip<0>(3);
-      output.template chip<0>(3).device(d) = input.template chip<0>(1);
-    }
+    y.device(d) = is_zero.select(
+        f_zero, is_one.select(f_one, is_two.select(f_two, f_three)));
   }
 };
 
 template <typename T>
-struct VecPermuteNHWCToHWNC {
+struct VecPermute {
+  VecPermute(const Eigen::DSizes<Eigen::DenseIndex, 8>& dst) : dst_(dst) {}
   Eigen::DSizes<Eigen::DenseIndex, 1> dimensions(
       typename TTypes<T>::ConstFlat input) const {
     Eigen::DSizes<Eigen::DenseIndex, 1> result;
@@ -115,71 +61,22 @@ struct VecPermuteNHWCToHWNC {
   template <typename Output, typename Device>
   void eval(typename TTypes<T>::ConstFlat input, Output& output,
             const Device& d) const {
-    if (input.size() == 8) {
-      output.template chip<0>(0).device(d) = input.template chip<0>(2);
-      output.template chip<0>(1).device(d) = input.template chip<0>(3);
-      output.template chip<0>(2).device(d) = input.template chip<0>(4);
-      output.template chip<0>(3).device(d) = input.template chip<0>(5);
-      output.template chip<0>(4).device(d) = input.template chip<0>(0);
-      output.template chip<0>(5).device(d) = input.template chip<0>(1);
-      output.template chip<0>(6).device(d) = input.template chip<0>(6);
-      output.template chip<0>(7).device(d) = input.template chip<0>(7);
-    } else {
-      output.template chip<0>(0).device(d) = input.template chip<0>(1);
-      output.template chip<0>(1).device(d) = input.template chip<0>(2);
-      output.template chip<0>(2).device(d) = input.template chip<0>(0);
-      output.template chip<0>(3).device(d) = input.template chip<0>(3);
+    for (int i = 0; i < input.size(); ++i) {
+      output.template chip<0>(dst_[i]).device(d) = input.template chip<0>(i);
     }
   }
-};
 
-template <typename T>
-struct VecPermuteHWNCToNHWC {
-  Eigen::DSizes<Eigen::DenseIndex, 1> dimensions(
-      typename TTypes<T>::ConstFlat input) const {
-    Eigen::DSizes<Eigen::DenseIndex, 1> result;
-    result[0] = input.dimension(0);
-    return result;
-  }
-  template <typename Output, typename Device>
-  void eval(typename TTypes<T>::ConstFlat input, Output& output,
-            const Device& d) const {
-    if (input.size() == 8) {
-      output.template chip<0>(0).device(d) = input.template chip<0>(4);
-      output.template chip<0>(1).device(d) = input.template chip<0>(5);
-      output.template chip<0>(2).device(d) = input.template chip<0>(0);
-      output.template chip<0>(3).device(d) = input.template chip<0>(1);
-      output.template chip<0>(4).device(d) = input.template chip<0>(2);
-      output.template chip<0>(5).device(d) = input.template chip<0>(3);
-      output.template chip<0>(6).device(d) = input.template chip<0>(6);
-      output.template chip<0>(7).device(d) = input.template chip<0>(7);
-    } else {
-      output.template chip<0>(0).device(d) = input.template chip<0>(2);
-      output.template chip<0>(1).device(d) = input.template chip<0>(0);
-      output.template chip<0>(2).device(d) = input.template chip<0>(1);
-      output.template chip<0>(3).device(d) = input.template chip<0>(3);
-    }
-  }
+ private:
+  Eigen::DSizes<Eigen::DenseIndex, 8> dst_;
 };
 
 // Functor used by DataFormatVecPermuteOp to do the computations.
 template <typename Device, typename T>
 struct DataFormatVecPermute {
   void operator()(const Device& d, typename TTypes<T>::ConstFlat x,
-                  typename TTypes<T>::Flat y, const DataFormat src_format,
-                  const DataFormat dst_format) {
-    if (src_format == DataFormat::NHWC && dst_format == DataFormat::NCHW) {
-      y.device(d) = x.customOp(VecPermuteNHWCToNCHW<T>());
-    } else if (src_format == DataFormat::NCHW &&
-               dst_format == DataFormat::NHWC) {
-      y.device(d) = x.customOp(VecPermuteNCHWToNHWC<T>());
-    } else if (src_format == DataFormat::NHWC &&
-               dst_format == DataFormat::HWNC) {
-      y.device(d) = x.customOp(VecPermuteNHWCToHWNC<T>());
-    } else if (src_format == DataFormat::HWNC &&
-               dst_format == DataFormat::NHWC) {
-      y.device(d) = x.customOp(VecPermuteHWNCToNHWC<T>());
-    }
+                  typename TTypes<T>::Flat y,
+                  const Eigen::DSizes<Eigen::DenseIndex, 8>& dst) {
+    y.device(d) = x.customOp(VecPermute<T>(dst));
   }
 };
 
diff --git a/tensorflow/core/kernels/depthwise_conv_op.cc b/tensorflow/core/kernels/depthwise_conv_op.cc
index c060b2e14d2f03f990af5267260bd88fa01a2c81..6dedb1a61ef47ccc1fa902e7f69ea21db3392f39 100644
--- a/tensorflow/core/kernels/depthwise_conv_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op.cc
@@ -241,7 +241,7 @@ struct LaunchDepthwiseConvOp<CPUDevice, T> {
 };
 
 // Extern template instantiated in conv_ops.cc.
-extern template class LaunchConv2DOp<CPUDevice, float>;
+extern template struct LaunchConv2DOp<CPUDevice, float>;
 
 #if GOOGLE_CUDA
 
@@ -251,7 +251,7 @@ extern template struct LaunchDepthwiseConvOp<GPUDevice, float>;
 extern template struct LaunchDepthwiseConvOp<GPUDevice, double>;
 
 // Extern template instantiated in conv_ops.cc.
-extern template class LaunchConv2DOp<GPUDevice, float>;
+extern template struct LaunchConv2DOp<GPUDevice, float>;
 
 #endif
 
diff --git a/tensorflow/core/kernels/eigen_pooling.h b/tensorflow/core/kernels/eigen_pooling.h
index 896c9957616037da4ead2dbda8cb2393eaea226f..2f83780525090c90a0a9cfa3268115daa6fbc89b 100644
--- a/tensorflow/core/kernels/eigen_pooling.h
+++ b/tensorflow/core/kernels/eigen_pooling.h
@@ -334,7 +334,8 @@ struct AvgPoolMeanReducer {
   }
 
   template <typename Packet>
-  void reducePacketWithType(T, const Packet& p, Packet* accum) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacketWithType(
+      T, const Packet& p, Packet* accum) {
     Packet skip_mask =
         pequal(p, pset1<Packet>(-Eigen::NumTraits<T>::highest()));
     (*accum) = padd<Packet>(*accum, psel(p, pset1<Packet>(0), skip_mask));
@@ -480,11 +481,9 @@ SpatialAvgPooling(const Input& input, DenseIndex patchRows,
                              Eigen::type2index<3> > >::type reduction_dims;
 #endif
   return input
-      .extract_image_patches(
-          patchRows, patchCols, strideRows, strideCols, in_strideRows,
-          in_strideCols, padding_type,
-          -Eigen::NumTraits<typename internal::remove_const<
-              typename internal::traits<Input>::Scalar>::type>::highest())
+      .extract_image_patches(patchRows, patchCols, strideRows, strideCols,
+                             in_strideRows, in_strideCols, padding_type,
+                             -Eigen::NumTraits<CoeffReturnType>::highest())
       .reduce(reduction_dims, mean_with_nan)
       .reshape(post_reduce_dims);
 }
diff --git a/tensorflow/core/kernels/eigen_spatial_convolutions.h b/tensorflow/core/kernels/eigen_spatial_convolutions.h
index 1acbe3a658070222e99ff874815db9a6b07d4565..a4dff4b91c5c7a991b432f113cae2e29ecdcab31 100644
--- a/tensorflow/core/kernels/eigen_spatial_convolutions.h
+++ b/tensorflow/core/kernels/eigen_spatial_convolutions.h
@@ -797,6 +797,188 @@ struct gemm_pack_rhs<
   }
 };
 
+// Template specialization for packet_size = 2. We must special-case packet
+// blocks with nr > packet_size, e.g. PacketBlock<Packet2d, 4>.
+template <typename NewDimension, DenseIndex Rows, DenseIndex Cols,
+          typename ArgType, typename Device, typename Scalar, typename Index,
+          typename nocontract_t, typename contract_t, bool inner_dim_contiguous,
+          bool inner_dim_reordered, int Alignment, int nr>
+struct gemm_pack_rhs<
+    Scalar, Index,
+    TensorContractionSubMapper<
+        Scalar, Index, Rhs,
+        TensorEvaluator<
+            const TensorReshapingOp<
+                NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
+            Device>,
+        nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered,
+        Alignment>,
+    nr, ColMajor, false, false> {
+  typedef TensorContractionSubMapper<
+      Scalar, Index, Rhs,
+      TensorEvaluator<
+          const TensorReshapingOp<
+              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
+          Device>,
+      nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered,
+      Alignment>
+      SubMapper;
+  typedef SubMapper DataMapper;
+
+  EIGEN_DEVICE_FUNC
+  static inline Index ceil_div(Index a, Index b) { return (a + b - 1) / b; }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_DONT_INLINE void operator()(Scalar* block, const DataMapper& rhs,
+                                    Index depth, Index cols, Index stride = 0,
+                                    Index offset = 0) const {
+    eigen_assert(stride == 0);
+    eigen_assert(offset == 0);
+
+    EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    typedef typename packet_traits<Scalar>::type Packet;
+
+    const int packet_size = 2;
+    const Index packet_cols4 = (cols / 4) * 4;
+    const Index peeled_k = (depth / packet_size) * packet_size;
+    const bool non_standard_patches = rhs.nonStandardPatches();
+
+    for (Index j2 = 0; j2 < packet_cols4; j2 += 4) {
+      const SubMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
+      const SubMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
+      const SubMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
+      const SubMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
+
+      Index k = 0;
+      if (!non_standard_patches) {
+        const Index patch_depth = rhs.patchDepth();
+        if ((patch_depth % packet_size) == 0) {
+          const Index patch_cols = rhs.patchCols();
+          const Index patch_rows = rhs.patchRows();
+
+          const Index startCol = rhs.colOffset();
+          const Index max_cols = std::min<Index>(
+              ceil_div(peeled_k, patch_rows * patch_depth) + startCol,
+              patch_cols);
+
+          for (Index c = startCol; c < max_cols; ++c) {
+            eigen_assert(k < peeled_k);
+            const Index startRow = (c == startCol) ? rhs.rowOffset() : 0;
+            const Index max_rows = std::min<Index>(
+                ceil_div(peeled_k - c * patch_rows * patch_depth, patch_depth) +
+                    startRow,
+                patch_rows);
+
+            const bool pad_col0 = dm0.padCol(c);
+            const bool pad_col1 = dm1.padCol(c);
+            const bool pad_col2 = dm2.padCol(c);
+            const bool pad_col3 = dm3.padCol(c);
+            for (Index r = startRow; r < max_rows; ++r) {
+              eigen_assert(k < peeled_k);
+              const bool pad0 = pad_col0 || dm0.padRow(r);
+              const bool pad1 = pad_col1 || dm1.padRow(r);
+              const bool pad2 = pad_col2 || dm2.padRow(r);
+              const bool pad3 = pad_col3 || dm3.padRow(r);
+
+              const Index idx0 = dm0.baseIndex(r, c);
+              const Index idx1 = dm1.baseIndex(r, c);
+              const Index idx2 = dm2.baseIndex(r, c);
+              const Index idx3 = dm3.baseIndex(r, c);
+
+              const Index startDepth =
+                  ((c == startCol) && (r == startRow)) ? rhs.depthOffset() : 0;
+              const Index max_depth =
+                  std::min<Index>(peeled_k - c * patch_rows * patch_depth -
+                                      r * patch_depth + startDepth,
+                                  patch_depth);
+              eigen_assert((max_depth - startDepth) % packet_size == 0);
+              for (Index d = startDepth; d < max_depth; d += packet_size) {
+                eigen_assert(k < peeled_k);
+                PacketBlock<Packet, 2> kernel0;
+                PacketBlock<Packet, 2> kernel1;
+                kernel0.packet[0] = pad0 ? pset1<Packet>(Scalar(0))
+                                         : rhs.packetNoPadding(d, idx0);
+                kernel0.packet[1] = pad1 ? pset1<Packet>(Scalar(0))
+                                         : rhs.packetNoPadding(d, idx1);
+                kernel1.packet[0] = pad2 ? pset1<Packet>(Scalar(0))
+                                         : rhs.packetNoPadding(d, idx2);
+                kernel1.packet[1] = pad3 ? pset1<Packet>(Scalar(0))
+                                         : rhs.packetNoPadding(d, idx3);
+                ptranspose(kernel0);
+                ptranspose(kernel1);
+                pstoreu(block + 0 * packet_size, kernel0.packet[0]);
+                pstoreu(block + 1 * packet_size, kernel1.packet[0]);
+                pstoreu(block + 2 * packet_size, kernel0.packet[1]);
+                pstoreu(block + 3 * packet_size, kernel1.packet[1]);
+                block += 4 * packet_size;
+                k += packet_size;
+              }
+            }
+          }
+
+          for (; k < peeled_k; k += packet_size) {
+            PacketBlock<Packet, 2> kernel0;
+            PacketBlock<Packet, 2> kernel1;
+            kernel0.packet[0] = dm0.loadPacketFast(k);
+            kernel0.packet[1] = dm1.loadPacketFast(k);
+            kernel1.packet[0] = dm2.loadPacketFast(k);
+            kernel1.packet[1] = dm3.loadPacketFast(k);
+            ptranspose(kernel0);
+            ptranspose(kernel1);
+            pstoreu(block + 0 * packet_size, kernel0.packet[0]);
+            pstoreu(block + 1 * packet_size, kernel1.packet[0]);
+            pstoreu(block + 2 * packet_size, kernel0.packet[1]);
+            pstoreu(block + 3 * packet_size, kernel1.packet[1]);
+            block += 4 * packet_size;
+          }
+        } else {
+          for (; k < peeled_k; k += packet_size) {
+            PacketBlock<Packet, 2> kernel0;
+            PacketBlock<Packet, 2> kernel1;
+            kernel0.packet[0] = dm0.loadPacketStandard(k);
+            kernel0.packet[1] = dm1.loadPacketStandard(k);
+            kernel1.packet[0] = dm2.loadPacketStandard(k);
+            kernel1.packet[1] = dm3.loadPacketStandard(k);
+            ptranspose(kernel0);
+            ptranspose(kernel1);
+            pstoreu(block + 0 * packet_size, kernel0.packet[0]);
+            pstoreu(block + 1 * packet_size, kernel1.packet[0]);
+            pstoreu(block + 2 * packet_size, kernel0.packet[1]);
+            pstoreu(block + 3 * packet_size, kernel1.packet[1]);
+            block += 4 * packet_size;
+          }
+        }
+      }
+      if (!rhs.nonStandardPatches()) {
+        for (; k < depth; k++) {
+          block[0] = dm0.loadCoeffStandard(k);
+          block[1] = dm1.loadCoeffStandard(k);
+          block[2] = dm2.loadCoeffStandard(k);
+          block[3] = dm3.loadCoeffStandard(k);
+          block += 4;
+        }
+      } else {
+        for (; k < depth; k++) {
+          block[0] = dm0(k);
+          block[1] = dm1(k);
+          block[2] = dm2(k);
+          block[3] = dm3(k);
+          block += 4;
+        }
+      }
+    }
+
+    // copy the remaining columns one at a time (nr==1)
+    for (Index j2 = packet_cols4; j2 < cols; ++j2) {
+      const SubMapper dm0 = rhs.getLinearMapper(0, j2);
+      for (Index k = 0; k < depth; k++) {
+        *block = dm0(k);
+        block += 1;
+      }
+    }
+  }
+};
+
 // Special case for non-vectorized types such as float16.
 template <typename NewDimension, DenseIndex Rows, DenseIndex Cols,
           typename ArgType, typename Device, typename Scalar, typename Index,
diff --git a/tensorflow/core/kernels/function_ops.cc b/tensorflow/core/kernels/function_ops.cc
index 351aad72135da9c11dcef7ce4ff19cd158a50a1b..f8e0267578054b2cb8bb5371545bf8547ad38c22 100644
--- a/tensorflow/core/kernels/function_ops.cc
+++ b/tensorflow/core/kernels/function_ops.cc
@@ -144,6 +144,11 @@ TF_CALL_bool(REGISTER) REGISTER_KERNEL_BUILDER(Name(kRetOp)
                                                    .HostMemory("input")
                                                    .TypeConstraint<int32>("T"),
                                                RetvalOp);
+REGISTER_KERNEL_BUILDER(Name(kRetOp)
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<ResourceHandle>("T")
+                            .HostMemory("input"),
+                        RetvalOp);
 #undef REGISTER
 
 class PassOn : public OpKernel {
diff --git a/tensorflow/core/kernels/functional_ops.cc b/tensorflow/core/kernels/functional_ops.cc
index b687088db16a31d8ecb74a7a483c35d2c65a74f9..911aa3a78fff2f6f7272e7408388e6625df52037 100644
--- a/tensorflow/core/kernels/functional_ops.cc
+++ b/tensorflow/core/kernels/functional_ops.cc
@@ -1,4 +1,4 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-
 #define EIGEN_USE_THREADS
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
@@ -21,10 +20,12 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/platform/mutex.h"
 
-namespace tensorflow {
+#if GOOGLE_CUDA
+#include "tensorflow/stream_executor/stream.h"
+#endif  // GOOGLE_CUDA
 
+namespace tensorflow {
 typedef Eigen::GpuDevice GPUDevice;
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef FunctionLibraryRuntime::Handle FHandle;
@@ -106,11 +107,9 @@ void SetRunOptions(OpKernelContext* ctx, FunctionLibraryRuntime::Options* opts,
   opts->runner = ctx->runner();
 }
 
-}  // end namespace
-
-class FunctionalIf : public AsyncOpKernel {
+class IfOp : public AsyncOpKernel {
  public:
-  explicit FunctionalIf(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
+  explicit IfOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
     auto lib = ctx->function_library();
     OP_REQUIRES(ctx, lib != nullptr, errors::Internal("No function library"));
     const NameAttrList* func;
@@ -120,7 +119,7 @@ class FunctionalIf : public AsyncOpKernel {
     OP_REQUIRES_OK(ctx, Instantiate(lib, *func, &else_handle_));
   }
 
-  ~FunctionalIf() override {}
+  ~IfOp() override {}
 
   void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
     bool cond;
@@ -134,8 +133,7 @@ class FunctionalIf : public AsyncOpKernel {
 
   class State {
    public:
-    State(FunctionalIf* kernel, OpKernelContext* ctx, bool cond,
-          DoneCallback done)
+    State(IfOp* kernel, OpKernelContext* ctx, bool cond, DoneCallback done)
         : kernel_(kernel),
           ctx_(ctx),
           cond_(cond),
@@ -168,7 +166,7 @@ class FunctionalIf : public AsyncOpKernel {
     }
 
    private:
-    FunctionalIf* const kernel_;
+    IfOp* const kernel_;
     OpKernelContext* const ctx_;
     const bool cond_;
     const DoneCallback done_;
@@ -179,18 +177,22 @@ class FunctionalIf : public AsyncOpKernel {
   };
 };
 
-REGISTER_KERNEL_BUILDER(Name("_If").Device(DEVICE_CPU), FunctionalIf);
+// TODO(drpng): remove this.
+REGISTER_KERNEL_BUILDER(Name("_If").Device(DEVICE_CPU), IfOp);
 REGISTER_KERNEL_BUILDER(Name("_If").Device(DEVICE_GPU).HostMemory("cond"),
-                        FunctionalIf);
+                        IfOp);
+
+REGISTER_KERNEL_BUILDER(Name("If").Device(DEVICE_CPU), IfOp);
+REGISTER_KERNEL_BUILDER(Name("If").Device(DEVICE_GPU).HostMemory("cond"), IfOp);
 
-class FunctionalWhile : public AsyncOpKernel {
+class WhileOp : public AsyncOpKernel {
  public:
-  explicit FunctionalWhile(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
+  explicit WhileOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("cond", &cond_func_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("body", &body_func_));
   }
 
-  ~FunctionalWhile() override {}
+  ~WhileOp() override {}
 
   void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
     auto lib = ctx->function_library();
@@ -234,7 +236,7 @@ class FunctionalWhile : public AsyncOpKernel {
 
   class State {
    public:
-    State(FunctionalWhile* kernel, OpKernelContext* ctx, FHandle cond_handle,
+    State(WhileOp* kernel, OpKernelContext* ctx, FHandle cond_handle,
           FHandle body_handle, DoneCallback done)
         : kernel_(kernel),
           ctx_(ctx),
@@ -253,7 +255,7 @@ class FunctionalWhile : public AsyncOpKernel {
     void Start() { EvalCond(); }
 
    private:
-    FunctionalWhile* const kernel_;
+    WhileOp* const kernel_;
     OpKernelContext* const ctx_;
     const FHandle cond_handle_;
     const FHandle body_handle_;
@@ -316,7 +318,152 @@ class FunctionalWhile : public AsyncOpKernel {
     }
   };
 };
-REGISTER_KERNEL_BUILDER(Name("_While").Device(DEVICE_CPU), FunctionalWhile);
-REGISTER_KERNEL_BUILDER(Name("_While").Device(DEVICE_GPU), FunctionalWhile);
+// TODO(drpng): remove these.
+REGISTER_KERNEL_BUILDER(Name("_While").Device(DEVICE_CPU), WhileOp);
+REGISTER_KERNEL_BUILDER(Name("_While").Device(DEVICE_GPU), WhileOp);
+
+REGISTER_KERNEL_BUILDER(Name("While").Device(DEVICE_CPU), WhileOp);
+REGISTER_KERNEL_BUILDER(Name("While").Device(DEVICE_GPU), WhileOp);
+
+Status GetScalar(OpKernelContext* ctx, int index, int32* value,
+                 const char* label) {
+  Tensor t = ctx->input(index);
+  if (!TensorShapeUtils::IsScalar(t.shape())) {
+    return errors::InvalidArgument(label, " must be a scalar, but ",
+                                   t.shape().DebugString());
+  }
+  *value = t.scalar<int32>()();
+  return Status::OK();
+}
+
+class ForOp : public AsyncOpKernel {
+ public:
+  explicit ForOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
+    auto lib = ctx->function_library();
+    OP_REQUIRES(ctx, lib != nullptr, errors::Internal("No function library"));
+    const NameAttrList* func;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("body", &func));
+    OP_REQUIRES_OK(ctx, Instantiate(lib, *func, &body_handle_));
+  }
+
+  ~ForOp() override {}
+
+  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
+    (new State(this, ctx, done))->Start();
+  }
+
+ private:
+  FHandle body_handle_;
+
+  class State {
+   public:
+    State(ForOp* kernel, OpKernelContext* ctx, DoneCallback done)
+        : kernel_(kernel),
+          ctx_(ctx),
+          done_(std::move(done)),
+          lib_(CHECK_NOTNULL(ctx_->function_library())),
+          args_(1 + ctx_->num_inputs() - 3) {
+      args_[0] = Tensor(DT_INT32, {});
+      iter_ = &args_[0].scalar<int32>()();
+
+      const int32 num_loop_inputs = ctx_->num_inputs() - 3;
+      rets_.reserve(num_loop_inputs);
+      for (int i = 0; i < num_loop_inputs; ++i) {
+        rets_.push_back(ctx_->input(3 + i));
+      }
+    }
+
+    ~State() {}
+
+    void Start() {
+      Status s = StartLoop();
+      if (!s.ok()) Finish(s);
+    }
+
+   private:
+    ForOp* const kernel_;
+    OpKernelContext* const ctx_;
+    const DoneCallback done_;
+    FunctionLibraryRuntime* const lib_;
+    FunctionLibraryRuntime::Options opts_;
+    TensorVec args_;
+    TensorVec rets_;
+
+    int32* iter_;  // points to args_[0].
+    int32 limit_;
+    int32 delta_;
+
+    // If an error e is returned, caller must call Finish(e).
+    // If OK is returned, the async loop execution has been started.
+    Status StartLoop() {
+      SetRunOptions(ctx_, &opts_, false /* always_collect_stats */);
+
+      TF_RETURN_IF_ERROR(GetScalar(ctx_, 0, iter_, "start"));
+      TF_RETURN_IF_ERROR(GetScalar(ctx_, 1, &limit_, "limit"));
+      TF_RETURN_IF_ERROR(GetScalar(ctx_, 2, &delta_, "delta"));
+
+      if ((delta_ > 0 && *iter_ <= limit_) ||
+          (delta_ < 0 && *iter_ >= limit_) ||
+          (delta_ == 0 && *iter_ == limit_)) {
+        RunNext();
+        return Status::OK();
+      } else {
+        return errors::InvalidArgument("Invalid start/limit/delta: ", *iter_,
+                                       " ", limit_, " ", delta_);
+      }
+    }
+
+    void RunNext() {
+      bool done_loop;
+      if (delta_ > 0) {
+        done_loop = *iter_ >= limit_;
+      } else {
+        done_loop = *iter_ <= limit_;
+      }
+      if (done_loop) {
+        Finish(Status::OK());
+        return;
+      }
+
+      if (rets_.size() >= args_.size()) {
+        Finish(errors::InvalidArgument(
+            "For loop body returned ", rets_.size(),
+            " arguments. Expected: ", args_.size() - 1));
+        return;
+      }
+      for (int i = 0; i < rets_.size(); ++i) {
+        args_[1 + i] = std::move(rets_[i]);
+      }
+      rets_.clear();
+      lib_->Run(opts_, kernel_->body_handle_, args_, &rets_,
+                [this](const Status& s) {
+                  if (s.ok()) {
+                    *iter_ += delta_;
+                    RunNext();
+                  } else {
+                    Finish(s);
+                  }
+                });
+    }
+
+    void Finish(Status s) {
+      if (s.ok()) {
+        s = SetOutputs(kernel_, ctx_, rets_);
+      }
+      ctx_->SetStatus(s);
+      done_();
+      delete this;
+    }
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("For").Device(DEVICE_CPU), ForOp);
+REGISTER_KERNEL_BUILDER(Name("For")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("start")
+                            .HostMemory("limit")
+                            .HostMemory("delta"),
+                        ForOp);
 
+}  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/fuzzing/BUILD b/tensorflow/core/kernels/fuzzing/BUILD
index 9a7eca03ce276d26321f01f80ad7f1a0a254e4db..aab4b009b505417a93238683b617f603ffc256be 100644
--- a/tensorflow/core/kernels/fuzzing/BUILD
+++ b/tensorflow/core/kernels/fuzzing/BUILD
@@ -17,18 +17,6 @@ cc_library(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 load("//tensorflow/core/kernels/fuzzing:tf_ops_fuzz_target_lib.bzl", "tf_ops_fuzz_target_lib")
 
 tf_ops_fuzz_target_lib("identity")
diff --git a/tensorflow/core/kernels/hexagon/BUILD b/tensorflow/core/kernels/hexagon/BUILD
index 108d59db2c21cad6ff3136a0271f9ba1f7d7a237..4870d9ae200cd55adc4833c044e5588aa1d6aa89 100644
--- a/tensorflow/core/kernels/hexagon/BUILD
+++ b/tensorflow/core/kernels/hexagon/BUILD
@@ -13,18 +13,6 @@ load(
     "tf_kernel_library",
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tf_cc_test(
     name = "graph_transferer_test",
     size = "small",
@@ -45,6 +33,7 @@ tf_cc_test(
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
         "//tensorflow/core/kernels:cwise_op",
+        "//tensorflow/core/kernels:quantization_utils",
         "//tensorflow/core/kernels:quantized_ops",
         "//tensorflow/core/kernels:reduction_ops",
         "//tensorflow/core/kernels:remote_fused_graph_execute_utils",
diff --git a/tensorflow/core/kernels/identity_op.cc b/tensorflow/core/kernels/identity_op.cc
index a18a72c66dc659ffd372c231524dbf038df6ac22..dffb4d71713f54307097fe6600622992e6b8977e 100644
--- a/tensorflow/core/kernels/identity_op.cc
+++ b/tensorflow/core/kernels/identity_op.cc
@@ -101,6 +101,10 @@ REGISTER_SYCL_HOST_KERNEL(bool);
   REGISTER_KERNEL_BUILDER(Name("DebugGradientIdentity")                     \
                               .Device(DEVICE_GPU)                           \
                               .TypeConstraint<type>("T"),                   \
+                          IdentityOp);                                      \
+  REGISTER_KERNEL_BUILDER(Name("PlaceholderWithDefault")                    \
+                              .Device(DEVICE_GPU)                           \
+                              .TypeConstraint<type>("dtype"),               \
                           IdentityOp)
 
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
@@ -112,18 +116,30 @@ REGISTER_GPU_KERNEL(Variant);
 // A special GPU kernel for int32 and bool.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
 // registration requires all int32 inputs and outputs to be in host memory.
-#define REGISTER_GPU_HOST_KERNEL(type)                    \
-  REGISTER_KERNEL_BUILDER(Name("Identity")                \
-                              .Device(DEVICE_GPU)         \
-                              .HostMemory("input")        \
-                              .HostMemory("output")       \
-                              .TypeConstraint<type>("T"), \
-                          IdentityOp);                    \
-  REGISTER_KERNEL_BUILDER(Name("RefIdentity")             \
-                              .Device(DEVICE_GPU)         \
-                              .HostMemory("input")        \
-                              .HostMemory("output")       \
-                              .TypeConstraint<type>("T"), \
+#define REGISTER_GPU_HOST_KERNEL(type)                        \
+  REGISTER_KERNEL_BUILDER(Name("Identity")                    \
+                              .Device(DEVICE_GPU)             \
+                              .HostMemory("input")            \
+                              .HostMemory("output")           \
+                              .TypeConstraint<type>("T"),     \
+                          IdentityOp);                        \
+  REGISTER_KERNEL_BUILDER(Name("RefIdentity")                 \
+                              .Device(DEVICE_GPU)             \
+                              .HostMemory("input")            \
+                              .HostMemory("output")           \
+                              .TypeConstraint<type>("T"),     \
+                          IdentityOp);                        \
+  REGISTER_KERNEL_BUILDER(Name("StopGradient")                \
+                              .Device(DEVICE_GPU)             \
+                              .HostMemory("input")            \
+                              .HostMemory("output")           \
+                              .TypeConstraint<type>("T"),     \
+                          IdentityOp);                        \
+  REGISTER_KERNEL_BUILDER(Name("PlaceholderWithDefault")      \
+                              .Device(DEVICE_GPU)             \
+                              .HostMemory("input")            \
+                              .HostMemory("output")           \
+                              .TypeConstraint<type>("dtype"), \
                           IdentityOp)
 
 REGISTER_GPU_HOST_KERNEL(int32);
diff --git a/tensorflow/core/kernels/immutable_constant_op_test.cc b/tensorflow/core/kernels/immutable_constant_op_test.cc
index b3814331ee7f42a63af93cb35e943463724cf5a6..b2dc16d5d729fff71ee6651dd87970a8c6b4bb66 100644
--- a/tensorflow/core/kernels/immutable_constant_op_test.cc
+++ b/tensorflow/core/kernels/immutable_constant_op_test.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/null_file_system.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/public/session.h"
diff --git a/tensorflow/core/kernels/initializable_lookup_table.cc b/tensorflow/core/kernels/initializable_lookup_table.cc
index 9c428cdedc94a7f2851b5e7d7a8b43aa52fd0ee2..06d53eba305f98fe937839fc7261a950de9db7db 100644
--- a/tensorflow/core/kernels/initializable_lookup_table.cc
+++ b/tensorflow/core/kernels/initializable_lookup_table.cc
@@ -44,7 +44,7 @@ Status InitializableLookupTable::Initialize(InitTableIterator& iter) {
     return errors::FailedPrecondition("Table already initialized.");
   }
 
-  TF_RETURN_IF_ERROR(DoPrepare(iter.total_size()));
+  TF_RETURN_IF_ERROR(DoLazyPrepare([&iter]() { return iter.total_size(); }));
   while (iter.Valid()) {
     TF_RETURN_IF_ERROR(DoInsert(iter.keys(), iter.values()));
     iter.Next();
diff --git a/tensorflow/core/kernels/initializable_lookup_table.h b/tensorflow/core/kernels/initializable_lookup_table.h
index e9eae9f863b136fdd217792946437f4755cf55bb..edb779540fb12e97740fd54a159887c8b3170f9a 100644
--- a/tensorflow/core/kernels/initializable_lookup_table.h
+++ b/tensorflow/core/kernels/initializable_lookup_table.h
@@ -92,6 +92,8 @@ class InitializableLookupTable : public LookupInterface {
   //
   // Then the iterator is exhausted, valid returns false and status returns
   // Status::OutOfRange.
+  //
+  // This class is Thread-unsafe.
   class InitTableIterator {
    public:
     InitTableIterator() {}
@@ -114,6 +116,7 @@ class InitializableLookupTable : public LookupInterface {
     virtual Status status() const = 0;
 
     // Returns the total number of elements that the iterator will produce.
+    // It might return -1 in case of error.
     virtual int64 total_size() const = 0;
 
    private:
@@ -129,6 +132,17 @@ class InitializableLookupTable : public LookupInterface {
   // number of expected elements.
   virtual Status DoPrepare(size_t expected_num_elements) = 0;
 
+  // Same as DoPrepare() but derived implementations might choose to skip
+  // calling get_expected_num_elements if size is not needed for DoPrepare.
+  virtual Status DoLazyPrepare(
+      std::function<int64(void)> get_expected_num_elements) {
+    int64 expected_num_elements = get_expected_num_elements();
+    if (expected_num_elements < 0) {
+      return errors::FailedPrecondition("Got negative expected_num_elements.");
+    }
+    return DoPrepare(expected_num_elements);
+  }
+
   // Populates the table in batches given keys and values as tensors into the
   // underlying data structure.
   virtual Status DoInsert(const Tensor& keys, const Tensor& values) = 0;
diff --git a/tensorflow/core/kernels/list_kernels.cc b/tensorflow/core/kernels/list_kernels.cc
index baf0a4abe48ea0c5a5fed5d7ef3e53925e393b10..9e7786f25e052b0113a8020e3af1e015eae41b8d 100644
--- a/tensorflow/core/kernels/list_kernels.cc
+++ b/tensorflow/core/kernels/list_kernels.cc
@@ -112,6 +112,7 @@ bool TensorList::Decode(const VariantTensorData& data) {
       dims.push_back(scratch);
     }
   }
+  element_shape = PartialTensorShape(dims);
   return true;
 }
 
diff --git a/tensorflow/core/kernels/list_kernels.h b/tensorflow/core/kernels/list_kernels.h
index 9733883001d4ce7888b4893ecb43047b621a3eba..f3bbf3b6e37d0a2852b68a018e9d32ac88f610a7 100644
--- a/tensorflow/core/kernels/list_kernels.h
+++ b/tensorflow/core/kernels/list_kernels.h
@@ -83,7 +83,8 @@ class TensorListStack : public OpKernel {
                                         DataTypeString(l->element_dtype)));
     OP_REQUIRES(c, l->element_shape.IsFullyDefined(),
                 errors::InvalidArgument("Tried to stack elements from a list "
-                                        "with non-fully-defined shape."));
+                                        "with non-fully-defined shape: ",
+                                        l->element_shape.DebugString()));
     if (num_elements_ != -1) {
       OP_REQUIRES(c, l->tensors.size() == num_elements_,
                   errors::InvalidArgument("Operation expected a list with ",
@@ -159,15 +160,13 @@ class TensorListFromTensor : public OpKernel {
       tmp_shape.RemoveDim(0);
       OP_REQUIRES(c, tmp.CopyFrom(tmp, tmp_shape),
                   errors::Unknown("Unexpected shape error."));
-      if (tmp.IsAligned() || !DataTypeCanUseMemcpy(DataTypeToEnum<T>::value)) {
-        output_list.tensors.push_back(tmp);
-      } else {
-        Tensor aligned;
-        OP_REQUIRES_OK(c, c->allocate_temp(tmp.dtype(), tmp.shape(), &aligned));
-        aligned.flat<T>().device(c->eigen_device<Device>()) =
-            tmp.unaligned_flat<T>();
-        output_list.tensors.push_back(aligned);
-      }
+      // TODO(apassos) maybe not always align; but weird compiler bugs seem to
+      // prevent this.
+      Tensor aligned;
+      OP_REQUIRES_OK(c, c->allocate_temp(tmp.dtype(), tmp.shape(), &aligned));
+      aligned.flat<T>().device(c->eigen_device<Device>()) =
+          tmp.unaligned_flat<T>();
+      output_list.tensors.push_back(aligned);
     }
     output_tensor->scalar<Variant>()() = std::move(output_list);
   }
diff --git a/tensorflow/core/kernels/lookup_table_op.cc b/tensorflow/core/kernels/lookup_table_op.cc
index e3872fee0edcae543b9193e0dcf6850d194ef067..57b7798ba04eab5d1a869d4782dfe7d0dc727df4 100644
--- a/tensorflow/core/kernels/lookup_table_op.cc
+++ b/tensorflow/core/kernels/lookup_table_op.cc
@@ -22,6 +22,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/variant.h"
 #include "tensorflow/core/kernels/initializable_lookup_table.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/hash/hash.h"
@@ -62,8 +63,7 @@ class MutableHashTableOfScalars final : public LookupInterface {
     mutex_lock l(mu_);
     for (int64 i = 0; i < key_values.size(); ++i) {
       value_values(i) = gtl::FindWithDefault(
-          table_, SubtleMustCopyUnlessStringOrFloat(key_values(i)),
-          default_val);
+          table_, SubtleMustCopyIfIntegral(key_values(i)), default_val);
     }
 
     return Status::OK();
@@ -78,9 +78,8 @@ class MutableHashTableOfScalars final : public LookupInterface {
       table_.clear();
     }
     for (int64 i = 0; i < key_values.size(); ++i) {
-      gtl::InsertOrUpdate(&table_,
-                          SubtleMustCopyUnlessStringOrFloat(key_values(i)),
-                          SubtleMustCopyUnlessStringOrFloat(value_values(i)));
+      gtl::InsertOrUpdate(&table_, SubtleMustCopyIfIntegral(key_values(i)),
+                          SubtleMustCopyIfIntegral(value_values(i)));
     }
     return Status::OK();
   }
@@ -172,8 +171,8 @@ class MutableHashTableOfTensors final : public LookupInterface {
 
     mutex_lock l(mu_);
     for (int64 i = 0; i < key_values.size(); ++i) {
-      ValueArray* value_vec = gtl::FindOrNull(
-          table_, SubtleMustCopyUnlessStringOrFloat(key_values(i)));
+      ValueArray* value_vec =
+          gtl::FindOrNull(table_, SubtleMustCopyIfIntegral(key_values(i)));
       if (value_vec != nullptr) {
         for (int64 j = 0; j < value_dim; j++) {
           value_values(i, j) = value_vec->at(j);
@@ -203,8 +202,8 @@ class MutableHashTableOfTensors final : public LookupInterface {
         V value = value_values(i, j);
         value_vec.push_back(value);
       }
-      gtl::InsertOrUpdate(
-          &table_, SubtleMustCopyUnlessStringOrFloat(key_values(i)), value_vec);
+      gtl::InsertOrUpdate(&table_, SubtleMustCopyIfIntegral(key_values(i)),
+                          value_vec);
     }
     return Status::OK();
   }
@@ -379,15 +378,14 @@ class MutableDenseHashTable final : public LookupInterface {
           for (int64 j = 0; j < value_size; ++j) {
             // TODO(andreasst): check if we can get rid of SubtleMustCopy
             // here and elsewhere in this file.
-            value_matrix(i, j) = SubtleMustCopyUnlessStringOrFloat(
-                value_buckets_matrix(bucket_index, j));
+            value_matrix(i, j) =
+                SubtleMustCopyIfIntegral(value_buckets_matrix(bucket_index, j));
           }
           break;
         }
         if (IsEqualKey(key_buckets_matrix, bucket_index, empty_key_matrix, 0)) {
           for (int64 j = 0; j < value_size; ++j) {
-            value_matrix(i, j) =
-                SubtleMustCopyUnlessStringOrFloat(default_flat(j));
+            value_matrix(i, j) = SubtleMustCopyIfIntegral(default_flat(j));
           }
           break;
         }
@@ -531,7 +529,7 @@ class MutableDenseHashTable final : public LookupInterface {
         if (IsEqualKey(key_buckets_matrix, bucket_index, key_matrix, i)) {
           for (int64 j = 0; j < value_size; ++j) {
             value_buckets_matrix(bucket_index, j) =
-                SubtleMustCopyUnlessStringOrFloat(value_matrix(i, j));
+                SubtleMustCopyIfIntegral(value_matrix(i, j));
           }
           break;
         }
@@ -539,11 +537,11 @@ class MutableDenseHashTable final : public LookupInterface {
           ++num_entries_;
           for (int64 j = 0; j < key_size; ++j) {
             key_buckets_matrix(bucket_index, j) =
-                SubtleMustCopyUnlessStringOrFloat(key_matrix(i, j));
+                SubtleMustCopyIfIntegral(key_matrix(i, j));
           }
           for (int64 j = 0; j < value_size; ++j) {
             value_buckets_matrix(bucket_index, j) =
-                SubtleMustCopyUnlessStringOrFloat(value_matrix(i, j));
+                SubtleMustCopyIfIntegral(value_matrix(i, j));
           }
           break;
         }
@@ -849,6 +847,7 @@ REGISTER_KERNEL(string, int64);
 REGISTER_KERNEL(int64, string);
 REGISTER_KERNEL(string, bool);
 REGISTER_KERNEL(int64, float);
+REGISTER_KERNEL(int64, Variant);
 
 #undef REGISTER_KERNEL
 
@@ -899,6 +898,7 @@ REGISTER_KERNEL(int64, double);
 REGISTER_KERNEL(string, float);
 REGISTER_KERNEL(string, bool);
 REGISTER_KERNEL(int64, bool);
+REGISTER_KERNEL(int64, Variant);
 
 #undef REGISTER_KERNEL
 
diff --git a/tensorflow/core/kernels/lookup_table_op.h b/tensorflow/core/kernels/lookup_table_op.h
index 5ba9b936e4ea309ceda645f63e9630f01a99c985..29a0cc91fe01e4e2fcede3f2f28c59ba7e89607d 100644
--- a/tensorflow/core/kernels/lookup_table_op.h
+++ b/tensorflow/core/kernels/lookup_table_op.h
@@ -125,19 +125,21 @@ namespace lookup {
 // integral types. However non-integer variables are not allowed and therefore
 // the local copy is unnecessary.
 template <typename T>
-T SubtleMustCopyUnlessStringOrFloat(const T& value) {
+T SubtleMustCopyIfIntegral(const T& value) {
   return internal::SubtleMustCopy(value);
 }
 
-inline const string& SubtleMustCopyUnlessStringOrFloat(const string& value) {
+inline const string& SubtleMustCopyIfIntegral(const string& value) {
   return value;
 }
 
-inline const float SubtleMustCopyUnlessStringOrFloat(const float value) {
+inline const float SubtleMustCopyIfIntegral(const float value) { return value; }
+
+inline const double SubtleMustCopyIfIntegral(const double value) {
   return value;
 }
 
-inline const double SubtleMustCopyUnlessStringOrFloat(const double value) {
+inline const Variant& SubtleMustCopyIfIntegral(const Variant& value) {
   return value;
 }
 
@@ -191,6 +193,11 @@ class HashTable : public InitializableLookupTable {
     return Status::OK();
   };
 
+  Status DoLazyPrepare(std::function<int64(void)> unused) override {
+    constexpr size_t kUnusedSize = 0;
+    return DoPrepare(kUnusedSize);
+  }
+
   Status DoInsert(const Tensor& keys, const Tensor& values) override {
     if (!table_) {
       return errors::FailedPrecondition("HashTable is not prepared.");
@@ -199,8 +206,8 @@ class HashTable : public InitializableLookupTable {
     const auto key_values = keys.flat<K>();
     const auto value_values = values.flat<V>();
     for (int64 i = 0; i < key_values.size(); ++i) {
-      const K key = SubtleMustCopyUnlessStringOrFloat(key_values(i));
-      const V value = SubtleMustCopyUnlessStringOrFloat(value_values(i));
+      const K key = SubtleMustCopyIfIntegral(key_values(i));
+      const V value = SubtleMustCopyIfIntegral(value_values(i));
       const V& previous_value = gtl::LookupOrInsert(table_.get(), key, value);
       if (previous_value != value) {
         return errors::FailedPrecondition(
@@ -219,8 +226,7 @@ class HashTable : public InitializableLookupTable {
 
     for (int64 i = 0; i < key_values.size(); ++i) {
       value_values(i) = gtl::FindWithDefault(
-          *table_, SubtleMustCopyUnlessStringOrFloat(key_values(i)),
-          default_val);
+          *table_, SubtleMustCopyIfIntegral(key_values(i)), default_val);
     }
     return Status::OK();
   }
diff --git a/tensorflow/core/kernels/lookup_util.cc b/tensorflow/core/kernels/lookup_util.cc
index c7ce1c3747ea9f329f96d62af27708b0c9f4eb68..27031d9216129b842195993279f6d6c2acf7fb5f 100644
--- a/tensorflow/core/kernels/lookup_util.cc
+++ b/tensorflow/core/kernels/lookup_util.cc
@@ -75,9 +75,6 @@ class TextFileLineIterator
   Status Init(const string& filename, int64 vocab_size, char delimiter,
               DataType key_dtype, int64 key_index, DataType value_dtype,
               int64 value_index, Env* env) {
-    if (vocab_size == -1) {
-      TF_RETURN_IF_ERROR(GetNumLinesInTextFile(env, filename, &vocab_size));
-    }
     filename_ = filename;
     vocab_size_ = vocab_size;
     delimiter_ = delimiter;
@@ -85,6 +82,7 @@ class TextFileLineIterator
     value_ = Tensor(value_dtype, TensorShape({}));
     key_index_ = key_index;
     value_index_ = value_index;
+    env_ = env;
 
     status_ = env->NewRandomAccessFile(filename_, &file_);
     if (!status_.ok()) return status_;
@@ -103,15 +101,15 @@ class TextFileLineIterator
     string line;
     status_ = input_buffer_->ReadLine(&line);
     if (!status_.ok()) {
-      if (errors::IsOutOfRange(status_) && next_id_ != vocab_size_) {
+      if (errors::IsOutOfRange(status_) && next_id_ != total_size()) {
         status_ = errors::InvalidArgument("Invalid vocab_size in ", filename_,
-                                          ": expected ", vocab_size_,
+                                          ": expected ", total_size(),
                                           " but got ", next_id_);
       }
       valid_ = false;
       return;
     }
-    if (next_id_ >= vocab_size_) {
+    if (vocab_size_ != -1 && next_id_ >= vocab_size_) {
       LOG(WARNING) << "Truncated " << filename_ << " before its end at "
                    << vocab_size_ << " records.";
       LOG(WARNING) << "next_id_  : " << next_id_;
@@ -162,7 +160,18 @@ class TextFileLineIterator
 
   Status status() const override { return status_; }
 
-  int64 total_size() const override { return vocab_size_; }
+  int64 total_size() const override {
+    if (vocab_size_ == -1) {
+      int64 new_size;
+      Status status = GetNumLinesInTextFile(env_, filename_, &new_size);
+      if (!status.ok()) {
+        LOG(WARNING) << "Unable to get line count: " << status;
+        new_size = -1;
+      }
+      *const_cast<int64*>(&vocab_size_) = new_size;
+    }
+    return vocab_size_;
+  }
 
  private:
   Tensor key_;
@@ -170,6 +179,7 @@ class TextFileLineIterator
   bool valid_;  // true if the iterator points to an existing range.
   int64 key_index_;
   int64 value_index_;
+  Env* env_;
   int64 next_id_;
   int64 vocab_size_;
   string filename_;
diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc
index aa3ea890b04358d6176b44558fed014ef29259e3..9ab95d765c39d70448e5a99aeb3fad6101827daf 100644
--- a/tensorflow/core/kernels/mkl_concat_op.cc
+++ b/tensorflow/core/kernels/mkl_concat_op.cc
@@ -803,8 +803,10 @@ class MklConcatOp : public OpKernel {
     Tensor* output_tensor = nullptr;
     TensorShape tf_shape_output;
     tf_shape_output.AddDim(dnn_shape_output.GetSerializeBufferSize());
-    context->allocate_output(GetTensorMetaDataIndex(0, context->num_outputs()),
-                             tf_shape_output, &output_tensor);
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(
+                       GetTensorMetaDataIndex(0, context->num_outputs()),
+                       tf_shape_output, &output_tensor));
     dnn_shape_output.SerializeMklDnnShape(
         output_tensor->flat<uint8>().data(),
         output_tensor->flat<uint8>().size() * sizeof(uint8));
diff --git a/tensorflow/core/kernels/mkl_conv_ops.h b/tensorflow/core/kernels/mkl_conv_ops.h
index 7ca10db895c2224aef6b4306ad585d5b617c446c..8333a09316c2147e79a610eeb6c4d7aafde6e2bf 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.h
+++ b/tensorflow/core/kernels/mkl_conv_ops.h
@@ -65,9 +65,12 @@ class MklDnnConvUtil {
  public:
   MklDnnConvUtil(OpKernelContext* context, const std::vector<int32>& strides,
                  Padding pad, TensorFormat fm,
-                 const std::vector<int32>& dilations) :
-    context_(context), strides_(strides), padding_(pad),
-    data_format_(fm), dilations_(dilations) {}
+                 const std::vector<int32>& dilations)
+      : context_(context),
+        strides_(strides),
+        dilations_(dilations),
+        padding_(pad),
+        data_format_(fm) {}
 
   virtual ~MklDnnConvUtil() { context_ = nullptr; }
 
diff --git a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
index 9e564b016f54b476f1d5e1d91f291c1ce3e3fda2..62aafa793056f233ac84d1c8ca49bba1e73035c9 100644
--- a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
+++ b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
@@ -817,8 +817,8 @@ class MklFusedBatchNormOp : public OpKernel {
       // set weights primitive
       // MKL-DNN packs scale & shift as "weights":
       // <scale>...<scale><shift>...<shift>
-      auto weights_desc =
-          memory::desc({2, depth_}, MklDnnType<T>(), memory::format::nc);
+      auto weights_desc = memory::desc({2, static_cast<int>(depth_)},
+                                       MklDnnType<T>(), memory::format::nc);
       auto weights_pd = memory::primitive_desc(weights_desc, cpu_engine);
       auto weights_m = memory(weights_pd);
       T* weights_data = reinterpret_cast<T*>(weights_m.get_data_handle());
@@ -833,8 +833,8 @@ class MklFusedBatchNormOp : public OpKernel {
       }
 
       // set mean primitive
-      auto mean_desc =
-          memory::desc({1, depth_}, MklDnnType<T>(), memory::format::nc);
+      auto mean_desc = memory::desc({1, static_cast<int>(depth_)},
+                                    MklDnnType<T>(), memory::format::nc);
       auto mean_pd = memory::primitive_desc(mean_desc, cpu_engine);
       char* saved_mean_data_tf =
           reinterpret_cast<char*>(saved_mean_tensor->flat<T>().data());
@@ -844,8 +844,8 @@ class MklFusedBatchNormOp : public OpKernel {
           memory(mean_pd, reinterpret_cast<void*>(saved_mean_data_tf));
 
       // set variance primitive
-      auto variance_desc =
-          memory::desc({1, depth_}, MklDnnType<T>(), memory::format::nc);
+      auto variance_desc = memory::desc({1, static_cast<int>(depth_)},
+                                        MklDnnType<T>(), memory::format::nc);
       auto variance_pd = memory::primitive_desc(variance_desc, cpu_engine);
       char* saved_variance_data_tf =
           reinterpret_cast<char*>(saved_variance_tensor->flat<T>().data());
@@ -933,7 +933,7 @@ class MklFusedBatchNormOp : public OpKernel {
   bool is_training_;
   T* mean_values_;
   T* variance_values_;
-  size_t depth_;  // batch normalization is done for per channel.
+  int depth_;  // batch normalization is done for per channel.
 
   void ExtractParams(OpKernelContext* context) {
     const Tensor& input = MklGetInput(context, 0);
diff --git a/tensorflow/core/kernels/mkl_input_conversion_op.cc b/tensorflow/core/kernels/mkl_input_conversion_op.cc
index d91f7107c5b1effdfa6c4c3b95b16bcf31750f42..68d3e1c9abde59d12a66d56896cc2e262794f756 100644
--- a/tensorflow/core/kernels/mkl_input_conversion_op.cc
+++ b/tensorflow/core/kernels/mkl_input_conversion_op.cc
@@ -263,21 +263,18 @@ class MklInputConversionOp : public OpKernel {
 
  private:
   void Compute(OpKernelContext* context) override {
-    const Tensor& input_tensor_0 = MklGetInput(context, 0);
+    const int kInputIndex_0 = 0, kInputIndex_1 = 1;
+    const Tensor& input_tensor_0 = MklGetInput(context, kInputIndex_0);
     MklDnnShape input_shape_0;
-    GetMklShape(context, 0, &input_shape_0);
+    GetMklShape(context, kInputIndex_0, &input_shape_0);
 
-    const Tensor& input_tensor_1 = MklGetInput(context, 1);
+    const Tensor& input_tensor_1 = MklGetInput(context, kInputIndex_1);
     MklDnnShape input_shape_1;
-    GetMklShape(context, 1, &input_shape_1);
-
-    bool tf_shapes_are_same =
-        context->input(0).shape() == context->input(1).shape();
+    GetMklShape(context, kInputIndex_1, &input_shape_1);
 
-    VLOG(1) << "MklInputConversionOp: Input shapes are "
-            << (tf_shapes_are_same ? "*same*" : "*different*") << ": "
-            << context->input(0).shape().DebugString() << " and "
-            << context->input(1).shape().DebugString();
+    VLOG(1) << "MklInputConversionOp: Input shapes are: "
+            << context->input(kInputIndex_0).shape().DebugString() << " and "
+            << context->input(kInputIndex_1).shape().DebugString();
 
     // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
     // if both inputs are in TF format, just copy input tensors to output.
@@ -285,15 +282,19 @@ class MklInputConversionOp : public OpKernel {
       VLOG(1) << "MklInputConversionOp: No conversion needed, "
               << "copying TF inputs to output";
 
-      ForwardTfTensorInToOut(context, 0, 0);
-      ForwardTfTensorInToOut(context, 1, 1);
+      ForwardTfTensorInToOut(context, kInputIndex_0, kInputIndex_0);
+      ForwardTfTensorInToOut(context, kInputIndex_1, kInputIndex_1);
       return;
     }
 
     // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
     // If both inputs are in MKL format
     if (input_shape_0.IsMklTensor() && input_shape_1.IsMklTensor()) {
-      if (tf_shapes_are_same) {
+      // It is safer to compare the original TensorFlow shapes than to compare
+      // Mkl shapes since element wise ops are forwarded to Eigen implementation.
+      TensorShape tf_shape0 = input_shape_0.GetTfShape();
+      TensorShape tf_shape1 = input_shape_1.GetTfShape();
+      if (tf_shape0 == tf_shape1) {
         auto input0_md = input_shape_0.GetMklLayout();
         auto input1_md = input_shape_1.GetMklLayout();
 
@@ -302,8 +303,8 @@ class MklInputConversionOp : public OpKernel {
           VLOG(1) << "MklInputConversionOp: No conversion needed, "
                   << "copying MKL inputs with identical shapes to output";
 
-          ForwardMklTensorInToOut(context, 0, 0);
-          ForwardMklTensorInToOut(context, 1, 1);
+          ForwardMklTensorInToOut(context, kInputIndex_0, kInputIndex_0);
+          ForwardMklTensorInToOut(context, kInputIndex_1, kInputIndex_1);
           return;
         } else {
           VLOG(1) << "MklInputConversionOp: Shape is same, but format is "
@@ -324,7 +325,7 @@ class MklInputConversionOp : public OpKernel {
           mkl_output_mkl_shape.SetMklLayout(&input1_md);
 
           // Create output Mkl tensor for index 0
-          AllocateOutputSetMklShape(context, 0, &tensor_out,
+          AllocateOutputSetMklShape(context, kInputIndex_0, &tensor_out,
                                     input_tensor_0.shape(),
                                     mkl_output_mkl_shape);
 
@@ -342,7 +343,7 @@ class MklInputConversionOp : public OpKernel {
           stream(stream::kind::eager).submit(net).wait();
 
           // Input1 will be passed through
-          ForwardMklTensorInToOut(context, 1, 1);
+          ForwardMklTensorInToOut(context, kInputIndex_1, kInputIndex_1);
           return;
         }
       }
@@ -361,11 +362,11 @@ class MklInputConversionOp : public OpKernel {
               << "converted MKL inputs to TF format";
 
       MklToTfOp<Device, T>::ConvertMklToTf(this, context, data_format_str,
-                                           op_data_type, has_avx512f_, 0);
+                                           op_data_type, has_avx512f_, kInputIndex_0);
       MklToTfOp<Device, T>::ConvertMklToTf(this, context, data_format_str,
-                                           op_data_type, has_avx512f_, 1);
-      SetDummyMklShapeOutput(context, 0);
-      SetDummyMklShapeOutput(context, 1);
+                                           op_data_type, has_avx512f_, kInputIndex_1);
+      SetDummyMklShapeOutput(context, kInputIndex_0);
+      SetDummyMklShapeOutput(context, kInputIndex_1);
       return;
     }
 
@@ -377,7 +378,6 @@ class MklInputConversionOp : public OpKernel {
     const Tensor* mkl_tensor;
     const MklDnnShape* mkl_shape;
     const Tensor* tf_tensor;
-    MklDnnShape* tf_mkl_shape;
     uint mkl_tensor_index;
     uint tf_tensor_index;
     if (input_shape_0.IsMklTensor() && !input_shape_1.IsMklTensor()) {
@@ -385,14 +385,12 @@ class MklInputConversionOp : public OpKernel {
       mkl_shape = &input_shape_0;
       mkl_tensor_index = 0;
       tf_tensor = &input_tensor_1;
-      tf_mkl_shape = &input_shape_1;
       tf_tensor_index = 1;
     } else if (!input_shape_0.IsMklTensor() && input_shape_1.IsMklTensor()) {
       mkl_tensor = &input_tensor_1;
       mkl_shape = &input_shape_1;
       mkl_tensor_index = 1;
       tf_tensor = &input_tensor_0;
-      tf_mkl_shape = &input_shape_0;
       tf_tensor_index = 0;
     } else {
       CHECK(false) << "MklInputConversionOp: Unexpected combination of input "
@@ -466,8 +464,8 @@ class MklInputConversionOp : public OpKernel {
     }
 
     VLOG(1) << "MklInputConversionOp: Shapes (output): "
-            << context->mutable_output(0)->shape().DebugString() << " and "
-            << context->mutable_output(1)->shape().DebugString();
+            << context->mutable_output(kInputIndex_0)->shape().DebugString() << " and "
+            << context->mutable_output(kInputIndex_1)->shape().DebugString();
 
     VLOG(1) << "MklInputConversion completed successfully.";
   }
diff --git a/tensorflow/core/kernels/mkl_lrn_op.cc b/tensorflow/core/kernels/mkl_lrn_op.cc
index 282012c719fe3045e880ef0dc9027a50c0f23fec..eef254cdadbde377c463ea2c5dad693d890d1dc5 100644
--- a/tensorflow/core/kernels/mkl_lrn_op.cc
+++ b/tensorflow/core/kernels/mkl_lrn_op.cc
@@ -752,7 +752,8 @@ class MklLRNOp : public OpKernel {
     OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_));
     OP_REQUIRES_OK(context, context->GetAttr("beta", &beta_));
     workspace_enabled_ = false;
-    context->GetAttr("workspace_enabled", &workspace_enabled_);
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("workspace_enabled", &workspace_enabled_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -1001,7 +1002,8 @@ class MklLRNGradOp : public OpKernel {
     OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_));
     OP_REQUIRES_OK(context, context->GetAttr("beta", &beta_));
     workspace_enabled_ = false;
-    context->GetAttr("workspace_enabled", &workspace_enabled_);
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("workspace_enabled", &workspace_enabled_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -1043,7 +1045,6 @@ class MklLRNGradOp : public OpKernel {
       // Naming: diff_dst is input_gradient_tensor; src is orig_input_tensor.
       const Tensor& input_grad_tensor = MklGetInput(context, kIdxGradient);
       const Tensor& orig_input_tensor = MklGetInput(context, kIdxOrigInput);
-      const Tensor& orig_output_tensor = MklGetInput(context, kIdxOrigOutput);
 
       // Get input sizes in MKL-DNN required NCHW format.
       // LRN does not have data_format attribute. But by default it has
diff --git a/tensorflow/core/kernels/mkl_reshape_op.cc b/tensorflow/core/kernels/mkl_reshape_op.cc
index 5dbc4a2709e2bc379ae3b9aa68ed14f3d6893e7c..2cfde1f6fd4112ea1b4e489be3d9ce0014cbaa6a 100644
--- a/tensorflow/core/kernels/mkl_reshape_op.cc
+++ b/tensorflow/core/kernels/mkl_reshape_op.cc
@@ -266,7 +266,9 @@ class MklReshapeOp : public OpKernel {
                                                    &net)) {
               stream(stream::kind::eager).submit(net).wait();
             } else {
-              output_tensor->CopyFrom(input_tensor, shape_to);
+              OP_REQUIRES(
+                  context, output_tensor->CopyFrom(input_tensor, shape_to),
+                  errors::InvalidArgument("invalid input tensor shape"));
             }
             return;
           } else {
diff --git a/tensorflow/core/kernels/mkl_softmax_op.cc b/tensorflow/core/kernels/mkl_softmax_op.cc
index aceef1e234eff3660b33f5a091a2cd10e25ea2f9..f79e18cff29de5682ac2db445160d9346425414f 100644
--- a/tensorflow/core/kernels/mkl_softmax_op.cc
+++ b/tensorflow/core/kernels/mkl_softmax_op.cc
@@ -27,7 +27,6 @@ limitations under the License.
 
 #include "mkldnn.h"
 #include "mkldnn_types.h"
-#include "tensorflow/core/platform/default/logging.h"
 #include "tensorflow/core/util/mkl_util.h"
 
 #include "mkldnn.hpp"
@@ -103,7 +102,7 @@ class MklSoftmaxOp : public OpKernel {
       // Softmax MklDnn output layout is same as input layout.
       auto dst_pd = src.GetUsrMemPrimDesc();
 
-      // if input is MKL shape, ouput is also MKL shape.
+      // if input is MKL shape, output is also MKL shape.
       // if input is TF shape, output is also TF shape
       if (src_mkl_shape.IsMklTensor()) {
         output_mkl_shape.SetMklTensor(true);
diff --git a/tensorflow/core/kernels/neon/BUILD b/tensorflow/core/kernels/neon/BUILD
index c3d24e50effb3fe5184e264064393a7f339105f0..313d40c082b3e334a01ba97eaf4449e1940b013a 100644
--- a/tensorflow/core/kernels/neon/BUILD
+++ b/tensorflow/core/kernels/neon/BUILD
@@ -12,18 +12,6 @@ load(
     "tf_kernel_library",
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tf_kernel_library(
     name = "neon_depthwise_conv_op",
     hdrs = [
diff --git a/tensorflow/core/kernels/pad_op.cc b/tensorflow/core/kernels/pad_op.cc
index 04c71e384b075855f993407041d50c75ea4e40ee..41494f56c5ea6b099f8eb7e81d50c83269aa278f 100644
--- a/tensorflow/core/kernels/pad_op.cc
+++ b/tensorflow/core/kernels/pad_op.cc
@@ -114,9 +114,12 @@ class PadOp : public OpKernel {
       Tensor collapsed_input;
       CHECK(collapsed_input.CopyFrom(in0, collapsed_input_shape));
       Tensor collapsed_output;
-      OP_REQUIRES_OK(context, context->allocate_temp(collapsed_input.dtype(),
-                                                     collapsed_output_shape,
-                                                     &collapsed_output));
+      AllocatorAttributes alloc_attrs;
+      alloc_attrs.set_on_host(context->input_memory_type(0) == HOST_MEMORY);
+      OP_REQUIRES_OK(context,
+                     context->allocate_temp(collapsed_input.dtype(),
+                                            collapsed_output_shape,
+                                            &collapsed_output, alloc_attrs));
       const Tensor& collapsed_paddings_ref = collapsed_paddings;
       typename TTypes<Tpadding>::ConstMatrix collapsed_paddings_matrix =
           collapsed_paddings_ref.matrix<Tpadding>();
diff --git a/tensorflow/core/kernels/queue_op.h b/tensorflow/core/kernels/queue_op.h
index ad606803ee7017380b33819dca7718023daa3900..6c19f9841cdd886a614e537d75cefee4c2e892d8 100644
--- a/tensorflow/core/kernels/queue_op.h
+++ b/tensorflow/core/kernels/queue_op.h
@@ -43,6 +43,7 @@ class QueueOp : public ResourceOpKernel<QueueInterface> {
 
   void Compute(OpKernelContext* context) override {
     ResourceOpKernel<QueueInterface>::Compute(context);
+    mutex_lock l(mu_);
     if (resource_ && context->track_allocations()) {
       context->record_persistent_memory_allocation(resource_->MemoryUsed());
     }
diff --git a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
index 9237fa51d885c633675146191dc384dd87d8ab22..0de2ebb5907caa13e0c1b2a4e11d218bd9701bae 100644
--- a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
+++ b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
@@ -244,6 +244,33 @@ __global__ void RowReduceKernel(
   if (row < num_rows && lane == 0) out[row] = sum;
 }
 
+template <typename T1>
+struct storage_type {
+  T1 val;
+  __host__ __device__ storage_type() {}
+  __host__ __device__ operator T1() { return val; }
+  __host__ __device__ storage_type<T1>& operator=(const T1& in) {
+    val = in;
+    return *this;
+  }
+};
+
+template <typename T2>
+struct storage_type<std::complex<T2>> {
+  T2 real;
+  T2 imag;
+  __host__ __device__ storage_type() {}
+  __host__ __device__ operator std::complex<T2>() {
+    return std::complex<T2>(real, imag);
+  }
+  __host__ __device__ storage_type<std::complex<T2>>& operator=(
+      const std::complex<T2>& in) {
+    real = in.real();
+    imag = in.imag();
+    return *this;
+  }
+};
+
 // Works only if there are <= 16 columns
 // each warps sums over multiple rows at once
 template <typename T, typename outT, typename Op>
@@ -268,7 +295,7 @@ __global__ void ColumnReduceMax16ColumnsKernel(
 
   // 1D array necessary due to bug in CUDA 9 compiler.
   // TODO(nluehr) revert to 2D array when compiler is ready.
-  __shared__ value_type partial_sums[32 * 33];
+  __shared__ storage_type<value_type> partial_sums[32 * 33];
 
   row += rows_per_warp * gridDim.y * blockDim.y;
   for (; row < num_rows; row += rows_per_warp * gridDim.y * blockDim.y) {
@@ -294,7 +321,8 @@ __global__ void ColumnReduceMax16ColumnsKernel(
 
     if (blockDim.y > 1) {
       for (int row = 1; row < blockDim.y; ++row) {
-        s = op(s, partial_sums[threadIdx.x * 33 + row]);
+        value_type t = partial_sums[threadIdx.x * 33 + row];
+        s = op(s, t);
       }
     }
 
@@ -316,7 +344,7 @@ __global__ void ColumnReduceKernel(
 
   // 1D array necessary due to bug in CUDA 9 compiler.
   // TODO(nluehr) revert to 2D array when compiler is ready.
-  __shared__ value_type partial_sums[32 * 33];
+  __shared__ storage_type<value_type> partial_sums[32 * 33];
 
   row += gridDim.y * blockDim.y;
 
@@ -347,7 +375,8 @@ __global__ void ColumnReduceKernel(
         min(blockDim.y, num_rows - blockIdx.y * blockDim.y);
 
     for (int row = 1; row < numRowsThisBlock; ++row) {
-      s = op(s, partial_sums[threadIdx.x * 33 + row]);
+      value_type t = partial_sums[threadIdx.x * 33 + row];
+      s = op(s, t);
     }
 
     out[col * gridDim.y + blockIdx.y] = s;
diff --git a/tensorflow/core/kernels/resource_variable_ops.cc b/tensorflow/core/kernels/resource_variable_ops.cc
index aecad0185fd6cd9574e29bb33f2707e04650aef4..f49a05c70ad122ce5da17ef91f279255ad18e306 100644
--- a/tensorflow/core/kernels/resource_variable_ops.cc
+++ b/tensorflow/core/kernels/resource_variable_ops.cc
@@ -250,8 +250,9 @@ class AssignVariableOp : public OpKernel {
 
     // Copying is unnecessary if we are the last user of the value
     // tensor, we can just adopt the input tensor's buffer instead.
-    std::unique_ptr<Tensor> input_alias =
-        context->forward_input(1, dtype_, value.shape(), DEVICE_MEMORY, attr);
+    std::unique_ptr<Tensor> input_alias = context->forward_input(
+        1, OpKernelContext::Params::kNoReservation /*output_index*/, dtype_,
+        value.shape(), DEVICE_MEMORY, attr);
     mutex_lock ml(*variable->mu());
     variable->is_initialized = true;
     if (input_alias) {
@@ -363,9 +364,36 @@ class AssignVariableOp<Device, Variant> : public OpKernel {
                     DataTypeString(variable->tensor()->dtype()), " got ",
                     DataTypeString(DT_VARIANT)));
 
+    AllocatorAttributes attr;
+    attr.set_on_host(true);
+
+    // Copying is unnecessary if we are the last user of the value
+    // tensor, we can just adopt the input tensor's buffer instead.
+    // Note that Variant objects themselves always reside on host.
+    std::unique_ptr<Tensor> input_alias = context->forward_input(
+        1, OpKernelContext::Params::kNoReservation /*output_index*/, DT_VARIANT,
+        value.shape(), HOST_MEMORY, attr);
+
     mutex_lock ml(*variable->mu());
     variable->is_initialized = true;
     *variable->tensor() = Tensor(DT_VARIANT, value.shape());
+
+    if (input_alias) {
+      *variable->tensor() = *input_alias;
+      return;
+    }
+
+    // Need to copy, but maybe we can re-use variable's buffer?
+    if (!variable->tensor()->RefCountIsOne() ||
+        !variable->tensor()->shape().IsSameSize(value.shape())) {
+      PersistentTensor unused;
+      Tensor* tmp;
+      OP_REQUIRES_OK(context,
+                     context->allocate_persistent(DT_VARIANT, value.shape(),
+                                                  &unused, &tmp, attr));
+      *variable->tensor() = *tmp;
+    }
+
     const auto elements_in = value.flat<Variant>();
     auto elements_out = variable->tensor()->flat<Variant>();
     auto copy_fn = std::bind(&VariantCopyFn<Device>, context,
@@ -503,6 +531,7 @@ class ResourceGatherOp : public OpKernel {
   void Compute(OpKernelContext* c) override {
     Var* v = nullptr;
     OP_REQUIRES_OK(c, LookupResource(c, HandleFromInput(c, 0), &v));
+    core::ScopedUnref su(v);
     // NOTE: We hold the lock for the whole gather operation instead
     // of increasing the reference count of v->tensor() to avoid a
     // situation where a write to the same variable will see a
@@ -576,7 +605,7 @@ TF_CALL_QUANTIZED_TYPES(REGISTER_GATHER_CPU);
 #if GOOGLE_CUDA
 #define REGISTER_GATHER_GPU(type) REGISTER_GATHER_ALL_INDICES(GPU, type)
 
-TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_GATHER_GPU);
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GATHER_GPU);
 
 #endif  // GOOGLE_CUDA
 
@@ -619,22 +648,35 @@ class ResourceScatterUpdateOp : public OpKernel {
     if (N > 0) {
       auto indices_flat = indices.flat<Index>();
       auto params_flat = params->flat_outer_dims<T>();
-      int64 num_updates = updates.NumElements();
-      OP_REQUIRES(c, num_updates % N == 0,
-                  errors::InvalidArgument(
-                      "shape of indices (", indices.shape().DebugString(),
-                      ") is not compatible with the shape of updates (",
-                      updates.shape().DebugString(), ")"));
-      auto updates_flat = updates.shaped<T, 2>({N, num_updates / N});
-
-      functor::ScatterFunctor<Device, T, Index, op> functor;
-      const Index bad_i = functor(c, c->template eigen_device<Device>(),
-                                  params_flat, updates_flat, indices_flat);
-      OP_REQUIRES(c, bad_i < 0,
-                  errors::InvalidArgument(
-                      "indices", SliceDebugString(indices.shape(), bad_i),
-                      " = ", indices_flat(bad_i), " is not in [0, ",
-                      params->dim_size(0), ")"));
+      if (TensorShapeUtils::IsScalar(updates.shape())) {
+        const auto update = updates.scalar<T>();
+
+        functor::ScatterScalarFunctor<Device, T, Index, op> functor;
+        const Index bad_i = functor(c, c->template eigen_device<Device>(),
+                                    params_flat, update, indices_flat);
+        OP_REQUIRES(c, bad_i < 0,
+                    errors::InvalidArgument(
+                        "indices", SliceDebugString(indices.shape(), bad_i),
+                        " = ", indices_flat(bad_i), " is not in [0, ",
+                        params->dim_size(0), ")"));
+      } else {
+        int64 num_updates = updates.NumElements();
+        OP_REQUIRES(c, num_updates % N == 0,
+                    errors::InvalidArgument(
+                        "shape of indices (", indices.shape().DebugString(),
+                        ") is not compatible with the shape of updates (",
+                        updates.shape().DebugString(), ")"));
+        auto updates_flat = updates.shaped<T, 2>({N, num_updates / N});
+
+        functor::ScatterFunctor<Device, T, Index, op> functor;
+        const Index bad_i = functor(c, c->template eigen_device<Device>(),
+                                    params_flat, updates_flat, indices_flat);
+        OP_REQUIRES(c, bad_i < 0,
+                    errors::InvalidArgument(
+                        "indices", SliceDebugString(indices.shape(), bad_i),
+                        " = ", indices_flat(bad_i), " is not in [0, ",
+                        params->dim_size(0), ")"));
+      }
     }
   }
 };
@@ -652,35 +694,51 @@ class ResourceScatterUpdateOp : public OpKernel {
   REGISTER_SCATTER_KERNEL_INDEX(type, int32, dev, name, op); \
   REGISTER_SCATTER_KERNEL_INDEX(type, int64, dev, name, op);
 
-// TODO(apassos) add the other types here.
-#define REGISTER_SCATTER_ARITHEMTIC(type, dev)                \
+#define REGISTER_SCATTER_ARITHMETIC(type, dev)                \
   REGISTER_SCATTER_KERNEL(type, dev, "ResourceScatterAdd",    \
                           scatter_op::UpdateOp::ADD);         \
+  REGISTER_SCATTER_KERNEL(type, dev, "ResourceScatterSub",    \
+                          scatter_op::UpdateOp::SUB);         \
+  REGISTER_SCATTER_KERNEL(type, dev, "ResourceScatterMul",    \
+                          scatter_op::UpdateOp::MUL);         \
+  REGISTER_SCATTER_KERNEL(type, dev, "ResourceScatterDiv",    \
+                          scatter_op::UpdateOp::DIV);         \
   REGISTER_SCATTER_KERNEL(type, dev, "ResourceScatterUpdate", \
                           scatter_op::UpdateOp::ASSIGN);
+#define REGISTER_SCATTER_MINMAX(type, dev)                 \
+  REGISTER_SCATTER_KERNEL(type, dev, "ResourceScatterMin", \
+                          scatter_op::UpdateOp::MIN);      \
+  REGISTER_SCATTER_KERNEL(type, dev, "ResourceScatterMax", \
+                          scatter_op::UpdateOp::MAX);
 
 // Registers CPU kernels.
-#define REGISTER_SCATTER_ARITHEMTIC_CPU(type) \
-  REGISTER_SCATTER_ARITHEMTIC(type, CPU);
+#define REGISTER_SCATTER_ARITHMETIC_CPU(type) \
+  REGISTER_SCATTER_ARITHMETIC(type, CPU);
+#define REGISTER_SCATTER_MINMAX_CPU(type) REGISTER_SCATTER_MINMAX(type, CPU);
 
-TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ARITHEMTIC_CPU);
+TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ARITHMETIC_CPU);
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_SCATTER_MINMAX_CPU);
 
 REGISTER_SCATTER_KERNEL(string, CPU, "ResourceScatterUpdate",
                         scatter_op::UpdateOp::ASSIGN);
 
 // Registers GPU kernels.
 #if GOOGLE_CUDA
-#define REGISTER_SCATTER_ARITHEMTIC_GPU(type) \
-  REGISTER_SCATTER_ARITHEMTIC(type, GPU);
+#define REGISTER_SCATTER_ARITHMETIC_GPU(type) \
+  REGISTER_SCATTER_ARITHMETIC(type, GPU);
+#define REGISTER_SCATTER_MINMAX_GPU(type) REGISTER_SCATTER_MINMAX(type, GPU);
 
 #define REGISTER_SCATTER_UPDATE_GPU(type) REGISTER_SCATTER_UPDATE(type, GPU);
 
-TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ARITHEMTIC_GPU);
+TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ARITHMETIC_GPU);
+TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_MINMAX_GPU);
 
 #endif  // GOOGLE_CUDA
 
-#undef REGISTER_SCATTER_ARITHEMTIC
-#undef REGISTER_SCATTER_ARITHEMTIC_CPU
+#undef REGISTER_SCATTER_ARITHMETIC
+#undef REGISTER_SCATTER_ARITHMETIC_CPU
+#undef REGISTER_SCATTER_MINMAX
+#undef REGISTER_SCATTER_MINMAX_CPU
 #undef REGISTER_SCATTER_KERNEL
 #undef REGISTER_SCATTER_KERNEL_INDEX
 
diff --git a/tensorflow/core/kernels/scatter_functor.cc b/tensorflow/core/kernels/scatter_functor.cc
index 7eba82899fe1d5e84d08cdd129e6c25ca4da15f1..cf5408123fb495c2f540595590a7cce92b39dea5 100644
--- a/tensorflow/core/kernels/scatter_functor.cc
+++ b/tensorflow/core/kernels/scatter_functor.cc
@@ -26,21 +26,30 @@ typedef Eigen::GpuDevice GPUDevice;
 namespace functor {
 
 // Forward declarations of the functor specializations for GPU.
-#define DECLARE_GPU_SPECS_OP(T, Index, op)                   \
-  template <>                                                \
-  Index ScatterFunctor<GPUDevice, T, Index, op>::operator()( \
-      OpKernelContext* c, const GPUDevice& d,                \
-      typename TTypes<T>::Matrix params,                     \
-      typename TTypes<T>::ConstMatrix updates,               \
-      typename TTypes<Index>::ConstFlat indices);            \
-  extern template struct ScatterFunctor<GPUDevice, T, Index, op>;
+#define DECLARE_GPU_SPECS_OP(T, Index, op)                         \
+  template <>                                                      \
+  Index ScatterFunctor<GPUDevice, T, Index, op>::operator()(       \
+      OpKernelContext* c, const GPUDevice& d,                      \
+      typename TTypes<T>::Matrix params,                           \
+      typename TTypes<T>::ConstMatrix updates,                     \
+      typename TTypes<Index>::ConstFlat indices);                  \
+  extern template struct ScatterFunctor<GPUDevice, T, Index, op>;  \
+  template <>                                                      \
+  Index ScatterScalarFunctor<GPUDevice, T, Index, op>::operator()( \
+      OpKernelContext* c, const GPUDevice& d,                      \
+      typename TTypes<T>::Matrix params,                           \
+      const typename TTypes<T>::ConstScalar update,                \
+      typename TTypes<Index>::ConstFlat indices);                  \
+  extern template struct ScatterScalarFunctor<GPUDevice, T, Index, op>;
 
 #define DECLARE_GPU_SPECS_INDEX(T, Index)                       \
   DECLARE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::ASSIGN); \
   DECLARE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::ADD);    \
   DECLARE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::SUB);    \
   DECLARE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::MUL);    \
-  DECLARE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::DIV);
+  DECLARE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::DIV);    \
+  DECLARE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::MIN);    \
+  DECLARE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::MAX);
 
 #define DECLARE_GPU_SPECS(T)         \
   DECLARE_GPU_SPECS_INDEX(T, int32); \
diff --git a/tensorflow/core/kernels/scatter_functor.h b/tensorflow/core/kernels/scatter_functor.h
index 079f15e101308867389745ee42146086af91c47c..52666645bf0bb38df3fd600c602313d7b5925b00 100644
--- a/tensorflow/core/kernels/scatter_functor.h
+++ b/tensorflow/core/kernels/scatter_functor.h
@@ -18,6 +18,8 @@ limitations under the License.
 
 #include <type_traits>
 
+#include "third_party/eigen3/Eigen/Core"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/platform/types.h"
@@ -33,7 +35,7 @@ typedef Eigen::SyclDevice SYCLDevice;
 
 namespace scatter_op {
 
-enum class UpdateOp { ASSIGN, ADD, SUB, MUL, DIV };
+enum class UpdateOp { ASSIGN, ADD, SUB, MUL, DIV, MIN, MAX };
 
 namespace internal {
 
@@ -45,6 +47,10 @@ struct Assign<scatter_op::UpdateOp::ASSIGN> {
   static void Run(Params p, Update u) {
     p = u;
   }
+  template <typename Params, typename Update>
+  static void RunScalar(Params p, Update u) {
+    p.setConstant(u);
+  }
 };
 template <>
 struct Assign<scatter_op::UpdateOp::ADD> {
@@ -52,6 +58,10 @@ struct Assign<scatter_op::UpdateOp::ADD> {
   static void Run(Params p, Update u) {
     p += u;
   }
+  template <typename Params, typename Update>
+  static void RunScalar(Params p, Update u) {
+    p = p + u;
+  }
 };
 template <>
 struct Assign<scatter_op::UpdateOp::SUB> {
@@ -59,6 +69,10 @@ struct Assign<scatter_op::UpdateOp::SUB> {
   static void Run(Params p, Update u) {
     p -= u;
   }
+  template <typename Params, typename Update>
+  static void RunScalar(Params p, Update u) {
+    p = p + static_cast<Update>(-u);
+  }
 };
 template <>
 struct Assign<scatter_op::UpdateOp::MUL> {
@@ -66,6 +80,10 @@ struct Assign<scatter_op::UpdateOp::MUL> {
   static void Run(Params p, Update u) {
     p *= u;
   }
+  template <typename Params, typename Update>
+  static void RunScalar(Params p, Update u) {
+    p = p * u;
+  }
 };
 template <>
 struct Assign<scatter_op::UpdateOp::DIV> {
@@ -73,6 +91,34 @@ struct Assign<scatter_op::UpdateOp::DIV> {
   static void Run(Params p, Update u) {
     p /= u;
   }
+  template <typename Params, typename Update>
+  static void RunScalar(Params p, Update u) {
+    p = p / u;
+  }
+};
+template <>
+struct Assign<scatter_op::UpdateOp::MIN> {
+  // This method requires that Params and Update are tensor types.
+  template <typename Params, typename Update>
+  static void Run(Params p, Update u) {
+    p = p.cwiseMin(u);
+  }
+  // Same thing, but for Update being a scalar type.
+  template <typename Params, typename Update>
+  static void RunScalar(Params p, Update u) {
+    p = p.cwiseMin(u);
+  }
+};
+template <>
+struct Assign<scatter_op::UpdateOp::MAX> {
+  template <typename Params, typename Update>
+  static void Run(Params p, Update u) {
+    p = p.cwiseMax(u);
+  }
+  template <typename Params, typename Update>
+  static void RunScalar(Params p, Update u) {
+    p = p.cwiseMax(u);
+  }
 };
 
 #ifdef TENSORFLOW_USE_SYCL
@@ -117,6 +163,22 @@ struct AssignSYCL<scatter_op::UpdateOp::DIV> {
     p.device(d) = p / u;
   }
 };
+
+template <>
+struct AssignSYCL<scatter_op::UpdateOp::MIN> {
+  template <typename Device, typename Params, typename Update>
+  static void Run(Device d, Params p, Update u) {
+    p.device(d) = p.cwiseMin(u);
+  }
+};
+
+template <>
+struct AssignSYCL<scatter_op::UpdateOp::MAX> {
+  template <typename Device, typename Params, typename Update>
+  static void Run(Device d, Params p, Update u) {
+    p.device(d) = p.cwiseMax(u);
+  }
+};
 #endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace internal
@@ -241,6 +303,112 @@ struct ScatterFunctorSYCL {
 };
 #endif  // TENSORFLOW_USE_SYCL
 
+template <typename Device, typename T, typename Index, scatter_op::UpdateOp op>
+struct ScatterScalarFunctor {
+  Index operator()(OpKernelContext* c, const Device& d,
+                   typename TTypes<T>::Matrix params,
+                   const typename TTypes<T>::ConstScalar update,
+                   typename TTypes<Index>::ConstFlat indices);
+};
+
+template <typename Device, typename T, typename Index, scatter_op::UpdateOp op>
+struct ScatterScalarFunctorBase {
+  Index operator()(OpKernelContext* c, const Device& d,
+                   typename TTypes<T>::Matrix params,
+                   const typename TTypes<T>::ConstScalar update,
+                   typename TTypes<Index>::ConstFlat indices) {
+    // indices and params sizes were validated in DoCompute().
+    const Index N = static_cast<Index>(indices.size());
+    const Index limit = static_cast<Index>(params.dimension(0));
+    for (Index i = 0; i < N; i++) {
+      // Grab the index and check its validity.  An earlier version of the
+      // code checked it and then grabbed it from memory a second time, which
+      // was a security risk since it could have changed in between.
+      const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
+      if (!FastBoundsCheck(index, limit)) return i;
+      // Broadcast update to params[index]
+      scatter_op::internal::Assign<op>::RunScalar(
+          params.template chip<0>(index), update());
+    }
+    return -1;
+  }
+};
+
+#ifdef TENSORFLOW_USE_SYCL
+template <typename T, typename Index, scatter_op::UpdateOp op>
+struct ScatterScalarFunctorBase<SYCLDevice, T, Index, op> {
+  Index operator()(OpKernelContext* c, const SYCLDevice& d,
+                   typename TTypes<T>::Matrix params,
+                   const typename TTypes<T>::ConstScalar update,
+                   typename TTypes<Index>::ConstFlat indices) {
+    // indices and params sizes were validated in DoCompute().
+    const Index N = static_cast<Index>(indices.size());
+    const Index limit = static_cast<Index>(params.dimension(0));
+    for (Index i = 0; i < N; i++) {
+      // Grab the index and check its validity.  An earlier version of the
+      // code checked it and then grabbed it from memory a second time, which
+      // was a security risk since it could have changed in between.
+      const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
+      if (!FastBoundsCheck(index, limit)) return i;
+      // Broadcast update to params[index]
+      scatter_op::internal::AssignSYCL<op>::RunScalar(
+          d, params.template chip<0>(index), update);
+    }
+    return -1;
+  }
+};
+#endif  // TENSORFLOW_USE_SYCL
+
+template <typename T, typename Index>
+struct ScatterScalarFunctorBase<CPUDevice, T, Index,
+                                scatter_op::UpdateOp::ASSIGN> {
+  Index operator()(OpKernelContext* c, const CPUDevice& d,
+                   typename TTypes<T>::Matrix params,
+                   const typename TTypes<T>::ConstScalar update,
+                   typename TTypes<Index>::ConstFlat indices) {
+    // indices and params sizes were validated in DoCompute().
+    const Index N = static_cast<Index>(indices.size());
+    const Index limit = static_cast<Index>(params.dimension(0));
+    for (Index i = 0; i < N; i++) {
+      // Grab the index and check its validity.  An earlier version of the
+      // code checked it and then grabbed it from memory a second time, which
+      // was a security risk since it could have changed in between.
+      const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
+      if (!FastBoundsCheck(index, limit)) return i;
+      // Broadcast update to params[index]
+      scatter_op::internal::Assign<scatter_op::UpdateOp::ASSIGN>::RunScalar(
+          params.template chip<0>(index), update());
+    }
+    return -1;
+  }
+};
+
+template <typename T, typename Index, scatter_op::UpdateOp op>
+struct ScatterScalarFunctor<CPUDevice, T, Index, op>
+    : ScatterScalarFunctorBase<CPUDevice, T, Index, op> {};
+
+#ifdef TENSORFLOW_USE_SYCL
+template <typename T, typename Index, scatter_op::UpdateOp op>
+struct ScatterScalarFunctorSYCL {
+  Index operator()(OpKernelContext* c, const SYCLDevice& d,
+                   typename TTypes<T>::Matrix params,
+                   const typename TTypes<T>::ConstScalar update,
+                   typename TTypes<Index>::Flat indices) {
+    // indices and params sizes were validated in DoCompute().
+    const Index N = static_cast<Index>(indices.size());
+    const Index limit = static_cast<Index>(params.dimension(0));
+    for (Index i = 0; i < N; i++) {
+      const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
+      if (!FastBoundsCheck(index, limit)) return i;
+      // Broadcast update to params[index]
+      scatter_op::internal::AssignSYCL<op>::Run(
+          d, params.template chip<0>(index), update());
+    }
+    return -1;
+  }
+};
+#endif  // TENSORFLOW_USE_SYCL
+
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/scatter_functor_gpu.cu.cc b/tensorflow/core/kernels/scatter_functor_gpu.cu.cc
index 52972997cc7858177a6d1b13c2d237d644a4d82d..59911bf0d26afe57a902b1533b75b76797070c06 100644
--- a/tensorflow/core/kernels/scatter_functor_gpu.cu.cc
+++ b/tensorflow/core/kernels/scatter_functor_gpu.cu.cc
@@ -23,15 +23,18 @@ namespace tensorflow {
 
 typedef Eigen::GpuDevice GPUDevice;
 
-#define DEFINE_GPU_SPECS_OP(T, Index, op) \
-  template struct functor::ScatterFunctor<GPUDevice, T, Index, op>;
+#define DEFINE_GPU_SPECS_OP(T, Index, op)                           \
+  template struct functor::ScatterFunctor<GPUDevice, T, Index, op>; \
+  template struct functor::ScatterScalarFunctor<GPUDevice, T, Index, op>;
 
 #define DEFINE_GPU_SPECS_INDEX(T, Index)                       \
   DEFINE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::ASSIGN); \
   DEFINE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::ADD);    \
   DEFINE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::SUB);    \
   DEFINE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::MUL);    \
-  DEFINE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::DIV);
+  DEFINE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::DIV);    \
+  DEFINE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::MIN);    \
+  DEFINE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::MAX);
 
 #define DEFINE_GPU_SPECS(T)         \
   DEFINE_GPU_SPECS_INDEX(T, int32); \
diff --git a/tensorflow/core/kernels/scatter_functor_gpu.cu.h b/tensorflow/core/kernels/scatter_functor_gpu.cu.h
index be18658543ea330e3196d0f372154df32e4e1dfc..70809e4dcf93d80d562196d3515a305cf35fa8ba 100644
--- a/tensorflow/core/kernels/scatter_functor_gpu.cu.h
+++ b/tensorflow/core/kernels/scatter_functor_gpu.cu.h
@@ -29,12 +29,53 @@ namespace tensorflow {
 
 typedef Eigen::GpuDevice GPUDevice;
 
+namespace scatter_op_gpu {
+
+template <typename T, scatter_op::UpdateOp op>
+struct ScatterOpKernelBody;
+
+template <typename T>
+struct ScatterOpKernelBody<T, scatter_op::UpdateOp::ASSIGN> {
+  __device__ void operator()(T* dest, T src) const { *dest = src; }
+};
+
+template <typename T>
+struct ScatterOpKernelBody<T, scatter_op::UpdateOp::ADD> {
+  __device__ void operator()(T* dest, T src) const { CudaAtomicAdd(dest, src); }
+};
+
+template <typename T>
+struct ScatterOpKernelBody<T, scatter_op::UpdateOp::SUB> {
+  __device__ void operator()(T* dest, T src) const { CudaAtomicSub(dest, src); }
+};
+
+template <typename T>
+struct ScatterOpKernelBody<T, scatter_op::UpdateOp::MUL> {
+  __device__ void operator()(T* dest, T src) const { CudaAtomicMul(dest, src); }
+};
+
+template <typename T>
+struct ScatterOpKernelBody<T, scatter_op::UpdateOp::DIV> {
+  __device__ void operator()(T* dest, T src) const { CudaAtomicDiv(dest, src); }
+};
+
+template <typename T>
+struct ScatterOpKernelBody<T, scatter_op::UpdateOp::MIN> {
+  __device__ void operator()(T* dest, T src) const { CudaAtomicMin(dest, src); }
+};
+
+template <typename T>
+struct ScatterOpKernelBody<T, scatter_op::UpdateOp::MAX> {
+  __device__ void operator()(T* dest, T src) const { CudaAtomicMax(dest, src); }
+};
+
 template <typename T, typename Index, scatter_op::UpdateOp op>
 __global__ void ScatterOpCustomKernel(T* params, const T* updates,
                                       const Index* indices,
                                       Index first_dim_size, Index updates_size,
                                       Index indices_size) {
   Index update_block = updates_size / indices_size;
+  ScatterOpKernelBody<T, op> body;
   CUDA_1D_KERNEL_LOOP(i, updates_size) {
     int indices_i = i / update_block;
     int updates_i = i;
@@ -44,31 +85,33 @@ __global__ void ScatterOpCustomKernel(T* params, const T* updates,
       continue;
     }
     int params_i = param_first_index * update_block + (i % update_block);
-    switch (op) {
-      case scatter_op::UpdateOp::ASSIGN: {
-        params[params_i] = ldg(updates + updates_i);
-        break;
-      }
-      case scatter_op::UpdateOp::ADD: {
-        CudaAtomicAdd(params + params_i, ldg(updates + updates_i));
-        break;
-      }
-      case scatter_op::UpdateOp::SUB: {
-        CudaAtomicSub(params + params_i, ldg(updates + updates_i));
-        break;
-      }
-      case scatter_op::UpdateOp::MUL: {
-        CudaAtomicMul(params + params_i, ldg(updates + updates_i));
-        break;
-      }
-      case scatter_op::UpdateOp::DIV: {
-        CudaAtomicDiv(params + params_i, ldg(updates + updates_i));
-        break;
-      }
+    body(&params[params_i], ldg(updates + updates_i));
+  }
+}
+
+template <typename T, typename Index, scatter_op::UpdateOp op>
+__global__ void ScatterScalarOpCustomKernel(T* params, const T* update,
+                                            const Index* indices,
+                                            Index first_dim_size,
+                                            Index indices_size,
+                                            Index synthesized_updates_size) {
+  Index update_block = synthesized_updates_size / indices_size;
+  ScatterOpKernelBody<T, op> body;
+  CUDA_1D_KERNEL_LOOP(i, synthesized_updates_size) {
+    int indices_i = i / update_block;
+    int param_first_index = indices[indices_i];
+    const T update_val = *update;
+    if (!(param_first_index >= 0 && param_first_index < first_dim_size)) {
+      // Ignore indices that are out of range.
+      continue;
     }
+    int params_i = param_first_index * update_block + (i % update_block);
+    body(&params[params_i], update_val);
   }
 }
 
+}  // namespace scatter_op_gpu
+
 namespace functor {
 // Specialization for a GPU device.
 template <typename T, typename Index, scatter_op::UpdateOp op>
@@ -84,7 +127,7 @@ struct ScatterFunctor<GPUDevice, T, Index, op> {
     const Index indices_size = indices.size();
     const Index updates_size = updates.size();
     CudaLaunchConfig config = GetCudaLaunchConfig(updates_size, d);
-    ScatterOpCustomKernel<T, Index, op>
+    scatter_op_gpu::ScatterOpCustomKernel<T, Index, op>
         <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
             params.data(), updates.data(), indices.data(), first_dim_size,
             updates_size, indices_size);
@@ -92,6 +135,27 @@ struct ScatterFunctor<GPUDevice, T, Index, op> {
   }
 };
 
+template <typename T, typename Index, scatter_op::UpdateOp op>
+struct ScatterScalarFunctor<GPUDevice, T, Index, op> {
+  Index operator()(OpKernelContext* c, const GPUDevice& d,
+                   typename TTypes<T>::Matrix params,
+                   const typename TTypes<T>::ConstScalar update,
+                   typename TTypes<Index>::ConstFlat indices) {
+    // TODO(b/31801742): Implement indices range check. The hardest part is
+    // with returning a value after the range check, as we do not want to do
+    // device to host memcpy during a stream.
+    const Index first_dim_size = params.dimension(0);
+    const Index indices_size = indices.size();
+    const Index synthesized_updates_size = indices_size * params.dimension(1);
+    CudaLaunchConfig config = GetCudaLaunchConfig(synthesized_updates_size, d);
+    scatter_op_gpu::ScatterScalarOpCustomKernel<T, Index, op>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            params.data(), update.data(), indices.data(), first_dim_size,
+            indices_size, synthesized_updates_size);
+    return -1;
+  }
+};
+
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/scatter_op.cc b/tensorflow/core/kernels/scatter_op.cc
index 282165349f316144d261859d5a3a992f047e0df3..0fbde764d57eb661314b699ef9902238ad38b2cf 100644
--- a/tensorflow/core/kernels/scatter_op.cc
+++ b/tensorflow/core/kernels/scatter_op.cc
@@ -38,6 +38,7 @@ typedef Eigen::SyclDevice SYCLDevice;
 // Check whether updates.shape = indices.shape + params.shape[1:]
 static bool ValidShapes(const Tensor& params, const Tensor& updates,
                         const Tensor& indices) {
+  if (updates.dims() == 0) return true;
   if (updates.dims() != indices.dims() + params.dims() - 1) return false;
   for (int d = 0; d < indices.dims(); d++) {
     if (updates.dim_size(d) != indices.dim_size(d)) {
@@ -61,11 +62,11 @@ static void DoValidationChecking(OpKernelContext* c, const Tensor& params,
                                       params.shape().DebugString()));
   OP_REQUIRES(
       c, ValidShapes(params, updates, indices),
-      errors::InvalidArgument(
-          "Must have updates.shape = indices.shape + params.shape[1:], got ",
-          "updates.shape ", updates.shape().DebugString(), ", indices.shape ",
-          indices.shape().DebugString(), ", params.shape ",
-          params.shape().DebugString()));
+      errors::InvalidArgument("Must have updates.shape = indices.shape + "
+                              "params.shape[1:] or updates.shape = [], got ",
+                              "updates.shape ", updates.shape().DebugString(),
+                              ", indices.shape ", indices.shape().DebugString(),
+                              ", params.shape ", params.shape().DebugString()));
 }
 
 template <typename Device, typename T, typename Index, scatter_op::UpdateOp op>
@@ -122,16 +123,31 @@ class ScatterUpdateOp : public OpKernel {
     if (N > 0) {
       auto indices_flat = indices.flat<Index>();
       auto params_flat = params.flat_outer_dims<T>();
-      auto updates_flat = updates.shaped<T, 2>({N, updates.NumElements() / N});
-
-      functor::ScatterFunctor<Device, T, Index, op> functor;
-      const Index bad_i = functor(c, c->template eigen_device<Device>(),
-                                  params_flat, updates_flat, indices_flat);
-      OP_REQUIRES(
-          c, bad_i < 0,
-          errors::InvalidArgument(
-              "indices", SliceDebugString(indices.shape(), bad_i), " = ",
-              indices_flat(bad_i), " is not in [0, ", params.dim_size(0), ")"));
+
+      if (TensorShapeUtils::IsScalar(updates.shape()) ||
+          IsLegacyScalar(updates.shape())) {
+        const auto update = updates.scalar<T>();
+        functor::ScatterScalarFunctor<Device, T, Index, op> functor;
+        const Index bad_i = functor(c, c->template eigen_device<Device>(),
+                                    params_flat, update, indices_flat);
+        OP_REQUIRES(c, bad_i < 0,
+                    errors::InvalidArgument(
+                        "indices", SliceDebugString(indices.shape(), bad_i),
+                        " = ", indices_flat(bad_i), " is not in [0, ",
+                        params.dim_size(0), ")"));
+      } else {
+        auto updates_flat =
+            updates.shaped<T, 2>({N, updates.NumElements() / N});
+
+        functor::ScatterFunctor<Device, T, Index, op> functor;
+        const Index bad_i = functor(c, c->template eigen_device<Device>(),
+                                    params_flat, updates_flat, indices_flat);
+        OP_REQUIRES(c, bad_i < 0,
+                    errors::InvalidArgument(
+                        "indices", SliceDebugString(indices.shape(), bad_i),
+                        " = ", indices_flat(bad_i), " is not in [0, ",
+                        params.dim_size(0), ")"));
+      }
     }
   }
 };
@@ -195,16 +211,31 @@ class ScatterUpdateOp<SYCLDevice, T, Index, op> : public OpKernel {
 
       auto indices_flat = indices_host.flat<Index>();
       auto params_flat = params.flat_outer_dims<T>();
-      auto updates_flat = updates.shaped<T, 2>({N, updates.NumElements() / N});
-
-      functor::ScatterFunctorSYCL<T, Index, op> functor;
-      const Index bad_i = functor(c, c->template eigen_device<SYCLDevice>(),
-                                  params_flat, updates_flat, indices_flat);
-      OP_REQUIRES(
-          c, bad_i < 0,
-          errors::InvalidArgument(
-              "indices", SliceDebugString(indices.shape(), bad_i), " = ",
-              indices_flat(bad_i), " is not in [0, ", params.dim_size(0), ")"));
+
+      if (TensorShapeUtils::IsScalar(updates.shape())) {
+        const auto update = updates.scalar<T>();
+
+        functor::ScatterScalarFunctorSYCL<T, Index, op> functor;
+        const Index bad_i = functor(c, c->template eigen_device<SYCLDevice>(),
+                                    params_flat, update, indices_flat);
+        OP_REQUIRES(c, bad_i < 0,
+                    errors::InvalidArgument(
+                        "indices", SliceDebugString(indices.shape(), bad_i),
+                        " = ", indices_flat(bad_i), " is not in [0, ",
+                        params.dim_size(0), ")"));
+      } else {
+        auto updates_flat =
+            updates.shaped<T, 2>({N, updates.NumElements() / N});
+
+        functor::ScatterFunctorSYCL<T, Index, op> functor;
+        const Index bad_i = functor(c, c->template eigen_device<SYCLDevice>(),
+                                    params_flat, updates_flat, indices_flat);
+        OP_REQUIRES(c, bad_i < 0,
+                    errors::InvalidArgument(
+                        "indices", SliceDebugString(indices.shape(), bad_i),
+                        " = ", indices_flat(bad_i), " is not in [0, ",
+                        params.dim_size(0), ")"));
+      }
     }
   }
 };
@@ -221,54 +252,71 @@ class ScatterUpdateOp<SYCLDevice, T, Index, op> : public OpKernel {
   REGISTER_SCATTER_KERNEL_INDEX(type, int32, dev, name, op); \
   REGISTER_SCATTER_KERNEL_INDEX(type, int64, dev, name, op);
 
-#define REGISTER_SCATTER_ARITHEMTIC(type, dev)                                 \
+#define REGISTER_SCATTER_ARITHMETIC(type, dev)                                 \
   REGISTER_SCATTER_KERNEL(type, dev, "ScatterAdd", scatter_op::UpdateOp::ADD); \
   REGISTER_SCATTER_KERNEL(type, dev, "ScatterDiv", scatter_op::UpdateOp::DIV); \
   REGISTER_SCATTER_KERNEL(type, dev, "ScatterMul", scatter_op::UpdateOp::MUL); \
   REGISTER_SCATTER_KERNEL(type, dev, "ScatterSub", scatter_op::UpdateOp::SUB);
 
+#define REGISTER_SCATTER_MINMAX(type, dev)                                     \
+  REGISTER_SCATTER_KERNEL(type, dev, "ScatterMin", scatter_op::UpdateOp::MIN); \
+  REGISTER_SCATTER_KERNEL(type, dev, "ScatterMax", scatter_op::UpdateOp::MAX);
+
 #define REGISTER_SCATTER_UPDATE(type, dev)            \
   REGISTER_SCATTER_KERNEL(type, dev, "ScatterUpdate", \
                           scatter_op::UpdateOp::ASSIGN);
 
 // Registers CPU kernels.
-#define REGISTER_SCATTER_ARITHEMTIC_CPU(type) \
-  REGISTER_SCATTER_ARITHEMTIC(type, CPU);
+#define REGISTER_SCATTER_ARITHMETIC_CPU(type) \
+  REGISTER_SCATTER_ARITHMETIC(type, CPU);
+
+#define REGISTER_SCATTER_MINMAX_CPU(type) REGISTER_SCATTER_MINMAX(type, CPU);
 
 #define REGISTER_SCATTER_UPDATE_CPU(type) REGISTER_SCATTER_UPDATE(type, CPU);
 
-TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ARITHEMTIC_CPU);
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_SCATTER_MINMAX_CPU);
+TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ARITHMETIC_CPU);
 TF_CALL_ALL_TYPES(REGISTER_SCATTER_UPDATE_CPU);
 
 // Registers GPU kernels.
 #if GOOGLE_CUDA
-#define REGISTER_SCATTER_ARITHEMTIC_GPU(type) \
-  REGISTER_SCATTER_ARITHEMTIC(type, GPU);
+#define REGISTER_SCATTER_ARITHMETIC_GPU(type) \
+  REGISTER_SCATTER_ARITHMETIC(type, GPU);
+
+#define REGISTER_SCATTER_MINMAX_GPU(type) REGISTER_SCATTER_MINMAX(type, GPU);
 
 #define REGISTER_SCATTER_UPDATE_GPU(type) REGISTER_SCATTER_UPDATE(type, GPU);
 
-TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ARITHEMTIC_GPU);
+TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ARITHMETIC_GPU);
+TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_MINMAX_GPU);
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_UPDATE_GPU);
 
 #endif  // GOOGLE_CUDA
 
 // Registers GPU kernels.
 #if TENSORFLOW_USE_SYCL
-#define REGISTER_SCATTER_ARITHEMTIC_SYCL(type) \
-  REGISTER_SCATTER_ARITHEMTIC(type, SYCL);
+#define REGISTER_SCATTER_ARITHMETIC_SYCL(type) \
+  REGISTER_SCATTER_ARITHMETIC(type, SYCL);
+
+#define REGISTER_SCATTER_MINMAX_SYCL(type) REGISTER_SCATTER_MINMAX(type, SYCL);
 
 #define REGISTER_SCATTER_UPDATE_SYCL(type) REGISTER_SCATTER_UPDATE(type, SYCL);
 
-TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ARITHEMTIC_SYCL);
+TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ARITHMETIC_SYCL);
+TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_MINMAX_SYCL);
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_UPDATE_SYCL);
 
-#undef REGISTER_SCATTER_ARITHEMTIC_SYCL
+#undef REGISTER_SCATTER_ARITHMETIC_SYCL
+#undef REGISTER_SCATTER_MINMAX_SYCL
 #undef REGISTER_SCATTER_UPDATE_SYCL
 #endif  // TENSORFLOW_USE_SYCL
 
-#undef REGISTER_SCATTER_ARITHEMTIC
-#undef REGISTER_SCATTER_ARITHEMTIC_CPU
-#undef REGISTER_SCATTER_ARITHEMTIC_GPU
+#undef REGISTER_SCATTER_ARITHMETIC
+#undef REGISTER_SCATTER_ARITHMETIC_CPU
+#undef REGISTER_SCATTER_ARITHMETIC_GPU
+#undef REGISTER_SCATTER_MINMAX
+#undef REGISTER_SCATTER_MINMAX_CPU
+#undef REGISTER_SCATTER_MINMAX_GPU
 #undef REGISTER_SCATTER_UPDATE
 #undef REGISTER_SCATTER_UPDATE_CPU
 #undef REGISTER_SCATTER_UPDATE_GPU
diff --git a/tensorflow/core/kernels/scatter_op_gpu.cu.cc b/tensorflow/core/kernels/scatter_op_gpu.cu.cc
index 0b43704846d6958cbfc71e1b0b498bbada8e01e1..0df329310f0dc51bbe91b784a40fd7bf68b012f0 100644
--- a/tensorflow/core/kernels/scatter_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/scatter_op_gpu.cu.cc
@@ -24,15 +24,18 @@ namespace tensorflow {
 typedef Eigen::GpuDevice GPUDevice;
 
 // Instantiates functor specializations for GPU.
-#define DEFINE_GPU_SPECS_OP(T, Index, op) \
-  template struct functor::ScatterFunctor<GPUDevice, T, Index, op>;
+#define DEFINE_GPU_SPECS_OP(T, Index, op)                           \
+  template struct functor::ScatterFunctor<GPUDevice, T, Index, op>; \
+  template struct functor::ScatterScalarFunctor<GPUDevice, T, Index, op>;
 
 #define DEFINE_GPU_SPECS_INDEX(T, Index)                       \
   DEFINE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::ASSIGN); \
   DEFINE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::ADD);    \
   DEFINE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::SUB);    \
   DEFINE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::MUL);    \
-  DEFINE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::DIV);
+  DEFINE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::DIV);    \
+  DEFINE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::MIN);    \
+  DEFINE_GPU_SPECS_OP(T, Index, scatter_op::UpdateOp::MAX);
 
 #define DEFINE_GPU_SPECS(T)         \
   DEFINE_GPU_SPECS_INDEX(T, int32); \
diff --git a/tensorflow/core/kernels/scatter_op_test.cc b/tensorflow/core/kernels/scatter_op_test.cc
index 0b8645a2ae98edc263eccd51f2bd7377c78930da..5b3537b94c8304d7427e769eacb15784cec8c295 100644
--- a/tensorflow/core/kernels/scatter_op_test.cc
+++ b/tensorflow/core/kernels/scatter_op_test.cc
@@ -185,7 +185,7 @@ TEST_F(ScatterUpdateOpTest, Error_WrongDimsIndices) {
   Status s = RunOpKernel();
   EXPECT_TRUE(StringPiece(s.ToString())
                   .contains("Must have updates.shape = indices.shape + "
-                            "params.shape[1:], got "))
+                            "params.shape[1:] or updates.shape = [], got "))
       << s;
 }
 
@@ -202,7 +202,7 @@ TEST_F(ScatterUpdateOpTest, Error_MismatchedParamsAndUpdateDimensions) {
   Status s = RunOpKernel();
   EXPECT_TRUE(StringPiece(s.ToString())
                   .contains("Must have updates.shape = indices.shape + "
-                            "params.shape[1:], got "))
+                            "params.shape[1:] or updates.shape = [], got "))
 
       << s;
 }
@@ -219,7 +219,7 @@ TEST_F(ScatterUpdateOpTest, Error_MismatchedIndicesAndUpdateDimensions) {
   Status s = RunOpKernel();
   EXPECT_TRUE(StringPiece(s.ToString())
                   .contains("Must have updates.shape = indices.shape + "
-                            "params.shape[1:], got "))
+                            "params.shape[1:] or updates.shape = [], got "))
       << s;
 }
 
@@ -300,6 +300,20 @@ static void BM_ScatterDivInt64(int iters, int embedding_size) {
   BM_ScatterHelper<int64>(iters, embedding_size, "ScatterDiv");
 }
 
+static void BM_ScatterMinInt32(int iters, int embedding_size) {
+  BM_ScatterHelper<int32>(iters, embedding_size, "ScatterMin");
+}
+static void BM_ScatterMinInt64(int iters, int embedding_size) {
+  BM_ScatterHelper<int64>(iters, embedding_size, "ScatterMin");
+}
+
+static void BM_ScatterMaxInt32(int iters, int embedding_size) {
+  BM_ScatterHelper<int32>(iters, embedding_size, "ScatterMax");
+}
+static void BM_ScatterMaxInt64(int iters, int embedding_size) {
+  BM_ScatterHelper<int64>(iters, embedding_size, "ScatterMax");
+}
+
 BENCHMARK(BM_ScatterUpdateInt32)
     ->Arg(1)
     ->Arg(10)
@@ -332,5 +346,11 @@ BENCHMARK(BM_ScatterMulInt64)->Arg(1)->Arg(10)->Arg(64)->Arg(256)->Arg(1024);
 BENCHMARK(BM_ScatterDivInt32)->Arg(1)->Arg(10)->Arg(64)->Arg(256)->Arg(1024);
 BENCHMARK(BM_ScatterDivInt64)->Arg(1)->Arg(10)->Arg(64)->Arg(256)->Arg(1024);
 
+BENCHMARK(BM_ScatterMinInt32)->Arg(1)->Arg(10)->Arg(64)->Arg(256)->Arg(1024);
+BENCHMARK(BM_ScatterMinInt64)->Arg(1)->Arg(10)->Arg(64)->Arg(256)->Arg(1024);
+
+BENCHMARK(BM_ScatterMaxInt32)->Arg(1)->Arg(10)->Arg(64)->Arg(256)->Arg(1024);
+BENCHMARK(BM_ScatterMaxInt64)->Arg(1)->Arg(10)->Arg(64)->Arg(256)->Arg(1024);
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/scoped_allocator_ops.cc b/tensorflow/core/kernels/scoped_allocator_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d7b25ffad0408383a63d5127f85ce41f40890e87
--- /dev/null
+++ b/tensorflow/core/kernels/scoped_allocator_ops.cc
@@ -0,0 +1,216 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/common_runtime/scoped_allocator.h"
+#include "tensorflow/core/common_runtime/scoped_allocator_mgr.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+class ScopedAllocatorOp : public OpKernel {
+ public:
+  explicit ScopedAllocatorOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("T", &dtype_));
+    OP_REQUIRES_OK(context, context->GetAttr("shapes", &shapes_));
+    OP_REQUIRES_OK(context, context->GetAttr("sa_name", &name_));
+    OP_REQUIRES_OK(context, context->GetAttr("id", &id_));
+    OP_REQUIRES_OK(context, context->GetAttr("expected_call_count",
+                                             &expected_call_count_));
+    device_ = context->device();
+    // Precalculate the size of the backing tensor and the offsets of
+    // the subtensors to be allocated from it, taking into account
+    // alignment considerations.
+    ScopedAllocatorMgr::PopulateFields(id_, shapes_, dtype_, &fields_);
+    size_t num_bytes = fields_.back().offset + fields_.back().bytes;
+    num_elements_ = num_bytes / DataTypeSize(dtype_);
+    OP_REQUIRES(context, num_bytes % DataTypeSize(dtype_) == 0,
+                errors::InvalidArgument(
+                    "Number of bytes ", num_bytes,
+                    " must be divisible by size of datatype ", dtype_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    ScopedAllocatorMgr* sam = device_->GetScopedAllocatorMgr();
+    if (!sam) {
+      context->SetStatus(errors::Internal(
+          "ScopedAllocatorMgr not supported on device ", device_->name()));
+      return;
+    }
+    Tensor* backing_tensor = nullptr;
+    AllocatorAttributes attr = context->output_alloc_attr(0);
+    Status s =
+        context->allocate_output(0, {num_elements_}, &backing_tensor, attr);
+    VLOG(1) << "_ScopedAllocatorOp new backing tensor size "
+            << backing_tensor->TotalBytes() << " num_elements_ "
+            << num_elements_ << " buffer " << DMAHelper::buffer(backing_tensor)
+            << " base addr " << DMAHelper::base(backing_tensor);
+    if (s.ok()) {
+      s = sam->AddScopedAllocator(*backing_tensor, context->step_id(), id_,
+                                  name_, fields_, expected_call_count_);
+    }
+    if (!s.ok()) {
+      context->SetStatus(s);
+    }
+  }
+
+ private:
+  std::vector<TensorShape> shapes_;
+  DataType dtype_;
+  int64 num_elements_;
+  std::vector<ScopedAllocator::Field> fields_;
+  string name_;
+  int32 id_;
+  int32 expected_call_count_;
+  DeviceBase* device_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("_ScopedAllocator").Device(DEVICE_CPU),
+                        ScopedAllocatorOp);
+
+REGISTER_KERNEL_BUILDER(Name("_ScopedAllocator").Device(DEVICE_GPU),
+                        ScopedAllocatorOp);
+
+class ScopedAllocatorConcatOp : public OpKernel {
+ public:
+  explicit ScopedAllocatorConcatOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("shape", &shape_));
+    OP_REQUIRES_OK(context, context->GetAttr("T", &dtype_));
+    // This stuff is just for debugging
+    OP_REQUIRES_OK(context, context->GetAttr("sa_name", &name_));
+    OP_REQUIRES_OK(context, context->GetAttr("id", &id_));
+    device_ = context->device();
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& backing_tensor = context->input(0);
+    // Check that type matches.
+    OP_REQUIRES(
+        context, backing_tensor.dtype() == dtype_,
+        errors::InvalidArgument("Backing tensor type ", backing_tensor.dtype(),
+                                " does not match expected type ", dtype_));
+    // Check that backing tensor is at least as large as the shape of the
+    // output.
+    OP_REQUIRES(context, backing_tensor.NumElements() >= shape_.num_elements(),
+                errors::InvalidArgument("Backing tensor num elements ",
+                                        backing_tensor.NumElements(),
+                                        " is not equal to expected ",
+                                        shape_.num_elements()));
+    VLOG(1) << "_ScopedAllocatorConcatOp outputting backing tensor at "
+            << DMAHelper::base(&backing_tensor);
+    Tensor backing_copy(backing_tensor);
+    context->set_output(0, backing_copy);
+    const TensorBuffer* backing_buf = DMAHelper::buffer(&backing_copy);
+    const void* backing_tensor_lb = backing_buf->data();
+    const void* backing_tensor_ub = static_cast<const void*>(
+        static_cast<const char*>(backing_tensor_lb) + backing_buf->size());
+    // Check that all inputs lie entirely within the backing tensor.
+    for (int i = 1; i < context->num_inputs(); ++i) {
+      const TensorBuffer* input_buf = DMAHelper::buffer(&context->input(i));
+      const void* input_lb = input_buf->data();
+      OP_REQUIRES(
+          context, input_lb >= backing_tensor_lb,
+          errors::InvalidArgument("Lower bound check fail for input ", i,
+                                  " to node ", context->op_kernel().name()));
+      const void* input_ub = static_cast<const void*>(
+          static_cast<const char*>(input_lb) + input_buf->size());
+      OP_REQUIRES(
+          context, input_ub <= backing_tensor_ub,
+          errors::InvalidArgument("Upper bound check fail for input ", i,
+                                  " to node ", context->op_kernel().name()));
+    }
+  }
+
+ private:
+  TensorShape shape_;
+  DataType dtype_;
+  string name_;
+  int32 id_;
+  DeviceBase* device_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("_ScopedAllocatorConcat").Device(DEVICE_CPU),
+                        ScopedAllocatorConcatOp);
+
+REGISTER_KERNEL_BUILDER(Name("_ScopedAllocatorConcat").Device(DEVICE_GPU),
+                        ScopedAllocatorConcatOp);
+
+class ScopedAllocatorSplitOp : public OpKernel {
+ public:
+  explicit ScopedAllocatorSplitOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("T", &dtype_));
+    // This stuff is just for debugging
+    OP_REQUIRES_OK(context, context->GetAttr("sa_name", &name_));
+    OP_REQUIRES_OK(context, context->GetAttr("id", &id_));
+    device_ = context->device();
+  }
+
+  void Compute(OpKernelContext* context) override {
+    Tensor backing_copy(context->input(0));
+    // Check that type matches.
+    OP_REQUIRES(
+        context, backing_copy.dtype() == dtype_,
+        errors::InvalidArgument("Backing tensor type ", backing_copy.dtype(),
+                                " does not match expected type ", dtype_));
+    const TensorBuffer* backing_buf = DMAHelper::buffer(&backing_copy);
+    const void* backing_tensor_lb = backing_buf->data();
+    const void* backing_tensor_ub = static_cast<const void*>(
+        static_cast<const char*>(backing_tensor_lb) + backing_buf->size());
+    for (int i = 1; i < context->num_inputs(); ++i) {
+      VLOG(1) << "_ScopedAllocatorSplitOp assigning input " << i
+              << " to output " << i - 1 << " buf addr "
+              << DMAHelper::base(&context->input(i));
+      Tensor copy(context->input(i));
+      OP_REQUIRES(
+          context, copy.dtype() == dtype_,
+          errors::InvalidArgument("Input ", i, " tensor type ", copy.dtype(),
+                                  " does not match expected type ", dtype_));
+      context->set_output(i - 1, copy);
+      const TensorBuffer* input_buf = DMAHelper::buffer(&copy);
+      const void* input_lb = input_buf->data();
+      OP_REQUIRES(
+          context, input_lb >= backing_tensor_lb,
+          errors::InvalidArgument("Lower bound check fail for input ", i,
+                                  " to node ", context->op_kernel().name()));
+      const void* input_ub = static_cast<const void*>(
+          static_cast<const char*>(input_lb) + input_buf->size());
+      OP_REQUIRES(
+          context, input_ub <= backing_tensor_ub,
+          errors::InvalidArgument("Upper bound check fail for input ", i,
+                                  " to node ", context->op_kernel().name()));
+    }
+  }
+
+ private:
+  DataType dtype_;
+  string name_;
+  int32 id_;
+  DeviceBase* device_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("_ScopedAllocatorSplit").Device(DEVICE_CPU),
+                        ScopedAllocatorSplitOp);
+
+REGISTER_KERNEL_BUILDER(Name("_ScopedAllocatorSplit").Device(DEVICE_GPU),
+                        ScopedAllocatorSplitOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/scoped_allocator_ops_test.cc b/tensorflow/core/kernels/scoped_allocator_ops_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3d36c8b7d43748abb91a5ecd2edd22dada7ae9c6
--- /dev/null
+++ b/tensorflow/core/kernels/scoped_allocator_ops_test.cc
@@ -0,0 +1,296 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <vector>
+
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/common_runtime/scoped_allocator.h"
+#include "tensorflow/core/common_runtime/scoped_allocator_mgr.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/graph/testlib.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+class ScopedAllocatorOpTest : public OpsTestBase {
+ protected:
+  void MakeOp(const gtl::ArraySlice<TensorShape>& shapes, DataType dtype,
+              const string& name, int32 id, int32 expected_call_count) {
+    TF_EXPECT_OK(NodeDefBuilder("scoped_allocator_op", "_ScopedAllocator")
+                     .Attr("T", dtype)
+                     .Attr("shapes", shapes)
+                     .Attr("sa_name", name)
+                     .Attr("id", id)
+                     .Attr("expected_call_count", expected_call_count)
+                     .Finalize(node_def()));
+    TF_EXPECT_OK(InitOp());
+    TF_ASSERT_OK(RunOpKernel());
+
+    // Allocate and Deallocate the tensors so that memory is not leaked
+    AllocatorAttributes attr;
+    Allocator* allocator;
+    for (size_t i = 0; i < shapes.size(); i++) {
+      attr.scope_id = id + i + 1;
+      allocator = device_->GetScopedAllocator(attr, context_->step_id());
+      Tensor temp(allocator, dtype, shapes[i]);
+    }
+  }
+};
+
+TEST_F(ScopedAllocatorOpTest, Simple) {
+  MakeOp({TensorShape({8})}, DT_FLOAT, "test", 120, 1);
+  MakeOp({TensorShape({32, 32})}, DT_DOUBLE, "test1", 130, 1);
+  MakeOp({TensorShape({64}), TensorShape({3, 3}), TensorShape({5, 5, 5})},
+         DT_HALF, "test2", 140, 3);
+  MakeOp({TensorShape({512}), TensorShape({64, 8})}, DT_UINT32, "test3", 150,
+         2);
+}
+
+// PrepOp is common to ConcatOp tests and SplitOpTests.
+// It allocates a backing tensor that is large enough to hold all slices defined
+// by fields, creates ScopedAllocatorInstances for each field, allocates the
+// tensors, and assigns them as inputs to the op.
+// We won't use the AddInput* suite of functions from ops_testutil.h because
+// they allocate new tensors for each input.  We need to mimic what a
+// ScopedAllocator would do.
+void PrepOp(DataType dtype, int32 id,
+            const std::vector<TensorShape>& fields_shapes,
+            std::vector<ScopedAllocator::Field>* fields,
+            Tensor** backing_tensor, Allocator* allocator,
+            ScopedAllocatorMgr* sam, const string& op_name,
+            std::vector<Tensor>* tensors,
+            gtl::InlinedVector<TensorValue, 4>* inputs,
+            const DataTypeVector& input_types) {
+  ScopedAllocatorMgr::PopulateFields(id, fields_shapes, dtype, fields);
+  // We don't simply allocate a tensor with shape as backing_tensor_shape,
+  // because we need to account for padding in the fields.  We actually need a
+  // tensor of size at least (fields[-1].offset + fields[-1].bytes).
+  size_t num_bytes = fields->back().offset + fields->back().bytes;
+  int32_t num_elements = num_bytes / DataTypeSize(dtype);
+  CHECK_EQ(num_bytes % DataTypeSize(dtype), 0);
+
+  *backing_tensor = new Tensor(allocator, dtype, {num_elements});
+  int64 step_id = 10;
+  Status s = sam->AddScopedAllocator(**backing_tensor, step_id, id,
+                                     "sa_" + op_name + "_test", *fields,
+                                     fields_shapes.size());
+  TF_ASSERT_OK(s);
+
+  ScopedAllocatorContainer* sac = sam->GetContainer(step_id);
+  std::vector<ScopedAllocatorInstance*> sa_instances(fields_shapes.size(),
+                                                     nullptr);
+  for (size_t i = 0; i < fields_shapes.size(); i++) {
+    sa_instances[i] = sac->GetInstance(id + i + 1);
+    tensors->push_back(Tensor(sa_instances[i], dtype, fields_shapes[i]));
+  }
+  // Now add the tensor as an input to ScopedAllocator<op_name>Op.
+  // Order matters here, so first add the backing tensor, then the slices.
+  inputs->reserve(1 + tensors->size());
+  CHECK_GT(input_types.size(), inputs->size());
+  CHECK_EQ(input_types[inputs->size()], dtype);
+  inputs->push_back({nullptr, *backing_tensor});
+  for (size_t i = 0; i < tensors->size(); i++) {
+    CHECK_EQ(input_types[inputs->size()], dtype);
+    inputs->push_back({nullptr, &((*tensors)[i])});
+  }
+}
+
+class ScopedAllocatorConcatOpTest : public OpsTestBase {
+ protected:
+  void MakeOp(const TensorShape& shape, DataType dtype, const string& name,
+              int32 id, int32 num_tensors) {
+    TF_EXPECT_OK(
+        NodeDefBuilder("scoped_allocator_concat_op", "_ScopedAllocatorConcat")
+            .Attr("shape", shape)
+            .Attr("T", dtype)
+            .Attr("N", num_tensors)
+            .Attr("sa_name", name)
+            .Attr("id", id)
+            .Input(FakeInput(dtype))               // backing tensor
+            .Input(FakeInput(num_tensors, dtype))  // list of tensors
+            .Finalize(node_def()));
+    TF_EXPECT_OK(InitOp());
+  }
+
+  void ExecOp(DataType dtype, int32 id,
+              const std::vector<TensorShape>& fields_shapes) {
+    Tensor* backing_tensor = nullptr;
+    std::vector<Tensor> tensors;
+    std::vector<ScopedAllocator::Field> fields;
+    PrepOp(dtype, id, fields_shapes, &fields, &backing_tensor, allocator(),
+           device_->GetScopedAllocatorMgr(), "split", &tensors, &inputs_,
+           input_types_);
+
+    TF_ASSERT_OK(RunOpKernel());
+
+    // Check input and output are same tensor.
+    const Tensor& input = context_->input(0);
+    OpOutputList output_list;
+    Status s = context_->output_list("output", &output_list);
+    TF_ASSERT_OK(s);
+    const Tensor& output = *(output_list[0]);
+    CHECK_EQ(DMAHelper::base(&input), DMAHelper::base(&output));
+    CHECK_EQ(input.dtype(), output.dtype());
+    CHECK_EQ(input.NumElements(), output.NumElements());
+
+    // Free the backing tensor which was allocated in PrepOp.
+    delete backing_tensor;
+  }
+};
+
+TEST_F(ScopedAllocatorConcatOpTest, Success1) {
+  MakeOp({32}, DT_FLOAT, "test", 120, 2);
+  ExecOp(DT_FLOAT, 120, {{16}, {16}});
+}
+
+TEST_F(ScopedAllocatorConcatOpTest, Success2) {
+  MakeOp({2, 2, 2}, DT_DOUBLE, "test", 120, 2);
+  ExecOp(DT_DOUBLE, 120, {{2, 2}, {2, 2}});
+}
+
+TEST_F(ScopedAllocatorConcatOpTest, Success3) {
+  MakeOp({3, 3, 3}, DT_HALF, "test", 120, 3);
+  ExecOp(DT_HALF, 120, {{3, 3}, {3, 3}, {3, 3}});
+}
+
+TEST_F(ScopedAllocatorConcatOpTest, FailDtypeCheck) {
+  MakeOp({8}, DT_FLOAT, "test", 120, 2);
+  EXPECT_DEATH(ExecOp(DT_DOUBLE, 120, {{4}, {4}}), "");
+}
+
+TEST_F(ScopedAllocatorConcatOpTest, FailNumElementsCheck) {
+  MakeOp({32}, DT_FLOAT, "test", 120, 2);
+  AddInputFromArray<float>({8}, {0, 1, 2, 3, 4, 5, 6, 7});
+  AddInputFromArray<float>({4}, {0, 1, 2, 3});
+  AddInputFromArray<float>({4}, {4, 5, 6, 7});
+  Status s = RunOpKernel();
+  EXPECT_EQ(s.code(), error::INVALID_ARGUMENT);
+}
+
+// This test should fail because the backing tensor and the input tensors are
+// unrelated, i.e. the inputs are not slices of the backing tensor.
+TEST_F(ScopedAllocatorConcatOpTest, FailBounds) {
+  MakeOp({8}, DT_DOUBLE, "test", 120, 2);
+  AddInputFromArray<double>({8}, {0, 1, 2, 3, 4, 5, 6, 7});
+  AddInputFromArray<double>({4}, {0, 1, 2, 3});
+  AddInputFromArray<double>({4}, {4, 5, 6, 7});
+  Status s = RunOpKernel();
+  EXPECT_EQ(s.code(), error::INVALID_ARGUMENT);
+}
+
+class ScopedAllocatorSplitOpTest : public OpsTestBase {
+ protected:
+  void BuildNodeDef(const TensorShape& shape, DataType dtype,
+                    const string& name, int32 id, int32 num_tensors) {
+    TF_EXPECT_OK(
+        NodeDefBuilder("scoped_allocator_split_op", "_ScopedAllocatorSplit")
+            .Attr("T", dtype)
+            .Attr("N", num_tensors)
+            .Attr("sa_name", name)
+            .Attr("id", id)
+            .Input(FakeInput(dtype))  // backing tensor and input
+            .Input(
+                FakeInput(num_tensors, dtype))  // list of subtensors to forward
+            .Finalize(node_def()));
+  }
+
+  void MakeOp(const TensorShape& shape, DataType dtype, const string& name,
+              int32 id, int32 num_tensors) {
+    BuildNodeDef(shape, dtype, name, id, num_tensors);
+    TF_EXPECT_OK(InitOp());
+  }
+
+  // Similar to ConcatOpTest, we add inputs that are allocated from
+  // ScopedAllocator so that the memory lines up nicely.
+  void ExecOp(DataType dtype, int32 id,
+              const std::vector<TensorShape>& fields_shapes) {
+    Tensor* backing_tensor = nullptr;
+    std::vector<Tensor> tensors;
+    std::vector<ScopedAllocator::Field> fields;
+    PrepOp(dtype, id, fields_shapes, &fields, &backing_tensor, allocator(),
+           device_->GetScopedAllocatorMgr(), "split", &tensors, &inputs_,
+           input_types_);
+
+    TF_ASSERT_OK(RunOpKernel());
+
+    // Check that outputs are slices of backing tensor.
+    const Tensor& input = context_->input(0);
+    const void* lower_limit = DMAHelper::base(&input);
+    const char* lower_limit_c =
+        static_cast<const char*>(lower_limit);  // for pointer arithmetic
+    OpOutputList output_list;
+    Status s = context_->output_list("output", &output_list);
+    TF_ASSERT_OK(s);
+    for (int i = 0; i < output_list.size(); i++) {
+      const Tensor& output = *(output_list[i]);
+      const void* expected_base =
+          static_cast<const void*>(lower_limit_c + fields[i].offset);
+      CHECK_EQ(output.dtype(), input.dtype());
+      CHECK_EQ(expected_base, DMAHelper::base(&output));
+      CHECK_EQ(output.NumElements(), fields_shapes[i].num_elements());
+    }
+
+    // Free the backing tensor which was allocated in PrepOp.
+    delete backing_tensor;
+  }
+};
+
+TEST_F(ScopedAllocatorSplitOpTest, Success1) {
+  MakeOp({32}, DT_FLOAT, "test", 120, 2);
+  ExecOp(DT_FLOAT, 120, {{16}, {16}});
+}
+
+TEST_F(ScopedAllocatorSplitOpTest, Success2) {
+  MakeOp({2, 2, 2}, DT_DOUBLE, "test", 120, 2);
+  ExecOp(DT_DOUBLE, 120, {{2, 2}, {2, 2}});
+}
+
+TEST_F(ScopedAllocatorSplitOpTest, Success3) {
+  MakeOp({3, 3, 3}, DT_HALF, "test", 120, 3);
+  ExecOp(DT_HALF, 120, {{3, 3}, {3, 3}, {3, 3}});
+}
+
+TEST_F(ScopedAllocatorSplitOpTest, FailNLessThan2) {
+  BuildNodeDef({4, 4}, DT_FLOAT, "test", 120, 1);
+  Status s = InitOp();
+  EXPECT_EQ(s.code(), error::INVALID_ARGUMENT);
+}
+
+TEST_F(ScopedAllocatorSplitOpTest, FailDtypeCheck) {
+  MakeOp({8}, DT_FLOAT, "test", 120, 2);
+  EXPECT_DEATH(ExecOp(DT_HALF, 120, {{4}, {4}}), "");
+}
+
+TEST_F(ScopedAllocatorSplitOpTest, FailBounds) {
+  MakeOp({8}, DT_DOUBLE, "test", 120, 2);
+  AddInputFromArray<double>({8}, {0, 1, 2, 3, 4, 5, 6, 7});
+  AddInputFromArray<double>({4}, {0, 1, 2, 3});
+  AddInputFromArray<double>({4}, {4, 5, 6, 7});
+  Status s = RunOpKernel();
+  EXPECT_EQ(s.code(), error::INVALID_ARGUMENT);
+}
+
+}  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/segment_reduction_ops.h b/tensorflow/core/kernels/segment_reduction_ops.h
index d0703d7576932c19933844ba43c6c00f357d1ba1..89abfe0eb1b49c4dae5e69803dc3b9e1cb6ba5ad 100644
--- a/tensorflow/core/kernels/segment_reduction_ops.h
+++ b/tensorflow/core/kernels/segment_reduction_ops.h
@@ -31,6 +31,13 @@ limitations under the License.
 // non-GPU targets. This only breaks in clang, because it's more strict for
 // template code and CudaAtomicMax is used in template context.
 
+// This file requires the following include because it uses CudaAtomicMax:
+// #include "tensorflow/core/util/cuda_kernel_helper.h"
+
+// Unfortunately we can't add the #include, since it breaks compilation for
+// non-GPU targets. This only breaks in clang, because it's more strict for
+// template code and CudaAtomicMax is used in template context.
+
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
diff --git a/tensorflow/core/kernels/self_adjoint_eig_op.cc b/tensorflow/core/kernels/self_adjoint_eig_op.cc
index bcd88773902824c6e88db4226af43993d5649007..cea5883db7bd5d07fd594628268c2f79cfd2b5fc 100644
--- a/tensorflow/core/kernels/self_adjoint_eig_op.cc
+++ b/tensorflow/core/kernels/self_adjoint_eig_op.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/kernels/linalg_ops_common.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/denormal.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -55,6 +56,9 @@ class SelfAdjointEigOp : public LinearAlgebraOp<Scalar> {
       return;
     }
 
+    // This algorithm relies on denormals, so switch them back on locally.
+    port::ScopedDontFlushDenormal dont_flush_denormals;
+
     Eigen::SelfAdjointEigenSolver<
         Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
         es(inputs[0]);
diff --git a/tensorflow/core/kernels/self_adjoint_eig_v2_op_impl.h b/tensorflow/core/kernels/self_adjoint_eig_v2_op_impl.h
index 8c0633f4226005b401ed0f0fcd1f56bbba772701..271dd2c4858aef6d9970b907f2a8d205178a978f 100644
--- a/tensorflow/core/kernels/self_adjoint_eig_v2_op_impl.h
+++ b/tensorflow/core/kernels/self_adjoint_eig_v2_op_impl.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/kernels/linalg_ops_common.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/denormal.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -61,6 +62,9 @@ class SelfAdjointEigV2Op : public LinearAlgebraOp<Scalar> {
       return;
     }
 
+    // This algorithm relies on denormals, so switch them back on locally.
+    port::ScopedDontFlushDenormal dont_flush_denormals;
+
     Eigen::SelfAdjointEigenSolver<Matrix> eig(
         inputs[0],
         compute_v_ ? Eigen::ComputeEigenvectors : Eigen::EigenvaluesOnly);
diff --git a/tensorflow/core/kernels/snapshot_op.cc b/tensorflow/core/kernels/snapshot_op.cc
index 50157d5d48f93bfe61cbac95246123ef0a7d446e..fe04dcf72e2aa73a0140338a8f207177048c7d0a 100644
--- a/tensorflow/core/kernels/snapshot_op.cc
+++ b/tensorflow/core/kernels/snapshot_op.cc
@@ -22,6 +22,26 @@ limitations under the License.
 
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename Scalar>
+class SnapshotOp : public OpKernel {
+ public:
+  explicit SnapshotOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input = context->input(0);
+    Tensor* output = nullptr;
+    // Try to use buffer forwarding to avoid an explicit copy.
+    OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
+                                {0}, 0, input.shape(), &output));
+    if (!output->SharesBufferWith(input)) {
+      functor::Snapshot<Device, Scalar> functor;
+      functor(context->eigen_device<Device>(), input.flat<Scalar>(),
+              output->flat<Scalar>());
+    }
+  }
+};
 
 #define REGISTER_KERNEL(TYPE)                                        \
   REGISTER_KERNEL_BUILDER(                                           \
@@ -31,6 +51,16 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 TF_CALL_POD_TYPES(REGISTER_KERNEL);
 #undef REGISTER_KERNEL
 
+#if GOOGLE_CUDA
+#define REGISTER_KERNEL(TYPE)                                        \
+  REGISTER_KERNEL_BUILDER(                                           \
+      Name("Snapshot").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"), \
+      SnapshotOp<GPUDevice, TYPE>);
+
+TF_CALL_POD_TYPES(REGISTER_KERNEL);
+#undef REGISTER_KERNEL
+#endif
+
 #if TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SyclDevice;
 #define REGISTER_SYCL_KERNEL(TYPE)                                    \
diff --git a/tensorflow/core/kernels/snapshot_op.h b/tensorflow/core/kernels/snapshot_op.h
index b94834f15988a21ad41eefc8030b3da1a58875f8..a18065d42ba832d5b34f2dd534bc103c907310fe 100644
--- a/tensorflow/core/kernels/snapshot_op.h
+++ b/tensorflow/core/kernels/snapshot_op.h
@@ -26,29 +26,19 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 
 namespace tensorflow {
+namespace functor {
 
+// Functor used by SnapshotOp.
 template <typename Device, typename Scalar>
-class SnapshotOp : public OpKernel {
- public:
-  explicit SnapshotOp(OpKernelConstruction* context) : OpKernel(context) {}
-
-  void Compute(OpKernelContext* context) override {
-    const Tensor& input = context->input(0);
-    Tensor* output = nullptr;
-    // Try to use buffer forwarding to avoid an explicit copy.
-    OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
-                                {0}, 0, input.shape(), &output));
-    if (!output->SharesBufferWith(input)) {
-      // We had to allocate a new buffer since the refcount on the input was
-      // greater than 1. Copy the input to the new buffer.
-      const Device& device = context->eigen_device<Device>();
-      device.memcpy(output->template flat<Scalar>().data(),
-                    input.template flat<Scalar>().data(),
-                    input.NumElements() * sizeof(Scalar));
-    }
+struct Snapshot {
+  void operator()(const Device& device,
+                  typename TTypes<Scalar>::ConstTensor input,
+                  typename TTypes<Scalar>::Tensor output) {
+    device.memcpy(output.data(), input.data(), input.size() * sizeof(Scalar));
   }
 };
 
+}  // namespace functor
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_KERNELS_SNAPSHOT_OP_H_
diff --git a/tensorflow/core/kernels/snapshot_op_gpu.cu.cc b/tensorflow/core/kernels/snapshot_op_gpu.cu.cc
index 52070be838d65d21813dfe097db9c395ef5a8448..e4e3bd5220382b50eca263d50d91d503b3a1c526 100644
--- a/tensorflow/core/kernels/snapshot_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/snapshot_op_gpu.cu.cc
@@ -24,13 +24,10 @@ limitations under the License.
 namespace tensorflow {
 typedef Eigen::GpuDevice GPUDevice;
 
-#define REGISTER_KERNEL(TYPE)                                        \
-  REGISTER_KERNEL_BUILDER(                                           \
-      Name("Snapshot").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"), \
-      SnapshotOp<GPUDevice, TYPE>);
+// Definition of the GPU implementations declared in softsign_op.cc.
+#define DEFINE_GPU_KERNELS(T) template struct functor::Snapshot<GPUDevice, T>;
 
-TF_CALL_POD_TYPES(REGISTER_KERNEL);
-#undef REGISTER_KERNEL
+TF_CALL_POD_TYPES(DEFINE_GPU_KERNELS);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/softmax_op_gpu.cu.cc b/tensorflow/core/kernels/softmax_op_gpu.cu.cc
index 1f4a82a7334e35c48af09c897895f79ee30e1ebd..130d693dbdf132515a7ffcfc0bc6c9631a5aee21 100644
--- a/tensorflow/core/kernels/softmax_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/softmax_op_gpu.cu.cc
@@ -33,8 +33,42 @@ namespace tensorflow {
 
 namespace {
 
+template <typename U, typename T>
+__device__ __host__ EIGEN_STRONG_INLINE
+    typename std::enable_if<!std::is_same<T, U>::value, U>::type
+    strict_cast(T t);
+
+template <typename U, typename T>
+__device__ __host__ EIGEN_STRONG_INLINE
+    typename std::enable_if<std::is_same<T, U>::value, U>::type
+    strict_cast(T t) {
+  return t;
+}
+
+template <>
+__device__ __host__ EIGEN_STRONG_INLINE float strict_cast<float, Eigen::half>(
+    Eigen::half t) {
+  return functor::HalfToFloat()(t);
+}
+
+template <>
+__device__ __host__ EIGEN_STRONG_INLINE Eigen::half
+strict_cast<Eigen::half, float>(float t) {
+  return functor::FloatToHalf()(t);
+}
+
 template <typename T>
-__global__ void GenerateNormalizedProb(const T* logits, const T* sum_probs,
+struct softmax_traits {
+  using accumulator_type = T;
+};
+
+template <>
+struct softmax_traits<Eigen::half> {
+  using accumulator_type = float;
+};
+
+template <typename T, typename U>
+__global__ void GenerateNormalizedProb(const T* logits, const U* sum_probs,
                                        const T* max_logits, T* output,
                                        const int num_rows, const int num_cols,
                                        const bool in_log_space) {
@@ -43,25 +77,33 @@ __global__ void GenerateNormalizedProb(const T* logits, const T* sum_probs,
   const int row = tid / num_cols;
   const int col = tid % num_cols;
 
+  // TODO(jamesqin): change to half2 load when inputs are Eigen::half.
+  U input = strict_cast<U>(logits[tid]);
+  U max_val = strict_cast<U>(ldg(max_logits + row));
+  U result;
+
   if (row < num_rows && col < num_cols) {
-    if (in_log_space)
-      output[tid] =
-          logits[tid] - ldg(max_logits + row) - log(ldg(sum_probs + row));
-    else
-      output[tid] =
-          exp(logits[tid] - ldg(max_logits + row)) / ldg(sum_probs + row);
+    if (in_log_space) {
+      result = input - max_val - log(ldg(sum_probs + row));
+    } else {
+      result = exp(input - max_val) / ldg(sum_probs + row);
+    }
+    output[tid] = strict_cast<T>(result);
   }
 }
 
-template <typename T>
+template <typename T, typename U>
 struct SubtractAndExpFunctor {
   __host__ __device__ SubtractAndExpFunctor(const T* logits,
                                             const T* max_logits,
                                             const int num_cols)
       : logits_(logits), max_logits_(max_logits), num_cols_(num_cols) {}
 
-  __host__ __device__ T operator()(const int gid) const {
-    return exp(logits_[gid] - ldg(max_logits_ + gid / num_cols_));
+  __host__ __device__ U operator()(const int gid) const {
+    // TODO(jamesqin): change to half2 load when inputs are Eigen::half.
+    const U diff =
+        strict_cast<U>(logits_[gid] - ldg(max_logits_ + gid / num_cols_));
+    return exp(diff);
   }
 
   const T* logits_;
@@ -80,7 +122,6 @@ void DoRowReduction(OpKernelContext* context, T* output, InputIter input,
   functor::ReduceImpl<T, Op, T*, InputIter, ReductionAxes>(
       context, output, input, 2, rows, cols, 1, 1, constants.kOne, op);
 }
-
 }  // namespace
 
 template <typename T>
@@ -108,8 +149,10 @@ class SoftmaxOpGPU : public OpKernel {
       OP_REQUIRES_OK(context,
                      context->allocate_temp(DataTypeToEnum<T>::value,
                                             softmax_out->shape(), &max_logits));
+
+      typedef typename softmax_traits<T>::accumulator_type acc_type;
       OP_REQUIRES_OK(context,
-                     context->allocate_temp(DataTypeToEnum<T>::value,
+                     context->allocate_temp(DataTypeToEnum<acc_type>::value,
                                             softmax_out->shape(), &sum_probs));
 
       DoRowReduction<T, cub::Max, const T*>(
@@ -120,25 +163,28 @@ class SoftmaxOpGPU : public OpKernel {
       const int numBlocks = Eigen::divup(rows * cols, numThreads);
 
       cub::CountingInputIterator<int> counting_iterator(0);
-      typedef cub::TransformInputIterator<T, SubtractAndExpFunctor<T>,
+      typedef cub::TransformInputIterator<acc_type,
+                                          SubtractAndExpFunctor<T, acc_type>,
                                           cub::CountingInputIterator<int>>
           InputIterType;
 
       InputIterType input_itr(
           counting_iterator,
-          SubtractAndExpFunctor<T>(
+          SubtractAndExpFunctor<T, acc_type>(
               reinterpret_cast<const T*>(logits_in_.flat<T>().data()),
               reinterpret_cast<const T*>(max_logits.flat<T>().data()), cols));
 
-      DoRowReduction<T, cub::Sum, InputIterType>(
-          context, const_cast<T*>(sum_probs.flat<T>().data()), input_itr, rows,
-          cols);
+      DoRowReduction<acc_type, cub::Sum, InputIterType>(
+          context, const_cast<acc_type*>(sum_probs.flat<acc_type>().data()),
+          input_itr, rows, cols);
 
-      GenerateNormalizedProb<<<numBlocks, numThreads, 0, cu_stream>>>(
-          reinterpret_cast<const T*>(logits_in_.flat<T>().data()),
-          reinterpret_cast<const T*>(sum_probs.flat<T>().data()),
-          reinterpret_cast<const T*>(max_logits.flat<T>().data()),
-          const_cast<T*>(softmax_out->flat<T>().data()), rows, cols, log_);
+      GenerateNormalizedProb<T, acc_type>
+          <<<numBlocks, numThreads, 0, cu_stream>>>(
+              reinterpret_cast<const T*>(logits_in_.flat<T>().data()),
+              reinterpret_cast<const acc_type*>(
+                  sum_probs.flat<acc_type>().data()),
+              reinterpret_cast<const T*>(max_logits.flat<T>().data()),
+              const_cast<T*>(softmax_out->flat<T>().data()), rows, cols, log_);
     }
   }
 
diff --git a/tensorflow/core/kernels/sparse_cross_op.cc b/tensorflow/core/kernels/sparse_cross_op.cc
index 4b5df7aff0e9b345fb94f9f06a9906972448c048..4ebb7fbcc70e10a0d1b7a5dd063c2524b01b6dfc 100644
--- a/tensorflow/core/kernels/sparse_cross_op.cc
+++ b/tensorflow/core/kernels/sparse_cross_op.cc
@@ -419,7 +419,7 @@ class SparseCrossOp : public OpKernel {
           context, TensorShapeUtils::IsMatrix(dense_list_in[i].shape()),
           errors::InvalidArgument(
               "Dense inputs should be a matrix but received shape ",
-              indices_list_in[i].shape().DebugString(), " at position ", i));
+              dense_list_in[i].shape().DebugString(), " at position ", i));
       OP_REQUIRES(context, dense_list_in[i].dim_size(0) == batch_size,
                   errors::InvalidArgument("Expected batch size ", batch_size,
                                           " got ", dense_list_in[i].dim_size(0),
diff --git a/tensorflow/core/kernels/xent_op.cc b/tensorflow/core/kernels/xent_op.cc
index a6a71fdfaf126410b26766954c0c2fc5b86d003a..9a3612bd72cdc2bc1c3c471beed6616816072a71 100644
--- a/tensorflow/core/kernels/xent_op.cc
+++ b/tensorflow/core/kernels/xent_op.cc
@@ -17,12 +17,14 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
-#include "tensorflow/core/kernels/xent_op.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/xent_op.h"
+#include "tensorflow/core/util/bcast.h"
 
 namespace tensorflow {
 
@@ -41,37 +43,56 @@ class SoftmaxXentWithLogitsOp : public OpKernel {
   void Compute(OpKernelContext* context) override {
     const Tensor& logits_in = context->input(0);
     const Tensor& labels_in = context->input(1);
-    OP_REQUIRES(context, logits_in.IsSameSize(labels_in),
-                errors::InvalidArgument(
-                    "logits and labels must be same size: logits_size=",
-                    logits_in.shape().DebugString(),
-                    " labels_size=", labels_in.shape().DebugString()));
-    OP_REQUIRES(context, TensorShapeUtils::IsMatrix(logits_in.shape()),
-                errors::InvalidArgument("logits must be 2-dimensional"));
-    // As we already tested that both inputs have the same shape no need to
-    // check that "labels" is a matrix too.
+
+    TensorShape shape_in = logits_in.shape();
+
+    BCast bcast(BCast::FromShape(logits_in.shape()),
+                BCast::FromShape(labels_in.shape()));
+    if (!logits_in.IsSameSize(labels_in)) {
+      OP_REQUIRES(context, bcast.IsValid(),
+                  errors::InvalidArgument(
+                      "logits and labels must be broadcastable: logits_size=",
+                      logits_in.shape().DebugString(),
+                      " labels_size=", labels_in.shape().DebugString()));
+      shape_in = BCast::ToShape(bcast.output_shape());
+    }
+    OP_REQUIRES(context, TensorShapeUtils::IsMatrix(shape_in),
+                errors::InvalidArgument("logits and labels must be beither "
+                                        "2-dimensional, or roadcasted to "
+                                        "2-dimensional"));
 
     // loss is 1-D (one per example), and size is batch_size.
 
     Tensor scratch;
     OP_REQUIRES_OK(
         context, context->allocate_temp(DataTypeToEnum<T>::value,
-                                        TensorShape({logits_in.dim_size(0), 1}),
+                                        TensorShape({shape_in.dim_size(0), 1}),
                                         &scratch));
 
     Tensor* loss_out = nullptr;
     OP_REQUIRES_OK(context,
                    context->allocate_output(
-                       0, TensorShape({logits_in.dim_size(0)}), &loss_out));
+                       0, TensorShape({shape_in.dim_size(0)}), &loss_out));
     Tensor* back_out = nullptr;
     // Try to reuse the logits_in buffer for the backprop output.
     OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
-                                {0}, 1, logits_in.shape(), &back_out));
-    if (logits_in.dim_size(0) > 0) {
+                                {0}, 1, shape_in, &back_out));
+    if (shape_in.dim_size(0) > 0) {
       functor::XentFunctor<Device, T> functor;
-      functor(context->eigen_device<Device>(), logits_in.matrix<T>(),
-              labels_in.matrix<T>(), scratch.matrix<T>(), loss_out->vec<T>(),
-              back_out->matrix<T>());
+      if (logits_in.IsSameSize(labels_in)) {
+        functor(context->eigen_device<Device>(), shape_in.AsEigenDSizes<2>(),
+                Eigen::array<Eigen::DenseIndex, 2>{1, 1},
+                Eigen::array<Eigen::DenseIndex, 2>{1, 1}, logits_in.matrix<T>(),
+                labels_in.matrix<T>(), scratch.matrix<T>(), loss_out->vec<T>(),
+                back_out->matrix<T>());
+      } else {
+        functor(context->eigen_device<Device>(), shape_in.AsEigenDSizes<2>(),
+                BCast::ToIndexArray<2>(bcast.x_bcast()),
+                BCast::ToIndexArray<2>(bcast.y_bcast()),
+                logits_in.template shaped<T, 2>(bcast.x_reshape()),
+                labels_in.template shaped<T, 2>(bcast.y_reshape()),
+                scratch.matrix<T>(), loss_out->vec<T>(), back_out->matrix<T>());
+      }
     }
   }
 };
@@ -81,13 +102,17 @@ class SoftmaxXentWithLogitsOp : public OpKernel {
 namespace functor {
 template <typename Device, typename T>
 struct XentFunctorBase {
-  void operator()(const Device& d, typename TTypes<T>::ConstMatrix logits,
+  void operator()(const Device& d,
+                  const Eigen::DSizes<Eigen::DenseIndex, 2>& shape,
+                  const Eigen::array<Eigen::DenseIndex, 2>& logits_bcast,
+                  const Eigen::array<Eigen::DenseIndex, 2>& labels_bcast,
+                  typename TTypes<T>::ConstMatrix logits,
                   typename TTypes<T>::ConstMatrix labels,
                   typename TTypes<T>::Matrix scratch,
                   typename TTypes<T>::Vec loss,
                   typename TTypes<T>::Matrix backprop) {
-    XentEigenImpl<Device, T>::Compute(d, logits, labels, scratch, loss,
-                                      backprop);
+    XentEigenImpl<Device, T>::Compute(d, shape, logits_bcast, labels_bcast,
+                                      logits, labels, scratch, loss, backprop);
   }
 };
 
diff --git a/tensorflow/core/kernels/xent_op.h b/tensorflow/core/kernels/xent_op.h
index e689fca7ff822cdebe68fa0d7f197a03f74104ce..87be17fca98d756a179a74552518a13484d03850 100644
--- a/tensorflow/core/kernels/xent_op.h
+++ b/tensorflow/core/kernels/xent_op.h
@@ -18,6 +18,7 @@ limitations under the License.
 // Functor definition for XentOp, must be compilable by nvcc.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
 #include "tensorflow/core/framework/tensor_types.h"
 
 namespace tensorflow {
@@ -33,7 +34,11 @@ struct XentFunctor {
   // scratch: temporary tensor, dims: batch_size, 1
   // loss: output tensor for the loss, dims: batch_size.
   // backprop: output tensor for the backprop, dims: batch_size, num_classes.
-  void operator()(const Device& d, typename TTypes<T>::ConstMatrix logits,
+  void operator()(const Device &d,
+                  const Eigen::DSizes<Eigen::DenseIndex, 2> &shape,
+                  const Eigen::array<Eigen::DenseIndex, 2> &logits_bcast,
+                  const Eigen::array<Eigen::DenseIndex, 2> &labels_bcast,
+                  typename TTypes<T>::ConstMatrix logits,
                   typename TTypes<T>::ConstMatrix labels,
                   typename TTypes<T>::Matrix scratch,
                   typename TTypes<T>::Vec loss,
@@ -45,7 +50,11 @@ struct XentFunctor {
 // specializations for both device types.
 template <typename Device, typename T>
 struct XentEigenImpl {
-  static void Compute(const Device& d, typename TTypes<T>::ConstMatrix logits,
+  static void Compute(const Device &d,
+                      const Eigen::DSizes<Eigen::DenseIndex, 2> &shape,
+                      const Eigen::array<Eigen::DenseIndex, 2> &logits_bcast,
+                      const Eigen::array<Eigen::DenseIndex, 2> &labels_bcast,
+                      typename TTypes<T>::ConstMatrix logits,
                       typename TTypes<T>::ConstMatrix labels,
                       typename TTypes<T>::Matrix scratch,
                       typename TTypes<T>::Vec loss,
@@ -57,8 +66,8 @@ struct XentEigenImpl {
     const int kBatchDim = 0;
     const int kClassDim = 1;
 
-    const int batch_size = logits.dimension(kBatchDim);
-    const int num_classes = logits.dimension(kClassDim);
+    const int batch_size = shape[kBatchDim];
+    const int num_classes = shape[kClassDim];
 
 // These arrays are used to reduce along the class dimension, and broadcast
 // the resulting value to all classes.
@@ -84,10 +93,12 @@ struct XentEigenImpl {
 #endif
 
     // max_logits along classes.
-    scratch.reshape(batch_only).device(d) = logits.maximum(along_class);
+    scratch.reshape(batch_only).device(d) =
+        logits.broadcast(logits_bcast).maximum(along_class);
 
     // logits - max_logits.
-    backprop.device(d) = logits - scratch.broadcast(one_by_class);
+    backprop.device(d) =
+        logits.broadcast(logits_bcast) - scratch.broadcast(one_by_class);
 
     // sum(exp(logits - max_logits)) along classes.
     scratch.reshape(batch_only).device(d) = backprop.exp().sum(along_class);
@@ -99,15 +110,15 @@ struct XentEigenImpl {
     //  sum(-labels *
     //     ((logits - max_logits) - log(sum(exp(logits - max_logits)))))
     //  along classes
-    loss.device(d) =
-        (labels * (scratch.log().eval().broadcast(one_by_class) - backprop))
-            .eval()
-            .sum(along_class);
+    loss.device(d) = (labels.broadcast(labels_bcast) *
+                      (scratch.log().eval().broadcast(one_by_class) - backprop))
+                         .eval()
+                         .sum(along_class);
 
     // backprop: prob - labels, where
     //   prob = exp(logits - max_logits) / sum(exp(logits - max_logits))
-    backprop.device(d) =
-        (backprop.exp() / scratch.broadcast(one_by_class)) - labels;
+    backprop.device(d) = (backprop.exp() / scratch.broadcast(one_by_class)) -
+                         labels.broadcast(labels_bcast);
   }
 };
 
diff --git a/tensorflow/core/kernels/xent_op_gpu.cu.cc b/tensorflow/core/kernels/xent_op_gpu.cu.cc
index 05ee7da490e34d427cc4023bb33322c35006acf5..2c0c0b3a027e28c9502b162aa491dac83fea5fdd 100644
--- a/tensorflow/core/kernels/xent_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/xent_op_gpu.cu.cc
@@ -31,12 +31,17 @@ typedef Eigen::GpuDevice GPUDevice;
 namespace functor {
 template <typename T>
 struct XentFunctor<GPUDevice, T> {
-  void operator()(const GPUDevice& d, typename TTypes<T>::ConstMatrix logits,
+  void operator()(const GPUDevice &d,
+                  const Eigen::DSizes<Eigen::DenseIndex, 2> &shape,
+                  const Eigen::array<Eigen::DenseIndex, 2> &logits_bcast,
+                  const Eigen::array<Eigen::DenseIndex, 2> &labels_bcast,
+                  typename TTypes<T>::ConstMatrix logits,
                   typename TTypes<T>::ConstMatrix labels,
                   typename TTypes<T>::Matrix scratch,
                   typename TTypes<T>::Vec loss,
                   typename TTypes<T>::Matrix backprop) {
-    XentEigenImpl<GPUDevice, T>::Compute(d, logits, labels, scratch, loss,
+    XentEigenImpl<GPUDevice, T>::Compute(d, shape, logits_bcast, labels_bcast,
+                                         logits, labels, scratch, loss,
                                          backprop);
   }
 };
diff --git a/tensorflow/core/lib/bfloat16/bfloat16.h b/tensorflow/core/lib/bfloat16/bfloat16.h
index 6a1cc0994fd52eff5ed3e0611677107dcb7e4ad0..126e5a17af42a36be31f4fa6698f55d02f8321a7 100644
--- a/tensorflow/core/lib/bfloat16/bfloat16.h
+++ b/tensorflow/core/lib/bfloat16/bfloat16.h
@@ -19,8 +19,8 @@ limitations under the License.
 #include <cmath>
 #include <complex>
 
-// We need types.h here in order to pick up __BYTE_ORDER__ from cpu_info.h
-#include "tensorflow/core/platform/types.h"
+// We need cpu_info.h here in order to pick up __BYTE_ORDER__.
+#include "tensorflow/core/platform/cpu_info.h"
 
 #ifdef __CUDACC__
 // All functions callable from CUDA code must be qualified with __device__
@@ -165,6 +165,192 @@ struct bfloat16 {
     return complex128(double(*this), double(0.0));
   }
 
+  union FP32 {
+    unsigned int u;
+    float f;
+  };
+
+  // Converts a float point to bfloat16, with round-nearest-to-even as rounding
+  // method.
+  // TODO(b/69266521): Add a truncate_to_bfloat16 function and make this
+  // function as default behavior.
+  // TODO: There is a slightly faster implementation (8% faster on CPU)
+  // than this (documented in cl/175987786), that is exponentially harder to
+  // understand and document. Switch to the faster version when converting to
+  // BF16 becomes compute-bound.
+  B16_DEVICE_FUNC static bfloat16 round_to_bfloat16(float v) {
+    uint32_t input;
+    FP32 f;
+    f.f = v;
+    input = f.u;
+    bfloat16 output;
+
+    if (float_isnan(v)) {
+      // If the value is a NaN, squash it to a qNaN with msb of fraction set,
+      // this makes sure after truncation we don't end up with an inf.
+      //
+      // qNaN magic: All exponent bits set + most significant bit of fraction
+      // set.
+      output.value = 0x7fc0;
+    } else {
+      // Fast rounding algorithm that rounds a half value to nearest even. This
+      // reduces expected error when we convert a large number of floats. Here
+      // is how it works:
+      //
+      // Definitions:
+      // To convert a float 32 to bfloat16, a float 32 can be viewed as 32 bits
+      // with the following tags:
+      //
+      // Sign |  Exp (8 bits) | Frac (23 bits)
+      //  S     EEEEEEEE         FFFFFFLRTTTTTTTTTTTTTTT
+      //
+      //  S: Sign bit.
+      //  E: Exponent bits.
+      //  F: First 6 bits of fraction.
+      //  L: Least significant bit of resulting bfloat16 if we truncate away the
+      //  rest of the float32. This is also the 7th bit of fraction
+      //  R: Rounding bit, 8th bit of fraction.
+      //  T: Sticky bits, rest of fraction, 15 bits.
+      //
+      // To round half to nearest even, there are 3 cases where we want to round
+      // down (simply truncate the result of the bits away, which consists of
+      // rounding bit and sticky bits) and two cases where we want to round up
+      // (truncate then add one to the result).
+      //
+      // The fast converting algorithm simply adds lsb (L) to 0x7fff (15 bits of
+      // 1s) as the rounding bias, adds the rounding bias to the input, then
+      // truncates the last 16 bits away.
+      //
+      // To understand how it works, we can analyze this algorithm case by case:
+      //
+      // 1. L = 0, R = 0:
+      //   Expect: round down, this is less than half value.
+      //
+      //   Algorithm:
+      //   - Rounding bias: 0x7fff + 0 = 0x7fff
+      //   - Adding rounding bias to input may create any carry, depending on
+      //   whether there is any value set to 1 in T bits.
+      //   - R may be set to 1 if there is a carry.
+      //   - L remains 0.
+      //   - Note that this case also handles Inf and -Inf, where all fraction
+      //   bits, including L, R and Ts are all 0. The output remains Inf after
+      //   this algorithm.
+      //
+      // 2. L = 1, R = 0:
+      //   Expect: round down, this is less than half value.
+      //
+      //   Algorithm:
+      //   - Rounding bias: 0x7fff + 1 = 0x8000
+      //   - Adding rounding bias to input doesn't change sticky bits but
+      //   adds 1 to rounding bit.
+      //   - L remains 1.
+      //
+      // 3. L = 0, R = 1, all of T are 0:
+      //   Expect: round down, this is exactly at half, the result is already
+      //   even (L=0).
+      //
+      //   Algorithm:
+      //   - Rounding bias: 0x7fff + 0 = 0x7fff
+      //   - Adding rounding bias to input sets all sticky bits to 1, but
+      //   doesn't create a carry.
+      //   - R remains 1.
+      //   - L remains 0.
+      //
+      // 4. L = 1, R = 1:
+      //   Expect: round up, this is exactly at half, the result needs to be
+      //   round to the next even number.
+      //
+      //   Algorithm:
+      //   - Rounding bias: 0x7fff + 1 = 0x8000
+      //   - Adding rounding bias to input doesn't change sticky bits, but
+      //   creates a carry from rounding bit.
+      //   - The carry sets L to 0, creates another carry bit and propagate
+      //   forward to F bits.
+      //   - If all the F bits are 1, a carry then propagates to the exponent
+      //   bits, which then creates the minimum value with the next exponent
+      //   value. Note that we won't have the case where exponents are all 1,
+      //   since that's either a NaN (handled in the other if condition) or inf
+      //   (handled in case 1).
+      //
+      // 5. L = 0, R = 1, any of T is 1:
+      //   Expect: round up, this is greater than half.
+      //
+      //   Algorithm:
+      //   - Rounding bias: 0x7fff + 0 = 0x7fff
+      //   - Adding rounding bias to input creates a carry from sticky bits,
+      //   sets rounding bit to 0, then create another carry.
+      //   - The second carry sets L to 1.
+      //
+      // Examples:
+      //
+      //  Exact half value that is already even:
+      //    Input:
+      //    Sign |  Exp (8 bit)     | Frac (first 7 bit) | Frac (last 16 bit)
+      //     S     E E E E E E E E      F F F F F F L     RTTTTTTTTTTTTTTT
+      //     0     0 0 0 0 0 0 0 0      0 0 0 0 0 1 0     1000000000000000
+      //
+      //     This falls into case 3. We truncate the rest of 16 bits and no
+      //     carry is created into F and L:
+      //
+      //    Output:
+      //    Sign |  Exp (8 bit)     | Frac (first 7 bit)
+      //     S     E E E E E E E E      F F F F F F L
+      //     0     0 0 0 0 0 0 0 0      0 0 0 0 0 1 0
+      //
+      //  Exact half value, round to next even number:
+      //    Input:
+      //    Sign |  Exp (8 bit)     | Frac (first 7 bit) | Frac (last 16 bit)
+      //     S     E E E E E E E E      F F F F F F L     RTTTTTTTTTTTTTTT
+      //     0     0 0 0 0 0 0 0 0      0 0 0 0 0 0 1     1000000000000000
+      //
+      //     This falls into case 4. We create a carry from R and T,
+      //     which then propagates into L and F:
+      //
+      //    Output:
+      //    Sign |  Exp (8 bit)     | Frac (first 7 bit)
+      //     S     E E E E E E E E      F F F F F F L
+      //     0     0 0 0 0 0 0 0 0      0 0 0 0 0 1 0
+      //
+      //
+      //  Max denormal value round to min normal value:
+      //    Input:
+      //    Sign |  Exp (8 bit)     | Frac (first 7 bit) | Frac (last 16 bit)
+      //     S     E E E E E E E E      F F F F F F L     RTTTTTTTTTTTTTTT
+      //     0     0 0 0 0 0 0 0 0      1 1 1 1 1 1 1     1111111111111111
+      //
+      //     This falls into case 4. We create a carry from R and T,
+      //     propagate into L and F, which then propagates into exponent
+      //     bits:
+      //
+      //    Output:
+      //    Sign |  Exp (8 bit)     | Frac (first 7 bit)
+      //     S     E E E E E E E E      F F F F F F L
+      //     0     0 0 0 0 0 0 0 1      0 0 0 0 0 0 0
+      //
+      //  Max normal value round to Inf:
+      //    Input:
+      //    Sign |  Exp (8 bit)     | Frac (first 7 bit) | Frac (last 16 bit)
+      //     S     E E E E E E E E      F F F F F F L     RTTTTTTTTTTTTTTT
+      //     0     1 1 1 1 1 1 1 0      1 1 1 1 1 1 1     1111111111111111
+      //
+      //     This falls into case 4. We create a carry from R and T,
+      //     propagate into L and F, which then propagates into exponent
+      //     bits:
+      //
+      //    Sign |  Exp (8 bit)     | Frac (first 7 bit)
+      //     S     E E E E E E E E      F F F F F F L
+      //     0     1 1 1 1 1 1 1 1      0 0 0 0 0 0 0
+      //
+      //
+      // Least significant bit of resulting bfloat.
+      uint32_t lsb = (input >> 16) & 1;
+      uint32_t rounding_bias = 0x7fff + lsb;
+      input += rounding_bias;
+      output.value = static_cast<uint16_t>(input >> 16);
+    }
+    return output;
+  }
+
   static bfloat16 epsilon() {
     bfloat16 x;
     x.value = 0x3c00;  // 0x1.0p-7
@@ -177,7 +363,7 @@ struct bfloat16 {
   static const uint16_t NAN_VALUE = 0x7FC0;
 
  private:
-  B16_DEVICE_FUNC bool float_isnan(const float& x) {
+  B16_DEVICE_FUNC static bool float_isnan(const float& x) {
 #ifdef __CUDA_ARCH__
     return ::isnan(x);
 #else
diff --git a/tensorflow/core/lib/core/stringpiece.h b/tensorflow/core/lib/core/stringpiece.h
index 2d00f717dcb26d527042014ac704e3ba91b45a58..835b938cbfdf35ae5f1a040ad793544e3a7eb101 100644
--- a/tensorflow/core/lib/core/stringpiece.h
+++ b/tensorflow/core/lib/core/stringpiece.h
@@ -88,11 +88,13 @@ class StringPiece {
 
   size_t find(char c, size_t pos = 0) const;
   size_t rfind(char c, size_t pos = npos) const;
+  // DEPRECATED: Use tensorflow::str_util::StrContains instead.
   bool contains(StringPiece s) const;
 
   // Checks whether StringPiece starts with x and if so advances the beginning
   // of it to past the match.  It's basically a shortcut for starts_with
   // followed by remove_prefix.
+  // DEPRECATED: Use tensorflow::str_util::ConsumePrefix instead.
   bool Consume(StringPiece x) {
     if (starts_with(x)) {
       remove_prefix(x.size_);
@@ -113,10 +115,12 @@ class StringPiece {
   int compare(StringPiece b) const;
 
   // Return true iff "x" is a prefix of "*this"
+  // DEPRECATED: Use tensorflow::str_util::StartsWith instead.
   bool starts_with(StringPiece x) const {
     return ((size_ >= x.size_) && (memcmp(data_, x.data_, x.size_) == 0));
   }
   // Return true iff "x" is a suffix of "*this"
+  // DEPRECATED: Use tensorflow::str_util::EndsWith instead.
   bool ends_with(StringPiece x) const {
     return ((size_ >= x.size_) &&
             (memcmp(data_ + (size_ - x.size_), x.data_, x.size_) == 0));
diff --git a/tensorflow/core/lib/db/BUILD b/tensorflow/core/lib/db/BUILD
index 9ff87e8d66d2575966c703a896ac9ff0bc51661a..ce09c2009ac81b5cd2736800852a148bfefff6a9 100644
--- a/tensorflow/core/lib/db/BUILD
+++ b/tensorflow/core/lib/db/BUILD
@@ -42,9 +42,3 @@ tf_cc_test(
         "//tensorflow/core:test_main",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["*"]),
-    visibility = ["//tensorflow:__pkg__"],
-)
diff --git a/tensorflow/core/lib/io/format.cc b/tensorflow/core/lib/io/format.cc
index 64852943ad560edeede640fbd882e2984a4afee5..0c24c660a246eacde9fe0a0368a66eb511b1786d 100644
--- a/tensorflow/core/lib/io/format.cc
+++ b/tensorflow/core/lib/io/format.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <limits>
+
 #include "tensorflow/core/lib/io/format.h"
 
 #include "tensorflow/core/lib/core/coding.h"
@@ -84,6 +86,11 @@ Status ReadBlock(RandomAccessFile* file, const BlockHandle& handle,
   // Read the block contents as well as the type/crc footer.
   // See table_builder.cc for the code that built this structure.
   size_t n = static_cast<size_t>(handle.size());
+
+  if (kBlockTrailerSize > std::numeric_limits<size_t>::max() - n) {
+    return errors::DataLoss("handle.size() too big");
+  }
+
   char* buf = new char[n + kBlockTrailerSize];
   StringPiece contents;
   Status s = file->Read(handle.offset(), n + kBlockTrailerSize, &contents, buf);
diff --git a/tensorflow/core/lib/io/inputbuffer_test.cc b/tensorflow/core/lib/io/inputbuffer_test.cc
index 6be1f819c2081dd4cc73853276d1cd94399614ff..3608008b30181ca5025644437740f1cd0fe1a156 100644
--- a/tensorflow/core/lib/io/inputbuffer_test.cc
+++ b/tensorflow/core/lib/io/inputbuffer_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
@@ -287,7 +288,7 @@ TEST(InputBuffer, Seek) {
     EXPECT_TRUE(errors::IsOutOfRange(in.ReadNBytes(1, &read)));
 
     EXPECT_TRUE(
-        StringPiece(in.Seek(-1).ToString()).contains("negative position"));
+        str_util::StrContains(in.Seek(-1).ToString(), "negative position"));
   }
 }
 
diff --git a/tensorflow/core/lib/io/path.cc b/tensorflow/core/lib/io/path.cc
index 83f15e134d6f60c65a7523458353ffd62345b7cc..996fbf62e5c1736f9922fcb652f65259e985a7f1 100644
--- a/tensorflow/core/lib/io/path.cc
+++ b/tensorflow/core/lib/io/path.cc
@@ -27,9 +27,9 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/lib/strings/scanner.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/mutex.h"
 
 namespace tensorflow {
 namespace io {
diff --git a/tensorflow/core/lib/io/path.h b/tensorflow/core/lib/io/path.h
index 47bb2b998d637099b3ab788f7ce274f83e4fc646..818ba99888d041f016210292a7c0cf18ef7d0e41 100644
--- a/tensorflow/core/lib/io/path.h
+++ b/tensorflow/core/lib/io/path.h
@@ -16,7 +16,6 @@ limitations under the License.
 #ifndef TENSORFLOW_LIB_IO_PATH_H_
 #define TENSORFLOW_LIB_IO_PATH_H_
 
-#include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/lib/io/recordio_test.cc b/tensorflow/core/lib/io/recordio_test.cc
index b7e51256a22b0d84e734e2a036a184b3adc3e547..63235761d92a90b1dc48029abe5aaceaf22f1f2e 100644
--- a/tensorflow/core/lib/io/recordio_test.cc
+++ b/tensorflow/core/lib/io/recordio_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/lib/io/record_reader.h"
 #include "tensorflow/core/lib/io/record_writer.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -218,7 +219,7 @@ TEST_F(RecordioTest, RandomRead) {
 
 // Tests of all the error paths in log_reader.cc follow:
 static void AssertHasSubstr(StringPiece s, StringPiece expected) {
-  EXPECT_TRUE(StringPiece(s).contains(expected))
+  EXPECT_TRUE(str_util::StrContains(s, expected))
       << s << " does not contain " << expected;
 }
 
diff --git a/tensorflow/core/lib/strings/numbers.cc b/tensorflow/core/lib/strings/numbers.cc
index f5822fad8e3d3b8559d19c79ee2885e580ea3e11..8f34baa7def340f320793660f10b6e0958dade70 100644
--- a/tensorflow/core/lib/strings/numbers.cc
+++ b/tensorflow/core/lib/strings/numbers.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include <locale>
 #include <unordered_map>
 
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
@@ -106,19 +107,22 @@ T locale_independent_strtonum(const char* str, const char** endptr) {
 
 namespace strings {
 
-char* FastInt32ToBufferLeft(int32 i, char* buffer) {
+size_t FastInt32ToBufferLeft(int32 i, char* buffer) {
   uint32 u = i;
+  size_t length = 0;
   if (i < 0) {
     *buffer++ = '-';
+    ++length;
     // We need to do the negation in modular (i.e., "unsigned")
     // arithmetic; MSVC++ apparently warns for plain "-u", so
     // we write the equivalent expression "0 - u" instead.
     u = 0 - u;
   }
-  return FastUInt32ToBufferLeft(u, buffer);
+  length += FastUInt32ToBufferLeft(u, buffer);
+  return length;
 }
 
-char* FastUInt32ToBufferLeft(uint32 i, char* buffer) {
+size_t FastUInt32ToBufferLeft(uint32 i, char* buffer) {
   char* start = buffer;
   do {
     *buffer++ = ((i % 10) + '0');
@@ -126,19 +130,22 @@ char* FastUInt32ToBufferLeft(uint32 i, char* buffer) {
   } while (i > 0);
   *buffer = 0;
   std::reverse(start, buffer);
-  return buffer;
+  return buffer - start;
 }
 
-char* FastInt64ToBufferLeft(int64 i, char* buffer) {
+size_t FastInt64ToBufferLeft(int64 i, char* buffer) {
   uint64 u = i;
+  size_t length = 0;
   if (i < 0) {
     *buffer++ = '-';
+    ++length;
     u = 0 - u;
   }
-  return FastUInt64ToBufferLeft(u, buffer);
+  length += FastUInt64ToBufferLeft(u, buffer);
+  return length;
 }
 
-char* FastUInt64ToBufferLeft(uint64 i, char* buffer) {
+size_t FastUInt64ToBufferLeft(uint64 i, char* buffer) {
   char* start = buffer;
   do {
     *buffer++ = ((i % 10) + '0');
@@ -146,19 +153,18 @@ char* FastUInt64ToBufferLeft(uint64 i, char* buffer) {
   } while (i > 0);
   *buffer = 0;
   std::reverse(start, buffer);
-  return buffer;
+  return buffer - start;
 }
 
 static const double kDoublePrecisionCheckMax = DBL_MAX / 1.000000000000001;
 
-char* DoubleToBuffer(double value, char* buffer) {
+size_t DoubleToBuffer(double value, char* buffer) {
   // DBL_DIG is 15 for IEEE-754 doubles, which are used on almost all
   // platforms these days.  Just in case some system exists where DBL_DIG
   // is significantly larger -- and risks overflowing our buffer -- we have
   // this assert.
   static_assert(DBL_DIG < 20, "DBL_DIG is too big");
 
-  bool full_precision_needed = true;
   if (std::abs(value) <= kDoublePrecisionCheckMax) {
     int snprintf_result =
         snprintf(buffer, kFastToBufferSize, "%.*g", DBL_DIG, value);
@@ -167,18 +173,20 @@ char* DoubleToBuffer(double value, char* buffer) {
     // larger than the precision we asked for.
     DCHECK(snprintf_result > 0 && snprintf_result < kFastToBufferSize);
 
-    full_precision_needed =
-        locale_independent_strtonum<double>(buffer, nullptr) != value;
+    if (locale_independent_strtonum<double>(buffer, nullptr) == value) {
+      // Round-tripping the string to double works; we're done.
+      return snprintf_result;
+    }
+    // else: full precision formatting needed. Fall through.
   }
 
-  if (full_precision_needed) {
-    int snprintf_result =
-        snprintf(buffer, kFastToBufferSize, "%.*g", DBL_DIG + 2, value);
+  int snprintf_result =
+      snprintf(buffer, kFastToBufferSize, "%.*g", DBL_DIG + 2, value);
 
-    // Should never overflow; see above.
-    DCHECK(snprintf_result > 0 && snprintf_result < kFastToBufferSize);
-  }
-  return buffer;
+  // Should never overflow; see above.
+  DCHECK(snprintf_result > 0 && snprintf_result < kFastToBufferSize);
+
+  return snprintf_result;
 }
 
 namespace {
@@ -196,7 +204,7 @@ bool safe_strto64(StringPiece str, int64* value) {
 
   int64 vlimit = kint64max;
   int sign = 1;
-  if (str.Consume("-")) {
+  if (str_util::ConsumePrefix(&str, "-")) {
     sign = -1;
     // Different limit for positive and negative integers.
     vlimit = kint64min;
@@ -258,7 +266,7 @@ bool safe_strto32(StringPiece str, int32* value) {
 
   int64 vmax = kint32max;
   int sign = 1;
-  if (str.Consume("-")) {
+  if (str_util::ConsumePrefix(&str, "-")) {
     sign = -1;
     // Different max for positive and negative integers.
     ++vmax;
@@ -325,7 +333,7 @@ bool safe_strtod(const char* str, double* value) {
   return *str != '\0' && *endptr == '\0';
 }
 
-char* FloatToBuffer(float value, char* buffer) {
+size_t FloatToBuffer(float value, char* buffer) {
   // FLT_DIG is 6 for IEEE-754 floats, which are used on almost all
   // platforms these days.  Just in case some system exists where FLT_DIG
   // is significantly larger -- and risks overflowing our buffer -- we have
@@ -347,7 +355,7 @@ char* FloatToBuffer(float value, char* buffer) {
     // Should never overflow; see above.
     DCHECK(snprintf_result > 0 && snprintf_result < kFastToBufferSize);
   }
-  return buffer;
+  return snprintf_result;
 }
 
 string FpToString(Fprint fp) {
diff --git a/tensorflow/core/lib/strings/numbers.h b/tensorflow/core/lib/strings/numbers.h
index 3c45b9027401999ba4e6c32005456312970cccba..6b7703be378cde5755a034252eba83e4be99bdc0 100644
--- a/tensorflow/core/lib/strings/numbers.h
+++ b/tensorflow/core/lib/strings/numbers.h
@@ -60,19 +60,18 @@ static const int kFastToBufferSize = 32;
 // the output.  The buffer should typically be at least kFastToBufferSize
 // bytes.
 //
-// Returns a pointer to the end of the string (i.e. the null character
-// terminating the string).
+// Returns the number of characters written.
 // ----------------------------------------------------------------------
 
-char* FastInt32ToBufferLeft(int32 i, char* buffer);    // at least 12 bytes
-char* FastUInt32ToBufferLeft(uint32 i, char* buffer);  // at least 12 bytes
-char* FastInt64ToBufferLeft(int64 i, char* buffer);    // at least 22 bytes
-char* FastUInt64ToBufferLeft(uint64 i, char* buffer);  // at least 22 bytes
+size_t FastInt32ToBufferLeft(int32 i, char* buffer);    // at least 12 bytes
+size_t FastUInt32ToBufferLeft(uint32 i, char* buffer);  // at least 12 bytes
+size_t FastInt64ToBufferLeft(int64 i, char* buffer);    // at least 22 bytes
+size_t FastUInt64ToBufferLeft(uint64 i, char* buffer);  // at least 22 bytes
 
 // Required buffer size for DoubleToBuffer is kFastToBufferSize.
 // Required buffer size for FloatToBuffer is kFastToBufferSize.
-char* DoubleToBuffer(double i, char* buffer);
-char* FloatToBuffer(float i, char* buffer);
+size_t DoubleToBuffer(double value, char* buffer);
+size_t FloatToBuffer(float value, char* buffer);
 
 // Convert a 64-bit fingerprint value to an ASCII representation.
 string FpToString(Fprint fp);
diff --git a/tensorflow/core/lib/strings/ordered_code_test.cc b/tensorflow/core/lib/strings/ordered_code_test.cc
index fee8a6f93e9a56c1d3a152683a27ad4fec8950ef..ede9f4d390180501bd65c3cbfe301da86d7530a6 100644
--- a/tensorflow/core/lib/strings/ordered_code_test.cc
+++ b/tensorflow/core/lib/strings/ordered_code_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
@@ -128,7 +129,7 @@ void TestWriteAppends(T first, U second) {
   string encoded_first_only = encoded;
   OCWriteToString<U>(&encoded, second);
   EXPECT_NE(encoded, encoded_first_only);
-  EXPECT_TRUE(StringPiece(encoded).starts_with(encoded_first_only));
+  EXPECT_TRUE(str_util::StartsWith(encoded, encoded_first_only));
 }
 
 template <typename T>
diff --git a/tensorflow/core/lib/strings/scanner.h b/tensorflow/core/lib/strings/scanner.h
index d3b63357ee71394250a2e13f7895363b8d82de29..c82e771368c1c2b8a945e070cfcd45f72f91f0a3 100644
--- a/tensorflow/core/lib/strings/scanner.h
+++ b/tensorflow/core/lib/strings/scanner.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <string>
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/macros.h"
 
 namespace tensorflow {
@@ -75,14 +76,14 @@ class Scanner {
   // Consume the next s.size() characters of the input, if they match <s>. If
   // they don't match <s>, this is a no-op.
   Scanner& ZeroOrOneLiteral(StringPiece s) {
-    cur_.Consume(s);
+    str_util::ConsumePrefix(&cur_, s);
     return *this;
   }
 
   // Consume the next s.size() characters of the input, if they match <s>. If
   // they don't match <s>, then GetResult will ultimately return false.
   Scanner& OneLiteral(StringPiece s) {
-    if (!cur_.Consume(s)) {
+    if (!str_util::ConsumePrefix(&cur_, s)) {
       error_ = true;
     }
     return *this;
diff --git a/tensorflow/core/lib/strings/str_util.cc b/tensorflow/core/lib/strings/str_util.cc
index d28857803d7ef1edd66ae6c1a6b81a7ed1dbce85..2c9e98357a1136876da57b5453f60490f4f8bb53 100644
--- a/tensorflow/core/lib/strings/str_util.cc
+++ b/tensorflow/core/lib/strings/str_util.cc
@@ -16,9 +16,11 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 
 #include <ctype.h>
+#include <algorithm>
 #include <vector>
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
 namespace str_util {
@@ -373,7 +375,7 @@ size_t RemoveWhitespaceContext(StringPiece* text) {
 }
 
 bool ConsumePrefix(StringPiece* s, StringPiece expected) {
-  if (s->starts_with(expected)) {
+  if (StartsWith(*s, expected)) {
     s->remove_prefix(expected.size());
     return true;
   }
@@ -381,7 +383,7 @@ bool ConsumePrefix(StringPiece* s, StringPiece expected) {
 }
 
 bool ConsumeSuffix(StringPiece* s, StringPiece expected) {
-  if (s->ends_with(expected)) {
+  if (EndsWith(*s, expected)) {
     s->remove_suffix(expected.size());
     return true;
   }
@@ -452,5 +454,22 @@ bool SplitAndParseAsFloats(StringPiece text, char delim,
                                     result);
 }
 
+bool StrContains(StringPiece haystack, StringPiece needle) {
+  return std::search(haystack.begin(), haystack.end(), needle.begin(),
+                     needle.end()) != haystack.end();
+}
+
+bool StartsWith(StringPiece text, StringPiece prefix) {
+  return prefix.empty() ||
+         (text.size() >= prefix.size() &&
+          memcmp(text.data(), prefix.data(), prefix.size()) == 0);
+}
+
+bool EndsWith(StringPiece text, StringPiece suffix) {
+  return suffix.empty() || (text.size() >= suffix.size() &&
+                            memcmp(text.data() + (text.size() - suffix.size()),
+                                   suffix.data(), suffix.size()) == 0);
+}
+
 }  // namespace str_util
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/strings/str_util.h b/tensorflow/core/lib/strings/str_util.h
index 44c52850fa99f7688fb496784a18b651c147bb8b..065871c1b4b05afc39da6bee13bda93359ddb913 100644
--- a/tensorflow/core/lib/strings/str_util.h
+++ b/tensorflow/core/lib/strings/str_util.h
@@ -20,7 +20,6 @@ limitations under the License.
 #include <string>
 #include <vector>
 #include "tensorflow/core/lib/core/stringpiece.h"
-#include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -141,6 +140,21 @@ bool SplitAndParseAsInts(StringPiece text, char delim,
 bool SplitAndParseAsFloats(StringPiece text, char delim,
                            std::vector<float>* result);
 
+// StartsWith()
+//
+// Returns whether a given string `text` begins with `prefix`.
+bool StartsWith(StringPiece text, StringPiece prefix);
+
+// EndsWith()
+//
+// Returns whether a given string `text` ends with `suffix`.
+bool EndsWith(StringPiece text, StringPiece suffix);
+
+// StrContains()
+//
+// Returns whether a given string `haystack` contains the substring `needle`.
+bool StrContains(StringPiece haystack, StringPiece needle);
+
 // ------------------------------------------------------------------
 // Implementation details below
 template <typename T>
diff --git a/tensorflow/core/lib/strings/str_util_test.cc b/tensorflow/core/lib/strings/str_util_test.cc
index 6d461241f7e9c5a29064c015991039d5bf95a80f..63643c3e8ed935ecea2a3430b938985ac7df85bb 100644
--- a/tensorflow/core/lib/strings/str_util_test.cc
+++ b/tensorflow/core/lib/strings/str_util_test.cc
@@ -430,4 +430,56 @@ TEST(StringReplace, EmptyStringReplaceAll) {
   EXPECT_EQ("", str_util::StringReplace("", "a", "X", /*replace_all=*/true));
 }
 
+TEST(StartsWith, Basic) {
+  const string s1(
+      "123"
+      "\0"
+      "456",
+      7);
+  const StringPiece a("foobar");
+  const StringPiece b(s1);
+  const StringPiece e;
+  EXPECT_TRUE(str_util::StartsWith(a, a));
+  EXPECT_TRUE(str_util::StartsWith(a, "foo"));
+  EXPECT_TRUE(str_util::StartsWith(a, e));
+  EXPECT_TRUE(str_util::StartsWith(b, s1));
+  EXPECT_TRUE(str_util::StartsWith(b, b));
+  EXPECT_TRUE(str_util::StartsWith(b, e));
+  EXPECT_TRUE(str_util::StartsWith(e, ""));
+  EXPECT_FALSE(str_util::StartsWith(a, b));
+  EXPECT_FALSE(str_util::StartsWith(b, a));
+  EXPECT_FALSE(str_util::StartsWith(e, a));
+}
+
+TEST(EndsWith, Basic) {
+  const string s1(
+      "123"
+      "\0"
+      "456",
+      7);
+  const StringPiece a("foobar");
+  const StringPiece b(s1);
+  const StringPiece e;
+  EXPECT_TRUE(str_util::EndsWith(a, a));
+  EXPECT_TRUE(str_util::EndsWith(a, "bar"));
+  EXPECT_TRUE(str_util::EndsWith(a, e));
+  EXPECT_TRUE(str_util::EndsWith(b, s1));
+  EXPECT_TRUE(str_util::EndsWith(b, b));
+  EXPECT_TRUE(str_util::EndsWith(b, e));
+  EXPECT_TRUE(str_util::EndsWith(e, ""));
+  EXPECT_FALSE(str_util::EndsWith(a, b));
+  EXPECT_FALSE(str_util::EndsWith(b, a));
+  EXPECT_FALSE(str_util::EndsWith(e, a));
+}
+
+TEST(StrContains, Basic) {
+  StringPiece a("abcdefg");
+  StringPiece b("abcd");
+  StringPiece c("efg");
+  StringPiece d("gh");
+  EXPECT_TRUE(str_util::StrContains(a, b));
+  EXPECT_TRUE(str_util::StrContains(a, c));
+  EXPECT_TRUE(!str_util::StrContains(a, d));
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/strings/strcat.cc b/tensorflow/core/lib/strings/strcat.cc
index 5b1cff486dba46ab761762b3076610e60d636711..f140ec3d260efdb7b82234706c5a33584c38cbb2 100644
--- a/tensorflow/core/lib/strings/strcat.cc
+++ b/tensorflow/core/lib/strings/strcat.cc
@@ -20,16 +20,12 @@ limitations under the License.
 #include <stdio.h>
 #include <string.h>
 
-#include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
 namespace strings {
 
-AlphaNum::AlphaNum(const Eigen::half &f)
-    : piece_(digits_, strlen(FloatToBuffer(static_cast<float>(f), digits_))) {}
-
 AlphaNum::AlphaNum(Hex hex) {
   char *const end = &digits_[kFastToBufferSize];
   char *writer = end;
diff --git a/tensorflow/core/lib/strings/strcat.h b/tensorflow/core/lib/strings/strcat.h
index b681f7398de1f8b26f8d8084ded3b104a1e3291f..fb2cd5bc7e5fb69650dfc2758b132d73e88375a9 100644
--- a/tensorflow/core/lib/strings/strcat.h
+++ b/tensorflow/core/lib/strings/strcat.h
@@ -27,10 +27,6 @@ limitations under the License.
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace Eigen {
-struct half;
-}
-
 // The AlphaNum type was designed to be used as the parameter type for StrCat().
 // Any routine accepting either a string or a number may accept it.
 // The basic idea is that by accepting a "const AlphaNum &" as an argument
@@ -105,24 +101,23 @@ class AlphaNum {
   // A bool ctor would also convert incoming pointers (bletch).
 
   AlphaNum(int i32)  // NOLINT(runtime/explicit)
-      : piece_(digits_, FastInt32ToBufferLeft(i32, digits_) - &digits_[0]) {}
+      : piece_(digits_, FastInt32ToBufferLeft(i32, digits_)) {}
   AlphaNum(unsigned int u32)  // NOLINT(runtime/explicit)
-      : piece_(digits_, FastUInt32ToBufferLeft(u32, digits_) - &digits_[0]) {}
+      : piece_(digits_, FastUInt32ToBufferLeft(u32, digits_)) {}
   AlphaNum(long x)  // NOLINT(runtime/explicit)
-      : piece_(digits_, FastInt64ToBufferLeft(x, digits_) - &digits_[0]) {}
+      : piece_(digits_, FastInt64ToBufferLeft(x, digits_)) {}
   AlphaNum(unsigned long x)  // NOLINT(runtime/explicit)
-      : piece_(digits_, FastUInt64ToBufferLeft(x, digits_) - &digits_[0]) {}
+      : piece_(digits_, FastUInt64ToBufferLeft(x, digits_)) {}
   AlphaNum(long long int i64)  // NOLINT(runtime/explicit)
-      : piece_(digits_, FastInt64ToBufferLeft(i64, digits_) - &digits_[0]) {}
+      : piece_(digits_, FastInt64ToBufferLeft(i64, digits_)) {}
   AlphaNum(unsigned long long int u64)  // NOLINT(runtime/explicit)
-      : piece_(digits_, FastUInt64ToBufferLeft(u64, digits_) - &digits_[0]) {}
+      : piece_(digits_, FastUInt64ToBufferLeft(u64, digits_)) {}
 
   AlphaNum(float f)  // NOLINT(runtime/explicit)
-      : piece_(digits_, strlen(FloatToBuffer(f, digits_))) {}
+      : piece_(digits_, FloatToBuffer(f, digits_)) {}
   AlphaNum(double f)  // NOLINT(runtime/explicit)
-      : piece_(digits_, strlen(DoubleToBuffer(f, digits_))) {}
+      : piece_(digits_, DoubleToBuffer(f, digits_)) {}
 
-  AlphaNum(const Eigen::half &f);  // NOLINT(runtime/explicit)
   AlphaNum(Hex hex);               // NOLINT(runtime/explicit)
 
   AlphaNum(const char *c_str) : piece_(c_str) {}   // NOLINT(runtime/explicit)
diff --git a/tensorflow/core/lib/strings/strcat_test.cc b/tensorflow/core/lib/strings/strcat_test.cc
index 7cb186e6375fae4d8a7140dd2f9ee6e7e64ddd1a..8cc64a6f0aecfd3dcce772b9a6c5c30ced86ba12 100644
--- a/tensorflow/core/lib/strings/strcat_test.cc
+++ b/tensorflow/core/lib/strings/strcat_test.cc
@@ -17,7 +17,6 @@ limitations under the License.
 
 #include <string>
 
-#include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
@@ -131,11 +130,6 @@ TEST(StrCat, Basics) {
   result = tensorflow::strings::StrCat("A hundred K and a half squared is ", d);
   EXPECT_EQ(result, "A hundred K and a half squared is 10000100000.25");
 
-  Eigen::half h(10007.0f);
-  result =
-      tensorflow::strings::StrCat("Ten thousand seven is approximately ", h);
-  EXPECT_EQ(result, "Ten thousand seven is approximately 10008");
-
   result = tensorflow::strings::StrCat(1, 2, 333, 4444, 55555, 666666, 7777777,
                                        88888888, 999999999);
   EXPECT_EQ(result, "12333444455555666666777777788888888999999999");
diff --git a/tensorflow/core/lib/wav/wav_io.cc b/tensorflow/core/lib/wav/wav_io.cc
index 2165415ba503166d187b37d2a7f27a4af37fcb64..51b9c6cd82c4769cfd333e91177bc9b7ba5e38de 100644
--- a/tensorflow/core/lib/wav/wav_io.cc
+++ b/tensorflow/core/lib/wav/wav_io.cc
@@ -95,10 +95,6 @@ Status IncrementOffset(int old_offset, size_t increment, size_t max_size,
     return errors::InvalidArgument("Initial offset is outside data range: ",
                                    old_offset);
   }
-  if (increment < 0) {
-    return errors::InvalidArgument("Negative increments are not allowed: ",
-                                   old_offset);
-  }
   *new_offset = old_offset + increment;
   if (*new_offset > max_size) {
     return errors::InvalidArgument("Data too short when trying to read string");
diff --git a/tensorflow/core/lib/wav/wav_io_test.cc b/tensorflow/core/lib/wav/wav_io_test.cc
index d8a83fc464b33d274aa4f8174132980275fd8598..9e41da6a20dc5c1491786a5391e12ca7aa2e722c 100644
--- a/tensorflow/core/lib/wav/wav_io_test.cc
+++ b/tensorflow/core/lib/wav/wav_io_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -203,7 +204,7 @@ TEST(WavIO, ChunkSizeOverflow) {
       wav_data_string, &decoded_audio, &decoded_sample_count,
       &decoded_channel_count, &decoded_sample_rate);
   EXPECT_FALSE(decode_status.ok());
-  EXPECT_TRUE(StringPiece(decode_status.error_message()).contains("too large"))
+  EXPECT_TRUE(str_util::StrContains(decode_status.error_message(), "too large"))
       << decode_status.error_message();
 }
 
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index 39b92464cb8f626d5581e7be1347d8c735f8277e..62ce70eb6b4bc9621f49b06d0eead85b6c76d17c 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -178,46 +178,88 @@ Status SetOutputShapeForReshape(InferenceContext* c) {
     c->set_output(0, out);
     return Status::OK();
   }
-  DimensionHandle num_in_elems = c->NumElements(in);
-  if (c->FullyDefined(out)) {
-    DimensionHandle num_out_elems = c->NumElements(out);
-    if (c->ValueKnown(num_in_elems) &&
-        c->Value(num_in_elems) != c->Value(num_out_elems)) {
-      return errors::InvalidArgument(
-          "Cannot reshape a tensor with ", c->DebugString(num_in_elems),
-          " elements to shape ", c->DebugString(out), " (",
-          c->DebugString(num_out_elems), " elements)");
-    }
-    c->set_output(0, out);
-    return Status::OK();
-  }
 
-  if (c->ValueKnown(num_in_elems)) {
+  if (c->RankKnown(out) && c->RankKnown(in)) {
     // We don't know the number of output elements, but we can try to infer
     // the missing dimension.
-    int32 unknown_idx = -1;
     bool too_many_unknown = false;
-    DimensionHandle known_elems = c->MakeDim(1);
-    for (int32 i = 0; i < c->Rank(out); ++i) {
-      DimensionHandle dim = c->Dim(out, i);
-      if (!c->ValueKnown(dim)) {
-        if (unknown_idx >= 0) {
-          too_many_unknown = true;
-          break;
+    int32 out_unknown_idx = -1;
+
+    DimensionHandle known_out_elems = c->NumElements(out);
+    if (!c->ValueKnown(known_out_elems)) {
+      known_out_elems = c->MakeDim(1);
+      for (int32 i = 0; i < c->Rank(out); ++i) {
+        DimensionHandle dim = c->Dim(out, i);
+        if (!c->ValueKnown(dim)) {
+          if (out_unknown_idx >= 0) {
+            too_many_unknown = true;
+            break;
+          }
+          out_unknown_idx = i;
+        } else {
+          TF_RETURN_IF_ERROR(
+              c->Multiply(known_out_elems, dim, &known_out_elems));
         }
-        unknown_idx = i;
-      } else {
-        TF_RETURN_IF_ERROR(c->Multiply(known_elems, dim, &known_elems));
       }
     }
-    if (!too_many_unknown && c->Value(known_elems) != 0) {
-      DimensionHandle inferred_dim;
-      TF_RETURN_IF_ERROR(c->Divide(num_in_elems, c->Value(known_elems),
-                                   true /* evenly_divisible */, &inferred_dim));
-      TF_RETURN_IF_ERROR(c->ReplaceDim(out, unknown_idx, inferred_dim, &out));
+    int32 in_unknown_idx = -1;
+    DimensionHandle known_in_elems = c->NumElements(in);
+    if (!c->ValueKnown(known_in_elems)) {
+      known_in_elems = c->MakeDim(1);
+      for (int32 i = 0; i < c->Rank(in); ++i) {
+        DimensionHandle dim = c->Dim(in, i);
+        if (!c->ValueKnown(dim)) {
+          if (in_unknown_idx >= 0) {
+            too_many_unknown = true;
+            break;
+          }
+          in_unknown_idx = i;
+        } else {
+          TF_RETURN_IF_ERROR(c->Multiply(known_in_elems, dim, &known_in_elems));
+        }
+      }
     }
-  }
 
+    if (!too_many_unknown) {
+      if (in_unknown_idx < 0 && out_unknown_idx < 0) {
+        // Just check that the dimensions match.
+        if (c->Value(known_in_elems) != c->Value(known_out_elems)) {
+          return errors::InvalidArgument(
+              "Cannot reshape a tensor with ", c->DebugString(known_in_elems),
+              " elements to shape ", c->DebugString(out), " (",
+              c->DebugString(known_out_elems), " elements)");
+        }
+      } else if (in_unknown_idx < 0 && out_unknown_idx >= 0 &&
+                 c->Value(known_out_elems) > 0) {
+        // Input fully known, infer the one missing output dim
+        DimensionHandle inferred_dim;
+        TF_RETURN_IF_ERROR(c->Divide(known_in_elems, c->Value(known_out_elems),
+                                     true /* evenly_divisible */,
+                                     &inferred_dim));
+        TF_RETURN_IF_ERROR(
+            c->ReplaceDim(out, out_unknown_idx, inferred_dim, &out));
+
+      } else if (in_unknown_idx >= 0 && out_unknown_idx < 0 &&
+                 c->Value(known_in_elems) != 0) {
+        // Output fully known, infer the one missing input dim
+        DimensionHandle inferred_dim;
+        TF_RETURN_IF_ERROR(c->Divide(known_out_elems, c->Value(known_in_elems),
+                                     true /* evenly_divisible */,
+                                     &inferred_dim));
+        DimensionHandle unknown_in_dim = c->Dim(in, in_unknown_idx);
+        TF_RETURN_IF_ERROR(
+            c->Merge(unknown_in_dim, inferred_dim, &unknown_in_dim));
+      } else if (in_unknown_idx >= 0 && out_unknown_idx >= 0) {
+        // Exactly one unknown dimension in both input and output. These 2 are
+        // equal iff the known elements are equal.
+        if (c->Value(known_in_elems) == c->Value(known_out_elems)) {
+          DimensionHandle unknown_in_dim = c->Dim(in, in_unknown_idx);
+          TF_RETURN_IF_ERROR(
+              c->ReplaceDim(out, out_unknown_idx, unknown_in_dim, &out));
+        }
+      }
+    }
+  }
   c->set_output(0, out);
   return Status::OK();
 }
@@ -452,9 +494,9 @@ REGISTER_OP("SplitV")
       const Tensor* size_splits = c->input_tensor(1);
       if (rank == InferenceContext::kUnknownRank) {
         // If the rank of input tensor is unknown, then return unknown shapes.
-        output_shape = c->UnknownShape();
+        // Note that the shape of each output can be different.
         for (int i = 0; i < num_outputs; ++i) {
-          c->set_output(i, output_shape);
+          c->set_output(i, c->UnknownShape());
         }
       } else if (rank == 0) {
         // Throw error if input is a scalar.
@@ -463,18 +505,19 @@ REGISTER_OP("SplitV")
         // If split dimension is known, but the sizes are unknown, then
         // only the split dimension is unknown
         output_shape = input;
-        TF_RETURN_IF_ERROR(c->ReplaceDim(output_shape,
-                                         c->Value(split_dimension),
-                                         c->UnknownDim(), &output_shape));
         for (int i = 0; i < num_outputs; ++i) {
+          TF_RETURN_IF_ERROR(c->ReplaceDim(output_shape,
+                                           c->Value(split_dimension),
+                                           c->UnknownDim(), &output_shape));
           c->set_output(i, output_shape);
         }
       } else if (size_splits == nullptr && !c->ValueKnown(split_dimension)) {
         // If split dimension or tensor containing the split sizes is unknown,
-        // then return unknown shapes of same rank as input.
-        output_shape = c->UnknownShapeOfRank(rank);
+        // then return unknown shapes of same rank as input. Note that each
+        // output shape can be different since splitv doesn't always split
+        // tensors evenly.
         for (int i = 0; i < num_outputs; ++i) {
-          c->set_output(i, output_shape);
+          c->set_output(i, c->UnknownShapeOfRank(rank));
         }
       } else {
         // Determine the output shape if split dimension and split sizes are
@@ -752,11 +795,35 @@ REGISTER_OP("ReverseV2")
       ShapeHandle input = c->input(0);
       ShapeHandle axis;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &axis));
-      // TODO(aselle): if input(0)'s dimension is known we could validate axis
       if (c->Rank(input) > 8) {
         return errors::InvalidArgument(
             "reverse does not work on tensors with more than 8 dimensions");
       }
+      const Tensor* axis_tensor = c->input_tensor(1);
+      if (axis_tensor != nullptr && c->RankKnown(input)) {
+        int32 rank = c->Rank(input);
+        std::vector<int64> axis_value;
+        if (axis_tensor->dtype() == DT_INT32) {
+          axis_value = AsInt64<int32>(axis_tensor, axis_tensor->NumElements());
+        } else {
+          axis_value = AsInt64<int64>(axis_tensor, axis_tensor->NumElements());
+        }
+        std::vector<bool> axes_dense(c->Rank(input), false);
+        for (int i = 0; i < axis_value.size(); i++) {
+          int64 canonical_axis =
+              axis_value[i] < 0 ? rank + axis_value[i] : axis_value[i];
+          if (canonical_axis < 0 || canonical_axis >= rank) {
+            return errors::InvalidArgument("'axis'[", i, "] = ", axis_value[i],
+                                           " is out of valid range [", 0, ", ",
+                                           rank - 1);
+          }
+          if (axes_dense[canonical_axis]) {
+            return errors::InvalidArgument("axis ", canonical_axis,
+                                           " specified more than once.");
+          }
+          axes_dense[canonical_axis] = true;
+        }
+      }
       c->set_output(0, input);
       return Status::OK();
     });
diff --git a/tensorflow/core/ops/array_ops_test.cc b/tensorflow/core/ops/array_ops_test.cc
index cf5bb5ad849571c92f7dccf4d0fdc5780965567c..b1463338fbe726e10a3fb0a2cdc69521ab021ce6 100644
--- a/tensorflow/core/ops/array_ops_test.cc
+++ b/tensorflow/core/ops/array_ops_test.cc
@@ -838,7 +838,7 @@ TEST(ArrayOpsTest, Reshape_ShapeFn) {
   // Unknown dimensions.
   // Flatten:
   new_shape = test::AsTensor<int32>({-1});
-  INFER_OK(op, "[?];[1]", "[?]");
+  INFER_OK(op, "[?];[1]", "[d0_0]");
   INFER_OK(op, "[2,2];[1]", "[4]");
   // The first dimension is inferred:
   new_shape = test::AsTensor<int32>({2, -1});
@@ -851,6 +851,10 @@ TEST(ArrayOpsTest, Reshape_ShapeFn) {
   new_shape = test::AsTensor<int32>({-1, -1, 2});
   INFER_OK(op, "[8];[3]", "[?,?,2]");
 
+  // Symbolic shape propagation
+  new_shape = test::AsTensor<int32>({-1, 2, 3});
+  INFER_OK(op, "[?,2,3];[3]", "[d0_0,2,3]");
+
   // Reshaping to a scalar.
   new_shape = test::AsTensor<int32>({});
   INFER_OK(op, "[1];[0]", "[]");
diff --git a/tensorflow/core/ops/boosted_trees_ops.cc b/tensorflow/core/ops/boosted_trees_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..297e94655fe3c62893291de2a256b177222cd7a2
--- /dev/null
+++ b/tensorflow/core/ops/boosted_trees_ops.cc
@@ -0,0 +1,319 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <vector>
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+REGISTER_RESOURCE_HANDLE_OP(BoostedTreesEnsembleResource);
+
+REGISTER_OP("IsBoostedTreesEnsembleInitialized")
+    .Input("tree_ensemble_handle: resource")
+    .Output("is_initialized: bool")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused_input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
+      c->set_output(0, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("BoostedTreesCalculateBestGainsPerFeature")
+    .Input("node_id_range: int32")
+    .Input("stats_summary_list: num_features * float32")
+    .Attr("l1: float")
+    .Attr("l2: float")
+    .Attr("tree_complexity: float")
+    .Attr("max_splits: int >= 1")
+    .Attr("num_features: int >= 1")  // not passed but populated automatically.
+    .Output("node_ids_list: num_features * int32")
+    .Output("gains_list: num_features * float32")
+    .Output("thresholds_list: num_features * int32")
+    .Output("left_node_contribs_list: num_features * float32")
+    .Output("right_node_contribs_list: num_features * float32")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      // Confirms the rank of the inputs and sets the shape of the outputs.
+      int max_splits;
+      int num_features;
+      float l1, l2, tree_complexity;
+      TF_RETURN_IF_ERROR(c->GetAttr("l1", &l1));
+      if (l1 < 0) {
+        return errors::InvalidArgument("l1 must be non-negative.");
+      }
+      TF_RETURN_IF_ERROR(c->GetAttr("l2", &l2));
+      if (l2 < 0) {
+        return errors::InvalidArgument("l2 must be non-negative.");
+      }
+      TF_RETURN_IF_ERROR(c->GetAttr("tree_complexity", &tree_complexity));
+      if (tree_complexity < 0) {
+        return errors::InvalidArgument("Tree complexity must be non-negative.");
+      }
+      TF_RETURN_IF_ERROR(c->GetAttr("max_splits", &max_splits));
+      TF_RETURN_IF_ERROR(c->GetAttr("num_features", &num_features));
+      shape_inference::ShapeHandle node_id_range_shape;
+      shape_inference::ShapeHandle unused_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &node_id_range_shape));
+      TF_RETURN_IF_ERROR(
+          c->Merge(node_id_range_shape, c->MakeShape({2}), &unused_shape));
+      // Checks that all stats summary entries are of the same shape.
+      shape_inference::ShapeHandle summary_shape_base;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 3, &summary_shape_base));
+      TF_RETURN_IF_ERROR(c->Merge(summary_shape_base,
+                                  c->MakeShape({max_splits, -1, 2}),
+                                  &unused_shape));
+      for (int i = 1; i < num_features; ++i) {
+        shape_inference::ShapeHandle summary_shape;
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(1 + i), 3, &summary_shape));
+        TF_RETURN_IF_ERROR(
+            c->Merge(summary_shape_base, summary_shape, &unused_shape));
+      }
+      // Sets the output lists.
+      std::vector<shape_inference::ShapeHandle> output_shapes_vec(
+          num_features, c->MakeShape({-1}));
+      TF_RETURN_IF_ERROR(c->set_output("node_ids_list", output_shapes_vec));
+      TF_RETURN_IF_ERROR(c->set_output("gains_list", output_shapes_vec));
+      TF_RETURN_IF_ERROR(c->set_output("thresholds_list", output_shapes_vec));
+      std::vector<shape_inference::ShapeHandle> output_shapes_contribs(
+          num_features, c->MakeShape({-1, 1}));
+      TF_RETURN_IF_ERROR(
+          c->set_output("left_node_contribs_list", output_shapes_contribs));
+      TF_RETURN_IF_ERROR(
+          c->set_output("right_node_contribs_list", output_shapes_contribs));
+      return Status::OK();
+    });
+
+REGISTER_OP("BoostedTreesCreateEnsemble")
+    .Input("tree_ensemble_handle: resource")
+    .Input("stamp_token: int64")
+    .Input("tree_ensemble_serialized: string")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused_input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused_input));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused_input));
+      return Status::OK();
+    });
+
+REGISTER_OP("BoostedTreesDeserializeEnsemble")
+    .Input("tree_ensemble_handle: resource")
+    .Input("stamp_token: int64")
+    .Input("tree_ensemble_serialized: string")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused_input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused_input));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused_input));
+      return Status::OK();
+    });
+
+REGISTER_OP("BoostedTreesGetEnsembleStates")
+    .Input("tree_ensemble_handle: resource")
+    .Output("stamp_token: int64")
+    .Output("num_trees: int32")
+    .Output("num_finalized_trees: int32")
+    .Output("num_attempted_layers: int32")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused_input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
+      c->set_output(0, c->Scalar());
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      c->set_output(3, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("BoostedTreesMakeStatsSummary")
+    .Input("node_ids: int32")
+    .Input("gradients: float")
+    .Input("hessians: float")
+    .Input("bucketized_features_list: num_features * int32")
+    .Attr("max_splits: int >= 1")
+    .Attr("num_buckets: int >= 1")
+    .Attr("num_features: int >= 1")
+    .Output("stats_summary: float")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      // Sets the shape of the output as a Rank 4 Tensor.
+      int max_splits;
+      int num_buckets;
+      int num_features;
+      TF_RETURN_IF_ERROR(c->GetAttr("max_splits", &max_splits));
+      TF_RETURN_IF_ERROR(c->GetAttr("num_buckets", &num_buckets));
+      TF_RETURN_IF_ERROR(c->GetAttr("num_features", &num_features));
+      shape_inference::ShapeHandle node_ids_shape;
+      shape_inference::ShapeHandle gradients_shape;
+      shape_inference::ShapeHandle hessians_shape;
+      shape_inference::ShapeHandle bucketized_feature_shape;
+      shape_inference::ShapeHandle unused_shape;
+      shape_inference::DimensionHandle unused_dim;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &node_ids_shape));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 2, &gradients_shape));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 2, &hessians_shape));
+      TF_RETURN_IF_ERROR(c->Merge(c->Dim(node_ids_shape, 0),
+                                  c->Dim(gradients_shape, 0), &unused_dim));
+      TF_RETURN_IF_ERROR(
+          c->Merge(gradients_shape, hessians_shape, &unused_shape));
+      for (int f = 0; f < num_features; ++f) {
+        TF_RETURN_IF_ERROR(
+            c->WithRank(c->input(3 + f), 1, &bucketized_feature_shape));
+        TF_RETURN_IF_ERROR(c->Merge(c->Dim(node_ids_shape, 0),
+                                    c->Dim(bucketized_feature_shape, 0),
+                                    &unused_dim));
+      }
+      c->set_output(0,
+                    c->MakeShape({num_features, max_splits, num_buckets, 2}));
+      return Status::OK();
+    });
+
+REGISTER_OP("BoostedTreesPredict")
+    .Input("tree_ensemble_handle: resource")
+    .Input("bucketized_features: num_bucketized_features * int32")
+    .Attr("num_bucketized_features: int >= 1")
+    .Attr("logits_dimension: int")
+    .Attr("max_depth: int >= 1")
+    .Output("logits: float")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle feature_shape;
+      int num_bucketized_features;
+      TF_RETURN_IF_ERROR(
+          c->GetAttr("num_bucketized_features", &num_bucketized_features));
+      shape_inference::ShapeHandle unused_input;
+      for (int i = 0; i < num_bucketized_features; ++i) {
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(i + 1), 1, &feature_shape));
+        // Check that the shapes of all bucketized features are the same.
+        TF_RETURN_IF_ERROR(c->Merge(c->input(1), feature_shape, &unused_input));
+      }
+
+      int logits_dimension;
+      TF_RETURN_IF_ERROR(c->GetAttr("logits_dimension", &logits_dimension));
+      auto logits_shape =
+          c->MakeShape({c->Dim(feature_shape, 0), logits_dimension});
+      // Logits.
+      c->set_output(0, logits_shape);
+      return Status::OK();
+    });
+
+REGISTER_OP("BoostedTreesSerializeEnsemble")
+    .Input("tree_ensemble_handle: resource")
+    .Output("stamp_token: int64")
+    .Output("tree_ensemble_serialized: string")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused_input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
+      c->set_output(0, c->Scalar());
+      c->set_output(1, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("BoostedTreesTrainingPredict")
+    .Input("tree_ensemble_handle: resource")
+    .Input("cached_tree_ids: int32")
+    .Input("cached_node_ids: int32")
+    .Input("bucketized_features: num_bucketized_features * int32")
+    .Attr("num_bucketized_features: int >= 1")
+    .Attr("logits_dimension: int")
+    .Attr("max_depth: int >= 1")
+    .Output("partial_logits: float")
+    .Output("tree_ids: int32")
+    .Output("node_ids: int32")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle feature_shape;
+      int num_bucketized_features;
+      TF_RETURN_IF_ERROR(
+          c->GetAttr("num_bucketized_features", &num_bucketized_features));
+
+      int max_depth;
+      TF_RETURN_IF_ERROR(c->GetAttr("max_depth", &max_depth));
+
+      shape_inference::ShapeHandle unused_input;
+      for (int i = 0; i < num_bucketized_features; ++i) {
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(i + 3), 1, &feature_shape));
+        TF_RETURN_IF_ERROR(
+            c->Merge(c->input(i + 3), feature_shape, &unused_input));
+      }
+      // all inputs/outputs except logits should have same shape.
+      TF_RETURN_IF_ERROR(c->Merge(c->input(1), feature_shape, &unused_input));
+      TF_RETURN_IF_ERROR(c->Merge(c->input(2), feature_shape, &unused_input));
+
+      int logits_dimension;
+      TF_RETURN_IF_ERROR(c->GetAttr("logits_dimension", &logits_dimension));
+      auto logits_shape =
+          c->MakeShape({c->Dim(feature_shape, 0), logits_dimension});
+      // Partial logits.
+      c->set_output(0, logits_shape);
+      // Tree ids.
+      c->set_output(1, c->MakeShape({c->Dim(feature_shape, 0)}));
+      // Node ids.
+      c->set_output(2, c->MakeShape({c->Dim(feature_shape, 0)}));
+      return Status::OK();
+    });
+
+REGISTER_OP("BoostedTreesUpdateEnsemble")
+    .Input("tree_ensemble_handle: resource")
+    .Input("feature_ids: int32")
+    .Input("node_ids: num_features * int32")
+    .Input("gains: num_features * float")
+    .Input("thresholds: num_features * int32")
+    .Input("left_node_contribs: num_features * float")
+    .Input("right_node_contribs: num_features * float")
+    .Attr("max_depth: int >= 1")
+    .Attr("learning_rate: float")
+    .Attr("pruning_mode: int >=0")
+    .Attr("num_features: int >= 0")  // Inferred.
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle shape_handle;
+      int num_features;
+      TF_RETURN_IF_ERROR(c->GetAttr("num_features", &num_features));
+
+      // Feature_ids, should be one for each feature.
+      shape_inference::ShapeHandle feature_ids_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &feature_ids_shape));
+      TF_RETURN_IF_ERROR(
+          c->Merge(c->input(1), c->Vector(num_features), &shape_handle));
+
+      for (int i = 0; i < num_features; ++i) {
+        // Node ids.
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(i + 2), 1, &shape_handle));
+        auto shape_rank_1 = c->MakeShape({c->Dim(shape_handle, 0)});
+        auto shape_rank_2 = c->MakeShape({c->Dim(shape_handle, 0), 1});
+
+        // Gains.
+        TF_RETURN_IF_ERROR(
+            c->WithRank(c->input(i + num_features + 2), 1, &shape_handle));
+        // TODO(nponomareva): replace this with input("name",vector of shapes).
+        TF_RETURN_IF_ERROR(c->Merge(c->input(i + num_features + 2),
+                                    shape_rank_1, &shape_handle));
+        // Thresholds.
+        TF_RETURN_IF_ERROR(
+            c->WithRank(c->input(i + num_features * 2 + 2), 1, &shape_handle));
+        TF_RETURN_IF_ERROR(c->Merge(c->input(i + num_features * 2 + 2),
+                                    shape_rank_1, &shape_handle));
+        // Left and right node contribs.
+        TF_RETURN_IF_ERROR(
+            c->WithRank(c->input(i + num_features * 3 + 2), 2, &shape_handle));
+        TF_RETURN_IF_ERROR(c->Merge(c->input(i + num_features * 3 + 2),
+                                    shape_rank_2, &shape_handle));
+        TF_RETURN_IF_ERROR(
+            c->WithRank(c->input(i + num_features * 4 + 2), 2, &shape_handle));
+        TF_RETURN_IF_ERROR(c->Merge(c->input(i + num_features * 4 + 2),
+                                    shape_rank_2, &shape_handle));
+      }
+      return Status::OK();
+    });
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/compat/BUILD b/tensorflow/core/ops/compat/BUILD
index 6cdb1586bc826f5de4c926c0b5f7bf0f9285cd93..c613ab144f8824586121200b3f89c87b25cc7522 100644
--- a/tensorflow/core/ops/compat/BUILD
+++ b/tensorflow/core/ops/compat/BUILD
@@ -57,18 +57,3 @@ tf_cc_binary(
         "//tensorflow/core:lib",
     ],
 )
-
-# -----------------------------------------------------------------------------
-# Google-internal targets.  These must be at the end for syncrepo.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 85dd1a423a5f4627eca36c4b9cbb37a57feac229..10b24c2d3426ddb37d521fbbf4197318b9789d08 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -10340,6 +10340,342 @@ op {
   }
   is_commutative: true
 }
+op {
+  name: "BoostedTreesCalculateBestGainsPerFeature"
+  input_arg {
+    name: "node_id_range"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "stats_summary_list"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  output_arg {
+    name: "node_ids_list"
+    type: DT_INT32
+    number_attr: "num_features"
+  }
+  output_arg {
+    name: "gains_list"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  output_arg {
+    name: "thresholds_list"
+    type: DT_INT32
+    number_attr: "num_features"
+  }
+  output_arg {
+    name: "left_node_contribs_list"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  output_arg {
+    name: "right_node_contribs_list"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  attr {
+    name: "l1"
+    type: "float"
+  }
+  attr {
+    name: "l2"
+    type: "float"
+  }
+  attr {
+    name: "tree_complexity"
+    type: "float"
+  }
+  attr {
+    name: "max_splits"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_features"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "BoostedTreesCreateEnsemble"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "stamp_token"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "tree_ensemble_serialized"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesDeserializeEnsemble"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "stamp_token"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "tree_ensemble_serialized"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesEnsembleResourceHandleOp"
+  output_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesGetEnsembleStates"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "stamp_token"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "num_trees"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "num_finalized_trees"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "num_attempted_layers"
+    type: DT_INT32
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesMakeStatsSummary"
+  input_arg {
+    name: "node_ids"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "gradients"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "hessians"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "bucketized_features_list"
+    type: DT_INT32
+    number_attr: "num_features"
+  }
+  output_arg {
+    name: "stats_summary"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "max_splits"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_buckets"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_features"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "BoostedTreesPredict"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "bucketized_features"
+    type: DT_INT32
+    number_attr: "num_bucketized_features"
+  }
+  output_arg {
+    name: "logits"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_bucketized_features"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "logits_dimension"
+    type: "int"
+  }
+  attr {
+    name: "max_depth"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesSerializeEnsemble"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "stamp_token"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "tree_ensemble_serialized"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesTrainingPredict"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "cached_tree_ids"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "cached_node_ids"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "bucketized_features"
+    type: DT_INT32
+    number_attr: "num_bucketized_features"
+  }
+  output_arg {
+    name: "partial_logits"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "tree_ids"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "node_ids"
+    type: DT_INT32
+  }
+  attr {
+    name: "num_bucketized_features"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "logits_dimension"
+    type: "int"
+  }
+  attr {
+    name: "max_depth"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesUpdateEnsemble"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "feature_ids"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "node_ids"
+    type: DT_INT32
+    number_attr: "num_features"
+  }
+  input_arg {
+    name: "gains"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  input_arg {
+    name: "thresholds"
+    type: DT_INT32
+    number_attr: "num_features"
+  }
+  input_arg {
+    name: "left_node_contribs"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  input_arg {
+    name: "right_node_contribs"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  attr {
+    name: "max_depth"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "learning_rate"
+    type: "float"
+  }
+  attr {
+    name: "pruning_mode"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "num_features"
+    type: "int"
+    has_minimum: true
+  }
+  is_stateful: true
+}
 op {
   name: "BroadcastArgs"
   input_arg {
@@ -10867,6 +11203,14 @@ op {
     }
   }
 }
+op {
+  name: "CloseSummaryWriter"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  is_stateful: true
+}
 op {
   name: "CompareAndBitpack"
   input_arg {
@@ -11602,6 +11946,142 @@ op {
     }
   }
 }
+op {
+  name: "Conv2D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "Conv2DBackpropFilter"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+}
 op {
   name: "Conv2DBackpropFilter"
   input_arg {
@@ -11626,6 +12106,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
       }
     }
@@ -11664,6 +12145,18 @@ op {
       }
     }
   }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
 }
 op {
   name: "Conv2DBackpropFilter"
@@ -11691,6 +12184,7 @@ op {
         type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
@@ -11881,15 +12375,19 @@ op {
   }
 }
 op {
-  name: "Conv3D"
+  name: "Conv2DBackpropInput"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "input_sizes"
+    type: DT_INT32
   }
   input_arg {
     name: "filter"
     type_attr: "T"
   }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
   output_arg {
     name: "output"
     type_attr: "T"
@@ -11899,6 +12397,8 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -11907,8 +12407,13 @@ op {
   attr {
     name: "strides"
     type: "list(int)"
-    has_minimum: true
-    minimum: 5
+  }
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
   }
   attr {
     name: "padding"
@@ -11920,6 +12425,31 @@ op {
       }
     }
   }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
 }
 op {
   name: "Conv3D"
@@ -11961,19 +12491,6 @@ op {
       }
     }
   }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NDHWC"
-    }
-    allowed_values {
-      list {
-        s: "NDHWC"
-        s: "NCDHW"
-      }
-    }
-  }
 }
 op {
   name: "Conv3D"
@@ -11994,8 +12511,6 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -12030,22 +12545,9 @@ op {
       }
     }
   }
-  attr {
-    name: "dilations"
-    type: "list(int)"
-    default_value {
-      list {
-        i: 1
-        i: 1
-        i: 1
-        i: 1
-        i: 1
-      }
-    }
-  }
 }
 op {
-  name: "Conv3DBackpropFilter"
+  name: "Conv3D"
   input_arg {
     name: "input"
     type_attr: "T"
@@ -12054,10 +12556,6 @@ op {
     name: "filter"
     type_attr: "T"
   }
-  input_arg {
-    name: "out_backprop"
-    type_attr: "T"
-  }
   output_arg {
     name: "output"
     type_attr: "T"
@@ -12067,6 +12565,8 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -12088,8 +12588,31 @@ op {
       }
     }
   }
-  deprecation {
-    version: 10
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
   }
 }
 op {
@@ -12115,7 +12638,55 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  deprecation {
+    version: 10
+  }
+}
+op {
+  name: "Conv3DBackpropFilter"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -12822,6 +13393,54 @@ op {
     }
   }
 }
+op {
+  name: "CreateSummaryDbWriter"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "db_uri"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "experiment_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "run_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "user_name"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "CreateSummaryFileWriter"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "logdir"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "max_queue"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "flush_millis"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filename_suffix"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
 op {
   name: "CropAndResize"
   input_arg {
@@ -13224,6 +13843,582 @@ op {
     }
   }
 }
+op {
+  name: "CudnnRNN"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_h"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_c"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "params"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_h"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_c"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "reserve_space"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "rnn_mode"
+    type: "string"
+    default_value {
+      s: "lstm"
+    }
+    allowed_values {
+      list {
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
+      }
+    }
+  }
+  attr {
+    name: "input_mode"
+    type: "string"
+    default_value {
+      s: "linear_input"
+    }
+    allowed_values {
+      list {
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
+      }
+    }
+  }
+  attr {
+    name: "direction"
+    type: "string"
+    default_value {
+      s: "unidirectional"
+    }
+    allowed_values {
+      list {
+        s: "unidirectional"
+        s: "bidirectional"
+      }
+    }
+  }
+  attr {
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "is_training"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "CudnnRNNBackprop"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_h"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_c"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "params"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_h"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_c"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_backprop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_h_backprop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_c_backprop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reserve_space"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "input_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "input_h_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "input_c_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "params_backprop"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "rnn_mode"
+    type: "string"
+    default_value {
+      s: "lstm"
+    }
+    allowed_values {
+      list {
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
+      }
+    }
+  }
+  attr {
+    name: "input_mode"
+    type: "string"
+    default_value {
+      s: "linear_input"
+    }
+    allowed_values {
+      list {
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
+      }
+    }
+  }
+  attr {
+    name: "direction"
+    type: "string"
+    default_value {
+      s: "unidirectional"
+    }
+    allowed_values {
+      list {
+        s: "unidirectional"
+        s: "bidirectional"
+      }
+    }
+  }
+  attr {
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "CudnnRNNCanonicalToParams"
+  input_arg {
+    name: "num_layers"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "num_units"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "input_size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "weights"
+    type_attr: "T"
+    number_attr: "num_params"
+  }
+  input_arg {
+    name: "biases"
+    type_attr: "T"
+    number_attr: "num_params"
+  }
+  output_arg {
+    name: "params"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "num_params"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "rnn_mode"
+    type: "string"
+    default_value {
+      s: "lstm"
+    }
+    allowed_values {
+      list {
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
+      }
+    }
+  }
+  attr {
+    name: "input_mode"
+    type: "string"
+    default_value {
+      s: "linear_input"
+    }
+    allowed_values {
+      list {
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
+      }
+    }
+  }
+  attr {
+    name: "direction"
+    type: "string"
+    default_value {
+      s: "unidirectional"
+    }
+    allowed_values {
+      list {
+        s: "unidirectional"
+        s: "bidirectional"
+      }
+    }
+  }
+  attr {
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+}
+op {
+  name: "CudnnRNNParamsSize"
+  input_arg {
+    name: "num_layers"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "num_units"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "input_size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "params_size"
+    type_attr: "S"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "S"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "rnn_mode"
+    type: "string"
+    default_value {
+      s: "lstm"
+    }
+    allowed_values {
+      list {
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
+      }
+    }
+  }
+  attr {
+    name: "input_mode"
+    type: "string"
+    default_value {
+      s: "linear_input"
+    }
+    allowed_values {
+      list {
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
+      }
+    }
+  }
+  attr {
+    name: "direction"
+    type: "string"
+    default_value {
+      s: "unidirectional"
+    }
+    allowed_values {
+      list {
+        s: "unidirectional"
+        s: "bidirectional"
+      }
+    }
+  }
+  attr {
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+}
+op {
+  name: "CudnnRNNParamsToCanonical"
+  input_arg {
+    name: "num_layers"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "num_units"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "input_size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "params"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "weights"
+    type_attr: "T"
+    number_attr: "num_params"
+  }
+  output_arg {
+    name: "biases"
+    type_attr: "T"
+    number_attr: "num_params"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "num_params"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "rnn_mode"
+    type: "string"
+    default_value {
+      s: "lstm"
+    }
+    allowed_values {
+      list {
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
+      }
+    }
+  }
+  attr {
+    name: "input_mode"
+    type: "string"
+    default_value {
+      s: "linear_input"
+    }
+    allowed_values {
+      list {
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
+      }
+    }
+  }
+  attr {
+    name: "direction"
+    type: "string"
+    default_value {
+      s: "unidirectional"
+    }
+    allowed_values {
+      list {
+        s: "unidirectional"
+        s: "bidirectional"
+      }
+    }
+  }
+  attr {
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+}
 op {
   name: "Cumprod"
   input_arg {
@@ -19468,6 +20663,46 @@ op {
     }
   }
 }
+op {
+  name: "FlushSummaryWriter"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  is_stateful: true
+}
+op {
+  name: "For"
+  input_arg {
+    name: "start"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "limit"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "delta"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "input"
+    type_list_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "body"
+    type: "func"
+  }
+}
 op {
   name: "FractionalAvgPool"
   input_arg {
@@ -21551,6 +22786,45 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "If"
+  input_arg {
+    name: "cond"
+    type_attr: "Tcond"
+  }
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tcond"
+    type: "type"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "then_branch"
+    type: "func"
+  }
+  attr {
+    name: "else_branch"
+    type: "func"
+  }
+}
 op {
   name: "Igamma"
   input_arg {
@@ -21770,6 +23044,18 @@ op {
     type: "string"
   }
 }
+op {
+  name: "ImportEvent"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "event"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
 op {
   name: "InTopK"
   input_arg {
@@ -22454,6 +23740,18 @@ op {
     }
   }
 }
+op {
+  name: "IsBoostedTreesEnsembleInitialized"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "is_initialized"
+    type: DT_BOOL
+  }
+  is_stateful: true
+}
 op {
   name: "IsFinite"
   input_arg {
@@ -24307,6 +25605,10 @@ op {
     name: "num_parallel_batches"
     type: DT_INT64
   }
+  input_arg {
+    name: "drop_remainder"
+    type: DT_BOOL
+  }
   output_arg {
     name: "handle"
     type: DT_VARIANT
@@ -42691,17 +43993,119 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceScatterAdd"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceScatterAdd"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -42721,7 +44125,7 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceScatterAdd"
+  name: "ResourceScatterDiv"
   input_arg {
     name: "resource"
     type: DT_RESOURCE
@@ -42741,21 +44145,123 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
         type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceScatterMax"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceScatterMin"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -42772,7 +44278,7 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceScatterAdd"
+  name: "ResourceScatterMul"
   input_arg {
     name: "resource"
     type: DT_RESOURCE
@@ -42859,6 +44365,57 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ResourceScatterSub"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "ResourceScatterUpdate"
   input_arg {
@@ -48018,6 +49575,110 @@ op {
     }
   }
 }
+op {
+  name: "ScatterMax"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ScatterMin"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "ScatterMul"
   input_arg {
@@ -51522,6 +53183,37 @@ op {
     }
   }
 }
+op {
+  name: "SlideDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "window_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "stride"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "Snapshot"
   input_arg {
@@ -62152,6 +63844,28 @@ op {
     }
   }
 }
+op {
+  name: "SummaryWriter"
+  output_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Svd"
   input_arg {
@@ -66432,6 +68146,31 @@ op {
     }
   }
 }
+op {
+  name: "While"
+  input_arg {
+    name: "input"
+    type_list_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "cond"
+    type: "func"
+  }
+  attr {
+    name: "body"
+    type: "func"
+  }
+  is_stateful: true
+}
 op {
   name: "WholeFileReader"
   output_arg {
@@ -66477,6 +68216,39 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "WriteAudioSummary"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "sample_rate"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "max_outputs"
+    type: "int"
+    default_value {
+      i: 3
+    }
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "WriteFile"
   input_arg {
@@ -66488,6 +68260,180 @@ op {
     type: DT_STRING
   }
 }
+op {
+  name: "WriteGraphSummary"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "tensor"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "WriteHistogramSummary"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "WriteImageSummary"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "bad_color"
+    type: DT_UINT8
+  }
+  attr {
+    name: "max_images"
+    type: "int"
+    default_value {
+      i: 3
+    }
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_FLOAT
+        type: DT_HALF
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "WriteScalarSummary"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "WriteSummary"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "summary_metadata"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  is_stateful: true
+}
 op {
   name: "ZerosLike"
   input_arg {
diff --git a/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc b/tensorflow/core/ops/cudnn_rnn_ops.cc
similarity index 53%
rename from tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc
rename to tensorflow/core/ops/cudnn_rnn_ops.cc
index 1a79bf066c3a27e040099729fb079ee963f59270..37d70a22ef61ad4e31259dc3001db72ffcea7d93 100644
--- a/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc
+++ b/tensorflow/core/ops/cudnn_rnn_ops.cc
@@ -21,31 +21,6 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-constexpr auto kCudnnRNNCommonInputs = R"doc(
-num_layers: Specifies the number of layers in the RNN model.
-num_units: Specifies the size of the hidden state.
-input_size: Specifies the size of the input state.
-)doc";
-
-constexpr auto kCudnnRNNCommonAttrs = R"doc(
-rnn_mode: Indicates the type of the RNN model.
-input_mode: Indicate whether there is a linear projection between the input and
-    The actual computation before the first layer. 'skip_input' is only allowed
-    when input_size == num_units; 'auto_select' implies 'skip_input' when
-    input_size == num_units; otherwise, it implies 'linear_input'.
-direction: Indicates whether a bidirectional model will be used.
-    dir = (direction == bidirectional) ? 2 : 1
-dropout: dropout probability. When set to 0., dropout is disabled.
-seed: the 1st part of a seed to initialize dropout.
-seed2: the 2nd part of a seed to initialize dropout.
-)doc";
-
-constexpr auto kCudnnRNNParamsBuffer = R"doc(
-Note that the params buffer may not be compatible across different GPUs. So any
-save and restoration should be converted to and from the canonical weights and
-biases.
-)doc";
-
 constexpr auto kRNNModeAttrs =
     "rnn_mode: {'rnn_relu', 'rnn_tanh', 'lstm', 'gru'} = 'lstm'";
 
@@ -56,21 +31,13 @@ constexpr auto kRNNInputModeAttrs =
 constexpr auto kRNNDirectionAttrs =
     "direction: {'unidirectional', 'bidirectional'} = 'unidirectional'";
 
-constexpr auto kCudnnRNNParamsCanonical = R"doc(
-weights: the canonical form of weights that can be used for saving
-    and restoration. They are more likely to be compatible across different
-    generations.
-biases: the canonical form of biases that can be used for saving
-    and restoration. They are more likely to be compatible across different
-    generations.
-)doc";
-
 }  // namespace
 
 using shape_inference::DimensionHandle;
 using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
 
+
 REGISTER_OP("CudnnRNNParamsSize")
     .Input("num_layers: int32")
     .Input("num_units: int32")
@@ -87,38 +54,8 @@ REGISTER_OP("CudnnRNNParamsSize")
     .SetShapeFn([](InferenceContext* c) {
       c->set_output(0, c->Vector(1));
       return Status::OK();
-    })
-    .Doc(strings::StrCat(R"doc(
-Return the params size that can be used by the Cudnn RNN model. Subsequent
-weight allocation and initialization should use this size.
-)doc",
-                         kCudnnRNNCommonInputs, kCudnnRNNCommonAttrs,
-                         R"doc(
-params_size: The size of the params buffer that should be allocated and
-    initialized for this RNN model. Note that this params buffer may not be
-    compatible across GPUs. Please use CudnnRNNParamsWeights and
-    CudnnRNNParamsBiases to save and restore them in a way that is compatible
-    across different runs.
-)doc",
-                         kCudnnRNNParamsBuffer));
+    });
 
-static string CudnnRNNForwardTensors() {
-  return R"doc(
-input: a 3-D tensor with the shape of [seq_length, batch_size, input_size].
-input_h: a 3-D tensor with the shape of [num_layer * dir, batch_size,
-    num_units].
-input_c: For LSTM, a 3-D tensor with the shape of
-    [num_layer * dir, batch, num_units]. For other models, it is ignored.
-params: a 1-D tensor that contains the weights and biases in an opaque layout.
-    The size must be created through CudnnRNNParamsSize, and initialized
-    separately. Note that they might not be compatible across different
-    generations. So it is a good idea to save and restore
-output: a 3-D tensor with the shape of [seq_length, batch_size,
-    dir * num_units].
-output_h: the same shape has input_h.
-output_c: the same shape as input_c for LSTM. An empty tensor for other models.
-)doc";
-}
 
 REGISTER_OP("CudnnRNN")
     .Input("input: T")
@@ -160,18 +97,8 @@ REGISTER_OP("CudnnRNN")
       c->set_output(2, output_c_shape);
       c->set_output(3, c->UnknownShape());
       return Status::OK();
-    })
-    .Doc(strings::StrCat(R"doc(
-Computes the RNN from the input and initial states, with respect to the params
-buffer.
-)doc",
-                         kCudnnRNNCommonAttrs, CudnnRNNForwardTensors(),
-                         R"doc(
-is_training: Indicates whether this operation is used for inferenece or
-    training.
-reserve_space: an opaque tensor that can be used in backprop calculation. It
-    is only produced if is_training is false.
-)doc"));
+    });
+
 
 REGISTER_OP("CudnnRNNBackprop")
     .Input("input: T")
@@ -207,27 +134,8 @@ REGISTER_OP("CudnnRNNBackprop")
       c->set_output(2, input_c_shape);
       c->set_output(3, params_shape);
       return Status::OK();
-    })
-    .Doc(strings::StrCat(R"doc(
-Compute the backprop of both data and weights in a RNN.
-)doc",
-                         kCudnnRNNCommonAttrs, CudnnRNNForwardTensors(),
-                         R"doc(
-output_backprop: A 3-D tensor with the same shape as output in the forward pass.
-output_h_backprop: A 3-D tensor with the same shape as output_h in the forward
-    pass.
-output_c_backprop: A 3-D tensor with the same shape as output_c in the forward
-    pass.
-reserve_space: The same reserve_space produced in for forward operation.
-input_backprop: The backprop to input in the forward pass. Has the same shape
-    as input.
-input_h_backprop: The backprop to input_h in the forward pass. Has the same
-    shape as input_h.
-input_c_backprop: The backprop to input_c in the forward pass. Has the same
-    shape as input_c.
-params_backprop: The backprop to the params buffer in the forward pass. Has the
-    same shape as params.
-)doc"));
+    });
+
 
 REGISTER_OP("CudnnRNNParamsToCanonical")
     .Input("num_layers: int32")
@@ -259,17 +167,8 @@ REGISTER_OP("CudnnRNNParamsToCanonical")
         c->set_output(num_params + i, c->Vector(InferenceContext::kUnknownDim));
       }
       return Status::OK();
-    })
-    .Doc(strings::StrCat(R"doc(
-Retrieves a set of weights from the opaque params buffer that can be saved and
-restored in a way compatible with future runs.
-)doc",
-                         kCudnnRNNCommonInputs, kCudnnRNNParamsBuffer, R"doc(
-num_params: number of parameter sets for all layers.
-    Each layer may contain multiple parameter sets, with each set consisting of
-    a weight matrix and a bias vector.
-)doc",
-                         kCudnnRNNParamsCanonical, kCudnnRNNCommonAttrs));
+    });
+
 
 REGISTER_OP("CudnnRNNCanonicalToParams")
     .Input("num_layers: int32")
@@ -289,17 +188,6 @@ REGISTER_OP("CudnnRNNCanonicalToParams")
     .SetShapeFn([](InferenceContext* c) {
       c->set_output(0, c->Vector(InferenceContext::kUnknownDim));
       return Status::OK();
-    })
-    .Doc(strings::StrCat(R"doc(
-Writes a set of weights into the opaque params buffer so they can be used in
-upcoming training or inferences.
-)doc",
-                         kCudnnRNNCommonInputs, kCudnnRNNParamsCanonical,
-                         kCudnnRNNParamsBuffer, R"doc(
-num_params: number of parameter sets for all layers.
-    Each layer may contain multiple parameter sets, with each set consisting of
-    a weight matrix and a bias vector.
-)doc",
-                         kCudnnRNNCommonAttrs));
+    });
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops_test.cc b/tensorflow/core/ops/cudnn_rnn_ops_test.cc
similarity index 100%
rename from tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops_test.cc
rename to tensorflow/core/ops/cudnn_rnn_ops_test.cc
diff --git a/tensorflow/core/ops/data_flow_ops.cc b/tensorflow/core/ops/data_flow_ops.cc
index 4f946fb3ca7608816180351b7753d01f13d469f2..3112f35da43d16d7a4cd4c1c8e017cab3366e070 100644
--- a/tensorflow/core/ops/data_flow_ops.cc
+++ b/tensorflow/core/ops/data_flow_ops.cc
@@ -668,13 +668,31 @@ REGISTER_OP("TensorArrayGatherV3")
     .Attr("dtype: type")
     .Attr("element_shape: shape = { unknown_rank: true }")
     .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle indices;
       ShapeHandle unused;
       DimensionHandle unused_dim;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &unused));
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &indices));
       TF_RETURN_IF_ERROR(c->WithValue(c->Dim(c->input(0), 0), 2, &unused_dim));
       TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
-      return shape_inference::UnknownShape(c);
+      auto shapes = c->input_handle_shapes_and_types(0);
+      if (shapes != nullptr && !shapes->empty()) {
+        ShapeHandle tensor_shape = shapes->at(0).shape;
+        ShapeHandle output_shape;
+        TF_RETURN_IF_ERROR(
+            c->Concatenate(indices, tensor_shape, &output_shape));
+        c->set_output(0, output_shape);
+        return Status::OK();
+      } else {
+        PartialTensorShape p;
+        TF_RETURN_IF_ERROR(c->GetAttr("element_shape", &p));
+        ShapeHandle s;
+        TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(p, &s));
+        ShapeHandle output_shape;
+        TF_RETURN_IF_ERROR(c->Concatenate(indices, s, &output_shape));
+        c->set_output(0, output_shape);
+        return Status::OK();
+      }
     });
 
 REGISTER_OP("TensorArrayScatterV3")
@@ -685,12 +703,25 @@ REGISTER_OP("TensorArrayScatterV3")
     .Output("flow_out: float")
     .Attr("T: type")
     .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle indices;
       ShapeHandle unused;
       DimensionHandle unused_dim;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &unused));
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &indices));
       TF_RETURN_IF_ERROR(c->WithValue(c->Dim(c->input(0), 0), 2, &unused_dim));
       TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      ShapeHandle value_shape;
+      // Assert that the length of the indices tensor is equal to the first
+      // dimension of the value tensor.
+      TF_RETURN_IF_ERROR(
+          c->MergePrefix(c->input(2), indices, &value_shape, &indices));
+      auto shapes = c->input_handle_shapes_and_types(0);
+      if (shapes != nullptr && !shapes->empty()) {
+        ShapeHandle tensor_shape = shapes->at(0).shape;
+        ShapeHandle fed_shape;
+        TF_RETURN_IF_ERROR(c->Subshape(value_shape, 1, &fed_shape));
+        TF_RETURN_IF_ERROR(c->Merge(tensor_shape, fed_shape, &fed_shape));
+      }
       return shape_inference::ScalarShape(c);
     });
 
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index 9a4b616e5ded3210c97e116d8b883f54f8a8b304..b25abbcc6780364c27c1a078b0a8980014c83a43 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -105,8 +105,11 @@ REGISTER_OP("RepeatDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);  // TODO(mrry): Validate the
-                                                // shape of `count`.
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle count_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &count_shape));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("TakeDataset")
     .Input("input_dataset: variant")
@@ -114,7 +117,11 @@ REGISTER_OP("TakeDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle count_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &count_shape));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("SkipDataset")
     .Input("input_dataset: variant")
@@ -122,7 +129,11 @@ REGISTER_OP("SkipDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle count_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &count_shape));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("BytesProducedStatsDataset")
     .Input("input_dataset: variant")
@@ -166,6 +177,7 @@ REGISTER_OP("MapAndBatchDataset")
     .Input("other_arguments: Targuments")
     .Input("batch_size: int64")
     .Input("num_parallel_batches: int64")
+    .Input("drop_remainder: bool")
     .Output("handle: variant")
     .Attr("f: func")
     .Attr("Targuments: list(type) >= 0")
diff --git a/tensorflow/core/ops/functional_ops.cc b/tensorflow/core/ops/functional_ops.cc
index 4b21fac80aea76555959e8a202a73ccc833d0306..792686cae1f599c17cbea05aa5de61f1e592996c 100644
--- a/tensorflow/core/ops/functional_ops.cc
+++ b/tensorflow/core/ops/functional_ops.cc
@@ -50,6 +50,7 @@ REGISTER_OP("RemoteCall")
     .SetIsStateful()
     .SetShapeFn(shape_inference::UnknownShape);
 
+// TODO(drpng): remove this.
 REGISTER_OP("_If")
     .Input("cond: Tcond")
     .Input("input: Tin")
@@ -76,8 +77,18 @@ else_branch: A function that takes 'inputs' and returns a list of
     tensors.  whose types are the same as what then_branch returns.
 )doc");
 
-// TODO(b/37549631) setting the While Op to always be stateful is too
-// conservative.
+REGISTER_OP("If")
+    .Input("cond: Tcond")
+    .Input("input: Tin")
+    .Output("output: Tout")
+    .Attr("Tcond: type")
+    .Attr("Tin: list(type)")
+    .Attr("Tout: list(type)")
+    .Attr("then_branch: func")
+    .Attr("else_branch: func")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+// TODO(drpng): remove this.
 REGISTER_OP("_While")
     .Input("input: T")
     .Output("output: T")
@@ -108,4 +119,30 @@ body: A function that takes a list of tensors and returns another
       by T.
 )doc");
 
+// TODO(b/37549631) setting the While Op to always be stateful is too
+// conservative.
+REGISTER_OP("While")
+    .Input("input: T")
+    .Output("output: T")
+    .Attr("T: list(type) >= 0")
+    .Attr("cond: func")
+    .Attr("body: func")
+    .SetIsStateful()
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      for (int i = 0; i < c->num_outputs(); ++i) {
+        c->set_output(i, c->input(i));
+      }
+      return Status::OK();
+    });
+
+REGISTER_OP("For")
+    .Input("start: int32")
+    .Input("limit: int32")
+    .Input("delta: int32")
+    .Input("input: T")
+    .Output("output: T")
+    .Attr("T: list(type) >= 0")
+    .Attr("body: func")
+    .SetShapeFn(shape_inference::UnknownShape);
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/ops/list_ops.cc b/tensorflow/core/ops/list_ops.cc
index 0c16abd3693d53e20bc4701bdfcb19e6e00c47d4..cad617638ff12cd1020276341fbe9f9b7aac97bc 100644
--- a/tensorflow/core/ops/list_ops.cc
+++ b/tensorflow/core/ops/list_ops.cc
@@ -135,10 +135,6 @@ REGISTER_OP("TensorListStack")
         }
         shape_inference::ShapeHandle ignored;
         TF_RETURN_IF_ERROR(c->Merge(s, list_shape_type.shape, &ignored));
-        if (!c->FullyDefined(list_shape_type.shape)) {
-          return errors::InvalidArgument(
-              "Can only stack a list with fully defined shapes.");
-        }
         s = list_shape_type.shape;
       }
       int expected_num_elements = -1;
diff --git a/tensorflow/core/ops/math_grad_test.cc b/tensorflow/core/ops/math_grad_test.cc
index 8dcd3e815f3c19b41b1ef02a23e1f5ce36697a23..da38a6bc2497aca1623faed40c41386a4daff113 100644
--- a/tensorflow/core/ops/math_grad_test.cc
+++ b/tensorflow/core/ops/math_grad_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session.h"
 
@@ -362,7 +363,7 @@ class MathGradTest : public ::testing::Test {
 };
 
 void HasError(const Status& s, const string& substr) {
-  EXPECT_TRUE(StringPiece(s.ToString()).contains(substr))
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), substr))
       << s << ", expected substring " << substr;
 }
 
diff --git a/tensorflow/core/ops/math_ops_test.cc b/tensorflow/core/ops/math_ops_test.cc
index ca3772e6f89805b70f05f1c9fd5e36ee99f2d510..8f974d5367a486dca39cddfd3fbdca4d4a3bf6eb 100644
--- a/tensorflow/core/ops/math_ops_test.cc
+++ b/tensorflow/core/ops/math_ops_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -239,20 +240,21 @@ TEST(MathOpsTest, Select_ShapeFn) {
 
   // Expect an error when the shapes can't be merged.
   handle_data[2]->at(0).first = shape_proto({2, 2});
-  EXPECT_TRUE(StringPiece(run_inference_for_handles().error_message())
-                  .contains("must be equal, but are 1 and 2"));
+  EXPECT_TRUE(str_util::StrContains(run_inference_for_handles().error_message(),
+                                    "must be equal, but are 1 and 2"));
   handle_data[2]->at(0).first = i1;  // restore to valid
 
   // Expect an error when the types can't be merged.
   handle_data[2]->at(1).second = DT_INT64;
-  EXPECT_TRUE(StringPiece(run_inference_for_handles().error_message())
-                  .contains("pointing to different dtypes"));
+  EXPECT_TRUE(str_util::StrContains(run_inference_for_handles().error_message(),
+                                    "pointing to different dtypes"));
   handle_data[2]->at(1).second = DT_INT32;  // restore to valid
 
   // Expect an error when different numbers of tensors are merged.
   handle_data[2]->push_back({i1, DT_FLOAT});
-  EXPECT_TRUE(StringPiece(run_inference_for_handles().error_message())
-                  .contains("pointing to different numbers of tensors"));
+  EXPECT_TRUE(
+      str_util::StrContains(run_inference_for_handles().error_message(),
+                            "pointing to different numbers of tensors"));
   handle_data[2]->pop_back();  // restore to valid.
 }
 
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index d6a0f380336f4e3debd3a5a6facbefef375a6798..12d6dc5eaf29569d7a0f865afb4fb26b440be60b 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -266,7 +266,7 @@ REGISTER_OP("Conv2D")
     .Input("input: T")
     .Input("filter: T")
     .Output("output: T")
-    .Attr("T: {half, bfloat16, float}")
+    .Attr("T: {half, bfloat16, float, double}")
     .Attr("strides: list(int)")
     .Attr("use_cudnn_on_gpu: bool = true")
     .Attr(GetPaddingAttrString())
@@ -279,7 +279,7 @@ REGISTER_OP("Conv2DBackpropInput")
     .Input("filter: T")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: {half, bfloat16, float}")
+    .Attr("T: {half, bfloat16, float, double}")
     .Attr("strides: list(int)")
     .Attr("use_cudnn_on_gpu: bool = true")
     .Attr(GetPaddingAttrString())
@@ -301,7 +301,7 @@ REGISTER_OP("Conv2DBackpropFilter")
     .Input("filter_sizes: int32")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: {half, bfloat16, float}")
+    .Attr("T: {half, bfloat16, float, double}")
     .Attr("strides: list(int)")
     .Attr("use_cudnn_on_gpu: bool = true")
     .Attr(GetPaddingAttrString())
@@ -472,7 +472,7 @@ REGISTER_OP("DepthwiseConv2dNativeBackpropInput")
     .Input("filter: T")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: {bfloat16, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .Attr("strides: list(int)")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
@@ -490,7 +490,7 @@ REGISTER_OP("DepthwiseConv2dNativeBackpropFilter")
     .Input("filter_sizes: int32")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: {bfloat16, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .Attr("strides: list(int)")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
@@ -589,7 +589,7 @@ REGISTER_OP("AvgPool3D")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .Attr("T: {bfloat16, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn(shape_inference::Pool3DShape);
 
 REGISTER_OP("AvgPool3DGrad")
@@ -600,7 +600,7 @@ REGISTER_OP("AvgPool3DGrad")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .Attr("T: {bfloat16, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle s;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s));
@@ -618,7 +618,7 @@ REGISTER_OP("MaxPool3D")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .Attr("T: {bfloat16, float}")
+    .Attr("T: {half, bfloat16, float}")
     .SetShapeFn(shape_inference::Pool3DShape);
 
 REGISTER_OP("MaxPool3DGrad")
@@ -630,8 +630,8 @@ REGISTER_OP("MaxPool3DGrad")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .Attr("T: {bfloat16, float} = DT_FLOAT")
-    .Attr("TInput: {bfloat16, float} = DT_FLOAT")
+    .Attr("T: {half, bfloat16, float} = DT_FLOAT")
+    .Attr("TInput: {half, bfloat16, float} = DT_FLOAT")
     .SetShapeFn([](InferenceContext* c) {
       return UnchangedShapeWithRank(c, 5);
     });
@@ -1062,12 +1062,27 @@ REGISTER_OP("SoftmaxCrossEntropyWithLogits")
     .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle input;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &input));
-      TF_RETURN_IF_ERROR(c->Merge(input, c->input(1), &input));
+      if (c->WithRank(c->input(0), 2, &input) == Status::OK() &&
+          c->Merge(input, c->input(1), &input) == Status::OK()) {
+        DimensionHandle batch_size = c->Dim(input, 0);
+        c->set_output(0, c->Vector(batch_size));
+        c->set_output(1, input);
+        return Status::OK();
+      }
+      TF_RETURN_IF_ERROR(BroadcastBinaryOpOutputShapeFn(c, 1));
+
+      if (!c->RankKnown(c->output(1))) {
+        return errors::InvalidArgument(
+            "Shape must be broadcasted with rank 2, but is rank is unknown.");
+      }
 
-      DimensionHandle batch_size = c->Dim(input, 0);
+      if (c->Rank(c->output(1)) != 2) {
+        return errors::InvalidArgument(
+            "Shape must be broadcasted with rank 2, but is rank ",
+            c->Rank(c->output(1)));
+      }
+      DimensionHandle batch_size = c->Dim(c->output(1), 0);
       c->set_output(0, c->Vector(batch_size));
-      c->set_output(1, input);
       return Status::OK();
     });
 
@@ -1155,9 +1170,9 @@ Status TopKShapeFn(InferenceContext* c) {
   DimensionHandle last_dim = c->Dim(input, -1);
   if (c->ValueKnown(last_dim) && c->ValueKnown(k_dim) &&
       c->Value(last_dim) < c->Value(k_dim)) {
-    return errors::InvalidArgument(
-        "input must have last dimension >= k = ", c->Value(k_dim), " but is ",
-        c->Value(last_dim));
+    return errors::InvalidArgument("input must have last dimension >= k = ",
+                                   c->Value(k_dim), " but is ",
+                                   c->Value(last_dim));
   }
 
   // Replace last_dim with k_dim.
@@ -1211,9 +1226,9 @@ REGISTER_OP("NthElement")
       DimensionHandle last_dim = c->Dim(input, -1);
       if (c->ValueKnown(last_dim) && c->ValueKnown(n_dim) &&
           c->Value(last_dim) <= c->Value(n_dim)) {
-        return errors::InvalidArgument(
-            "Input must have last dimension > n = ", c->Value(n_dim),
-            " but is ", c->Value(last_dim));
+        return errors::InvalidArgument("Input must have last dimension > n = ",
+                                       c->Value(n_dim), " but is ",
+                                       c->Value(last_dim));
       }
 
       // Reduce last_dim for output tensor
diff --git a/tensorflow/core/ops/nn_ops_test.cc b/tensorflow/core/ops/nn_ops_test.cc
index 1b17a7cda65f210e1981e0f46f47691f0faba465..289b95305561dc60e52951b950dc4f6ade179fe2 100644
--- a/tensorflow/core/ops/nn_ops_test.cc
+++ b/tensorflow/core/ops/nn_ops_test.cc
@@ -410,10 +410,18 @@ TEST(NNOpsTest, SoftmaxCrossEntropyWithLogits_ShapeFn) {
   INFER_OK(op, "[1,?];[?,2]", "[d0_0];[d0_0,d0_1|d1_1]");
   INFER_OK(op, "[?,2];[1,2]", "[d1_0];in1");
 
-  INFER_ERROR("Dimension 0 in both shapes must be equal, but are 1 and 2", op,
-              "[1,?];[2,?]");
-  INFER_ERROR("Shape must be rank 2 but is rank 3", op, "[1,2,3];?");
-  INFER_ERROR("Shapes must be equal rank, but are 2 and 3", op, "?;[1,2,3]");
+  INFER_ERROR("Shape must be broadcasted with rank 2", op, "[1,2,3];?");
+  INFER_ERROR("Shape must be broadcasted with rank 2", op, "?;[1,2,3]");
+
+  // Broadcast example
+  // [1,4] and [2,4] are broadcasted to [2,4]
+  INFER_OK(op, "[1,4];[2,4]", "[d1_0];[d1_0,d0_1|d1_1]");
+  // [2,4] and [2,1] are broadcasted to [2,4]
+  INFER_OK(op, "[2,4];[2,1]", "[d0_0];[d0_0|d1_0,d0_1]");
+  // [1,?] and [2,4] are broadcasted to [2,4]
+  INFER_OK(op, "[1,?];[2,4]", "[d1_0];[d1_0,d0_1|d1_1]");
+  // [2,4] and [?,1] are broadcasted to [2,4]
+  INFER_OK(op, "[2,4];[?,1]", "[d0_0];[d0_0|d1_0,d0_1]");
 }
 
 TEST(NNOpsTest, SparseSoftmaxCrossEntropyWithLogits_ShapeFn) {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 3faa4eeada04efdb8bc6504fd468eeaa179319c1..5764976aee1236c7a1b36d8a12e335f4aff7bc13 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -3996,748 +3996,1727 @@ op {
   is_commutative: true
 }
 op {
-  name: "BroadcastArgs"
+  name: "BoostedTreesCalculateBestGainsPerFeature"
   input_arg {
-    name: "s0"
-    type_attr: "T"
+    name: "node_id_range"
+    type: DT_INT32
   }
   input_arg {
-    name: "s1"
-    type_attr: "T"
+    name: "stats_summary_list"
+    type: DT_FLOAT
+    number_attr: "num_features"
   }
   output_arg {
-    name: "r0"
-    type_attr: "T"
+    name: "node_ids_list"
+    type: DT_INT32
+    number_attr: "num_features"
+  }
+  output_arg {
+    name: "gains_list"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  output_arg {
+    name: "thresholds_list"
+    type: DT_INT32
+    number_attr: "num_features"
+  }
+  output_arg {
+    name: "left_node_contribs_list"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  output_arg {
+    name: "right_node_contribs_list"
+    type: DT_FLOAT
+    number_attr: "num_features"
   }
   attr {
-    name: "T"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+    name: "l1"
+    type: "float"
+  }
+  attr {
+    name: "l2"
+    type: "float"
+  }
+  attr {
+    name: "tree_complexity"
+    type: "float"
+  }
+  attr {
+    name: "max_splits"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_features"
+    type: "int"
+    has_minimum: true
+    minimum: 1
   }
 }
 op {
-  name: "BroadcastGradientArgs"
+  name: "BoostedTreesCreateEnsemble"
   input_arg {
-    name: "s0"
-    type_attr: "T"
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "s1"
-    type_attr: "T"
+    name: "stamp_token"
+    type: DT_INT64
   }
-  output_arg {
-    name: "r0"
-    type_attr: "T"
+  input_arg {
+    name: "tree_ensemble_serialized"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesDeserializeEnsemble"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "stamp_token"
+    type: DT_INT64
   }
+  input_arg {
+    name: "tree_ensemble_serialized"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesEnsembleResourceHandleOp"
   output_arg {
-    name: "r1"
-    type_attr: "T"
+    name: "resource"
+    type: DT_RESOURCE
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "container"
+    type: "string"
     default_value {
-      type: DT_INT32
+      s: ""
     }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
+  is_stateful: true
 }
 op {
-  name: "Bucketize"
+  name: "BoostedTreesGetEnsembleStates"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
   }
   output_arg {
-    name: "output"
+    name: "stamp_token"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "num_trees"
     type: DT_INT32
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
+  output_arg {
+    name: "num_finalized_trees"
+    type: DT_INT32
   }
-  attr {
-    name: "boundaries"
-    type: "list(float)"
+  output_arg {
+    name: "num_attempted_layers"
+    type: DT_INT32
   }
+  is_stateful: true
 }
 op {
-  name: "BytesProducedStatsDataset"
+  name: "BoostedTreesMakeStatsSummary"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "node_ids"
+    type: DT_INT32
   }
   input_arg {
-    name: "tag"
-    type: DT_STRING
+    name: "gradients"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "hessians"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "bucketized_features_list"
+    type: DT_INT32
+    number_attr: "num_features"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "stats_summary"
+    type: DT_FLOAT
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
+    name: "max_splits"
+    type: "int"
     has_minimum: true
     minimum: 1
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
+    name: "num_buckets"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_features"
+    type: "int"
     has_minimum: true
     minimum: 1
   }
 }
 op {
-  name: "CTCBeamSearchDecoder"
+  name: "BoostedTreesPredict"
   input_arg {
-    name: "inputs"
-    type: DT_FLOAT
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "sequence_length"
+    name: "bucketized_features"
     type: DT_INT32
+    number_attr: "num_bucketized_features"
   }
   output_arg {
-    name: "decoded_indices"
-    type: DT_INT64
-    number_attr: "top_paths"
-  }
-  output_arg {
-    name: "decoded_values"
-    type: DT_INT64
-    number_attr: "top_paths"
-  }
-  output_arg {
-    name: "decoded_shape"
-    type: DT_INT64
-    number_attr: "top_paths"
-  }
-  output_arg {
-    name: "log_probability"
+    name: "logits"
     type: DT_FLOAT
   }
   attr {
-    name: "beam_width"
+    name: "num_bucketized_features"
     type: "int"
     has_minimum: true
     minimum: 1
   }
   attr {
-    name: "top_paths"
+    name: "logits_dimension"
     type: "int"
-    has_minimum: true
-    minimum: 1
   }
   attr {
-    name: "merge_repeated"
-    type: "bool"
-    default_value {
-      b: true
-    }
+    name: "max_depth"
+    type: "int"
+    has_minimum: true
+    minimum: 1
   }
+  is_stateful: true
 }
 op {
-  name: "CTCGreedyDecoder"
-  input_arg {
-    name: "inputs"
-    type: DT_FLOAT
-  }
+  name: "BoostedTreesSerializeEnsemble"
   input_arg {
-    name: "sequence_length"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "decoded_indices"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "decoded_values"
-    type: DT_INT64
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
   }
   output_arg {
-    name: "decoded_shape"
+    name: "stamp_token"
     type: DT_INT64
   }
   output_arg {
-    name: "log_probability"
-    type: DT_FLOAT
-  }
-  attr {
-    name: "merge_repeated"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "tree_ensemble_serialized"
+    type: DT_STRING
   }
+  is_stateful: true
 }
 op {
-  name: "CTCLoss"
+  name: "BoostedTreesTrainingPredict"
   input_arg {
-    name: "inputs"
-    type: DT_FLOAT
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "labels_indices"
-    type: DT_INT64
+    name: "cached_tree_ids"
+    type: DT_INT32
   }
   input_arg {
-    name: "labels_values"
+    name: "cached_node_ids"
     type: DT_INT32
   }
   input_arg {
-    name: "sequence_length"
+    name: "bucketized_features"
     type: DT_INT32
+    number_attr: "num_bucketized_features"
   }
   output_arg {
-    name: "loss"
+    name: "partial_logits"
     type: DT_FLOAT
   }
   output_arg {
-    name: "gradient"
-    type: DT_FLOAT
+    name: "tree_ids"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "node_ids"
+    type: DT_INT32
   }
   attr {
-    name: "preprocess_collapse_repeated"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "num_bucketized_features"
+    type: "int"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "ctc_merge_repeated"
-    type: "bool"
-    default_value {
-      b: true
-    }
+    name: "logits_dimension"
+    type: "int"
   }
   attr {
-    name: "ignore_longer_outputs_than_inputs"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "max_depth"
+    type: "int"
+    has_minimum: true
+    minimum: 1
   }
+  is_stateful: true
 }
 op {
-  name: "CacheDataset"
+  name: "BoostedTreesUpdateEnsemble"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "filename"
-    type: DT_STRING
+    name: "feature_ids"
+    type: DT_INT32
   }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
+  input_arg {
+    name: "node_ids"
+    type: DT_INT32
+    number_attr: "num_features"
   }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "gains"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  input_arg {
+    name: "thresholds"
+    type: DT_INT32
+    number_attr: "num_features"
+  }
+  input_arg {
+    name: "left_node_contribs"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  input_arg {
+    name: "right_node_contribs"
+    type: DT_FLOAT
+    number_attr: "num_features"
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
+    name: "max_depth"
+    type: "int"
     has_minimum: true
     minimum: 1
   }
-}
-op {
-  name: "Cast"
-  input_arg {
-    name: "x"
-    type_attr: "SrcT"
-  }
-  output_arg {
-    name: "y"
-    type_attr: "DstT"
+  attr {
+    name: "learning_rate"
+    type: "float"
   }
   attr {
-    name: "SrcT"
-    type: "type"
+    name: "pruning_mode"
+    type: "int"
+    has_minimum: true
   }
   attr {
-    name: "DstT"
-    type: "type"
+    name: "num_features"
+    type: "int"
+    has_minimum: true
   }
+  is_stateful: true
 }
 op {
-  name: "Ceil"
+  name: "BroadcastArgs"
   input_arg {
-    name: "x"
+    name: "s0"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "s1"
     type_attr: "T"
   }
   output_arg {
-    name: "y"
+    name: "r0"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_INT32
+    }
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "CheckNumerics"
+  name: "BroadcastGradientArgs"
   input_arg {
-    name: "tensor"
+    name: "s0"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "s1"
     type_attr: "T"
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
-  }
-  attr {
-    name: "message"
-    type: "string"
-  }
-}
-op {
-  name: "Cholesky"
-  input_arg {
-    name: "input"
+  output_arg {
+    name: "r0"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "r1"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_INT32
+    }
     allowed_values {
       list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "CholeskyGrad"
-  input_arg {
-    name: "l"
-    type_attr: "T"
-  }
+  name: "Bucketize"
   input_arg {
-    name: "grad"
+    name: "input"
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type: DT_INT32
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_INT32
+        type: DT_INT64
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
+  attr {
+    name: "boundaries"
+    type: "list(float)"
+  }
 }
 op {
-  name: "CompareAndBitpack"
+  name: "BytesProducedStatsDataset"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "input_dataset"
+    type: DT_VARIANT
   }
   input_arg {
-    name: "threshold"
-    type_attr: "T"
+    name: "tag"
+    type: DT_STRING
   }
   output_arg {
-    name: "output"
-    type: DT_UINT8
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_BOOL
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
 }
 op {
-  name: "Complex"
+  name: "CTCBeamSearchDecoder"
   input_arg {
-    name: "real"
-    type_attr: "T"
+    name: "inputs"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "imag"
-    type_attr: "T"
+    name: "sequence_length"
+    type: DT_INT32
   }
   output_arg {
-    name: "out"
-    type_attr: "Tout"
+    name: "decoded_indices"
+    type: DT_INT64
+    number_attr: "top_paths"
+  }
+  output_arg {
+    name: "decoded_values"
+    type: DT_INT64
+    number_attr: "top_paths"
+  }
+  output_arg {
+    name: "decoded_shape"
+    type: DT_INT64
+    number_attr: "top_paths"
+  }
+  output_arg {
+    name: "log_probability"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
-    type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
+    name: "beam_width"
+    type: "int"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "Tout"
-    type: "type"
+    name: "top_paths"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "merge_repeated"
+    type: "bool"
     default_value {
-      type: DT_COMPLEX64
-    }
-    allowed_values {
-      list {
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
+      b: true
     }
   }
 }
 op {
-  name: "ComplexAbs"
+  name: "CTCGreedyDecoder"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "inputs"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "y"
-    type_attr: "Tout"
+  input_arg {
+    name: "sequence_length"
+    type: DT_INT32
   }
-  attr {
-    name: "T"
-    type: "type"
-    default_value {
-      type: DT_COMPLEX64
-    }
-    allowed_values {
-      list {
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+  output_arg {
+    name: "decoded_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "decoded_values"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "decoded_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "log_probability"
+    type: DT_FLOAT
   }
   attr {
-    name: "Tout"
-    type: "type"
+    name: "merge_repeated"
+    type: "bool"
     default_value {
-      type: DT_FLOAT
-    }
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
+      b: false
     }
   }
 }
 op {
-  name: "ComputeAccidentalHits"
+  name: "CTCLoss"
   input_arg {
-    name: "true_classes"
-    type: DT_INT64
+    name: "inputs"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "sampled_candidates"
+    name: "labels_indices"
     type: DT_INT64
   }
-  output_arg {
-    name: "indices"
+  input_arg {
+    name: "labels_values"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "sequence_length"
     type: DT_INT32
   }
   output_arg {
-    name: "ids"
-    type: DT_INT64
+    name: "loss"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "weights"
+    name: "gradient"
     type: DT_FLOAT
   }
   attr {
-    name: "num_true"
-    type: "int"
+    name: "preprocess_collapse_repeated"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   attr {
-    name: "seed"
-    type: "int"
+    name: "ctc_merge_repeated"
+    type: "bool"
     default_value {
-      i: 0
+      b: true
     }
   }
   attr {
-    name: "seed2"
-    type: "int"
+    name: "ignore_longer_outputs_than_inputs"
+    type: "bool"
     default_value {
-      i: 0
+      b: false
     }
   }
 }
 op {
-  name: "Concat"
+  name: "CacheDataset"
   input_arg {
-    name: "concat_dim"
-    type: DT_INT32
+    name: "input_dataset"
+    type: DT_VARIANT
   }
   input_arg {
-    name: "values"
-    type_attr: "T"
-    number_attr: "N"
+    name: "filename"
+    type: DT_STRING
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "N"
-    type: "int"
+    name: "output_types"
+    type: "list(type)"
     has_minimum: true
-    minimum: 2
+    minimum: 1
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
 }
 op {
-  name: "ConcatOffset"
-  input_arg {
-    name: "concat_dim"
-    type: DT_INT32
-  }
+  name: "Cast"
   input_arg {
-    name: "shape"
-    type: DT_INT32
-    number_attr: "N"
+    name: "x"
+    type_attr: "SrcT"
   }
   output_arg {
-    name: "offset"
-    type: DT_INT32
-    number_attr: "N"
+    name: "y"
+    type_attr: "DstT"
   }
   attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 2
+    name: "SrcT"
+    type: "type"
+  }
+  attr {
+    name: "DstT"
+    type: "type"
   }
 }
 op {
-  name: "ConcatV2"
+  name: "Ceil"
   input_arg {
-    name: "values"
+    name: "x"
     type_attr: "T"
-    number_attr: "N"
   }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "CheckNumerics"
   input_arg {
-    name: "axis"
-    type_attr: "Tidx"
+    name: "tensor"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
-  attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 2
-  }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
   attr {
-    name: "Tidx"
+    name: "message"
+    type: "string"
+  }
+}
+op {
+  name: "Cholesky"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "ConcatenateDataset"
+  name: "CholeskyGrad"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "l"
+    type_attr: "T"
   }
   input_arg {
-    name: "another_dataset"
-    type: DT_VARIANT
+    name: "grad"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+}
+op {
+  name: "CloseSummaryWriter"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
   }
+  is_stateful: true
 }
 op {
-  name: "ConditionalAccumulator"
+  name: "CompareAndBitpack"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "threshold"
+    type_attr: "T"
+  }
   output_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
+    name: "output"
+    type: DT_UINT8
   }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_BOOL
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
         type: DT_INT8
-        type: DT_COMPLEX64
+        type: DT_INT16
+        type: DT_INT32
         type: DT_INT64
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
+      }
+    }
+  }
+}
+op {
+  name: "Complex"
+  input_arg {
+    name: "real"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "imag"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+    default_value {
+      type: DT_COMPLEX64
+    }
+    allowed_values {
+      list {
+        type: DT_COMPLEX64
         type: DT_COMPLEX128
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "ComplexAbs"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_COMPLEX64
+    }
+    allowed_values {
+      list {
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "ComputeAccidentalHits"
+  input_arg {
+    name: "true_classes"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sampled_candidates"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "ids"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "weights"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_true"
+    type: "int"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+}
+op {
+  name: "Concat"
+  input_arg {
+    name: "concat_dim"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "values"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 2
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "ConcatOffset"
+  input_arg {
+    name: "concat_dim"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "shape"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  output_arg {
+    name: "offset"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 2
+  }
+}
+op {
+  name: "ConcatV2"
+  input_arg {
+    name: "values"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 2
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "ConcatenateDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "another_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ConditionalAccumulator"
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Conj"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_COMPLEX64
+    }
+    allowed_values {
+      list {
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_VARIANT
+      }
+    }
+  }
+}
+op {
+  name: "ConjugateTranspose"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "perm"
+    type_attr: "Tperm"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tperm"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Const"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "value"
+    type: "tensor"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+}
+op {
+  name: "ConsumeMutexLock"
+  input_arg {
+    name: "mutex_lock"
+    type: DT_VARIANT
+  }
+  is_stateful: true
+}
+op {
+  name: "ControlTrigger"
+}
+op {
+  name: "Conv2D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "Conv2DBackpropFilter"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "Conv2DBackpropInput"
+  input_arg {
+    name: "input_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "Conv3D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "Conv3DBackpropFilter"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  deprecation {
+    version: 10
+    explanation: "Use Conv3DBackpropFilterV2"
+  }
+}
+op {
+  name: "Conv3DBackpropFilterV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "Conv3DBackpropInput"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  deprecation {
+    version: 10
+    explanation: "Use Conv3DBackpropInputV2"
+  }
+}
+op {
+  name: "Conv3DBackpropInputV2"
+  input_arg {
+    name: "input_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "shape"
-    type: "shape"
-  }
-  attr {
-    name: "container"
+    name: "data_format"
     type: "string"
     default_value {
-      s: ""
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "dilations"
+    type: "list(int)"
     default_value {
-      s: ""
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "Conj"
+  name: "Copy"
   input_arg {
     name: "input"
+    description: "Input tensor."
     type_attr: "T"
   }
   output_arg {
     name: "output"
+    description: "Output tensor, deep-copied from input."
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
     default_value {
-      type: DT_COMPLEX64
+      s: ""
     }
-    allowed_values {
+    description: "The name of the input tensor."
+  }
+  attr {
+    name: "debug_ops_spec"
+    type: "list(string)"
+    default_value {
       list {
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_VARIANT
       }
     }
+    description: "A list of debug op spec (op, url, gated_grpc) for attached debug\nops. Each element of the list has the format\n<debug_op>;<grpc_url>;<gated_grpc>, wherein gated_grpc is boolean represented\nas 0/1. E.g., \"DebugIdentity;grpc://foo:3333;1\",\n\"DebugIdentity;file:///tmp/tfdbg_1;0\"."
   }
+  summary: "Copy Op."
+  description: "Performs CPU-to-CPU or GPU-to-GPU deep-copying of tensor, depending on the\ndevice on which the tensor is allocated.\nN.B.: If the all downstream attached debug ops are disabled given the current\ngRPC gating status, the output will simply forward the input tensor without\ndeep-copying. See the documentation of Debug* ops for more details.\n\nUnlike the CopyHost Op, this op does not have HostMemory constraint on its\ninput or output."
+  allows_uninitialized_input: true
 }
 op {
-  name: "ConjugateTranspose"
+  name: "CopyHost"
   input_arg {
-    name: "x"
+    name: "input"
+    description: "Input tensor."
     type_attr: "T"
   }
-  input_arg {
-    name: "perm"
-    type_attr: "Tperm"
-  }
   output_arg {
-    name: "y"
+    name: "output"
+    description: "Output tensor, deep-copied from input."
     type_attr: "T"
   }
   attr {
@@ -4745,57 +5724,59 @@ op {
     type: "type"
   }
   attr {
-    name: "Tperm"
-    type: "type"
+    name: "tensor_name"
+    type: "string"
     default_value {
-      type: DT_INT32
+      s: ""
     }
-    allowed_values {
+    description: "The name of the input tensor."
+  }
+  attr {
+    name: "debug_ops_spec"
+    type: "list(string)"
+    default_value {
       list {
-        type: DT_INT32
-        type: DT_INT64
       }
     }
+    description: "A list of debug op spec (op, url, gated_grpc) for attached debug\nops. Each element of the list has the format\n<debug_op>;<grpc_url>;<gated_grpc>, wherein gated_grpc is boolean represented\nas 0/1. E.g., \"DebugIdentity;grpc://foo:3333;1\",\n\"DebugIdentity;file:///tmp/tfdbg_1;0\"."
   }
+  summary: "Copy Host Op."
+  description: "Performs CPU-to-CPU deep-copying of tensor.\nN.B.: If the all downstream attached debug ops are disabled given the current\ngRPC gating status, the output will simply forward the input tensor without\ndeep-copying. See the documentation of Debug* ops for more details.\n\nUnlike the Copy Op, this op has HostMemory constraint on its input or output."
+  allows_uninitialized_input: true
 }
 op {
-  name: "Const"
-  output_arg {
-    name: "output"
-    type_attr: "dtype"
+  name: "Cos"
+  input_arg {
+    name: "x"
+    type_attr: "T"
   }
-  attr {
-    name: "value"
-    type: "tensor"
+  output_arg {
+    name: "y"
+    type_attr: "T"
   }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
   }
 }
 op {
-  name: "ConsumeMutexLock"
-  input_arg {
-    name: "mutex_lock"
-    type: DT_VARIANT
-  }
-  is_stateful: true
-}
-op {
-  name: "ControlTrigger"
-}
-op {
-  name: "Conv2D"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
+  name: "Cosh"
   input_arg {
-    name: "filter"
+    name: "x"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "y"
     type_attr: "T"
   }
   attr {
@@ -4806,217 +5787,215 @@ op {
         type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
-  attr {
-    name: "strides"
-    type: "list(int)"
+}
+op {
+  name: "CountUpTo"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
   }
-  attr {
-    name: "use_cudnn_on_gpu"
-    type: "bool"
-    default_value {
-      b: true
-    }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+    name: "limit"
+    type: "int"
   }
   attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
-  attr {
-    name: "dilations"
-    type: "list(int)"
-    default_value {
-      list {
-        i: 1
-        i: 1
-        i: 1
-        i: 1
-      }
-    }
+}
+op {
+  name: "CreateSummaryDbWriter"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "db_uri"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "experiment_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "run_name"
+    type: DT_STRING
   }
+  input_arg {
+    name: "user_name"
+    type: DT_STRING
+  }
+  is_stateful: true
 }
 op {
-  name: "Conv2DBackpropFilter"
+  name: "CreateSummaryFileWriter"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "writer"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "filter_sizes"
+    name: "logdir"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "max_queue"
     type: DT_INT32
   }
   input_arg {
-    name: "out_backprop"
-    type_attr: "T"
+    name: "flush_millis"
+    type: DT_INT32
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "filename_suffix"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "CropAndResize"
+  input_arg {
+    name: "image"
     type_attr: "T"
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-      }
-    }
+  input_arg {
+    name: "boxes"
+    type: DT_FLOAT
   }
-  attr {
-    name: "strides"
-    type: "list(int)"
+  input_arg {
+    name: "box_ind"
+    type: DT_INT32
   }
-  attr {
-    name: "use_cudnn_on_gpu"
-    type: "bool"
-    default_value {
-      b: true
-    }
+  input_arg {
+    name: "crop_size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "crops"
+    type: DT_FLOAT
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "data_format"
+    name: "method"
     type: "string"
     default_value {
-      s: "NHWC"
+      s: "bilinear"
     }
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
+        s: "bilinear"
       }
     }
   }
   attr {
-    name: "dilations"
-    type: "list(int)"
+    name: "extrapolation_value"
+    type: "float"
     default_value {
-      list {
-        i: 1
-        i: 1
-        i: 1
-        i: 1
-      }
+      f: 0
     }
   }
 }
 op {
-  name: "Conv2DBackpropInput"
+  name: "CropAndResizeGradBoxes"
   input_arg {
-    name: "input_sizes"
-    type: DT_INT32
+    name: "grads"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "filter"
+    name: "image"
     type_attr: "T"
   }
   input_arg {
-    name: "out_backprop"
-    type_attr: "T"
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "box_ind"
+    type: DT_INT32
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type: DT_FLOAT
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
         type: DT_HALF
-        type: DT_BFLOAT16
         type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-  }
-  attr {
-    name: "use_cudnn_on_gpu"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-  attr {
-    name: "data_format"
+    name: "method"
     type: "string"
     default_value {
-      s: "NHWC"
+      s: "bilinear"
     }
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
-      }
-    }
-  }
-  attr {
-    name: "dilations"
-    type: "list(int)"
-    default_value {
-      list {
-        i: 1
-        i: 1
-        i: 1
-        i: 1
+        s: "bilinear"
       }
     }
   }
 }
 op {
-  name: "Conv3D"
+  name: "CropAndResizeGradImage"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "grads"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "filter"
-    type_attr: "T"
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "box_ind"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "image_size"
+    type: DT_INT32
   }
   output_arg {
     name: "output"
@@ -5027,72 +6006,37 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_BFLOAT16
         type: DT_FLOAT
+        type: DT_HALF
         type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-  attr {
-    name: "data_format"
+    name: "method"
     type: "string"
     default_value {
-      s: "NDHWC"
+      s: "bilinear"
     }
     allowed_values {
       list {
-        s: "NDHWC"
-        s: "NCDHW"
-      }
-    }
-  }
-  attr {
-    name: "dilations"
-    type: "list(int)"
-    default_value {
-      list {
-        i: 1
-        i: 1
-        i: 1
-        i: 1
-        i: 1
+        s: "bilinear"
       }
     }
   }
 }
 op {
-  name: "Conv3DBackpropFilter"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
+  name: "Cross"
   input_arg {
-    name: "filter"
+    name: "a"
     type_attr: "T"
   }
   input_arg {
-    name: "out_backprop"
+    name: "b"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "product"
     type_attr: "T"
   }
   attr {
@@ -5100,172 +6044,199 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-  deprecation {
-    version: 10
-    explanation: "Use Conv3DBackpropFilterV2"
-  }
 }
 op {
-  name: "Conv3DBackpropFilterV2"
+  name: "CudnnRNN"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "filter_sizes"
-    type: DT_INT32
+    name: "input_h"
+    type_attr: "T"
   }
   input_arg {
-    name: "out_backprop"
+    name: "input_c"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "params"
     type_attr: "T"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
+  output_arg {
+    name: "output_h"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_c"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "reserve_space"
+    type_attr: "T"
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
         type: DT_HALF
-        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+    name: "rnn_mode"
+    type: "string"
+    default_value {
+      s: "lstm"
+    }
+    allowed_values {
+      list {
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
+      }
+    }
   }
   attr {
-    name: "padding"
+    name: "input_mode"
     type: "string"
+    default_value {
+      s: "linear_input"
+    }
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
       }
     }
   }
   attr {
-    name: "data_format"
+    name: "direction"
     type: "string"
     default_value {
-      s: "NDHWC"
-    }
-    allowed_values {
-      list {
-        s: "NDHWC"
-        s: "NCDHW"
-      }
+      s: "unidirectional"
+    }
+    allowed_values {
+      list {
+        s: "unidirectional"
+        s: "bidirectional"
+      }
+    }
+  }
+  attr {
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
     }
   }
   attr {
-    name: "dilations"
-    type: "list(int)"
+    name: "is_training"
+    type: "bool"
     default_value {
-      list {
-        i: 1
-        i: 1
-        i: 1
-        i: 1
-        i: 1
-      }
+      b: true
     }
   }
+  is_stateful: true
 }
 op {
-  name: "Conv3DBackpropInput"
+  name: "CudnnRNNBackprop"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "filter"
+    name: "input_h"
     type_attr: "T"
   }
   input_arg {
-    name: "out_backprop"
+    name: "input_c"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "params"
     type_attr: "T"
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
+  input_arg {
+    name: "output"
+    type_attr: "T"
   }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+  input_arg {
+    name: "output_h"
+    type_attr: "T"
   }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+  input_arg {
+    name: "output_c"
+    type_attr: "T"
   }
-  deprecation {
-    version: 10
-    explanation: "Use Conv3DBackpropInputV2"
+  input_arg {
+    name: "output_backprop"
+    type_attr: "T"
   }
-}
-op {
-  name: "Conv3DBackpropInputV2"
   input_arg {
-    name: "input_sizes"
-    type: DT_INT32
+    name: "output_h_backprop"
+    type_attr: "T"
   }
   input_arg {
-    name: "filter"
+    name: "output_c_backprop"
     type_attr: "T"
   }
   input_arg {
-    name: "out_backprop"
+    name: "reserve_space"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "input_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "input_h_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "input_c_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "params_backprop"
     type_attr: "T"
   }
   attr {
@@ -5274,137 +6245,102 @@ op {
     allowed_values {
       list {
         type: DT_HALF
-        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "padding"
+    name: "rnn_mode"
     type: "string"
+    default_value {
+      s: "lstm"
+    }
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
       }
     }
   }
   attr {
-    name: "data_format"
+    name: "input_mode"
     type: "string"
     default_value {
-      s: "NDHWC"
+      s: "linear_input"
     }
     allowed_values {
       list {
-        s: "NDHWC"
-        s: "NCDHW"
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
       }
     }
   }
   attr {
-    name: "dilations"
-    type: "list(int)"
+    name: "direction"
+    type: "string"
     default_value {
+      s: "unidirectional"
+    }
+    allowed_values {
       list {
-        i: 1
-        i: 1
-        i: 1
-        i: 1
-        i: 1
+        s: "unidirectional"
+        s: "bidirectional"
       }
     }
   }
-}
-op {
-  name: "Copy"
-  input_arg {
-    name: "input"
-    description: "Input tensor."
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    description: "Output tensor, deep-copied from input."
-    type_attr: "T"
-  }
   attr {
-    name: "T"
-    type: "type"
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
   }
   attr {
-    name: "tensor_name"
-    type: "string"
+    name: "seed"
+    type: "int"
     default_value {
-      s: ""
+      i: 0
     }
-    description: "The name of the input tensor."
   }
   attr {
-    name: "debug_ops_spec"
-    type: "list(string)"
+    name: "seed2"
+    type: "int"
     default_value {
-      list {
-      }
+      i: 0
     }
-    description: "A list of debug op spec (op, url, gated_grpc) for attached debug\nops. Each element of the list has the format\n<debug_op>;<grpc_url>;<gated_grpc>, wherein gated_grpc is boolean represented\nas 0/1. E.g., \"DebugIdentity;grpc://foo:3333;1\",\n\"DebugIdentity;file:///tmp/tfdbg_1;0\"."
   }
-  summary: "Copy Op."
-  description: "Performs CPU-to-CPU or GPU-to-GPU deep-copying of tensor, depending on the\ndevice on which the tensor is allocated.\nN.B.: If the all downstream attached debug ops are disabled given the current\ngRPC gating status, the output will simply forward the input tensor without\ndeep-copying. See the documentation of Debug* ops for more details.\n\nUnlike the CopyHost Op, this op does not have HostMemory constraint on its\ninput or output."
-  allows_uninitialized_input: true
+  is_stateful: true
 }
 op {
-  name: "CopyHost"
+  name: "CudnnRNNCanonicalToParams"
   input_arg {
-    name: "input"
-    description: "Input tensor."
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    description: "Output tensor, deep-copied from input."
-    type_attr: "T"
+    name: "num_layers"
+    type: DT_INT32
   }
-  attr {
-    name: "T"
-    type: "type"
+  input_arg {
+    name: "num_units"
+    type: DT_INT32
   }
-  attr {
-    name: "tensor_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-    description: "The name of the input tensor."
+  input_arg {
+    name: "input_size"
+    type: DT_INT32
   }
-  attr {
-    name: "debug_ops_spec"
-    type: "list(string)"
-    default_value {
-      list {
-      }
-    }
-    description: "A list of debug op spec (op, url, gated_grpc) for attached debug\nops. Each element of the list has the format\n<debug_op>;<grpc_url>;<gated_grpc>, wherein gated_grpc is boolean represented\nas 0/1. E.g., \"DebugIdentity;grpc://foo:3333;1\",\n\"DebugIdentity;file:///tmp/tfdbg_1;0\"."
+  input_arg {
+    name: "weights"
+    type_attr: "T"
+    number_attr: "num_params"
   }
-  summary: "Copy Host Op."
-  description: "Performs CPU-to-CPU deep-copying of tensor.\nN.B.: If the all downstream attached debug ops are disabled given the current\ngRPC gating status, the output will simply forward the input tensor without\ndeep-copying. See the documentation of Debug* ops for more details.\n\nUnlike the Copy Op, this op has HostMemory constraint on its input or output."
-  allows_uninitialized_input: true
-}
-op {
-  name: "Cos"
   input_arg {
-    name: "x"
+    name: "biases"
     type_attr: "T"
+    number_attr: "num_params"
   }
   output_arg {
-    name: "y"
+    name: "params"
     type_attr: "T"
   }
   attr {
@@ -5413,99 +6349,104 @@ op {
     allowed_values {
       list {
         type: DT_HALF
-        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
       }
     }
   }
-}
-op {
-  name: "Cosh"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "y"
-    type_attr: "T"
+  attr {
+    name: "num_params"
+    type: "int"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "rnn_mode"
+    type: "string"
+    default_value {
+      s: "lstm"
+    }
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
       }
     }
   }
-}
-op {
-  name: "CountUpTo"
-  input_arg {
-    name: "ref"
-    type_attr: "T"
-    is_ref: true
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
   attr {
-    name: "limit"
-    type: "int"
+    name: "input_mode"
+    type: "string"
+    default_value {
+      s: "linear_input"
+    }
+    allowed_values {
+      list {
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
+      }
+    }
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "direction"
+    type: "string"
+    default_value {
+      s: "unidirectional"
+    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "unidirectional"
+        s: "bidirectional"
       }
     }
   }
+  attr {
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
 }
 op {
-  name: "CropAndResize"
-  input_arg {
-    name: "image"
-    type_attr: "T"
-  }
+  name: "CudnnRNNParamsSize"
   input_arg {
-    name: "boxes"
-    type: DT_FLOAT
+    name: "num_layers"
+    type: DT_INT32
   }
   input_arg {
-    name: "box_ind"
+    name: "num_units"
     type: DT_INT32
   }
   input_arg {
-    name: "crop_size"
+    name: "input_size"
     type: DT_INT32
   }
   output_arg {
-    name: "crops"
-    type: DT_FLOAT
+    name: "params_size"
+    type_attr: "S"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
         type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -5513,58 +6454,112 @@ op {
     }
   }
   attr {
-    name: "method"
+    name: "S"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "rnn_mode"
     type: "string"
     default_value {
-      s: "bilinear"
+      s: "lstm"
     }
     allowed_values {
       list {
-        s: "bilinear"
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
       }
     }
   }
   attr {
-    name: "extrapolation_value"
+    name: "input_mode"
+    type: "string"
+    default_value {
+      s: "linear_input"
+    }
+    allowed_values {
+      list {
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
+      }
+    }
+  }
+  attr {
+    name: "direction"
+    type: "string"
+    default_value {
+      s: "unidirectional"
+    }
+    allowed_values {
+      list {
+        s: "unidirectional"
+        s: "bidirectional"
+      }
+    }
+  }
+  attr {
+    name: "dropout"
     type: "float"
     default_value {
       f: 0
     }
   }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
 }
 op {
-  name: "CropAndResizeGradBoxes"
+  name: "CudnnRNNParamsToCanonical"
   input_arg {
-    name: "grads"
-    type: DT_FLOAT
+    name: "num_layers"
+    type: DT_INT32
   }
   input_arg {
-    name: "image"
-    type_attr: "T"
+    name: "num_units"
+    type: DT_INT32
   }
   input_arg {
-    name: "boxes"
-    type: DT_FLOAT
+    name: "input_size"
+    type: DT_INT32
   }
   input_arg {
-    name: "box_ind"
-    type: DT_INT32
+    name: "params"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type: DT_FLOAT
+    name: "weights"
+    type_attr: "T"
+    number_attr: "num_params"
+  }
+  output_arg {
+    name: "biases"
+    type_attr: "T"
+    number_attr: "num_params"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
         type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -5572,96 +6567,72 @@ op {
     }
   }
   attr {
-    name: "method"
+    name: "num_params"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "rnn_mode"
     type: "string"
     default_value {
-      s: "bilinear"
+      s: "lstm"
     }
     allowed_values {
       list {
-        s: "bilinear"
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
       }
     }
   }
-}
-op {
-  name: "CropAndResizeGradImage"
-  input_arg {
-    name: "grads"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "boxes"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "box_ind"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "image_size"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
   attr {
-    name: "T"
-    type: "type"
+    name: "input_mode"
+    type: "string"
+    default_value {
+      s: "linear_input"
+    }
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_HALF
-        type: DT_DOUBLE
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
       }
     }
   }
   attr {
-    name: "method"
+    name: "direction"
     type: "string"
     default_value {
-      s: "bilinear"
+      s: "unidirectional"
     }
     allowed_values {
       list {
-        s: "bilinear"
+        s: "unidirectional"
+        s: "bidirectional"
       }
     }
   }
-}
-op {
-  name: "Cross"
-  input_arg {
-    name: "a"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "b"
-    type_attr: "T"
+  attr {
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
   }
-  output_arg {
-    name: "product"
-    type_attr: "T"
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_INT64
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
     }
   }
 }
@@ -8800,6 +9771,46 @@ op {
     }
   }
 }
+op {
+  name: "FlushSummaryWriter"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  is_stateful: true
+}
+op {
+  name: "For"
+  input_arg {
+    name: "start"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "limit"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "delta"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "input"
+    type_list_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "body"
+    type: "func"
+  }
+}
 op {
   name: "FractionalAvgPool"
   input_arg {
@@ -10189,20 +11200,59 @@ op {
     type: DT_RESOURCE
   }
   attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "If"
+  input_arg {
+    name: "cond"
+    type_attr: "Tcond"
+  }
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tcond"
+    type: "type"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "then_branch"
+    type: "func"
+  }
+  attr {
+    name: "else_branch"
+    type: "func"
   }
-  is_stateful: true
 }
 op {
   name: "Igamma"
@@ -10367,6 +11417,18 @@ op {
     type: "string"
   }
 }
+op {
+  name: "ImportEvent"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "event"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
 op {
   name: "InTopK"
   input_arg {
@@ -10710,6 +11772,18 @@ op {
     }
   }
 }
+op {
+  name: "IsBoostedTreesEnsembleInitialized"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "is_initialized"
+    type: DT_BOOL
+  }
+  is_stateful: true
+}
 op {
   name: "IsFinite"
   input_arg {
@@ -11883,6 +12957,10 @@ op {
     name: "num_parallel_batches"
     type: DT_INT64
   }
+  input_arg {
+    name: "drop_remainder"
+    type: DT_BOOL
+  }
   output_arg {
     name: "handle"
     type: DT_VARIANT
@@ -20840,19 +21918,232 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "momentum"
-    type_attr: "T"
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceCountUpTo"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "limit"
+    type: "int"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceGather"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "validate_indices"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceScatterAdd"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceScatterDiv"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceScatterMax"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "epsilon"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "updates"
+    type_attr: "dtype"
   }
   attr {
-    name: "T"
+    name: "dtype"
     type: "type"
     allowed_values {
       list {
@@ -20877,30 +22168,7 @@ op {
     }
   }
   attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
-}
-op {
-  name: "ResourceCountUpTo"
-  input_arg {
-    name: "resource"
-    type: DT_RESOURCE
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "limit"
-    type: "int"
-  }
-  attr {
-    name: "T"
+    name: "Tindices"
     type: "type"
     allowed_values {
       list {
@@ -20912,7 +22180,7 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceGather"
+  name: "ResourceScatterMin"
   input_arg {
     name: "resource"
     type: DT_RESOURCE
@@ -20921,20 +22189,34 @@ op {
     name: "indices"
     type_attr: "Tindices"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "updates"
     type_attr: "dtype"
   }
-  attr {
-    name: "validate_indices"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
   attr {
     name: "dtype"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
   }
   attr {
     name: "Tindices"
@@ -20949,7 +22231,7 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceScatterAdd"
+  name: "ResourceScatterMul"
   input_arg {
     name: "resource"
     type: DT_RESOURCE
@@ -21036,6 +22318,57 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ResourceScatterSub"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "ResourceScatterUpdate"
   input_arg {
@@ -22775,6 +24108,110 @@ op {
     }
   }
 }
+op {
+  name: "ScatterMax"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ScatterMin"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "ScatterMul"
   input_arg {
@@ -24305,18 +25742,49 @@ op {
     type_attr: "T"
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Index"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SlideDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "window_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "stride"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "Index"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
 }
 op {
@@ -28659,6 +30127,28 @@ op {
     }
   }
 }
+op {
+  name: "SummaryWriter"
+  output_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Svd"
   input_arg {
@@ -31517,6 +33007,31 @@ op {
     }
   }
 }
+op {
+  name: "While"
+  input_arg {
+    name: "input"
+    type_list_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "cond"
+    type: "func"
+  }
+  attr {
+    name: "body"
+    type: "func"
+  }
+  is_stateful: true
+}
 op {
   name: "WholeFileReader"
   output_arg {
@@ -31562,6 +33077,39 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "WriteAudioSummary"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "sample_rate"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "max_outputs"
+    type: "int"
+    default_value {
+      i: 3
+    }
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "WriteFile"
   input_arg {
@@ -31573,6 +33121,180 @@ op {
     type: DT_STRING
   }
 }
+op {
+  name: "WriteGraphSummary"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "tensor"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "WriteHistogramSummary"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "WriteImageSummary"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "bad_color"
+    type: DT_UINT8
+  }
+  attr {
+    name: "max_images"
+    type: "int"
+    default_value {
+      i: 3
+    }
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_FLOAT
+        type: DT_HALF
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "WriteScalarSummary"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "WriteSummary"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "summary_metadata"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  is_stateful: true
+}
 op {
   name: "ZerosLike"
   input_arg {
diff --git a/tensorflow/core/ops/resource_variable_ops.cc b/tensorflow/core/ops/resource_variable_ops.cc
index 0d8cf78cc2a196cde4a77f53ce912c437648786a..3d0a6c2157d050869d5758128e9467e0ecdc7203 100644
--- a/tensorflow/core/ops/resource_variable_ops.cc
+++ b/tensorflow/core/ops/resource_variable_ops.cc
@@ -167,27 +167,75 @@ REGISTER_OP("ResourceGather")
       return Status::OK();
     });
 
+namespace {
+
+Status ResourceScatterUpdateShape(InferenceContext* c) {
+  ShapeAndType handle_shape_and_type;
+  TF_RETURN_IF_ERROR(ValidateVariableResourceHandle(c, &handle_shape_and_type));
+  ShapeHandle var_shape = handle_shape_and_type.shape;
+  ShapeHandle indices_shape = c->input(1);
+
+  ShapeHandle unused_updates_shape;
+  ShapeHandle concat;
+  ShapeHandle var_subshape;
+  TF_RETURN_IF_ERROR(c->Subshape(var_shape, 1, &var_subshape));
+  TF_RETURN_IF_ERROR(c->Concatenate(indices_shape, var_subshape, &concat));
+  TF_RETURN_IF_ERROR(
+      InferenceContext::Rank(c->input(2)) == 0
+          ? Status::OK()
+          : c->Merge(c->input(2), concat, &unused_updates_shape));
+  return Status::OK();
+}
+
+}  // namespace
+
 REGISTER_OP("ResourceScatterAdd")
     .Input("resource: resource")
     .Input("indices: Tindices")
     .Input("updates: dtype")
     .Attr("dtype: numbertype")
     .Attr("Tindices: {int32, int64}")
-    .SetShapeFn([](InferenceContext* c) {
-      ShapeAndType handle_shape_and_type;
-      TF_RETURN_IF_ERROR(
-          ValidateVariableResourceHandle(c, &handle_shape_and_type));
-      ShapeHandle var_shape = handle_shape_and_type.shape;
-      ShapeHandle indices_shape = c->input(1);
+    .SetShapeFn(ResourceScatterUpdateShape);
 
-      ShapeHandle unused_updates_shape;
-      ShapeHandle concat;
-      ShapeHandle var_subshape;
-      TF_RETURN_IF_ERROR(c->Subshape(var_shape, 1, &var_subshape));
-      TF_RETURN_IF_ERROR(c->Concatenate(indices_shape, var_subshape, &concat));
-      TF_RETURN_IF_ERROR(c->Merge(c->input(2), concat, &unused_updates_shape));
-      return Status::OK();
-    });
+REGISTER_OP("ResourceScatterSub")
+    .Input("resource: resource")
+    .Input("indices: Tindices")
+    .Input("updates: dtype")
+    .Attr("dtype: numbertype")
+    .Attr("Tindices: {int32, int64}")
+    .SetShapeFn(ResourceScatterUpdateShape);
+
+REGISTER_OP("ResourceScatterMul")
+    .Input("resource: resource")
+    .Input("indices: Tindices")
+    .Input("updates: dtype")
+    .Attr("dtype: numbertype")
+    .Attr("Tindices: {int32, int64}")
+    .SetShapeFn(ResourceScatterUpdateShape);
+
+REGISTER_OP("ResourceScatterDiv")
+    .Input("resource: resource")
+    .Input("indices: Tindices")
+    .Input("updates: dtype")
+    .Attr("dtype: numbertype")
+    .Attr("Tindices: {int32, int64}")
+    .SetShapeFn(ResourceScatterUpdateShape);
+
+REGISTER_OP("ResourceScatterMin")
+    .Input("resource: resource")
+    .Input("indices: Tindices")
+    .Input("updates: dtype")
+    .Attr("dtype: numbertype")
+    .Attr("Tindices: {int32, int64}")
+    .SetShapeFn(ResourceScatterUpdateShape);
+
+REGISTER_OP("ResourceScatterMax")
+    .Input("resource: resource")
+    .Input("indices: Tindices")
+    .Input("updates: dtype")
+    .Attr("dtype: numbertype")
+    .Attr("Tindices: {int32, int64}")
+    .SetShapeFn(ResourceScatterUpdateShape);
 
 REGISTER_OP("ResourceScatterUpdate")
     .Input("resource: resource")
@@ -195,21 +243,7 @@ REGISTER_OP("ResourceScatterUpdate")
     .Input("updates: dtype")
     .Attr("dtype: type")
     .Attr("Tindices: {int32, int64}")
-    .SetShapeFn([](InferenceContext* c) {
-      ShapeAndType handle_shape_and_type;
-      TF_RETURN_IF_ERROR(
-          ValidateVariableResourceHandle(c, &handle_shape_and_type));
-      ShapeHandle var_shape = handle_shape_and_type.shape;
-      ShapeHandle indices_shape = c->input(1);
-
-      ShapeHandle unused_updates_shape;
-      ShapeHandle concat;
-      ShapeHandle var_subshape;
-      TF_RETURN_IF_ERROR(c->Subshape(var_shape, 1, &var_subshape));
-      TF_RETURN_IF_ERROR(c->Concatenate(indices_shape, var_subshape, &concat));
-      TF_RETURN_IF_ERROR(c->Merge(c->input(2), concat, &unused_updates_shape));
-      return Status::OK();
-    });
+    .SetShapeFn(ResourceScatterUpdateShape);
 
 REGISTER_OP("MutexV2")
     .Attr("container: string = ''")
diff --git a/tensorflow/core/ops/scoped_allocator_ops.cc b/tensorflow/core/ops/scoped_allocator_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f053a53f4cf02e89fce53461bfda0cb23756287b
--- /dev/null
+++ b/tensorflow/core/ops/scoped_allocator_ops.cc
@@ -0,0 +1,81 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+
+namespace tensorflow {
+
+REGISTER_OP("_ScopedAllocator")
+    .Output("output: T")
+    .Attr("shapes: list(shape)")
+    .Attr("T: type")
+    .Attr("sa_name: string")
+    .Attr("id: int")
+    .Attr("expected_call_count: int")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::ExplicitShape)
+    .Doc(R"doc(
+Allocates a mutable tensor that becomes available to appropriately annotated
+downstream Ops as backing store for their output tensor allocations via the
+ScopedAllocatorMgr.
+Returns a reference to this value.
+
+This is an experimental op for internal use only.  It is possible to use this
+op in unsafe ways.
+)doc");
+
+REGISTER_OP("_ScopedAllocatorConcat")
+    .Output("output: T")
+    .Input("backing: T")
+    .Input("inputs: N * T")
+    .Attr("shape: shape")
+    .Attr("T: type")
+    .Attr("sa_name: string")
+    .Attr("id: int")
+    .Attr("N: int >= 2")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::ExplicitShape)
+    .Doc(R"doc(
+Acts like a Concat Op that merges multple tensors into one, however it must
+only be used in conjunction with a ScopedAllocator which is backing the memory
+of all of its input tensors so that actually it just outputs a read-only
+reference to that ScopedAllocator's backing tensor.
+
+This is an experimental op for internal use only.  It is possible to use this
+op in unsafe ways.
+)doc");
+
+REGISTER_OP("_ScopedAllocatorSplit")
+    .Output("output: N * T")
+    .Input("concat: T")
+    .Input("split: N * T")
+    .Attr("T: type")
+    .Attr("sa_name: string")
+    .Attr("id: int")
+    .Attr("N: int >= 2")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::ExplicitShape)
+    .Doc(R"doc(
+Acts like a Concat Op that merges multple tensors into one, however it must
+only be used in conjunction with a ScopedAllocator which is backing the memory
+of all of its input tensors so that actually it just outputs a read-only
+reference to that ScopedAllocator's backing tensor.
+
+This is an experimental op for internal use only.  It is possible to use this
+op in unsafe ways.
+)doc");
+
+}  // end namespace tensorflow
diff --git a/tensorflow/core/ops/state_ops.cc b/tensorflow/core/ops/state_ops.cc
index 7a524b60c0aa711f36158b73b93fa91606266592..664f52452e3339e895f568f83e1fbf80cdd8f035 100644
--- a/tensorflow/core/ops/state_ops.cc
+++ b/tensorflow/core/ops/state_ops.cc
@@ -122,7 +122,10 @@ Status ScatterUpdateShape(InferenceContext* c) {
   ShapeHandle var_subshape;
   TF_RETURN_IF_ERROR(c->Subshape(var_shape, 1, &var_subshape));
   TF_RETURN_IF_ERROR(c->Concatenate(indices_shape, var_subshape, &concat));
-  TF_RETURN_IF_ERROR(c->Merge(c->input(2), concat, &unused_updates_shape));
+  TF_RETURN_IF_ERROR(
+      InferenceContext::Rank(c->input(2)) == 0
+          ? Status::OK()
+          : c->Merge(c->input(2), concat, &unused_updates_shape));
 
   c->set_output(0, var_shape);
   return Status::OK();
@@ -180,6 +183,26 @@ REGISTER_OP("ScatterDiv")
     .Attr("use_locking: bool = false")
     .SetShapeFn(ScatterUpdateShape);
 
+REGISTER_OP("ScatterMin")
+    .Input("ref: Ref(T)")
+    .Input("indices: Tindices")
+    .Input("updates: T")
+    .Output("output_ref: Ref(T)")
+    .Attr("T: {half, bfloat16, float, double, int32, int64}")
+    .Attr("Tindices: {int32, int64}")
+    .Attr("use_locking: bool = false")
+    .SetShapeFn(ScatterUpdateShape);
+
+REGISTER_OP("ScatterMax")
+    .Input("ref: Ref(T)")
+    .Input("indices: Tindices")
+    .Input("updates: T")
+    .Output("output_ref: Ref(T)")
+    .Attr("T: {half, bfloat16, float, double, int32, int64}")
+    .Attr("Tindices: {int32, int64}")
+    .Attr("use_locking: bool = false")
+    .SetShapeFn(ScatterUpdateShape);
+
 REGISTER_OP("ScatterNdUpdate")
     .Input("ref: Ref(T)")
     .Input("indices: Tindices")
diff --git a/tensorflow/core/ops/summary_ops.cc b/tensorflow/core/ops/summary_ops.cc
index aa7458f903cf76af660c04149ff50ac899987eac..742a221adcb101e3d2d152d60e343b666d3fb96b 100644
--- a/tensorflow/core/ops/summary_ops.cc
+++ b/tensorflow/core/ops/summary_ops.cc
@@ -22,15 +22,7 @@ REGISTER_OP("SummaryWriter")
     .Output("writer: resource")
     .Attr("shared_name: string = ''")
     .Attr("container: string = ''")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Returns a handle to be used to access a summary writer.
-
-The summary writer is an in-graph resource which can be used by ops to write
-summaries to event files.
-
-writer: the summary writer resource. Scalar handle.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("CreateSummaryFileWriter")
     .Input("writer: resource")
@@ -38,17 +30,7 @@ REGISTER_OP("CreateSummaryFileWriter")
     .Input("max_queue: int32")
     .Input("flush_millis: int32")
     .Input("filename_suffix: string")
-    .SetShapeFn(shape_inference::NoOutputs)
-    .Doc(R"doc(
-Creates a summary file writer accessible by the given resource handle.
-
-writer: A handle to the summary writer resource
-logdir: Directory where the event file will be written.
-max_queue: Size of the queue of pending events and summaries.
-flush_millis: How often, in milliseconds, to flush the pending events and
-  summaries to disk.
-filename_suffix: Every event file's name is suffixed with this suffix.
-)doc");
+    .SetShapeFn(shape_inference::NoOutputs);
 
 REGISTER_OP("CreateSummaryDbWriter")
     .Input("writer: resource")
@@ -56,47 +38,15 @@ REGISTER_OP("CreateSummaryDbWriter")
     .Input("experiment_name: string")
     .Input("run_name: string")
     .Input("user_name: string")
-    .SetShapeFn(shape_inference::NoOutputs)
-    .Doc(R"doc(
-Creates summary database writer accessible by given resource handle.
-
-This can be used to write tensors from the execution graph directly
-to a database. Only SQLite is supported right now. This function
-will create the schema if it doesn't exist. Entries in the Users,
-Experiments, and Runs tables will be created automatically if they
-don't already exist.
-
-writer: Handle to SummaryWriter resource to overwrite.
-db_uri: For example "file:/tmp/foo.sqlite".
-experiment_name: Can't contain ASCII control characters or <>. Case
-  sensitive. If empty, then the Run will not be associated with any
-  Experiment.
-run_name: Can't contain ASCII control characters or <>. Case sensitive.
-  If empty, then each Tag will not be associated with any Run.
-user_name: Must be valid as both a DNS label and Linux username. If
-  empty, then the Experiment will not be associated with any User.
-)doc");
+    .SetShapeFn(shape_inference::NoOutputs);
 
 REGISTER_OP("FlushSummaryWriter")
     .Input("writer: resource")
-    .SetShapeFn(shape_inference::NoOutputs)
-    .Doc(R"(
-Flushes the writer's unwritten events.
-
-writer: A handle to the summary writer resource.
-)");
+    .SetShapeFn(shape_inference::NoOutputs);
 
 REGISTER_OP("CloseSummaryWriter")
     .Input("writer: resource")
-    .SetShapeFn(shape_inference::NoOutputs)
-    .Doc(R"(
-Flushes and closes the summary writer.
-
-Also removes it from the resource manager. To reopen, use another
-CreateSummaryFileWriter op.
-
-writer: A handle to the summary writer resource.
-)");
+    .SetShapeFn(shape_inference::NoOutputs);
 
 REGISTER_OP("WriteSummary")
     .Input("writer: resource")
@@ -105,31 +55,12 @@ REGISTER_OP("WriteSummary")
     .Input("tag: string")
     .Input("summary_metadata: string")
     .Attr("T: type")
-    .SetShapeFn(shape_inference::NoOutputs)
-    .Doc(R"doc(
-Outputs a `Summary` protocol buffer with a tensor.
-
-writer: A handle to a summary writer.
-step: The step to write the summary for.
-tensor: A tensor to serialize.
-tag: The summary's tag.
-summary_metadata: Serialized SummaryMetadata protocol buffer containing
- plugin-related metadata for this summary.
-)doc");
+    .SetShapeFn(shape_inference::NoOutputs);
 
 REGISTER_OP("ImportEvent")
     .Input("writer: resource")
     .Input("event: string")
-    .SetShapeFn(shape_inference::NoOutputs)
-    .Doc(R"doc(
-Outputs a `tf.Event` protocol buffer.
-
-When CreateSummaryDbWriter is being used, this op can be useful for
-importing data from event logs.
-
-writer: A handle to a summary writer.
-event: A string containing a binary-encoded tf.Event proto.
-)doc");
+    .SetShapeFn(shape_inference::NoOutputs);
 
 REGISTER_OP("WriteScalarSummary")
     .Input("writer: resource")
@@ -137,17 +68,7 @@ REGISTER_OP("WriteScalarSummary")
     .Input("tag: string")
     .Input("value: T")
     .Attr("T: realnumbertype")
-    .SetShapeFn(shape_inference::NoOutputs)
-    .Doc(R"doc(
-Writes a `Summary` protocol buffer with scalar values.
-
-The input `tag` and `value` must have the scalars.
-
-writer: A handle to a summary writer.
-step: The step to write the summary for.
-tag: Tag for the summary.
-value: Value for the summary.
-)doc");
+    .SetShapeFn(shape_inference::NoOutputs);
 
 REGISTER_OP("WriteHistogramSummary")
     .Input("writer: resource")
@@ -155,21 +76,7 @@ REGISTER_OP("WriteHistogramSummary")
     .Input("tag: string")
     .Input("values: T")
     .Attr("T: realnumbertype = DT_FLOAT")
-    .SetShapeFn(shape_inference::NoOutputs)
-    .Doc(R"doc(
-Writes a `Summary` protocol buffer with a histogram.
-
-The generated
-[`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
-has one summary value containing a histogram for `values`.
-
-This op reports an `InvalidArgument` error if any value is not finite.
-
-writer: A handle to a summary writer.
-step: The step to write the summary for.
-tag: Scalar.  Tag to use for the `Summary.Value`.
-values: Any shape. Values to use to build the histogram.
-)doc");
+    .SetShapeFn(shape_inference::NoOutputs);
 
 REGISTER_OP("WriteImageSummary")
     .Input("writer: resource")
@@ -179,52 +86,7 @@ REGISTER_OP("WriteImageSummary")
     .Input("bad_color: uint8")
     .Attr("max_images: int >= 1 = 3")
     .Attr("T: {uint8, float, half} = DT_FLOAT")
-    .SetShapeFn(shape_inference::NoOutputs)
-    .Doc(R"doc(
-Writes a `Summary` protocol buffer with images.
-
-The summary has up to `max_images` summary values containing images. The
-images are built from `tensor` which must be 4-D with shape `[batch_size,
-height, width, channels]` and where `channels` can be:
-
-*  1: `tensor` is interpreted as Grayscale.
-*  3: `tensor` is interpreted as RGB.
-*  4: `tensor` is interpreted as RGBA.
-
-The images have the same number of channels as the input tensor. For float
-input, the values are normalized one image at a time to fit in the range
-`[0, 255]`.  `uint8` values are unchanged.  The op uses two different
-normalization algorithms:
-
-*  If the input values are all positive, they are rescaled so the largest one
-   is 255.
-
-*  If any input value is negative, the values are shifted so input value 0.0
-   is at 127.  They are then rescaled so that either the smallest value is 0,
-   or the largest one is 255.
-
-The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-build the `tag` of the summary values:
-
-*  If `max_images` is 1, the summary value tag is '*tag*/image'.
-*  If `max_images` is greater than 1, the summary value tags are
-   generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
-
-The `bad_color` argument is the color to use in the generated images for
-non-finite input values.  It is a `unit8` 1-D tensor of length `channels`.
-Each element must be in the range `[0, 255]` (It represents the value of a
-pixel in the output image).  Non-finite values in the input tensor are
-replaced by this tensor in the output image.  The default value is the color
-red.
-
-writer: A handle to a summary writer.
-step: The step to write the summary for.
-tag: Scalar. Used to build the `tag` attribute of the summary values.
-tensor: 4-D of shape `[batch_size, height, width, channels]` where
-  `channels` is 1, 3, or 4.
-max_images: Max number of batch elements to generate images for.
-bad_color: Color to use for pixels with non-finite values.
-)doc");
+    .SetShapeFn(shape_inference::NoOutputs);
 
 REGISTER_OP("WriteAudioSummary")
     .Input("writer: resource")
@@ -233,41 +95,12 @@ REGISTER_OP("WriteAudioSummary")
     .Input("tensor: float")
     .Input("sample_rate: float")
     .Attr("max_outputs: int >= 1 = 3")
-    .SetShapeFn(shape_inference::NoOutputs)
-    .Doc(R"doc(
-Writes a `Summary` protocol buffer with audio.
-
-The summary has up to `max_outputs` summary values containing audio. The
-audio is built from `tensor` which must be 3-D with shape `[batch_size,
-frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
-assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
-
-The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-build the `tag` of the summary values:
-
-*  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
-*  If `max_outputs` is greater than 1, the summary value tags are
-   generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
-
-writer: A handle to a summary writer.
-step: The step to write the summary for.
-tag: Scalar. Used to build the `tag` attribute of the summary values.
-tensor: 2-D of shape `[batch_size, frames]`.
-sample_rate: The sample rate of the signal in hertz.
-max_outputs: Max number of batch elements to generate audio for.
-)doc");
+    .SetShapeFn(shape_inference::NoOutputs);
 
 REGISTER_OP("WriteGraphSummary")
     .Input("writer: resource")
     .Input("step: int64")
     .Input("tensor: string")
-    .SetShapeFn(shape_inference::NoOutputs)
-    .Doc(R"doc(
-Writes a `GraphDef` protocol buffer to a `SummaryWriter`.
-
-writer: Handle of `SummaryWriter`.
-step: The step to write the summary for.
-tensor: A scalar string of the serialized tf.GraphDef proto.
-)doc");
+    .SetShapeFn(shape_inference::NoOutputs);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/abi.cc b/tensorflow/core/platform/abi.cc
index 4df62734e96c86fdbdae4dcf34f7a1f2a6583d5c..e597a490d619e55ad491d108c4a536727431b92b 100644
--- a/tensorflow/core/platform/abi.cc
+++ b/tensorflow/core/platform/abi.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/platform/abi.h"
 
-#if defined(PLATFORM_WINDOWS)
+#if defined(_MSC_VER)
 #include <windows.h>
 #include <cstring>
 #else
@@ -26,19 +26,19 @@ limitations under the License.
 #include <memory>
 #include <string>
 
-#if defined(PLATFORM_WINDOWS)
+#if defined(_MSC_VER)
 
 extern "C" char* __unDName(char* output_string, const char* name,
                            int max_string_length, void* (*p_alloc)(std::size_t),
                            void (*p_free)(void*), unsigned short disable_flags);
 
-#endif  // defined(PLATFORM_WINDOWS)
+#endif  // defined(_MSC_VER)
 
 namespace tensorflow {
 namespace port {
 
 std::string MaybeAbiDemangle(const char* name) {
-#if defined(PLATFORM_WINDOWS)
+#if defined(_MSC_VER)
   std::unique_ptr<char> demangled{__unDName(nullptr, name, 0, std::malloc,
                                             std::free,
                                             static_cast<unsigned short>(0))};
diff --git a/tensorflow/core/platform/cloud/BUILD b/tensorflow/core/platform/cloud/BUILD
index 21636641e7a35c4f3757cd9bb9f9e90ec51620c2..be84316c482aa5eb9031388c3d3694a1ae3514ea 100644
--- a/tensorflow/core/platform/cloud/BUILD
+++ b/tensorflow/core/platform/cloud/BUILD
@@ -14,20 +14,6 @@ load(
     "if_windows",
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        include = [
-            "**/*",
-        ],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 cc_library(
     name = "expiring_lru_cache",
     hdrs = ["expiring_lru_cache.h"],
@@ -99,6 +85,7 @@ cc_library(
         ":retrying_utils",
         ":time_util",
         "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "@jsoncpp_git//:jsoncpp",
     ],
@@ -277,6 +264,7 @@ tf_cc_test(
     deps = [
         ":gcs_file_system",
         ":http_request_fake",
+        "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
     ],
diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc
index 1691826483a3227ea00b3b37e82002f3ad8d5225..3c0dc13d75fb113ece6960e6cf1e39e9c3f9adf1 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system.cc
@@ -172,7 +172,7 @@ Status ParseGcsPath(StringPiece fname, bool empty_object_ok, string* bucket,
     return errors::InvalidArgument("GCS path doesn't contain a bucket name: ",
                                    fname);
   }
-  objectp.Consume("/");
+  str_util::ConsumePrefix(&objectp, "/");
   *object = objectp.ToString();
   if (!empty_object_ok && object->empty()) {
     return errors::InvalidArgument("GCS path doesn't contain an object name: ",
@@ -535,7 +535,8 @@ class GcsWritableFile : public WritableFile {
       *uploaded = 0;
     } else {
       StringPiece range_piece(received_range);
-      range_piece.Consume("bytes=");  // May or may not be present.
+      str_util::ConsumePrefix(&range_piece,
+                              "bytes=");  // May or may not be present.
       std::vector<int64> range_parts;
       if (!str_util::SplitAndParseAsInts(range_piece, '-', &range_parts) ||
           range_parts.size() != 2) {
@@ -1172,7 +1173,7 @@ Status GcsFileSystem::GetChildrenBounded(const string& dirname,
         // 'object_prefix', which is part of 'dirname', should be removed from
         // the beginning of 'name'.
         StringPiece relative_path(name);
-        if (!relative_path.Consume(object_prefix)) {
+        if (!str_util::ConsumePrefix(&relative_path, object_prefix)) {
           return errors::Internal(strings::StrCat(
               "Unexpected response: the returned file name ", name,
               " doesn't match the prefix ", object_prefix));
@@ -1201,7 +1202,7 @@ Status GcsFileSystem::GetChildrenBounded(const string& dirname,
         }
         const string& prefix_str = prefix.asString();
         StringPiece relative_path(prefix_str);
-        if (!relative_path.Consume(object_prefix)) {
+        if (!str_util::ConsumePrefix(&relative_path, object_prefix)) {
           return errors::Internal(
               "Unexpected response: the returned folder name ", prefix_str,
               " doesn't match the prefix ", object_prefix);
diff --git a/tensorflow/core/platform/cloud/gcs_file_system_test.cc b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
index 8516421614481cbb5e96cacd4b1f16aded883a91..2fbde9b6a79883b674df15fb4e69b5cf3cc643a2 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system_test.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/platform/cloud/gcs_file_system.h"
 #include <fstream>
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/cloud/http_request_fake.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -584,8 +585,9 @@ TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadAllAttemptsFail) {
   TF_EXPECT_OK(file->Append("content2"));
   const auto& status = file->Close();
   EXPECT_EQ(errors::Code::ABORTED, status.code());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("All 10 retry attempts failed. The last failure: "
+  EXPECT_TRUE(
+      str_util::StrContains(status.error_message(),
+                            "All 10 retry attempts failed. The last failure: "
                             "Unavailable: important HTTP error 503"))
       << status;
 }
@@ -641,13 +643,12 @@ TEST(GcsFileSystemTest, NewWritableFile_UploadReturns410) {
   const auto& status = file->Close();
   EXPECT_EQ(errors::Code::UNAVAILABLE, status.code());
   EXPECT_TRUE(
-      StringPiece(status.error_message())
-          .contains(
-              "Upload to gs://bucket/path/writeable.txt failed, caused by: "
-              "Not found: important HTTP error 410"))
+      str_util::StrContains(status.error_message(),
+                            "Upload to gs://bucket/path/writeable.txt failed, "
+                            "caused by: Not found: important HTTP error 410"))
       << status;
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("when uploading gs://bucket/path/writeable.txt"))
+  EXPECT_TRUE(str_util::StrContains(
+      status.error_message(), "when uploading gs://bucket/path/writeable.txt"))
       << status;
 }
 
diff --git a/tensorflow/core/platform/cloud/retrying_file_system_test.cc b/tensorflow/core/platform/cloud/retrying_file_system_test.cc
index d3f763bb3c845436e8458135a0a754d8cb002957..ee6886fef70328dafd199ba94ef2af2c58e0de29 100644
--- a/tensorflow/core/platform/cloud/retrying_file_system_test.cc
+++ b/tensorflow/core/platform/cloud/retrying_file_system_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/platform/cloud/retrying_file_system.h"
 #include <fstream>
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -245,7 +246,7 @@ TEST(RetryingFileSystemTest, NewRandomAccessFile_AllRetriesFailed) {
   char scratch[10];
   const auto& status = random_access_file->Read(0, 10, &result, scratch);
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -399,7 +400,7 @@ TEST(RetryingFileSystemTest, NewWritableFile_AllRetriesFailed) {
   // Use it and check the results.
   const auto& status = writable_file->Sync();
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -428,7 +429,7 @@ TEST(RetryingFileSystemTest, NewReadOnlyMemoryRegionFromFile_AllRetriesFailed) {
   const auto& status =
       fs.NewReadOnlyMemoryRegionFromFile("filename.txt", &result);
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -454,7 +455,7 @@ TEST(RetryingFileSystemTest, GetChildren_AllRetriesFailed) {
   std::vector<string> result;
   const auto& status = fs.GetChildren("gs://path", &result);
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -481,7 +482,7 @@ TEST(RetryingFileSystemTest, GetMatchingPaths_AllRetriesFailed) {
   std::vector<string> result;
   const auto& status = fs.GetMatchingPaths("gs://path/dir", &result);
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -506,7 +507,7 @@ TEST(RetryingFileSystemTest, DeleteFile_AllRetriesFailed) {
   std::vector<string> result;
   const auto& status = fs.DeleteFile("gs://path/file.txt");
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -531,7 +532,7 @@ TEST(RetryingFileSystemTest, CreateDir_AllRetriesFailed) {
   std::vector<string> result;
   const auto& status = fs.CreateDir("gs://path/newdir");
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -556,7 +557,7 @@ TEST(RetryingFileSystemTest, DeleteDir_AllRetriesFailed) {
   std::vector<string> result;
   const auto& status = fs.DeleteDir("gs://path/dir");
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -582,7 +583,7 @@ TEST(RetryingFileSystemTest, GetFileSize_AllRetriesFailed) {
   uint64 size;
   const auto& status = fs.GetFileSize("gs://path/file.txt", &size);
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -605,7 +606,7 @@ TEST(RetryingFileSystemTest, RenameFile_AllRetriesFailed) {
 
   const auto& status = fs.RenameFile("old_name", "new_name");
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -630,7 +631,7 @@ TEST(RetryingFileSystemTest, Stat_AllRetriesFailed) {
   FileStatistics stat;
   const auto& status = fs.Stat("file_name", &stat);
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -642,7 +643,7 @@ TEST(RetryingFileSystemTest, FileExists_AllRetriesFailed) {
 
   const auto& status = fs.FileExists("file_name");
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -677,7 +678,7 @@ TEST(RetryingFileSystemTest, IsDirectory_AllRetriesFailed) {
 
   const auto& status = fs.IsDirectory("gs://path/dir");
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -706,7 +707,7 @@ TEST(RetryingFileSystemTest, DeleteRecursively_AllRetriesFailed) {
   const auto& status =
       fs.DeleteRecursively("gs://path/dir", &undeleted_files, &undeleted_dirs);
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
diff --git a/tensorflow/core/platform/cloud/retrying_utils_test.cc b/tensorflow/core/platform/cloud/retrying_utils_test.cc
index 6eb340e09438eafbe59844a378aa06801ed3b4bf..1b6527618a8e0fa1261b96bd79bdd8e5e2e6f8d1 100644
--- a/tensorflow/core/platform/cloud/retrying_utils_test.cc
+++ b/tensorflow/core/platform/cloud/retrying_utils_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/platform/cloud/retrying_utils.h"
 #include <fstream>
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -31,10 +32,9 @@ TEST(RetryingUtilsTest, CallWithRetries_RetryDelays) {
 
   const auto& status = RetryingUtils::CallWithRetries(f, 500000L, sleep);
   EXPECT_EQ(errors::Code::ABORTED, status.code());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("All 10 retry attempts "
-                            "failed. The last failure: "
-                            "Unavailable: Failed."))
+  EXPECT_TRUE(str_util::StrContains(
+      status.error_message(),
+      "All 10 retry attempts failed. The last failure: Unavailable: Failed."))
       << status;
 
   EXPECT_EQ(10, requested_delays.size());
diff --git a/tensorflow/core/platform/cpu_info.h b/tensorflow/core/platform/cpu_info.h
index 331f3e525169a93fa01739eefdf2dc6c588980a0..bb77650e26e7e7bd3391d4d0b5131c5a386a6dc6 100644
--- a/tensorflow/core/platform/cpu_info.h
+++ b/tensorflow/core/platform/cpu_info.h
@@ -18,7 +18,7 @@ limitations under the License.
 
 #include <string>
 
-#if defined(PLATFORM_WINDOWS)
+#if defined(_MSC_VER)
 #include "tensorflow/core/platform/windows/cpu_info.h"
 #endif
 
diff --git a/tensorflow/core/platform/default/build_config/BUILD b/tensorflow/core/platform/default/build_config/BUILD
index 2cd607edbe554cd18d21626e258176e8570282ed..447056eb4b001b8e40eebc9c6e165023286d3c1f 100644
--- a/tensorflow/core/platform/default/build_config/BUILD
+++ b/tensorflow/core/platform/default/build_config/BUILD
@@ -128,6 +128,11 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "stacktrace",
+    srcs = [],
+)
+
 cc_library(
     name = "gif",
     copts = tf_copts(),
@@ -218,15 +223,3 @@ alias(
     actual = ":mobile_srcs",
     visibility = ["//visibility:public"],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/core/platform/default/logging.h b/tensorflow/core/platform/default/logging.h
index f0efa31d5576393e9d9bba6e39a454b2a33cddc3..2c134f1be931982930047850736d1d3a33fdffcc 100644
--- a/tensorflow/core/platform/default/logging.h
+++ b/tensorflow/core/platform/default/logging.h
@@ -64,11 +64,11 @@ class LogMessageFatal : public LogMessage {
 };
 
 #define _TF_LOG_INFO \
-  ::tensorflow::internal::LogMessage(__FILE__, __LINE__, tensorflow::INFO)
+  ::tensorflow::internal::LogMessage(__FILE__, __LINE__, ::tensorflow::INFO)
 #define _TF_LOG_WARNING \
-  ::tensorflow::internal::LogMessage(__FILE__, __LINE__, tensorflow::WARNING)
+  ::tensorflow::internal::LogMessage(__FILE__, __LINE__, ::tensorflow::WARNING)
 #define _TF_LOG_ERROR \
-  ::tensorflow::internal::LogMessage(__FILE__, __LINE__, tensorflow::ERROR)
+  ::tensorflow::internal::LogMessage(__FILE__, __LINE__, ::tensorflow::ERROR)
 #define _TF_LOG_FATAL \
   ::tensorflow::internal::LogMessageFatal(__FILE__, __LINE__)
 
diff --git a/tensorflow/core/platform/default/mutex.cc b/tensorflow/core/platform/default/mutex.cc
new file mode 100644
index 0000000000000000000000000000000000000000..79830a4738afeb2a19e3e3811be4c88e13f06090
--- /dev/null
+++ b/tensorflow/core/platform/default/mutex.cc
@@ -0,0 +1,89 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/platform/mutex.h"
+#include <chrono>
+#include <condition_variable>
+#include "nsync_cv.h"
+#include "nsync_mu.h"
+
+namespace tensorflow {
+
+// Check that the external_mu_space struct used to reserve space for the mutex
+// in tensorflow::mutex is big enough.
+static_assert(sizeof(nsync::nsync_mu) <= sizeof(mutex::external_mu_space),
+              "tensorflow::mutex::external_mu_space needs to be bigger");
+
+// Cast a pointer to mutex::external_mu_space to a pointer to the mutex mutex
+// representation.  This is done so that the header files for nsync_mu do not
+// need to be included in every file that uses tensorflow's mutex.
+static inline nsync::nsync_mu *mu_cast(mutex::external_mu_space *mu) {
+  return reinterpret_cast<nsync::nsync_mu *>(mu);
+}
+
+mutex::mutex() { nsync::nsync_mu_init(mu_cast(&mu_)); }
+
+void mutex::lock() { nsync::nsync_mu_lock(mu_cast(&mu_)); }
+
+bool mutex::try_lock() { return nsync::nsync_mu_trylock(mu_cast(&mu_)) != 0; };
+
+void mutex::unlock() { nsync::nsync_mu_unlock(mu_cast(&mu_)); }
+
+void mutex::lock_shared() { nsync::nsync_mu_rlock(mu_cast(&mu_)); }
+
+bool mutex::try_lock_shared() {
+  return nsync::nsync_mu_rtrylock(mu_cast(&mu_)) != 0;
+};
+
+void mutex::unlock_shared() { nsync::nsync_mu_runlock(mu_cast(&mu_)); }
+
+// Check that the external_cv_space struct used to reserve space for the
+// condition variable in tensorflow::condition_variable is big enough.
+static_assert(
+    sizeof(nsync::nsync_cv) <= sizeof(condition_variable::external_cv_space),
+    "tensorflow::condition_variable::external_cv_space needs to be bigger");
+
+// Cast a pointer to mutex::external_cv_space to a pointer to the condition
+// variable representation.  This is done so that the header files for nsync_mu
+// do not need to be included in every file that uses tensorflow's
+// condition_variable.
+static inline nsync::nsync_cv *cv_cast(
+    condition_variable::external_cv_space *cv) {
+  return reinterpret_cast<nsync::nsync_cv *>(cv);
+}
+
+condition_variable::condition_variable() {
+  nsync::nsync_cv_init(cv_cast(&cv_));
+}
+
+void condition_variable::wait(mutex_lock &lock) {
+  nsync::nsync_cv_wait(cv_cast(&cv_), mu_cast(&lock.mutex()->mu_));
+}
+
+std::cv_status condition_variable::wait_until_system_clock(
+    mutex_lock &lock,
+    const std::chrono::system_clock::time_point timeout_time) {
+  int r = nsync::nsync_cv_wait_with_deadline(
+      cv_cast(&cv_), mu_cast(&lock.mutex()->mu_), timeout_time, nullptr);
+  return r ? std::cv_status::timeout : std::cv_status::no_timeout;
+}
+
+void condition_variable::notify_one() { nsync::nsync_cv_signal(cv_cast(&cv_)); }
+
+void condition_variable::notify_all() {
+  nsync::nsync_cv_broadcast(cv_cast(&cv_));
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/platform/default/mutex.h b/tensorflow/core/platform/default/mutex.h
index 044c754e80bd0dee04c73e969c325a2aa4a89c31..a12d92795e14665f62222879488ab1cb89da8cfc 100644
--- a/tensorflow/core/platform/default/mutex.h
+++ b/tensorflow/core/platform/default/mutex.h
@@ -22,9 +22,8 @@ limitations under the License.
 #include <chrono>
 #include <condition_variable>
 #include <mutex>
-#include "nsync_cv.h"
-#include "nsync_mu.h"
 #include "tensorflow/core/platform/thread_annotations.h"
+
 namespace tensorflow {
 
 #undef mutex_lock
@@ -38,26 +37,26 @@ class condition_variable;
 // lock.
 class LOCKABLE mutex {
  public:
-  mutex() { nsync::nsync_mu_init(&mu_); }
-  // The default implementation of nsync_mutex is safe to use after the linker
-  // initializations
+  mutex();
+  // The default implementation of the underlying mutex is safe to use after
+  // the linker initialization to zero.
   explicit mutex(LinkerInitialized x) {}
 
-  void lock() EXCLUSIVE_LOCK_FUNCTION() { nsync::nsync_mu_lock(&mu_); }
-  bool try_lock() EXCLUSIVE_TRYLOCK_FUNCTION(true) {
-    return nsync::nsync_mu_trylock(&mu_) != 0;
-  };
-  void unlock() UNLOCK_FUNCTION() { nsync::nsync_mu_unlock(&mu_); }
+  void lock() EXCLUSIVE_LOCK_FUNCTION();
+  bool try_lock() EXCLUSIVE_TRYLOCK_FUNCTION(true);
+  void unlock() UNLOCK_FUNCTION();
+
+  void lock_shared() SHARED_LOCK_FUNCTION();
+  bool try_lock_shared() SHARED_TRYLOCK_FUNCTION(true);
+  void unlock_shared() UNLOCK_FUNCTION();
 
-  void lock_shared() SHARED_LOCK_FUNCTION() { nsync::nsync_mu_rlock(&mu_); }
-  bool try_lock_shared() SHARED_TRYLOCK_FUNCTION(true) {
-    return nsync::nsync_mu_rtrylock(&mu_) != 0;
+  struct external_mu_space {
+    void* space[2];
   };
-  void unlock_shared() UNLOCK_FUNCTION() { nsync::nsync_mu_runlock(&mu_); }
 
  private:
   friend class condition_variable;
-  nsync::nsync_mu mu_;
+  external_mu_space mu_;
 };
 
 // Mimic a subset of the std::unique_lock<tensorflow::mutex> functionality.
@@ -139,26 +138,29 @@ class SCOPED_LOCKABLE tf_shared_lock {
 // Mimic std::condition_variable.
 class condition_variable {
  public:
-  condition_variable() { nsync::nsync_cv_init(&cv_); }
+  condition_variable();
 
-  void wait(mutex_lock& lock) {
-    nsync::nsync_cv_wait(&cv_, &lock.mutex()->mu_);
-  }
+  void wait(mutex_lock& lock);
   template <class Rep, class Period>
   std::cv_status wait_for(mutex_lock& lock,
                           std::chrono::duration<Rep, Period> dur) {
-    int r = nsync::nsync_cv_wait_with_deadline(
-        &cv_, &lock.mutex()->mu_, std::chrono::system_clock::now() + dur,
-        nullptr);
-    return r ? std::cv_status::timeout : std::cv_status::no_timeout;
+    return wait_until_system_clock(lock,
+                                   std::chrono::system_clock::now() + dur);
   }
-  void notify_one() { nsync::nsync_cv_signal(&cv_); }
-  void notify_all() { nsync::nsync_cv_broadcast(&cv_); }
+  void notify_one();
+  void notify_all();
+
+  struct external_cv_space {
+    void* space[2];
+  };
 
  private:
   friend ConditionResult WaitForMilliseconds(mutex_lock* mu,
                                              condition_variable* cv, int64 ms);
-  nsync::nsync_cv cv_;
+  std::cv_status wait_until_system_clock(
+      mutex_lock& lock,
+      const std::chrono::system_clock::time_point timeout_time);
+  external_cv_space cv_;
 };
 
 inline ConditionResult WaitForMilliseconds(mutex_lock* mu,
diff --git a/tensorflow/core/platform/default/protobuf.h b/tensorflow/core/platform/default/protobuf.h
index 03d8b6c2380151f6148a560648317aa8a98a5e2e..c732c76ff79412cc2c676757343bb5d669c84634 100644
--- a/tensorflow/core/platform/default/protobuf.h
+++ b/tensorflow/core/platform/default/protobuf.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include "google/protobuf/arena.h"
 #include "google/protobuf/compiler/importer.h"
 #include "google/protobuf/descriptor.h"
+#include "google/protobuf/dynamic_message.h"
 #include "google/protobuf/io/coded_stream.h"
 #include "google/protobuf/io/zero_copy_stream.h"
 #include "google/protobuf/io/zero_copy_stream_impl_lite.h"
diff --git a/tensorflow/core/platform/default/tracing_impl.h b/tensorflow/core/platform/default/tracing_impl.h
index e813e4a17aca918582e8346d4bf2655724a143b3..78345488969ee3284f31504e2a8bfaf01bf19d14 100644
--- a/tensorflow/core/platform/default/tracing_impl.h
+++ b/tensorflow/core/platform/default/tracing_impl.h
@@ -22,7 +22,6 @@ limitations under the License.
 // IWYU pragma: friend third_party/tensorflow/core/platform/tracing.h
 
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/platform/tracing.h"
 
diff --git a/tensorflow/core/platform/denormal.cc b/tensorflow/core/platform/denormal.cc
index 3631d9ddf99430372c11403dba56c14331a3db24..82cbc43b4f83f9a02d09700fdb9b1546e1d45d49 100644
--- a/tensorflow/core/platform/denormal.cc
+++ b/tensorflow/core/platform/denormal.cc
@@ -13,8 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <tuple>
+
 #include "tensorflow/core/platform/denormal.h"
-#include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/platform.h"
diff --git a/tensorflow/core/platform/env.cc b/tensorflow/core/platform/env.cc
index 12509c250eab9047b869694e930bf523a975a4f8..b9a9ef85eb16e62bf9fecf01910fb98673e8cf7b 100644
--- a/tensorflow/core/platform/env.cc
+++ b/tensorflow/core/platform/env.cc
@@ -33,7 +33,6 @@ limitations under the License.
 #endif
 
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
diff --git a/tensorflow/core/platform/env.h b/tensorflow/core/platform/env.h
index 4ce4e0b4e024d50ae2bd081ec7b8b155060d2a4a..9192f7ba10d466aa8bcfc2b2536d5d42a9263533 100644
--- a/tensorflow/core/platform/env.h
+++ b/tensorflow/core/platform/env.h
@@ -291,10 +291,10 @@ class Env {
   virtual string FormatLibraryFileName(const string& name,
                                        const string& version) = 0;
 
- private:
   // Returns a possible list of local temporary directories.
-  void GetLocalTempDirectories(std::vector<string>* list);
+  virtual void GetLocalTempDirectories(std::vector<string>* list) = 0;
 
+ private:
   std::unique_ptr<FileSystemRegistry> file_system_registry_;
   TF_DISALLOW_COPY_AND_ASSIGN(Env);
   EnvTime* envTime = EnvTime::Default();
@@ -358,6 +358,10 @@ class EnvWrapper : public Env {
   }
 
  private:
+  void GetLocalTempDirectories(std::vector<string>* list) override {
+    target_->GetLocalTempDirectories(list);
+  }
+
   Env* target_;
 };
 
diff --git a/tensorflow/core/platform/env_test.cc b/tensorflow/core/platform/env_test.cc
index 47ddf0ccb93e827d410e87050d6802747fb84fbf..a70a417e6a2f3ade644f5d7377adf5ebc52d77e5 100644
--- a/tensorflow/core/platform/env_test.cc
+++ b/tensorflow/core/platform/env_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/null_file_system.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -372,9 +373,8 @@ TEST_F(DefaultEnvTest, CreateUniqueFileName) {
 
   EXPECT_TRUE(env->CreateUniqueFileName(&filename, suffix));
 
-  StringPiece str(filename);
-  EXPECT_TRUE(str.starts_with(prefix));
-  EXPECT_TRUE(str.ends_with(suffix));
+  EXPECT_TRUE(str_util::StartsWith(filename, prefix));
+  EXPECT_TRUE(str_util::EndsWith(filename, suffix));
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/file_system.cc b/tensorflow/core/platform/file_system.cc
index 271d73f5f1a7bd3e1301520aed09cbafd89c8ebc..b55e94d552ed3a66bd05930702acd9633cd02f81 100644
--- a/tensorflow/core/platform/file_system.cc
+++ b/tensorflow/core/platform/file_system.cc
@@ -18,41 +18,15 @@ limitations under the License.
 #include <deque>
 
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/lib/gtl/map_util.h"
-#include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/file_system.h"
 #include "tensorflow/core/platform/platform.h"
-#include "tensorflow/core/platform/protobuf.h"
 
 namespace tensorflow {
 
-namespace {
-
-constexpr int kNumThreads = 8;
-
-// Run a function in parallel using a ThreadPool, but skip the ThreadPool
-// on the iOS platform due to its problems with more than a few threads.
-void ForEach(int first, int last, const std::function<void(int)>& f) {
-#if TARGET_OS_IPHONE
-  for (int i = first; i < last; i++) {
-    f(i);
-  }
-#else
-  int num_threads = std::min(kNumThreads, last - first);
-  thread::ThreadPool threads(Env::Default(), "ForEach", num_threads);
-  for (int i = first; i < last; i++) {
-    threads.Schedule([f, i] { f(i); });
-  }
-#endif
-}
-
-}  // anonymous namespace
-
 FileSystem::~FileSystem() {}
 
 string FileSystem::TranslateName(const string& name) const {
@@ -97,76 +71,6 @@ bool FileSystem::FilesExist(const std::vector<string>& files,
   return result;
 }
 
-Status FileSystem::GetMatchingPaths(const string& pattern,
-                                    std::vector<string>* results) {
-  results->clear();
-  // Find the fixed prefix by looking for the first wildcard.
-  string fixed_prefix = pattern.substr(0, pattern.find_first_of("*?[\\"));
-  string eval_pattern = pattern;
-  std::vector<string> all_files;
-  string dir = io::Dirname(fixed_prefix).ToString();
-  // If dir is empty then we need to fix up fixed_prefix and eval_pattern to
-  // include . as the top level directory.
-  if (dir.empty()) {
-    dir = ".";
-    fixed_prefix = io::JoinPath(dir, fixed_prefix);
-    eval_pattern = io::JoinPath(dir, pattern);
-  }
-
-  // Setup a BFS to explore everything under dir.
-  std::deque<string> dir_q;
-  dir_q.push_back(dir);
-  Status ret;  // Status to return.
-  // children_dir_status holds is_dir status for children. It can have three
-  // possible values: OK for true; FAILED_PRECONDITION for false; CANCELLED
-  // if we don't calculate IsDirectory (we might do that because there isn't
-  // any point in exploring that child path).
-  std::vector<Status> children_dir_status;
-  while (!dir_q.empty()) {
-    string current_dir = dir_q.front();
-    dir_q.pop_front();
-    std::vector<string> children;
-    Status s = GetChildren(current_dir, &children);
-    ret.Update(s);
-    if (children.empty()) continue;
-    // This IsDirectory call can be expensive for some FS. Parallelizing it.
-    children_dir_status.resize(children.size());
-    ForEach(0, children.size(),
-            [this, &current_dir, &children, &fixed_prefix,
-             &children_dir_status](int i) {
-              const string child_path = io::JoinPath(current_dir, children[i]);
-              // In case the child_path doesn't start with the fixed_prefix then
-              // we don't need to explore this path.
-              if (!StringPiece(child_path).starts_with(fixed_prefix)) {
-                children_dir_status[i] = Status(tensorflow::error::CANCELLED,
-                                                "Operation not needed");
-              } else {
-                children_dir_status[i] = IsDirectory(child_path);
-              }
-            });
-    for (int i = 0; i < children.size(); ++i) {
-      const string child_path = io::JoinPath(current_dir, children[i]);
-      // If the IsDirectory call was cancelled we bail.
-      if (children_dir_status[i].code() == tensorflow::error::CANCELLED) {
-        continue;
-      }
-      // If the child is a directory add it to the queue.
-      if (children_dir_status[i].ok()) {
-        dir_q.push_back(child_path);
-      }
-      all_files.push_back(child_path);
-    }
-  }
-
-  // Match all obtained files to the input pattern.
-  for (const auto& f : all_files) {
-    if (Env::Default()->MatchPath(f, eval_pattern)) {
-      results->push_back(f);
-    }
-  }
-  return ret;
-}
-
 Status FileSystem::DeleteRecursively(const string& dirname,
                                      int64* undeleted_files,
                                      int64* undeleted_dirs) {
@@ -244,7 +148,7 @@ Status FileSystem::RecursivelyCreateDir(const string& dirname) {
       return status;
     }
     // Basename returns "" for / ending dirs.
-    if (!remaining_dir.ends_with("/")) {
+    if (!str_util::EndsWith(remaining_dir, "/")) {
       sub_dirs.push_back(io::Basename(remaining_dir));
     }
     remaining_dir = io::Dirname(remaining_dir);
diff --git a/tensorflow/core/platform/file_system.h b/tensorflow/core/platform/file_system.h
index 3085b6958fd921ae124b885107e807f0a02e1d9d..077b1d79cfb259c6d497f37e7f06d0da189f3ff5 100644
--- a/tensorflow/core/platform/file_system.h
+++ b/tensorflow/core/platform/file_system.h
@@ -27,7 +27,6 @@ limitations under the License.
 #include "tensorflow/core/platform/file_statistics.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/platform.h"
-#include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/types.h"
 
 #ifdef PLATFORM_WINDOWS
@@ -139,10 +138,8 @@ class FileSystem {
   ///  * OK - no errors
   ///  * UNIMPLEMENTED - Some underlying functions (like GetChildren) are not
   ///                    implemented
-  /// The default implementation uses a combination of GetChildren, MatchPath
-  /// and IsDirectory.
   virtual Status GetMatchingPaths(const string& pattern,
-                                  std::vector<string>* results);
+                                  std::vector<string>* results) = 0;
 
   /// \brief Obtains statistics for the given path.
   virtual Status Stat(const string& fname, FileStatistics* stat) = 0;
@@ -306,74 +303,6 @@ class ReadOnlyMemoryRegion {
   virtual uint64 length() = 0;
 };
 
-// START_SKIP_DOXYGEN
-
-#ifndef SWIG
-// Degenerate file system that provides no implementations.
-class NullFileSystem : public FileSystem {
- public:
-  NullFileSystem() {}
-
-  ~NullFileSystem() override = default;
-
-  Status NewRandomAccessFile(
-      const string& fname, std::unique_ptr<RandomAccessFile>* result) override {
-    return errors::Unimplemented("NewRandomAccessFile unimplemented");
-  }
-
-  Status NewWritableFile(const string& fname,
-                         std::unique_ptr<WritableFile>* result) override {
-    return errors::Unimplemented("NewWritableFile unimplemented");
-  }
-
-  Status NewAppendableFile(const string& fname,
-                           std::unique_ptr<WritableFile>* result) override {
-    return errors::Unimplemented("NewAppendableFile unimplemented");
-  }
-
-  Status NewReadOnlyMemoryRegionFromFile(
-      const string& fname,
-      std::unique_ptr<ReadOnlyMemoryRegion>* result) override {
-    return errors::Unimplemented(
-        "NewReadOnlyMemoryRegionFromFile unimplemented");
-  }
-
-  Status FileExists(const string& fname) override {
-    return errors::Unimplemented("FileExists unimplemented");
-  }
-
-  Status GetChildren(const string& dir, std::vector<string>* result) override {
-    return errors::Unimplemented("GetChildren unimplemented");
-  }
-
-  Status DeleteFile(const string& fname) override {
-    return errors::Unimplemented("DeleteFile unimplemented");
-  }
-
-  Status CreateDir(const string& dirname) override {
-    return errors::Unimplemented("CreateDir unimplemented");
-  }
-
-  Status DeleteDir(const string& dirname) override {
-    return errors::Unimplemented("DeleteDir unimplemented");
-  }
-
-  Status GetFileSize(const string& fname, uint64* file_size) override {
-    return errors::Unimplemented("GetFileSize unimplemented");
-  }
-
-  Status RenameFile(const string& src, const string& target) override {
-    return errors::Unimplemented("RenameFile unimplemented");
-  }
-
-  Status Stat(const string& fname, FileStatistics* stat) override {
-    return errors::Unimplemented("Stat unimplemented");
-  }
-};
-#endif
-
-// END_SKIP_DOXYGEN
-
 /// \brief A registry for file system implementations.
 ///
 /// Filenames are specified as an URI, which is of the form
diff --git a/tensorflow/core/platform/file_system_helper.cc b/tensorflow/core/platform/file_system_helper.cc
new file mode 100644
index 0000000000000000000000000000000000000000..22c5057281959fa1584828d927387e8094bfa50a
--- /dev/null
+++ b/tensorflow/core/platform/file_system_helper.cc
@@ -0,0 +1,126 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/platform/file_system_helper.h"
+
+#include <deque>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/file_system.h"
+#include "tensorflow/core/platform/platform.h"
+
+namespace tensorflow {
+namespace internal {
+
+namespace {
+
+constexpr int kNumThreads = 8;
+
+// Run a function in parallel using a ThreadPool, but skip the ThreadPool
+// on the iOS platform due to its problems with more than a few threads.
+void ForEach(int first, int last, const std::function<void(int)>& f) {
+#if TARGET_OS_IPHONE
+  for (int i = first; i < last; i++) {
+    f(i);
+  }
+#else
+  int num_threads = std::min(kNumThreads, last - first);
+  thread::ThreadPool threads(Env::Default(), "ForEach", num_threads);
+  for (int i = first; i < last; i++) {
+    threads.Schedule([f, i] { f(i); });
+  }
+#endif
+}
+
+}  // namespace
+
+Status GetMatchingPaths(FileSystem* fs, Env* env, const string& pattern,
+                        std::vector<string>* results) {
+  results->clear();
+  // Find the fixed prefix by looking for the first wildcard.
+  string fixed_prefix = pattern.substr(0, pattern.find_first_of("*?[\\"));
+  string eval_pattern = pattern;
+  std::vector<string> all_files;
+  string dir = io::Dirname(fixed_prefix).ToString();
+  // If dir is empty then we need to fix up fixed_prefix and eval_pattern to
+  // include . as the top level directory.
+  if (dir.empty()) {
+    dir = ".";
+    fixed_prefix = io::JoinPath(dir, fixed_prefix);
+    eval_pattern = io::JoinPath(dir, pattern);
+  }
+
+  // Setup a BFS to explore everything under dir.
+  std::deque<string> dir_q;
+  dir_q.push_back(dir);
+  Status ret;  // Status to return.
+  // children_dir_status holds is_dir status for children. It can have three
+  // possible values: OK for true; FAILED_PRECONDITION for false; CANCELLED
+  // if we don't calculate IsDirectory (we might do that because there isn't
+  // any point in exploring that child path).
+  std::vector<Status> children_dir_status;
+  while (!dir_q.empty()) {
+    string current_dir = dir_q.front();
+    dir_q.pop_front();
+    std::vector<string> children;
+    Status s = fs->GetChildren(current_dir, &children);
+    ret.Update(s);
+    if (children.empty()) continue;
+    // This IsDirectory call can be expensive for some FS. Parallelizing it.
+    children_dir_status.resize(children.size());
+    ForEach(0, children.size(),
+            [fs, &current_dir, &children, &fixed_prefix,
+             &children_dir_status](int i) {
+              const string child_path = io::JoinPath(current_dir, children[i]);
+              // In case the child_path doesn't start with the fixed_prefix then
+              // we don't need to explore this path.
+              if (!str_util::StartsWith(child_path, fixed_prefix)) {
+                children_dir_status[i] = Status(tensorflow::error::CANCELLED,
+                                                "Operation not needed");
+              } else {
+                children_dir_status[i] = fs->IsDirectory(child_path);
+              }
+            });
+    for (int i = 0; i < children.size(); ++i) {
+      const string child_path = io::JoinPath(current_dir, children[i]);
+      // If the IsDirectory call was cancelled we bail.
+      if (children_dir_status[i].code() == tensorflow::error::CANCELLED) {
+        continue;
+      }
+      // If the child is a directory add it to the queue.
+      if (children_dir_status[i].ok()) {
+        dir_q.push_back(child_path);
+      }
+      all_files.push_back(child_path);
+    }
+  }
+
+  // Match all obtained files to the input pattern.
+  for (const auto& f : all_files) {
+    if (env->MatchPath(f, eval_pattern)) {
+      results->push_back(f);
+    }
+  }
+  return ret;
+}
+
+}  // namespace internal
+}  // namespace tensorflow
diff --git a/tensorflow/core/platform/file_system_helper.h b/tensorflow/core/platform/file_system_helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..8d812b0e38150f9190f69fd279561944d42174c6
--- /dev/null
+++ b/tensorflow/core/platform/file_system_helper.h
@@ -0,0 +1,51 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PLATFORM_FILE_SYSTEM_HELPER_H_
+#define TENSORFLOW_CORE_PLATFORM_FILE_SYSTEM_HELPER_H_
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+class FileSystem;
+class Env;
+
+namespace internal {
+
+// Given a pattern, stores in 'results' the set of paths (in the given file
+// system) that match that pattern.
+//
+// This helper may be used by implementations of FileSystem::GetMatchingPaths()
+// in order to provide parallel scanning of subdirectories (except on iOS).
+//
+// Arguments:
+//   fs: may not be null and will be used to identify directories and list
+//       their contents.
+//   env: may not be null and will be used to check if a match has been found.
+//   pattern: see FileSystem::GetMatchingPaths() for details.
+//   results: will be cleared and may not be null.
+//
+// Returns an error status if any call to 'fs' failed.
+Status GetMatchingPaths(FileSystem* fs, Env* env, const string& pattern,
+                        std::vector<string>* results);
+
+}  // namespace internal
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PLATFORM_FILE_SYSTEM_HELPER_H_
diff --git a/tensorflow/core/platform/file_system_test.cc b/tensorflow/core/platform/file_system_test.cc
index abe88ab6c7e876046db28cd15e804d756fee7066..f261b8f5761506fc5d706c9646c36eef912fc18f 100644
--- a/tensorflow/core/platform/file_system_test.cc
+++ b/tensorflow/core/platform/file_system_test.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/null_file_system.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -123,7 +124,7 @@ class InterPlanetaryFileSystem : public NullFileSystem {
     io::ParseURI(name, &scheme, &host, &path);
     ASSERT_EQ(scheme, "ipfs");
     ASSERT_EQ(host, "solarsystem");
-    path.Consume("/");
+    str_util::ConsumePrefix(&path, "/");
     *parsed_path = path.ToString();
   }
 
@@ -159,7 +160,8 @@ string Match(InterPlanetaryFileSystem* ipfs, const string& suffix_pattern) {
     std::sort(results.begin(), results.end());
     for (const string& result : results) {
       StringPiece trimmed_result(result);
-      EXPECT_TRUE(trimmed_result.Consume(strings::StrCat(kPrefix, "/")));
+      EXPECT_TRUE(str_util::ConsumePrefix(&trimmed_result,
+                                          strings::StrCat(kPrefix, "/")));
       trimmed_results.push_back(trimmed_result);
     }
     return str_util::Join(trimmed_results, ",");
diff --git a/tensorflow/core/platform/hadoop/BUILD b/tensorflow/core/platform/hadoop/BUILD
index 774a439855e49904b29f1e0c3d82196b1b9afb5d..7c38c399bd7a4645b3556e653110c19b8b9ab9ff 100644
--- a/tensorflow/core/platform/hadoop/BUILD
+++ b/tensorflow/core/platform/hadoop/BUILD
@@ -12,18 +12,6 @@ load(
     "tf_cc_test",
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 cc_library(
     name = "hadoop_file_system",
     srcs = ["hadoop_file_system.cc"],
diff --git a/tensorflow/core/platform/hadoop/hadoop_file_system.cc b/tensorflow/core/platform/hadoop/hadoop_file_system.cc
index 74863293a32451e8881c93de468539b913169aaa..9a71fbe2b785be2a47d413962d3996b19e39fecc 100644
--- a/tensorflow/core/platform/hadoop/hadoop_file_system.cc
+++ b/tensorflow/core/platform/hadoop/hadoop_file_system.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/file_system.h"
+#include "tensorflow/core/platform/file_system_helper.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/posix/error.h"
@@ -396,6 +397,11 @@ Status HadoopFileSystem::GetChildren(const string& dir,
   return Status::OK();
 }
 
+Status HadoopFileSystem::GetMatchingPaths(const string& pattern,
+                                          std::vector<string>* results) {
+  return internal::GetMatchingPaths(this, Env::Default(), pattern, results);
+}
+
 Status HadoopFileSystem::DeleteFile(const string& fname) {
   hdfsFS fs = nullptr;
   TF_RETURN_IF_ERROR(Connect(fname, &fs));
diff --git a/tensorflow/core/platform/hadoop/hadoop_file_system.h b/tensorflow/core/platform/hadoop/hadoop_file_system.h
index 5f2b222622cf01033af117f92d49458eeae00e6f..6af7a698ffe91d79d1460a4e335ddd7bf8727a3c 100644
--- a/tensorflow/core/platform/hadoop/hadoop_file_system.h
+++ b/tensorflow/core/platform/hadoop/hadoop_file_system.h
@@ -49,6 +49,9 @@ class HadoopFileSystem : public FileSystem {
 
   Status GetChildren(const string& dir, std::vector<string>* result) override;
 
+  Status GetMatchingPaths(const string& pattern,
+                          std::vector<string>* results) override;
+
   Status DeleteFile(const string& fname) override;
 
   Status CreateDir(const string& name) override;
diff --git a/tensorflow/core/platform/hadoop/hadoop_file_system_test.cc b/tensorflow/core/platform/hadoop/hadoop_file_system_test.cc
index 6ba2f04d0f839cedee9d75d8ed960a50668e541c..b207d3474977361777383299a2a603a9f21481d4 100644
--- a/tensorflow/core/platform/hadoop/hadoop_file_system_test.cc
+++ b/tensorflow/core/platform/hadoop/hadoop_file_system_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/file_system.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -197,7 +198,7 @@ TEST_F(HadoopFileSystemTest, WriteWhileReading) {
   // Skip the test if we're not testing on HDFS. Hadoop's local filesystem
   // implementation makes no guarantees that writable files are readable while
   // being written.
-  if (!StringPiece(fname).starts_with("hdfs://")) {
+  if (!str_util::StartsWith(fname, "hdfs://")) {
     return;
   }
 
diff --git a/tensorflow/core/platform/mem.h b/tensorflow/core/platform/mem.h
index 7bb9fc264fbf6ee3f20e9b2687c9ba52b6171ec4..fca3a2332d15f986d637f7d3a5eb91069dfce1a0 100644
--- a/tensorflow/core/platform/mem.h
+++ b/tensorflow/core/platform/mem.h
@@ -59,7 +59,7 @@ void MallocExtension_ReleaseToSystem(std::size_t num_bytes);
 // routine, this routine returns 0.
 std::size_t MallocExtension_GetAllocatedSize(const void* p);
 
-// Returns the amount of RAM available in kB, or INT64_MAX if unknown.
+// Returns the amount of RAM available in bytes, or INT64_MAX if unknown.
 int64 AvailableRam();
 
 }  // namespace port
diff --git a/tensorflow/core/platform/null_file_system.h b/tensorflow/core/platform/null_file_system.h
new file mode 100644
index 0000000000000000000000000000000000000000..420abc1ada81456e8883e48dd693614f75b81116
--- /dev/null
+++ b/tensorflow/core/platform/null_file_system.h
@@ -0,0 +1,104 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PLATFORM_NULL_FILE_SYSTEM_H_
+#define TENSORFLOW_CORE_PLATFORM_NULL_FILE_SYSTEM_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/file_system.h"
+#include "tensorflow/core/platform/file_system_helper.h"
+
+namespace tensorflow {
+
+// START_SKIP_DOXYGEN
+
+#ifndef SWIG
+// Degenerate file system that provides no implementations.
+class NullFileSystem : public FileSystem {
+ public:
+  NullFileSystem() {}
+
+  ~NullFileSystem() override = default;
+
+  Status NewRandomAccessFile(
+      const string& fname, std::unique_ptr<RandomAccessFile>* result) override {
+    return errors::Unimplemented("NewRandomAccessFile unimplemented");
+  }
+
+  Status NewWritableFile(const string& fname,
+                         std::unique_ptr<WritableFile>* result) override {
+    return errors::Unimplemented("NewWritableFile unimplemented");
+  }
+
+  Status NewAppendableFile(const string& fname,
+                           std::unique_ptr<WritableFile>* result) override {
+    return errors::Unimplemented("NewAppendableFile unimplemented");
+  }
+
+  Status NewReadOnlyMemoryRegionFromFile(
+      const string& fname,
+      std::unique_ptr<ReadOnlyMemoryRegion>* result) override {
+    return errors::Unimplemented(
+        "NewReadOnlyMemoryRegionFromFile unimplemented");
+  }
+
+  Status FileExists(const string& fname) override {
+    return errors::Unimplemented("FileExists unimplemented");
+  }
+
+  Status GetChildren(const string& dir, std::vector<string>* result) override {
+    return errors::Unimplemented("GetChildren unimplemented");
+  }
+
+  Status GetMatchingPaths(const string& pattern,
+                          std::vector<string>* results) override {
+    return internal::GetMatchingPaths(this, Env::Default(), pattern, results);
+  }
+
+  Status DeleteFile(const string& fname) override {
+    return errors::Unimplemented("DeleteFile unimplemented");
+  }
+
+  Status CreateDir(const string& dirname) override {
+    return errors::Unimplemented("CreateDir unimplemented");
+  }
+
+  Status DeleteDir(const string& dirname) override {
+    return errors::Unimplemented("DeleteDir unimplemented");
+  }
+
+  Status GetFileSize(const string& fname, uint64* file_size) override {
+    return errors::Unimplemented("GetFileSize unimplemented");
+  }
+
+  Status RenameFile(const string& src, const string& target) override {
+    return errors::Unimplemented("RenameFile unimplemented");
+  }
+
+  Status Stat(const string& fname, FileStatistics* stat) override {
+    return errors::Unimplemented("Stat unimplemented");
+  }
+};
+#endif
+
+// END_SKIP_DOXYGEN
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PLATFORM_NULL_FILE_SYSTEM_H_
diff --git a/tensorflow/core/platform/posix/env.cc b/tensorflow/core/platform/posix/env.cc
index 8097624e09f81364071895ad114f26f93f4aab14..418874d3406200566cdd9a4c6141852b948413ff 100644
--- a/tensorflow/core/platform/posix/env.cc
+++ b/tensorflow/core/platform/posix/env.cc
@@ -118,6 +118,9 @@ class PosixEnv : public Env {
                                const string& version) override {
     return tensorflow::internal::FormatLibraryFileName(name, version);
   }
+
+ private:
+  void GetLocalTempDirectories(std::vector<string>* list) override;
 };
 
 }  // namespace
@@ -131,7 +134,7 @@ Env* Env::Default() {
 }
 #endif
 
-void Env::GetLocalTempDirectories(std::vector<string>* list) {
+void PosixEnv::GetLocalTempDirectories(std::vector<string>* list) {
   list->clear();
   // Directories, in order of preference. If we find a dir that
   // exists, we stop adding other less-preferred dirs
diff --git a/tensorflow/core/platform/posix/port.cc b/tensorflow/core/platform/posix/port.cc
index 494acde803a778fb839a7444e4d5ac2fd094eb09..8e316472fe2ea6f7c3187f0a5f98052c20f5ce6b 100644
--- a/tensorflow/core/platform/posix/port.cc
+++ b/tensorflow/core/platform/posix/port.cc
@@ -177,7 +177,7 @@ int64 AvailableRam() {
   struct sysinfo info;
   int err = sysinfo(&info);
   if (err == 0) {
-    return info.freeram / 1024;
+    return info.freeram;
   }
 #endif
   return INT64_MAX;
diff --git a/tensorflow/core/platform/posix/posix_file_system.cc b/tensorflow/core/platform/posix/posix_file_system.cc
index 9a8021565cbcc2a172a23439d2a7139108c0df39..47bfa020cef991e6a2e9e9de283318b287788454 100644
--- a/tensorflow/core/platform/posix/posix_file_system.cc
+++ b/tensorflow/core/platform/posix/posix_file_system.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/file_system_helper.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/posix/error.h"
 #include "tensorflow/core/platform/posix/posix_file_system.h"
@@ -225,6 +226,11 @@ Status PosixFileSystem::GetChildren(const string& dir,
   return Status::OK();
 }
 
+Status PosixFileSystem::GetMatchingPaths(const string& pattern,
+                                         std::vector<string>* results) {
+  return internal::GetMatchingPaths(this, Env::Default(), pattern, results);
+}
+
 Status PosixFileSystem::DeleteFile(const string& fname) {
   Status result;
   if (unlink(TranslateName(fname).c_str()) != 0) {
diff --git a/tensorflow/core/platform/posix/posix_file_system.h b/tensorflow/core/platform/posix/posix_file_system.h
index 98ffa43b8acf8a10a4ace1bf11cc7d6f5e8a95a7..e8898d0a97f50e29d1216bf2d9d340711cb29754 100644
--- a/tensorflow/core/platform/posix/posix_file_system.h
+++ b/tensorflow/core/platform/posix/posix_file_system.h
@@ -47,6 +47,9 @@ class PosixFileSystem : public FileSystem {
 
   Status Stat(const string& fname, FileStatistics* stats) override;
 
+  Status GetMatchingPaths(const string& pattern,
+                          std::vector<string>* results) override;
+
   Status DeleteFile(const string& fname) override;
 
   Status CreateDir(const string& name) override;
diff --git a/tensorflow/core/platform/posix/subprocess.cc b/tensorflow/core/platform/posix/subprocess.cc
index cefc66831a9b9fe11a170013e64ff7a1cd2e2bcd..a661c34ef01b1afba05dedf3d7c6c2aef86245fa 100644
--- a/tensorflow/core/platform/posix/subprocess.cc
+++ b/tensorflow/core/platform/posix/subprocess.cc
@@ -20,6 +20,8 @@ limitations under the License.
 #include <string.h>
 #include <sys/types.h>
 #include <sys/wait.h>
+#include <memory>
+#include <vector>
 
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/subprocess.h"
@@ -461,4 +463,12 @@ int SubProcess::Communicate(const string* stdin_input, string* stdout_output,
   return WaitInternal(&status) ? status : -1;
 }
 
+std::unique_ptr<SubProcess> CreateSubProcess(const std::vector<string>& argv) {
+  std::unique_ptr<SubProcess> proc(new SubProcess());
+  proc->SetProgram(argv[0], argv);
+  proc->SetChannelAction(CHAN_STDERR, ACTION_DUPPARENT);
+  proc->SetChannelAction(CHAN_STDOUT, ACTION_DUPPARENT);
+  return proc;
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/posix/test.cc b/tensorflow/core/platform/posix/test.cc
index a69127b3e88834458503012ad4d8d9334bba247a..28f7478a6d5a371079acef135eaab69ead1ebf4b 100644
--- a/tensorflow/core/platform/posix/test.cc
+++ b/tensorflow/core/platform/posix/test.cc
@@ -20,19 +20,10 @@ limitations under the License.
 
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/subprocess.h"
 
 namespace tensorflow {
 namespace testing {
 
-std::unique_ptr<SubProcess> CreateSubProcess(const std::vector<string>& argv) {
-  std::unique_ptr<SubProcess> proc(new SubProcess());
-  proc->SetProgram(argv[0], argv);
-  proc->SetChannelAction(CHAN_STDERR, ACTION_DUPPARENT);
-  proc->SetChannelAction(CHAN_STDOUT, ACTION_DUPPARENT);
-  return proc;
-}
-
 int PickUnusedPortOrDie() { return internal::PickUnusedPortOrDie(); }
 
 string TensorFlowSrcRoot() {
diff --git a/tensorflow/core/platform/s3/BUILD b/tensorflow/core/platform/s3/BUILD
index 3a0ad2e9bd09211aa452f8b39b621343a113785d..21038cfeb15be052f7460151bacaa15544c8d77c 100644
--- a/tensorflow/core/platform/s3/BUILD
+++ b/tensorflow/core/platform/s3/BUILD
@@ -13,18 +13,6 @@ load(
     "tf_cc_test",
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tf_cc_binary(
     name = "s3_file_system.so",
     srcs = [
diff --git a/tensorflow/core/platform/s3/s3_file_system.cc b/tensorflow/core/platform/s3/s3_file_system.cc
index 301fcb9dbf653d29f6ac5321332c8764adaad681..ee423699b2f15e973326358aa38776a71951edb3 100644
--- a/tensorflow/core/platform/s3/s3_file_system.cc
+++ b/tensorflow/core/platform/s3/s3_file_system.cc
@@ -15,6 +15,7 @@ limitations under the License.
 #include "tensorflow/core/platform/s3/s3_file_system.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/file_system_helper.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/s3/aws_logging.h"
 #include "tensorflow/core/platform/s3/s3_crypto.h"
@@ -497,6 +498,11 @@ Status S3FileSystem::Stat(const string& fname, FileStatistics* stats) {
   return Status::OK();
 }
 
+Status S3FileSystem::GetMatchingPaths(const string& pattern,
+                                      std::vector<string>* results) {
+  return internal::GetMatchingPaths(this, Env::Default(), pattern, results);
+}
+
 Status S3FileSystem::DeleteFile(const string& fname) {
   string bucket, object;
   TF_RETURN_IF_ERROR(ParseS3Path(fname, false, &bucket, &object));
diff --git a/tensorflow/core/platform/s3/s3_file_system.h b/tensorflow/core/platform/s3/s3_file_system.h
index 31264be621d93c1efb68f7b0b49e28cb65b05de1..5d0565b378198a39f80940c0627f7638e92691fa 100644
--- a/tensorflow/core/platform/s3/s3_file_system.h
+++ b/tensorflow/core/platform/s3/s3_file_system.h
@@ -46,6 +46,9 @@ class S3FileSystem : public FileSystem {
 
   Status Stat(const string& fname, FileStatistics* stat) override;
 
+  Status GetMatchingPaths(const string& pattern,
+                          std::vector<string>* results) override;
+
   Status DeleteFile(const string& fname) override;
 
   Status CreateDir(const string& name) override;
diff --git a/tensorflow/core/platform/subprocess.h b/tensorflow/core/platform/subprocess.h
index dfdcf82173b774dee119f1fe9e84818e45d7b50c..dcc0c1a4ee33ff47beefa6c3f82c6954770e7036 100644
--- a/tensorflow/core/platform/subprocess.h
+++ b/tensorflow/core/platform/subprocess.h
@@ -16,6 +16,9 @@ limitations under the License.
 #ifndef TENSORFLOW_PLATFORM_SUBPROCESS_H_
 #define TENSORFLOW_PLATFORM_SUBPROCESS_H_
 
+#include <memory>
+#include <vector>
+
 namespace tensorflow {
 
 // Channel identifiers.
@@ -43,6 +46,12 @@ enum ChannelAction {
 // Supports spawning and killing child processes.
 class SubProcess;
 
+// Returns an object that represents a child process that will be
+// launched with the given command-line arguments `argv`. The process
+// must be explicitly started by calling the Start() method on the
+// returned object.
+std::unique_ptr<SubProcess> CreateSubProcess(const std::vector<string>& argv);
+
 }  // namespace tensorflow
 
 #include "tensorflow/core/platform/platform.h"
diff --git a/tensorflow/core/platform/test.h b/tensorflow/core/platform/test.h
index 295957c3d801cc959eeb3e60dd2a74587ee14197..99bae63edf8ae26fb51acde12dc1a4f8bcaf778c 100644
--- a/tensorflow/core/platform/test.h
+++ b/tensorflow/core/platform/test.h
@@ -21,7 +21,6 @@ limitations under the License.
 
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/platform.h"
-#include "tensorflow/core/platform/subprocess.h"
 #include "tensorflow/core/platform/types.h"
 
 // As of September 2016, we continue to attempt to avoid the use of gmock aka
@@ -49,12 +48,6 @@ string TensorFlowSrcRoot();
 // Returns the same value for the lifetime of the process.
 int RandomSeed();
 
-// Returns an object that represents a child process that will be
-// launched with the given command-line arguments `argv`. The process
-// must be explicitly started by calling the Start() method on the
-// returned object.
-std::unique_ptr<SubProcess> CreateSubProcess(const std::vector<string>& argv);
-
 // Returns an unused port number, for use in multi-process testing.
 // NOTE: This function is not thread-safe.
 int PickUnusedPortOrDie();
diff --git a/tensorflow/core/platform/tracing.h b/tensorflow/core/platform/tracing.h
index eebbeaeba602fe277ee76396504fce6f70b7e771..3c6e7b0db5995136a7cff468288cca0bd32c2349 100644
--- a/tensorflow/core/platform/tracing.h
+++ b/tensorflow/core/platform/tracing.h
@@ -103,6 +103,8 @@ class Tracing {
   friend class ScopedAnnotation;
   friend class TraceMe;
 
+  // TODO: TF_EXPORT is for building //tensorflow/contrib/data:_dataset_ops.so
+  //       on Windows. Figure out a way to remove TF_EXPORT here.
   TF_EXPORT static std::atomic<Tracing::Engine*> tracing_engine_;
   static Tracing::Engine* engine() {
     return tracing_engine_.load(std::memory_order_acquire);
diff --git a/tensorflow/core/platform/types.h b/tensorflow/core/platform/types.h
index 38d75dbb3265bc77e800b8d6295e3dd73072b58f..6308e588470d75c2236113f7e19a27241f2f9224 100644
--- a/tensorflow/core/platform/types.h
+++ b/tensorflow/core/platform/types.h
@@ -31,10 +31,6 @@ limitations under the License.
 #error Define the appropriate PLATFORM_<foo> macro for this platform
 #endif
 
-#if defined(PLATFORM_WINDOWS)
-#include "tensorflow/core/platform/windows/cpu_info.h"
-#endif
-
 namespace tensorflow {
 
 // Define tensorflow::string to refer to appropriate platform specific type.
diff --git a/tensorflow/core/platform/windows/env.cc b/tensorflow/core/platform/windows/env.cc
index 41b264417071cadb5f70806b458ee2b46ebb2feb..2f54f423b2ee7d8842f2edab7f6bf29877fac173 100644
--- a/tensorflow/core/platform/windows/env.cc
+++ b/tensorflow/core/platform/windows/env.cc
@@ -160,6 +160,8 @@ class WindowsEnv : public Env {
   }
 
  private:
+  void GetLocalTempDirectories(std::vector<string>* list) override;
+
   typedef VOID(WINAPI* FnGetSystemTimePreciseAsFileTime)(LPFILETIME);
   FnGetSystemTimePreciseAsFileTime GetSystemTimePreciseAsFileTime_;
 };
@@ -174,7 +176,7 @@ Env* Env::Default() {
   return default_env;
 }
 
-void Env::GetLocalTempDirectories(std::vector<string>* list) {
+void WindowsEnv::GetLocalTempDirectories(std::vector<string>* list) {
   list->clear();
   // On windows we'll try to find a directory in this order:
   //   C:/Documents & Settings/whomever/TEMP (or whatever GetTempPath() is)
diff --git a/tensorflow/core/platform/windows/port.cc b/tensorflow/core/platform/windows/port.cc
index f3b27ea394d04770b612752328d5d571e6521cc6..174f41a993f8010112f316dc9ba220f6ecc2804e 100644
--- a/tensorflow/core/platform/windows/port.cc
+++ b/tensorflow/core/platform/windows/port.cc
@@ -166,7 +166,7 @@ int64 AvailableRam() {
   MEMORYSTATUSEX statex;
   statex.dwLength = sizeof(statex);
   if (GlobalMemoryStatusEx(&statex)) {
-    return statex.ullAvailPhys / 1024;
+    return statex.ullAvailPhys;
   }
   return INT64_MAX;
 }
diff --git a/tensorflow/core/platform/windows/subprocess.h b/tensorflow/core/platform/windows/subprocess.h
index 66ec44885d52195b807f4957aec6d590324b2975..f00471d484014d431665dbf0cb0d38ea82a14435 100644
--- a/tensorflow/core/platform/windows/subprocess.h
+++ b/tensorflow/core/platform/windows/subprocess.h
@@ -16,11 +16,21 @@ limitations under the License.
 #ifndef TENSORFLOW_PLATFORM_WINDOWS_SUBPROCESS_H_
 #define TENSORFLOW_PLATFORM_WINDOWS_SUBPROCESS_H_
 
+#include <memory>
+#include <vector>
+
+#include "tensorflow/core/platform/logging.h"
+
 namespace tensorflow {
 
 // SubProcess is not yet implemented for Windows.
 class SubProcess {};
 
+std::unique_ptr<SubProcess> CreateSubProcess(const std::vector<string>& argv) {
+  LOG(FATAL) << "CreateSubProcess NOT IMPLEMENTED for Windows yet ! ";
+  return nullptr;
+}
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_PLATFORM_WINDOWS_SUBPROCESS_H_
diff --git a/tensorflow/core/platform/windows/test.cc b/tensorflow/core/platform/windows/test.cc
index 584acad91b24fc6be9b93f71b7d44b0fba3cb2e8..ad2b7bc6ff6e6037a922352d34b628b2138d0712 100644
--- a/tensorflow/core/platform/windows/test.cc
+++ b/tensorflow/core/platform/windows/test.cc
@@ -22,11 +22,6 @@ limitations under the License.
 namespace tensorflow {
 namespace testing {
 
-std::unique_ptr<SubProcess> CreateSubProcess(const std::vector<string>& argv) {
-  LOG(FATAL) << "CreateSubProcess NOT IMPLEMENTED for Windows yet ! ";
-  return nullptr;
-}
-
 int PickUnusedPortOrDie() { return internal::PickUnusedPortOrDie(); }
 
 string TensorFlowSrcRoot() {
diff --git a/tensorflow/core/platform/windows/windows_file_system.cc b/tensorflow/core/platform/windows/windows_file_system.cc
index 682e46e0fcd0322ed34fa94d0ee5516cf9194a3b..dc2efbeaf5e3eabc6077df4c1c126762d36ba8a6 100644
--- a/tensorflow/core/platform/windows/windows_file_system.cc
+++ b/tensorflow/core/platform/windows/windows_file_system.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/file_system_helper.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/posix/error.h"
 #include "tensorflow/core/platform/windows/error.h"
@@ -494,7 +495,8 @@ Status WindowsFileSystem::GetMatchingPaths(const string& pattern,
   // but no code appears to rely on this behavior.
   string converted_pattern(pattern);
   std::replace(converted_pattern.begin(), converted_pattern.end(), '\\', '/');
-  TF_RETURN_IF_ERROR(FileSystem::GetMatchingPaths(converted_pattern, results));
+  TF_RETURN_IF_ERROR(internal::GetMatchingPaths(this, Env::Default(),
+                                                converted_pattern, results));
   for (string& result : *results) {
     std::replace(result.begin(), result.end(), '/', '\\');
   }
diff --git a/tensorflow/core/profiler/BUILD b/tensorflow/core/profiler/BUILD
index 5ce6f1046d3a812039106520d4883622c4df485b..3d3203cdaa80fa4b59063e256146dd47a9303279 100644
--- a/tensorflow/core/profiler/BUILD
+++ b/tensorflow/core/profiler/BUILD
@@ -4,21 +4,6 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
-# -----------------------------------------------------------------------------
-# Google-internal targets.  These must be at the end for syncrepo.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_additional_all_protos")
diff --git a/tensorflow/core/profiler/internal/BUILD b/tensorflow/core/profiler/internal/BUILD
index 05a798bff80a0775e5170bf8f428d9e88d8060b3..8dcfde9a2adbd3a1774bce8506a84f80ca099c34 100644
--- a/tensorflow/core/profiler/internal/BUILD
+++ b/tensorflow/core/profiler/internal/BUILD
@@ -365,17 +365,3 @@ cc_library(
         "//tensorflow/core:regexp_internal",
     ],
 )
-# -----------------------------------------------------------------------------
-# Google-internal targets.  These must be at the end for syncrepo.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/core/profiler/internal/advisor/BUILD b/tensorflow/core/profiler/internal/advisor/BUILD
index 40cfd1e12e609de0f70b12b2cf98ef1086b4d024..1fedb05ae319176886cb0ff0409ea6685df76a4c 100644
--- a/tensorflow/core/profiler/internal/advisor/BUILD
+++ b/tensorflow/core/profiler/internal/advisor/BUILD
@@ -73,18 +73,3 @@ tf_cc_test(
         "//tensorflow/core/profiler/internal:tfprof_tf_testlib",
     ],
 )
-
-# -----------------------------------------------------------------------------
-# Google-internal targets.  These must be at the end for syncrepo.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc b/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc
index e968b9c97e28eeae22954102d5f0e07e09d75f7f..96b6cc30bd9b3d603eb585a05023f36fe7b816b7 100644
--- a/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc
+++ b/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/profiler/internal/advisor/tfprof_advisor.h"
 
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -82,8 +83,8 @@ TEST_F(TFProfAdvisorTest, OperationChecker) {
   (*options.mutable_checkers())[kCheckers[1]];
   AdviceProto advice = advisor_->Advise(options);
   EXPECT_EQ(advice.checkers().at(kCheckers[1]).reports_size(), 1);
-  EXPECT_TRUE(StringPiece(advice.checkers().at(kCheckers[1]).reports(0))
-                  .contains("NCHW"));
+  EXPECT_TRUE(str_util::StrContains(
+      advice.checkers().at(kCheckers[1]).reports(0), "NCHW"));
 }
 
 TEST_F(TFProfAdvisorTest, UtilizationChecker) {
@@ -91,16 +92,17 @@ TEST_F(TFProfAdvisorTest, UtilizationChecker) {
   (*options.mutable_checkers())[kCheckers[0]];
   AdviceProto advice = advisor_->Advise(options);
   EXPECT_EQ(advice.checkers().at(kCheckers[0]).reports_size(), 1);
-  EXPECT_TRUE(StringPiece(advice.checkers().at(kCheckers[0]).reports(0))
-                  .contains("low utilization"));
+  EXPECT_TRUE(str_util::StrContains(
+      advice.checkers().at(kCheckers[0]).reports(0), "low utilization"));
 }
 
 TEST_F(TFProfAdvisorTest, ExpensiveOperationChecker) {
   AdvisorOptionsProto options;
   (*options.mutable_checkers())[kCheckers[2]];
   AdviceProto advice = advisor_->Advise(options);
-  EXPECT_TRUE(StringPiece(advice.checkers().at(kCheckers[2]).reports(0))
-                  .contains("top 1 operation type: Conv2D"));
+  EXPECT_TRUE(
+      str_util::StrContains(advice.checkers().at(kCheckers[2]).reports(0),
+                            "top 1 operation type: Conv2D"));
 }
 
 }  // namespace tfprof
diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto
index abbbe392aa3536aa887f678262285caa88054eed..a3557e4721644dd2577e7b56077a4e7ef8030463 100644
--- a/tensorflow/core/protobuf/config.proto
+++ b/tensorflow/core/protobuf/config.proto
@@ -67,9 +67,7 @@ message GPUOptions {
   // set or set to 0, gets set to a non-zero default.
   int32 polling_active_delay_usecs = 6;
 
-  // In the event polling loop sleep this many millisconds between
-  // PollEvents calls, when the queue is empty.  If value is not
-  // set or set to 0, gets set to a non-zero default.
+  // This field is deprecated and ignored.
   int32 polling_inactive_delay_msecs = 7;
 
   // Force all tensors to be gpu_compatible. On a GPU-enabled TensorFlow,
diff --git a/tensorflow/core/protobuf/rewriter_config.proto b/tensorflow/core/protobuf/rewriter_config.proto
index b1fceaacf4488a0c3894b2d65e22ebeafef05b2f..9b6202e7b494a0881788375e66698269e33505d8 100644
--- a/tensorflow/core/protobuf/rewriter_config.proto
+++ b/tensorflow/core/protobuf/rewriter_config.proto
@@ -29,6 +29,14 @@ message RewriterConfig {
     AGGRESSIVE = 3;
   }
 
+  // Enum controling the number of times to run optimizers. The default is to
+  // run them once.
+  enum NumIterationsType {
+    DEFAULT_NUM_ITERS = 0;
+    ONE = 1;
+    TWO = 2;
+  }
+
   // Optimize tensor layouts (default is ON)
   // e.g. This will try to use NCHW layout on GPU which is faster.
   Toggle layout_optimizer = 1;
@@ -42,13 +50,19 @@ message RewriterConfig {
   // Control dependency optimizations (default is ON).
   // Remove redundant control dependencies, which may enable other optimization.
   Toggle dependency_optimization = 8;
-  // Loop optimizations (default is OFF).
+  // Loop optimizations (default is ON).
   Toggle loop_optimization = 9;
   // Function optimizations (default is ON).
   Toggle function_optimization = 10;
+  // Strips debug-related nodes from the graph (off by default).
+  Toggle debug_stripper = 11;
   // If true, don't remove unnecessary ops from the graph
   bool disable_model_pruning = 2;
 
+  // Controls how many times we run the optimizers in meta optimizer (default
+  // is once).
+  NumIterationsType meta_optimizer_iterations = 12;
+
   enum MemOptType {
     // The default setting (SCHEDULING and SWAPPING HEURISTICS only)
     DEFAULT_MEM_OPT = 0;
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 22f2c02b78b0b0cf1f8ea2fd6b9c804c3b3a9f80..706968d34745b8d21653bcee762f8a37555b93c1 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -19,7 +19,7 @@ limitations under the License.
 // TensorFlow uses semantic versioning, see http://semver.org/.
 
 #define TF_MAJOR_VERSION 1
-#define TF_MINOR_VERSION 6
+#define TF_MINOR_VERSION 7
 #define TF_PATCH_VERSION 0
 
 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
diff --git a/tensorflow/core/util/command_line_flags.cc b/tensorflow/core/util/command_line_flags.cc
index 3efc703faf7b23958eb49d59fd0dd4565f090bbe..480ce94fcaeddd62c30089d09752cc4d965ebf01 100644
--- a/tensorflow/core/util/command_line_flags.cc
+++ b/tensorflow/core/util/command_line_flags.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/util/command_line_flags.h"
@@ -28,7 +29,9 @@ bool ParseStringFlag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
                      const std::function<bool(string)>& hook,
                      bool* value_parsing_ok) {
   *value_parsing_ok = true;
-  if (arg.Consume("--") && arg.Consume(flag) && arg.Consume("=")) {
+  if (str_util::ConsumePrefix(&arg, "--") &&
+      str_util::ConsumePrefix(&arg, flag) &&
+      str_util::ConsumePrefix(&arg, "=")) {
     *value_parsing_ok = hook(arg.ToString());
     return true;
   }
@@ -40,7 +43,9 @@ bool ParseInt32Flag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
                     const std::function<bool(int32)>& hook,
                     bool* value_parsing_ok) {
   *value_parsing_ok = true;
-  if (arg.Consume("--") && arg.Consume(flag) && arg.Consume("=")) {
+  if (str_util::ConsumePrefix(&arg, "--") &&
+      str_util::ConsumePrefix(&arg, flag) &&
+      str_util::ConsumePrefix(&arg, "=")) {
     char extra;
     int32 parsed_int32;
     if (sscanf(arg.data(), "%d%c", &parsed_int32, &extra) != 1) {
@@ -60,7 +65,9 @@ bool ParseInt64Flag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
                     const std::function<bool(int64)>& hook,
                     bool* value_parsing_ok) {
   *value_parsing_ok = true;
-  if (arg.Consume("--") && arg.Consume(flag) && arg.Consume("=")) {
+  if (str_util::ConsumePrefix(&arg, "--") &&
+      str_util::ConsumePrefix(&arg, flag) &&
+      str_util::ConsumePrefix(&arg, "=")) {
     char extra;
     int64 parsed_int64;
     if (sscanf(arg.data(), "%lld%c", &parsed_int64, &extra) != 1) {
@@ -80,7 +87,8 @@ bool ParseBoolFlag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
                    const std::function<bool(bool)>& hook,
                    bool* value_parsing_ok) {
   *value_parsing_ok = true;
-  if (arg.Consume("--") && arg.Consume(flag)) {
+  if (str_util::ConsumePrefix(&arg, "--") &&
+      str_util::ConsumePrefix(&arg, flag)) {
     if (arg.empty()) {
       *value_parsing_ok = hook(true);
       return true;
@@ -107,7 +115,9 @@ bool ParseFloatFlag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
                     const std::function<bool(float)>& hook,
                     bool* value_parsing_ok) {
   *value_parsing_ok = true;
-  if (arg.Consume("--") && arg.Consume(flag) && arg.Consume("=")) {
+  if (str_util::ConsumePrefix(&arg, "--") &&
+      str_util::ConsumePrefix(&arg, flag) &&
+      str_util::ConsumePrefix(&arg, "=")) {
     char extra;
     float parsed_float;
     if (sscanf(arg.data(), "%f%c", &parsed_float, &extra) != 1) {
diff --git a/tensorflow/core/util/ctc/BUILD b/tensorflow/core/util/ctc/BUILD
index 1521349e4ddf064ce55726a9c6ca400ae6342c15..317420204e20ab2994ca9b7b7f4cc39e688e728f 100644
--- a/tensorflow/core/util/ctc/BUILD
+++ b/tensorflow/core/util/ctc/BUILD
@@ -26,18 +26,6 @@ alias(
     actual = ":mobile_srcs",
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 cc_library(
     name = "ctc",
     deps = [
diff --git a/tensorflow/core/util/device_name_utils_test.cc b/tensorflow/core/util/device_name_utils_test.cc
index c1bc0f33785ef1576bf0f0d3db71e8daa1a51801..ff9c108f10cdbfa6f1ca3bb966d42e32fb223c74 100644
--- a/tensorflow/core/util/device_name_utils_test.cc
+++ b/tensorflow/core/util/device_name_utils_test.cc
@@ -408,7 +408,7 @@ static void MergeDevNamesError(const string& name_a, const string& name_b,
   DeviceNameUtils::ParsedName target_a = Name(name_a);
   Status s = DeviceNameUtils::MergeDevNames(&target_a, Name(name_b));
   EXPECT_EQ(s.code(), error::INVALID_ARGUMENT);
-  EXPECT_TRUE(StringPiece(s.error_message()).contains(expected_error_substr))
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), expected_error_substr))
       << s;
 }
 
diff --git a/tensorflow/core/util/equal_graph_def.cc b/tensorflow/core/util/equal_graph_def.cc
index f1ec497a6772c84d599a76169515ef417c11f430..b87dce0dff536733397bff946c12e992a8097666 100644
--- a/tensorflow/core/util/equal_graph_def.cc
+++ b/tensorflow/core/util/equal_graph_def.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/lib/hash/hash.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/protobuf.h"
 
@@ -144,7 +145,7 @@ bool EqualNodeDef(const NodeDef& actual, const NodeDef& expected, string* diff,
 
   int first_control_input = actual.input_size();
   for (int i = 0; i < actual.input_size(); ++i) {
-    if (StringPiece(actual.input(i)).starts_with("^")) {
+    if (str_util::StartsWith(actual.input(i), "^")) {
       first_control_input = i;
       break;
     }
@@ -240,7 +241,7 @@ uint64 NodeDefHash(const NodeDef& ndef, const EqualGraphDefOptions& options) {
   // Normal inputs. Order important.
   int first_control_input = ndef.input_size();
   for (int i = 0; i < ndef.input_size(); ++i) {
-    if (StringPiece(ndef.input(i)).starts_with("^")) {
+    if (str_util::StartsWith(ndef.input(i), "^")) {
       first_control_input = i;
       break;
     }
diff --git a/tensorflow/core/util/memmapped_file_system.cc b/tensorflow/core/util/memmapped_file_system.cc
index a0f43d2d4a745722d2095b6817c9156415c78127..1fa6b8bec037c3ee0d2b9b95f2ccce59813c98b9 100644
--- a/tensorflow/core/util/memmapped_file_system.cc
+++ b/tensorflow/core/util/memmapped_file_system.cc
@@ -15,6 +15,7 @@ limitations under the License.
 #include "tensorflow/core/util/memmapped_file_system.h"
 
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/util/memmapped_file_system.pb.h"
 
@@ -157,6 +158,12 @@ Status MemmappedFileSystem::GetChildren(const string& filename,
   return errors::Unimplemented("memmapped format doesn't support GetChildren");
 }
 
+Status MemmappedFileSystem::GetMatchingPaths(const string& pattern,
+                                             std::vector<string>* results) {
+  return errors::Unimplemented(
+      "memmapped format doesn't support GetMatchingPaths");
+}
+
 Status MemmappedFileSystem::DeleteFile(const string& filename) {
   return errors::Unimplemented("memmapped format doesn't support DeleteFile");
 }
@@ -236,7 +243,7 @@ Status MemmappedFileSystem::InitializeFromFile(Env* env,
 }
 
 bool MemmappedFileSystem::IsMemmappedPackageFilename(const string& filename) {
-  return StringPiece(filename).starts_with(kMemmappedPackagePrefix);
+  return str_util::StartsWith(filename, kMemmappedPackagePrefix);
 }
 
 namespace {
diff --git a/tensorflow/core/util/memmapped_file_system.h b/tensorflow/core/util/memmapped_file_system.h
index 541587aeab05242f0c71beb139fe74c768b810b0..76cc4911f5e067253455d4d4eb86189e7d6e9de8 100644
--- a/tensorflow/core/util/memmapped_file_system.h
+++ b/tensorflow/core/util/memmapped_file_system.h
@@ -85,6 +85,8 @@ class MemmappedFileSystem : public FileSystem {
   Status NewAppendableFile(const string& fname,
                            std::unique_ptr<WritableFile>* result) override;
   Status GetChildren(const string& dir, std::vector<string>* r) override;
+  Status GetMatchingPaths(const string& pattern,
+                          std::vector<string>* results) override;
   Status DeleteFile(const string& f) override;
   Status CreateDir(const string& d) override;
   Status DeleteDir(const string& d) override;
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index 34db96075d45f690cffad44bcc08cdf17d6e68dc..9f58e40d94c6f50694583bc82057790040115de1 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -1579,10 +1579,10 @@ class MklDnnData {
   }
 
   /// Set function for data buffer of user memory primitive.
-  inline void* SetUsrMemDataHandle(void* data_buffer) {
+  inline void SetUsrMemDataHandle(void* data_buffer) {
     CHECK_NOTNULL(user_memory_);
     CHECK_NOTNULL(data_buffer);
-    return user_memory_->set_data_handle(data_buffer);
+    user_memory_->set_data_handle(data_buffer);
   }
 
   /// Set function for data buffer of user memory primitive.
diff --git a/tensorflow/core/util/reporter_test.cc b/tensorflow/core/util/reporter_test.cc
index 575c27d4ef72ec33c4b9352de59fc806b12d6385..90ea09876e85468fbc05a1baa79b29a7a42ebace 100644
--- a/tensorflow/core/util/reporter_test.cc
+++ b/tensorflow/core/util/reporter_test.cc
@@ -29,7 +29,7 @@ namespace {
 
 // Tests of all the error paths in log_reader.cc follow:
 static void ExpectHasSubstr(StringPiece s, StringPiece expected) {
-  EXPECT_TRUE(StringPiece(s).contains(expected))
+  EXPECT_TRUE(str_util::StrContains(s, expected))
       << s << " does not contain " << expected;
 }
 
diff --git a/tensorflow/core/util/tensor_bundle/BUILD b/tensorflow/core/util/tensor_bundle/BUILD
index 166bd0f659dae3124faac6d71d69cbcd41c15b48..648358606c130b7ed64a56739be0b37884f585d5 100644
--- a/tensorflow/core/util/tensor_bundle/BUILD
+++ b/tensorflow/core/util/tensor_bundle/BUILD
@@ -75,18 +75,3 @@ tf_cc_test(
         "//tensorflow/core:test_main",
     ],
 )
-
-# -----------------------------------------------------------------------------
-# Google-internal targets.  These must be at the end for syncrepo.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/core/util/tensor_bundle/tensor_bundle_test.cc b/tensorflow/core/util/tensor_bundle/tensor_bundle_test.cc
index 08f1aa7125bc47421e0db24a9db6f6e2b2f1e365..7f166f0ec0aeee78738648060ef7782827918cd8 100644
--- a/tensorflow/core/util/tensor_bundle/tensor_bundle_test.cc
+++ b/tensorflow/core/util/tensor_bundle/tensor_bundle_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/io/table_builder.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
@@ -293,7 +294,7 @@ void VersionTest(const VersionDef& version, StringPiece expected_error) {
   BundleReader reader(Env::Default(), path);
   EXPECT_TRUE(errors::IsInvalidArgument(reader.status()));
   EXPECT_TRUE(
-      StringPiece(reader.status().error_message()).starts_with(expected_error));
+      str_util::StartsWith(reader.status().error_message(), expected_error));
 }
 
 }  // namespace
@@ -588,7 +589,7 @@ TEST(TensorBundleTest, Error) {
     TF_EXPECT_OK(writer.Add("foo", Constant_2x3(1.f)));
     EXPECT_FALSE(writer.Add("foo", Constant_2x3(2.f)).ok());
     EXPECT_TRUE(
-        StringPiece(writer.status().ToString()).contains("duplicate key"));
+        str_util::StrContains(writer.status().ToString(), "duplicate key"));
     EXPECT_FALSE(writer.Finish().ok());
   }
   {  // Double finish
@@ -598,7 +599,7 @@ TEST(TensorBundleTest, Error) {
   }
   {  // Not found.
     BundleReader reader(Env::Default(), Prefix("nonexist"));
-    EXPECT_TRUE(StringPiece(reader.status().ToString()).contains("Not found"));
+    EXPECT_TRUE(str_util::StrContains(reader.status().ToString(), "Not found"));
   }
 }
 
@@ -629,7 +630,7 @@ TEST(TensorBundleTest, Checksum) {
     BundleReader reader(Env::Default(), Prefix(prefix));
     Status status = reader.Lookup(key, &val);
     EXPECT_TRUE(errors::IsDataLoss(status));
-    EXPECT_TRUE(StringPiece(status.ToString()).contains(expected_msg));
+    EXPECT_TRUE(str_util::StrContains(status.ToString(), expected_msg));
   };
 
   // Corrupts a float tensor.
@@ -680,8 +681,8 @@ TEST(TensorBundleTest, Endianness) {
 
   BundleReader reader(Env::Default(), Prefix("end"));
   EXPECT_TRUE(errors::IsUnimplemented(reader.status()));
-  EXPECT_TRUE(StringPiece(reader.status().ToString())
-                  .contains("different endianness from the reader"));
+  EXPECT_TRUE(str_util::StrContains(reader.status().ToString(),
+                                    "different endianness from the reader"));
 }
 
 TEST(TensorBundleTest, TruncatedTensorContents) {
diff --git a/tensorflow/core/util/tensor_slice_reader_test.cc b/tensorflow/core/util/tensor_slice_reader_test.cc
index 010cc36823b739a6209b9f56fd883cf6d6abc6d7..3c9590e488d1895fcc5c630f846c0fb63aea12f5 100644
--- a/tensorflow/core/util/tensor_slice_reader_test.cc
+++ b/tensorflow/core/util/tensor_slice_reader_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
@@ -422,7 +423,7 @@ static void VersionTest(const VersionDef& versions, const string& error) {
   // Read it back in and verify that we get the expected error
   TensorSliceReader reader(path, OpenTableTensorSliceReader);
   EXPECT_TRUE(reader.status().code() == error::INVALID_ARGUMENT &&
-              StringPiece(reader.status().error_message()).starts_with(error))
+              str_util::StartsWith(reader.status().error_message(), error))
       << "Expected error starting with '" << errors::InvalidArgument(error)
       << "', got '" << reader.status() << "'";
 }
diff --git a/tensorflow/core/util/tensor_slice_writer_test.cc b/tensorflow/core/util/tensor_slice_writer_test.cc
index ff5bfd65aef360cd89908a94bee7d8bb052f1905..31397f11b66ce7b2a64fa7f5e508a801a1d47969 100644
--- a/tensorflow/core/util/tensor_slice_writer_test.cc
+++ b/tensorflow/core/util/tensor_slice_writer_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
@@ -333,8 +334,8 @@ TEST(TensorSliceWriteTest, SizeErrors) {
     const std::vector<int8> data(300000000, -1);
     Status s = writer.Add("test1", shape, slice, data.data());
     EXPECT_EQ(s.code(), error::INVALID_ARGUMENT);
-    EXPECT_TRUE(StringPiece(s.error_message())
-                    .contains("Tensor slice is too large to serialize"));
+    EXPECT_TRUE(str_util::StrContains(
+        s.error_message(), "Tensor slice is too large to serialize"));
   }
 
   // Add a large string tensor slice, which will fail.
@@ -344,8 +345,8 @@ TEST(TensorSliceWriteTest, SizeErrors) {
     const std::vector<string> data(256 * 1024, std::string(8192, 'f'));
     Status s = writer.Add("test2", shape, slice, data.data());
     EXPECT_EQ(s.code(), error::INVALID_ARGUMENT);
-    EXPECT_TRUE(StringPiece(s.error_message())
-                    .contains("Tensor slice is too large to serialize"));
+    EXPECT_TRUE(str_util::StrContains(
+        s.error_message(), "Tensor slice is too large to serialize"));
   }
 }
 
diff --git a/tensorflow/docs_src/about/uses.md b/tensorflow/docs_src/about/uses.md
index d646880bd350c42e463680a5c7eb0903f2c0a497..d3db98203e8746b8d824d3ac853dcfbc35ab9d25 100644
--- a/tensorflow/docs_src/about/uses.md
+++ b/tensorflow/docs_src/about/uses.md
@@ -18,9 +18,9 @@ This section describes some of the current uses of the TensorFlow system.
 
 > If you are using TensorFlow for research, for education, or for production
 > usage in some product, we would love to add something about your usage here.
-> Please feel free to email us a brief description of how you're using
-> TensorFlow, or even better, send us a pull request to add an entry to this
-> file.
+> Please feel free to [email us](mailto:usecases@tensorflow.org) a brief
+> description of how you're using TensorFlow, or even better, send us a
+> pull request to add an entry to this file.
 
 * **Deep Speech**
 <ul>
diff --git a/tensorflow/docs_src/api_guides/python/contrib.bayesflow.monte_carlo.md b/tensorflow/docs_src/api_guides/python/contrib.bayesflow.monte_carlo.md
index 956dccb64f971f8f2f5b97422583ea5913da1ff5..f3db5857aecce2467026d4e02960df906138b84d 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.bayesflow.monte_carlo.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.bayesflow.monte_carlo.md
@@ -6,42 +6,42 @@ Monte Carlo integration and helpers.
 ## Background
 
 Monte Carlo integration refers to the practice of estimating an expectation with
-a sample mean.  For example, given random variable `Z in R^k` with density `p`,
+a sample mean.  For example, given random variable `Z in \\(R^k\\)` with density `p`,
 the expectation of function `f` can be approximated like:
 
 ```
-E_p[f(Z)] = \int f(z) p(z) dz
-          ~ S_n
-          := n^{-1} \sum_{i=1}^n f(z_i),  z_i iid samples from p.
+$$E_p[f(Z)] = \int f(z) p(z) dz$$
+$$          ~ S_n
+          := n^{-1} \sum_{i=1}^n f(z_i),  z_i\ iid\ samples\ from\ p.$$
 ```
 
-If `E_p[|f(Z)|] < infinity`, then `S_n --> E_p[f(Z)]` by the strong law of large
-numbers.  If `E_p[f(Z)^2] < infinity`, then `S_n` is asymptotically normal with
-variance `Var[f(Z)] / n`.
+If `\\(E_p[|f(Z)|] < infinity\\)`, then `\\(S_n\\) --> \\(E_p[f(Z)]\\)` by the strong law of large
+numbers.  If `\\(E_p[f(Z)^2] < infinity\\)`, then `\\(S_n\\)` is asymptotically normal with
+variance `\\(Var[f(Z)] / n\\)`.
 
 Practitioners of Bayesian statistics often find themselves wanting to estimate
-`E_p[f(Z)]` when the distribution `p` is known only up to a constant.  For
+`\\(E_p[f(Z)]\\)` when the distribution `p` is known only up to a constant.  For
 example, the joint distribution `p(z, x)` may be known, but the evidence
-`p(x) = \int p(z, x) dz` may be intractable.  In that case, a parameterized
-distribution family `q_lambda(z)` may be chosen, and the optimal `lambda` is the
-one minimizing the KL divergence between `q_lambda(z)` and
-`p(z | x)`.  We only know `p(z, x)`, but that is sufficient to find `lambda`.
+`\\(p(x) = \int p(z, x) dz\\)` may be intractable.  In that case, a parameterized
+distribution family `\\(q_\lambda(z)\\)` may be chosen, and the optimal `\\(\lambda\\)` is the
+one minimizing the KL divergence between `\\(q_\lambda(z)\\)` and
+`\\(p(z | x)\\)`.  We only know `p(z, x)`, but that is sufficient to find `\\(\lambda\\)`.
 
 
 ## Log-space evaluation and subtracting the maximum
 
 Care must be taken when the random variable lives in a high dimensional space.
-For example, the naive importance sample estimate `E_q[f(Z) p(Z) / q(Z)]`
-involves the ratio of two terms `p(Z) / q(Z)`, each of which must have tails
-dropping off faster than `O(|z|^{-(k + 1)})` in order to have finite integral.
+For example, the naive importance sample estimate `\\(E_q[f(Z) p(Z) / q(Z)]\\)`
+involves the ratio of two terms `\\(p(Z) / q(Z)\\)`, each of which must have tails
+dropping off faster than `\\(O(|z|^{-(k + 1)})\\)` in order to have finite integral.
 This ratio would often be zero or infinity up to numerical precision.
 
 For that reason, we write
 
 ```
-Log E_q[ f(Z) p(Z) / q(Z) ]
-   = Log E_q[ exp{Log[f(Z)] + Log[p(Z)] - Log[q(Z)] - C} ] + C,  where
-C := Max[ Log[f(Z)] + Log[p(Z)] - Log[q(Z)] ].
+$$Log E_q[ f(Z) p(Z) / q(Z) ]$$
+$$   = Log E_q[ \exp\{Log[f(Z)] + Log[p(Z)] - Log[q(Z)] - C\} ] + C,$$  where
+$$C := Max[ Log[f(Z)] + Log[p(Z)] - Log[q(Z)] ].$$
 ```
 
 The maximum value of the exponentiated term will be 0.0, and the expectation
diff --git a/tensorflow/docs_src/api_guides/python/contrib.distributions.bijectors.md b/tensorflow/docs_src/api_guides/python/contrib.distributions.bijectors.md
index 0ce187b329bce38fe096f2640a09cc93c71f9543..e169897f31717d994a0229f1e1b485874d2b0572 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.distributions.bijectors.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.distributions.bijectors.md
@@ -28,6 +28,5 @@ To apply a `Bijector`, use `distributions.TransformedDistribution`.
 *   @{tf.contrib.distributions.bijectors.Inline}
 *   @{tf.contrib.distributions.bijectors.Invert}
 *   @{tf.contrib.distributions.bijectors.PowerTransform}
-*   @{tf.contrib.distributions.bijectors.SigmoidCentered}
 *   @{tf.contrib.distributions.bijectors.SoftmaxCentered}
 *   @{tf.contrib.distributions.bijectors.Softplus}
diff --git a/tensorflow/docs_src/api_guides/python/contrib.graph_editor.md b/tensorflow/docs_src/api_guides/python/contrib.graph_editor.md
index de4f126507930331d348cc795bd03b9971778d07..20fe88a799b3e0f6767207eb36d132d4c9e4b220 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.graph_editor.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.graph_editor.md
@@ -61,21 +61,21 @@ A subgraph can be created in several ways:
 
 * using a list of ops:
 
-```python
-my_sgv = ge.sgv(ops)
-```
+  ```python
+  my_sgv = ge.sgv(ops)
+  ```
 
 * from a name scope:
 
-```python
-my_sgv = ge.sgv_scope("foo/bar", graph=tf.get_default_graph())
-```
+  ```python
+  my_sgv = ge.sgv_scope("foo/bar", graph=tf.get_default_graph())
+  ```
 
 * using regular expression:
 
-```python
-my_sgv = ge.sgv("foo/.*/.*read$", graph=tf.get_default_graph())
-```
+  ```python
+  my_sgv = ge.sgv("foo/.*/.*read$", graph=tf.get_default_graph())
+  ```
 
 Note that the Graph Editor is meant to manipulate several graphs at the same
 time, typically during transform or copy operation. For that reason,
diff --git a/tensorflow/docs_src/api_guides/python/contrib.losses.md b/tensorflow/docs_src/api_guides/python/contrib.losses.md
index d7f862625e02a50cd716118f882344c1d16ffe1c..8b7442216c05ccb0df6be540edb15165ff4752c1 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.losses.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.losses.md
@@ -107,19 +107,19 @@ weighted average over the individual prediction errors:
   loss = tf.contrib.losses.mean_squared_error(predictions, depths, weight)
 ```
 
-@{tf.contrib.losses.absolute_difference}
-@{tf.contrib.losses.add_loss}
-@{tf.contrib.losses.hinge_loss}
-@{tf.contrib.losses.compute_weighted_loss}
-@{tf.contrib.losses.cosine_distance}
-@{tf.contrib.losses.get_losses}
-@{tf.contrib.losses.get_regularization_losses}
-@{tf.contrib.losses.get_total_loss}
-@{tf.contrib.losses.log_loss}
-@{tf.contrib.losses.mean_pairwise_squared_error}
-@{tf.contrib.losses.mean_squared_error}
-@{tf.contrib.losses.sigmoid_cross_entropy}
-@{tf.contrib.losses.softmax_cross_entropy}
-@{tf.contrib.losses.sparse_softmax_cross_entropy}
+* @{tf.contrib.losses.absolute_difference}
+* @{tf.contrib.losses.add_loss}
+* @{tf.contrib.losses.hinge_loss}
+* @{tf.contrib.losses.compute_weighted_loss}
+* @{tf.contrib.losses.cosine_distance}
+* @{tf.contrib.losses.get_losses}
+* @{tf.contrib.losses.get_regularization_losses}
+* @{tf.contrib.losses.get_total_loss}
+* @{tf.contrib.losses.log_loss}
+* @{tf.contrib.losses.mean_pairwise_squared_error}
+* @{tf.contrib.losses.mean_squared_error}
+* @{tf.contrib.losses.sigmoid_cross_entropy}
+* @{tf.contrib.losses.softmax_cross_entropy}
+* @{tf.contrib.losses.sparse_softmax_cross_entropy}
 
 
diff --git a/tensorflow/docs_src/api_guides/python/io_ops.md b/tensorflow/docs_src/api_guides/python/io_ops.md
index 94cf0de32a2d2ea16d1581e7c42a08b59aa52888..86b4b39409863f09c3669dc6971901f6350377ca 100644
--- a/tensorflow/docs_src/api_guides/python/io_ops.md
+++ b/tensorflow/docs_src/api_guides/python/io_ops.md
@@ -8,7 +8,7 @@ Note: Functions taking `Tensor` arguments can also take anything accepted by
 ## Placeholders
 
 TensorFlow provides a placeholder operation that must be fed with data
-on execution.  For more info, see the section on @{$reading_data#feeding$Feeding data}.
+on execution.  For more info, see the section on @{$reading_data#Feeding$Feeding data}.
 
 *   @{tf.placeholder}
 *   @{tf.placeholder_with_default}
@@ -42,7 +42,7 @@ formats into tensors.
 
 ### Example protocol buffer
 
-TensorFlow's @{$reading_data#standard-tensorflow-format$recommended format for training examples}
+TensorFlow's @{$reading_data#standard_tensorflow_format$recommended format for training examples}
 is serialized `Example` protocol buffers, [described
 here](https://www.tensorflow.org/code/tensorflow/core/example/example.proto).
 They contain `Features`, [described
diff --git a/tensorflow/docs_src/api_guides/python/nn.md b/tensorflow/docs_src/api_guides/python/nn.md
index 8e6fd1cff93332b84f552c18f627ba05dc67103e..8d8daaae19fa3e7863f9fa88393c35a3d95edf87 100644
--- a/tensorflow/docs_src/api_guides/python/nn.md
+++ b/tensorflow/docs_src/api_guides/python/nn.md
@@ -89,7 +89,7 @@ bottom. Note that this is different from existing libraries such as cuDNN and
 Caffe, which explicitly specify the number of padded pixels and always pad the
 same number of pixels on both sides.
 
-For the `'VALID`' scheme, the output height and width are computed as:
+For the `'VALID'` scheme, the output height and width are computed as:
 
     out_height = ceil(float(in_height - filter_height + 1) / float(strides[1]))
     out_width  = ceil(float(in_width - filter_width + 1) / float(strides[2]))
@@ -98,10 +98,10 @@ and no padding is used.
 
 Given the output size and the padding, the output can be computed as
 
-    output[b, i, j, :] =
-        sum_{di, dj} input[b, strides[1] * i + di - pad_top,
-                           strides[2] * j + dj - pad_left, ...] *
-                     filter[di, dj, ...]
+$$    output[b, i, j, :] =
+        sum_{d_i, d_j} input[b, strides[1] * i + d_i - pad_{top},\
+                           strides[2] * j + d_j - pad_{left}, ...] *
+                     filter[d_i, d_j,\ ...]$$
 
 where any value outside the original input image region are considered zero (
 i.e. we pad zero values around the border of the image).
@@ -161,12 +161,12 @@ Morphological operators are non-linear filters used in image processing.
 ](https://en.wikipedia.org/wiki/Dilation_(morphology))
 is the max-sum counterpart of standard sum-product convolution:
 
-    output[b, y, x, c] =
+$$    output[b, y, x, c] =
         max_{dy, dx} input[b,
                            strides[1] * y + rates[1] * dy,
                            strides[2] * x + rates[2] * dx,
                            c] +
-                     filter[dy, dx, c]
+                     filter[dy, dx, c]$$
 
 The `filter` is usually called structuring function. Max-pooling is a special
 case of greyscale morphological dilation when the filter assumes all-zero
@@ -176,12 +176,12 @@ values (a.k.a. flat structuring function).
 ](https://en.wikipedia.org/wiki/Erosion_(morphology))
 is the min-sum counterpart of standard sum-product convolution:
 
-    output[b, y, x, c] =
+$$    output[b, y, x, c] =
         min_{dy, dx} input[b,
                            strides[1] * y - rates[1] * dy,
                            strides[2] * x - rates[2] * dx,
                            c] -
-                     filter[dy, dx, c]
+                     filter[dy, dx, c]$$
 
 Dilation and erosion are dual to each other. The dilation of the input signal
 `f` by the structuring signal `g` is equal to the negation of the erosion of
diff --git a/tensorflow/docs_src/api_guides/python/state_ops.md b/tensorflow/docs_src/api_guides/python/state_ops.md
index 0d612ee0c7e5e3693cf8a46813633dcc22229355..ec2d8773860f0595cabe91d591a5fdc025e99b83 100644
--- a/tensorflow/docs_src/api_guides/python/state_ops.md
+++ b/tensorflow/docs_src/api_guides/python/state_ops.md
@@ -83,6 +83,8 @@ automatically by the optimizers in most cases.
 *   @{tf.scatter_sub}
 *   @{tf.scatter_mul}
 *   @{tf.scatter_div}
+*   @{tf.scatter_min}
+*   @{tf.scatter_max}
 *   @{tf.scatter_nd_update}
 *   @{tf.scatter_nd_add}
 *   @{tf.scatter_nd_sub}
diff --git a/tensorflow/docs_src/community/contributing.md b/tensorflow/docs_src/community/contributing.md
new file mode 100644
index 0000000000000000000000000000000000000000..afbb8bbdd0fd25f1e4fa607ac6b4f74e4cc37c0c
--- /dev/null
+++ b/tensorflow/docs_src/community/contributing.md
@@ -0,0 +1,49 @@
+# Contributing to TensorFlow
+
+TensorFlow is an open-source project, and we welcome your participation
+and contribution. This page describes how to get involved.
+
+## Repositories
+
+The code for TensorFlow is hosted in the [TensorFlow GitHub
+organization](https://github.com/tensorflow). Multiple projects are located
+inside the organization, including:
+
+* [TensorFlow](https://github.com/tensorflow/tensorflow)
+* [Models](https://github.com/tensorflow/models)
+* [TensorBoard](https://github.com/tensorflow/tensorboard)
+* [TensorFlow.js](https://github.com/tensorflow/tfjs)
+* [TensorFlow Serving](https://github.com/tensorflow/serving)
+* [TensorFlow Documentation](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/docs_src)
+
+## Contributor checklist
+
+* Before contributing to TensorFlow source code, please review the [contribution
+guidelines](https://github.com/tensorflow/tensorflow/blob/master/CONTRIBUTING.md).
+
+* Join the
+[developers@tensorflow.org](https://groups.google.com/a/tensorflow.org/d/forum/developers)
+mailing list, to coordinate and discuss with others contributing to TensorFlow.
+
+* For coding style conventions, read the @{$style_guide$TensorFlow Style Guide}.
+
+* Finally, review @{$documentation$Writing TensorFlow Documentation}, which
+  explains documentation conventions.
+
+You may also wish to review our guide to @{$benchmarks$defining and running benchmarks}.
+
+## Special Interest Groups
+
+To enable focused collaboration on particular areas of TensorFlow, we host
+Special Interest Groups (SIGs). SIGs do their work in public: if you want to
+join and contribute, review the work of the group, and get in touch with the
+relevant SIG leader.  Membership policies vary on a per-SIG basis.
+
+* **SIG Build** focuses on issues surrounding building, packaging, and
+  distribution of TensorFlow. [Mailing list](https://groups.google.com/a/tensorflow.org/d/forum/build).
+
+* **SIG TensorBoard** furthers the development and direction of TensorBoard and its plugins.
+  [Mailing list](https://groups.google.com/a/tensorflow.org/d/forum/sig-tensorboard).
+
+* **SIG Rust** collaborates on the development of TensorFlow's Rust bindings.
+  [Mailing list](https://groups.google.com/a/tensorflow.org/d/forum/rust).
diff --git a/tensorflow/docs_src/community/documentation.md b/tensorflow/docs_src/community/documentation.md
index 003e0a25ecd7c6afcc42aed08bd5d91f7c85a9bb..6f2107ef4086f863e113dbffdebbb4fcbb6c7a99 100644
--- a/tensorflow/docs_src/community/documentation.md
+++ b/tensorflow/docs_src/community/documentation.md
@@ -477,31 +477,29 @@ should use Markdown in the docstring.
 
 Here's a simple example:
 
-```python
-def foo(x, y, name="bar"):
-  """Computes foo.
+    def foo(x, y, name="bar"):
+      """Computes foo.
 
-  Given two 1-D tensors `x` and `y`, this operation computes the foo.
+      Given two 1-D tensors `x` and `y`, this operation computes the foo.
 
-  Example:
+      Example:
 
-  ```
-  # x is [1, 1]
-  # y is [2, 2]
-  tf.foo(x, y) ==> [3, 3]
-  ```
-  Args:
-    x: A `Tensor` of type `int32`.
-    y: A `Tensor` of type `int32`.
-    name: A name for the operation (optional).
+      ```
+      # x is [1, 1]
+      # y is [2, 2]
+      tf.foo(x, y) ==> [3, 3]
+      ```
+      Args:
+        x: A `Tensor` of type `int32`.
+        y: A `Tensor` of type `int32`.
+        name: A name for the operation (optional).
 
-  Returns:
-    A `Tensor` of type `int32` that is the foo of `x` and `y`.
+      Returns:
+        A `Tensor` of type `int32` that is the foo of `x` and `y`.
 
-  Raises:
-    ValueError: If `x` or `y` are not of type `int32`.
-  """
-```
+      Raises:
+        ValueError: If `x` or `y` are not of type `int32`.
+      """
 
 ## Description of the docstring sections
 
diff --git a/tensorflow/docs_src/community/groups.md b/tensorflow/docs_src/community/groups.md
new file mode 100644
index 0000000000000000000000000000000000000000..d92f5775fafa394d795dd451077a721a2ecbb259
--- /dev/null
+++ b/tensorflow/docs_src/community/groups.md
@@ -0,0 +1,17 @@
+# User Groups
+
+TensorFlow has communities around the world.
+
+## Asia
+
+* [TensorFlow Korea (TF-KR) User Group](https://www.facebook.com/groups/TensorFlowKR/) _(Korean language)_
+* [TensorFlow User Group Tokyo](https://tfug-tokyo.connpass.com/) _(Japanese Language)_
+* [Soleil Data Dojo](https://soleildatadojo.connpass.com/) _(Japanese language)_
+* [TensorFlow User Group Utsunomiya](https://tfug-utsunomiya.connpass.com/)
+
+
+## Europe
+
+* [TensorFlow Barcelona](https://www.meetup.com/Barcelona-Machine-Learning-Meetup/)
+* [TensorFlow Madrid](https://www.meetup.com/TensorFlow-Madrid/)
+
diff --git a/tensorflow/docs_src/community/index.md b/tensorflow/docs_src/community/index.md
index b706d9b2047a4ff9707772edb30bfd036bbffc24..eec2e51a8706b73abcedb8329df3ad03e3b349c3 100644
--- a/tensorflow/docs_src/community/index.md
+++ b/tensorflow/docs_src/community/index.md
@@ -1,15 +1,85 @@
 # Community
 
-This section contains the following documents:
-
-  * @{$welcome$Welcome to the TensorFlow Community}, which explains how
-    you can get involved, where to report issues, and where to join
-    like-minded TensorFlow enthusiasts online.
-  * @{$roadmap$Roadmap}, which summarizes upcoming additions to TensorFlow.
-  * @{$documentation$Writing TensorFlow Documentation}, which explains
-    TensorFlow's documentation conventions.  If you are modifying
-    TensorFlow source code or documentation, please read this guide.
-  * @{$style_guide$TensorFlow Style Guide}, which identifies coding style
-    conventions that TensorFlow developers and users should follow.
-  * @{$community/benchmarks$Benchmarks}, Benchmarks, a guide for defining and
-    running a TensorFlow benchmark.
+Welcome to the TensorFlow community! This page explains where to get help, and
+different ways to be part of the community. We are committed to fostering an
+open and welcoming environment, and request that you review our [code of
+conduct](https://github.com/tensorflow/tensorflow/blob/master/CODE_OF_CONDUCT.md).
+
+## Get Help
+
+### Technical Questions
+
+To ask or answer technical questions about TensorFlow, use [Stack
+Overflow](https://stackoverflow.com/questions/tagged/tensorflow). For example,
+ask or search about a particular error message you encountered during
+installation.
+
+### Bugs and Feature Requests
+
+To report bugs or make feature requests, file an issue on GitHub. Please choose
+the appropriate repository for the project. Major repositories include:
+
+  * [TensorFlow](https://github.com/tensorflow/tensorflow/issues)
+  * [TensorBoard](https://github.com/tensorflow/tensorboard/issues)
+  * [TensorFlow models](https://github.com/tensorflow/models/issues)
+  
+### Security
+
+Before using TensorFlow, please take a look at our security model, list of
+recent security announcements, and ways you can report security issues to the
+TensorFlow team at the
+[Using TensorFlow Securely](https://github.com/tensorflow/tensorflow/blob/master/SECURITY.md) page on GitHub.
+
+## Stay Informed
+
+### Announcements Mailing List
+
+All major releases and important announcements are sent to
+[announce@tensorflow.org](https://groups.google.com/a/tensorflow.org/forum/#!forum/announce).
+We recommend that you join this list if you depend on TensorFlow in any way.
+
+### Development Roadmap
+
+The @{$roadmap$Roadmap} summarizes plans for upcoming additions to TensorFlow.
+
+### Social Media
+
+For news and updates from around the universe of TensorFlow projects, follow
+[@tensorflow](https://twitter.com/tensorflow) on Twitter.
+
+### Blog
+
+We post regularly to the [TensorFlow Blog](http://blog.tensorflow.org/),
+with content from the TensorFlow team and the best articles from the community.
+
+### YouTube
+
+Our [YouTube Channel](http://youtube.com/tensorflow/) focuses on machine learing
+and AI with TensorFlow. On it we have a number of new shows, including:
+
+- TensorFlow Meets: meet with community contributors to learn and share what they're doing
+- Ask TensorFlow: the team answers the best questions tagged #AskTensorFlow from social media 
+- Coding TensorFlow: short bites with tips for success with TensorFlow
+
+## Community Support
+
+### Mailing Lists
+
+For general discussion about TensorFlow development and direction, please join
+the [TensorFlow discuss mailing
+list](https://groups.google.com/a/tensorflow.org/d/forum/discuss).
+
+A number of other mailing lists exist, focused on different project areas, which
+can be found at @{$lists$TensorFlow Mailing Lists}.
+
+### User Groups
+
+To meet with like-minded people local to you, check out the many
+@{$groups$TensorFlow user groups} around the world.
+
+
+## Contributing To TensorFlow
+
+We welcome contributions and collaboration on TensorFlow. For more information,
+please read [Contributing to TensorFlow](contributing.md).
+
diff --git a/tensorflow/docs_src/community/leftnav_files b/tensorflow/docs_src/community/leftnav_files
index fab35024ad63e09adba1298eab52f7904eca1007..2bae60d9ddc5c18f67b1611054ac58b072e9674a 100644
--- a/tensorflow/docs_src/community/leftnav_files
+++ b/tensorflow/docs_src/community/leftnav_files
@@ -1,6 +1,9 @@
 index.md
-welcome.md
 roadmap.md
+contributing.md
+lists.md
+groups.md
 documentation.md
 style_guide.md
 benchmarks.md
+swift.md
diff --git a/tensorflow/docs_src/community/lists.md b/tensorflow/docs_src/community/lists.md
new file mode 100644
index 0000000000000000000000000000000000000000..7450ab36c436538dd584541fb0dafb5a2c6067b3
--- /dev/null
+++ b/tensorflow/docs_src/community/lists.md
@@ -0,0 +1,51 @@
+# Mailing Lists
+
+As a community, we do much of our collaboration on public mailing lists.
+Please note that if you're looking for help using TensorFlow, [Stack
+Overflow](https://stackoverflow.com/questions/tagged/tensorflow) and
+[GitHub issues](https://github.com/tensorflow/tensorflow/issues)
+are the best initial places to look. For more information,
+see [how to get help](/community/#get_help).
+
+## General TensorFlow lists
+
+* [announce](https://groups.google.com/a/tensorflow.org/d/forum/announce) - Low-volume announcements of new releases.
+* [discuss](https://groups.google.com/a/tensorflow.org/d/forum/discuss) - General community discussion around TensorFlow.
+* [developers](https://groups.google.com/a/tensorflow.org/d/forum/developers) - Discussion for developers contributing to TensorFlow.
+
+## Project-specific lists
+
+These projects inside the TensorFlow GitHub organization have lists dedicated to their communities:
+
+* [hub](https://groups.google.com/a/tensorflow.org/d/forum/hub) -
+  Discussion and collaboration around [TensorFlow Hub](https://github.com/tensorflow/hub).
+* [magenta-discuss](https://groups.google.com/a/tensorflow.org/d/forum/magenta-discuss) -
+  General discussion about [Magenta](https://magenta.tensorflow.org/)
+  development and directions.
+* [swift](https://groups.google.com/a/tensorflow.org/d/forum/swift) -
+  Community and collaboration around Swift for TensorFlow.
+* [tensor2tensor](https://groups.google.com/d/forum/tensor2tensor) - Discussion
+  and peer support for Tensor2Tensor.
+* [tfjs-announce](https://groups.google.com/a/tensorflow.org/d/forum/tfjs-announce) -
+  Announcements of new TensorFlow.js releases.
+* [tfjs](https://groups.google.com/a/tensorflow.org/d/forum/tfjs) - Discussion
+  and peer support for TensorFlow.js.
+* [tflite](https://groups.google.com/a/tensorflow.org/d/forum/tflite) - Discussion and
+  peer support for TensorFlow Lite.
+* [tpu-users](https://groups.google.com/a/tensorflow.org/d/forum/tpu-users) - Community discussion
+  and support for TPU users.
+
+## Special Interest Groups
+
+TensorFlow's [Special Interest
+Groups](/community/contributing#special_interest_groups) (SIGs) support
+community collaboration on particular project focuses. Members of these groups
+work together to build and support TensorFlow related projects. While their
+archives are public, different SIGs have their own membership policies.
+
+* [build](https://groups.google.com/a/tensorflow.org/d/forum/build) -
+  Supporting SIG Build, for build, distribution and packaging of TensorFlow.
+* [sig-tensorboard](https://groups.google.com/a/tensorflow.org/d/forum/sig-tensorboard) -
+  Supporting SIG TensorBoard, for plugin development and other contribution.
+* [rust](https://groups.google.com/a/tensorflow.org/d/forum/rust) -
+  Supporting SIG Rust, for the Rust language bindings.
diff --git a/tensorflow/docs_src/community/security.md b/tensorflow/docs_src/community/security.md
new file mode 100644
index 0000000000000000000000000000000000000000..8d13c7a1eaf336193b39ea3a9ee8e316a04fcb63
--- /dev/null
+++ b/tensorflow/docs_src/community/security.md
@@ -0,0 +1,7 @@
+# Using TensorFlow Securely
+
+Before using TensorFlow, please take a look at our security model, list of
+recent security announcements, and ways you can report security issues to the
+TensorFlow team at the
+[https://github.com/tensorflow/tensorflow/blob/master/SECURITY.md](Using
+TensorFlow Securely) page on GitHub.
diff --git a/tensorflow/docs_src/community/swift.md b/tensorflow/docs_src/community/swift.md
new file mode 100644
index 0000000000000000000000000000000000000000..54d9960b23405bb1a384477b62fd161eb4b06faa
--- /dev/null
+++ b/tensorflow/docs_src/community/swift.md
@@ -0,0 +1,35 @@
+# Swift Community
+
+Welcome to the Swift for TensorFlow development community!
+
+Swift for TensorFlow is a result of first-principles thinking applied to machine
+learning frameworks, and works quite differently than existing TensorFlow
+language bindings.  Whereas prior solutions are designed within the constraints
+of what can be achieved by a (typically Python or Lua) library, Swift for
+TensorFlow is based on the belief that machine learning is important enough to
+deserve first-class language and compiler support.
+
+First-class language and compiler support allows us to innovate in areas that
+have traditionally been out of bounds for machine learning libraries.  Our
+results provide the performance of TensorFlow graphs with the ease of use of
+define-by-run models, and provides a great user experience - for example, by
+catching more mistakes before you run your code.
+
+## Open Source
+
+As announced at the TensorFlow Developer Summit, we are planning to launch our
+open source project on GitHub in April.  In addition to releasing the code, we
+will be using an open design model, where design discussions happen in public.
+
+Between now and then, we are writing some technical white papers that explain in
+detail the design approach (e.g., the core compiler partitioning technique that
+underlies the whole thing, our approach to automatic differentiation, etc.),
+implementation tradeoffs, and the status of this work.  We can’t wait to engage
+with the broader community, but prefer to start the conversation when these
+white papers are ready.
+
+[Sign up here to join the community Google
+group](https://groups.google.com/a/tensorflow.org/d/forum/swift). We will
+initially use it for announcements, and then open it for general discussion when
+we are ready in April.
+
diff --git a/tensorflow/docs_src/community/welcome.md b/tensorflow/docs_src/community/welcome.md
deleted file mode 100644
index 6d0458e678b5507fc722e2c3848e84ca2168e1e3..0000000000000000000000000000000000000000
--- a/tensorflow/docs_src/community/welcome.md
+++ /dev/null
@@ -1,71 +0,0 @@
-# Welcome to the TensorFlow Community
-
-TensorFlow is an open-source project.  This page explains how to contribute,
-where to ask questions, and how to help each other.
-
-
-## Development
-
-The source code for TensorFlow is on
-[GitHub](https://github.com/tensorflow/tensorflow).
-
-Before contributing to TensorFlow source code, please review the
-[Contribution guidelines](https://github.com/tensorflow/tensorflow/blob/master/CONTRIBUTING.md).
-
-### Projects developed by the TensorFlow community
-
-The TensorFlow community has created many great projects around TensorFlow, including:
-
-* [Machine Learning with TensorFlow (Book & Code)](http://tensorflowbook.com)
-* [@jtoy's awesome "Awesome TensorFlow" list of awesome things](https://github.com/jtoy/awesome-tensorflow)
-* [TensorFlow tutorials](https://github.com/pkmital/tensorflow_tutorials)
-* [Caffe to TensorFlow model converter](https://github.com/ethereon/caffe-tensorflow)
-* [Bitfusion's` GPU-enabled AWS EC2 TensorFlow AMI](https://github.com/bitfusionio/amis/tree/master/awsmrkt-bfboost-ubuntu14-cuda75-tensorflow) ([Launch AMI](https://aws.amazon.com/marketplace/pp/B01EYKBEQ0))
-* [Rust language bindings](https://github.com/google/tensorflow-rust)
-* [Operator Vectorization Library](https://github.com/opveclib/opveclib)
-* [Swift language bindings](https://github.com/PerfectlySoft/Perfect-TensorFlow)
-* [Sublime Tensorflow - A plugin for Sublime Text](https://github.com/baptisteArnaud/Sublime-Tensorflow)
-* [Edward - A library for probabilistic modeling, inference, and criticism](http://edwardlib.org) ([Github](https://github.com/blei-lab/edward), [Forum](https://discourse.edwardlib.org))
-* [GPflow - Gaussian processes in TensorFlow](https://github.com/GPflow/GPflow)
-* [CS 20SI: Tensorflow for Deep Learning Research](https://web.stanford.edu/class/cs20si/) - Please note, this course was designed with TensorFlow v0.12, so some of the notes may be out of date - but it's still a great resource.
-
-## TensorFlow Communities Around the World
-
-Asia:
-
-* [TensorFlow Korea (TF-KR) User Group](https://www.facebook.com/groups/TensorFlowKR/) _(Korean language)_
-* [TensorFlow User Group Tokyo](https://tfug-tokyo.connpass.com/) _(Japanese Language)_
-* [Soleil Data Dojo](https://soleildatadojo.connpass.com/) _(Japanese language)_
-* [TensorFlow User Group Utsunomiya](https://tfug-utsunomiya.connpass.com/)
-
-
-Europe:
-
-* [TensorFlow Barcelona](https://www.meetup.com/Barcelona-Machine-Learning-Meetup/)
-* [TensorFlow Madrid](https://www.meetup.com/TensorFlow-Madrid/)
-
-
-
-## Support
-
-TensorFlow provides multiple communication paths.  To pick the right path,
-please read the following list carefully:
-
-  * For new release announcements and security updates, subscribe to
-    [announce@tensorflow.org](https://groups.google.com/a/tensorflow.org/forum/#!forum/announce).
-  * To ask or answer technical questions about TensorFlow, use
-    [Stack Overflow](https://stackoverflow.com/questions/tagged/tensorflow).
-    For example, ask or search Stack Overflow about a particular error message
-    you encountered during installation.
-  * To join general discussions about TensorFlow development and directions,
-    please join the
-    [TensorFlow discuss mailing list](https://groups.google.com/a/tensorflow.org/d/forum/discuss).
-    For example, use this mailing list to learn about new features in
-    upcoming releases of TensorFlow.
-  * To report bugs or make feature requests, use the
-    [TensorFlow issues tracker](https://github.com/tensorflow/tensorflow/issues)
-    on GitHub.  For example, use the issue tracker to request a
-    new operation in TensorFlow.
-  * To report vulnerabilities, please follow our
-    [vulnerability disclosure guidelines](https://github.com/tensorflow/tensorflow/blob/master/SECURITY.md).
-
diff --git a/tensorflow/docs_src/deploy/deploy_to_js.md b/tensorflow/docs_src/deploy/deploy_to_js.md
new file mode 100644
index 0000000000000000000000000000000000000000..d7ce3ea90bda25a84c6dc8ca52e97b1613043c0b
--- /dev/null
+++ b/tensorflow/docs_src/deploy/deploy_to_js.md
@@ -0,0 +1,4 @@
+# Deploy to JavaScript
+
+You can find details about deploying JavaScript TensorFlow programs
+in the separate [js.tensorflow.org site](https://js.tensorflow.org).
diff --git a/tensorflow/docs_src/deploy/leftnav_files b/tensorflow/docs_src/deploy/leftnav_files
index c682e7add16c741279aedb40c1b12f4ca8f0286a..93f5bd1ed20d34eaf7c9ef64ea89e5632331d5c1 100644
--- a/tensorflow/docs_src/deploy/leftnav_files
+++ b/tensorflow/docs_src/deploy/leftnav_files
@@ -2,3 +2,4 @@ index.md
 distributed.md
 hadoop.md
 s3.md
+deploy_to_js.md
diff --git a/tensorflow/docs_src/extend/add_filesys.md b/tensorflow/docs_src/extend/add_filesys.md
index 06f11de4eb0ea7878b01cd37d994c5a40ec400be..bc0f662f0cf8054add41c4c677e369a9e1582343 100644
--- a/tensorflow/docs_src/extend/add_filesys.md
+++ b/tensorflow/docs_src/extend/add_filesys.md
@@ -225,7 +225,7 @@ it will use the `FooBarFileSystem` implementation.
 Next, you must build a shared object containing this implementation. An example
 of doing so using bazel's `cc_binary` rule can be found
 [here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/BUILD#L244),
-but you may use any build system to do so. See the section on @{$adding_an_op#build-the-op-library$building the op library} for similar
+but you may use any build system to do so. See the section on @{$adding_an_op#build_the_op_library$building the op library} for similar
 instructions.
 
 The result of building this target is a `.so` shared object file.
diff --git a/tensorflow/docs_src/extend/index.md b/tensorflow/docs_src/extend/index.md
index bdff60b39ec6fe939273a529ec4e46407cface8a..1ab0340ad983de891ef5e18a729c1e4fb3c4e0d9 100644
--- a/tensorflow/docs_src/extend/index.md
+++ b/tensorflow/docs_src/extend/index.md
@@ -16,9 +16,10 @@ TensorFlow:
     for your own file and record formats.
 
 Python is currently the only language supported by TensorFlow's API stability
-promises.  However, TensorFlow also provides functionality in C++, Java, and Go,
+promises. However, TensorFlow also provides functionality in C++, Go, Java and
+[JavaScript](https://js.tensorflow.org),
 plus community support for [Haskell](https://github.com/tensorflow/haskell) and
-[Rust](https://github.com/tensorflow/rust).  If you'd like to create or
+[Rust](https://github.com/tensorflow/rust). If you'd like to create or
 develop TensorFlow features in a language other than these languages, read the
 following guide:
 
diff --git a/tensorflow/docs_src/extend/new_data_formats.md b/tensorflow/docs_src/extend/new_data_formats.md
index b3cc96804740991ada56a9b7f60439a63e9eb895..10e717c280f09c4f1bdfea9d0a2c8d3a00191734 100644
--- a/tensorflow/docs_src/extend/new_data_formats.md
+++ b/tensorflow/docs_src/extend/new_data_formats.md
@@ -167,7 +167,7 @@ REGISTER_KERNEL_BUILDER(Name("TextLineReader").Device(DEVICE_CPU),
 ```
 
 The last step is to add the Python wrapper.  You can either do this by
-@{$adding_an_op#building_the_op_library$compiling a dynamic library}
+@{$adding_an_op#build_the_op_library$compiling a dynamic library}
 or, if you are building TensorFlow from source, adding to `user_ops.py`.
 For the latter, you will import `tensorflow.python.ops.io_ops` in
 [`tensorflow/python/user_ops/user_ops.py`](https://www.tensorflow.org/code/tensorflow/python/user_ops/user_ops.py)
diff --git a/tensorflow/docs_src/get_started/custom_estimators.md b/tensorflow/docs_src/get_started/custom_estimators.md
index 941c3e16905a9062b3081ad0af6bcbc1621a146b..275cda12bc397e1a8a980f6c97e6b2d97c5e64e8 100644
--- a/tensorflow/docs_src/get_started/custom_estimators.md
+++ b/tensorflow/docs_src/get_started/custom_estimators.md
@@ -546,7 +546,7 @@ In brief, here's what the three graphs tell you:
 
 * accuracy: The accuracy is recorded by the following two lines:
 
-    * `eval_metric_ops={'my_accuracy': accuracy})`, during evaluation.
+    * `eval_metric_ops={'my_accuracy': accuracy}`, during evaluation.
     * `tf.summary.scalar('accuracy', accuracy[1])`, during training.
 
 These tensorboard graphs are one of the main reasons it's important to pass a
diff --git a/tensorflow/docs_src/get_started/eager.md b/tensorflow/docs_src/get_started/eager.md
new file mode 100644
index 0000000000000000000000000000000000000000..ad89f0154c06d97673bdb0d598ca5387c61bc6ac
--- /dev/null
+++ b/tensorflow/docs_src/get_started/eager.md
@@ -0,0 +1,3 @@
+# Get Started with Eager Execution
+
+[Colab notebook](https://colab.research.google.com/github/tensorflow/models/blob/master/samples/core/get_started/eager.ipynb)
diff --git a/tensorflow/docs_src/get_started/get_started_for_beginners.md b/tensorflow/docs_src/get_started/get_started_for_beginners.md
index b88483be699630d2275850cbc7c461eeb90f5943..fbe0ed74f82bb34bc55dd7bab5819c0d9fdc54e9 100644
--- a/tensorflow/docs_src/get_started/get_started_for_beginners.md
+++ b/tensorflow/docs_src/get_started/get_started_for_beginners.md
@@ -1,4 +1,4 @@
-# Getting Started for ML Beginners
+# Get Started with Graph Execution
 
 This document explains how to use machine learning to classify (categorize)
 Iris flowers by species.  This document dives deeply into the TensorFlow
@@ -14,6 +14,11 @@ If you are already familiar with basic machine learning concepts
 but are new to TensorFlow, read
 @{$premade_estimators$Getting Started with TensorFlow: for ML Experts}.
 
+If you'd like to learn a lot about the basics of Machine Learning,
+consider taking
+[Machine Learning Crash Course](https://developers.google.com/machine-learning/crash-course/).
+
+
 ## The Iris classification problem
 
 Imagine you are a botanist seeking an automated way to classify each
@@ -86,6 +91,9 @@ a number.  Here's the representation scheme:
 * 1 represents versicolor
 * 2 represents virginica
 
+For a look at other examples of labels and examples, see the
+[ML Terminology section of Machine Learning Crash Course](https://developers.google.com/machine-learning/crash-course/framing/ml-terminology).
+
 
 ## Models and training
 
@@ -371,7 +379,7 @@ There are several categories of neural networks.
 We'll be using a [**fully connected neural
 network**](https://developers.google.com/machine-learning/glossary/#fully_connected_layer),
 which means that the neurons in one layer take inputs from *every* neuron in
-the previous layer.  For example, the following figure illustrates a 
+the previous layer.  For example, the following figure illustrates a
 fully connected neural network consisting of three hidden layers:
 
 *   The first hidden layer contains four neurons.
@@ -385,6 +393,9 @@ fully connected neural network consisting of three hidden layers:
 **A neural network with three hidden layers.**
 <p>&nbsp;</p>
 
+For a more detailed introduction to neural networks, see the
+[Introduction to Neural Nets section of Machine Learning Crash Course](https://developers.google.com/machine-learning/crash-course/introduction-to-neural-networks/anatomy).
+
 To specify a model type, instantiate an
 [**Estimator**](https://developers.google.com/machine-learning/glossary/#Estimators)
 class.  TensorFlow provides two categories of Estimators:
@@ -448,9 +459,9 @@ will become very important.
 
 ### Train the model
 
-Instantiating a `tf.Estimator.DNNClassifier` creates a framework for learning 
-the model. Basically, we've wired a network but haven't yet let data flow 
-through it. To train the neural network, call the Estimator object's `train` 
+Instantiating a `tf.Estimator.DNNClassifier` creates a framework for learning
+the model. Basically, we've wired a network but haven't yet let data flow
+through it. To train the neural network, call the Estimator object's `train`
 method. For example:
 
 ```python
@@ -559,15 +570,15 @@ of 0.5.  The following suggests a more effective model:
     <th colspan="1">Label</th>
     <th colspan="1">Prediction</th>
   </tr>
-  <tr> <td>5.9</td> <td>3.0</td> <td>4.3</td> <td>1.5</td> <td>1</td> 
+  <tr> <td>5.9</td> <td>3.0</td> <td>4.3</td> <td>1.5</td> <td>1</td>
           <td style="background-color:green">1</td></tr>
-  <tr> <td>6.9</td> <td>3.1</td> <td>5.4</td> <td>2.1</td> <td>2</td> 
+  <tr> <td>6.9</td> <td>3.1</td> <td>5.4</td> <td>2.1</td> <td>2</td>
           <td style="background-color:green">2</td></tr>
-  <tr> <td>5.1</td> <td>3.3</td> <td>1.7</td> <td>0.5</td> <td>0</td> 
+  <tr> <td>5.1</td> <td>3.3</td> <td>1.7</td> <td>0.5</td> <td>0</td>
           <td style="background-color:green">0</td></tr>
-  <tr> <td>6.0</td> <td>3.4</td> <td>4.5</td> <td>1.6</td> <td>1</td> 
+  <tr> <td>6.0</td> <td>3.4</td> <td>4.5</td> <td>1.6</td> <td>1</td>
           <td style="background-color:red">2</td></tr>
-  <tr> <td>5.5</td> <td>2.5</td> <td>4.0</td> <td>1.3</td> <td>1</td> 
+  <tr> <td>5.5</td> <td>2.5</td> <td>4.0</td> <td>1.3</td> <td>1</td>
           <td style="background-color:green">1</td></tr>
 </table>
 
@@ -631,6 +642,10 @@ Test set accuracy: 0.967
 An accuracy of 0.967 implies that our trained model correctly classified 29
 out of the 30 Iris species in the test set.
 
+To get a deeper understanding of different metrics for evaluating
+models, see the
+[Classification section of Machine Learning Crash Course](https://developers.google.com/machine-learning/crash-course/classification).
+
 
 ### Predicting
 
@@ -723,7 +738,6 @@ Prediction is "Virginica" (97.9%), expected "Virginica"
 
 ## Summary
 
-<!--TODO(barryr): When MLCC is released, add pointers to relevant sections.-->
 This document provides a short introduction to machine learning.
 
 Because `premade_estimators.py` relies on high-level APIs, much of the
diff --git a/tensorflow/docs_src/get_started/index.md b/tensorflow/docs_src/get_started/index.md
index b7bd1286e3ce9026df49718d94cf53cf784a3be8..b28cb9df75d94a71a7d3a501a84e1cdd8e4c3ecb 100644
--- a/tensorflow/docs_src/get_started/index.md
+++ b/tensorflow/docs_src/get_started/index.md
@@ -1,10 +1,22 @@
-# Getting Started
+# Get Started
+
+If you are new to machine learning, we recommend taking the following online
+course prior to diving into TensorFlow documentation:
+
+  * [Machine Learning Crash Course](https://developers.google.com/machine-learning/crash-course/),
+    which introduces machine learning concepts and encourages experimentation
+    with existing TensorFlow code.
 
 TensorFlow is a tool for machine learning. While it contains a wide range of
 functionality, TensorFlow is mainly designed for deep neural network models.
 
-TensorFlow provides many APIs. This section focuses on the high-level APIs.
-If you are new to TensorFlow, begin by reading one of the following documents:
+The easiest way to get started with tensorflow is using Eager Execution.
+
+  * @{$get_started/eager}, is for anyone new to  machine learning or TensorFlow.
+
+TensorFlow provides many APIs. The remainder of this section focuses on the
+Estimator API which provide scalable, high-performance models.
+To get started with Estimators begin by reading one of the following documents:
 
   * @{$get_started/get_started_for_beginners}, which is aimed at readers
     new to machine learning.
diff --git a/tensorflow/docs_src/get_started/leftnav_files b/tensorflow/docs_src/get_started/leftnav_files
index 437791d6a32db3e43415e381a034424ae8225f6f..4c12f0d84b3d13e4d9ececcb4559e806486b4120 100644
--- a/tensorflow/docs_src/get_started/leftnav_files
+++ b/tensorflow/docs_src/get_started/leftnav_files
@@ -1,10 +1,14 @@
 index.md
 
-### Getting Started
+### Beginners
+eager.md
 get_started_for_beginners.md
 premade_estimators.md
 
-### Details
+### Estimators
+get_started_for_beginners.md: For Beginners
+premade_estimators.md: Premade Estimators
+>>>
 checkpoints.md
 feature_columns.md
 datasets_quickstart.md
diff --git a/tensorflow/docs_src/get_started/premade_estimators.md b/tensorflow/docs_src/get_started/premade_estimators.md
index e50d2f542037c8537f79a2ae53a2cbb3448243c6..4be7e508f94074f20d07e271259bf77074dd19e3 100644
--- a/tensorflow/docs_src/get_started/premade_estimators.md
+++ b/tensorflow/docs_src/get_started/premade_estimators.md
@@ -1,5 +1,4 @@
-
-# Getting Started with TensorFlow
+# Premade Estimators
 
 This document introduces the TensorFlow programming environment and shows you
 how to solve the Iris classification problem in TensorFlow.
diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md
index 0481c97885df97675553eccb08989846a1275948..a3eca4bf376aea0fe12d8ebec5fa9152484ced75 100644
--- a/tensorflow/docs_src/install/install_c.md
+++ b/tensorflow/docs_src/install/install_c.md
@@ -38,7 +38,7 @@ enable TensorFlow for C:
          OS="linux" # Change to "darwin" for macOS
          TARGET_DIRECTORY="/usr/local"
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.6.0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.7.0.tar.gz" |
            sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md
index 8f89898c92d00eaf0d637e78b3ec897a34df6b89..1a0956634d6b03585a39164a2492df3fd1b4ffc7 100644
--- a/tensorflow/docs_src/install/install_go.md
+++ b/tensorflow/docs_src/install/install_go.md
@@ -38,7 +38,7 @@ steps to install this library and enable TensorFlow for Go:
          TF_TYPE="cpu" # Change to "gpu" for GPU support
          TARGET_DIRECTORY='/usr/local'
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.6.0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.7.0.tar.gz" |
          sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md
index 0ee9c849e11448c4efc74263f6aacbb46056201d..cdde45a6f4fb4fc93407bc882d7bc5c8c32fda46 100644
--- a/tensorflow/docs_src/install/install_java.md
+++ b/tensorflow/docs_src/install/install_java.md
@@ -36,7 +36,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>tensorflow</artifactId>
-  <version>1.6.0</version>
+  <version>1.7.0</version>
 </dependency>
 ```
 
@@ -65,7 +65,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
                <dependency>
                  <groupId>org.tensorflow</groupId>
                  <artifactId>tensorflow</artifactId>
-                 <version>1.6.0</version>
+                 <version>1.7.0</version>
                </dependency>
              </dependencies>
          </project>
@@ -123,12 +123,12 @@ instead:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow</artifactId>
-  <version>1.6.0</version>
+  <version>1.7.0</version>
 </dependency>
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow_jni_gpu</artifactId>
-  <version>1.6.0</version>
+  <version>1.7.0</version>
 </dependency>
 ```
 
@@ -147,7 +147,7 @@ refer to the simpler instructions above instead.
 Take the following steps to install TensorFlow for Java on Linux or macOS:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.6.0.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.7.0.jar),
      which is the TensorFlow Java Archive (JAR).
 
   2. Decide whether you will run TensorFlow for Java on CPU(s) only or with
@@ -166,7 +166,7 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
          OS=$(uname -s | tr '[:upper:]' '[:lower:]')
          mkdir -p ./jni
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.6.0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.7.0.tar.gz" |
            tar -xz -C ./jni
 
 ### Install on Windows
@@ -174,10 +174,10 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
 Take the following steps to install TensorFlow for Java on Windows:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.6.0.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.7.0.jar),
      which is the TensorFlow Java Archive (JAR).
   2. Download the following Java Native Interface (JNI) file appropriate for
-     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.6.0.zip).
+     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.7.0.zip).
   3. Extract this .zip file.
 
 
@@ -225,7 +225,7 @@ must be part of your `classpath`. For example, you can include the
 downloaded `.jar` in your `classpath` by using the `-cp` compilation flag
 as follows:
 
-<pre><b>javac -cp libtensorflow-1.6.0.jar HelloTF.java</b></pre>
+<pre><b>javac -cp libtensorflow-1.7.0.jar HelloTF.java</b></pre>
 
 
 ### Running
@@ -239,11 +239,11 @@ two files are available to the JVM:
 For example, the following command line executes the `HelloTF` program on Linux
 and macOS X:
 
-<pre><b>java -cp libtensorflow-1.6.0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.7.0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
 
 And the following command line executes the `HelloTF` program on Windows:
 
-<pre><b>java -cp libtensorflow-1.6.0.jar;. -Djava.library.path=jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.7.0.jar;. -Djava.library.path=jni HelloTF</b></pre>
 
 If the program prints <tt>Hello from <i>version</i></tt>, you've successfully
 installed TensorFlow for Java and are ready to use the API.  If the program
diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index 3e8744bf9d1e01d00c703d9153db8ac48bce1d50..04e4242b0ffd476818f6e5c4522c60111a65e151 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -31,20 +31,21 @@ If you are installing TensorFlow with GPU support using one of the
 mechanisms described in this guide, then the following NVIDIA software
 must be installed on your system:
 
-  * CUDA® Toolkit 9.0. For details, see
-    [NVIDIA's documentation](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/#axzz4VZnqTJ2A).
-    Ensure that you append the relevant Cuda pathnames to the
+  * [CUDA Toolkit 9.0](http://nvidia.com/cuda). For details, see
+    [NVIDIA's documentation](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/).
+    Ensure that you append the relevant CUDA pathnames to the
     `LD_LIBRARY_PATH` environment variable as described in the
     NVIDIA documentation.
-  * The NVIDIA drivers associated with CUDA Toolkit 9.0.
-  * cuDNN v7.0. For details, see
-    [NVIDIA's documentation](https://developer.nvidia.com/cudnn).
+  * [cuDNN SDK v7](http://developer.nvidia.com/cudnn). For details, see
+    [NVIDIA's documentation](http://docs.nvidia.com/deeplearning/sdk/cudnn-install/).
     Ensure that you create the `CUDA_HOME` environment variable as
     described in the NVIDIA documentation.
   * GPU card with CUDA Compute Capability 3.0 or higher for building
     from source and 3.5 or higher for our binaries. See
     [NVIDIA documentation](https://developer.nvidia.com/cuda-gpus) for
     a list of supported GPU cards.
+  * [GPU drivers](http://nvidia.com/driver) supporting your version of the CUDA
+    Toolkit.
   * The libcupti-dev library, which is the NVIDIA CUDA Profile Tools Interface.
     This library provides advanced profiling support. To install this library,
     issue the following command for CUDA Toolkit >= 8.0:
@@ -56,7 +57,7 @@ must be installed on your system:
     and add its path to your `LD_LIBRARY_PATH` environment variable:
 
     <pre>
-    $ <b>export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/extras/CUPTI/lib64</b>
+    $ <b>export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/usr/local/cuda/extras/CUPTI/lib64</b>
     </pre>
 
     For CUDA Toolkit <= 7.5 do:
@@ -64,17 +65,20 @@ must be installed on your system:
     <pre>
     $ <b>sudo apt-get install libcupti-dev</b>
     </pre>
+  * **[OPTIONAL]**  For optimized inferencing performance, you can also install
+    NVIDIA TensorRT 3.0. For details, see
+    [NVIDIA's TensorRT documentation](http://docs.nvidia.com/deeplearning/sdk/tensorrt-install-guide/index.html#installing-tar).
+    Only steps 1-4 in the TensorRT Tar File installation instructions are
+    required for compatibility with TensorFlow; the Python package installation
+    in steps 5 and 6 can be omitted. Detailed installation instructions can be found at [package documentataion](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/tensorrt#installing-tensorrt-304)
+
+    **IMPORTANT:** For compatibility with the pre-built `tensorflow-gpu`
+    package, please use the Ubuntu **14.04** tar file package of TensorRT
+    even when installing onto an Ubuntu 16.04 system.   
 
 If you have an earlier version of the preceding packages, please upgrade to
 the specified versions. If upgrading is not possible, then you may still run
-TensorFlow with GPU support, but only if you do the following:
-
-  * Install TensorFlow from sources as documented in
-    @{$install_sources$Installing TensorFlow from Sources}.
-  * Install or upgrade to at least the following NVIDIA versions:
-    * CUDA toolkit 7.0 or greater
-    * cuDNN v3 or greater
-    * GPU card with CUDA Compute Capability 3.0 or higher.
+TensorFlow with GPU support, if you @{$install_sources$install TensorFlow from Sources}.
 
 
 ## Determine how to install TensorFlow
@@ -149,7 +153,8 @@ Take the following steps to install TensorFlow with Virtualenv:
      commands:
 
      <pre>$ <b>source ~/tensorflow/bin/activate</b> # bash, sh, ksh, or zsh
-    $ <b>source ~/tensorflow/bin/activate.csh</b>  # csh or tcsh</pre>
+    $ <b>source ~/tensorflow/bin/activate.csh</b>  # csh or tcsh
+    $ <b>. ~/tensorflow/bin/activate.fish</b>  # fish</pre>
 
      The preceding <tt>source</tt> command should change your prompt
      to the following:
@@ -189,7 +194,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      Virtualenv environment:
 
      <pre>(tensorflow)$ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common_installation_problems).
@@ -294,7 +299,7 @@ take the following steps:
 
      <pre>
      $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0-cp34-cp34m-linux_x86_64.whl</b>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp34-cp34m-linux_x86_64.whl</b>
      </pre>
 
      If this step fails, see
@@ -480,7 +485,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
 
      <pre>
      (tensorflow)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 <a name="ValidateYourInstallation"></a>
 ## Validate your installation
@@ -530,11 +535,18 @@ TensorFlow programs:
 
 <pre>Hello, TensorFlow!</pre>
 
-If you are new to TensorFlow, see @{$get_started/premade_estimators$Getting Started with TensorFlow}.
-
 If the system outputs an error message instead of a greeting, see [Common
 installation problems](#common_installation_problems).
 
+If you are new to machine learning, we recommend the following:
+
+*  [Machine Learning Crash Course](https://developers.google.com/machine-learning/crash-course)
+*  @{$get_started/get_started_for_beginners$Getting Started for ML Beginners}
+
+If you are experienced with machine learning but new to TensorFlow, see
+@{$get_started/premade_estimators$Getting Started with TensorFlow}.
+
+
 ## Common installation problems
 
 We are relying on Stack Overflow to document TensorFlow installation problems
@@ -647,14 +659,14 @@ This section documents the relevant values for Linux installations.
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp27-none-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0-cp27-none-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -666,14 +678,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -685,14 +697,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
@@ -704,14 +716,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md
index 205db8e6bdd438ee5502f6cd1d92c30ea9e33152..b3e9616a0592c43f457183e53c8e99e55f3f5d94 100644
--- a/tensorflow/docs_src/install/install_mac.md
+++ b/tensorflow/docs_src/install/install_mac.md
@@ -118,8 +118,8 @@ Take the following steps to install TensorFlow with Virtualenv:
      Python 2.7, the command to install
      TensorFlow in the active Virtualenv is as follows:
 
-     <pre> $ <b>pip install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0-py2-none-any.whl</b></pre>
+     <pre> $ <b>pip3 install --upgrade \
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py3-none-any.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common-installation-problems).
@@ -238,11 +238,11 @@ take the following steps:
      operating system and Python version. Find the appropriate
      value for <i>tfBinaryURL</i>
      [here](#the_url_of_the_tensorflow_python_package).  For example, if
-     you are installing TensorFlow for Mac OS and Python 2.7
+     you are installing TensorFlow for macOS and Python 2.7
      issue the following command:
 
-     <pre> $ <b>sudo pip install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0-py2-none-any.whl</b> </pre>
+     <pre> $ <b>sudo pip3 install --upgrade \
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py3-none-any.whl</b> </pre>
 
      If the preceding command fails, see
      [installation problems](#common-installation-problems).
@@ -350,7 +350,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
      TensorFlow for Python 2.7:
 
      <pre> (<i>targetDirectory</i>)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0-py2-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py2-none-any.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -400,12 +400,18 @@ writing TensorFlow programs:
 
 <pre>Hello, TensorFlow!</pre>
 
-If you are new to TensorFlow, see
-@{$get_started/premade_estimators$Getting Started with TensorFlow}.
-
 If the system outputs an error message instead of a greeting, see
 [Common installation problems](#common_installation_problems).
 
+If you are new to machine learning, we recommend the following:
+
+*  [Machine Learning Crash Course](https://developers.google.com/machine-learning/crash-course)
+*  @{$get_started/get_started_for_beginners$Getting Started for ML Beginners}
+
+If you are experienced with machine learning but new to TensorFlow, see
+@{$get_started/premade_estimators$Getting Started with TensorFlow}.
+
+
 ## Common installation problems
 
 We are relying on Stack Overflow to document TensorFlow installation problems
@@ -512,18 +518,13 @@ RuntimeError: Broken toolchain: cannot link a simple C program</pre>
 ## The URL of the TensorFlow Python package
 
 A few installation mechanisms require the URL of the TensorFlow Python package.
-The value you specify depends on three factors:
-
-  * operating system
-  * Python version
-
-This section documents the relevant values for Mac OS installations.
+The value you specify depends on your Python version.
 
 ### Python 2.7
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py2-none-any.whl
 </pre>
 
 
@@ -531,5 +532,5 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0-py2-none-any.
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py3-none-any.whl
 </pre>
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index c09c9c2c0c6eaabe9066a2630a3aec39f05fcfec..7d7c2aa75aeef15d9b400f2bf5dddb083f387a2b 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -133,30 +133,21 @@ The following NVIDIA <i>hardware</i> must be installed on your system:
 
 The following NVIDIA <i>software</i> must be installed on your system:
 
-  * NVIDIA's Cuda Toolkit (>= 7.0). We recommend version 9.0.
+  * [CUDA Toolkit](http://nvidia.com/cuda) (>= 7.0). We recommend version 9.0.
     For details, see
-    [NVIDIA's documentation](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/#axzz4VZnqTJ2A).
-    Ensure that you append the relevant Cuda pathnames to the
+    [NVIDIA's documentation](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/).
+    Ensure that you append the relevant CUDA pathnames to the
     `LD_LIBRARY_PATH` environment variable as described in the
     NVIDIA documentation.
-  * The NVIDIA drivers associated with NVIDIA's Cuda Toolkit.
-  * cuDNN (>= v3). We recommend version 6.0. For details, see
-    [NVIDIA's documentation](https://developer.nvidia.com/cudnn),
-    particularly the description of appending the appropriate pathname
-    to your `LD_LIBRARY_PATH` environment variable.
-
-Finally, you must also install `libcupti` which for Cuda Toolkit >= 8.0 you do via
-
-<pre> $ <b>sudo apt-get install cuda-command-line-tools</b> </pre>
-
-and add its path to your `LD_LIBRARY_PATH` environment variable:
-
-<pre> $ <b>export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/extras/CUPTI/lib64</b> </pre>
-
-For Cuda Toolkit <= 7.5, you install `libcupti-dev` by invoking the following command:
-
-<pre> $ <b>sudo apt-get install libcupti-dev</b> </pre>
+  * [GPU drivers](http://nvidia.com/driver) supporting your version of the CUDA
+    Toolkit.
+  * [cuDNN SDK](http://developer.nvidia.com/cudnn) (>= v3). We recommend version 7.0. For details, see
+    [NVIDIA's documentation](http://docs.nvidia.com/deeplearning/sdk/cudnn-install/).
+  * [CUPTI](http://docs.nvidia.com/cuda/cupti/) ships with the CUDA Toolkit, but
+    you also need to append its path to the `LD_LIBRARY_PATH` environment
+    variable:
 
+    <pre> $ <b>export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/extras/CUPTI/lib64</b> </pre>
 
 ### Next
 
@@ -240,8 +231,8 @@ such as compiler flags. You must run this script *prior* to
 creating the pip package and installing TensorFlow.
 
 If you wish to build TensorFlow with GPU, `configure` will ask
-you to specify the version numbers of Cuda and cuDNN. If several
-versions of Cuda or cuDNN are installed on your system, explicitly select
+you to specify the version numbers of CUDA and cuDNN. If several
+versions of CUDA or cuDNN are installed on your system, explicitly select
 the desired version instead of relying on the default.
 
 One of the questions that `configure` will ask is as follows:
@@ -289,12 +280,12 @@ Do you wish to build TensorFlow with CUDA support? [y/N] <b>Y</b>
 CUDA support will be enabled for TensorFlow
 Do you want to use clang as CUDA compiler? [y/N]
 nvcc will be used as CUDA compiler
-Please specify the Cuda SDK version you want to use, e.g. 7.0. [Leave empty to default to CUDA 9.0]: <b>9.0</b>
+Please specify the CUDA SDK version you want to use, e.g. 7.0. [Leave empty to default to CUDA 9.0]: <b>9.0</b>
 Please specify the location where CUDA 9.0 toolkit is installed. Refer to README.md for more details. [Default is /usr/local/cuda]:
 Please specify which gcc should be used by nvcc as the host compiler. [Default is /usr/bin/gcc]:
 Please specify the cuDNN version you want to use. [Leave empty to default to cuDNN 7.0]: <b>7</b>
 Please specify the location where cuDNN 7 library is installed. Refer to README.md for more details. [Default is /usr/local/cuda]:
-Please specify a list of comma-separated Cuda compute capabilities you want to build with.
+Please specify a list of comma-separated CUDA compute capabilities you want to build with.
 You can find the compute capability of your device at: https://developer.nvidia.com/cuda-gpus.
 Please note that each additional compute capability significantly increases your build time and binary size.
 [Default is: "3.5,5.2"]: <b>3.0</b>
@@ -304,14 +295,14 @@ Configuration finished
 </pre>
 
 If you told `configure` to build for GPU support, then `configure`
-will create a canonical set of symbolic links to the Cuda libraries
-on your system.  Therefore, every time you change the Cuda library paths,
+will create a canonical set of symbolic links to the CUDA libraries
+on your system.  Therefore, every time you change the CUDA library paths,
 you must rerun the `configure` script before re-invoking
 the <code>bazel build</code> command.
 
 Note the following:
 
-  * Although it is possible to build both Cuda and non-Cuda configs
+  * Although it is possible to build both CUDA and non-CUDA configs
     under the same source tree, we recommend running `bazel clean` when
     switching between these two configurations in the same source tree.
   * If you don't run the `configure` script *before* running the
@@ -359,10 +350,10 @@ Invoke `pip install` to install that pip package.
 The filename of the `.whl` file depends on your platform.
 For example, the following command will install the pip package
 
-for TensorFlow 1.6.0 on Linux:
+for TensorFlow 1.7.0 on Linux:
 
 <pre>
-$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.6.0-py2-none-any.whl</b>
+$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.7.0-py2-none-any.whl</b>
 </pre>
 
 ## Validate your installation
@@ -459,6 +450,8 @@ Stack Overflow and specify the `tensorflow` tag.
 **Linux**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
+<tr><td>tensorflow-1.7.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.10.0</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.7.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.6.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.6.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.5.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.8.0</td><td>N/A</td><td>N/A</td></tr>
@@ -478,6 +471,7 @@ Stack Overflow and specify the `tensorflow` tag.
 **Mac**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
+<tr><td>tensorflow-1.7.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.10.1</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.6.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.5.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.4.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.5.4</td><td>N/A</td><td>N/A</td></tr>
@@ -492,6 +486,8 @@ Stack Overflow and specify the `tensorflow` tag.
 **Windows**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
+<tr><td>tensorflow-1.7.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.7.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.6.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.6.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.5.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
diff --git a/tensorflow/docs_src/install/install_windows.md b/tensorflow/docs_src/install/install_windows.md
index 2413bc9cfbbfd577ebd583be4da82994e8551c9e..86add74da15005a56bf0fd88c775139cd030c243 100644
--- a/tensorflow/docs_src/install/install_windows.md
+++ b/tensorflow/docs_src/install/install_windows.md
@@ -17,7 +17,7 @@ You must choose one of the following types of TensorFlow to install:
     NVIDIA® GPU, you must install this version. Note that this version of
     TensorFlow is typically much easier to install (typically,
     in 5 or 10 minutes), so even if you have an NVIDIA GPU, we recommend
-    installing this version first. Prebuilt binaries will use AVX instructions. 
+    installing this version first. Prebuilt binaries will use AVX instructions.
   * **TensorFlow with GPU support**. TensorFlow programs typically run
     significantly faster on a GPU than on a CPU. Therefore, if your
     system has a NVIDIA® GPU meeting the prerequisites shown below
@@ -154,13 +154,17 @@ TensorFlow programs:
 
 <pre>Hello, TensorFlow!</pre>
 
-If you are new to TensorFlow, see @{$get_started/premade_estimators$Getting Started with TensorFlow}.
-
 If the system outputs an error message instead of a greeting, see [Common
 installation problems](#common_installation_problems).
 
-There is also a helpful [script](https://gist.github.com/mrry/ee5dbcfdd045fa48a27d56664411d41c)
-for Windows TensorFlow installation issues.
+If you are new to machine learning, we recommend the following:
+
+*  [Machine Learning Crash Course](https://developers.google.com/machine-learning/crash-course)
+*  @{$get_started/get_started_for_beginners$Getting Started for ML Beginners}
+
+If you are experienced with machine learning but new to TensorFlow, see
+@{$get_started/premade_estimators$Getting Started with TensorFlow}.
+
 
 ## Common installation problems
 
diff --git a/tensorflow/docs_src/javascript/index.md b/tensorflow/docs_src/javascript/index.md
new file mode 100644
index 0000000000000000000000000000000000000000..ad63eeb255d870064567a0de8a28815ce2ae0172
--- /dev/null
+++ b/tensorflow/docs_src/javascript/index.md
@@ -0,0 +1,5 @@
+# JavaScript 
+
+You may develop TensorFlow programs in JavaScript, training and deploying
+models right in your browser.  For details, see
+[js.tensorflow.org](https://js.tensorflow.org).
diff --git a/tensorflow/docs_src/javascript/leftnav_files b/tensorflow/docs_src/javascript/leftnav_files
new file mode 100644
index 0000000000000000000000000000000000000000..fc0ab8a5435943f6442969ec5787305b98c7908b
--- /dev/null
+++ b/tensorflow/docs_src/javascript/leftnav_files
@@ -0,0 +1 @@
+index.md
diff --git a/tensorflow/docs_src/mobile/leftnav_files b/tensorflow/docs_src/mobile/leftnav_files
index 4cf134cc3c2c323405d769a5ced5d5a68f188203..585470d5f0847716863ba6129bf75c26631fecbd 100644
--- a/tensorflow/docs_src/mobile/leftnav_files
+++ b/tensorflow/docs_src/mobile/leftnav_files
@@ -1,6 +1,7 @@
 index.md
 ### TensorFlow Lite
 tflite/index.md
+tflite/devguide.md
 tflite/demo_android.md
 tflite/demo_ios.md
 >>>
diff --git a/tensorflow/docs_src/mobile/optimizing.md b/tensorflow/docs_src/mobile/optimizing.md
index ca9cb043e9282702044b0da26c27f5bf29141cae..778e4d3a6233c3bec70b830bc998013745a1f0ba 100644
--- a/tensorflow/docs_src/mobile/optimizing.md
+++ b/tensorflow/docs_src/mobile/optimizing.md
@@ -233,6 +233,8 @@ order by how long they took. From left to right, the columns are:
 - The cumulative total time of this and the previous ops in the table. This is
   handy for understanding what the distribution of work is across the layers, to
   see if just a few of the nodes are taking up most of the time.
+  
+- The amount of memory consumed by outputs of this type of op.
 
 - Name of the node.
 
diff --git a/tensorflow/docs_src/mobile/prepare_models.md b/tensorflow/docs_src/mobile/prepare_models.md
index 360ee302aa96bc3a0b65eab7b39c3dacf56b42c0..8b22c04d872f18607c485775cb8f096f0a361995 100644
--- a/tensorflow/docs_src/mobile/prepare_models.md
+++ b/tensorflow/docs_src/mobile/prepare_models.md
@@ -60,7 +60,7 @@ and serialized as protocol buffers:
   the `NodeDef`, so if all the `Variable` weights are converted to `Const` nodes,
   then we only need a single `GraphDef` file to hold the model architecture and
   the weights. Freezing the graph handles the process of loading the
-  checkpoints, and then converts all Consts to Variables. You can then load the
+  checkpoints, and then converts all Variables to Consts. You can then load the
   resulting file in a single call, without having to restore variable values
   from checkpoints. One thing to watch out for with `GraphDef` files is that
   sometimes they’re stored in text format for easy inspection. These versions
diff --git a/tensorflow/docs_src/mobile/tflite/demo_android.md b/tensorflow/docs_src/mobile/tflite/demo_android.md
index c94b5597a673a7e68aed517b325b9719b3b73bbd..7f2f8882a24702d167599452e66afbe720026808 100644
--- a/tensorflow/docs_src/mobile/tflite/demo_android.md
+++ b/tensorflow/docs_src/mobile/tflite/demo_android.md
@@ -1,42 +1,144 @@
-# TensorFlow Lite Demo for Android
+# Android Demo App
 
-The TensorFlow Lite demo is a camera app that continuously classifies whatever
-it sees from your device's back camera, using a quantized MobileNet model.
+An example Android application using TensorFLow Lite is available
+[on GitHub](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/java/demo/app).
+The demo is a sample camera app that classifies images continuously
+using either a quantized Mobilenet model or a floating point Inception-v3 model.
+To run the demo, a device running Android 5.0 ( API 21) or higher is required.
 
-You'll need an Android device running Android 5.0 or higher to run the demo.
+In the demo app, inference is done using the TensorFlow Lite Java API. The demo
+app classifies frames in real-time, displaying the top most probable
+classifications. It also displays the time taken to detect the object.
 
-To get you started working with TensorFlow Lite on Android, we'll walk you
-through building and deploying our TensorFlow demo app in Android Studio.
+There are three ways to get the demo app to your device:
 
-Note: For a more detailed guide see the
-[TFLite Codelab](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets-2-tflite/index.html#0)
+* Download the [prebuilt binary APK](http://download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk).
+* Use Android Studio to build the application.
+* Download the source code for TensorFlow Lite and the demo and build it using
+  bazel.
 
-It's also possible to build the demo app with Bazel, but we only recommend
-this for advanced users who are very familiar with the Bazel build
-environment. For more information on that, see our page [on Github](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite#building-tensorflow-lite-and-the-demo-app-from-source).
 
-## Build and deploy with Android Studio
+## Download the pre-built binary
 
-1. Clone the TensorFlow repository from GitHub if you haven't already:
+The easiest way to try the demo is to download the
+[pre-built binary APK](https://storage.googleapis.com/download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk)
 
-        git clone https://github.com/tensorflow/tensorflow
+Once the APK is installed, click the app icon to start the program. The first
+time the app is opened, it asks for runtime permissions to access the device
+camera. The demo app opens the back-camera of the device and recognizes objects
+in the camera's field of view. At the bottom of the image (or at the left
+of the image if the device is in landscape mode), it displays top three objects
+classified and the classification latency.
 
-2. Install the latest version of Android Studio from [here](https://developer.android.com/studio/index.html).
 
-3. From the **Welcome to Android Studio** screen, use the **Import Project
-   (Gradle, Eclipse ADT, etc)** option to import the
-   `tensorflow/contrib/lite/java/demo` directory as an existing Android Studio
-   Project.
+## Build in Android Studio with TensorFlow Lite AAR from JCenter
 
-    Android Studio may prompt you to install Gradle upgrades and other tool
-    versions; you should accept these upgrades.
+Use Android Studio to try out changes in the project code and compile the demo
+app:
 
-4. Download the TensorFlow Lite MobileNet model from [here](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip).
+* Install the latest version of
+  [Android Studio](https://developer.android.com/studio/index.html).
+* Make sure the Android SDK version is greater than 26 and NDK version is greater
+  than 14 (in the Android Studio settings).
+* Import the `tensorflow/contrib/lite/java/demo` directory as a new
+  Android Studio project.
+* Install all the Gradle extensions it requests.
 
-    Unzip this and copy the `mobilenet_quant_v1_224.tflite` file to the assets
-    directory: `tensorflow/contrib/lite/java/demo/app/src/main/assets/`
+To get a model, either:
 
-5. Build and run the app in Android Studio.
+* Download the quantized [Mobilenet TensorFlow Lite model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip)
+  and unzip and copy `mobilenet_quant_v1_224.tflite` to the assets directory:
+  `tensorflow/contrib/lite/java/demo/app/src/main/assets/`.
+* Or, download the floating point [Inception-v3 model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/inception_v3_slim_2016_android_2017_11_10.zip)
+  and unzip and copy `inceptionv3_non_slim_2015.tflite` to the assets
+  directory. Change the chosen classifier in
+  [Camera2BasicFragment.java](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java)<br>
+  from: `classifier = new ImageClassifierQuantizedMobileNet(getActivity());`<br>
+  to: `classifier = new ImageClassifierFloatInception(getActivity());`.
 
-You'll have to grant permissions for the app to use the device's camera. Point
-the camera at various objects and enjoy seeing how the model classifies things!
+Now you can build and run the demo app.
+
+
+## Build TensorFlow Lite and the demo app from source
+
+### Clone the TensorFlow repo
+
+```sh
+git clone https://github.com/tensorflow/tensorflow
+```
+
+### Install Bazel
+
+If `bazel` is not installed on your system, see
+[Installing Bazel](https://bazel.build/versions/master/docs/install.html).
+
+Note: Bazel does not currently support Android builds on Windows. Windows users
+should download the
+[prebuilt binary](https://storage.googleapis.com/download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk).
+
+### Install Android NDK and SDK
+
+The Android NDK is required to build the native (C/C++) TensorFlow Lite code. The
+current recommended version is *14b* and can be found on the
+[NDK Archives](https://developer.android.com/ndk/downloads/older_releases.html#ndk-14b-downloads)
+page.
+
+The Android SDK and build tools can be
+[downloaded separately](https://developer.android.com/tools/revisions/build-tools.html)
+or used as part of
+[Android Studio](https://developer.android.com/studio/index.html). To build the
+TensorFlow Lite Android demo, build tools require API >= 23 (but it will run on
+devices with API >= 21).
+
+In the root of the TensorFlow repository, update the `WORKSPACE` file with the
+`api_level` and location of the SDK and NDK. If you installed it with
+Android Studio, the SDK path can be found in the SDK manager. The default NDK
+path is:`{SDK path}/ndk-bundle.` For example:
+
+```
+android_sdk_repository (
+    name = "androidsdk",
+    api_level = 23,
+    build_tools_version = "23.0.2",
+    path = "/home/xxxx/android-sdk-linux/",
+)
+
+android_ndk_repository(
+    name = "androidndk",
+    path = "/home/xxxx/android-ndk-r10e/",
+    api_level = 19,
+)
+```
+
+Some additional details are available on the
+[TF Lite Android App page](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/java/demo/README.md).
+
+### Build the source code
+
+To build the demo app, run `bazel`:
+
+```
+bazel build --cxxopt=--std=c++11 //tensorflow/contrib/lite/java/demo/app/src/main:TfLiteCameraDemo
+```
+
+Caution: Because of an bazel bug, we only support building the Android demo app
+within a Python 2 environment.
+
+
+## About the demo
+
+The demo app is resizing each camera image frame (224 width * 224 height) to
+match the quantized MobileNets model (299 * 299 for Inception-v3). The resized
+image is converted—row by row—into a
+[ByteBuffer](https://developer.android.com/reference/java/nio/ByteBuffer.html).
+Its size is  1 * 224 * 224 * 3 bytes, where 1 is the number of images in a batch.
+224 * 224 (299 * 299) is the width and height of the image. 3 bytes represents
+the 3 colors of a pixel.
+
+This demo uses the TensorFlow Lite Java inference API
+for models which take a single input and provide a single output. This outputs a
+two-dimensional array, with the first dimension being the category index and the
+second dimension being the confidence of classification. Both models have 1001
+unique categories and the app sorts the probabilities of all the categories and
+displays the top three. The model file must be downloaded and bundled within the
+assets directory of the app.
diff --git a/tensorflow/docs_src/mobile/tflite/demo_ios.md b/tensorflow/docs_src/mobile/tflite/demo_ios.md
index 3ee9b1cbca6cfef98616bd33bbf91b756b4efa15..3be21da89f9e53d324c2ade0cb937f4b5b30fad4 100644
--- a/tensorflow/docs_src/mobile/tflite/demo_ios.md
+++ b/tensorflow/docs_src/mobile/tflite/demo_ios.md
@@ -1,4 +1,4 @@
-# TensorFlow Lite Demo for iOS
+# iOS Demo App
 
 The TensorFlow Lite demo is a camera app that continuously classifies whatever
 it sees from your device's back camera, using a quantized MobileNet model. These
diff --git a/tensorflow/docs_src/mobile/tflite/devguide.md b/tensorflow/docs_src/mobile/tflite/devguide.md
new file mode 100644
index 0000000000000000000000000000000000000000..96392a3c9b8c054aca9085950928308addcea342
--- /dev/null
+++ b/tensorflow/docs_src/mobile/tflite/devguide.md
@@ -0,0 +1,224 @@
+# Developer Guide
+
+Using a TensorFlow Lite model in your mobile app requires multiple
+considerations: you must choose a pre-trained or custom model, convert the model
+to a TensorFLow Lite format, and finally, integrate the model in your app.
+
+## 1. Choose a model
+
+Depending on the use case, you can choose one of the popular open-sourced models,
+such as *InceptionV3* or *MobileNets*, and re-train these models with a custom
+data set or even build your own custom model.
+
+### Use a pre-trained model
+
+[MobileNets](https://research.googleblog.com/2017/06/mobilenets-open-source-models-for.html)
+is a family of mobile-first computer vision models for TensorFlow designed to
+effectively maximize accuracy, while taking into consideration the restricted
+resources for on-device or embedded applications. MobileNets are small,
+low-latency, low-power models parameterized to meet the resource constraints for
+a variety of uses. They can be used for classification, detection, embeddings, and
+segmentation—similar to other popular large scale models, such as
+[Inception](https://arxiv.org/pdf/1602.07261.pdf). Google provides 16 pre-trained
+[ImageNet](http://www.image-net.org/challenges/LSVRC/) classification checkpoints
+for MobileNets that can be used in mobile projects of all sizes.
+
+[Inception-v3](https://arxiv.org/abs/1512.00567) is an image recognition model
+that achieves fairly high accuracy recognizing general objects with 1000 classes,
+for example, "Zebra", "Dalmatian", and "Dishwasher". The model extracts general
+features from input images using a convolutional neural network and classifies
+them based on those features with fully-connected and softmax layers.
+
+[On Device Smart Reply](https://research.googleblog.com/2017/02/on-device-machine-intelligence.html)
+is an on-device model that provides one-touch replies for incoming text messages
+by suggesting contextually relevant messages. The model is built specifically for
+memory constrained devices, such as watches and phones, and has been successfully
+used in Smart Replies on Android Wear. Currently, this model is Android-specific.
+
+These pre-trained models are [available for download](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/g3doc/models.md)
+
+### Re-train Inception-V3 or MobileNet for a custom data set
+
+These pre-trained models were trained on the *ImageNet* data set which contains
+1000 predefined classes. If these classes are not sufficient for your use case,
+the model will need to be re-trained. This technique is called
+*transfer learning* and starts with a model that has been already trained on a
+problem, then retrains the model on a similar problem. Deep learning from
+scratch can take days, but transfer learning is fairly quick. In order to do
+this, you need to generate a custom data set labeled with the relevant classes.
+
+The [TensorFlow for Poets](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/)
+codelab walks through the re-training process step-by-step. The code supports
+both floating point and quantized inference.
+
+### Train a custom model
+
+A developer may choose to train a custom model using Tensorflow (see the
+@{$tutorials} for examples of building and training models). If you have already
+written a model, the first step is to export this to a @{tf.GraphDef} file. This
+is required because some formats do not store the model structure outside the
+code, and we must communicate with other parts of the framework. See
+[Exporting the Inference Graph](https://github.com/tensorflow/models/blob/master/research/slim/README.md)
+to create .pb file for the custom model.
+
+TensorFlow Lite currently supports a subset of TensorFlow operators. Refer to the
+[TensorFlow Lite & TensorFlow Compatibility Guide](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md)
+for supported operators and their usage. This set of operators will continue to
+grow in future Tensorflow Lite releases.
+
+
+## 2. Convert the model format
+
+The model generated (or downloaded) in the previous step is a *standard*
+Tensorflow model and you should now have a .pb or .pbtxt @{tf.GraphDef} file.
+Models generated with transfer learning (re-training) or custom models must be
+converted—but, we must first freeze the graph to convert the model to the
+Tensorflow Lite format. This process uses several model formats:
+
+* @{tf.GraphDef} (.pb) —A protobuf that represents the TensorFlow training or
+  computation graph. It contains operators, tensors, and variables definitions.
+* *CheckPoint* (.ckpt) —Serialized variables from a TensorFlow graph. Since this
+  does not contain a graph structure, it cannot be interpreted by itself.
+* `FrozenGraphDef` —A subclass of `GraphDef` that does not contain
+  variables. A `GraphDef` can be converted to a `FrozenGraphDef` by taking a
+  CheckPoint and a `GraphDef`, and converting each variable into a constant
+  using the value retrieved from the CheckPoint.
+* `SavedModel` —A `GraphDef` and CheckPoint with a signature that labels
+  input and output arguments to a model. A `GraphDef` and CheckPoint can be
+  extracted from a `SavedModel`.
+* *TensorFlow Lite model* (.tflite) —A serialized
+  [FlatBuffer](https://google.github.io/flatbuffers/) that contains TensorFlow
+  Lite operators and tensors for the TensorFlow Lite interpreter, similar to a
+  `FrozenGraphDef`.
+
+### Freeze Graph
+
+To use the `GraphDef` .pb file with TensorFlow Lite, you must have checkpoints
+that contain trained weight parameters. The .pb file only contains the structure
+of the graph. The process of merging the checkpoint values with the graph
+structure is called *freezing the graph*.
+
+You should have a checkpoints folder or download them for a pre-trained model
+(for example,
+[MobileNets](https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet_v1.md)).
+
+To freeze the graph, use the following command (changing the arguments):
+
+```
+freeze_graph --input_graph=/tmp/mobilenet_v1_224.pb \
+  --input_checkpoint=/tmp/checkpoints/mobilenet-10202.ckpt \
+  --input_binary=true \
+  --output_graph=/tmp/frozen_mobilenet_v1_224.pb \
+  --output_node_names=MobileNetV1/Predictions/Reshape_1
+```
+
+The `input_binary` flag must be enabled so the protobuf is read and written in
+a binary format. Set the `input_graph` and `input_checkpoint` files.
+
+The `output_node_names` may not be obvious outside of the code that built the
+model. The easiest way to find them is to visualize the graph, either with
+[TensorBoard](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets-2/#3)
+or `graphviz`.
+
+The frozen `GraphDef` is now ready for conversion to the `FlatBuffer` format
+(.tflite) for use on Android or iOS devices. For Android, the Tensorflow
+Optimizing Converter tool supports both float and quantized models. To convert
+the frozen `GraphDef` to the .tflite format:
+
+```
+toco --input_file=$(pwd)/mobilenet_v1_1.0_224/frozen_graph.pb \
+  --input_format=TENSORFLOW_GRAPHDEF \
+  --output_format=TFLITE \
+  --output_file=/tmp/mobilenet_v1_1.0_224.tflite \
+  --inference_type=FLOAT \
+  --input_type=FLOAT \
+  --input_arrays=input \
+  --output_arrays=MobilenetV1/Predictions/Reshape_1 \
+  --input_shapes=1,224,224,3
+```
+
+The `input_file` argument should reference the frozen `GraphDef` file
+containing the model architecture. The [frozen_graph.pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_1.0_224_frozen.tgz)
+file used here is available for download. `output_file` is where the TensorFlow
+Lite model will get generated. The `input_type` and `inference_type`
+arguments should be set to `FLOAT`, unless converting a
+@{$performance/quantization$quantized model}. Setting the `input_array`,
+`output_array`, and `input_shape` arguments are not as straightforward. The
+easiest way to find these values is to explore the graph using Tensorboard. Reuse
+the arguments for specifying the output nodes for inference in the
+`freeze_graph` step.
+
+It is also possible to use the Tensorflow Optimizing Converter with protobufs
+from either Python or from the command line (see the 
+[toco_from_protos.py](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/toco/python/toco_from_protos.py)
+example). This allows you to integrate the conversion step into the model design
+workflow, ensuring the model is easily convertible to a mobile inference graph.
+For example:
+
+```python
+import tensorflow as tf
+
+img = tf.placeholder(name="img", dtype=tf.float32, shape=(1, 64, 64, 3))
+val = img + tf.constant([1., 2., 3.]) + tf.constant([1., 4., 4.])
+out = tf.identity(val, name="out")
+
+with tf.Session() as sess:
+  tflite_model = tf.contrib.lite.toco_convert(sess.graph_def, [img], [out])
+  open("converteds_model.tflite", "wb").write(tflite_model)
+```
+
+For usage, see the Tensorflow Optimizing Converter
+[command-line examples](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md).
+
+Refer to the
+[Ops compatibility guide](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md)
+for troubleshooting help, and if that doesn't help, please
+[file an issue](https://github.com/tensorflow/tensorflow/issues).
+
+The [development repo](https://github.com/tensorflow/tensorflow) contains a tool
+to visualize TensorFlow Lite models after conversion. To build the
+[visualize.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/tools/visualize.py)
+tool:
+
+```sh
+bazel run tensorflow/contrib/lite/tools:visualize -- model.tflite model_viz.html
+```
+
+This generates an interactive HTML page listing subgraphs, operations, and a
+graph visualization.
+
+
+## 3. Use the TensorFlow Lite model for inference in a mobile app
+
+After completing the prior steps, you should now have a .tflite model file.
+
+### Android
+
+Since Android apps are written in Java and the core TensorFlow library is in C++,
+a JNI library is provided as an interface. This is only meant for inference—it
+provides the ability to load a graph, set up inputs, and run the model to
+calculate outputs.
+
+The open source Android demo app uses the JNI interface and is available
+[on GitHub](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/java/demo/app).
+You can also download a
+[prebuilt APK](http://download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk).
+See the @{$tflite/demo_android} guide for details.
+
+The @{$mobile/android_build} guide has instructions for installing TensorFlow on
+Android and setting up `bazel` and Android Studio.
+
+### iOS
+
+To integrate a TensorFlow model in an iOS app, see the
+[TensorFlow Lite for iOS](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/ios.md)
+guide and @{$tflite/demo_ios} guide.
+
+#### Core ML support
+
+Core ML is a machine learning framework used in Apple products. In addition to
+using Tensorflow Lite models directly in your applications, you can convert
+trained Tensorflow models to the
+[CoreML](https://developer.apple.com/machine-learning/) format for use on Apple
+devices. To use the converter, refer to the
+[Tensorflow-CoreML converter documentation](https://github.com/tf-coreml/tf-coreml).
diff --git a/tensorflow/docs_src/mobile/tflite/index.md b/tensorflow/docs_src/mobile/tflite/index.md
index beb24794fc98724e2423e02a71028f79be45cf75..11f11ea4dc54b9f152f2560384cb47cec6b308c0 100644
--- a/tensorflow/docs_src/mobile/tflite/index.md
+++ b/tensorflow/docs_src/mobile/tflite/index.md
@@ -155,7 +155,9 @@ retraining for both floating point and quantized inference.
 
 The following diagram shows the architectural design of TensorFlow Lite:
 
-![tensorflow lite architecture](https://www.tensorflow.org/images/tflite-architecture.jpg)
+<img src="/images/tflite-architecture.jpg"
+     alt="TensorFlow Lite architecture diagram"
+     style="max-width:600px;">
 
 Starting with a trained TensorFlow model on disk, you'll convert that model to
 the TensorFlow Lite file format (`.tflite`) using the TensorFlow Lite
diff --git a/tensorflow/docs_src/performance/xla/operation_semantics.md b/tensorflow/docs_src/performance/xla/operation_semantics.md
index 5e39e710a0dba74dfd68a04367ce402362520590..217ab596b72bc263ae5dda377a8faab8a39b0a3c 100644
--- a/tensorflow/docs_src/performance/xla/operation_semantics.md
+++ b/tensorflow/docs_src/performance/xla/operation_semantics.md
@@ -241,13 +241,10 @@ See also
 
 Clamps an operand to within the range between a minimum and maximum value.
 
-<b> `Clamp(computation, args...)` </b>
+<b> `Clamp(min, operand, max)` </b>
 
 | Arguments     | Type                    | Semantics                        |
 | ------------- | ----------------------- | -------------------------------- |
-| `computation` | `Computation`           | computation of type `T_0, T_1,   |
-:               :                         : ..., T_N -> S` with N parameters :
-:               :                         : of arbitrary type                :
 | `min`         | `ComputationDataHandle` | array of type T                  |
 | `operand`     | `ComputationDataHandle` | array of type T                  |
 | `max`         | `ComputationDataHandle` | array of type T                  |
@@ -791,9 +788,7 @@ DynamicSlice extracts a sub-array from the input array at dynamic
 dimension: [start, start + size). The shape of `start_indices` must be rank ==
 1, with dimension size equal to the rank of `operand`.
 Note: handling of out-of-bounds slice indices (generated by incorrect runtime
-calculation of 'start_indices') is currently implementation-defined. Currently,
-slice indices are computed modulo input dimension sizes to prevent out-of-bound
-array accesses, but this behavior may change in future implementations.
+calculation of 'start_indices') is currently implementation-defined.
 
 <b> `DynamicSlice(operand, start_indices, size_indices)` </b>
 
@@ -850,9 +845,7 @@ is updated.
 The shape of `start_indices` must be rank == 1, with dimension size equal to
 the rank of `operand`.
 Note: handling of out-of-bounds slice indices (generated by incorrect runtime
-calculation of 'start_indices') is currently implementation-defined. Currently,
-slice indices are computed modulo update dimension sizes to prevent out-of-bound
-array accesses, but this behavior may change in future implementations.
+calculation of 'start_indices') is currently implementation-defined.
 
 <b> `DynamicUpdateSlice(operand, update, start_indices)` </b>
 
diff --git a/tensorflow/docs_src/programmers_guide/debugger.md b/tensorflow/docs_src/programmers_guide/debugger.md
index d1399814ee862f5f7ecc3f448d51fb3724fa3447..f5a0eb0a2000a5c35f7e3641e6552d40629305a6 100644
--- a/tensorflow/docs_src/programmers_guide/debugger.md
+++ b/tensorflow/docs_src/programmers_guide/debugger.md
@@ -4,29 +4,28 @@
 
 [TOC]
 
-TensorFlow debugger (**tfdbg**) is a specialized debugger for TensorFlow. It
-lets you view the internal structure and states of running TensorFlow graphs
-during training and inference, which is difficult to debug with general-purpose
-debuggers such as Python's `pdb` due to TensorFlow's computation-graph paradigm.
-
-> NOTE: TensorFlow debugger uses a
-> [curses](https://en.wikipedia.org/wiki/Curses_\(programming_library\))-based
-> text user interface. On Mac OS X, the `ncurses` library is required and can
-> be installed with `brew install homebrew/dupes/ncurses`. On Windows, curses
-> isn't as well supported, so a
-> [readline](https://en.wikipedia.org/wiki/GNU_Readline)-based interface can
-> be used with tfdbg by installing `pyreadline` with pip.
-> If you use Anaconda3, you can install it with a command
-> such as `"C:\Program Files\Anaconda3\Scripts\pip.exe" install pyreadline`.
-> Unofficial Windows curses packages can be downloaded
-> [here](https://www.lfd.uci.edu/~gohlke/pythonlibs/#curses), then subsequently
-> installed using `pip install <your_version>.whl`, however curses on Windows
-> may not work as reliably as curses on Linux or Mac.
-
-> NOTE: This guide focuses on the command-line interface (CLI) of tfdbg. For
-> guide on how to use the graphical user interface (GUI) of tfdbg, i.e., the
-> **TensorBoard Debugger Plugin**, please visit
-> [its README](https://github.com/tensorflow/tensorboard/blob/master/tensorboard/plugins/debugger/README.md).
+`tfdbg` is a specialized debugger for TensorFlow. It lets you view the internal
+structure and states of running TensorFlow graphs during training and inference,
+which is difficult to debug with general-purpose debuggers such as Python's `pdb`
+due to TensorFlow's computation-graph paradigm.
+
+This guide focuses on the command-line interface (CLI) of `tfdbg`. For guide on
+how to use the graphical user interface (GUI) of tfdbg, i.e., the
+**TensorBoard Debugger Plugin**, please visit
+[its README](https://github.com/tensorflow/tensorboard/blob/master/tensorboard/plugins/debugger/README.md).
+
+Note: The TensorFlow debugger uses a
+[curses](https://en.wikipedia.org/wiki/Curses_\(programming_library\))-based text
+user interface. On Mac OS X, the `ncurses` library is required and can be
+installed with `brew install homebrew/dupes/ncurses`. On Windows, curses isn't as
+well supported, so a [readline](https://en.wikipedia.org/wiki/GNU_Readline)-based
+interface can be used with tfdbg by installing `pyreadline` with `pip`. If you
+use Anaconda3, you can install it with a command such as
+`"C:\Program Files\Anaconda3\Scripts\pip.exe" install pyreadline`. Unofficial
+Windows curses packages can be downloaded
+[here](https://www.lfd.uci.edu/~gohlke/pythonlibs/#curses), then subsequently
+installed using `pip install <your_version>.whl`, however curses on Windows may
+not work as reliably as curses on Linux or Mac.
 
 This tutorial demonstrates how to use the **tfdbg** CLI to debug the appearance
 of [`nan`s](https://en.wikipedia.org/wiki/NaN)
@@ -155,6 +154,7 @@ Try the following commands at the `tfdbg>` prompt (referencing the code at
 | | `-n <name_pattern>` | List dumped tensors with names matching given regular-expression pattern. | `lt -n Softmax.*` |
 | | `-t <op_pattern>` | List dumped tensors with op types matching given regular-expression pattern. | `lt -t MatMul` |
 | | `-f <filter_name>` | List only the tensors that pass a registered tensor filter. | `lt -f has_inf_or_nan` |
+| | `-f <filter_name> -fenn <regex>` | List only the tensors that pass a registered tensor filter, excluding nodes with names matching the regular expression. | `lt -f has_inf_or_nan` `-fenn .*Sqrt.*` |
 | | `-s <sort_key>` | Sort the output by given `sort_key`, whose possible values are `timestamp` (default), `dump_size`, `op_type` and `tensor_name`. | `lt -s dump_size` |
 | | `-r` | Sort in reverse order. | `lt -r -s dump_size` |
 | **`pt`** | | **Print value of a dumped tensor.** | |
@@ -200,6 +200,7 @@ Try the following commands at the `tfdbg>` prompt (referencing the code at
 | | `-n` | Execute through the next `Session.run` without debugging, and drop to CLI right before the run after that. | `run -n` |
 | | `-t <T>` | Execute `Session.run` `T - 1` times without debugging, followed by a run with debugging. Then drop to CLI right after the debugged run. | `run -t 10` |
 | | `-f <filter_name>` | Continue executing `Session.run` until any intermediate tensor triggers the specified Tensor filter (causes the filter to return `True`). | `run -f has_inf_or_nan` |
+| | `-f <filter_name> -fenn <regex>` | Continue executing `Session.run` until any intermediate tensor whose node names doesn't match the regular expression triggers the specified Tensor filter (causes the filter to return `True`). | `run -f has_inf_or_nan -fenn .*Sqrt.*` |
 | | `--node_name_filter <pattern>` | Execute the next `Session.run`, watching only nodes with names matching the given regular-expression pattern. | `run --node_name_filter Softmax.*` |
 | | `--op_type_filter <pattern>` | Execute the next `Session.run`, watching only nodes with op types matching the given regular-expression pattern. | `run --op_type_filter Variable.*` |
 | | `--tensor_dtype_filter <pattern>` | Execute the next `Session.run`, dumping only Tensors with data types (`dtype`s) matching the given regular-expression pattern. | `run --tensor_dtype_filter int.*` |
@@ -746,16 +747,16 @@ There are three possible workarounds or solutions:
    to which tfdbg dumps the debug data. You can use it to let tfdbg dump the
    debug data on a disk with larger free space. For example:
 
-   ``` python
-   # For LocalCLIDebugWrapperSession
-   sess = tf_debug.LocalCLIDebugWrapperSession(dump_root="/with/lots/of/space")
-
-   # For LocalCLIDebugHook
-   hooks = [tf_debug.LocalCLIDebugHook(dump_root="/with/lots/of/space")]
-   ```
+```python
+# For LocalCLIDebugWrapperSession
+sess = tf_debug.LocalCLIDebugWrapperSession(dump_root="/with/lots/of/space")
 
+# For LocalCLIDebugHook
+hooks = [tf_debug.LocalCLIDebugHook(dump_root="/with/lots/of/space")]
+```
    Make sure that the directory pointed to by dump_root is empty or nonexistent.
-   tfdbg cleans up the dump directories before exiting.
+   `tfdbg` cleans up the dump directories before exiting.
+
 *  Reduce the batch size used during the runs.
 *  Use the filtering options of tfdbg's `run` command to watch only specific
    nodes in the graph. For example:
@@ -813,6 +814,20 @@ sess.run(b)
 the constant-folding would not occur and `tfdbg` should show the intermediate
 tensor dumps.
 
+
+**Q**: I am debugging a model that generates unwanted infinities or NaNs. But
+       there are some nodes in my model that are known to generate infinities
+       or NaNs in their output tensors even under completely normal conditions.
+       How can I skip those nodes during my `run -f has_inf_or_nan` actions?
+
+**A**: Use the `--filter_exclude_node_names` (`-fenn` for short) flag. For
+       example, if you known you have a node with name matching the regular
+       expression `.*Sqrt.*` that generates infinities or NaNs regardless
+       of whether the model is behaving correctly, you can exclude the nodes
+       from the infinity/NaN-finding runs with the command
+       `run -f has_inf_or_nan -fenn .*Sqrt.*`.
+
+
 **Q**: Is there a GUI for tfdbg?
 
 **A**: Yes, the **TensorBoard Debugger Plugin** is the GUI of tfdbg.
diff --git a/tensorflow/docs_src/programmers_guide/eager.md b/tensorflow/docs_src/programmers_guide/eager.md
new file mode 100644
index 0000000000000000000000000000000000000000..dc5b403428fed524abd2a793e695d11b9d63290e
--- /dev/null
+++ b/tensorflow/docs_src/programmers_guide/eager.md
@@ -0,0 +1,848 @@
+# Eager Execution
+
+TensorFlow's eager execution is an imperative programming environment that
+evaluates operations immediately, without building graphs: operations return
+concrete values instead of constructing a computational graph to run later. This
+makes it easy to get started with TensorFlow and debug models, and it
+reduces boilerplate as well. To follow along with this guide, run the code
+samples below in an interactive `python` interpreter.
+
+Eager execution is a flexible machine learning platform for research and
+experimentation, providing:
+
+* *An intuitive interface*—Structure your code naturally and use Python data
+  structures. Quickly iterate on small models and small data.
+* *Easier debugging*—Call ops directly to inspect running models and test
+  changes. Use standard Python debugging tools for immediate error reporting.
+* *Natural control flow*—Use Python control flow instead of graph control
+  flow, simplifying the specification of dynamic models.
+
+Eager execution supports most TensorFlow operations and GPU acceleration. For a
+collection of examples running in eager execution, see:
+[tensorflow/contrib/eager/python/examples](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples).
+
+Note: Some models may experience increased overhead with eager execution
+enabled. Performance improvements are ongoing, but please
+[file a bug](https://github.com/tensorflow/tensorflow/issues) if you find a
+problem and share your benchmarks.
+
+## Setup and basic usage
+
+Upgrade to the latest version of TensorFlow:
+
+```
+$ pip install --upgrade tensorflow
+```
+
+To start eager execution, add `tf.enable_eager_execution()` to the beginning of
+the program or console session. Do not add this operation to other modules that
+the program calls.
+
+```py
+from __future__ import absolute_import, division, print_function
+
+import tensorflow as tf
+
+tf.enable_eager_execution()
+```
+
+Now you can run TensorFlow operations and the results will return immediately:
+
+```py
+tf.executing_eagerly()        # => True
+
+x = [[2.]]
+m = tf.matmul(x, x)
+print("hello, {}".format(m))  # => "hello, [[4.]]"
+```
+
+Enabling eager execution changes how TensorFlow operations behave—now they
+immediately evaluate and return their values to Python. `tf.Tensor` objects
+reference concrete values instead of symbolic handles to nodes in a computational
+graph. Since there isn't a computational graph to build and run later in a
+session, it's easy to inspect results using `print()` or a debugger. Evaluating,
+printing, and checking tensor values does not break the flow for computing
+gradients.
+
+Eager execution works nicely with [NumPy](http://www.numpy.org/). NumPy
+operations accept `tf.Tensor` arguments. TensorFlow
+[math operations](https://www.tensorflow.org/api_guides/python/math_ops) convert
+Python objects and NumPy arrays to `tf.Tensor` objects. The
+`tf.Tensor.numpy` method returns the object's value as a NumPy `ndarray`.
+
+```py
+a = tf.constant([[1, 2],
+                 [3, 4]])
+print(a)
+# => tf.Tensor([[1 2]
+#               [3 4]], shape=(2, 2), dtype=int32)
+
+# Broadcasting support
+b = tf.add(a, 1)
+print(b)
+# => tf.Tensor([[2 3]
+#               [4 5]], shape=(2, 2), dtype=int32)
+
+# Operator overloading is supported
+print(a * b)
+# => tf.Tensor([[ 2  6]
+#               [12 20]], shape=(2, 2), dtype=int32)
+
+# Use NumPy values
+import numpy as np
+
+c = np.multiply(a, b)
+print(c)
+# => [[ 2  6]
+#     [12 20]]
+
+# Obtain numpy value from a tensor:
+print(a.numpy())
+# => [[1 2]
+#     [3 4]]
+```
+
+The `tfe` module contains symbols available to both eager and graph execution
+environments and is useful for writing code to [work with graphs](#work_with_graphs):
+
+```py
+import tensorflow.contrib.eager as tfe
+```
+
+## Dynamic control flow
+
+A major benefit of eager execution is that all the functionality of the host
+language is available while your model is executing. So, for example,
+it is easy to write [fizzbuzz](https://en.wikipedia.org/wiki/Fizz_buzz):
+
+```py
+def fizzbuzz(max_num):
+  counter = tf.constant(0)
+  for num in range(max_num):
+    num = tf.constant(num)
+    if num % 3 == 0 and num % 5 == 0:
+      print('FizzBuzz')
+    elif num % 3 == 0:
+      print('Fizz')
+    elif num % 5 == 0:
+      print('Buzz')
+    else:
+      print(num)
+    counter += 1
+  return counter
+```
+
+This has conditionals that depend on tensor values and it prints these values
+at runtime.
+
+## Build a model
+
+Many machine learning models are represented by composing layers. When
+using TensorFlow with eager execution you can either write your own layers or
+use a layer provided in the `tf.keras.layers` package.
+
+While you can use any Python object to represent a layer,
+TensorFlow has `tf.keras.layers.Layer` as a convenient base class. Inherit from
+it to implement your own layer:
+
+```py
+class MySimpleLayer(tf.keras.layers.Layer):
+  def __init__(self, output_units):
+    self.output_units = output_units
+
+  def build(self, input):
+    # The build method gets called the first time your layer is used.
+    # Creating variables on build() allows you to make their shape depend
+    # on the input shape and hence remove the need for the user to specify
+    # full shapes. It is possible to create variables during __init__() if
+    # you already know their full shapes.
+    self.kernel = self.add_variable(
+      "kernel", [input.shape[-1], self.output_units])
+
+  def call(self, input):
+    # Override call() instead of __call__ so we can perform some bookkeeping.
+    return tf.matmul(input, self.kernel)
+```
+
+Use `tf.keras.layers.Dense` layer instead  of `MySimpleLayer` above as it has
+a superset of its functionality (it can also add a bias).
+
+When composing layers into models you can use `tf.keras.Sequential` to represent
+models which are a linear stack of layers. It is easy to use for basic models:
+
+```py
+model = tf.keras.Sequential([
+  tf.keras.layers.Dense(10, input_shape=(784,)),  # must declare input shape
+  tf.keras.layers.Dense(10)
+])
+```
+
+Alternatively, organize models in classes by inheriting from `tf.keras.Model`.
+This is a container for layers that is a layer itself, allowing `tf.keras.Model`
+objects to contain other `tf.keras.Model` objects.
+
+```py
+class MNISTModel(tf.keras.Model):
+  def __init__(self):
+    super(MNISTModel, self).__init__()
+    self.dense1 = tf.keras.layers.Dense(units=10)
+    self.dense2 = tf.keras.layers.Dense(units=10)
+
+  def call(self, input):
+    """Run the model."""
+    result = self.dense1(input)
+    result = self.dense2(result)
+    result = self.dense2(result)  # reuse variables from dense2 layer
+    return result
+
+model = MNISTModel()
+```
+
+It's not required to set an input shape for the `tf.keras.Model` class since
+the parameters are set the first time input is passed to the layer.
+
+`tf.keras.layers` classes create and contain their own model variables that
+are tied to the lifetime of their layer objects. To share layer variables, share
+their objects.
+
+
+## Eager training
+
+### Computing gradients
+
+[Automatic differentiation](https://en.wikipedia.org/wiki/Automatic_differentiation)
+is useful for implementing machine learning algorithms such as
+[backpropagation](https://en.wikipedia.org/wiki/Backpropagation) for training
+neural networks. During eager execution, use `tfe.GradientTape` to trace
+operations for computing gradients later.
+
+`tfe.GradientTape` is an opt-in feature to provide maximal performance when
+not tracing. Since different operations can occur during each call, all
+forward-pass operations get recorded to a "tape". To compute the gradient, play
+the tape backwards and then discard. A particular `tfe.GradientTape` can only
+compute one gradient; subsequent calls throw a runtime error.
+
+```py
+w = tfe.Variable([[1.0]])
+with tfe.GradientTape() as tape:
+  loss = w * w
+
+grad = tape.gradient(loss, [w])
+print(grad)  # => [tf.Tensor([[ 2.]], shape=(1, 1), dtype=float32)]
+```
+
+Here's an example of `tfe.GradientTape` that records forward-pass operations
+to train a simple model:
+
+```py
+# A toy dataset of points around 3 * x + 2
+NUM_EXAMPLES = 1000
+training_inputs = tf.random_normal([NUM_EXAMPLES])
+noise = tf.random_normal([NUM_EXAMPLES])
+training_outputs = training_inputs * 3 + 2 + noise
+
+def prediction(input, weight, bias):
+  return input * weight + bias
+
+# A loss function using mean-squared error
+def loss(weights, biases):
+  error = prediction(training_inputs, weights, biases) - training_outputs
+  return tf.reduce_mean(tf.square(error))
+
+# Return the derivative of loss with respect to weight and bias
+def grad(weights, biases):
+  with tfe.GradientTape() as tape:
+    loss_value = loss(weights, biases) 
+  return tape.gradient(loss_value, [weights, biases])
+
+train_steps = 200
+learning_rate = 0.01
+# Start with arbitrary values for W and B on the same batch of data
+W = tfe.Variable(5.)
+B = tfe.Variable(10.)
+
+print("Initial loss: {:.3f}".format(loss(W, B)))
+
+for i in range(train_steps):
+  dW, dB = grad(W, B)
+  W.assign_sub(dW * learning_rate)
+  B.assign_sub(dB * learning_rate)
+  if i % 20 == 0:
+    print("Loss at step {:03d}: {:.3f}".format(i, loss(W, B)))
+
+print("Final loss: {:.3f}".format(loss(W, B)))
+print("W = {}, B = {}".format(W.numpy(), B.numpy()))
+```
+
+Output (exact numbers may vary):
+
+```
+Initial loss: 71.204
+Loss at step 000: 68.333
+Loss at step 020: 30.222
+Loss at step 040: 13.691
+Loss at step 060: 6.508
+Loss at step 080: 3.382
+Loss at step 100: 2.018
+Loss at step 120: 1.422
+Loss at step 140: 1.161
+Loss at step 160: 1.046
+Loss at step 180: 0.996
+Final loss: 0.974
+W = 3.01582956314, B = 2.1191945076
+```
+
+Replay the `tfe.GradientTape` to compute the gradients and apply them in a
+training loop. This is demonstrated in an excerpt from the
+[mnist_eager.py](https://github.com/tensorflow/models/blob/master/official/mnist/mnist_eager.py)
+example:
+
+```py
+dataset = tf.data.Dataset.from_tensor_slices((data.train.images,
+                                              data.train.labels))
+...
+for (batch, (images, labels)) in enumerate(tfe.Iterator(dataset)):
+  ...
+  with tfe.GradientTape() as tape:
+    logits = model(images, training=True)
+    loss_value = loss(logits, labels)
+  ...
+  grads = tape.gradient(loss_value, model.variables)
+  optimizer.apply_gradients(zip(grads, model.variables),
+                            global_step=tf.train.get_or_create_global_step())
+```
+
+
+The following example creates a multi-layer model that classifies the standard
+[MNIST handwritten digits](https://www.tensorflow.org/tutorials/layers). It
+demonstrates the optimizer and layer APIs to build trainable graphs in an eager
+execution environment.
+
+### Train a model
+
+Even without training, call the model and inspect the output in eager execution:
+
+```py
+# Create a tensor representing a blank image
+batch = tf.zeros([1, 1, 784])
+print(batch.shape)  # => (1, 1, 784)
+
+result = model(batch)
+# => tf.Tensor([[[ 0.  0., ..., 0.]]], shape=(1, 1, 10), dtype=float32)
+```
+
+This example uses the
+[dataset.py module](https://github.com/tensorflow/models/blob/master/official/mnist/dataset.py)
+from the
+[TensorFlow MNIST example](https://github.com/tensorflow/models/tree/master/official/mnist);
+download this file to your local directory. Run the following to download the
+MNIST data files to your working directory and prepare a `tf.data.Dataset`
+for training:
+
+```py
+import dataset  # download dataset.py file
+dataset_train = dataset.train('./datasets').shuffle(60000).repeat(4).batch(32)
+```
+
+To train a model, define a loss function to optimize and then calculate
+gradients. Use an optimizer to update the variables:
+
+```py
+def loss(model, x, y):
+  prediction = model(x)
+  return tf.losses.sparse_softmax_cross_entropy(labels=y, logits=prediction)
+
+def grad(model, inputs, targets):
+  with tfe.GradientTape() as tape:
+    loss_value = loss(model, inputs, targets)
+  return tape.gradient(loss_value, model.variables)
+
+optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
+
+x, y = tfe.Iterator(dataset_train).next()
+print("Initial loss: {:.3f}".format(loss(model, x, y)))
+
+# Training loop
+for (i, (x, y)) in enumerate(tfe.Iterator(dataset_train)):
+  # Calculate derivatives of the input function with respect to its parameters.
+  grads = grad(model, x, y)
+  # Apply the gradient to the model
+  optimizer.apply_gradients(zip(grads, model.variables),
+                            global_step=tf.train.get_or_create_global_step())
+  if i % 200 == 0:
+    print("Loss at step {:04d}: {:.3f}".format(i, loss(model, x, y)))
+
+print("Final loss: {:.3f}".format(loss(model, x, y)))
+```
+
+Output (exact numbers may vary):
+
+```
+Initial loss: 2.674
+Loss at step 0000: 2.593
+Loss at step 0200: 2.143
+Loss at step 0400: 2.009
+Loss at step 0600: 2.103
+Loss at step 0800: 1.621
+Loss at step 1000: 1.695
+...
+Loss at step 6600: 0.602
+Loss at step 6800: 0.557
+Loss at step 7000: 0.499
+Loss at step 7200: 0.744
+Loss at step 7400: 0.681
+Final loss: 0.670
+```
+
+And for faster training, move the computation to a GPU:
+
+```py
+with tf.device("/gpu:0"):
+  for (i, (x, y)) in enumerate(tfe.Iterator(dataset_train)):
+    # minimize() is equivalent to the grad() and apply_gradients() calls.
+    optimizer.minimize(lambda: loss(model, x, y),
+                       global_step=tf.train.get_or_create_global_step())
+```
+
+### Variables and optimizers
+
+`tfe.Variable` objects store mutable `tf.Tensor` values accessed during
+training to make automatic differentiation easier. The parameters of a model can
+be encapsulated in classes as variables.
+
+Better encapsulate model parameters by using `tfe.Variable` with
+`tfe.GradientTape`. For example, the automatic differentiation example above
+can be rewritten:
+
+```py
+class Model(tf.keras.Model):
+  def __init__(self):
+    super(Model, self).__init__()
+    self.W = tfe.Variable(5., name='weight')
+    self.B = tfe.Variable(10., name='bias')
+  def predict(self, inputs):
+    return inputs * self.W + self.B
+
+# A toy dataset of points around 3 * x + 2
+NUM_EXAMPLES = 2000
+training_inputs = tf.random_normal([NUM_EXAMPLES])
+noise = tf.random_normal([NUM_EXAMPLES])
+training_outputs = training_inputs * 3 + 2 + noise
+
+# The loss function to be optimized
+def loss(model, inputs, targets):
+  error = model.predict(inputs) - targets
+  return tf.reduce_mean(tf.square(error))
+
+def grad(model, inputs, targets):
+  with tfe.GradientTape() as tape:
+    loss_value = loss(model, inputs, targets)
+  return tape.gradient(loss_value, [model.W, model.B])
+
+# Define:
+# 1. A model.
+# 2. Derivatives of a loss function with respect to model parameters.
+# 3. A strategy for updating the variables based on the derivatives.
+model = Model()
+optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
+
+print("Initial loss: {:.3f}".format(loss(model, training_inputs, training_outputs)))
+
+# Training loop
+for i in range(300):
+  grads = grad(model, training_inputs, training_outputs)
+  optimizer.apply_gradients(zip(grads, [model.W, model.B]),
+                            global_step=tf.train.get_or_create_global_step())
+  if i % 20 == 0:
+    print("Loss at step {:03d}: {:.3f}".format(i, loss(model, training_inputs, training_outputs)))
+
+print("Final loss: {:.3f}".format(loss(model, training_inputs, training_outputs)))
+print("W = {}, B = {}".format(model.W.numpy(), model.B.numpy()))
+```
+
+Output (exact numbers may vary):
+
+```
+Initial loss: 69.066
+Loss at step 000: 66.368
+Loss at step 020: 30.107
+Loss at step 040: 13.959
+Loss at step 060: 6.769
+Loss at step 080: 3.567
+Loss at step 100: 2.141
+Loss at step 120: 1.506
+Loss at step 140: 1.223
+Loss at step 160: 1.097
+Loss at step 180: 1.041
+Loss at step 200: 1.016
+Loss at step 220: 1.005
+Loss at step 240: 1.000
+Loss at step 260: 0.998
+Loss at step 280: 0.997
+Final loss: 0.996
+W = 2.99431324005, B = 2.02129220963
+```
+
+## Use objects for state during eager execution
+
+With graph execution, program state (such as the variables) is stored in global
+collections and their lifetime is managed by the `tf.Session` object. In
+contrast, during eager execution the lifetime of state objects is determined by
+the lifetime of their corresponding Python object.
+
+### Variables are objects
+
+During eager execution, variables persist until the last reference to the object
+is removed, and is then deleted.
+
+```py
+with tf.device("gpu:0"):
+  v = tfe.Variable(tf.random_normal([1000, 1000]))
+  v = None  # v no longer takes up GPU memory
+```
+
+### Object-based saving
+
+`tfe.Checkpoint` can save and restore `tfe.Variable`s to and from
+checkpoints:
+
+```py
+x = tfe.Variable(10.)
+
+checkpoint = tfe.Checkpoint(x=x)  # save as "x"
+
+x.assign(2.)   # Assign a new value to the variables and save.
+save_path = checkpoint.save('./ckpt/')
+
+x.assign(11.)  # Change the variable after saving.
+
+# Restore values from the checkpoint
+checkpoint.restore(save_path)
+
+print(x)  # => 2.0
+```
+
+To save and load models, `tfe.Checkpoint` stores the internal state of objects,
+without requiring hidden variables. To record the state of a `model`,
+an `optimizer`, and a global step, pass them to a `tfe.Checkpoint`:
+
+```py
+model = MyModel()
+optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
+checkpoint_dir = ‘/path/to/model_dir’
+checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
+root = tfe.Checkpoint(optimizer=optimizer,
+                      model=model,
+                      optimizer_step=tf.train.get_or_create_global_step())
+
+root.save(file_prefix=checkpoint_prefix)
+# or
+root.restore(tf.train.latest_checkpoint(checkpoint_dir))
+```
+
+### Object-oriented metrics
+
+`tfe.metrics` are stored as objects. Update a metric by passing the new data to
+the callable, and retrieve the result using the `tfe.metrics.result` method,
+for example:
+
+```py
+m = tfe.metrics.Mean("loss")
+m(0)
+m(5)
+m.result()  # => 2.5
+m([8, 9])
+m.result()  # => 5.5
+```
+
+#### Summaries and TensorBoard
+
+@{$summaries_and_tensorboard$TensorBoard} is a visualization tool for
+understanding, debugging and optimizing the model training process. It uses
+summary events that are written while executing the program.
+
+`tf.contrib.summary` is compatible with both eager and graph execution
+environments. Summary operations, such as `tf.contrib.summary.scalar`, are
+inserted during model construction. For example, to record summaries once every
+100 global steps:
+
+```py
+writer = tf.contrib.summary.create_file_writer(logdir)
+global_step=tf.train.get_or_create_global_step()  # return global step var
+
+writer.set_as_default()
+
+for _ in range(iterations):
+  global_step.assign_add(1)
+  # Must include a record_summaries method
+  with tf.contrib.summary.record_summaries_every_n_global_steps(100):
+    # your model code goes here
+    tf.contrib.summary.scalar('loss', loss)
+     ...
+```
+
+## Advanced automatic differentiation topics
+
+### Dynamic models
+
+`tfe.GradientTape` can also be used in dynamic models. This example for a
+[backtracking line search](https://wikipedia.org/wiki/Backtracking_line_search)
+algorithm looks like normal NumPy code, except there are gradients and is
+differentiable, despite the complex control flow:
+
+```py
+def line_search_step(fn, init_x, rate=1.0):
+  with tfe.GradientTape() as tape:
+    # Variables are automatically recorded, but manually watch a tensor
+    tape.watch(init_x)
+    value = fn(init_x)
+  grad, = tape.gradient(value, [init_x])
+  grad_norm = tf.reduce_sum(grad * grad)
+  init_value = value
+  while value > init_value - rate * grad_norm:
+    x = init_x - rate * grad
+    value = fn(x)
+    rate /= 2.0
+  return x, value
+```
+
+### Additional functions to compute gradients
+
+`tfe.GradientTape` is a powerful interface for computing gradients, but there
+is another [Autograd](https://github.com/HIPS/autograd)-style API available for
+automatic differentiation. These functions are useful if writing math code with
+only tensors and gradient functions, and without `tfe.Variables`:
+
+* `tfe.gradients_function` —Returns a function that computes the derivatives
+  of its input function parameter with respect to its arguments. The input
+  function parameter must return a scalar value. When the returned function is
+  invoked, it returns a list of `tf.Tensor` objects: one element for each
+  argument of the input function. Since anything of interest must be passed as a
+  function parameter, this becomes unwieldy if there's a dependency on many
+  trainable parameters.
+* `tfe.value_and_gradients_function` —Similar to
+  `tfe.gradients_function`, but when the returned function is invoked, it
+  returns the value from the input function in addition to the list of
+  derivatives of the input function with respect to its arguments.
+
+In the following example, `tfe.gradients_function` takes the `square`
+function as an argument and returns a function that computes the partial
+derivatives of `square` with respect to its inputs. To calculate the derivative
+of `square` at `3`, `grad(3.0)` returns `6`.
+
+```py
+def square(x):
+  return tf.multiply(x, x)
+
+grad = tfe.gradients_function(square)
+
+square(3.)  # => 9.0
+grad(3.)    # => [6.0]
+
+# The second-order derivative of square:
+gradgrad = tfe.gradients_function(lambda x: grad(x)[0])
+gradgrad(3.)  # => [2.0]
+
+# The third-order derivative is None:
+gradgradgrad = tfe.gradients_function(lambda x: gradgrad(x)[0])
+gradgradgrad(3.)  # => [None]
+
+
+# With flow control:
+def abs(x):
+  return x if x > 0. else -x
+
+grad = tfe.gradients_function(abs)
+
+grad(3.)   # => [1.0]
+grad(-3.)  # => [-1.0]
+```
+
+### Custom gradients
+
+Custom gradients are an easy way to override gradients in eager and graph
+execution. Within the forward function, define the gradient with respect to the
+inputs, outputs, or intermediate results. For example, here's an easy way to clip
+the norm of the gradients in the backward pass:
+
+```py
+@tf.custom_gradient
+def clip_gradient_by_norm(x, norm):
+  y = tf.identity(x)
+  def grad_fn(dresult):
+    return [tf.clip_by_norm(dresult, norm), None]
+  return y, grad_fn
+```
+
+Custom gradients are commonly used to provide a numerically stable gradient for a
+sequence of operations:
+
+```py
+def log1pexp(x):
+  return tf.log(1 + tf.exp(x))
+grad_log1pexp = tfe.gradients_function(log1pexp)
+
+# The gradient computation works fine at x = 0.
+grad_log1pexp(0.)  # => [0.5]
+
+# However, x = 100 fails because of numerical instability.
+grad_log1pexp(100.)  # => [nan]
+```
+
+Here, the `log1pexp` function can be analytically simplified with a custom
+gradient. The implementation below reuses the value for `tf.exp(x)` that is
+computed during the forward pass—making it more efficient by eliminating
+redundant calculations:
+
+```py
+@tf.custom_gradient
+def log1pexp(x):
+  e = tf.exp(x)
+  def grad(dy):
+    return dy * (1 - 1 / (1 + e))
+  return tf.log(1 + e), grad
+
+grad_log1pexp = tfe.gradients_function(log1pexp)
+
+# As before, the gradient computation works fine at x = 0.
+grad_log1pexp(0.)  # => [0.5]
+
+# And the gradient computation also works at x = 100.
+grad_log1pexp(100.)  # => [1.0]
+```
+
+## Performance
+
+Computation is automatically offloaded to GPUs during eager execution. If you
+want control over where a computation runs you can enclose it in a
+`tf.device('/gpu:0')` block (or the CPU equivalent):
+
+```py
+import time
+
+def measure(x, steps):
+  # TensorFlow initializes a GPU the first time it's used, exclude from timing.
+  tf.matmul(x, x)
+  start = time.time()
+  for i in range(steps):
+    x = tf.matmul(x, x)
+    _ = x.numpy()  # Make sure to execute op and not just enqueue it
+  end = time.time()
+  return end - start
+
+shape = (1000, 1000)
+steps = 200
+print("Time to multiply a {} matrix by itself {} times:".format(shape, steps))
+
+# Run on CPU:
+with tf.device("/cpu:0"):
+  print("CPU: {} secs".format(measure(tf.random_normal(shape), steps)))
+
+# Run on GPU, if available:
+if tfe.num_gpus() > 0:
+  with tf.device("/gpu:0"):
+    print("GPU: {} secs".format(measure(tf.random_normal(shape), steps)))
+else:
+  print("GPU: not found")
+```
+
+Output (exact numbers depend on hardware):
+
+```
+Time to multiply a (1000, 1000) matrix by itself 200 times:
+CPU: 4.614904403686523 secs
+GPU: 0.5581181049346924 secs
+```
+
+A `tf.Tensor` object can be copied to a different device to execute its
+operations:
+
+```py
+x = tf.random_normal([10, 10])
+
+x_gpu0 = x.gpu()
+x_cpu = x.cpu()
+
+_ = tf.matmul(x_cpu, x_cpu)    # Runs on CPU
+_ = tf.matmul(x_gpu0, x_gpu0)  # Runs on GPU:0
+
+if tfe.num_gpus() > 1:
+  x_gpu1 = x.gpu(1)
+  _ = tf.matmul(x_gpu1, x_gpu1)  # Runs on GPU:1
+```
+
+### Benchmarks
+
+For compute-heavy models, such as
+[ResNet50](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/resnet50)
+training on a GPU, eager execution performance is comparable to graph execution.
+But this gap grows larger for models with less computation and there is work to
+be done for optimizing hot code paths for models with lots of small operations.
+
+
+## Work with graphs
+
+While eager execution makes development and debugging more interactive,
+TensorFlow graph execution has advantages for distributed training, performance
+optimizations, and production deployment. However, writing graph code can feel
+different than writing regular Python code and more difficult to debug.
+
+For building and training graph-constructed models, the Python program first
+builds a graph representing the computation, then invokes `Session.run` to send
+the graph for execution on the C++-based runtime.  This provides:
+
+* Automatic differentiation using static autodiff.
+* Simple deployment to a platform independent server.
+* Graph-based optimizations (common subexpression elimination, constant-folding, etc.).
+* Compilation and kernel fusion.
+* Automatic distribution and replication (placing nodes on the distributed system).
+
+Deploying code written for eager execution is more difficult: either generate a
+graph from the model, or run the Python runtime and code directly on the server.
+
+### Write compatible code
+
+The same code written for eager execution will also build a graph during graph
+execution. Do this by simply running the same code in a new Python session where
+eager execution is not enabled.
+
+Most TensorFlow operations work during eager execution, but there are some things
+to keep in mind:
+
+* Use `tf.data` for input processing instead of queues. It's faster and easier.
+* Use object-oriented layer APIs—like `tf.keras.layers` and
+  `tf.keras.Model`—since they have explicit storage for variables.
+* Most model code works the same during eager and graph execution, but there are
+  exceptions. (For example, dynamic models using Python control flow to change the
+  computation based on inputs.)
+* Once eager execution is enabled with `tf.enable_eager_execution`, it
+  cannot be turned off. Start a new Python session to return to graph execution.
+
+It's best to write code for both eager execution *and* graph execution. This
+gives you eager's interactive experimentation and debuggability with the
+distributed performance benefits of graph execution.
+
+Write, debug, and iterate in eager execution, then import the model graph for
+production deployment. Use `tfe.Checkpoint` to save and restore model
+variables, this allows movement between eager and graph execution environments.
+See the examples in:
+[tensorflow/contrib/eager/python/examples](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples).
+
+### Use eager execution in a graph environment
+
+Selectively enable eager execution in a TensorFlow graph environment using
+`tfe.py_func`. This is used when `tf.enable_eager_execution()` has *not*
+been called.
+
+```py
+def my_py_func(x):
+  x = tf.matmul(x, x)  # You can use tf ops
+  print(x)  # but it's eager!
+  return x
+
+with tf.Session() as sess:
+  x = tf.placeholder(dtype=tf.float32)
+  # Call eager function in graph!
+  pf = tfe.py_func(my_py_func, [x], tf.float32)
+  sess.run(pf, feed_dict={x: [[2.0]]})  # [[4.0]]
+```
diff --git a/tensorflow/docs_src/programmers_guide/embedding.md b/tensorflow/docs_src/programmers_guide/embedding.md
index e8027fc12b368ddfbc51cc47441478901d7caec7..d5703e07375b1f68f4e22476288f1ed57d340c5b 100644
--- a/tensorflow/docs_src/programmers_guide/embedding.md
+++ b/tensorflow/docs_src/programmers_guide/embedding.md
@@ -7,6 +7,9 @@ with the TensorBoard Embedding Projector
 newcomers to machine learning or TensorFlow, and the Embedding Projector how-to
 is for users at all levels.
 
+An alternative tutorial on these concepts is available in the
+[Embeddings section of Machine Learning Crash Course](https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture).
+
 [TOC]
 
 An **embedding** is a mapping from discrete objects, such as words, to vectors
diff --git a/tensorflow/docs_src/programmers_guide/index.md b/tensorflow/docs_src/programmers_guide/index.md
index e8c2fa6990c8ecfca1cfe76b3f813b4ae6917742..017db0e8cb4d239fa4b6be6a5f9d6b0c582a82c2 100644
--- a/tensorflow/docs_src/programmers_guide/index.md
+++ b/tensorflow/docs_src/programmers_guide/index.md
@@ -5,6 +5,7 @@ works. The units are as follows:
 
 ## High Level APIs
 
+  * @{$programmers_guide/eager}, which is the easiest way to use tensorflow.
   * @{$programmers_guide/estimators}, which introduces a high-level
     TensorFlow API that greatly simplifies ML programming.
   * @{$programmers_guide/datasets}, which explains how to
diff --git a/tensorflow/docs_src/programmers_guide/leftnav_files b/tensorflow/docs_src/programmers_guide/leftnav_files
index 3fe4cb2ddaee40d9d6c6470bee171dedb27ad890..7ac63bf2e019fc3b6aa7ab1b3e6422a97858d8c6 100644
--- a/tensorflow/docs_src/programmers_guide/leftnav_files
+++ b/tensorflow/docs_src/programmers_guide/leftnav_files
@@ -1,8 +1,9 @@
 index.md
 
 ### High Level APIs
-estimators.md
+eager.md
 datasets.md
+estimators.md
 
 ### Low Level APIs
 low_level_intro.md
diff --git a/tensorflow/docs_src/programmers_guide/saved_model.md b/tensorflow/docs_src/programmers_guide/saved_model.md
index d01d187e865ba67db9665565cdcab02f9ad16fe6..55ee42dd6405db6bd34b064d71deaeb94839b0fa 100644
--- a/tensorflow/docs_src/programmers_guide/saved_model.md
+++ b/tensorflow/docs_src/programmers_guide/saved_model.md
@@ -3,7 +3,7 @@
 The @{tf.train.Saver} class provides methods to save and restore models. The
 @{tf.saved_model.simple_save} function is an easy way to build a
 @{tf.saved_model$saved model} suitable for serving.
-[Estimators](/programmers_guide/estimators) automatically save and restore
+[Estimators](@{$programmers_guide/estimators}) automatically save and restore
 variables in the `model_dir`.
 
 ## Save and restore variables
@@ -400,7 +400,7 @@ defined in:
 
 After training an `Estimator` model, you may want to create a service
 from that model that takes requests and returns a result.  You can run such a
-service locally on your machine or deploy it scalably in the cloud.
+service locally on your machine or deploy it in the cloud.
 
 To prepare a trained Estimator for serving, you must export it in the standard
 SavedModel format. This section explains how to:
diff --git a/tensorflow/docs_src/programmers_guide/using_tpu.md b/tensorflow/docs_src/programmers_guide/using_tpu.md
index a9c2cb3e33d4817b9a35400dcce9227ddd635ff4..cb0d86fc4c54ac690f13c93ebbd10805c7738c62 100644
--- a/tensorflow/docs_src/programmers_guide/using_tpu.md
+++ b/tensorflow/docs_src/programmers_guide/using_tpu.md
@@ -11,7 +11,7 @@ This doc is aimed at users who:
   using an existing model.
 * Have, perhaps, skimmed the code of an example TPU model
   [[1]](https://github.com/tensorflow/models/blob/master/official/mnist/mnist_tpu.py)
-  [[2]](https://github.com/tensorflow/tpu-demos/tree/master/cloud_tpu/models).
+  [[2]](https://github.com/tensorflow/tpu/tree/master/models).
 * Are interested in porting an existing `Estimator` model to
   run on Cloud TPUs
 
@@ -288,7 +288,7 @@ If shape inference has failed, but the shape is known it is possible to
 impose the correct shape using `tf.set_shape()`. 
 
 In the example below the shape
-inference algorithm fails, but it is corrected using `set_shape`:
+inference algorithm fails, but it is correctly using `set_shape`:
 
 ```
 >>> x = tf.zeros(tf.constant([1,2,3])+1)
@@ -371,10 +371,10 @@ in bytes. A minimum of a few MB (`buffer_size=8*1024*1024`) is recommended so
 that data is available when needed.
 
 The TPU-demos repo includes
-[a script](https://github.com/tensorflow/tpu-demos/blob/master/cloud_tpu/datasets/imagenet_to_gcs.py)
+[a script](https://github.com/tensorflow/tpu/blob/master/tools/datasets/imagenet_to_gcs.py)
 for downloading the imagenet dataset and converting it to an appropriate format.
 This together with the imagenet
-[models](https://github.com/tensorflow/tpu-demos/tree/master/cloud_tpu/models)
+[models](https://github.com/tensorflow/tpu/tree/master/models)
 included in the repo demonstrate all of these best-practices.
 
 
@@ -387,7 +387,7 @@ For details on how to actually set up and run a Cloud TPU see:
 This document is by no means exhaustive. The best source of more detail on how
 to make a Cloud TPU compatible model are the example models published in:
 
- * The [TPU Demos Repository.](https://github.com/tensorflow/tpu-demos/)
+ * The [TPU Demos Repository.](https://github.com/tensorflow/tpu)
 
 For more information about tuning TensorFlow code for performance see:
 
diff --git a/tensorflow/docs_src/tutorials/deep_cnn.md b/tensorflow/docs_src/tutorials/deep_cnn.md
index 679754020470dddfcffa76e62ca8f55a439ec4f5..6a4c9a9b0727208a158b1b57d13ca70290961ec2 100644
--- a/tensorflow/docs_src/tutorials/deep_cnn.md
+++ b/tensorflow/docs_src/tutorials/deep_cnn.md
@@ -268,7 +268,7 @@ in `cifar10_input.py`.
 
 `cifar10_train.py` periodically @{tf.train.Saver$saves}
 all model parameters in
-@{$variables#saving-and-restoring$checkpoint files}
+@{$programmers_guide/saved_model$checkpoint files}
 but it does *not* evaluate the model. The checkpoint file
 will be used by `cifar10_eval.py` to measure the predictive
 performance (see [Evaluating a Model](#evaluating-a-model) below).
diff --git a/tensorflow/docs_src/tutorials/kernel_methods.md b/tensorflow/docs_src/tutorials/kernel_methods.md
index b1f06ce0a3b35a79b60f18dfe8a40dda0c732f07..73e5c5105784ddc9729b8cea6cd31921572837e1 100644
--- a/tensorflow/docs_src/tutorials/kernel_methods.md
+++ b/tensorflow/docs_src/tutorials/kernel_methods.md
@@ -1,7 +1,7 @@
 # Improving Linear Models Using Explicit Kernel Methods
 
 Note: This document uses a deprecated version of @{tf.estimator},
-which has a different interface (see `tf.contrib.learn Estimator`).
+which has a @{tf.contrib.learn.Estimator$different interface}.
 It also uses other `contrib` methods whose
 @{$version_compat#not_covered$API may not be stable}.
 
@@ -53,7 +53,7 @@ In order to feed data to a `tf.contrib.learn Estimator`, it is helpful to conver
 it to Tensors. For this, we will use an `input function` which adds Ops to the
 TensorFlow graph that, when executed, create mini-batches of Tensors to be used
 downstream. For more background on input functions, check
-@{$get_started/premade_estimators#input_fn$this section on input functions}.
+@{$get_started/premade_estimators#create_input_functions$this section on input functions}.
 In this example, we will use the `tf.train.shuffle_batch` Op which, besides
 converting numpy arrays to Tensors, allows us to specify the batch_size and
 whether to randomize the input every time the input_fn Ops are executed
diff --git a/tensorflow/docs_src/tutorials/layers.md b/tensorflow/docs_src/tutorials/layers.md
index 9b17d0d4d52e6c5aa9b00b2e50840fa8cd4e8cb6..cadaec391d8970faf5847c9b9e39bccb31f885ed 100644
--- a/tensorflow/docs_src/tutorials/layers.md
+++ b/tensorflow/docs_src/tutorials/layers.md
@@ -199,16 +199,22 @@ Classifier"](#training_and_evaluating_the_cnn_mnist_classifier).
 
 The methods in the `layers` module for creating convolutional and pooling layers
 for two-dimensional image data expect input tensors to have a shape of
-<code>[<em>batch_size</em>, <em>image_width</em>, <em>image_height</em>,
-<em>channels</em>]</code>, defined as follows:
+<code>[<em>batch_size</em>, <em>image_height</em>, <em>image_width</em>,
+<em>channels</em>]</code> by default. This behavior can be changed using the <code><em>data_format</em></code> parameter; defined as follows:
+
 
 *   _`batch_size`_. Size of the subset of examples to use when performing
     gradient descent during training.
-*   _`image_width`_. Width of the example images.
 *   _`image_height`_. Height of the example images.
+*   _`image_width`_. Width of the example images.
 *   _`channels`_. Number of color channels in the example images. For color
     images, the number of channels is 3 (red, green, blue). For monochrome
     images, there is just 1 channel (black).
+*   _`image_height`_. Height of the example images.
+*   _`data_format`_. A string, one of `channels_last` (default) or `channels_first`.
+      `channels_last` corresponds to inputs with shape
+      `(batch, ..., channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, ...)`.
 
 Here, our MNIST dataset is composed of monochrome 28x28 pixel images, so the
 desired shape for our input layer is <code>[<em>batch_size</em>, 28, 28,
@@ -247,28 +253,27 @@ conv1 = tf.layers.conv2d(
 ```
 
 The `inputs` argument specifies our input tensor, which must have the shape
-<code>[<em>batch_size</em>, <em>image_width</em>, <em>image_height</em>,
+<code>[<em>batch_size</em>, <em>image_height</em>, <em>image_width</em>,
 <em>channels</em>]</code>. Here, we're connecting our first convolutional layer
 to `input_layer`, which has the shape <code>[<em>batch_size</em>, 28, 28,
 1]</code>.
 
 > Note: <code>conv2d()</code> will instead accept a shape of
-> <code>[<em>channels</em>, <em>batch_size</em>, <em>image_width</em>,
-> <em>image_height</em>]</code> when passed the argument
+> <code>[<em>batch_size</em>, <em>channels</em>, <em>image_height</em>, <em>image_width</em>]</code> when passed the argument
 > <code>data_format=channels_first</code>.
 
 The `filters` argument specifies the number of filters to apply (here, 32), and
-`kernel_size` specifies the dimensions of the filters as <code>[<em>width</em>,
-<em>height</em>]</code> (here, <code>[5, 5]</code>).
+`kernel_size` specifies the dimensions of the filters as <code>[<em>height</em>,
+<em>width</em>]</code> (here, <code>[5, 5]</code>).
 
-<p class="tip"><b>TIP:</b> If filter width and height have the same value, you can instead specify a
+<p class="tip"><b>TIP:</b> If filter height and width have the same value, you can instead specify a
 single integer for <code>kernel_size</code>—e.g., <code>kernel_size=5</code>.</p>
 
 The `padding` argument specifies one of two enumerated values
 (case-insensitive): `valid` (default value) or `same`. To specify that the
-output tensor should have the same width and height values as the input tensor,
+output tensor should have the same height and width values as the input tensor,
 we set `padding=same` here, which instructs TensorFlow to add 0 values to the
-edges of the input tensor to preserve width and height of 28. (Without padding,
+edges of the input tensor to preserve height and width of 28. (Without padding,
 a 5x5 convolution over a 28x28 tensor will produce a 24x24 tensor, as there are
 24x24 locations to extract a 5x5 tile from a 28x28 grid.)
 
@@ -277,7 +282,7 @@ output of the convolution. Here, we specify ReLU activation with
 @{tf.nn.relu}.
 
 Our output tensor produced by `conv2d()` has a shape of
-<code>[<em>batch_size</em>, 28, 28, 32]</code>: the same width and height
+<code>[<em>batch_size</em>, 28, 28, 32]</code>: the same height and width
 dimensions as the input, but now with 32 channels holding the output from each
 of the filters.
 
@@ -292,31 +297,30 @@ pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)
 ```
 
 Again, `inputs` specifies the input tensor, with a shape of
-<code>[<em>batch_size</em>, <em>image_width</em>, <em>image_height</em>,
+<code>[<em>batch_size</em>, <em>image_height</em>, <em>image_width</em>,
 <em>channels</em>]</code>. Here, our input tensor is `conv1`, the output from
 the first convolutional layer, which has a shape of <code>[<em>batch_size</em>,
 28, 28, 32]</code>.
 
 > Note: As with <code>conv2d()</code>, <code>max_pooling2d()</code> will instead
-> accept a shape of <code>[<em>channels</em>, <em>batch_size</em>,
-> <em>image_width</em>, <em>image_height</em>]</code> when passed the argument
+> accept a shape of <code>[<em>batch_size</em>, <em>channels</em>, 
+> <em>image_height</em>, <em>image_width</em>]</code> when passed the argument
 > <code>data_format=channels_first</code>.
 
 The `pool_size` argument specifies the size of the max pooling filter as
-<code>[<em>width</em>, <em>height</em>]</code> (here, `[2, 2]`). If both
+<code>[<em>height</em>, <em>width</em>]</code> (here, `[2, 2]`). If both
 dimensions have the same value, you can instead specify a single integer (e.g.,
 `pool_size=2`).
 
 The `strides` argument specifies the size of the stride. Here, we set a stride
 of 2, which indicates that the subregions extracted by the filter should be
-separated by 2 pixels in both the width and height dimensions (for a 2x2 filter,
+separated by 2 pixels in both the height and width dimensions (for a 2x2 filter,
 this means that none of the regions extracted will overlap). If you want to set
-different stride values for width and height, you can instead specify a tuple or
+different stride values for height and width, you can instead specify a tuple or
 list (e.g., `stride=[3, 6]`).
 
 Our output tensor produced by `max_pooling2d()` (`pool1`) has a shape of
-<code>[<em>batch_size</em>, 14, 14, 32]</code>: the 2x2 filter reduces width and
-height by 50% each.
+<code>[<em>batch_size</em>, 14, 14, 32]</code>: the 2x2 filter reduces height and width by 50% each.
 
 ### Convolutional Layer #2 and Pooling Layer #2
 
@@ -338,13 +342,11 @@ pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)
 
 Note that convolutional layer #2 takes the output tensor of our first pooling
 layer (`pool1`) as input, and produces the tensor `conv2` as output. `conv2`
-has a shape of <code>[<em>batch_size</em>, 14, 14, 64]</code>, the same width
-and height as `pool1` (due to `padding="same"`), and 64 channels for the 64
+has a shape of <code>[<em>batch_size</em>, 14, 14, 64]</code>, the same height and width as `pool1` (due to `padding="same"`), and 64 channels for the 64
 filters applied.
 
 Pooling layer #2 takes `conv2` as input, producing `pool2` as output. `pool2`
-has shape <code>[<em>batch_size</em>, 7, 7, 64]</code> (50% reduction of width
-and height from `conv2`).
+has shape <code>[<em>batch_size</em>, 7, 7, 64]</code> (50% reduction of height and width from `conv2`).
 
 ### Dense Layer
 
@@ -360,7 +362,7 @@ pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64])
 
 In the `reshape()` operation above, the `-1` signifies that the *`batch_size`*
 dimension will be dynamically calculated based on the number of examples in our
-input data. Each example has 7 (`pool2` width) * 7 (`pool2` height) * 64
+input data. Each example has 7 (`pool2` height) * 7 (`pool2` width) * 64
 (`pool2` channels) features, so we want the `features` dimension to have a value
 of 7 * 7 * 64 (3136 in total). The output tensor, `pool2_flat`, has shape
 <code>[<em>batch_size</em>, 3136]</code>.
diff --git a/tensorflow/docs_src/tutorials/recurrent_quickdraw.md b/tensorflow/docs_src/tutorials/recurrent_quickdraw.md
index 7584a76ba5d3a044377e7050993b6ad778633433..5d83fbe2a3709c0834f448cbc316453f80428dd1 100644
--- a/tensorflow/docs_src/tutorials/recurrent_quickdraw.md
+++ b/tensorflow/docs_src/tutorials/recurrent_quickdraw.md
@@ -38,8 +38,8 @@ To try the code for this tutorial:
 1.  [Download the data](#download-the-data) in `TFRecord` format from
     [here](http://download.tensorflow.org/data/quickdraw_tutorial_dataset_v1.tar.gz) and unzip it. More details about [how to
     obtain the original Quick, Draw!
-    data](#optional-download-the-full-quick-draw-data) and [how to convert that
-    to `TFRecord` files](#optional-converting-the-data) is available below.
+    data](#optional_download_the_full_quick_draw_data) and [how to convert that
+    to `TFRecord` files](#optional_converting_the_data) is available below.
 
 1.  Execute the tutorial code with the following command to train the RNN-based
     model described in this tutorial. Make sure to adjust the paths to point to
@@ -108,7 +108,7 @@ This download will take a while and download a bit more than 23GB of data.
 ### Optional: Converting the data
 
 To convert the `ndjson` files to
-@{$python/python_io#tfrecords_format_details$TFRecord} files containing
+@{$python/python_io#TFRecords_Format_Details$TFRecord} files containing
 [`tf.train.Example`](https://www.tensorflow.org/code/tensorflow/core/example/example.proto)
 protos run the following command.
 
@@ -118,7 +118,7 @@ protos run the following command.
 ```
 
 This will store the data in 10 shards of
-@{$python/python_io#tfrecords_format_details$TFRecord} files with 10000 items
+@{$python/python_io#TFRecords_Format_Details$TFRecord} files with 10000 items
 per class for the training data and 1000 items per class as eval data.
 
 This conversion process is described in more detail in the following.
diff --git a/tensorflow/examples/adding_an_op/BUILD b/tensorflow/examples/adding_an_op/BUILD
index b3ed6589ed062dc2331b7dc64184a2b39062271e..cf8054be6a3e89a307a10fdb711a62ac3a46d410 100644
--- a/tensorflow/examples/adding_an_op/BUILD
+++ b/tensorflow/examples/adding_an_op/BUILD
@@ -139,15 +139,3 @@ tf_cc_binary(
         "//tensorflow/core:framework",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/examples/android/BUILD b/tensorflow/examples/android/BUILD
index 12146477972a116903f731a03b9755aafd92acc1..a088d7cf2f05c81cf2e60cb5aa8de79957a30de2 100644
--- a/tensorflow/examples/android/BUILD
+++ b/tensorflow/examples/android/BUILD
@@ -100,22 +100,6 @@ filegroup(
 )
 # LINT.ThenChange(//tensorflow/examples/android/download-models.gradle)
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "bin/**",
-            "gen/**",
-            "gradleBuild/**",
-            "libs/**",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 filegroup(
     name = "java_files",
     srcs = glob(["src/**/*.java"]),
diff --git a/tensorflow/examples/benchmark/BUILD b/tensorflow/examples/benchmark/BUILD
index c4bb0a5bd952ea175a4fd2444a3d632dc13445de..98611a9aadf6f456dd4f9fe4f423e3e2ce9722ec 100644
--- a/tensorflow/examples/benchmark/BUILD
+++ b/tensorflow/examples/benchmark/BUILD
@@ -23,9 +23,3 @@ tf_py_logged_benchmark(
     name = "sample_logged_benchmark",
     target = "//tensorflow/examples/benchmark:sample_benchmark",
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**/*"]),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/examples/get_started/regression/BUILD b/tensorflow/examples/get_started/regression/BUILD
index 577b970c9063dfa9a2acdb7d18362aa8adba827f..bee94d7d90fb3f70107a5dd9e9223f3013402073 100644
--- a/tensorflow/examples/get_started/regression/BUILD
+++ b/tensorflow/examples/get_started/regression/BUILD
@@ -2,18 +2,6 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 py_test(
     name = "test",
     size = "medium",
diff --git a/tensorflow/examples/how_tos/reading_data/BUILD b/tensorflow/examples/how_tos/reading_data/BUILD
index 4a43585d5395b1df94dd8a8767f92f131cfcaea4..64a054d3712035252666ca84e676add3d079e52a 100644
--- a/tensorflow/examples/how_tos/reading_data/BUILD
+++ b/tensorflow/examples/how_tos/reading_data/BUILD
@@ -54,15 +54,3 @@ py_binary(
         "//tensorflow/examples/tutorials/mnist:input_data",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/examples/image_retraining/BUILD b/tensorflow/examples/image_retraining/BUILD
index 9f9244a74c4d073cc67b7c8252b0bcff86e9400f..ecd79a3b004d0ca9f50d2a6f140dbc353efe30cb 100644
--- a/tensorflow/examples/image_retraining/BUILD
+++ b/tensorflow/examples/image_retraining/BUILD
@@ -49,15 +49,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/examples/ios/README.md b/tensorflow/examples/ios/README.md
index 5bdaeb43ce143e36e78cfe301fd9b59e8b85b034..5d7bd36837b2a2c33ab4bc311a582c174666dcd5 100644
--- a/tensorflow/examples/ios/README.md
+++ b/tensorflow/examples/ios/README.md
@@ -119,11 +119,13 @@ rundown:
    `tensorflow/contrib/makefile/gen/lib` to the Library Search Paths setting.
 
  - You'll also need to add `libprotobuf.a` and `libprotobuf-lite.a` from
-   `tensorflow/contrib/makefile/gen/protobuf_ios/lib` to your _Build Stages_ and
-   _Library Search Paths_.
+   `tensorflow/contrib/makefile/gen/protobuf_ios/lib`
+   and `nsync.a` from `tensorflow/contrib/makefile/downloads/nsync/builds/lipo.ios.c++11` 
+   to your _Build Stages_ and _Library Search Paths_.
 
  - The _Header Search_ paths needs to contain:
    - the root folder of tensorflow,
+   - `tensorflow/contrib/makefile/downloads/nsync/public`
    - `tensorflow/contrib/makefile/downloads/protobuf/src`
    - `tensorflow/contrib/makefile/downloads`,
    - `tensorflow/contrib/makefile/downloads/eigen`, and
diff --git a/tensorflow/examples/label_image/BUILD b/tensorflow/examples/label_image/BUILD
index 2abbe9dacca79b8d6e516550e28a9b203b18f123..c50fd93d03953b12113c17d420c4c5306a02ebe9 100644
--- a/tensorflow/examples/label_image/BUILD
+++ b/tensorflow/examples/label_image/BUILD
@@ -9,6 +9,8 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
+exports_files(["data/grace_hopper.jpg"])
+
 load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
 
 tf_cc_binary(
@@ -60,17 +62,3 @@ py_binary(
         "//tensorflow:tensorflow_py",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "bin/**",
-            "gen/**",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/examples/label_image/main.cc b/tensorflow/examples/label_image/main.cc
index 63bc39de6c0a420e03adada56cbc8b0f895b6155..baa65d3243ffbebdf3ccf8a786a2434dfb7cfdad 100644
--- a/tensorflow/examples/label_image/main.cc
+++ b/tensorflow/examples/label_image/main.cc
@@ -49,6 +49,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/init_main.h"
@@ -137,15 +138,15 @@ Status ReadTensorFromImageFile(const string& file_name, const int input_height,
   // Now try to figure out what kind of file it is and decode it.
   const int wanted_channels = 3;
   tensorflow::Output image_reader;
-  if (tensorflow::StringPiece(file_name).ends_with(".png")) {
+  if (tensorflow::str_util::EndsWith(file_name, ".png")) {
     image_reader = DecodePng(root.WithOpName("png_reader"), file_reader,
                              DecodePng::Channels(wanted_channels));
-  } else if (tensorflow::StringPiece(file_name).ends_with(".gif")) {
+  } else if (tensorflow::str_util::EndsWith(file_name, ".gif")) {
     // gif decoder returns 4-D tensor, remove the first dim
     image_reader =
         Squeeze(root.WithOpName("squeeze_first_dim"),
                 DecodeGif(root.WithOpName("gif_reader"), file_reader));
-  } else if (tensorflow::StringPiece(file_name).ends_with(".bmp")) {
+  } else if (tensorflow::str_util::EndsWith(file_name, ".bmp")) {
     image_reader = DecodeBmp(root.WithOpName("bmp_reader"), file_reader);
   } else {
     // Assume if it's neither a PNG nor a GIF then it must be a JPEG.
diff --git a/tensorflow/examples/learn/BUILD b/tensorflow/examples/learn/BUILD
index aba7f600b53cf8286d46ee70823a0a425944076f..bdbcb0b1638a400f12f66bb3c4ee9d852fe145d2 100644
--- a/tensorflow/examples/learn/BUILD
+++ b/tensorflow/examples/learn/BUILD
@@ -152,15 +152,3 @@ sh_test(
         "notap",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/examples/learn/mnist.py b/tensorflow/examples/learn/mnist.py
index 98819b20bfea5021d52e2c50b004bccdaf1f25e7..3ead8614b68959b95ccad43623d4df4a5c4665bd 100644
--- a/tensorflow/examples/learn/mnist.py
+++ b/tensorflow/examples/learn/mnist.py
@@ -61,8 +61,10 @@ def conv_model(features, labels, mode):
 
   # Densely connected layer with 1024 neurons.
   h_fc1 = tf.layers.dense(h_pool2_flat, 1024, activation=tf.nn.relu)
-  if mode == tf.estimator.ModeKeys.TRAIN:
-    h_fc1 = tf.layers.dropout(h_fc1, rate=0.5)
+  h_fc1 = tf.layers.dropout(
+      h_fc1, 
+      rate=0.5, 
+      training=(mode == tf.estimator.ModeKeys.TRAIN))
 
   # Compute logits (1 per class) and compute loss.
   logits = tf.layers.dense(h_fc1, N_DIGITS, activation=None)
diff --git a/tensorflow/examples/learn/resnet.py b/tensorflow/examples/learn/resnet.py
index 9542e552504580a6614f8bd2f43c38dfa795750f..c00de932a8707ad5717aaf1251cf5c88464a28b0 100755
--- a/tensorflow/examples/learn/resnet.py
+++ b/tensorflow/examples/learn/resnet.py
@@ -53,6 +53,8 @@ def res_net_model(features, labels, mode):
     ndim = int(sqrt(input_shape[1]))
     x = tf.reshape(x, [-1, ndim, ndim, 1])
 
+  training = (mode == tf.estimator.ModeKeys.TRAIN)
+  
   # First convolution expands to 64 channels
   with tf.variable_scope('conv_layer1'):
     net = tf.layers.conv2d(
@@ -60,7 +62,7 @@ def res_net_model(features, labels, mode):
         filters=64,
         kernel_size=7,
         activation=tf.nn.relu)
-    net = tf.layers.batch_normalization(net)
+    net = tf.layers.batch_normalization(net, training=training)
 
   # Max pool
   net = tf.layers.max_pooling2d(
@@ -88,7 +90,7 @@ def res_net_model(features, labels, mode):
             kernel_size=1,
             padding='valid',
             activation=tf.nn.relu)
-        conv = tf.layers.batch_normalization(conv)
+        conv = tf.layers.batch_normalization(conv, training=training)
 
       with tf.variable_scope(name + '/conv_bottleneck'):
         conv = tf.layers.conv2d(
@@ -97,7 +99,7 @@ def res_net_model(features, labels, mode):
             kernel_size=3,
             padding='same',
             activation=tf.nn.relu)
-        conv = tf.layers.batch_normalization(conv)
+        conv = tf.layers.batch_normalization(conv, training=training)
 
       # 1x1 convolution responsible for restoring dimension
       with tf.variable_scope(name + '/conv_out'):
@@ -108,7 +110,7 @@ def res_net_model(features, labels, mode):
             kernel_size=1,
             padding='valid',
             activation=tf.nn.relu)
-        conv = tf.layers.batch_normalization(conv)
+        conv = tf.layers.batch_normalization(conv, training=training)
 
       # shortcut connections that turn the network into its counterpart
       # residual function (identity shortcut)
@@ -154,7 +156,7 @@ def res_net_model(features, labels, mode):
   loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
 
   # Create training op.
-  if mode == tf.estimator.ModeKeys.TRAIN:
+  if training:
     optimizer = tf.train.AdagradOptimizer(learning_rate=0.01)
     train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
     return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
diff --git a/tensorflow/examples/multibox_detector/BUILD b/tensorflow/examples/multibox_detector/BUILD
index 91a5bfa51cda71ed2bca37869c7305d752e1e035..4f9908cd52d98acc20b9238d9a0fdff39284ea32 100644
--- a/tensorflow/examples/multibox_detector/BUILD
+++ b/tensorflow/examples/multibox_detector/BUILD
@@ -27,17 +27,3 @@ tf_cc_binary(
         "//tensorflow/core:tensorflow",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "bin/**",
-            "gen/**",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/examples/multibox_detector/main.cc b/tensorflow/examples/multibox_detector/main.cc
index e38704fd98cea6928231f2fc2bc989705ae46bb4..96ea525a4e74c68da17d0310f0ad475789314215 100644
--- a/tensorflow/examples/multibox_detector/main.cc
+++ b/tensorflow/examples/multibox_detector/main.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/platform/logging.h"
@@ -84,10 +85,10 @@ Status ReadTensorFromImageFile(const string& file_name, const int input_height,
   // Now try to figure out what kind of file it is and decode it.
   const int wanted_channels = 3;
   tensorflow::Output image_reader;
-  if (tensorflow::StringPiece(file_name).ends_with(".png")) {
+  if (tensorflow::str_util::EndsWith(file_name, ".png")) {
     image_reader = DecodePng(root.WithOpName("png_reader"), file_reader,
                              DecodePng::Channels(wanted_channels));
-  } else if (tensorflow::StringPiece(file_name).ends_with(".gif")) {
+  } else if (tensorflow::str_util::EndsWith(file_name, ".gif")) {
     image_reader = DecodeGif(root.WithOpName("gif_reader"), file_reader);
   } else {
     // Assume if it's neither a PNG nor a GIF then it must be a JPEG.
@@ -131,7 +132,7 @@ Status ReadTensorFromImageFile(const string& file_name, const int input_height,
 
 Status SaveImage(const Tensor& tensor, const string& file_path) {
   LOG(INFO) << "Saving image to " << file_path;
-  CHECK(tensorflow::StringPiece(file_path).ends_with(".png"))
+  CHECK(tensorflow::str_util::EndsWith(file_path, ".png"))
       << "Only saving of png files is supported.";
 
   auto root = tensorflow::Scope::NewRootScope();
diff --git a/tensorflow/examples/saved_model/BUILD b/tensorflow/examples/saved_model/BUILD
index 1cdf5ec6e1d80c8337d7929159860e093ad07364..ebefc6576d646467426a784d03f4be206aeaba38 100644
--- a/tensorflow/examples/saved_model/BUILD
+++ b/tensorflow/examples/saved_model/BUILD
@@ -8,19 +8,6 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "g3doc/sitemap.md",
-        ],
-    ),
-    visibility = ["//visibility:public"],
-)
-
 py_binary(
     name = "saved_model_half_plus_two",
     srcs = [
diff --git a/tensorflow/examples/speech_commands/BUILD b/tensorflow/examples/speech_commands/BUILD
index 12479211c32a965642d23226406617df6ff5a29c..13bca34a86b0c2fba7e5e8e3527d13587feacaae 100644
--- a/tensorflow/examples/speech_commands/BUILD
+++ b/tensorflow/examples/speech_commands/BUILD
@@ -245,15 +245,3 @@ tf_cc_binary(
         "//tensorflow/core:protos_all_cc",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/examples/tutorials/estimators/BUILD b/tensorflow/examples/tutorials/estimators/BUILD
index ecbc1a431d9a2173e80434b6f9350c225fc9bfb4..bab609f208b6ca3dd6daa8ecfd0c0c762ef87a22 100644
--- a/tensorflow/examples/tutorials/estimators/BUILD
+++ b/tensorflow/examples/tutorials/estimators/BUILD
@@ -20,15 +20,3 @@ py_binary(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/examples/tutorials/layers/BUILD b/tensorflow/examples/tutorials/layers/BUILD
index f8a29c79c63cb77d15ff03d0cf5c98ae36ccc3f8..aad78b18409bab1fe6924849ec5b61c6f3a052f7 100644
--- a/tensorflow/examples/tutorials/layers/BUILD
+++ b/tensorflow/examples/tutorials/layers/BUILD
@@ -19,15 +19,3 @@ py_binary(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/examples/tutorials/mnist/BUILD b/tensorflow/examples/tutorials/mnist/BUILD
index 6d4e67063d8470788a74e0083b62a2db12dd7c64..aa1b2ec2db34f3cb0350bfde88a1598ed71456de 100644
--- a/tensorflow/examples/tutorials/mnist/BUILD
+++ b/tensorflow/examples/tutorials/mnist/BUILD
@@ -132,15 +132,3 @@ py_test(
         "//tensorflow:tensorflow_py",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/examples/tutorials/monitors/BUILD b/tensorflow/examples/tutorials/monitors/BUILD
index 4220e8144de1259dc5bd873ddb5810bf95dcafae..1c49e3fe5390ad48a3dea7cd5688996270b1dc9d 100644
--- a/tensorflow/examples/tutorials/monitors/BUILD
+++ b/tensorflow/examples/tutorials/monitors/BUILD
@@ -23,15 +23,3 @@ py_binary(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/examples/tutorials/word2vec/BUILD b/tensorflow/examples/tutorials/word2vec/BUILD
index 42d6355b4f06258a3c22d0ef324bb31880f2d9a3..2e19c038bdf04235ccd2f4fdbfeff250ca72a07e 100644
--- a/tensorflow/examples/tutorials/word2vec/BUILD
+++ b/tensorflow/examples/tutorials/word2vec/BUILD
@@ -13,19 +13,11 @@ py_binary(
         "word2vec_basic.py",
     ],
     srcs_version = "PY2AND3",
+    tags = [
+        "no-internal-py3",
+    ],
     deps = [
         "//tensorflow:tensorflow_py",
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/examples/wav_to_spectrogram/BUILD b/tensorflow/examples/wav_to_spectrogram/BUILD
index c99870c686c18c1b201ec44c8335b3d9ba24f5a1..cc8835728d59b6a57d46167686987aa34ab9d0a0 100644
--- a/tensorflow/examples/wav_to_spectrogram/BUILD
+++ b/tensorflow/examples/wav_to_spectrogram/BUILD
@@ -49,17 +49,3 @@ tf_cc_test(
         "//tensorflow/core:test_main",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "bin/**",
-            "gen/**",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 469d1e9adb73849e2b3cf963ac9b8fb2a0861c35..0fd2177df7c4a79e12de58d377834915f7355532 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -38,188 +38,6 @@ func makeOutputList(op *tf.Operation, start int, output string) ([]tf.Output, in
 	return list, start + size, nil
 }
 
-// WriteImageSummaryAttr is an optional argument to WriteImageSummary.
-type WriteImageSummaryAttr func(optionalAttr)
-
-// WriteImageSummaryMaxImages sets the optional max_images attribute to value.
-//
-// value: Max number of batch elements to generate images for.
-// If not specified, defaults to 3
-//
-// REQUIRES: value >= 1
-func WriteImageSummaryMaxImages(value int64) WriteImageSummaryAttr {
-	return func(m optionalAttr) {
-		m["max_images"] = value
-	}
-}
-
-// Writes a `Summary` protocol buffer with images.
-//
-// The summary has up to `max_images` summary values containing images. The
-// images are built from `tensor` which must be 4-D with shape `[batch_size,
-// height, width, channels]` and where `channels` can be:
-//
-// *  1: `tensor` is interpreted as Grayscale.
-// *  3: `tensor` is interpreted as RGB.
-// *  4: `tensor` is interpreted as RGBA.
-//
-// The images have the same number of channels as the input tensor. For float
-// input, the values are normalized one image at a time to fit in the range
-// `[0, 255]`.  `uint8` values are unchanged.  The op uses two different
-// normalization algorithms:
-//
-// *  If the input values are all positive, they are rescaled so the largest one
-//    is 255.
-//
-// *  If any input value is negative, the values are shifted so input value 0.0
-//    is at 127.  They are then rescaled so that either the smallest value is 0,
-//    or the largest one is 255.
-//
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
-//
-// *  If `max_images` is 1, the summary value tag is '*tag*/image'.
-// *  If `max_images` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
-//
-// The `bad_color` argument is the color to use in the generated images for
-// non-finite input values.  It is a `unit8` 1-D tensor of length `channels`.
-// Each element must be in the range `[0, 255]` (It represents the value of a
-// pixel in the output image).  Non-finite values in the input tensor are
-// replaced by this tensor in the output image.  The default value is the color
-// red.
-//
-// Arguments:
-//	writer: A handle to a summary writer.
-//	step: The step to write the summary for.
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 4-D of shape `[batch_size, height, width, channels]` where
-// `channels` is 1, 3, or 4.
-//	bad_color: Color to use for pixels with non-finite values.
-//
-// Returns the created operation.
-func WriteImageSummary(scope *Scope, writer tf.Output, step tf.Output, tag tf.Output, tensor tf.Output, bad_color tf.Output, optional ...WriteImageSummaryAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "WriteImageSummary",
-		Input: []tf.Input{
-			writer, step, tag, tensor, bad_color,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Outputs a `tf.Event` protocol buffer.
-//
-// When CreateSummaryDbWriter is being used, this op can be useful for
-// importing data from event logs.
-//
-// Arguments:
-//	writer: A handle to a summary writer.
-//	event: A string containing a binary-encoded tf.Event proto.
-//
-// Returns the created operation.
-func ImportEvent(scope *Scope, writer tf.Output, event tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ImportEvent",
-		Input: []tf.Input{
-			writer, event,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Outputs a `Summary` protocol buffer with a tensor.
-//
-// Arguments:
-//	writer: A handle to a summary writer.
-//	step: The step to write the summary for.
-//	tensor: A tensor to serialize.
-//	tag: The summary's tag.
-//	summary_metadata: Serialized SummaryMetadata protocol buffer containing
-// plugin-related metadata for this summary.
-//
-// Returns the created operation.
-func WriteSummary(scope *Scope, writer tf.Output, step tf.Output, tensor tf.Output, tag tf.Output, summary_metadata tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "WriteSummary",
-		Input: []tf.Input{
-			writer, step, tensor, tag, summary_metadata,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Creates summary database writer accessible by given resource handle.
-//
-// This can be used to write tensors from the execution graph directly
-// to a database. Only SQLite is supported right now. This function
-// will create the schema if it doesn't exist. Entries in the Users,
-// Experiments, and Runs tables will be created automatically if they
-// don't already exist.
-//
-// Arguments:
-//	writer: Handle to SummaryWriter resource to overwrite.
-//	db_uri: For example "file:/tmp/foo.sqlite".
-//	experiment_name: Can't contain ASCII control characters or <>. Case
-// sensitive. If empty, then the Run will not be associated with any
-// Experiment.
-//	run_name: Can't contain ASCII control characters or <>. Case sensitive.
-// If empty, then each Tag will not be associated with any Run.
-//	user_name: Must be valid as both a DNS label and Linux username. If
-// empty, then the Experiment will not be associated with any User.
-//
-// Returns the created operation.
-func CreateSummaryDbWriter(scope *Scope, writer tf.Output, db_uri tf.Output, experiment_name tf.Output, run_name tf.Output, user_name tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "CreateSummaryDbWriter",
-		Input: []tf.Input{
-			writer, db_uri, experiment_name, run_name, user_name,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Creates a summary file writer accessible by the given resource handle.
-//
-// Arguments:
-//	writer: A handle to the summary writer resource
-//	logdir: Directory where the event file will be written.
-//	max_queue: Size of the queue of pending events and summaries.
-//	flush_millis: How often, in milliseconds, to flush the pending events and
-// summaries to disk.
-//	filename_suffix: Every event file's name is suffixed with this suffix.
-//
-// Returns the created operation.
-func CreateSummaryFileWriter(scope *Scope, writer tf.Output, logdir tf.Output, max_queue tf.Output, flush_millis tf.Output, filename_suffix tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "CreateSummaryFileWriter",
-		Input: []tf.Input{
-			writer, logdir, max_queue, flush_millis, filename_suffix,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
 // FakeQuantWithMinMaxVarsPerChannelGradientAttr is an optional argument to FakeQuantWithMinMaxVarsPerChannelGradient.
 type FakeQuantWithMinMaxVarsPerChannelGradientAttr func(optionalAttr)
 
@@ -509,481 +327,784 @@ func FakeQuantWithMinMaxArgs(scope *Scope, inputs tf.Output, optional ...FakeQua
 	return op.Output(0)
 }
 
-// Replaces the contents of the table with the specified keys and values.
+// Scatter `updates` into a new (initially zero) tensor according to `indices`.
 //
-// The tensor `keys` must be of the same type as the keys of the table.
-// The tensor `values` must be of the type of the table values.
+// Creates a new tensor by applying sparse `updates` to individual
+// values or slices within a zero tensor of the given `shape` according to
+// indices.  This operator is the inverse of the @{tf.gather_nd} operator which
+// extracts values or slices from a given tensor.
+//
+// **WARNING**: The order in which updates are applied is nondeterministic, so the
+// output will be nondeterministic if `indices` contains duplicates.
+//
+// `indices` is an integer tensor containing indices into a new tensor of shape
+// `shape`.  The last dimension of `indices` can be at most the rank of `shape`:
+//
+//     indices.shape[-1] <= shape.rank
+//
+// The last dimension of `indices` corresponds to indices into elements
+// (if `indices.shape[-1] = shape.rank`) or slices
+// (if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of
+// `shape`.  `updates` is a tensor with shape
+//
+//     indices.shape[:-1] + shape[indices.shape[-1]:]
+//
+// The simplest form of scatter is to insert individual elements in a tensor by
+// index. For example, say we want to insert 4 scattered elements in a rank-1
+// tensor with 8 elements.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/ScatterNd1.png" alt>
+// </div>
+//
+// In Python, this scatter operation would look like this:
+//
+// ```python
+//     indices = tf.constant([[4], [3], [1], [7]])
+//     updates = tf.constant([9, 10, 11, 12])
+//     shape = tf.constant([8])
+//     scatter = tf.scatter_nd(indices, updates, shape)
+//     with tf.Session() as sess:
+//       print(sess.run(scatter))
+// ```
+//
+// The resulting tensor would look like this:
+//
+//     [0, 11, 0, 10, 9, 0, 0, 12]
+//
+// We can also, insert entire slices of a higher rank tensor all at once. For
+// example, if we wanted to insert two slices in the first dimension of a
+// rank-3 tensor with two matrices of new values.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/ScatterNd2.png" alt>
+// </div>
+//
+// In Python, this scatter operation would look like this:
+//
+// ```python
+//     indices = tf.constant([[0], [2]])
+//     updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6],
+//                             [7, 7, 7, 7], [8, 8, 8, 8]],
+//                            [[5, 5, 5, 5], [6, 6, 6, 6],
+//                             [7, 7, 7, 7], [8, 8, 8, 8]]])
+//     shape = tf.constant([4, 4, 4])
+//     scatter = tf.scatter_nd(indices, updates, shape)
+//     with tf.Session() as sess:
+//       print(sess.run(scatter))
+// ```
+//
+// The resulting tensor would look like this:
+//
+//     [[[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
+//      [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
+//      [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
+//      [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]
 //
 // Arguments:
-//	table_handle: Handle to the table.
-//	keys: Any shape.  Keys to look up.
-//	values: Values to associate with keys.
+//	indices: Index tensor.
+//	updates: Updates to scatter into output.
+//	shape: 1-D. The shape of the resulting tensor.
 //
-// Returns the created operation.
-func LookupTableImportV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
+// Returns A new tensor with the given shape and updates applied according
+// to the indices.
+func ScatterNd(scope *Scope, indices tf.Output, updates tf.Output, shape tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LookupTableImportV2",
+		Type: "ScatterNd",
 		Input: []tf.Input{
-			table_handle, keys, values,
+			indices, updates, shape,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MapPeekAttr is an optional argument to MapPeek.
-type MapPeekAttr func(optionalAttr)
+// QuantizeAndDequantizeV2Attr is an optional argument to QuantizeAndDequantizeV2.
+type QuantizeAndDequantizeV2Attr func(optionalAttr)
 
-// MapPeekCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// QuantizeAndDequantizeV2SignedInput sets the optional signed_input attribute to value.
 //
-// REQUIRES: value >= 0
-func MapPeekCapacity(value int64) MapPeekAttr {
+// value: If the quantization is signed or unsigned.
+// If not specified, defaults to true
+func QuantizeAndDequantizeV2SignedInput(value bool) QuantizeAndDequantizeV2Attr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["signed_input"] = value
 	}
 }
 
-// MapPeekMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// QuantizeAndDequantizeV2NumBits sets the optional num_bits attribute to value.
 //
-// REQUIRES: value >= 0
-func MapPeekMemoryLimit(value int64) MapPeekAttr {
+// value: The bitwidth of the quantization.
+// If not specified, defaults to 8
+func QuantizeAndDequantizeV2NumBits(value int64) QuantizeAndDequantizeV2Attr {
 	return func(m optionalAttr) {
-		m["memory_limit"] = value
+		m["num_bits"] = value
 	}
 }
 
-// MapPeekContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapPeekContainer(value string) MapPeekAttr {
+// QuantizeAndDequantizeV2RangeGiven sets the optional range_given attribute to value.
+//
+// value: If the range is given or should be computed from the tensor.
+// If not specified, defaults to false
+func QuantizeAndDequantizeV2RangeGiven(value bool) QuantizeAndDequantizeV2Attr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["range_given"] = value
 	}
 }
 
-// MapPeekSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapPeekSharedName(value string) MapPeekAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op peeks at the values at the specified key.  If the
+// Quantizes then dequantizes a tensor.
 //
-// underlying container does not contain this key
-// this op will block until it does.
-func MapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...MapPeekAttr) (values []tf.Output) {
+// This op simulates the precision loss from the quantized forward pass by:
+// 1. Quantizing the tensor to fixed point numbers, which should match the target
+//    quantization method when it is used in inference.
+// 2. Dequantizing it back to floating point numbers for the following ops, most
+//    likely matmul.
+//
+// There are different ways to quantize. This version does not use the full range
+// of the output type, choosing to elide the lowest possible value for symmetry
+// (e.g., output range is -127 to 127, not -128 to 127 for signed 8 bit
+// quantization), so that 0.0 maps to 0.
+//
+// To perform this op, we first find the range of values in our tensor. The range
+// we use is always centered on 0, so we find m such that
+//
+// 1. m = max(abs(input_min), abs(input_max)) if range_given is true,
+// 2. m = max(abs(min_elem(input)), abs(max_elem(input))) otherwise.
+//
+// Our input tensor range is then [-m, m].
+//
+// Next, we choose our fixed-point quantization buckets, [min_fixed, max_fixed].
+// If signed_input is true, this is
+//
+//   [min_fixed, max_fixed ] =
+//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1].
+//
+// Otherwise, if signed_input is false, the fixed-point range is
+//
+//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1].
+//
+// From this we compute our scaling factor, s:
+//
+//   s = (max_fixed - min_fixed) / (2 * m).
+//
+// Now we can quantize and dequantize the elements of our tensor.  An element e
+// is transformed into e':
+//
+//   e' = (e * s).round_to_nearest() / s.
+//
+// Note that we have a different number of buckets in the signed vs. unsigned
+// cases.  For example, if num_bits == 8, we get 254 buckets in the signed case
+// vs. 255 in the unsigned case.
+//
+// For example, suppose num_bits = 8 and m = 1.  Then
+//
+//   [min_fixed, max_fixed] = [-127, 127], and
+//   s = (127 + 127) / 2 = 127.
+//
+// Given the vector {-1, -0.5, 0, 0.3}, this is quantized to
+// {-127, -63, 0, 38}, and dequantized to {-1, -63.0/127, 0, 38.0/127}.
+//
+// Arguments:
+//	input: Tensor to quantize and then dequantize.
+//	input_min: If range_given, this is the min of the range, otherwise this input
+// will be ignored.
+//	input_max: If range_given, this is the max of the range, otherwise this input
+// will be ignored.
+func QuantizeAndDequantizeV2(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, optional ...QuantizeAndDequantizeV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MapPeek",
+		Type: "QuantizeAndDequantizeV2",
 		Input: []tf.Input{
-			key, indices,
+			input, input_min, input_max,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("MapPeek", err)
-		return
-	}
-	return values
+	return op.Output(0)
 }
 
-// Returns (x - y)(x - y) element-wise.
+// Bitcasts a tensor from one type to another without copying data.
 //
-// *NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func SquaredDifference(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Given a tensor `input`, this operation returns a tensor that has the same buffer
+// data as `input` with datatype `type`.
+//
+// If the input datatype `T` is larger than the output datatype `type` then the
+// shape changes from [...] to [..., sizeof(`T`)/sizeof(`type`)].
+//
+// If `T` is smaller than `type`, the operator requires that the rightmost
+// dimension be equal to sizeof(`type`)/sizeof(`T`). The shape then goes from
+// [..., sizeof(`type`)/sizeof(`T`)] to [...].
+//
+// *NOTE*: Bitcast is implemented as a low-level cast, so machines with different
+// endian orderings will give different results.
+func Bitcast(scope *Scope, input tf.Output, type_ tf.DataType) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"type": type_}
 	opspec := tf.OpSpec{
-		Type: "SquaredDifference",
+		Type: "Bitcast",
 		Input: []tf.Input{
-			x, y,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Forwards the input to the output.
-//
-// This operator represents the loop termination condition used by the
-// "pivot" switches of a loop.
+// Extract `patches` from `images` and put them in the "depth" output dimension.
 //
 // Arguments:
-//	input: A boolean scalar, representing the branch predicate of the Switch op.
+//	images: 4-D Tensor with shape `[batch, in_rows, in_cols, depth]`.
+//	ksizes: The size of the sliding window for each dimension of `images`.
+//	strides: 1-D of length 4. How far the centers of two consecutive patches are in
+// the images. Must be: `[1, stride_rows, stride_cols, 1]`.
+//	rates: 1-D of length 4. Must be: `[1, rate_rows, rate_cols, 1]`. This is the
+// input stride, specifying how far two consecutive patch samples are in the
+// input. Equivalent to extracting patches with
+// `patch_sizes_eff = patch_sizes + (patch_sizes - 1) * (rates - 1)`, followed by
+// subsampling them spatially by a factor of `rates`. This is equivalent to
+// `rate` in dilated (a.k.a. Atrous) convolutions.
+//	padding: The type of padding algorithm to use.
 //
-// Returns The same tensor as `input`.
-func LoopCond(scope *Scope, input tf.Output) (output tf.Output) {
+// We specify the size-related attributes as:
+//
+// ```python
+//       ksizes = [1, ksize_rows, ksize_cols, 1]
+//       strides = [1, strides_rows, strides_cols, 1]
+//       rates = [1, rates_rows, rates_cols, 1]
+// ```
+//
+// Returns 4-D Tensor with shape `[batch, out_rows, out_cols, ksize_rows *
+// ksize_cols * depth]` containing image patches with size
+// `ksize_rows x ksize_cols x depth` vectorized in the "depth" dimension. Note
+// `out_rows` and `out_cols` are the dimensions of the output patches.
+func ExtractImagePatches(scope *Scope, images tf.Output, ksizes []int64, strides []int64, rates []int64, padding string) (patches tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"ksizes": ksizes, "strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "LoopCond",
+		Type: "ExtractImagePatches",
 		Input: []tf.Input{
-			input,
+			images,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizedMulAttr is an optional argument to QuantizedMul.
-type QuantizedMulAttr func(optionalAttr)
+// SpaceToDepthAttr is an optional argument to SpaceToDepth.
+type SpaceToDepthAttr func(optionalAttr)
 
-// QuantizedMulToutput sets the optional Toutput attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedMulToutput(value tf.DataType) QuantizedMulAttr {
+// SpaceToDepthDataFormat sets the optional data_format attribute to value.
+// If not specified, defaults to "NHWC"
+func SpaceToDepthDataFormat(value string) SpaceToDepthAttr {
 	return func(m optionalAttr) {
-		m["Toutput"] = value
+		m["data_format"] = value
 	}
 }
 
-// Returns x * y element-wise, working on quantized buffers.
+// SpaceToDepth for tensors of type T.
 //
-// Arguments:
+// Rearranges blocks of spatial data, into depth. More specifically,
+// this op outputs a copy of the input tensor where values from the `height`
+// and `width` dimensions are moved to the `depth` dimension.
+// The attr `block_size` indicates the input block size.
 //
+//   * Non-overlapping blocks of size `block_size x block size` are rearranged
+//     into depth at each location.
+//   * The depth of the output tensor is `block_size * block_size * input_depth`.
+//   * The Y, X coordinates within each block of the input become the high order
+//     component of the output channel index.
+//   * The input tensor's height and width must be divisible by block_size.
 //
-//	min_x: The float value that the lowest quantized `x` value represents.
-//	max_x: The float value that the highest quantized `x` value represents.
-//	min_y: The float value that the lowest quantized `y` value represents.
-//	max_y: The float value that the highest quantized `y` value represents.
+// The `data_format` attr specifies the layout of the input and output tensors
+// with the following options:
+//   "NHWC": `[ batch, height, width, channels ]`
+//   "NCHW": `[ batch, channels, height, width ]`
+//   "NCHW_VECT_C":
+//       `qint8 [ batch, channels / 4, height, width, 4 ]`
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+// It is useful to consider the operation as transforming a 6-D Tensor.
+// e.g. for data_format = NHWC,
+//      Each element in the input tensor can be specified via 6 coordinates,
+//      ordered by decreasing memory layout significance as:
+//      n,oY,bY,oX,bX,iC  (where n=batch index, oX, oY means X or Y coordinates
+//                         within the output image, bX, bY means coordinates
+//                         within the input block, iC means input channels).
+//      The output would be a transpose to the following layout:
+//      n,oY,oX,bY,bX,iC
 //
-// *NOTE*: `QuantizedMul` supports limited forms of broadcasting. More about
-// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func QuantizedMul(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedMulAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) {
+// This operation is useful for resizing the activations between convolutions
+// (but keeping all data), e.g. instead of pooling. It is also useful for training
+// purely convolutional models.
+//
+// For example, given an input of shape `[1, 2, 2, 1]`, data_format = "NHWC" and
+// block_size = 2:
+//
+// ```
+// x = [[[[1], [2]],
+//       [[3], [4]]]]
+// ```
+//
+// This operation will output a tensor of shape `[1, 1, 1, 4]`:
+//
+// ```
+// [[[[1, 2, 3, 4]]]]
+// ```
+//
+// Here, the input has a batch of 1 and each batch element has shape `[2, 2, 1]`,
+// the corresponding output will have a single element (i.e. width and height are
+// both 1) and will have a depth of 4 channels (1 * block_size * block_size).
+// The output element shape is `[1, 1, 4]`.
+//
+// For an input tensor with larger depth, here of shape `[1, 2, 2, 3]`, e.g.
+//
+// ```
+// x = [[[[1, 2, 3], [4, 5, 6]],
+//       [[7, 8, 9], [10, 11, 12]]]]
+// ```
+//
+// This operation, for block_size of 2, will return the following tensor of shape
+// `[1, 1, 1, 12]`
+//
+// ```
+// [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]
+// ```
+//
+// Similarly, for the following input of shape `[1 4 4 1]`, and a block size of 2:
+//
+// ```
+// x = [[[[1],   [2],  [5],  [6]],
+//       [[3],   [4],  [7],  [8]],
+//       [[9],  [10], [13],  [14]],
+//       [[11], [12], [15],  [16]]]]
+// ```
+//
+// the operator will return the following tensor of shape `[1 2 2 4]`:
+//
+// ```
+// x = [[[[1, 2, 3, 4],
+//        [5, 6, 7, 8]],
+//       [[9, 10, 11, 12],
+//        [13, 14, 15, 16]]]]
+// ```
+//
+// Arguments:
+//
+//	block_size: The size of the spatial block.
+func SpaceToDepth(scope *Scope, input tf.Output, block_size int64, optional ...SpaceToDepthAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"block_size": block_size}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedMul",
+		Type: "SpaceToDepth",
 		Input: []tf.Input{
-			x, y, min_x, max_x, min_y, max_y,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// QuantizedMatMulAttr is an optional argument to QuantizedMatMul.
-type QuantizedMatMulAttr func(optionalAttr)
-
-// QuantizedMatMulToutput sets the optional Toutput attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedMatMulToutput(value tf.DataType) QuantizedMatMulAttr {
-	return func(m optionalAttr) {
-		m["Toutput"] = value
-	}
+	return op.Output(0)
 }
 
-// QuantizedMatMulTransposeA sets the optional transpose_a attribute to value.
+// SpaceToBatch for 4-D tensors of type T.
 //
-// value: If true, `a` is transposed before multiplication.
-// If not specified, defaults to false
-func QuantizedMatMulTransposeA(value bool) QuantizedMatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_a"] = value
-	}
-}
-
-// QuantizedMatMulTransposeB sets the optional transpose_b attribute to value.
+// This is a legacy version of the more general SpaceToBatchND.
 //
-// value: If true, `b` is transposed before multiplication.
-// If not specified, defaults to false
-func QuantizedMatMulTransposeB(value bool) QuantizedMatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_b"] = value
-	}
-}
-
-// QuantizedMatMulTactivation sets the optional Tactivation attribute to value.
+// Zero-pads and then rearranges (permutes) blocks of spatial data into batch.
+// More specifically, this op outputs a copy of the input tensor where values from
+// the `height` and `width` dimensions are moved to the `batch` dimension. After
+// the zero-padding, both `height` and `width` of the input must be divisible by the
+// block size.
 //
-// value: The type of output produced by activation function
-// following this operation.
-// If not specified, defaults to DT_QUINT8
-func QuantizedMatMulTactivation(value tf.DataType) QuantizedMatMulAttr {
-	return func(m optionalAttr) {
-		m["Tactivation"] = value
-	}
-}
-
-// Perform a quantized matrix multiplication of  `a` by the matrix `b`.
+// Arguments:
+//	input: 4-D with shape `[batch, height, width, depth]`.
+//	paddings: 2-D tensor of non-negative integers with shape `[2, 2]`. It specifies
+//   the padding of the input with zeros across the spatial dimensions as follows:
 //
-// The inputs must be two-dimensional matrices and the inner dimension of
-// `a` (after being transposed if `transpose_a` is non-zero) must match the
-// outer dimension of `b` (after being transposed if `transposed_b` is
-// non-zero).
+//       paddings = [[pad_top, pad_bottom], [pad_left, pad_right]]
 //
-// Arguments:
-//	a: Must be a two-dimensional tensor.
-//	b: Must be a two-dimensional tensor.
-//	min_a: The float value that the lowest quantized `a` value represents.
-//	max_a: The float value that the highest quantized `a` value represents.
-//	min_b: The float value that the lowest quantized `b` value represents.
-//	max_b: The float value that the highest quantized `b` value represents.
+//   The effective spatial dimensions of the zero-padded input tensor will be:
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedMatMul(scope *Scope, a tf.Output, b tf.Output, min_a tf.Output, max_a tf.Output, min_b tf.Output, max_b tf.Output, optional ...QuantizedMatMulAttr) (out tf.Output, min_out tf.Output, max_out tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "QuantizedMatMul",
-		Input: []tf.Input{
-			a, b, min_a, max_a, min_b, max_b,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// A placeholder op that passes through `input` when its output is not fed.
+//       height_pad = pad_top + height + pad_bottom
+//       width_pad = pad_left + width + pad_right
 //
-// Arguments:
-//	input: The default value to produce when `output` is not fed.
-//	shape: The (possibly partial) shape of the tensor.
+// The attr `block_size` must be greater than one. It indicates the block size.
 //
-// Returns A placeholder tensor that defaults to `input` if it is not fed.
-func PlaceholderWithDefault(scope *Scope, input tf.Output, shape tf.Shape) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"shape": shape}
-	opspec := tf.OpSpec{
-		Type: "PlaceholderWithDefault",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns the complex conjugate of a complex number.
+//   * Non-overlapping blocks of size `block_size x block size` in the height and
+//     width dimensions are rearranged into the batch dimension at each location.
+//   * The batch of the output tensor is `batch * block_size * block_size`.
+//   * Both height_pad and width_pad must be divisible by block_size.
 //
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// complex numbers that are the complex conjugate of each element in `input`. The
-// complex numbers in `input` must be of the form \\(a + bj\\), where *a* is the
-// real part and *b* is the imaginary part.
+// The shape of the output will be:
 //
-// The complex conjugate returned by this operation is of the form \\(a - bj\\).
+//     [batch*block_size*block_size, height_pad/block_size, width_pad/block_size,
+//      depth]
 //
-// For example:
+// Some examples:
+//
+// (1) For the following input of shape `[1, 2, 2, 1]` and block_size of 2:
 //
 // ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j]
+// x = [[[[1], [2]], [[3], [4]]]]
 // ```
-func Conj(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Conj",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceSparseApplyMomentumAttr is an optional argument to ResourceSparseApplyMomentum.
-type ResourceSparseApplyMomentumAttr func(optionalAttr)
-
-// ResourceSparseApplyMomentumUseLocking sets the optional use_locking attribute to value.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyMomentumUseLocking(value bool) ResourceSparseApplyMomentumAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// ResourceSparseApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
+// The output tensor has shape `[4, 1, 1, 1]` and value:
 //
-// value: If `True`, the tensor passed to compute grad will be
-// var - lr * momentum * accum, so in the end, the var you get is actually
-// var - lr * momentum * accum.
-// If not specified, defaults to false
-func ResourceSparseApplyMomentumUseNesterov(value bool) ResourceSparseApplyMomentumAttr {
-	return func(m optionalAttr) {
-		m["use_nesterov"] = value
-	}
-}
-
-// Update relevant entries in '*var' and '*accum' according to the momentum scheme.
+// ```
+// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
+// ```
 //
-// Set use_nesterov = True if you want to use Nesterov momentum.
+// (2) For the following input of shape `[1, 2, 2, 3]` and block_size of 2:
 //
-// That is for rows we have grad for, we update var and accum as follows:
+// ```
+// x = [[[[1, 2, 3], [4, 5, 6]],
+//       [[7, 8, 9], [10, 11, 12]]]]
+// ```
 //
-// accum = accum * momentum + grad
-// var -= lr * accum
+// The output tensor has shape `[4, 1, 1, 3]` and value:
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	momentum: Momentum. Must be a scalar.
+// ```
+// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
+// ```
 //
-// Returns the created operation.
-func ResourceSparseApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, momentum tf.Output, optional ...ResourceSparseApplyMomentumAttr) (o *tf.Operation) {
+// (3) For the following input of shape `[1, 4, 4, 1]` and block_size of 2:
+//
+// ```
+// x = [[[[1],   [2],  [3],  [4]],
+//       [[5],   [6],  [7],  [8]],
+//       [[9],  [10], [11],  [12]],
+//       [[13], [14], [15],  [16]]]]
+// ```
+//
+// The output tensor has shape `[4, 2, 2, 1]` and value:
+//
+// ```
+// x = [[[[1], [3]], [[9], [11]]],
+//      [[[2], [4]], [[10], [12]]],
+//      [[[5], [7]], [[13], [15]]],
+//      [[[6], [8]], [[14], [16]]]]
+// ```
+//
+// (4) For the following input of shape `[2, 2, 4, 1]` and block_size of 2:
+//
+// ```
+// x = [[[[1],   [2],  [3],  [4]],
+//       [[5],   [6],  [7],  [8]]],
+//      [[[9],  [10], [11],  [12]],
+//       [[13], [14], [15],  [16]]]]
+// ```
+//
+// The output tensor has shape `[8, 1, 2, 1]` and value:
+//
+// ```
+// x = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],
+//      [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]
+// ```
+//
+// Among others, this operation is useful for reducing atrous convolution into
+// regular convolution.
+//
+func SpaceToBatch(scope *Scope, input tf.Output, paddings tf.Output, block_size int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"block_size": block_size}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyMomentum",
+		Type: "SpaceToBatch",
 		Input: []tf.Input{
-			var_, accum, lr, grad, indices, momentum,
+			input, paddings,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Creates a sequence of numbers.
-//
-// This operation creates a sequence of numbers that begins at `start` and
-// extends by increments of `delta` up to but not including `limit`.
+// SpaceToBatch for N-D tensors of type T.
 //
-// For example:
+// This operation divides "spatial" dimensions `[1, ..., M]` of the input into a
+// grid of blocks of shape `block_shape`, and interleaves these blocks with the
+// "batch" dimension (0) such that in the output, the spatial dimensions
+// `[1, ..., M]` correspond to the position within the grid, and the batch
+// dimension combines both the position within a spatial block and the original
+// batch position.  Prior to division into blocks, the spatial dimensions of the
+// input are optionally zero padded according to `paddings`.  See below for a
+// precise description.
+//
+// Arguments:
+//	input: N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,
+// where spatial_shape has `M` dimensions.
+//	block_shape: 1-D with shape `[M]`, all values must be >= 1.
+//	paddings: 2-D with shape `[M, 2]`, all values must be >= 0.
+//   `paddings[i] = [pad_start, pad_end]` specifies the padding for input dimension
+//   `i + 1`, which corresponds to spatial dimension `i`.  It is required that
+//   `block_shape[i]` divides `input_shape[i + 1] + pad_start + pad_end`.
+//
+// This operation is equivalent to the following steps:
+//
+// 1. Zero-pad the start and end of dimensions `[1, ..., M]` of the
+//    input according to `paddings` to produce `padded` of shape `padded_shape`.
+//
+// 2. Reshape `padded` to `reshaped_padded` of shape:
+//
+//      [batch] +
+//      [padded_shape[1] / block_shape[0],
+//        block_shape[0],
+//       ...,
+//       padded_shape[M] / block_shape[M-1],
+//       block_shape[M-1]] +
+//      remaining_shape
+//
+// 3. Permute dimensions of `reshaped_padded` to produce
+//    `permuted_reshaped_padded` of shape:
+//
+//      block_shape +
+//      [batch] +
+//      [padded_shape[1] / block_shape[0],
+//       ...,
+//       padded_shape[M] / block_shape[M-1]] +
+//      remaining_shape
+//
+// 4. Reshape `permuted_reshaped_padded` to flatten `block_shape` into the batch
+//    dimension, producing an output tensor of shape:
+//
+//      [batch * prod(block_shape)] +
+//      [padded_shape[1] / block_shape[0],
+//       ...,
+//       padded_shape[M] / block_shape[M-1]] +
+//      remaining_shape
+//
+// Some examples:
+//
+// (1) For the following input of shape `[1, 2, 2, 1]`, `block_shape = [2, 2]`, and
+//     `paddings = [[0, 0], [0, 0]]`:
 //
 // ```
-// # 'start' is 3
-// # 'limit' is 18
-// # 'delta' is 3
-// tf.range(start, limit, delta) ==> [3, 6, 9, 12, 15]
+// x = [[[[1], [2]], [[3], [4]]]]
 // ```
 //
-// Arguments:
-//	start: 0-D (scalar). First entry in the sequence.
-//	limit: 0-D (scalar). Upper limit of sequence, exclusive.
-//	delta: 0-D (scalar). Optional. Default is 1. Number that increments `start`.
+// The output tensor has shape `[4, 1, 1, 1]` and value:
 //
-// Returns 1-D.
-func Range(scope *Scope, start tf.Output, limit tf.Output, delta tf.Output) (output tf.Output) {
+// ```
+// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
+// ```
+//
+// (2) For the following input of shape `[1, 2, 2, 3]`, `block_shape = [2, 2]`, and
+//     `paddings = [[0, 0], [0, 0]]`:
+//
+// ```
+// x = [[[[1, 2, 3], [4, 5, 6]],
+//       [[7, 8, 9], [10, 11, 12]]]]
+// ```
+//
+// The output tensor has shape `[4, 1, 1, 3]` and value:
+//
+// ```
+// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
+// ```
+//
+// (3) For the following input of shape `[1, 4, 4, 1]`, `block_shape = [2, 2]`, and
+//     `paddings = [[0, 0], [0, 0]]`:
+//
+// ```
+// x = [[[[1],   [2],  [3],  [4]],
+//       [[5],   [6],  [7],  [8]],
+//       [[9],  [10], [11],  [12]],
+//       [[13], [14], [15],  [16]]]]
+// ```
+//
+// The output tensor has shape `[4, 2, 2, 1]` and value:
+//
+// ```
+// x = [[[[1], [3]], [[9], [11]]],
+//      [[[2], [4]], [[10], [12]]],
+//      [[[5], [7]], [[13], [15]]],
+//      [[[6], [8]], [[14], [16]]]]
+// ```
+//
+// (4) For the following input of shape `[2, 2, 4, 1]`, block_shape = `[2, 2]`, and
+//     paddings = `[[0, 0], [2, 0]]`:
+//
+// ```
+// x = [[[[1],   [2],  [3],  [4]],
+//       [[5],   [6],  [7],  [8]]],
+//      [[[9],  [10], [11],  [12]],
+//       [[13], [14], [15],  [16]]]]
+// ```
+//
+// The output tensor has shape `[8, 1, 3, 1]` and value:
+//
+// ```
+// x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
+//      [[[0], [2], [4]]], [[[0], [10], [12]]],
+//      [[[0], [5], [7]]], [[[0], [13], [15]]],
+//      [[[0], [6], [8]]], [[[0], [14], [16]]]]
+// ```
+//
+// Among others, this operation is useful for reducing atrous convolution into
+// regular convolution.
+func SpaceToBatchND(scope *Scope, input tf.Output, block_shape tf.Output, paddings tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Range",
+		Type: "SpaceToBatchND",
 		Input: []tf.Input{
-			start, limit, delta,
+			input, block_shape, paddings,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes gradients for SparseSegmentSqrtN.
+// ListDiffAttr is an optional argument to ListDiff.
+type ListDiffAttr func(optionalAttr)
+
+// ListDiffOutIdx sets the optional out_idx attribute to value.
+// If not specified, defaults to DT_INT32
+func ListDiffOutIdx(value tf.DataType) ListDiffAttr {
+	return func(m optionalAttr) {
+		m["out_idx"] = value
+	}
+}
+
+// Computes the difference between two lists of numbers or strings.
 //
-// Returns tensor "output" with same shape as grad, except for dimension 0 whose
-// value is output_dim0.
+// Given a list `x` and a list `y`, this operation returns a list `out` that
+// represents all values that are in `x` but not in `y`. The returned list `out`
+// is sorted in the same order that the numbers appear in `x` (duplicates are
+// preserved). This operation also returns a list `idx` that represents the
+// position of each `out` element in `x`. In other words:
+//
+// `out[i] = x[idx[i]] for i in [0, 1, ..., len(out) - 1]`
+//
+// For example, given this input:
+//
+// ```
+// x = [1, 2, 3, 4, 5, 6]
+// y = [1, 3, 5]
+// ```
+//
+// This operation would return:
+//
+// ```
+// out ==> [2, 4, 6]
+// idx ==> [1, 3, 5]
+// ```
 //
 // Arguments:
-//	grad: gradient propagated to the SparseSegmentSqrtN op.
-//	indices: indices passed to the corresponding SparseSegmentSqrtN op.
-//	segment_ids: segment_ids passed to the corresponding SparseSegmentSqrtN op.
-//	output_dim0: dimension 0 of "data" passed to SparseSegmentSqrtN op.
-func SparseSegmentSqrtNGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
+//	x: 1-D. Values to keep.
+//	y: 1-D. Values to remove.
+//
+// Returns 1-D. Values present in `x` but not in `y`.1-D. Positions of `x` values preserved in `out`.
+func ListDiff(scope *Scope, x tf.Output, y tf.Output, optional ...ListDiffAttr) (out tf.Output, idx tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSqrtNGrad",
+		Type: "ListDiff",
 		Input: []tf.Input{
-			grad, indices, segment_ids, output_dim0,
+			x, y,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Computes the mean along sparse segments of a tensor.
+// Inserts a dimension of 1 into a tensor's shape.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
+// Given a tensor `input`, this operation inserts a dimension of 1 at the
+// dimension index `axis` of `input`'s shape. The dimension index `axis` starts at
+// zero; if you specify a negative number for `axis` it is counted backward from
+// the end.
 //
-// Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
-// dimension, selecting a subset of dimension 0, specified by `indices`.
+// This operation is useful if you want to add a batch dimension to a single
+// element. For example, if you have a single image of shape `[height, width,
+// channels]`, you can make it a batch of 1 image with `expand_dims(image, 0)`,
+// which will make the shape `[1, height, width, channels]`.
+//
+// Other examples:
+//
+// ```
+// # 't' is a tensor of shape [2]
+// shape(expand_dims(t, 0)) ==> [1, 2]
+// shape(expand_dims(t, 1)) ==> [2, 1]
+// shape(expand_dims(t, -1)) ==> [2, 1]
+//
+// # 't2' is a tensor of shape [2, 3, 5]
+// shape(expand_dims(t2, 0)) ==> [1, 2, 3, 5]
+// shape(expand_dims(t2, 2)) ==> [2, 3, 1, 5]
+// shape(expand_dims(t2, 3)) ==> [2, 3, 5, 1]
+// ```
+//
+// This operation requires that:
+//
+// `-1-input.dims() <= dim <= input.dims()`
+//
+// This operation is related to `squeeze()`, which removes dimensions of
+// size 1.
 //
 // Arguments:
 //
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//	axis: 0-D (scalar). Specifies the dimension index at which to
+// expand the shape of `input`. Must be in the range
+// `[-rank(input) - 1, rank(input)]`.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentMean(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Returns Contains the same data as `input`, but its shape has an additional
+// dimension of size 1 added.
+func ExpandDims(scope *Scope, input tf.Output, axis tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentMean",
+		Type: "ExpandDims",
 		Input: []tf.Input{
-			data, indices, segment_ids,
+			input, axis,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Pop the element at the top of the stack.
+// A placeholder op that passes through `input` when its output is not fed.
 //
 // Arguments:
-//	handle: The handle to a stack.
-//	elem_type: The type of the elem that is popped.
+//	input: The default value to produce when `output` is not fed.
+//	shape: The (possibly partial) shape of the tensor.
 //
-// Returns The tensor that is popped from the top of the stack.
-func StackPopV2(scope *Scope, handle tf.Output, elem_type tf.DataType) (elem tf.Output) {
+// Returns A placeholder tensor that defaults to `input` if it is not fed.
+func PlaceholderWithDefault(scope *Scope, input tf.Output, shape tf.Shape) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"elem_type": elem_type}
+	attrs := map[string]interface{}{"shape": shape}
 	opspec := tf.OpSpec{
-		Type: "StackPopV2",
+		Type: "PlaceholderWithDefault",
 		Input: []tf.Input{
-			handle,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -991,332 +1112,331 @@ func StackPopV2(scope *Scope, handle tf.Output, elem_type tf.DataType) (elem tf.
 	return op.Output(0)
 }
 
-// Computes the sum along sparse segments of a tensor.
+// A placeholder op for a value that will be fed into the computation.
 //
-// Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is
-// misisng, the `output` tensor at that position will be zeroed.
-//
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
-//
-// For example:
-//
-// ```python
-// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
-//
-// tf.sparse_segment_sum_with_num_segments(
-//     c, tf.constant([0, 1]), tf.constant([0, 0]), num_segments=3)
-// # => [[0 0 0 0]
-// #     [0 0 0 0]
-// #     [0 0 0 0]]
+// DEPRECATED at GraphDef version 23: Placeholder now behaves the same as PlaceholderV2.
 //
-// tf.sparse_segment_sum_with_num_segments(c,
-//                                         tf.constant([0, 1]),
-//                                         tf.constant([0, 2],
-//                                         num_segments=4))
-// # => [[ 1  2  3  4]
-// #     [ 0  0  0  0]
-// #     [-1 -2 -3 -4]
-// #     [ 0  0  0  0]]
-// ```
+// N.B. This operation will fail with an error if it is executed. It is
+// intended as a way to represent a value that will always be fed, and to
+// provide attrs that enable the fed value to be checked at runtime.
 //
 // Arguments:
+//	dtype: The type of elements in the tensor.
+//	shape: The shape of the tensor. The shape can be any partially-specified
+// shape.  To be unconstrained, pass in a shape with unknown rank.
 //
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//	num_segments: Should equal the number of distinct segment IDs.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `num_segments`.
-func SparseSegmentSumWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+// Returns A placeholder tensor that must be replaced using the feed mechanism.
+func PlaceholderV2(scope *Scope, dtype tf.DataType, shape tf.Shape) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSumWithNumSegments",
-		Input: []tf.Input{
-			data, indices, segment_ids, num_segments,
-		},
+		Type: "PlaceholderV2",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SparseToDenseAttr is an optional argument to SparseToDense.
-type SparseToDenseAttr func(optionalAttr)
+// PlaceholderAttr is an optional argument to Placeholder.
+type PlaceholderAttr func(optionalAttr)
 
-// SparseToDenseValidateIndices sets the optional validate_indices attribute to value.
+// PlaceholderShape sets the optional shape attribute to value.
 //
-// value: If true, indices are checked to make sure they are sorted in
-// lexicographic order and that there are no repeats.
-// If not specified, defaults to true
-func SparseToDenseValidateIndices(value bool) SparseToDenseAttr {
+// value: (Optional) The shape of the tensor. If the shape has 0 dimensions, the
+// shape is unconstrained.
+// If not specified, defaults to <unknown_rank:true >
+func PlaceholderShape(value tf.Shape) PlaceholderAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["shape"] = value
 	}
 }
 
-// Converts a sparse representation into a dense tensor.
-//
-// Builds an array `dense` with shape `output_shape` such that
-//
-// ```
-// # If sparse_indices is scalar
-// dense[i] = (i == sparse_indices ? sparse_values : default_value)
-//
-// # If sparse_indices is a vector, then for each i
-// dense[sparse_indices[i]] = sparse_values[i]
-//
-// # If sparse_indices is an n by d matrix, then for each i in [0, n)
-// dense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i]
-// ```
-//
-// All other values in `dense` are set to `default_value`.  If `sparse_values` is a
-// scalar, all sparse indices are set to this single value.
+// A placeholder op for a value that will be fed into the computation.
 //
-// Indices should be sorted in lexicographic order, and indices must not
-// contain any repeats. If `validate_indices` is true, these properties
-// are checked during execution.
+// N.B. This operation will fail with an error if it is executed. It is
+// intended as a way to represent a value that will always be fed, and to
+// provide attrs that enable the fed value to be checked at runtime.
 //
 // Arguments:
-//	sparse_indices: 0-D, 1-D, or 2-D.  `sparse_indices[i]` contains the complete
-// index where `sparse_values[i]` will be placed.
-//	output_shape: 1-D.  Shape of the dense output tensor.
-//	sparse_values: 1-D.  Values corresponding to each row of `sparse_indices`,
-// or a scalar value to be used for all sparse indices.
-//	default_value: Scalar value to set for indices not specified in
-// `sparse_indices`.
+//	dtype: The type of elements in the tensor.
 //
-// Returns Dense output tensor of shape `output_shape`.
-func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Output, sparse_values tf.Output, default_value tf.Output, optional ...SparseToDenseAttr) (dense tf.Output) {
+// Returns A placeholder tensor that must be replaced using the feed mechanism.
+func Placeholder(scope *Scope, dtype tf.DataType, optional ...PlaceholderAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseToDense",
-		Input: []tf.Input{
-			sparse_indices, output_shape, sparse_values, default_value,
-		},
+		Type: "Placeholder",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Counts the number of occurrences of each value in an integer array.
+// Gradient op for `MirrorPad` op. This op folds a mirror-padded tensor.
 //
-// Outputs a vector with length `size` and the same dtype as `weights`. If
-// `weights` are empty, then index `i` stores the number of times the value `i` is
-// counted in `arr`. If `weights` are non-empty, then index `i` stores the sum of
-// the value in `weights` at each index where the corresponding value in `arr` is
-// `i`.
+// This operation folds the padded areas of `input` by `MirrorPad` according to the
+// `paddings` you specify. `paddings` must be the same as `paddings` argument
+// given to the corresponding `MirrorPad` op.
 //
-// Values in `arr` outside of the range [0, size) are ignored.
+// The folded size of each dimension D of the output is:
+//
+// `input.dim_size(D) - paddings(D, 0) - paddings(D, 1)`
+//
+// For example:
+//
+// ```
+// # 't' is [[1, 2, 3], [4, 5, 6], [7, 8, 9]].
+// # 'paddings' is [[0, 1]], [0, 1]].
+// # 'mode' is SYMMETRIC.
+// # rank of 't' is 2.
+// pad(t, paddings) ==> [[ 1,  5]
+//                       [11, 28]]
+// ```
 //
 // Arguments:
-//	arr: int32 `Tensor`.
-//	size: non-negative int32 scalar `Tensor`.
-//	weights: is an int32, int64, float32, or float64 `Tensor` with the same
-// shape as `arr`, or a length-0 `Tensor`, in which case it acts as all weights
-// equal to 1.
+//	input: The input tensor to be folded.
+//	paddings: A two-column matrix specifying the padding sizes. The number of
+// rows must be the same as the rank of `input`.
+//	mode: The mode used in the `MirrorPad` op.
 //
-// Returns 1D `Tensor` with length equal to `size`. The counts or summed weights for
-// each value in the range [0, size).
-func Bincount(scope *Scope, arr tf.Output, size tf.Output, weights tf.Output) (bins tf.Output) {
+// Returns The folded tensor.
+func MirrorPadGrad(scope *Scope, input tf.Output, paddings tf.Output, mode string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"mode": mode}
 	opspec := tf.OpSpec{
-		Type: "Bincount",
+		Type: "MirrorPadGrad",
 		Input: []tf.Input{
-			arr, size, weights,
+			input, paddings,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the sum along sparse segments of a tensor.
-//
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
-//
-// Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
-// dimension, selecting a subset of dimension 0, specified by `indices`.
-//
-// For example:
+// Pads a tensor with mirrored values.
 //
-// ```python
-// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
+// This operation pads a `input` with mirrored values according to the `paddings`
+// you specify. `paddings` is an integer tensor with shape `[n, 2]`, where n is
+// the rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
+// how many values to add before the contents of `input` in that dimension, and
+// `paddings[D, 1]` indicates how many values to add after the contents of `input`
+// in that dimension. Both `paddings[D, 0]` and `paddings[D, 1]` must be no greater
+// than `input.dim_size(D)` (or `input.dim_size(D) - 1`) if `copy_border` is true
+// (if false, respectively).
 //
-// # Select two rows, one segment.
-// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0]))
-// # => [[0 0 0 0]]
+// The padded size of each dimension D of the output is:
 //
-// # Select two rows, two segment.
-// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1]))
-// # => [[ 1  2  3  4]
-// #     [-1 -2 -3 -4]]
+// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
 //
-// # Select all rows, two segments.
-// tf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1]))
-// # => [[0 0 0 0]
-// #     [5 6 7 8]]
+// For example:
 //
-// # Which is equivalent to:
-// tf.segment_sum(c, tf.constant([0, 0, 1]))
+// ```
+// # 't' is [[1, 2, 3], [4, 5, 6]].
+// # 'paddings' is [[1, 1]], [2, 2]].
+// # 'mode' is SYMMETRIC.
+// # rank of 't' is 2.
+// pad(t, paddings) ==> [[2, 1, 1, 2, 3, 3, 2]
+//                       [2, 1, 1, 2, 3, 3, 2]
+//                       [5, 4, 4, 5, 6, 6, 5]
+//                       [5, 4, 4, 5, 6, 6, 5]]
 // ```
 //
 // Arguments:
+//	input: The input tensor to be padded.
+//	paddings: A two-column matrix specifying the padding sizes. The number of
+// rows must be the same as the rank of `input`.
+//	mode: Either `REFLECT` or `SYMMETRIC`. In reflect mode the padded regions
+// do not include the borders, while in symmetric mode the padded regions
+// do include the borders. For example, if `input` is `[1, 2, 3]` and `paddings`
+// is `[0, 2]`, then the output is `[1, 2, 3, 2, 1]` in reflect mode, and
+// it is `[1, 2, 3, 3, 2]` in symmetric mode.
 //
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentSum(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Returns The padded tensor.
+func MirrorPad(scope *Scope, input tf.Output, paddings tf.Output, mode string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"mode": mode}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSum",
+		Type: "MirrorPad",
 		Input: []tf.Input{
-			data, indices, segment_ids,
+			input, paddings,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes hyperbolic sine of x element-wise.
-func Sinh(scope *Scope, x tf.Output) (y tf.Output) {
+// Pads a tensor.
+//
+// This operation pads `input` according to the `paddings` and `constant_values`
+// you specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is
+// the rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
+// how many padding values to add before the contents of `input` in that dimension,
+// and `paddings[D, 1]` indicates how many padding values to add after the contents
+// of `input` in that dimension. `constant_values` is a scalar tensor of the same
+// type as `input` that indicates the value to use for padding `input`.
+//
+// The padded size of each dimension D of the output is:
+//
+// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
+//
+// For example:
+//
+// ```
+// # 't' is [[1, 1], [2, 2]]
+// # 'paddings' is [[1, 1], [2, 2]]
+// # 'constant_values' is 0
+// # rank of 't' is 2
+// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
+//                       [0, 0, 1, 1, 0, 0]
+//                       [0, 0, 2, 2, 0, 0]
+//                       [0, 0, 0, 0, 0, 0]]
+// ```
+func PadV2(scope *Scope, input tf.Output, paddings tf.Output, constant_values tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Sinh",
+		Type: "PadV2",
 		Input: []tf.Input{
-			x,
+			input, paddings, constant_values,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes rectified linear 6: `min(max(features, 0), 6)`.
-func Relu6(scope *Scope, features tf.Output) (activations tf.Output) {
+// Return the reduction indices for computing gradients of s0 op s1 with broadcast.
+//
+// This is typically used by gradient computations for a broadcasting operation.
+func BroadcastGradientArgs(scope *Scope, s0 tf.Output, s1 tf.Output) (r0 tf.Output, r1 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Relu6",
+		Type: "BroadcastGradientArgs",
 		Input: []tf.Input{
-			features,
+			s0, s1,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Computes the sum along segments of a tensor.
-//
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
-//
-// Computes a tensor such that
-// `(output[i] = sum_{j...} data[j...]` where the sum is over tuples `j...` such
-// that `segment_ids[j...] == i`.  Unlike `SegmentSum`, `segment_ids`
-// need not be sorted and need not cover all values in the full
-// range of valid values.
-//
-// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
-// If the given segment ID `i` is negative, the value is dropped and will not be
-// added to the sum of the segment.
-//
-// `num_segments` should equal the number of distinct segment IDs.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt>
-// </div>
-//
-// Arguments:
-//
-//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
+// Returns the gradient of `Tile`.
 //
+// DEPRECATED at GraphDef version 3: TileGrad has been replaced with reduce_sum
 //
-// Returns Has same shape as data, except for the first `segment_ids.rank`
-// dimensions, which are replaced with a single dimension which has size
-// `num_segments`.
-func UnsortedSegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+// Since `Tile` takes an input and repeats the input `multiples` times
+// along each dimension, `TileGrad` takes in `multiples` and aggregates
+// each repeated tile of `input` into `output`.
+func TileGrad(scope *Scope, input tf.Output, multiples tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "UnsortedSegmentSum",
+		Type: "TileGrad",
 		Input: []tf.Input{
-			data, segment_ids, num_segments,
+			input, multiples,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns which elements of x are finite.
+// Constructs a tensor by tiling a given tensor.
 //
-// @compatibility(numpy)
-// Equivalent to np.isfinite
-// @end_compatibility
-func IsFinite(scope *Scope, x tf.Output) (y tf.Output) {
+// This operation creates a new tensor by replicating `input` `multiples` times.
+// The output tensor's i'th dimension has `input.dims(i) * multiples[i]` elements,
+// and the values of `input` are replicated `multiples[i]` times along the 'i'th
+// dimension. For example, tiling `[a b c d]` by `[2]` produces
+// `[a b c d a b c d]`.
+//
+// Arguments:
+//	input: 1-D or higher.
+//	multiples: 1-D. Length must be the same as the number of dimensions in `input`
+func Tile(scope *Scope, input tf.Output, multiples tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IsFinite",
+		Type: "Tile",
 		Input: []tf.Input{
-			x,
+			input, multiples,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MatMulAttr is an optional argument to MatMul.
-type MatMulAttr func(optionalAttr)
+// StridedSliceGradAttr is an optional argument to StridedSliceGrad.
+type StridedSliceGradAttr func(optionalAttr)
 
-// MatMulTransposeA sets the optional transpose_a attribute to value.
-//
-// value: If true, "a" is transposed before multiplication.
-// If not specified, defaults to false
-func MatMulTransposeA(value bool) MatMulAttr {
+// StridedSliceGradBeginMask sets the optional begin_mask attribute to value.
+// If not specified, defaults to 0
+func StridedSliceGradBeginMask(value int64) StridedSliceGradAttr {
 	return func(m optionalAttr) {
-		m["transpose_a"] = value
+		m["begin_mask"] = value
 	}
 }
 
-// MatMulTransposeB sets the optional transpose_b attribute to value.
-//
-// value: If true, "b" is transposed before multiplication.
-// If not specified, defaults to false
-func MatMulTransposeB(value bool) MatMulAttr {
+// StridedSliceGradEndMask sets the optional end_mask attribute to value.
+// If not specified, defaults to 0
+func StridedSliceGradEndMask(value int64) StridedSliceGradAttr {
 	return func(m optionalAttr) {
-		m["transpose_b"] = value
+		m["end_mask"] = value
 	}
 }
 
-// Multiply the matrix "a" by the matrix "b".
+// StridedSliceGradEllipsisMask sets the optional ellipsis_mask attribute to value.
+// If not specified, defaults to 0
+func StridedSliceGradEllipsisMask(value int64) StridedSliceGradAttr {
+	return func(m optionalAttr) {
+		m["ellipsis_mask"] = value
+	}
+}
+
+// StridedSliceGradNewAxisMask sets the optional new_axis_mask attribute to value.
+// If not specified, defaults to 0
+func StridedSliceGradNewAxisMask(value int64) StridedSliceGradAttr {
+	return func(m optionalAttr) {
+		m["new_axis_mask"] = value
+	}
+}
+
+// StridedSliceGradShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
+// If not specified, defaults to 0
+func StridedSliceGradShrinkAxisMask(value int64) StridedSliceGradAttr {
+	return func(m optionalAttr) {
+		m["shrink_axis_mask"] = value
+	}
+}
+
+// Returns the gradient of `StridedSlice`.
 //
-// The inputs must be two-dimensional matrices and the inner dimension of
-// "a" (after being transposed if transpose_a is true) must match the
-// outer dimension of "b" (after being transposed if transposed_b is
-// true).
+// Since `StridedSlice` cuts out pieces of its `input` which is size
+// `shape`, its gradient will have the same shape (which is passed here
+// as `shape`). The gradient will be zero in any element that the slice
+// does not select.
 //
-// *Note*: The default kernel implementation for MatMul on GPUs uses
-// cublas.
-func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (product tf.Output) {
+// Arguments are the same as StridedSliceGrad with the exception that
+// `dy` is the input gradient to be propagated and `shape` is the
+// shape of `StridedSlice`'s `input`.
+func StridedSliceGrad(scope *Scope, shape tf.Output, begin tf.Output, end tf.Output, strides tf.Output, dy tf.Output, optional ...StridedSliceGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -1325,9 +1445,9 @@ func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (pro
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatMul",
+		Type: "StridedSliceGrad",
 		Input: []tf.Input{
-			a, b,
+			shape, begin, end, strides, dy,
 		},
 		Attrs: attrs,
 	}
@@ -1335,351 +1455,471 @@ func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (pro
 	return op.Output(0)
 }
 
-// Selects elements from `x` or `y`, depending on `condition`.
-//
-// The `x`, and `y` tensors must all have the same shape, and the
-// output will also have that shape.
-//
-// The `condition` tensor must be a scalar if `x` and `y` are scalars.
-// If `x` and `y` are vectors or higher rank, then `condition` must be either a
-// scalar, a vector with size matching the first dimension of `x`, or must have
-// the same shape as `x`.
-//
-// The `condition` tensor acts as a mask that chooses, based on the value at each
-// element, whether the corresponding element / row in the output should be
-// taken from `x` (if true) or `y` (if false).
-//
-// If `condition` is a vector and `x` and `y` are higher rank matrices, then
-// it chooses which row (outer dimension) to copy from `x` and `y`.
-// If `condition` has the same shape as `x` and `y`, then it chooses which
-// element to copy from `x` and `y`.
-//
-// For example:
-//
-// ```python
-// # 'condition' tensor is [[True,  False]
-// #                        [False, True]]
-// # 't' is [[1, 2],
-// #         [3, 4]]
-// # 'e' is [[5, 6],
-// #         [7, 8]]
-// select(condition, t, e)  # => [[1, 6], [7, 4]]
-//
-//
-// # 'condition' tensor is [True, False]
-// # 't' is [[1, 2],
-// #         [3, 4]]
-// # 'e' is [[5, 6],
-// #         [7, 8]]
-// select(condition, t, e) ==> [[1, 2],
-//                              [7, 8]]
-//
-// ```
-//
-// Arguments:
-//
-//	x: = A `Tensor` which may have the same shape as `condition`.
-// If `condition` is rank 1, `x` may have higher rank,
-// but its first dimension must match the size of `condition`.
-//	y: = A `Tensor` with the same type and shape as `x`.
+// StridedSliceAttr is an optional argument to StridedSlice.
+type StridedSliceAttr func(optionalAttr)
+
+// StridedSliceBeginMask sets the optional begin_mask attribute to value.
 //
-// Returns = A `Tensor` with the same type and shape as `x` and `y`.
-func Select(scope *Scope, condition tf.Output, x tf.Output, y tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Select",
-		Input: []tf.Input{
-			condition, x, y,
-		},
+// value: a bitmask where a bit i being 1 means to ignore the begin
+// value and instead use the largest interval possible. At runtime
+// begin[i] will be replaced with `[0, n-1) if `stride[i] > 0` or
+// `[-1, n-1]` if `stride[i] < 0`
+// If not specified, defaults to 0
+func StridedSliceBeginMask(value int64) StridedSliceAttr {
+	return func(m optionalAttr) {
+		m["begin_mask"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Returns the truth value of x OR y element-wise.
+// StridedSliceEndMask sets the optional end_mask attribute to value.
 //
-// *NOTE*: `LogicalOr` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func LogicalOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: analogous to `begin_mask`
+// If not specified, defaults to 0
+func StridedSliceEndMask(value int64) StridedSliceAttr {
+	return func(m optionalAttr) {
+		m["end_mask"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "LogicalOr",
-		Input: []tf.Input{
-			x, y,
-		},
+}
+
+// StridedSliceEllipsisMask sets the optional ellipsis_mask attribute to value.
+//
+// value: a bitmask where bit `i` being 1 means the `i`th
+// position is actually an ellipsis. One bit at most can be 1.
+// If `ellipsis_mask == 0`, then an implicit ellipsis mask of `1 << (m+1)`
+// is provided. This means that `foo[3:5] == foo[3:5, ...]`. An ellipsis
+// implicitly creates as many range specifications as necessary to fully
+// specify the sliced range for every dimension. For example for a 4-dimensional
+// tensor `foo` the slice `foo[2, ..., 5:8]` implies `foo[2, :, :, 5:8]`.
+// If not specified, defaults to 0
+func StridedSliceEllipsisMask(value int64) StridedSliceAttr {
+	return func(m optionalAttr) {
+		m["ellipsis_mask"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Compute the regularized incomplete beta integral \\(I_x(a, b)\\).
+// StridedSliceNewAxisMask sets the optional new_axis_mask attribute to value.
 //
-// The regularized incomplete beta integral is defined as:
+// value: a bitmask where bit `i` being 1 means the `i`th
+// specification creates a new shape 1 dimension. For example
+// `foo[:4, tf.newaxis, :2]` would produce a shape `(4, 1, 2)` tensor.
+// If not specified, defaults to 0
+func StridedSliceNewAxisMask(value int64) StridedSliceAttr {
+	return func(m optionalAttr) {
+		m["new_axis_mask"] = value
+	}
+}
+
+// StridedSliceShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
 //
+// value: a bitmask where bit `i` implies that the `i`th
+// specification should shrink the dimensionality. begin and end
+// must imply a slice of size 1 in the dimension. For example in
+// python one might do `foo[:, 3, :]` which would result in
+// `shrink_axis_mask` being 2.
+// If not specified, defaults to 0
+func StridedSliceShrinkAxisMask(value int64) StridedSliceAttr {
+	return func(m optionalAttr) {
+		m["shrink_axis_mask"] = value
+	}
+}
+
+// Return a strided slice from `input`.
 //
-// \\(I_x(a, b) = \frac{B(x; a, b)}{B(a, b)}\\)
+// Note, most python users will want to use the Python `Tensor.__getitem__`
+// or `Variable.__getitem__` rather than this op directly.
 //
-// where
+// The goal of this op is to produce a new tensor with a subset of
+// the elements from the `n` dimensional `input` tensor. The subset is chosen using
+// a sequence of `m` sparse range specifications encoded into the arguments
+// of this function. Note, in some cases
+// `m` could be equal to `n`, but this need not be the case. Each
+// range specification entry can be one of the following:
 //
+// - An ellipsis (...). Ellipses are used to imply zero or more
+//   dimensions of full-dimension selection and are produced using
+//   `ellipsis_mask`. For example, `foo[...]` is the identity slice.
 //
-// \\(B(x; a, b) = \int_0^x t^{a-1} (1 - t)^{b-1} dt\\)
+// - A new axis. This is used to insert a new shape=1 dimension and is
+//   produced using `new_axis_mask`. For example, `foo[:, ...]` where
+//   `foo` is shape `(3, 4)` produces a `(1, 3, 4)` tensor.
 //
 //
-// is the incomplete beta function and \\(B(a, b)\\) is the *complete*
-// beta function.
-func Betainc(scope *Scope, a tf.Output, b tf.Output, x tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Betainc",
-		Input: []tf.Input{
-			a, b, x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
+// - A range `begin:end:stride`. This is used to specify how much to choose from
+//   a given dimension. `stride` can be any integer but 0.  `begin` is an integer
+//   which represents the index of the first value to select while `end` represents
+//   the index of the last value to select. The number of values selected in each
+//   dimension is `end - begin` if `stride > 0` and `begin - end` if `stride < 0`.
+//   `begin` and `end` can be negative where `-1` is the last element, `-2` is
+//   the second to last. `begin_mask` controls whether to replace the explicitly
+//   given `begin` with an implicit effective value of `0` if `stride > 0` and
+//   `-1` if `stride < 0`. `end_mask` is analogous but produces the number
+//   required to create the largest open interval. For example, given a shape
+//   `(3,)` tensor `foo[:]`, the effective `begin` and `end` are `0` and `3`. Do
+//   not assume this is equivalent to `foo[0:-1]` which has an effective `begin`
+//   and `end` of `0` and `2`. Another example is `foo[-2::-1]` which reverses the
+//   first dimension of a tensor while dropping the last two (in the original
+//   order elements). For example `foo = [1,2,3,4]; foo[-2::-1]` is `[4,3]`.
 //
-// N is the size of the segment being reduced.
+// - A single index. This is used to keep only elements that have a given
+//   index. For example (`foo[2, :]` on a shape `(5,6)` tensor produces a
+//   shape `(6,)` tensor. This is encoded in `begin` and `end` and
+//   `shrink_axis_mask`.
 //
-// Like `SparseSegmentSqrtN`, but allows missing ids in `segment_ids`. If an id is
-// misisng, the `output` tensor at that position will be zeroed.
+// Each conceptual range specification is encoded in the op's argument. This
+// encoding is best understand by considering a non-trivial example. In
+// particular,
+// `foo[1, 2:4, None, ..., :-3:-1, :]` will be encoded as
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
+// ```
+// begin = [1, 2, x, x, 0, x] # x denotes don't care (usually 0)
+// end = [2, 4, x, x, -3, x]
+// strides = [1, 1, x, x, -1, 1]
+// begin_mask = 1<<4 | 1 << 5 = 48
+// end_mask = 1<<5 = 32
+// ellipsis_mask = 1<<3 = 8
+// new_axis_mask = 1<<2 4
+// shrink_axis_mask = 1<<0
+// ```
 //
-// Arguments:
+// In this case if `foo.shape` is (5, 5, 5, 5, 5, 5) the final shape of
+// the slice becomes (2, 1, 5, 5, 2, 5).
+// Let us walk step by step through each argument specification.
 //
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//	num_segments: Should equal the number of distinct segment IDs.
+// 1.  The first argument in the example slice is turned into `begin = 1` and
+// `end = begin + 1 = 2`. To disambiguate from the original spec `2:4` we
+// also set the appropriate bit in `shrink_axis_mask`.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentSqrtNWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+// 2. `2:4` is contributes 2, 4, 1 to begin, end, and stride. All masks have
+// zero bits contributed.
+//
+// 3. None is a synonym for `tf.newaxis`. This means insert a dimension of size 1
+// dimension in the final shape. Dummy values are contributed to begin,
+// end and stride, while the new_axis_mask bit is set.
+//
+// 4. `...` grab the full ranges from as many dimensions as needed to
+// fully specify a slice for every dimension of the input shape.
+//
+// 5. `:-3:-1` shows the use of negative indices. A negative index `i` associated
+// with a dimension that has shape `s` is converted to a positive index
+// `s + i`. So `-1` becomes `s-1` (i.e. the last element). This conversion
+// is done internally so begin, end and strides receive x, -3, and -1.
+// The appropriate begin_mask bit is set to indicate the start range is the
+// full range (ignoring the x).
+//
+// 6. `:` indicates that the entire contents of the corresponding dimension
+// is selected. This is equivalent to `::` or `0::1`. begin, end, and strides
+// receive 0, 0, and 1, respectively. The appropriate bits in `begin_mask` and
+// `end_mask` are also set.
+//
+// *Requirements*:
+//   `0 != strides[i] for i in [0, m)`
+//   `ellipsis_mask must be a power of two (only one ellipsis)`
+//
+// Arguments:
+//
+//	begin: `begin[k]` specifies the offset into the `k`th range specification.
+// The exact dimension this corresponds to will be determined by context.
+// Out-of-bounds values will be silently clamped. If the `k`th bit of
+// `begin_mask` then `begin[k]` is ignored and the full range of the
+// appropriate dimension is used instead. Negative values causes indexing
+// to start from the highest element e.g. If `foo==[1,2,3]` then `foo[-1]==3`.
+//	end: `end[i]` is like `begin` with the exception that `end_mask` is
+// used to determine full ranges.
+//	strides: `strides[i]` specifies the increment in the `i`th specification
+// after extracting a given element. Negative indices will reverse
+// the original order. Out or range values are
+// clamped to `[0,dim[i]) if slice[i]>0` or `[-1,dim[i]-1] if slice[i] < 0`
+func StridedSlice(scope *Scope, input tf.Output, begin tf.Output, end tf.Output, strides tf.Output, optional ...StridedSliceAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSqrtNWithNumSegments",
+		Type: "StridedSlice",
 		Input: []tf.Input{
-			data, indices, segment_ids, num_segments,
+			input, begin, end, strides,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Compute the upper regularized incomplete Gamma function `Q(a, x)`.
-//
-// The upper regularized incomplete Gamma function is defined as:
-//
-// \\(Q(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)\\)
+// Return a slice from 'input'.
 //
-// where
+// The output tensor is a tensor with dimensions described by 'size'
+// whose values are extracted from 'input' starting at the offsets in
+// 'begin'.
 //
-// \\(Gamma(a, x) = int_{x}^{\infty} t^{a-1} exp(-t) dt\\)
+// *Requirements*:
+//   0 <= begin[i] <= begin[i] + size[i] <= Di  for i in [0, n)
 //
-// is the upper incomplete Gama function.
+// Arguments:
 //
-// Note, above `P(a, x)` (`Igamma`) is the lower regularized complete
-// Gamma function.
-func Igammac(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+//	begin: begin[i] specifies the offset into the 'i'th dimension of
+// 'input' to slice from.
+//	size: size[i] specifies the number of elements of the 'i'th dimension
+// of 'input' to slice. If size[i] is -1, all remaining elements in dimension
+// i are included in the slice (i.e. this is equivalent to setting
+// size[i] = input.dim_size(i) - begin[i]).
+func Slice(scope *Scope, input tf.Output, begin tf.Output, size tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Igammac",
+		Type: "Slice",
 		Input: []tf.Input{
-			a, x,
+			input, begin, size,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// LogUniformCandidateSamplerAttr is an optional argument to LogUniformCandidateSampler.
-type LogUniformCandidateSamplerAttr func(optionalAttr)
-
-// LogUniformCandidateSamplerSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func LogUniformCandidateSamplerSeed(value int64) LogUniformCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
+// SizeAttr is an optional argument to Size.
+type SizeAttr func(optionalAttr)
 
-// LogUniformCandidateSamplerSeed2 sets the optional seed2 attribute to value.
-//
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func LogUniformCandidateSamplerSeed2(value int64) LogUniformCandidateSamplerAttr {
+// SizeOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_INT32
+func SizeOutType(value tf.DataType) SizeAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["out_type"] = value
 	}
 }
 
-// Generates labels for candidate sampling with a log-uniform distribution.
-//
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
-//
-// For each batch, this op picks a single set of sampled candidate labels.
+// Returns the size of a tensor.
 //
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
+// This operation returns an integer representing the number of elements in
+// `input`.
 //
-// Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//	range_max: The sampler will sample integers from the interval [0, range_max).
+// For example:
 //
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func LogUniformCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LogUniformCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+// ```
+// # 't' is [[[1, 1,, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]]
+// size(t) ==> 12
+// ```
+func Size(scope *Scope, input tf.Output, optional ...SizeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LogUniformCandidateSampler",
+		Type: "Size",
 		Input: []tf.Input{
-			true_classes,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// ApproximateEqualAttr is an optional argument to ApproximateEqual.
-type ApproximateEqualAttr func(optionalAttr)
-
-// ApproximateEqualTolerance sets the optional tolerance attribute to value.
-// If not specified, defaults to 1e-05
-func ApproximateEqualTolerance(value float32) ApproximateEqualAttr {
-	return func(m optionalAttr) {
-		m["tolerance"] = value
-	}
-}
-
-// Returns the truth value of abs(x-y) < tolerance element-wise.
-func ApproximateEqual(scope *Scope, x tf.Output, y tf.Output, optional ...ApproximateEqualAttr) (z tf.Output) {
+// Returns the rank of a tensor.
+//
+// This operation returns an integer representing the rank of `input`.
+//
+// For example:
+//
+// ```
+// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
+// # shape of tensor 't' is [2, 2, 3]
+// rank(t) ==> 3
+// ```
+//
+// **Note**: The rank of a tensor is not the same as the rank of a matrix. The rank
+// of a tensor is the number of indices required to uniquely select each element
+// of the tensor. Rank is also known as "order", "degree", or "ndims."
+func Rank(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ApproximateEqual",
+		Type: "Rank",
 		Input: []tf.Input{
-			x, y,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns x / y element-wise.
+// ReverseSequenceAttr is an optional argument to ReverseSequence.
+type ReverseSequenceAttr func(optionalAttr)
+
+// ReverseSequenceBatchDim sets the optional batch_dim attribute to value.
 //
-// *NOTE*: `Div` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Div(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// value: The dimension along which reversal is performed.
+// If not specified, defaults to 0
+func ReverseSequenceBatchDim(value int64) ReverseSequenceAttr {
+	return func(m optionalAttr) {
+		m["batch_dim"] = value
+	}
+}
+
+// Reverses variable length slices.
+//
+// This op first slices `input` along the dimension `batch_dim`, and for each
+// slice `i`, reverses the first `seq_lengths[i]` elements along
+// the dimension `seq_dim`.
+//
+// The elements of `seq_lengths` must obey `seq_lengths[i] <= input.dims[seq_dim]`,
+// and `seq_lengths` must be a vector of length `input.dims[batch_dim]`.
+//
+// The output slice `i` along dimension `batch_dim` is then given by input
+// slice `i`, with the first `seq_lengths[i]` slices along dimension
+// `seq_dim` reversed.
+//
+// For example:
+//
+// ```
+// # Given this:
+// batch_dim = 0
+// seq_dim = 1
+// input.dims = (4, 8, ...)
+// seq_lengths = [7, 2, 3, 5]
+//
+// # then slices of input are reversed on seq_dim, but only up to seq_lengths:
+// output[0, 0:7, :, ...] = input[0, 7:0:-1, :, ...]
+// output[1, 0:2, :, ...] = input[1, 2:0:-1, :, ...]
+// output[2, 0:3, :, ...] = input[2, 3:0:-1, :, ...]
+// output[3, 0:5, :, ...] = input[3, 5:0:-1, :, ...]
+//
+// # while entries past seq_lens are copied through:
+// output[0, 7:, :, ...] = input[0, 7:, :, ...]
+// output[1, 2:, :, ...] = input[1, 2:, :, ...]
+// output[2, 3:, :, ...] = input[2, 3:, :, ...]
+// output[3, 2:, :, ...] = input[3, 2:, :, ...]
+// ```
+//
+// In contrast, if:
+//
+// ```
+// # Given this:
+// batch_dim = 2
+// seq_dim = 0
+// input.dims = (8, ?, 4, ...)
+// seq_lengths = [7, 2, 3, 5]
+//
+// # then slices of input are reversed on seq_dim, but only up to seq_lengths:
+// output[0:7, :, 0, :, ...] = input[7:0:-1, :, 0, :, ...]
+// output[0:2, :, 1, :, ...] = input[2:0:-1, :, 1, :, ...]
+// output[0:3, :, 2, :, ...] = input[3:0:-1, :, 2, :, ...]
+// output[0:5, :, 3, :, ...] = input[5:0:-1, :, 3, :, ...]
+//
+// # while entries past seq_lens are copied through:
+// output[7:, :, 0, :, ...] = input[7:, :, 0, :, ...]
+// output[2:, :, 1, :, ...] = input[2:, :, 1, :, ...]
+// output[3:, :, 2, :, ...] = input[3:, :, 2, :, ...]
+// output[2:, :, 3, :, ...] = input[2:, :, 3, :, ...]
+// ```
+//
+// Arguments:
+//	input: The input to reverse.
+//	seq_lengths: 1-D with length `input.dims(batch_dim)` and
+// `max(seq_lengths) <= input.dims(seq_dim)`
+//	seq_dim: The dimension which is partially reversed.
+//
+// Returns The partially reversed input. It has the same shape as `input`.
+func ReverseSequence(scope *Scope, input tf.Output, seq_lengths tf.Output, seq_dim int64, optional ...ReverseSequenceAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"seq_dim": seq_dim}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Div",
+		Type: "ReverseSequence",
 		Input: []tf.Input{
-			x, y,
+			input, seq_lengths,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns x * y element-wise.
+// Returns the complex conjugate of a complex number.
 //
-// *NOTE*: `Multiply` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Mul(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// complex numbers that are the complex conjugate of each element in `input`. The
+// complex numbers in `input` must be of the form \\(a + bj\\), where *a* is the
+// real part and *b* is the imaginary part.
+//
+// The complex conjugate returned by this operation is of the form \\(a - bj\\).
+//
+// For example:
+//
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j]
+// ```
+func Conj(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Mul",
+		Type: "Conj",
 		Input: []tf.Input{
-			x, y,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SparseReduceSumSparseAttr is an optional argument to SparseReduceSumSparse.
-type SparseReduceSumSparseAttr func(optionalAttr)
+// ResourceSparseApplyMomentumAttr is an optional argument to ResourceSparseApplyMomentum.
+type ResourceSparseApplyMomentumAttr func(optionalAttr)
 
-// SparseReduceSumSparseKeepDims sets the optional keep_dims attribute to value.
+// ResourceSparseApplyMomentumUseLocking sets the optional use_locking attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
 // If not specified, defaults to false
-func SparseReduceSumSparseKeepDims(value bool) SparseReduceSumSparseAttr {
+func ResourceSparseApplyMomentumUseLocking(value bool) ResourceSparseApplyMomentumAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Computes the sum of elements across dimensions of a SparseTensor.
+// ResourceSparseApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
 //
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_sum()`.  In contrast to SparseReduceSum, this Op returns a
-// SparseTensor.
+// value: If `True`, the tensor passed to compute grad will be
+// var - lr * momentum * accum, so in the end, the var you get is actually
+// var - lr * momentum * accum.
+// If not specified, defaults to false
+func ResourceSparseApplyMomentumUseNesterov(value bool) ResourceSparseApplyMomentumAttr {
+	return func(m optionalAttr) {
+		m["use_nesterov"] = value
+	}
+}
+
+// Update relevant entries in '*var' and '*accum' according to the momentum scheme.
 //
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
+// Set use_nesterov = True if you want to use Nesterov momentum.
 //
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
+// That is for rows we have grad for, we update var and accum as follows:
+//
+// accum = accum * momentum + grad
+// var -= lr * accum
 //
 // Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
-func SparseReduceSumSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	momentum: Momentum. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceSparseApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, momentum tf.Output, optional ...ResourceSparseApplyMomentumAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -1688,190 +1928,118 @@ func SparseReduceSumSparse(scope *Scope, input_indices tf.Output, input_values t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseReduceSumSparse",
+		Type: "ResourceSparseApplyMomentum",
 		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
+			var_, accum, lr, grad, indices, momentum,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return scope.AddOperation(opspec)
 }
 
-// BiasAddAttr is an optional argument to BiasAdd.
-type BiasAddAttr func(optionalAttr)
-
-// BiasAddDataFormat sets the optional data_format attribute to value.
+// Creates a sequence of numbers.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the bias tensor will be added to the last dimension
-// of the value tensor.
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// The tensor will be added to "in_channels", the third-to-the-last
-//     dimension.
-// If not specified, defaults to "NHWC"
-func BiasAddDataFormat(value string) BiasAddAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Adds `bias` to `value`.
+// This operation creates a sequence of numbers that begins at `start` and
+// extends by increments of `delta` up to but not including `limit`.
 //
-// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
-// Broadcasting is supported, so `value` may have any number of dimensions.
+// For example:
+//
+// ```
+// # 'start' is 3
+// # 'limit' is 18
+// # 'delta' is 3
+// tf.range(start, limit, delta) ==> [3, 6, 9, 12, 15]
+// ```
 //
 // Arguments:
-//	value: Any number of dimensions.
-//	bias: 1-D with size the last dimension of `value`.
+//	start: 0-D (scalar). First entry in the sequence.
+//	limit: 0-D (scalar). Upper limit of sequence, exclusive.
+//	delta: 0-D (scalar). Optional. Default is 1. Number that increments `start`.
 //
-// Returns Broadcasted sum of `value` and `bias`.
-func BiasAdd(scope *Scope, value tf.Output, bias tf.Output, optional ...BiasAddAttr) (output tf.Output) {
+// Returns 1-D.
+func Range(scope *Scope, start tf.Output, limit tf.Output, delta tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "BiasAdd",
+		Type: "Range",
 		Input: []tf.Input{
-			value, bias,
+			start, limit, delta,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// BiasAddGradAttr is an optional argument to BiasAddGrad.
-type BiasAddGradAttr func(optionalAttr)
-
-// BiasAddGradDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the bias tensor will be added to the last dimension
-// of the value tensor.
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// The tensor will be added to "in_channels", the third-to-the-last
-//     dimension.
-// If not specified, defaults to "NHWC"
-func BiasAddGradDataFormat(value string) BiasAddGradAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// The backward operation for "BiasAdd" on the "bias" tensor.
+// Computes gradients for SparseSegmentSqrtN.
 //
-// It accumulates all the values from out_backprop into the feature dimension.
-// For NHWC data format, the feature dimension is the last. For NCHW data format,
-// the feature dimension is the third-to-last.
+// Returns tensor "output" with same shape as grad, except for dimension 0 whose
+// value is output_dim0.
 //
 // Arguments:
-//	out_backprop: Any number of dimensions.
-//
-// Returns 1-D with size the feature dimension of `out_backprop`.
-func BiasAddGrad(scope *Scope, out_backprop tf.Output, optional ...BiasAddGradAttr) (output tf.Output) {
+//	grad: gradient propagated to the SparseSegmentSqrtN op.
+//	indices: indices passed to the corresponding SparseSegmentSqrtN op.
+//	segment_ids: segment_ids passed to the corresponding SparseSegmentSqrtN op.
+//	output_dim0: dimension 0 of "data" passed to SparseSegmentSqrtN op.
+func SparseSegmentSqrtNGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "BiasAddGrad",
+		Type: "SparseSegmentSqrtNGrad",
 		Input: []tf.Input{
-			out_backprop,
+			grad, indices, segment_ids, output_dim0,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns x + y element-wise.
+// Computes the mean along sparse segments of a tensor.
 //
-// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func AddV2(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "AddV2",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns x + y element-wise.
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
 //
-// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Add(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
+// dimension, selecting a subset of dimension 0, specified by `indices`.
+//
+// Arguments:
+//
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentMean(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Add",
+		Type: "SparseSegmentMean",
 		Input: []tf.Input{
-			x, y,
+			data, indices, segment_ids,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// NthElementAttr is an optional argument to NthElement.
-type NthElementAttr func(optionalAttr)
-
-// NthElementReverse sets the optional reverse attribute to value.
-//
-// value: When set to True, find the nth-largest value in the vector and vice
-// versa.
-// If not specified, defaults to false
-func NthElementReverse(value bool) NthElementAttr {
-	return func(m optionalAttr) {
-		m["reverse"] = value
-	}
-}
-
-// Finds values of the `n`-th order statistic for the last dimension.
-//
-// If the input is a vector (rank-1), finds the entries which is the nth-smallest
-// value in the vector and outputs their values as scalar tensor.
-//
-// For matrices (resp. higher rank input), computes the entries which is the
-// nth-smallest value in each row (resp. vector along the last dimension). Thus,
-//
-//     values.shape = input.shape[:-1]
+// Pop the element at the top of the stack.
 //
 // Arguments:
-//	input: 1-D or higher with last dimension at least `n+1`.
-//	n: 0-D. Position of sorted vector to select along the last dimension (along
-// each row for matrices). Valid range of n is `[0, input.shape[:-1])`
+//	handle: The handle to a stack.
+//	elem_type: The type of the elem that is popped.
 //
-// Returns The `n`-th order statistic along each last dimensional slice.
-func NthElement(scope *Scope, input tf.Output, n tf.Output, optional ...NthElementAttr) (values tf.Output) {
+// Returns The tensor that is popped from the top of the stack.
+func StackPopV2(scope *Scope, handle tf.Output, elem_type tf.DataType) (elem tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"elem_type": elem_type}
 	opspec := tf.OpSpec{
-		Type: "NthElement",
+		Type: "StackPopV2",
 		Input: []tf.Input{
-			input, n,
+			handle,
 		},
 		Attrs: attrs,
 	}
@@ -1879,337 +2047,383 @@ func NthElement(scope *Scope, input tf.Output, n tf.Output, optional ...NthEleme
 	return op.Output(0)
 }
 
-// Computes the Max along segments of a tensor.
+// Computes the sum along sparse segments of a tensor.
+//
+// Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is
+// misisng, the `output` tensor at that position will be zeroed.
 //
 // Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
 // segments.
 //
-// This operator is similar to the [unsorted segment sum operator](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
-// Instead of computing the sum over segments, it computes the maximum
-// such that:
+// For example:
 //
-// \\(output_i = \max_j data_j\\) where max is over `j` such
-// that `segment_ids[j] == i`.
+// ```python
+// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
 //
-// If the maximum is empty for a given segment ID `i`, it outputs the smallest possible value for specific numeric type,
-//  `output[i] = numeric_limits<T>::min()`.
+// tf.sparse_segment_sum_with_num_segments(
+//     c, tf.constant([0, 1]), tf.constant([0, 0]), num_segments=3)
+// # => [[0 0 0 0]
+// #     [0 0 0 0]
+// #     [0 0 0 0]]
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentMax.png" alt>
-// </div>
+// tf.sparse_segment_sum_with_num_segments(c,
+//                                         tf.constant([0, 1]),
+//                                         tf.constant([0, 2],
+//                                         num_segments=4))
+// # => [[ 1  2  3  4]
+// #     [ 0  0  0  0]
+// #     [-1 -2 -3 -4]
+// #     [ 0  0  0  0]]
+// ```
 //
 // Arguments:
 //
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.
-//
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//	num_segments: Should equal the number of distinct segment IDs.
 //
 // Returns Has same shape as data, except for dimension 0 which
 // has size `num_segments`.
-func UnsortedSegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+func SparseSegmentSumWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "UnsortedSegmentMax",
+		Type: "SparseSegmentSumWithNumSegments",
 		Input: []tf.Input{
-			data, segment_ids, num_segments,
+			data, indices, segment_ids, num_segments,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes exponential of x element-wise.  \\(y = e^x\\).
-func Exp(scope *Scope, x tf.Output) (y tf.Output) {
+// SparseToDenseAttr is an optional argument to SparseToDense.
+type SparseToDenseAttr func(optionalAttr)
+
+// SparseToDenseValidateIndices sets the optional validate_indices attribute to value.
+//
+// value: If true, indices are checked to make sure they are sorted in
+// lexicographic order and that there are no repeats.
+// If not specified, defaults to true
+func SparseToDenseValidateIndices(value bool) SparseToDenseAttr {
+	return func(m optionalAttr) {
+		m["validate_indices"] = value
+	}
+}
+
+// Converts a sparse representation into a dense tensor.
+//
+// Builds an array `dense` with shape `output_shape` such that
+//
+// ```
+// # If sparse_indices is scalar
+// dense[i] = (i == sparse_indices ? sparse_values : default_value)
+//
+// # If sparse_indices is a vector, then for each i
+// dense[sparse_indices[i]] = sparse_values[i]
+//
+// # If sparse_indices is an n by d matrix, then for each i in [0, n)
+// dense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i]
+// ```
+//
+// All other values in `dense` are set to `default_value`.  If `sparse_values` is a
+// scalar, all sparse indices are set to this single value.
+//
+// Indices should be sorted in lexicographic order, and indices must not
+// contain any repeats. If `validate_indices` is true, these properties
+// are checked during execution.
+//
+// Arguments:
+//	sparse_indices: 0-D, 1-D, or 2-D.  `sparse_indices[i]` contains the complete
+// index where `sparse_values[i]` will be placed.
+//	output_shape: 1-D.  Shape of the dense output tensor.
+//	sparse_values: 1-D.  Values corresponding to each row of `sparse_indices`,
+// or a scalar value to be used for all sparse indices.
+//	default_value: Scalar value to set for indices not specified in
+// `sparse_indices`.
+//
+// Returns Dense output tensor of shape `output_shape`.
+func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Output, sparse_values tf.Output, default_value tf.Output, optional ...SparseToDenseAttr) (dense tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Exp",
+		Type: "SparseToDense",
 		Input: []tf.Input{
-			x,
+			sparse_indices, output_shape, sparse_values, default_value,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns an element-wise indication of the sign of a number.
+// Counts the number of occurrences of each value in an integer array.
 //
-// `y = sign(x) = -1` if `x < 0`; 0 if `x == 0`; 1 if `x > 0`.
+// Outputs a vector with length `size` and the same dtype as `weights`. If
+// `weights` are empty, then index `i` stores the number of times the value `i` is
+// counted in `arr`. If `weights` are non-empty, then index `i` stores the sum of
+// the value in `weights` at each index where the corresponding value in `arr` is
+// `i`.
 //
-// For complex numbers, `y = sign(x) = x / |x|` if `x != 0`, otherwise `y = 0`.
-func Sign(scope *Scope, x tf.Output) (y tf.Output) {
+// Values in `arr` outside of the range [0, size) are ignored.
+//
+// Arguments:
+//	arr: int32 `Tensor`.
+//	size: non-negative int32 scalar `Tensor`.
+//	weights: is an int32, int64, float32, or float64 `Tensor` with the same
+// shape as `arr`, or a length-0 `Tensor`, in which case it acts as all weights
+// equal to 1.
+//
+// Returns 1D `Tensor` with length equal to `size`. The counts or summed weights for
+// each value in the range [0, size).
+func Bincount(scope *Scope, arr tf.Output, size tf.Output, weights tf.Output) (bins tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Sign",
+		Type: "Bincount",
 		Input: []tf.Input{
-			x,
+			arr, size, weights,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizedAddAttr is an optional argument to QuantizedAdd.
-type QuantizedAddAttr func(optionalAttr)
-
-// QuantizedAddToutput sets the optional Toutput attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedAddToutput(value tf.DataType) QuantizedAddAttr {
-	return func(m optionalAttr) {
-		m["Toutput"] = value
-	}
-}
-
-// Returns x + y element-wise, working on quantized buffers.
+// Computes the sum along sparse segments of a tensor.
 //
-// Arguments:
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
 //
+// Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
+// dimension, selecting a subset of dimension 0, specified by `indices`.
 //
-//	min_x: The float value that the lowest quantized `x` value represents.
-//	max_x: The float value that the highest quantized `x` value represents.
-//	min_y: The float value that the lowest quantized `y` value represents.
-//	max_y: The float value that the highest quantized `y` value represents.
+// For example:
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+// ```python
+// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
 //
-// *NOTE*: `QuantizedAdd` supports limited forms of broadcasting. More about
-// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func QuantizedAdd(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedAddAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) {
+// # Select two rows, one segment.
+// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0]))
+// # => [[0 0 0 0]]
+//
+// # Select two rows, two segment.
+// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1]))
+// # => [[ 1  2  3  4]
+// #     [-1 -2 -3 -4]]
+//
+// # Select all rows, two segments.
+// tf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1]))
+// # => [[0 0 0 0]
+// #     [5 6 7 8]]
+//
+// # Which is equivalent to:
+// tf.segment_sum(c, tf.constant([0, 0, 1]))
+// ```
+//
+// Arguments:
+//
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentSum(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedAdd",
+		Type: "SparseSegmentSum",
 		Input: []tf.Input{
-			x, y, min_x, max_x, min_y, max_y,
+			data, indices, segment_ids,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// ArgMinAttr is an optional argument to ArgMin.
-type ArgMinAttr func(optionalAttr)
-
-// ArgMinOutputType sets the optional output_type attribute to value.
-// If not specified, defaults to DT_INT64
-func ArgMinOutputType(value tf.DataType) ArgMinAttr {
-	return func(m optionalAttr) {
-		m["output_type"] = value
-	}
+	return op.Output(0)
 }
 
-// Returns the index with the smallest value across dimensions of a tensor.
-//
-// Note that in case of ties the identity of the return value is not guaranteed.
-//
-// Arguments:
-//
-//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
-// Describes which dimension of the input Tensor to reduce across. For vectors,
-// use dimension = 0.
-func ArgMin(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMinAttr) (output tf.Output) {
+// Computes hyperbolic sine of x element-wise.
+func Sinh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ArgMin",
+		Type: "Sinh",
 		Input: []tf.Input{
-			input, dimension,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Convert the quantized 'input' tensor into a lower-precision 'output', using the
-//
-// output range specified with 'requested_output_min' and 'requested_output_max'.
-//
-// [input_min, input_max] are scalar floats that specify the range for the float
-// interpretation of the 'input' data. For example, if input_min is -1.0f and
-// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
-// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
-//
-// Arguments:
-//
-//	input_min: The float value that the minimum quantized input value represents.
-//	input_max: The float value that the maximum quantized input value represents.
-//	requested_output_min: The float value that the minimum quantized output value represents.
-//	requested_output_max: The float value that the maximum quantized output value represents.
-//	out_type: The type of the output. Should be a lower bit depth than Tinput.
-//
-// Returns The requested_output_min value is copied into this output.The requested_output_max value is copied into this output.
-func Requantize(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, requested_output_min tf.Output, requested_output_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+// Computes rectified linear 6: `min(max(features, 0), 6)`.
+func Relu6(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "Requantize",
+		Type: "Relu6",
 		Input: []tf.Input{
-			input, input_min, input_max, requested_output_min, requested_output_max,
+			features,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Computes the determinant of one or more square matrices.
+// Computes the sum along segments of a tensor.
 //
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. The output is a tensor containing the determinants
-// for all input submatrices `[..., :, :]`.
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// Computes a tensor such that
+// `(output[i] = sum_{j...} data[j...]` where the sum is over tuples `j...` such
+// that `segment_ids[j...] == i`.  Unlike `SegmentSum`, `segment_ids`
+// need not be sorted and need not cover all values in the full
+// range of valid values.
+//
+// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
+// If the given segment ID `i` is negative, the value is dropped and will not be
+// added to the sum of the segment.
+//
+// `num_segments` should equal the number of distinct segment IDs.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt>
+// </div>
 //
 // Arguments:
-//	input: Shape is `[..., M, M]`.
 //
-// Returns Shape is `[...]`.
-func MatrixDeterminant(scope *Scope, input tf.Output) (output tf.Output) {
+//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
+//
+//
+// Returns Has same shape as data, except for the first `segment_ids.rank`
+// dimensions, which are replaced with a single dimension which has size
+// `num_segments`.
+func UnsortedSegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixDeterminant",
+		Type: "UnsortedSegmentSum",
 		Input: []tf.Input{
-			input,
+			data, segment_ids, num_segments,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes sin of x element-wise.
-func Sin(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
+// ResourceStridedSliceAssignAttr is an optional argument to ResourceStridedSliceAssign.
+type ResourceStridedSliceAssignAttr func(optionalAttr)
+
+// ResourceStridedSliceAssignBeginMask sets the optional begin_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignBeginMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["begin_mask"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "Sin",
-		Input: []tf.Input{
-			x,
-		},
+}
+
+// ResourceStridedSliceAssignEndMask sets the optional end_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignEndMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["end_mask"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the complementary error function of `x` element-wise.
-func Erfc(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
+// ResourceStridedSliceAssignEllipsisMask sets the optional ellipsis_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignEllipsisMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["ellipsis_mask"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "Erfc",
-		Input: []tf.Input{
-			x,
-		},
+}
+
+// ResourceStridedSliceAssignNewAxisMask sets the optional new_axis_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignNewAxisMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["new_axis_mask"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes Psi, the derivative of Lgamma (the log of the absolute value of
+// ResourceStridedSliceAssignShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignShrinkAxisMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["shrink_axis_mask"] = value
+	}
+}
+
+// Assign `value` to the sliced l-value reference of `ref`.
 //
-// `Gamma(x)`), element-wise.
-func Digamma(scope *Scope, x tf.Output) (y tf.Output) {
+// The values of `value` are assigned to the positions in the variable
+// `ref` that are selected by the slice parameters. The slice parameters
+// `begin, `end`, `strides`, etc. work exactly as in `StridedSlice`.
+//
+// NOTE this op currently does not support broadcasting and so `value`'s
+// shape must be exactly the shape produced by the slice of `ref`.
+//
+// Returns the created operation.
+func ResourceStridedSliceAssign(scope *Scope, ref tf.Output, begin tf.Output, end tf.Output, strides tf.Output, value tf.Output, optional ...ResourceStridedSliceAssignAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Digamma",
+		Type: "ResourceStridedSliceAssign",
 		Input: []tf.Input{
-			x,
+			ref, begin, end, strides, value,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Conv2DBackpropFilterAttr is an optional argument to Conv2DBackpropFilter.
-type Conv2DBackpropFilterAttr func(optionalAttr)
+// ArgMaxAttr is an optional argument to ArgMax.
+type ArgMaxAttr func(optionalAttr)
 
-// Conv2DBackpropFilterUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
-// If not specified, defaults to true
-func Conv2DBackpropFilterUseCudnnOnGpu(value bool) Conv2DBackpropFilterAttr {
+// ArgMaxOutputType sets the optional output_type attribute to value.
+// If not specified, defaults to DT_INT64
+func ArgMaxOutputType(value tf.DataType) ArgMaxAttr {
 	return func(m optionalAttr) {
-		m["use_cudnn_on_gpu"] = value
+		m["output_type"] = value
 	}
 }
 
-// Conv2DBackpropFilterDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Conv2DBackpropFilterDilations sets the optional dilations attribute to value.
+// Returns the index with the largest value across dimensions of a tensor.
 //
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
-// element on that dimension. The dimension order is determined by the value of
-// `data_format`, see above for details. Dilations in the batch and depth
-// dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes the gradients of convolution with respect to the filter.
+// Note that in case of ties the identity of the return value is not guaranteed.
 //
 // Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
-//	filter_sizes: An integer vector representing the tensor shape of `filter`,
-// where `filter` is a 4-D
-// `[filter_height, filter_width, in_channels, out_channels]` tensor.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution. Must be in the same order as the dimension specified with
-// format.
-//	padding: The type of padding algorithm to use.
 //
-// Returns 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
-// the `filter` input of the convolution.
-func Conv2DBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropFilterAttr) (output tf.Output) {
+//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
+// Describes which dimension of the input Tensor to reduce across. For vectors,
+// use dimension = 0.
+func ArgMax(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMaxAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv2DBackpropFilter",
+		Type: "ArgMax",
 		Input: []tf.Input{
-			input, filter_sizes, out_backprop,
+			input, dimension,
 		},
 		Attrs: attrs,
 	}
@@ -2217,241 +2431,384 @@ func Conv2DBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output,
 	return op.Output(0)
 }
 
-// Returns the number of work units this Reader has finished processing.
+// Returns which elements of x are finite.
 //
-// Arguments:
-//	reader_handle: Handle to a Reader.
-func ReaderNumWorkUnitsCompletedV2(scope *Scope, reader_handle tf.Output) (units_completed tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.isfinite
+// @end_compatibility
+func IsFinite(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReaderNumWorkUnitsCompletedV2",
+		Type: "IsFinite",
 		Input: []tf.Input{
-			reader_handle,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the log of the absolute value of `Gamma(x)` element-wise.
-func Lgamma(scope *Scope, x tf.Output) (y tf.Output) {
+// MatMulAttr is an optional argument to MatMul.
+type MatMulAttr func(optionalAttr)
+
+// MatMulTransposeA sets the optional transpose_a attribute to value.
+//
+// value: If true, "a" is transposed before multiplication.
+// If not specified, defaults to false
+func MatMulTransposeA(value bool) MatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_a"] = value
+	}
+}
+
+// MatMulTransposeB sets the optional transpose_b attribute to value.
+//
+// value: If true, "b" is transposed before multiplication.
+// If not specified, defaults to false
+func MatMulTransposeB(value bool) MatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_b"] = value
+	}
+}
+
+// Multiply the matrix "a" by the matrix "b".
+//
+// The inputs must be two-dimensional matrices and the inner dimension of
+// "a" (after being transposed if transpose_a is true) must match the
+// outer dimension of "b" (after being transposed if transposed_b is
+// true).
+//
+// *Note*: The default kernel implementation for MatMul on GPUs uses
+// cublas.
+func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (product tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Lgamma",
+		Type: "MatMul",
 		Input: []tf.Input{
-			x,
+			a, b,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the reverse mode backpropagated gradient of the Cholesky algorithm.
+// Selects elements from `x` or `y`, depending on `condition`.
 //
-// For an explanation see "Differentiation of the Cholesky algorithm" by
-// Iain Murray http://arxiv.org/abs/1602.07527.
+// The `x`, and `y` tensors must all have the same shape, and the
+// output will also have that shape.
+//
+// The `condition` tensor must be a scalar if `x` and `y` are scalars.
+// If `x` and `y` are vectors or higher rank, then `condition` must be either a
+// scalar, a vector with size matching the first dimension of `x`, or must have
+// the same shape as `x`.
+//
+// The `condition` tensor acts as a mask that chooses, based on the value at each
+// element, whether the corresponding element / row in the output should be
+// taken from `x` (if true) or `y` (if false).
+//
+// If `condition` is a vector and `x` and `y` are higher rank matrices, then
+// it chooses which row (outer dimension) to copy from `x` and `y`.
+// If `condition` has the same shape as `x` and `y`, then it chooses which
+// element to copy from `x` and `y`.
+//
+// For example:
+//
+// ```python
+// # 'condition' tensor is [[True,  False]
+// #                        [False, True]]
+// # 't' is [[1, 2],
+// #         [3, 4]]
+// # 'e' is [[5, 6],
+// #         [7, 8]]
+// select(condition, t, e)  # => [[1, 6], [7, 4]]
+//
+//
+// # 'condition' tensor is [True, False]
+// # 't' is [[1, 2],
+// #         [3, 4]]
+// # 'e' is [[5, 6],
+// #         [7, 8]]
+// select(condition, t, e) ==> [[1, 2],
+//                              [7, 8]]
+//
+// ```
 //
 // Arguments:
-//	l: Output of batch Cholesky algorithm l = cholesky(A). Shape is `[..., M, M]`.
-// Algorithm depends only on lower triangular part of the innermost matrices of
-// this tensor.
-//	grad: df/dl where f is some scalar function. Shape is `[..., M, M]`.
-// Algorithm depends only on lower triangular part of the innermost matrices of
-// this tensor.
 //
-// Returns Symmetrized version of df/dA . Shape is `[..., M, M]`
-func CholeskyGrad(scope *Scope, l tf.Output, grad tf.Output) (output tf.Output) {
+//	x: = A `Tensor` which may have the same shape as `condition`.
+// If `condition` is rank 1, `x` may have higher rank,
+// but its first dimension must match the size of `condition`.
+//	y: = A `Tensor` with the same type and shape as `x`.
+//
+// Returns = A `Tensor` with the same type and shape as `x` and `y`.
+func Select(scope *Scope, condition tf.Output, x tf.Output, y tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "CholeskyGrad",
+		Type: "Select",
 		Input: []tf.Input{
-			l, grad,
+			condition, x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes inverse hyperbolic cosine of x element-wise.
-func Acosh(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns the truth value of x OR y element-wise.
+//
+// *NOTE*: `LogicalOr` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func LogicalOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Acosh",
+		Type: "LogicalOr",
 		Input: []tf.Input{
-			x,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SerializeManySparseAttr is an optional argument to SerializeManySparse.
-type SerializeManySparseAttr func(optionalAttr)
-
-// SerializeManySparseOutType sets the optional out_type attribute to value.
+// Compute the regularized incomplete beta integral \\(I_x(a, b)\\).
 //
-// value: The `dtype` to use for serialization; the supported types are `string`
-// (default) and `variant`.
-// If not specified, defaults to DT_STRING
-func SerializeManySparseOutType(value tf.DataType) SerializeManySparseAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
+// The regularized incomplete beta integral is defined as:
+//
+//
+// \\(I_x(a, b) = \frac{B(x; a, b)}{B(a, b)}\\)
+//
+// where
+//
+//
+// \\(B(x; a, b) = \int_0^x t^{a-1} (1 - t)^{b-1} dt\\)
+//
+//
+// is the incomplete beta function and \\(B(a, b)\\) is the *complete*
+// beta function.
+func Betainc(scope *Scope, a tf.Output, b tf.Output, x tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Betainc",
+		Input: []tf.Input{
+			a, b, x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor` object.
+// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
 //
-// The `SparseTensor` must have rank `R` greater than 1, and the first dimension
-// is treated as the minibatch dimension.  Elements of the `SparseTensor`
-// must be sorted in increasing order of this first dimension.  The serialized
-// `SparseTensor` objects going into each row of `serialized_sparse` will have
-// rank `R-1`.
+// N is the size of the segment being reduced.
 //
-// The minibatch size `N` is extracted from `sparse_shape[0]`.
+// Like `SparseSegmentSqrtN`, but allows missing ids in `segment_ids`. If an id is
+// misisng, the `output` tensor at that position will be zeroed.
+//
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
 //
 // Arguments:
-//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
-//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
-func SerializeManySparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeManySparseAttr) (serialized_sparse tf.Output) {
+//
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//	num_segments: Should equal the number of distinct segment IDs.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentSqrtNWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "SerializeManySparse",
+		Type: "SparseSegmentSqrtNWithNumSegments",
 		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
+			data, indices, segment_ids, num_segments,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TensorArrayV2Attr is an optional argument to TensorArrayV2.
-type TensorArrayV2Attr func(optionalAttr)
-
-// TensorArrayV2ElementShape sets the optional element_shape attribute to value.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayV2ElementShape(value tf.Shape) TensorArrayV2Attr {
-	return func(m optionalAttr) {
-		m["element_shape"] = value
+// Compute the upper regularized incomplete Gamma function `Q(a, x)`.
+//
+// The upper regularized incomplete Gamma function is defined as:
+//
+// \\(Q(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)\\)
+//
+// where
+//
+// \\(Gamma(a, x) = int_{x}^{\infty} t^{a-1} exp(-t) dt\\)
+//
+// is the upper incomplete Gama function.
+//
+// Note, above `P(a, x)` (`Igamma`) is the lower regularized complete
+// Gamma function.
+func Igammac(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// TensorArrayV2DynamicSize sets the optional dynamic_size attribute to value.
-// If not specified, defaults to false
-func TensorArrayV2DynamicSize(value bool) TensorArrayV2Attr {
-	return func(m optionalAttr) {
-		m["dynamic_size"] = value
+	opspec := tf.OpSpec{
+		Type: "Igammac",
+		Input: []tf.Input{
+			a, x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// TensorArrayV2ClearAfterRead sets the optional clear_after_read attribute to value.
-// If not specified, defaults to true
-func TensorArrayV2ClearAfterRead(value bool) TensorArrayV2Attr {
+// LogUniformCandidateSamplerAttr is an optional argument to LogUniformCandidateSampler.
+type LogUniformCandidateSamplerAttr func(optionalAttr)
+
+// LogUniformCandidateSamplerSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func LogUniformCandidateSamplerSeed(value int64) LogUniformCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["clear_after_read"] = value
+		m["seed"] = value
 	}
 }
 
-// TensorArrayV2TensorArrayName sets the optional tensor_array_name attribute to value.
-// If not specified, defaults to ""
-func TensorArrayV2TensorArrayName(value string) TensorArrayV2Attr {
+// LogUniformCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func LogUniformCandidateSamplerSeed2(value int64) LogUniformCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["tensor_array_name"] = value
+		m["seed2"] = value
 	}
 }
 
-// Deprecated. Use TensorArrayV3
+// Generates labels for candidate sampling with a log-uniform distribution.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArrayV3
-func TensorArrayV2(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV2Attr) (handle tf.Output) {
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
+//
+// For each batch, this op picks a single set of sampled candidate labels.
+//
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
+//
+// Arguments:
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to randomly sample.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//	range_max: The sampler will sample integers from the interval [0, range_max).
+//
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func LogUniformCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LogUniformCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayV2",
+		Type: "LogUniformCandidateSampler",
 		Input: []tf.Input{
-			size,
+			true_classes,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes the mean along sparse segments of a tensor.
-//
-// Like `SparseSegmentMean`, but allows missing ids in `segment_ids`. If an id is
-// misisng, the `output` tensor at that position will be zeroed.
-//
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
-//
-// Arguments:
-//
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//	num_segments: Should equal the number of distinct segment IDs.
+// Returns (x - y)(x - y) element-wise.
 //
-// Returns Has same shape as data, except for dimension 0 which has size
-// `num_segments`.
-func SparseSegmentMeanWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+// *NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func SquaredDifference(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentMeanWithNumSegments",
+		Type: "SquaredDifference",
 		Input: []tf.Input{
-			data, indices, segment_ids, num_segments,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes hyperbolic cosine of x element-wise.
-func Cosh(scope *Scope, x tf.Output) (y tf.Output) {
+// Forwards the input to the output.
+//
+// This operator represents the loop termination condition used by the
+// "pivot" switches of a loop.
+//
+// Arguments:
+//	input: A boolean scalar, representing the branch predicate of the Switch op.
+//
+// Returns The same tensor as `input`.
+func LoopCond(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Cosh",
+		Type: "LoopCond",
 		Input: []tf.Input{
-			x,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that emits each dim-0 slice of `components` once.
-func TensorSliceDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
+// ApproximateEqualAttr is an optional argument to ApproximateEqual.
+type ApproximateEqualAttr func(optionalAttr)
+
+// ApproximateEqualTolerance sets the optional tolerance attribute to value.
+// If not specified, defaults to 1e-05
+func ApproximateEqualTolerance(value float32) ApproximateEqualAttr {
+	return func(m optionalAttr) {
+		m["tolerance"] = value
+	}
+}
+
+// Returns the truth value of abs(x-y) < tolerance element-wise.
+func ApproximateEqual(scope *Scope, x tf.Output, y tf.Output, optional ...ApproximateEqualAttr) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_shapes": output_shapes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TensorSliceDataset",
+		Type: "ApproximateEqual",
 		Input: []tf.Input{
-			tf.OutputList(components),
+			x, y,
 		},
 		Attrs: attrs,
 	}
@@ -2459,73 +2816,72 @@ func TensorSliceDataset(scope *Scope, components []tf.Output, output_shapes []tf
 	return op.Output(0)
 }
 
-// Computes natural logarithm of (1 + x) element-wise.
+// Returns x / y element-wise.
 //
-// I.e., \\(y = \log_e (1 + x)\\).
-func Log1p(scope *Scope, x tf.Output) (y tf.Output) {
+// *NOTE*: `Div` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Div(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Log1p",
+		Type: "Div",
 		Input: []tf.Input{
-			x,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes rectified linear 6 gradients for a Relu6 operation.
-//
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding Relu6 operation.
-//	features: The features passed as input to the corresponding Relu6 operation, or
-// its output; using either one produces the same result.
+// Returns x * y element-wise.
 //
-// Returns The gradients:
-// `gradients * (features > 0) * (features < 6)`.
-func Relu6Grad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+// *NOTE*: `Multiply` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Mul(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Relu6Grad",
+		Type: "Mul",
 		Input: []tf.Input{
-			gradients, features,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResizeBicubicAttr is an optional argument to ResizeBicubic.
-type ResizeBicubicAttr func(optionalAttr)
+// BiasAddAttr is an optional argument to BiasAdd.
+type BiasAddAttr func(optionalAttr)
 
-// ResizeBicubicAlignCorners sets the optional align_corners attribute to value.
+// BiasAddDataFormat sets the optional data_format attribute to value.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
-func ResizeBicubicAlignCorners(value bool) ResizeBicubicAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the bias tensor will be added to the last dimension
+// of the value tensor.
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// The tensor will be added to "in_channels", the third-to-the-last
+//     dimension.
+// If not specified, defaults to "NHWC"
+func BiasAddDataFormat(value string) BiasAddAttr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["data_format"] = value
 	}
 }
 
-// Resize `images` to `size` using bicubic interpolation.
+// Adds `bias` to `value`.
 //
-// Input images can be of different types but output images are always float.
+// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
+// Broadcasting is supported, so `value` may have any number of dimensions.
 //
 // Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
+//	value: Any number of dimensions.
+//	bias: 1-D with size the last dimension of `value`.
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeBicubic(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBicubicAttr) (resized_images tf.Output) {
+// Returns Broadcasted sum of `value` and `bias`.
+func BiasAdd(scope *Scope, value tf.Output, bias tf.Output, optional ...BiasAddAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -2534,9 +2890,9 @@ func ResizeBicubic(scope *Scope, images tf.Output, size tf.Output, optional ...R
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeBicubic",
+		Type: "BiasAdd",
 		Input: []tf.Input{
-			images, size,
+			value, bias,
 		},
 		Attrs: attrs,
 	}
@@ -2544,235 +2900,126 @@ func ResizeBicubic(scope *Scope, images tf.Output, size tf.Output, optional ...R
 	return op.Output(0)
 }
 
-// Computes natural logarithm of x element-wise.
+// SparseReduceSumSparseAttr is an optional argument to SparseReduceSumSparse.
+type SparseReduceSumSparseAttr func(optionalAttr)
+
+// SparseReduceSumSparseKeepDims sets the optional keep_dims attribute to value.
 //
-// I.e., \\(y = \log_e x\\).
-func Log(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Log",
-		Input: []tf.Input{
-			x,
-		},
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceSumSparseKeepDims(value bool) SparseReduceSumSparseAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Rounds the values of a tensor to the nearest integer, element-wise.
-//
-// Rounds half to even.  Also known as bankers rounding. If you want to round
-// according to the current system rounding mode use std::cint.
-func Round(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Round",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// RecordInputAttr is an optional argument to RecordInput.
-type RecordInputAttr func(optionalAttr)
-
-// RecordInputFileRandomSeed sets the optional file_random_seed attribute to value.
-//
-// value: Random seeds used to produce randomized records.
-// If not specified, defaults to 301
-func RecordInputFileRandomSeed(value int64) RecordInputAttr {
-	return func(m optionalAttr) {
-		m["file_random_seed"] = value
-	}
-}
-
-// RecordInputFileShuffleShiftRatio sets the optional file_shuffle_shift_ratio attribute to value.
-//
-// value: Shifts the list of files after the list is randomly
-// shuffled.
-// If not specified, defaults to 0
-func RecordInputFileShuffleShiftRatio(value float32) RecordInputAttr {
-	return func(m optionalAttr) {
-		m["file_shuffle_shift_ratio"] = value
-	}
-}
-
-// RecordInputFileBufferSize sets the optional file_buffer_size attribute to value.
-//
-// value: The randomization shuffling buffer.
-// If not specified, defaults to 10000
-func RecordInputFileBufferSize(value int64) RecordInputAttr {
-	return func(m optionalAttr) {
-		m["file_buffer_size"] = value
-	}
-}
-
-// RecordInputFileParallelism sets the optional file_parallelism attribute to value.
+// Computes the sum of elements across dimensions of a SparseTensor.
 //
-// value: How many sstables are opened and concurrently iterated over.
-// If not specified, defaults to 16
-func RecordInputFileParallelism(value int64) RecordInputAttr {
-	return func(m optionalAttr) {
-		m["file_parallelism"] = value
-	}
-}
-
-// RecordInputBatchSize sets the optional batch_size attribute to value.
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_sum()`.  In contrast to SparseReduceSum, this Op returns a
+// SparseTensor.
 //
-// value: The batch size.
-// If not specified, defaults to 32
-func RecordInputBatchSize(value int64) RecordInputAttr {
-	return func(m optionalAttr) {
-		m["batch_size"] = value
-	}
-}
-
-// RecordInputCompressionType sets the optional compression_type attribute to value.
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
 //
-// value: The type of compression for the file. Currently ZLIB and
-// GZIP are supported. Defaults to none.
-// If not specified, defaults to ""
-func RecordInputCompressionType(value string) RecordInputAttr {
-	return func(m optionalAttr) {
-		m["compression_type"] = value
-	}
-}
-
-// Emits randomized records.
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
 //
 // Arguments:
-//	file_pattern: Glob pattern for the data files.
-//
-// Returns A tensor of shape [batch_size].
-func RecordInput(scope *Scope, file_pattern string, optional ...RecordInputAttr) (records tf.Output) {
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+func SparseReduceSumSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"file_pattern": file_pattern}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RecordInput",
-
+		Type: "SparseReduceSumSparse",
+		Input: []tf.Input{
+			input_indices, input_values, input_shape, reduction_axes,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes reciprocal of square root of x element-wise.
+// Returns x + y element-wise.
 //
-// I.e., \\(y = 1 / \sqrt{x}\\).
-func Rsqrt(scope *Scope, x tf.Output) (y tf.Output) {
+// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func AddV2(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Rsqrt",
+		Type: "AddV2",
 		Input: []tf.Input{
-			x,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Inserts a dimension of 1 into a tensor's shape.
-//
-// Given a tensor `input`, this operation inserts a dimension of 1 at the
-// dimension index `axis` of `input`'s shape. The dimension index `axis` starts at
-// zero; if you specify a negative number for `axis` it is counted backward from
-// the end.
-//
-// This operation is useful if you want to add a batch dimension to a single
-// element. For example, if you have a single image of shape `[height, width,
-// channels]`, you can make it a batch of 1 image with `expand_dims(image, 0)`,
-// which will make the shape `[1, height, width, channels]`.
-//
-// Other examples:
-//
-// ```
-// # 't' is a tensor of shape [2]
-// shape(expand_dims(t, 0)) ==> [1, 2]
-// shape(expand_dims(t, 1)) ==> [2, 1]
-// shape(expand_dims(t, -1)) ==> [2, 1]
-//
-// # 't2' is a tensor of shape [2, 3, 5]
-// shape(expand_dims(t2, 0)) ==> [1, 2, 3, 5]
-// shape(expand_dims(t2, 2)) ==> [2, 3, 1, 5]
-// shape(expand_dims(t2, 3)) ==> [2, 3, 5, 1]
-// ```
-//
-// This operation requires that:
-//
-// `-1-input.dims() <= dim <= input.dims()`
-//
-// This operation is related to `squeeze()`, which removes dimensions of
-// size 1.
-//
-// Arguments:
-//
-//	axis: 0-D (scalar). Specifies the dimension index at which to
-// expand the shape of `input`. Must be in the range
-// `[-rank(input) - 1, rank(input)]`.
+// Returns x + y element-wise.
 //
-// Returns Contains the same data as `input`, but its shape has an additional
-// dimension of size 1 added.
-func ExpandDims(scope *Scope, input tf.Output, axis tf.Output) (output tf.Output) {
+// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Add(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ExpandDims",
+		Type: "Add",
 		Input: []tf.Input{
-			input, axis,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MatrixInverseAttr is an optional argument to MatrixInverse.
-type MatrixInverseAttr func(optionalAttr)
+// NthElementAttr is an optional argument to NthElement.
+type NthElementAttr func(optionalAttr)
 
-// MatrixInverseAdjoint sets the optional adjoint attribute to value.
+// NthElementReverse sets the optional reverse attribute to value.
+//
+// value: When set to True, find the nth-largest value in the vector and vice
+// versa.
 // If not specified, defaults to false
-func MatrixInverseAdjoint(value bool) MatrixInverseAttr {
+func NthElementReverse(value bool) NthElementAttr {
 	return func(m optionalAttr) {
-		m["adjoint"] = value
+		m["reverse"] = value
 	}
 }
 
-// Computes the inverse of one or more square invertible matrices or their
-//
-// adjoints (conjugate transposes).
+// Finds values of the `n`-th order statistic for the last dimension.
 //
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. The output is a tensor of the same shape as the input
-// containing the inverse for all input submatrices `[..., :, :]`.
+// If the input is a vector (rank-1), finds the entries which is the nth-smallest
+// value in the vector and outputs their values as scalar tensor.
 //
-// The op uses LU decomposition with partial pivoting to compute the inverses.
+// For matrices (resp. higher rank input), computes the entries which is the
+// nth-smallest value in each row (resp. vector along the last dimension). Thus,
 //
-// If a matrix is not invertible there is no guarantee what the op does. It
-// may detect the condition and raise an exception or it may simply return a
-// garbage result.
+//     values.shape = input.shape[:-1]
 //
 // Arguments:
-//	input: Shape is `[..., M, M]`.
-//
-// Returns Shape is `[..., M, M]`.
+//	input: 1-D or higher with last dimension at least `n+1`.
+//	n: 0-D. Position of sorted vector to select along the last dimension (along
+// each row for matrices). Valid range of n is `[0, input.shape[:-1])`
 //
-// @compatibility(numpy)
-// Equivalent to np.linalg.inv
-// @end_compatibility
-func MatrixInverse(scope *Scope, input tf.Output, optional ...MatrixInverseAttr) (output tf.Output) {
+// Returns The `n`-th order statistic along each last dimensional slice.
+func NthElement(scope *Scope, input tf.Output, n tf.Output, optional ...NthElementAttr) (values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -2781,9 +3028,9 @@ func MatrixInverse(scope *Scope, input tf.Output, optional ...MatrixInverseAttr)
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixInverse",
+		Type: "NthElement",
 		Input: []tf.Input{
-			input,
+			input, n,
 		},
 		Attrs: attrs,
 	}
@@ -2791,50 +3038,73 @@ func MatrixInverse(scope *Scope, input tf.Output, optional ...MatrixInverseAttr)
 	return op.Output(0)
 }
 
-// Computes square of x element-wise.
+// Computes the Max along segments of a tensor.
 //
-// I.e., \\(y = x * x = x^2\\).
-func Square(scope *Scope, x tf.Output) (y tf.Output) {
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// This operator is similar to the [unsorted segment sum operator](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+// Instead of computing the sum over segments, it computes the maximum
+// such that:
+//
+// \\(output_i = \max_j data_j\\) where max is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the maximum is empty for a given segment ID `i`, it outputs the smallest possible value for specific numeric type,
+//  `output[i] = numeric_limits<T>::min()`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentMax.png" alt>
+// </div>
+//
+// Arguments:
+//
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.
+//
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `num_segments`.
+func UnsortedSegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Square",
+		Type: "UnsortedSegmentMax",
 		Input: []tf.Input{
-			x,
+			data, segment_ids, num_segments,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes exponential linear: `exp(features) - 1` if < 0, `features` otherwise.
-//
-// See [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)
-// ](http://arxiv.org/abs/1511.07289)
-func Elu(scope *Scope, features tf.Output) (activations tf.Output) {
-	if scope.Err() != nil {
+// Computes exponential of x element-wise.  \\(y = e^x\\).
+func Exp(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Elu",
+		Type: "Exp",
 		Input: []tf.Input{
-			features,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the reciprocal of x element-wise.
+// Returns an element-wise indication of the sign of a number.
 //
-// I.e., \\(y = 1 / x\\).
-func Reciprocal(scope *Scope, x tf.Output) (y tf.Output) {
+// `y = sign(x) = -1` if `x < 0`; 0 if `x == 0`; 1 if `x > 0`.
+//
+// For complex numbers, `y = sign(x) = x / |x|` if `x != 0`, otherwise `y = 0`.
+func Sign(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Reciprocal",
+		Type: "Sign",
 		Input: []tf.Input{
 			x,
 		},
@@ -2843,190 +3113,218 @@ func Reciprocal(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// OrderedMapClearAttr is an optional argument to OrderedMapClear.
-type OrderedMapClearAttr func(optionalAttr)
+// ArgMinAttr is an optional argument to ArgMin.
+type ArgMinAttr func(optionalAttr)
 
-// OrderedMapClearCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapClearCapacity(value int64) OrderedMapClearAttr {
+// ArgMinOutputType sets the optional output_type attribute to value.
+// If not specified, defaults to DT_INT64
+func ArgMinOutputType(value tf.DataType) ArgMinAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["output_type"] = value
 	}
 }
 
-// OrderedMapClearMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// Returns the index with the smallest value across dimensions of a tensor.
 //
-// REQUIRES: value >= 0
-func OrderedMapClearMemoryLimit(value int64) OrderedMapClearAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// OrderedMapClearContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func OrderedMapClearContainer(value string) OrderedMapClearAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// OrderedMapClearSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func OrderedMapClearSharedName(value string) OrderedMapClearAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op removes all elements in the underlying container.
+// Note that in case of ties the identity of the return value is not guaranteed.
 //
-// Returns the created operation.
-func OrderedMapClear(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapClearAttr) (o *tf.Operation) {
+// Arguments:
+//
+//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
+// Describes which dimension of the input Tensor to reduce across. For vectors,
+// use dimension = 0.
+func ArgMin(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMinAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapClear",
-
+		Type: "ArgMin",
+		Input: []tf.Input{
+			input, dimension,
+		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes the reciprocal of x element-wise.
+// Convert the quantized 'input' tensor into a lower-precision 'output', using the
 //
-// I.e., \\(y = 1 / x\\).
-func Inv(scope *Scope, x tf.Output) (y tf.Output) {
+// output range specified with 'requested_output_min' and 'requested_output_max'.
+//
+// [input_min, input_max] are scalar floats that specify the range for the float
+// interpretation of the 'input' data. For example, if input_min is -1.0f and
+// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
+// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
+//
+// Arguments:
+//
+//	input_min: The float value that the minimum quantized input value represents.
+//	input_max: The float value that the maximum quantized input value represents.
+//	requested_output_min: The float value that the minimum quantized output value represents.
+//	requested_output_max: The float value that the maximum quantized output value represents.
+//	out_type: The type of the output. Should be a lower bit depth than Tinput.
+//
+// Returns The requested_output_min value is copied into this output.The requested_output_max value is copied into this output.
+func Requantize(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, requested_output_min tf.Output, requested_output_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "Inv",
+		Type: "Requantize",
 		Input: []tf.Input{
-			x,
+			input, input_min, input_max, requested_output_min, requested_output_max,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// ComplexAbsAttr is an optional argument to ComplexAbs.
-type ComplexAbsAttr func(optionalAttr)
-
-// ComplexAbsTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func ComplexAbsTout(value tf.DataType) ComplexAbsAttr {
-	return func(m optionalAttr) {
-		m["Tout"] = value
+// Computes the determinant of one or more square matrices.
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor containing the determinants
+// for all input submatrices `[..., :, :]`.
+//
+// Arguments:
+//	input: Shape is `[..., M, M]`.
+//
+// Returns Shape is `[...]`.
+func MatrixDeterminant(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	opspec := tf.OpSpec{
+		Type: "MatrixDeterminant",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes the complex absolute value of a tensor.
-//
-// Given a tensor `x` of complex numbers, this operation returns a tensor of type
-// `float` or `double` that is the absolute value of each element in `x`. All
-// elements in `x` must be complex numbers of the form \\(a + bj\\). The absolute
-// value is computed as \\( \sqrt{a^2 + b^2}\\).
-func ComplexAbs(scope *Scope, x tf.Output, optional ...ComplexAbsAttr) (y tf.Output) {
+// Computes sin of x element-wise.
+func Sin(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ComplexAbs",
+		Type: "Sin",
 		Input: []tf.Input{
 			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the truth value of x AND y element-wise.
-//
-// *NOTE*: `LogicalAnd` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func LogicalAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Computes the complementary error function of `x` element-wise.
+func Erfc(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LogicalAnd",
+		Type: "Erfc",
 		Input: []tf.Input{
-			x, y,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Cast x of type SrcT to y of DstT.
-func Cast(scope *Scope, x tf.Output, DstT tf.DataType) (y tf.Output) {
+// Computes Psi, the derivative of Lgamma (the log of the absolute value of
+//
+// `Gamma(x)`), element-wise.
+func Digamma(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"DstT": DstT}
 	opspec := tf.OpSpec{
-		Type: "Cast",
+		Type: "Digamma",
 		Input: []tf.Input{
 			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MaxAttr is an optional argument to Max.
-type MaxAttr func(optionalAttr)
+// Conv2DBackpropFilterAttr is an optional argument to Conv2DBackpropFilter.
+type Conv2DBackpropFilterAttr func(optionalAttr)
 
-// MaxKeepDims sets the optional keep_dims attribute to value.
+// Conv2DBackpropFilterUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
+// If not specified, defaults to true
+func Conv2DBackpropFilterUseCudnnOnGpu(value bool) Conv2DBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["use_cudnn_on_gpu"] = value
+	}
+}
+
+// Conv2DBackpropFilterDataFormat sets the optional data_format attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func MaxKeepDims(value bool) MaxAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["data_format"] = value
 	}
 }
 
-// Computes the maximum of elements across dimensions of a tensor.
+// Conv2DBackpropFilterDilations sets the optional dilations attribute to value.
 //
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes the gradients of convolution with respect to the filter.
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
+//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
+//	filter_sizes: An integer vector representing the tensor shape of `filter`,
+// where `filter` is a 4-D
+// `[filter_height, filter_width, in_channels, out_channels]` tensor.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution. Must be in the same order as the dimension specified with
+// format.
+//	padding: The type of padding algorithm to use.
 //
-// Returns The reduced tensor.
-func Max(scope *Scope, input tf.Output, axis tf.Output, optional ...MaxAttr) (output tf.Output) {
+// Returns 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
+// the `filter` input of the convolution.
+func Conv2DBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropFilterAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Max",
+		Type: "Conv2DBackpropFilter",
 		Input: []tf.Input{
-			input, axis,
+			input, filter_sizes, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -3034,391 +3332,388 @@ func Max(scope *Scope, input tf.Output, axis tf.Output, optional ...MaxAttr) (ou
 	return op.Output(0)
 }
 
-// Quantized Batch normalization.
-//
-// This op is deprecated and will be removed in the future. Prefer
-// `tf.nn.batch_normalization`.
+// Returns the number of work units this Reader has finished processing.
 //
 // Arguments:
-//	t: A 4D input Tensor.
-//	t_min: The value represented by the lowest quantized input.
-//	t_max: The value represented by the highest quantized input.
-//	m: A 1D mean Tensor with size matching the last dimension of t.
-// This is the first output from tf.nn.moments,
-// or a saved moving average thereof.
-//	m_min: The value represented by the lowest quantized mean.
-//	m_max: The value represented by the highest quantized mean.
-//	v: A 1D variance Tensor with size matching the last dimension of t.
-// This is the second output from tf.nn.moments,
-// or a saved moving average thereof.
-//	v_min: The value represented by the lowest quantized variance.
-//	v_max: The value represented by the highest quantized variance.
-//	beta: A 1D beta Tensor with size matching the last dimension of t.
-// An offset to be added to the normalized tensor.
-//	beta_min: The value represented by the lowest quantized offset.
-//	beta_max: The value represented by the highest quantized offset.
-//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
-// If "scale_after_normalization" is true, this tensor will be multiplied
-// with the normalized tensor.
-//	gamma_min: The value represented by the lowest quantized gamma.
-//	gamma_max: The value represented by the highest quantized gamma.
-//
-//	variance_epsilon: A small float number to avoid dividing by 0.
-//	scale_after_normalization: A bool indicating whether the resulted tensor
-// needs to be multiplied with gamma.
-func QuantizedBatchNormWithGlobalNormalization(scope *Scope, t tf.Output, t_min tf.Output, t_max tf.Output, m tf.Output, m_min tf.Output, m_max tf.Output, v tf.Output, v_min tf.Output, v_max tf.Output, beta tf.Output, beta_min tf.Output, beta_max tf.Output, gamma tf.Output, gamma_min tf.Output, gamma_max tf.Output, out_type tf.DataType, variance_epsilon float32, scale_after_normalization bool) (result tf.Output, result_min tf.Output, result_max tf.Output) {
+//	reader_handle: Handle to a Reader.
+func ReaderNumWorkUnitsCompletedV2(scope *Scope, reader_handle tf.Output) (units_completed tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type, "variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
 	opspec := tf.OpSpec{
-		Type: "QuantizedBatchNormWithGlobalNormalization",
+		Type: "ReaderNumWorkUnitsCompletedV2",
 		Input: []tf.Input{
-			t, t_min, t_max, m, m_min, m_max, v, v_min, v_max, beta, beta_min, beta_max, gamma, gamma_min, gamma_max,
+			reader_handle,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// HistogramFixedWidthAttr is an optional argument to HistogramFixedWidth.
-type HistogramFixedWidthAttr func(optionalAttr)
-
-// HistogramFixedWidthDtype sets the optional dtype attribute to value.
-// If not specified, defaults to DT_INT32
-func HistogramFixedWidthDtype(value tf.DataType) HistogramFixedWidthAttr {
-	return func(m optionalAttr) {
-		m["dtype"] = value
+// Computes the log of the absolute value of `Gamma(x)` element-wise.
+func Lgamma(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Lgamma",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Return histogram of values.
-//
-// Given the tensor `values`, this operation returns a rank 1 histogram counting
-// the number of entries in `values` that fall into every bin.  The bins are
-// equal width and determined by the arguments `value_range` and `nbins`.
-//
-// ```python
-// # Bins will be:  (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
-// nbins = 5
-// value_range = [0.0, 5.0]
-// new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
+// Computes the reverse mode backpropagated gradient of the Cholesky algorithm.
 //
-// with tf.get_default_session() as sess:
-//   hist = tf.histogram_fixed_width(new_values, value_range, nbins=5)
-//   variables.global_variables_initializer().run()
-//   sess.run(hist) => [2, 1, 1, 0, 2]
-// ```
+// For an explanation see "Differentiation of the Cholesky algorithm" by
+// Iain Murray http://arxiv.org/abs/1602.07527.
 //
 // Arguments:
-//	values: Numeric `Tensor`.
-//	value_range: Shape [2] `Tensor` of same `dtype` as `values`.
-// values <= value_range[0] will be mapped to hist[0],
-// values >= value_range[1] will be mapped to hist[-1].
-//	nbins: Scalar `int32 Tensor`.  Number of histogram bins.
+//	l: Output of batch Cholesky algorithm l = cholesky(A). Shape is `[..., M, M]`.
+// Algorithm depends only on lower triangular part of the innermost matrices of
+// this tensor.
+//	grad: df/dl where f is some scalar function. Shape is `[..., M, M]`.
+// Algorithm depends only on lower triangular part of the innermost matrices of
+// this tensor.
 //
-// Returns A 1-D `Tensor` holding histogram of values.
-func HistogramFixedWidth(scope *Scope, values tf.Output, value_range tf.Output, nbins tf.Output, optional ...HistogramFixedWidthAttr) (out tf.Output) {
+// Returns Symmetrized version of df/dA . Shape is `[..., M, M]`
+func CholeskyGrad(scope *Scope, l tf.Output, grad tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "HistogramFixedWidth",
+		Type: "CholeskyGrad",
 		Input: []tf.Input{
-			values, value_range, nbins,
+			l, grad,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Adds Tensor 'bias' to Tensor 'input' for Quantized types.
+// Computes the mean along sparse segments of a tensor.
 //
-// Broadcasts the values of bias on dimensions 0..N-2 of 'input'.
+// Like `SparseSegmentMean`, but allows missing ids in `segment_ids`. If an id is
+// misisng, the `output` tensor at that position will be zeroed.
 //
-// Arguments:
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
 //
-//	bias: A 1D bias Tensor with size matching the last dimension of 'input'.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	min_bias: The float value that the lowest quantized bias value represents.
-//	max_bias: The float value that the highest quantized bias value represents.
+// Arguments:
 //
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//	num_segments: Should equal the number of distinct segment IDs.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedBiasAdd(scope *Scope, input tf.Output, bias tf.Output, min_input tf.Output, max_input tf.Output, min_bias tf.Output, max_bias tf.Output, out_type tf.DataType) (output tf.Output, min_out tf.Output, max_out tf.Output) {
+// Returns Has same shape as data, except for dimension 0 which has size
+// `num_segments`.
+func SparseSegmentMeanWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "QuantizedBiasAdd",
+		Type: "SparseSegmentMeanWithNumSegments",
 		Input: []tf.Input{
-			input, bias, min_input, max_input, min_bias, max_bias,
+			data, indices, segment_ids, num_segments,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Produces the average pool of the input tensor for quantized types.
-//
-// Arguments:
-//	input: 4-D with shape `[batch, height, width, channels]`.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	ksize: The size of the window for each dimension of the input tensor.
-// The length must be 4 to match the number of dimensions of the input.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor.  The length must be 4 to match the number of dimensions of the input.
-//	padding: The type of padding algorithm to use.
-//
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedAvgPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+// Computes hyperbolic cosine of x element-wise.
+func Cosh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "QuantizedAvgPool",
+		Type: "Cosh",
 		Input: []tf.Input{
-			input, min_input, max_input,
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a dataset that emits each dim-0 slice of `components` once.
+func TensorSliceDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "TensorSliceDataset",
+		Input: []tf.Input{
+			tf.OutputList(components),
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Updates the table to associates keys with values.
+// Computes natural logarithm of (1 + x) element-wise.
 //
-// The tensor `keys` must be of the same type as the keys of the table.
-// The tensor `values` must be of the type of the table values.
+// I.e., \\(y = \log_e (1 + x)\\).
+func Log1p(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Log1p",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes rectified linear 6 gradients for a Relu6 operation.
 //
 // Arguments:
-//	table_handle: Handle to the table.
-//	keys: Any shape.  Keys to look up.
-//	values: Values to associate with keys.
+//	gradients: The backpropagated gradients to the corresponding Relu6 operation.
+//	features: The features passed as input to the corresponding Relu6 operation, or
+// its output; using either one produces the same result.
 //
-// Returns the created operation.
-func LookupTableInsertV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
+// Returns The gradients:
+// `gradients * (features > 0) * (features < 6)`.
+func Relu6Grad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LookupTableInsertV2",
+		Type: "Relu6Grad",
 		Input: []tf.Input{
-			table_handle, keys, values,
+			gradients, features,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FractionalAvgPoolAttr is an optional argument to FractionalAvgPool.
-type FractionalAvgPoolAttr func(optionalAttr)
+// ResizeBicubicAttr is an optional argument to ResizeBicubic.
+type ResizeBicubicAttr func(optionalAttr)
 
-// FractionalAvgPoolPseudoRandom sets the optional pseudo_random attribute to value.
+// ResizeBicubicAlignCorners sets the optional align_corners attribute to value.
 //
-// value: When set to True, generates the pooling sequence in a
-// pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
-// Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
-// difference between pseudorandom and random.
+// value: If true, rescale input by (new_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of images and resized images. If false, rescale
+// by new_height / height. Treat similarly the width dimension.
 // If not specified, defaults to false
-func FractionalAvgPoolPseudoRandom(value bool) FractionalAvgPoolAttr {
+func ResizeBicubicAlignCorners(value bool) ResizeBicubicAttr {
 	return func(m optionalAttr) {
-		m["pseudo_random"] = value
+		m["align_corners"] = value
 	}
 }
 
-// FractionalAvgPoolOverlapping sets the optional overlapping attribute to value.
-//
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
+// Resize `images` to `size` using bicubic interpolation.
 //
-// `index  0  1  2  3  4`
+// Input images can be of different types but output images are always float.
 //
-// `value  20 5  16 3  7`
+// Arguments:
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [41/3, 26/3] for fractional avg pooling.
-// If not specified, defaults to false
-func FractionalAvgPoolOverlapping(value bool) FractionalAvgPoolAttr {
-	return func(m optionalAttr) {
-		m["overlapping"] = value
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeBicubic(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBicubicAttr) (resized_images tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResizeBicubic",
+		Input: []tf.Input{
+			images, size,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FractionalAvgPoolDeterministic sets the optional deterministic attribute to value.
+// Computes natural logarithm of x element-wise.
 //
-// value: When set to True, a fixed pooling region will be used when
-// iterating over a FractionalAvgPool node in the computation graph. Mainly used
-// in unit test to make FractionalAvgPool deterministic.
-// If not specified, defaults to false
-func FractionalAvgPoolDeterministic(value bool) FractionalAvgPoolAttr {
-	return func(m optionalAttr) {
-		m["deterministic"] = value
-	}
-}
-
-// FractionalAvgPoolSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func FractionalAvgPoolSeed(value int64) FractionalAvgPoolAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
+// I.e., \\(y = \log_e x\\).
+func Log(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// FractionalAvgPoolSeed2 sets the optional seed2 attribute to value.
-//
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func FractionalAvgPoolSeed2(value int64) FractionalAvgPoolAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
+	opspec := tf.OpSpec{
+		Type: "Log",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Performs fractional average pooling on the input.
-//
-// Fractional average pooling is similar to Fractional max pooling in the pooling
-// region generation step. The only difference is that after pooling regions are
-// generated, a mean operation is performed instead of a max operation in each
-// pooling region.
-//
-// Arguments:
-//	value: 4-D with shape `[batch, height, width, channels]`.
-//	pooling_ratio: Pooling ratio for each dimension of `value`, currently only
-// supports row and col dimension and should be >= 1.0. For example, a valid
-// pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
-// must be 1.0 because we don't allow pooling on batch and channels
-// dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
-// respectively.
+// Rounds the values of a tensor to the nearest integer, element-wise.
 //
-// Returns output tensor after fractional avg pooling.row pooling sequence, needed to calculate gradient.column pooling sequence, needed to calculate gradient.
-func FractionalAvgPool(scope *Scope, value tf.Output, pooling_ratio []float32, optional ...FractionalAvgPoolAttr) (output tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output) {
+// Rounds half to even.  Also known as bankers rounding. If you want to round
+// according to the current system rounding mode use std::cint.
+func Round(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"pooling_ratio": pooling_ratio}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "FractionalAvgPool",
+		Type: "Round",
 		Input: []tf.Input{
-			value,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// RandomCropAttr is an optional argument to RandomCrop.
-type RandomCropAttr func(optionalAttr)
+// RecordInputAttr is an optional argument to RecordInput.
+type RecordInputAttr func(optionalAttr)
 
-// RandomCropSeed sets the optional seed attribute to value.
+// RecordInputFileRandomSeed sets the optional file_random_seed attribute to value.
 //
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomCropSeed(value int64) RandomCropAttr {
+// value: Random seeds used to produce randomized records.
+// If not specified, defaults to 301
+func RecordInputFileRandomSeed(value int64) RecordInputAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["file_random_seed"] = value
 	}
 }
 
-// RandomCropSeed2 sets the optional seed2 attribute to value.
+// RecordInputFileShuffleShiftRatio sets the optional file_shuffle_shift_ratio attribute to value.
 //
-// value: An second seed to avoid seed collision.
+// value: Shifts the list of files after the list is randomly
+// shuffled.
 // If not specified, defaults to 0
-func RandomCropSeed2(value int64) RandomCropAttr {
+func RecordInputFileShuffleShiftRatio(value float32) RecordInputAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["file_shuffle_shift_ratio"] = value
 	}
 }
 
-// Randomly crop `image`.
+// RecordInputFileBufferSize sets the optional file_buffer_size attribute to value.
 //
-// DEPRECATED at GraphDef version 8: Random crop is now pure Python
+// value: The randomization shuffling buffer.
+// If not specified, defaults to 10000
+func RecordInputFileBufferSize(value int64) RecordInputAttr {
+	return func(m optionalAttr) {
+		m["file_buffer_size"] = value
+	}
+}
+
+// RecordInputFileParallelism sets the optional file_parallelism attribute to value.
 //
-// `size` is a 1-D int64 tensor with 2 elements representing the crop height and
-// width.  The values must be non negative.
+// value: How many sstables are opened and concurrently iterated over.
+// If not specified, defaults to 16
+func RecordInputFileParallelism(value int64) RecordInputAttr {
+	return func(m optionalAttr) {
+		m["file_parallelism"] = value
+	}
+}
+
+// RecordInputBatchSize sets the optional batch_size attribute to value.
 //
-// This Op picks a random location in `image` and crops a `height` by `width`
-// rectangle from that location.  The random location is picked so the cropped
-// area will fit inside the original image.
+// value: The batch size.
+// If not specified, defaults to 32
+func RecordInputBatchSize(value int64) RecordInputAttr {
+	return func(m optionalAttr) {
+		m["batch_size"] = value
+	}
+}
+
+// RecordInputCompressionType sets the optional compression_type attribute to value.
+//
+// value: The type of compression for the file. Currently ZLIB and
+// GZIP are supported. Defaults to none.
+// If not specified, defaults to ""
+func RecordInputCompressionType(value string) RecordInputAttr {
+	return func(m optionalAttr) {
+		m["compression_type"] = value
+	}
+}
+
+// Emits randomized records.
 //
 // Arguments:
-//	image: 3-D of shape `[height, width, channels]`.
-//	size: 1-D of length 2 containing: `crop_height`, `crop_width`..
+//	file_pattern: Glob pattern for the data files.
 //
-// Returns 3-D of shape `[crop_height, crop_width, channels].`
-func RandomCrop(scope *Scope, image tf.Output, size tf.Output, optional ...RandomCropAttr) (output tf.Output) {
+// Returns A tensor of shape [batch_size].
+func RecordInput(scope *Scope, file_pattern string, optional ...RecordInputAttr) (records tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"file_pattern": file_pattern}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomCrop",
+		Type: "RecordInput",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes reciprocal of square root of x element-wise.
+//
+// I.e., \\(y = 1 / \sqrt{x}\\).
+func Rsqrt(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Rsqrt",
 		Input: []tf.Input{
-			image, size,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TopKV2Attr is an optional argument to TopKV2.
-type TopKV2Attr func(optionalAttr)
+// MatrixInverseAttr is an optional argument to MatrixInverse.
+type MatrixInverseAttr func(optionalAttr)
 
-// TopKV2Sorted sets the optional sorted attribute to value.
-//
-// value: If true the resulting `k` elements will be sorted by the values in
-// descending order.
-// If not specified, defaults to true
-func TopKV2Sorted(value bool) TopKV2Attr {
+// MatrixInverseAdjoint sets the optional adjoint attribute to value.
+// If not specified, defaults to false
+func MatrixInverseAdjoint(value bool) MatrixInverseAttr {
 	return func(m optionalAttr) {
-		m["sorted"] = value
+		m["adjoint"] = value
 	}
 }
 
-// Finds values and indices of the `k` largest elements for the last dimension.
+// Computes the inverse of one or more square invertible matrices or their
 //
-// If the input is a vector (rank-1), finds the `k` largest entries in the vector
-// and outputs their values and indices as vectors.  Thus `values[j]` is the
-// `j`-th largest entry in `input`, and its index is `indices[j]`.
+// adjoints (conjugate transposes).
 //
-// For matrices (resp. higher rank input), computes the top `k` entries in each
-// row (resp. vector along the last dimension).  Thus,
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor of the same shape as the input
+// containing the inverse for all input submatrices `[..., :, :]`.
 //
-//     values.shape = indices.shape = input.shape[:-1] + [k]
+// The op uses LU decomposition with partial pivoting to compute the inverses.
 //
-// If two elements are equal, the lower-index element appears first.
+// If a matrix is not invertible there is no guarantee what the op does. It
+// may detect the condition and raise an exception or it may simply return a
+// garbage result.
 //
 // Arguments:
-//	input: 1-D or higher with last dimension at least `k`.
-//	k: 0-D.  Number of top elements to look for along the last dimension (along each
-// row for matrices).
+//	input: Shape is `[..., M, M]`.
 //
-// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`.
-func TopKV2(scope *Scope, input tf.Output, k tf.Output, optional ...TopKV2Attr) (values tf.Output, indices tf.Output) {
+// Returns Shape is `[..., M, M]`.
+//
+// @compatibility(numpy)
+// Equivalent to np.linalg.inv
+// @end_compatibility
+func MatrixInverse(scope *Scope, input tf.Output, optional ...MatrixInverseAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -3427,156 +3722,189 @@ func TopKV2(scope *Scope, input tf.Output, k tf.Output, optional ...TopKV2Attr)
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TopKV2",
+		Type: "MatrixInverse",
 		Input: []tf.Input{
-			input, k,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Returns x // y element-wise.
+// Computes square of x element-wise.
 //
-// *NOTE*: `FloorDiv` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func FloorDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// I.e., \\(y = x * x = x^2\\).
+func Square(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "FloorDiv",
+		Type: "Square",
 		Input: []tf.Input{
-			x, y,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns a batched diagonal tensor with a given batched diagonal values.
-//
-// Given a `diagonal`, this operation returns a tensor with the `diagonal` and
-// everything else padded with zeros. The diagonal is computed as follows:
+// Computes exponential linear: `exp(features) - 1` if < 0, `features` otherwise.
 //
-// Assume `diagonal` has `k` dimensions `[I, J, K, ..., N]`, then the output is a
-// tensor of rank `k+1` with dimensions [I, J, K, ..., N, N]` where:
+// See [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)
+// ](http://arxiv.org/abs/1511.07289)
+func Elu(scope *Scope, features tf.Output) (activations tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Elu",
+		Input: []tf.Input{
+			features,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the reciprocal of x element-wise.
 //
-// `output[i, j, k, ..., m, n] = 1{m=n} * diagonal[i, j, k, ..., n]`.
-//
-// For example:
-//
-// ```
-// # 'diagonal' is [[1, 2, 3, 4], [5, 6, 7, 8]]
-//
-// and diagonal.shape = (2, 4)
-//
-// tf.matrix_diag(diagonal) ==> [[[1, 0, 0, 0]
-//                                      [0, 2, 0, 0]
-//                                      [0, 0, 3, 0]
-//                                      [0, 0, 0, 4]],
-//                                     [[5, 0, 0, 0]
-//                                      [0, 6, 0, 0]
-//                                      [0, 0, 7, 0]
-//                                      [0, 0, 0, 8]]]
-//
-// which has shape (2, 4, 4)
-// ```
-//
-// Arguments:
-//	diagonal: Rank `k`, where `k >= 1`.
-//
-// Returns Rank `k+1`, with `output.shape = diagonal.shape + [diagonal.shape[-1]]`.
-func MatrixDiag(scope *Scope, diagonal tf.Output) (output tf.Output) {
+// I.e., \\(y = 1 / x\\).
+func Reciprocal(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixDiag",
+		Type: "Reciprocal",
 		Input: []tf.Input{
-			diagonal,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Says whether the targets are in the top `K` predictions.
-//
-// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
-// prediction for the target class is among the top `k` predictions among
-// all predictions for example `i`. Note that the behavior of `InTopK` differs
-// from the `TopK` op in its handling of ties; if multiple classes have the
-// same prediction value and straddle the top-`k` boundary, all of those
-// classes are considered to be in the top `k`.
-//
-// More formally, let
+// OrderedMapClearAttr is an optional argument to OrderedMapClear.
+type OrderedMapClearAttr func(optionalAttr)
+
+// OrderedMapClearCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-//   \\(predictions_i\\) be the predictions for all classes for example `i`,
-//   \\(targets_i\\) be the target class for example `i`,
-//   \\(out_i\\) be the output for example `i`,
+// REQUIRES: value >= 0
+func OrderedMapClearCapacity(value int64) OrderedMapClearAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// OrderedMapClearMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
+// REQUIRES: value >= 0
+func OrderedMapClearMemoryLimit(value int64) OrderedMapClearAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// OrderedMapClearContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func OrderedMapClearContainer(value string) OrderedMapClearAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// OrderedMapClearSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func OrderedMapClearSharedName(value string) OrderedMapClearAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op removes all elements in the underlying container.
 //
-// Arguments:
-//	predictions: A `batch_size` x `classes` tensor.
-//	targets: A `batch_size` vector of class ids.
-//	k: Number of top elements to look at for computing precision.
+// Returns the created operation.
+func OrderedMapClear(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapClearAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "OrderedMapClear",
+
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Computes the reciprocal of x element-wise.
 //
-// Returns Computed Precision at `k` as a `bool Tensor`.
-func InTopK(scope *Scope, predictions tf.Output, targets tf.Output, k int64) (precision tf.Output) {
+// I.e., \\(y = 1 / x\\).
+func Inv(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"k": k}
 	opspec := tf.OpSpec{
-		Type: "InTopK",
+		Type: "Inv",
 		Input: []tf.Input{
-			predictions, targets,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Given a quantized tensor described by (input, input_min, input_max), outputs a
-//
-// range that covers the actual values present in that tensor.  This op is
-// typically used to produce the requested_output_min and requested_output_max for
-// Requantize.
-//
-// Arguments:
-//
-//	input_min: The float value that the minimum quantized input value represents.
-//	input_max: The float value that the maximum quantized input value represents.
+// ComplexAbsAttr is an optional argument to ComplexAbs.
+type ComplexAbsAttr func(optionalAttr)
+
+// ComplexAbsTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func ComplexAbsTout(value tf.DataType) ComplexAbsAttr {
+	return func(m optionalAttr) {
+		m["Tout"] = value
+	}
+}
+
+// Computes the complex absolute value of a tensor.
 //
-// Returns The computed min output.the computed max output.
-func RequantizationRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output) (output_min tf.Output, output_max tf.Output) {
+// Given a tensor `x` of complex numbers, this operation returns a tensor of type
+// `float` or `double` that is the absolute value of each element in `x`. All
+// elements in `x` must be complex numbers of the form \\(a + bj\\). The absolute
+// value is computed as \\( \sqrt{a^2 + b^2}\\).
+func ComplexAbs(scope *Scope, x tf.Output, optional ...ComplexAbsAttr) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "RequantizationRange",
+		Type: "ComplexAbs",
 		Input: []tf.Input{
-			input, input_min, input_max,
+			x,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Returns the truth value of (x <= y) element-wise.
+// Returns the truth value of x AND y element-wise.
 //
-// *NOTE*: `LessEqual` supports broadcasting. More about broadcasting
+// *NOTE*: `LogicalAnd` supports broadcasting. More about broadcasting
 // [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func LessEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+func LogicalAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LessEqual",
+		Type: "LogicalAnd",
 		Input: []tf.Input{
 			x, y,
 		},
@@ -3585,57 +3913,50 @@ func LessEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
-// Computes softmax activations.
-//
-// For each batch `i` and class `j` we have
-//
-//     softmax[i, j] = exp(logits[i, j]) / sum_j(exp(logits[i, j]))
-//
-// Arguments:
-//	logits: 2-D with shape `[batch_size, num_classes]`.
-//
-// Returns Same shape as `logits`.
-func Softmax(scope *Scope, logits tf.Output) (softmax tf.Output) {
+// Cast x of type SrcT to y of DstT.
+func Cast(scope *Scope, x tf.Output, DstT tf.DataType) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"DstT": DstT}
 	opspec := tf.OpSpec{
-		Type: "Softmax",
+		Type: "Cast",
 		Input: []tf.Input{
-			logits,
+			x,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DecodeBmpAttr is an optional argument to DecodeBmp.
-type DecodeBmpAttr func(optionalAttr)
+// MaxAttr is an optional argument to Max.
+type MaxAttr func(optionalAttr)
 
-// DecodeBmpChannels sets the optional channels attribute to value.
-// If not specified, defaults to 0
-func DecodeBmpChannels(value int64) DecodeBmpAttr {
+// MaxKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func MaxKeepDims(value bool) MaxAttr {
 	return func(m optionalAttr) {
-		m["channels"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Decode the first frame of a BMP-encoded image to a uint8 tensor.
-//
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
-//
-// Accepted values are:
+// Computes the maximum of elements across dimensions of a tensor.
 //
-// *   0: Use the number of channels in the BMP-encoded image.
-// *   3: output an RGB image.
-// *   4: output an RGBA image.
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	contents: 0-D.  The BMP-encoded image.
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-// Returns 3-D with shape `[height, width, channels]`. RGB order
-func DecodeBmp(scope *Scope, contents tf.Output, optional ...DecodeBmpAttr) (image tf.Output) {
+// Returns The reduced tensor.
+func Max(scope *Scope, input tf.Output, axis tf.Output, optional ...MaxAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -3644,9 +3965,9 @@ func DecodeBmp(scope *Scope, contents tf.Output, optional ...DecodeBmpAttr) (ima
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeBmp",
+		Type: "Max",
 		Input: []tf.Input{
-			contents,
+			input, axis,
 		},
 		Attrs: attrs,
 	}
@@ -3654,77 +3975,92 @@ func DecodeBmp(scope *Scope, contents tf.Output, optional ...DecodeBmpAttr) (ima
 	return op.Output(0)
 }
 
-// Computes softsign gradients for a softsign operation.
+// Quantized Batch normalization.
+//
+// This op is deprecated and will be removed in the future. Prefer
+// `tf.nn.batch_normalization`.
 //
 // Arguments:
-//	gradients: The backpropagated gradients to the corresponding softsign operation.
-//	features: The features passed as input to the corresponding softsign operation.
+//	t: A 4D input Tensor.
+//	t_min: The value represented by the lowest quantized input.
+//	t_max: The value represented by the highest quantized input.
+//	m: A 1D mean Tensor with size matching the last dimension of t.
+// This is the first output from tf.nn.moments,
+// or a saved moving average thereof.
+//	m_min: The value represented by the lowest quantized mean.
+//	m_max: The value represented by the highest quantized mean.
+//	v: A 1D variance Tensor with size matching the last dimension of t.
+// This is the second output from tf.nn.moments,
+// or a saved moving average thereof.
+//	v_min: The value represented by the lowest quantized variance.
+//	v_max: The value represented by the highest quantized variance.
+//	beta: A 1D beta Tensor with size matching the last dimension of t.
+// An offset to be added to the normalized tensor.
+//	beta_min: The value represented by the lowest quantized offset.
+//	beta_max: The value represented by the highest quantized offset.
+//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
+// If "scale_after_normalization" is true, this tensor will be multiplied
+// with the normalized tensor.
+//	gamma_min: The value represented by the lowest quantized gamma.
+//	gamma_max: The value represented by the highest quantized gamma.
 //
-// Returns The gradients: `gradients / (1 + abs(features)) ** 2`.
-func SoftsignGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
+//	variance_epsilon: A small float number to avoid dividing by 0.
+//	scale_after_normalization: A bool indicating whether the resulted tensor
+// needs to be multiplied with gamma.
+func QuantizedBatchNormWithGlobalNormalization(scope *Scope, t tf.Output, t_min tf.Output, t_max tf.Output, m tf.Output, m_min tf.Output, m_max tf.Output, v tf.Output, v_min tf.Output, v_max tf.Output, beta tf.Output, beta_min tf.Output, beta_max tf.Output, gamma tf.Output, gamma_min tf.Output, gamma_max tf.Output, out_type tf.DataType, variance_epsilon float32, scale_after_normalization bool) (result tf.Output, result_min tf.Output, result_max tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"out_type": out_type, "variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
 	opspec := tf.OpSpec{
-		Type: "SoftsignGrad",
+		Type: "QuantizedBatchNormWithGlobalNormalization",
 		Input: []tf.Input{
-			gradients, features,
+			t, t_min, t_max, m, m_min, m_max, v, v_min, v_max, beta, beta_min, beta_max, gamma, gamma_min, gamma_max,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// BatchMatMulAttr is an optional argument to BatchMatMul.
-type BatchMatMulAttr func(optionalAttr)
-
-// BatchMatMulAdjX sets the optional adj_x attribute to value.
-//
-// value: If `True`, adjoint the slices of `x`. Defaults to `False`.
-// If not specified, defaults to false
-func BatchMatMulAdjX(value bool) BatchMatMulAttr {
-	return func(m optionalAttr) {
-		m["adj_x"] = value
-	}
-}
+// HistogramFixedWidthAttr is an optional argument to HistogramFixedWidth.
+type HistogramFixedWidthAttr func(optionalAttr)
 
-// BatchMatMulAdjY sets the optional adj_y attribute to value.
-//
-// value: If `True`, adjoint the slices of `y`. Defaults to `False`.
-// If not specified, defaults to false
-func BatchMatMulAdjY(value bool) BatchMatMulAttr {
+// HistogramFixedWidthDtype sets the optional dtype attribute to value.
+// If not specified, defaults to DT_INT32
+func HistogramFixedWidthDtype(value tf.DataType) HistogramFixedWidthAttr {
 	return func(m optionalAttr) {
-		m["adj_y"] = value
+		m["dtype"] = value
 	}
 }
 
-// Multiplies slices of two tensors in batches.
-//
-// Multiplies all slices of `Tensor` `x` and `y` (each slice can be
-// viewed as an element of a batch), and arranges the individual results
-// in a single output tensor of the same batch size. Each of the
-// individual slices can optionally be adjointed (to adjoint a matrix
-// means to transpose and conjugate it) before multiplication by setting
-// the `adj_x` or `adj_y` flag to `True`, which are by default `False`.
-//
-// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]`
-// and `[..., r_y, c_y]`.
-//
-// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where:
+// Return histogram of values.
 //
-//     r_o = c_x if adj_x else r_x
-//     c_o = r_y if adj_y else c_y
+// Given the tensor `values`, this operation returns a rank 1 histogram counting
+// the number of entries in `values` that fall into every bin.  The bins are
+// equal width and determined by the arguments `value_range` and `nbins`.
 //
-// It is computed as:
+// ```python
+// # Bins will be:  (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
+// nbins = 5
+// value_range = [0.0, 5.0]
+// new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
 //
-//     output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])
+// with tf.get_default_session() as sess:
+//   hist = tf.histogram_fixed_width(new_values, value_range, nbins=5)
+//   variables.global_variables_initializer().run()
+//   sess.run(hist) => [2, 1, 1, 0, 2]
+// ```
 //
 // Arguments:
-//	x: 2-D or higher with shape `[..., r_x, c_x]`.
-//	y: 2-D or higher with shape `[..., r_y, c_y]`.
+//	values: Numeric `Tensor`.
+//	value_range: Shape [2] `Tensor` of same `dtype` as `values`.
+// values <= value_range[0] will be mapped to hist[0],
+// values >= value_range[1] will be mapped to hist[-1].
+//	nbins: Scalar `int32 Tensor`.  Number of histogram bins.
 //
-// Returns 3-D or higher with shape `[..., r_o, c_o]`
-func BatchMatMul(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMulAttr) (output tf.Output) {
+// Returns A 1-D `Tensor` holding histogram of values.
+func HistogramFixedWidth(scope *Scope, values tf.Output, value_range tf.Output, nbins tf.Output, optional ...HistogramFixedWidthAttr) (out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -3733,9 +4069,9 @@ func BatchMatMul(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMul
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "BatchMatMul",
+		Type: "HistogramFixedWidth",
 		Input: []tf.Input{
-			x, y,
+			values, value_range, nbins,
 		},
 		Attrs: attrs,
 	}
@@ -3743,69 +4079,82 @@ func BatchMatMul(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMul
 	return op.Output(0)
 }
 
-// Pads a tensor.
+// Adds Tensor 'bias' to Tensor 'input' for Quantized types.
 //
-// This operation pads `input` according to the `paddings` and `constant_values`
-// you specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is
-// the rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
-// how many padding values to add before the contents of `input` in that dimension,
-// and `paddings[D, 1]` indicates how many padding values to add after the contents
-// of `input` in that dimension. `constant_values` is a scalar tensor of the same
-// type as `input` that indicates the value to use for padding `input`.
+// Broadcasts the values of bias on dimensions 0..N-2 of 'input'.
 //
-// The padded size of each dimension D of the output is:
+// Arguments:
 //
-// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
+//	bias: A 1D bias Tensor with size matching the last dimension of 'input'.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	min_bias: The float value that the lowest quantized bias value represents.
+//	max_bias: The float value that the highest quantized bias value represents.
 //
-// For example:
 //
-// ```
-// # 't' is [[1, 1], [2, 2]]
-// # 'paddings' is [[1, 1], [2, 2]]
-// # 'constant_values' is 0
-// # rank of 't' is 2
-// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
-//                       [0, 0, 1, 1, 0, 0]
-//                       [0, 0, 2, 2, 0, 0]
-//                       [0, 0, 0, 0, 0, 0]]
-// ```
-func PadV2(scope *Scope, input tf.Output, paddings tf.Output, constant_values tf.Output) (output tf.Output) {
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedBiasAdd(scope *Scope, input tf.Output, bias tf.Output, min_input tf.Output, max_input tf.Output, min_bias tf.Output, max_bias tf.Output, out_type tf.DataType) (output tf.Output, min_out tf.Output, max_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "PadV2",
+		Type: "QuantizedBiasAdd",
 		Input: []tf.Input{
-			input, paddings, constant_values,
+			input, bias, min_input, max_input, min_bias, max_bias,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Returns which elements of x are NaN.
+// Produces the average pool of the input tensor for quantized types.
 //
-// @compatibility(numpy)
-// Equivalent to np.isnan
-// @end_compatibility
-func IsNan(scope *Scope, x tf.Output) (y tf.Output) {
+// Arguments:
+//	input: 4-D with shape `[batch, height, width, channels]`.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	ksize: The size of the window for each dimension of the input tensor.
+// The length must be 4 to match the number of dimensions of the input.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor.  The length must be 4 to match the number of dimensions of the input.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedAvgPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "IsNan",
+		Type: "QuantizedAvgPool",
 		Input: []tf.Input{
-			x,
+			input, min_input, max_input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// FractionalAvgPoolGradAttr is an optional argument to FractionalAvgPoolGrad.
-type FractionalAvgPoolGradAttr func(optionalAttr)
+// FractionalAvgPoolAttr is an optional argument to FractionalAvgPool.
+type FractionalAvgPoolAttr func(optionalAttr)
 
-// FractionalAvgPoolGradOverlapping sets the optional overlapping attribute to value.
+// FractionalAvgPoolPseudoRandom sets the optional pseudo_random attribute to value.
+//
+// value: When set to True, generates the pooling sequence in a
+// pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
+// Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
+// difference between pseudorandom and random.
+// If not specified, defaults to false
+func FractionalAvgPoolPseudoRandom(value bool) FractionalAvgPoolAttr {
+	return func(m optionalAttr) {
+		m["pseudo_random"] = value
+	}
+}
+
+// FractionalAvgPoolOverlapping sets the optional overlapping attribute to value.
 //
 // value: When set to True, it means when pooling, the values at the boundary
 // of adjacent pooling cells are used by both cells. For example:
@@ -3817,237 +4166,565 @@ type FractionalAvgPoolGradAttr func(optionalAttr)
 // If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
 // The result would be [41/3, 26/3] for fractional avg pooling.
 // If not specified, defaults to false
-func FractionalAvgPoolGradOverlapping(value bool) FractionalAvgPoolGradAttr {
+func FractionalAvgPoolOverlapping(value bool) FractionalAvgPoolAttr {
 	return func(m optionalAttr) {
 		m["overlapping"] = value
 	}
 }
 
-// Computes gradient of the FractionalAvgPool function.
+// FractionalAvgPoolDeterministic sets the optional deterministic attribute to value.
 //
-// Unlike FractionalMaxPoolGrad, we don't need to find arg_max for
-// FractionalAvgPoolGrad, we just need to evenly back-propagate each element of
-// out_backprop to those indices that form the same pooling cell. Therefore, we
-// just need to know the shape of original input tensor, instead of the whole
-// tensor.
+// value: When set to True, a fixed pooling region will be used when
+// iterating over a FractionalAvgPool node in the computation graph. Mainly used
+// in unit test to make FractionalAvgPool deterministic.
+// If not specified, defaults to false
+func FractionalAvgPoolDeterministic(value bool) FractionalAvgPoolAttr {
+	return func(m optionalAttr) {
+		m["deterministic"] = value
+	}
+}
+
+// FractionalAvgPoolSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func FractionalAvgPoolSeed(value int64) FractionalAvgPoolAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// FractionalAvgPoolSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func FractionalAvgPoolSeed2(value int64) FractionalAvgPoolAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Performs fractional average pooling on the input.
+//
+// Fractional average pooling is similar to Fractional max pooling in the pooling
+// region generation step. The only difference is that after pooling regions are
+// generated, a mean operation is performed instead of a max operation in each
+// pooling region.
 //
 // Arguments:
-//	orig_input_tensor_shape: Original input tensor shape for `fractional_avg_pool`
-//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
-// w.r.t. the output of `fractional_avg_pool`.
-//	row_pooling_sequence: row pooling sequence, form pooling region with
-// col_pooling_sequence.
-//	col_pooling_sequence: column pooling sequence, form pooling region with
-// row_pooling sequence.
+//	value: 4-D with shape `[batch, height, width, channels]`.
+//	pooling_ratio: Pooling ratio for each dimension of `value`, currently only
+// supports row and col dimension and should be >= 1.0. For example, a valid
+// pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
+// must be 1.0 because we don't allow pooling on batch and channels
+// dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
+// respectively.
 //
-// Returns 4-D.  Gradients w.r.t. the input of `fractional_avg_pool`.
-func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalAvgPoolGradAttr) (output tf.Output) {
+// Returns output tensor after fractional avg pooling.row pooling sequence, needed to calculate gradient.column pooling sequence, needed to calculate gradient.
+func FractionalAvgPool(scope *Scope, value tf.Output, pooling_ratio []float32, optional ...FractionalAvgPoolAttr) (output tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"pooling_ratio": pooling_ratio}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FractionalAvgPoolGrad",
+		Type: "FractionalAvgPool",
 		Input: []tf.Input{
-			orig_input_tensor_shape, out_backprop, row_pooling_sequence, col_pooling_sequence,
+			value,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes gradients for the exponential linear (Elu) operation.
-//
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding Elu operation.
-//	outputs: The outputs of the corresponding Elu operation.
+// RandomCropAttr is an optional argument to RandomCrop.
+type RandomCropAttr func(optionalAttr)
+
+// RandomCropSeed sets the optional seed attribute to value.
 //
-// Returns The gradients: `gradients * (outputs + 1)` if outputs < 0,
-// `gradients` otherwise.
-func EluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
-	if scope.Err() != nil {
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomCropSeed(value int64) RandomCropAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// RandomCropSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomCropSeed2(value int64) RandomCropAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Randomly crop `image`.
+//
+// DEPRECATED at GraphDef version 8: Random crop is now pure Python
+//
+// `size` is a 1-D int64 tensor with 2 elements representing the crop height and
+// width.  The values must be non negative.
+//
+// This Op picks a random location in `image` and crops a `height` by `width`
+// rectangle from that location.  The random location is picked so the cropped
+// area will fit inside the original image.
+//
+// Arguments:
+//	image: 3-D of shape `[height, width, channels]`.
+//	size: 1-D of length 2 containing: `crop_height`, `crop_width`..
+//
+// Returns 3-D of shape `[crop_height, crop_width, channels].`
+func RandomCrop(scope *Scope, image tf.Output, size tf.Output, optional ...RandomCropAttr) (output tf.Output) {
+	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "EluGrad",
+		Type: "RandomCrop",
 		Input: []tf.Input{
-			gradients, outputs,
+			image, size,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Converts each string in the input Tensor to its hash mod by a number of buckets.
+// TopKV2Attr is an optional argument to TopKV2.
+type TopKV2Attr func(optionalAttr)
+
+// TopKV2Sorted sets the optional sorted attribute to value.
 //
-// The hash function is deterministic on the content of the string within the
-// process.
+// value: If true the resulting `k` elements will be sorted by the values in
+// descending order.
+// If not specified, defaults to true
+func TopKV2Sorted(value bool) TopKV2Attr {
+	return func(m optionalAttr) {
+		m["sorted"] = value
+	}
+}
+
+// Finds values and indices of the `k` largest elements for the last dimension.
 //
-// Note that the hash function may change from time to time.
-// This functionality will be deprecated and it's recommended to use
-// `tf.string_to_hash_bucket_fast()` or `tf.string_to_hash_bucket_strong()`.
+// If the input is a vector (rank-1), finds the `k` largest entries in the vector
+// and outputs their values and indices as vectors.  Thus `values[j]` is the
+// `j`-th largest entry in `input`, and its index is `indices[j]`.
 //
-// Arguments:
+// For matrices (resp. higher rank input), computes the top `k` entries in each
+// row (resp. vector along the last dimension).  Thus,
 //
-//	num_buckets: The number of buckets.
+//     values.shape = indices.shape = input.shape[:-1] + [k]
 //
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToHashBucket(scope *Scope, string_tensor tf.Output, num_buckets int64) (output tf.Output) {
+// If two elements are equal, the lower-index element appears first.
+//
+// Arguments:
+//	input: 1-D or higher with last dimension at least `k`.
+//	k: 0-D.  Number of top elements to look for along the last dimension (along each
+// row for matrices).
+//
+// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`.
+func TopKV2(scope *Scope, input tf.Output, k tf.Output, optional ...TopKV2Attr) (values tf.Output, indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_buckets": num_buckets}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "StringToHashBucket",
+		Type: "TopKV2",
 		Input: []tf.Input{
-			string_tensor,
+			input, k,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Returns x // y element-wise.
+//
+// *NOTE*: `FloorDiv` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func FloorDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "FloorDiv",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that contains `count` elements from the `input_dataset`.
+// Returns a batched diagonal tensor with a given batched diagonal values.
 //
-// Arguments:
+// Given a `diagonal`, this operation returns a tensor with the `diagonal` and
+// everything else padded with zeros. The diagonal is computed as follows:
 //
-//	count: A scalar representing the number of elements from the `input_dataset`
-// that should be taken. A value of `-1` indicates that all of `input_dataset`
-// is taken.
+// Assume `diagonal` has `k` dimensions `[I, J, K, ..., N]`, then the output is a
+// tensor of rank `k+1` with dimensions [I, J, K, ..., N, N]` where:
+//
+// `output[i, j, k, ..., m, n] = 1{m=n} * diagonal[i, j, k, ..., n]`.
 //
+// For example:
 //
-func TakeDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// ```
+// # 'diagonal' is [[1, 2, 3, 4], [5, 6, 7, 8]]
+//
+// and diagonal.shape = (2, 4)
+//
+// tf.matrix_diag(diagonal) ==> [[[1, 0, 0, 0]
+//                                      [0, 2, 0, 0]
+//                                      [0, 0, 3, 0]
+//                                      [0, 0, 0, 4]],
+//                                     [[5, 0, 0, 0]
+//                                      [0, 6, 0, 0]
+//                                      [0, 0, 7, 0]
+//                                      [0, 0, 0, 8]]]
+//
+// which has shape (2, 4, 4)
+// ```
+//
+// Arguments:
+//	diagonal: Rank `k`, where `k >= 1`.
+//
+// Returns Rank `k+1`, with `output.shape = diagonal.shape + [diagonal.shape[-1]]`.
+func MatrixDiag(scope *Scope, diagonal tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "TakeDataset",
+		Type: "MatrixDiag",
 		Input: []tf.Input{
-			input_dataset, count,
+			diagonal,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes rectified linear gradients for a Relu operation.
-//
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding Relu operation.
-//	features: The features passed as input to the corresponding Relu operation, OR
-// the outputs of that operation (both work equivalently).
+// Returns the truth value of (x <= y) element-wise.
 //
-// Returns `gradients * (features > 0)`.
-func ReluGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+// *NOTE*: `LessEqual` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func LessEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReluGrad",
+		Type: "LessEqual",
 		Input: []tf.Input{
-			gradients, features,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the gradient of morphological 2-D dilation with respect to the input.
+// Computes softmax activations.
+//
+// For each batch `i` and class `j` we have
+//
+//     softmax[i, j] = exp(logits[i, j]) / sum_j(exp(logits[i, j]))
 //
 // Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
-//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
-//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
-// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
-//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
-// Must be: `[1, rate_height, rate_width, 1]`.
-//	padding: The type of padding algorithm to use.
+//	logits: 2-D with shape `[batch_size, num_classes]`.
 //
-// Returns 4-D with shape `[batch, in_height, in_width, depth]`.
-func Dilation2DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (in_backprop tf.Output) {
+// Returns Same shape as `logits`.
+func Softmax(scope *Scope, logits tf.Output) (softmax tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "Dilation2DBackpropInput",
+		Type: "Softmax",
 		Input: []tf.Input{
-			input, filter, out_backprop,
+			logits,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// CTCBeamSearchDecoderAttr is an optional argument to CTCBeamSearchDecoder.
-type CTCBeamSearchDecoderAttr func(optionalAttr)
+// DecodeBmpAttr is an optional argument to DecodeBmp.
+type DecodeBmpAttr func(optionalAttr)
 
-// CTCBeamSearchDecoderMergeRepeated sets the optional merge_repeated attribute to value.
-//
-// value: If true, merge repeated classes in output.
-// If not specified, defaults to true
-func CTCBeamSearchDecoderMergeRepeated(value bool) CTCBeamSearchDecoderAttr {
+// DecodeBmpChannels sets the optional channels attribute to value.
+// If not specified, defaults to 0
+func DecodeBmpChannels(value int64) DecodeBmpAttr {
 	return func(m optionalAttr) {
-		m["merge_repeated"] = value
+		m["channels"] = value
 	}
 }
 
-// Performs beam search decoding on the logits given in input.
+// Decode the first frame of a BMP-encoded image to a uint8 tensor.
 //
-// A note about the attribute merge_repeated: For the beam search decoder,
-// this means that if consecutive entries in a beam are the same, only
-// the first of these is emitted.  That is, when the top path is "A B B B B",
-// "A B" is returned if merge_repeated = True but "A B B B B" is
-// returned if merge_repeated = False.
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
+//
+// Accepted values are:
+//
+// *   0: Use the number of channels in the BMP-encoded image.
+// *   3: output an RGB image.
+// *   4: output an RGBA image.
 //
 // Arguments:
-//	inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
-//	sequence_length: A vector containing sequence lengths, size `(batch)`.
-//	beam_width: A scalar >= 0 (beam search beam width).
-//	top_paths: A scalar >= 0, <= beam_width (controls output size).
+//	contents: 0-D.  The BMP-encoded image.
 //
-// Returns A list (length: top_paths) of indices matrices.  Matrix j,
-// size `(total_decoded_outputs[j] x 2)`, has indices of a
-// `SparseTensor<int64, 2>`.  The rows store: [batch, time].A list (length: top_paths) of values vectors.  Vector j,
-// size `(length total_decoded_outputs[j])`, has the values of a
-// `SparseTensor<int64, 2>`.  The vector stores the decoded classes for beam j.A list (length: top_paths) of shape vector.  Vector j,
-// size `(2)`, stores the shape of the decoded `SparseTensor[j]`.
-// Its values are: `[batch_size, max_decoded_length[j]]`.A matrix, shaped: `(batch_size x top_paths)`.  The
-// sequence log-probabilities.
-func CTCBeamSearchDecoder(scope *Scope, inputs tf.Output, sequence_length tf.Output, beam_width int64, top_paths int64, optional ...CTCBeamSearchDecoderAttr) (decoded_indices []tf.Output, decoded_values []tf.Output, decoded_shape []tf.Output, log_probability tf.Output) {
+// Returns 3-D with shape `[height, width, channels]`. RGB order
+func DecodeBmp(scope *Scope, contents tf.Output, optional ...DecodeBmpAttr) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"beam_width": beam_width, "top_paths": top_paths}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CTCBeamSearchDecoder",
+		Type: "DecodeBmp",
 		Input: []tf.Input{
-			inputs, sequence_length,
+			contents,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if decoded_indices, idx, err = makeOutputList(op, idx, "decoded_indices"); err != nil {
-		scope.UpdateErr("CTCBeamSearchDecoder", err)
-		return
-	}
+	return op.Output(0)
+}
+
+// Computes softsign gradients for a softsign operation.
+//
+// Arguments:
+//	gradients: The backpropagated gradients to the corresponding softsign operation.
+//	features: The features passed as input to the corresponding softsign operation.
+//
+// Returns The gradients: `gradients / (1 + abs(features)) ** 2`.
+func SoftsignGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SoftsignGrad",
+		Input: []tf.Input{
+			gradients, features,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// BatchMatMulAttr is an optional argument to BatchMatMul.
+type BatchMatMulAttr func(optionalAttr)
+
+// BatchMatMulAdjX sets the optional adj_x attribute to value.
+//
+// value: If `True`, adjoint the slices of `x`. Defaults to `False`.
+// If not specified, defaults to false
+func BatchMatMulAdjX(value bool) BatchMatMulAttr {
+	return func(m optionalAttr) {
+		m["adj_x"] = value
+	}
+}
+
+// BatchMatMulAdjY sets the optional adj_y attribute to value.
+//
+// value: If `True`, adjoint the slices of `y`. Defaults to `False`.
+// If not specified, defaults to false
+func BatchMatMulAdjY(value bool) BatchMatMulAttr {
+	return func(m optionalAttr) {
+		m["adj_y"] = value
+	}
+}
+
+// Multiplies slices of two tensors in batches.
+//
+// Multiplies all slices of `Tensor` `x` and `y` (each slice can be
+// viewed as an element of a batch), and arranges the individual results
+// in a single output tensor of the same batch size. Each of the
+// individual slices can optionally be adjointed (to adjoint a matrix
+// means to transpose and conjugate it) before multiplication by setting
+// the `adj_x` or `adj_y` flag to `True`, which are by default `False`.
+//
+// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]`
+// and `[..., r_y, c_y]`.
+//
+// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where:
+//
+//     r_o = c_x if adj_x else r_x
+//     c_o = r_y if adj_y else c_y
+//
+// It is computed as:
+//
+//     output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])
+//
+// Arguments:
+//	x: 2-D or higher with shape `[..., r_x, c_x]`.
+//	y: 2-D or higher with shape `[..., r_y, c_y]`.
+//
+// Returns 3-D or higher with shape `[..., r_o, c_o]`
+func BatchMatMul(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMulAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "BatchMatMul",
+		Input: []tf.Input{
+			x, y,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns which elements of x are NaN.
+//
+// @compatibility(numpy)
+// Equivalent to np.isnan
+// @end_compatibility
+func IsNan(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IsNan",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes rectified linear gradients for a Relu operation.
+//
+// Arguments:
+//	gradients: The backpropagated gradients to the corresponding Relu operation.
+//	features: The features passed as input to the corresponding Relu operation, OR
+// the outputs of that operation (both work equivalently).
+//
+// Returns `gradients * (features > 0)`.
+func ReluGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReluGrad",
+		Input: []tf.Input{
+			gradients, features,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the gradient of morphological 2-D dilation with respect to the input.
+//
+// Arguments:
+//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
+//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
+//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
+// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
+//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
+// Must be: `[1, rate_height, rate_width, 1]`.
+//	padding: The type of padding algorithm to use.
+//
+// Returns 4-D with shape `[batch, in_height, in_width, depth]`.
+func Dilation2DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (in_backprop tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
+	opspec := tf.OpSpec{
+		Type: "Dilation2DBackpropInput",
+		Input: []tf.Input{
+			input, filter, out_backprop,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// CTCBeamSearchDecoderAttr is an optional argument to CTCBeamSearchDecoder.
+type CTCBeamSearchDecoderAttr func(optionalAttr)
+
+// CTCBeamSearchDecoderMergeRepeated sets the optional merge_repeated attribute to value.
+//
+// value: If true, merge repeated classes in output.
+// If not specified, defaults to true
+func CTCBeamSearchDecoderMergeRepeated(value bool) CTCBeamSearchDecoderAttr {
+	return func(m optionalAttr) {
+		m["merge_repeated"] = value
+	}
+}
+
+// Performs beam search decoding on the logits given in input.
+//
+// A note about the attribute merge_repeated: For the beam search decoder,
+// this means that if consecutive entries in a beam are the same, only
+// the first of these is emitted.  That is, when the top path is "A B B B B",
+// "A B" is returned if merge_repeated = True but "A B B B B" is
+// returned if merge_repeated = False.
+//
+// Arguments:
+//	inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
+//	sequence_length: A vector containing sequence lengths, size `(batch)`.
+//	beam_width: A scalar >= 0 (beam search beam width).
+//	top_paths: A scalar >= 0, <= beam_width (controls output size).
+//
+// Returns A list (length: top_paths) of indices matrices.  Matrix j,
+// size `(total_decoded_outputs[j] x 2)`, has indices of a
+// `SparseTensor<int64, 2>`.  The rows store: [batch, time].A list (length: top_paths) of values vectors.  Vector j,
+// size `(length total_decoded_outputs[j])`, has the values of a
+// `SparseTensor<int64, 2>`.  The vector stores the decoded classes for beam j.A list (length: top_paths) of shape vector.  Vector j,
+// size `(2)`, stores the shape of the decoded `SparseTensor[j]`.
+// Its values are: `[batch_size, max_decoded_length[j]]`.A matrix, shaped: `(batch_size x top_paths)`.  The
+// sequence log-probabilities.
+func CTCBeamSearchDecoder(scope *Scope, inputs tf.Output, sequence_length tf.Output, beam_width int64, top_paths int64, optional ...CTCBeamSearchDecoderAttr) (decoded_indices []tf.Output, decoded_values []tf.Output, decoded_shape []tf.Output, log_probability tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"beam_width": beam_width, "top_paths": top_paths}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "CTCBeamSearchDecoder",
+		Input: []tf.Input{
+			inputs, sequence_length,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if decoded_indices, idx, err = makeOutputList(op, idx, "decoded_indices"); err != nil {
+		scope.UpdateErr("CTCBeamSearchDecoder", err)
+		return
+	}
 	if decoded_values, idx, err = makeOutputList(op, idx, "decoded_values"); err != nil {
 		scope.UpdateErr("CTCBeamSearchDecoder", err)
 		return
@@ -4227,110 +4904,26 @@ func MaxPoolGradGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output
 	return op.Output(0)
 }
 
-// MaxPoolAttr is an optional argument to MaxPool.
-type MaxPoolAttr func(optionalAttr)
-
-// MaxPoolDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolDataFormat(value string) MaxPoolAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Performs max pooling on the input.
+// Computes gradients of the maxpooling function.
 //
 // Arguments:
-//	input: 4-D input to pool over.
+//	input: The original input.
+//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
+// output of `max_pool`.
+//	argmax: The indices of the maximum values chosen for each output of `max_pool`.
 //	ksize: The size of the window for each dimension of the input tensor.
 //	strides: The stride of the sliding window for each dimension of the
 // input tensor.
 //	padding: The type of padding algorithm to use.
 //
-// Returns The max pooled output tensor.
-func MaxPool(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolAttr) (output tf.Output) {
+// Returns Gradients w.r.t. the input of `max_pool`.
+func MaxPoolGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "MaxPool",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Bucketizes 'input' based on 'boundaries'.
-//
-// For example, if the inputs are
-//     boundaries = [0, 10, 100]
-//     input = [[-5, 10000]
-//              [150,   10]
-//              [5,    100]]
-//
-// then the output will be
-//     output = [[0, 3]
-//               [3, 2]
-//               [1, 3]]
-//
-// Arguments:
-//	input: Any shape of Tensor contains with int or float type.
-//	boundaries: A sorted list of floats gives the boundary of the buckets.
-//
-// Returns Same shape with 'input', each value of input replaced with bucket index.
-//
-// @compatibility(numpy)
-// Equivalent to np.digitize.
-// @end_compatibility
-func Bucketize(scope *Scope, input tf.Output, boundaries []float32) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"boundaries": boundaries}
-	opspec := tf.OpSpec{
-		Type: "Bucketize",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes gradients of the maxpooling function.
-//
-// Arguments:
-//	input: The original input.
-//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
-// output of `max_pool`.
-//	argmax: The indices of the maximum values chosen for each output of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
-//
-// Returns Gradients w.r.t. the input of `max_pool`.
-func MaxPoolGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	opspec := tf.OpSpec{
-		Type: "MaxPoolGradWithArgmax",
+		Type: "MaxPoolGradWithArgmax",
 		Input: []tf.Input{
 			input, grad, argmax,
 		},
@@ -4408,6 +5001,194 @@ func Mod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
+// DepthToSpaceAttr is an optional argument to DepthToSpace.
+type DepthToSpaceAttr func(optionalAttr)
+
+// DepthToSpaceDataFormat sets the optional data_format attribute to value.
+// If not specified, defaults to "NHWC"
+func DepthToSpaceDataFormat(value string) DepthToSpaceAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// DepthToSpace for tensors of type T.
+//
+// Rearranges data from depth into blocks of spatial data.
+// This is the reverse transformation of SpaceToDepth. More specifically,
+// this op outputs a copy of the input tensor where values from the `depth`
+// dimension are moved in spatial blocks to the `height` and `width` dimensions.
+// The attr `block_size` indicates the input block size and how the data is moved.
+//
+//   * Chunks of data of size `block_size * block_size` from depth are rearranged
+//     into non-overlapping blocks of size `block_size x block_size`
+//   * The width the output tensor is `input_depth * block_size`, whereas the
+//     height is `input_height * block_size`.
+//   * The Y, X coordinates within each block of the output image are determined
+//     by the high order component of the input channel index.
+//   * The depth of the input tensor must be divisible by
+//     `block_size * block_size`.
+//
+// The `data_format` attr specifies the layout of the input and output tensors
+// with the following options:
+//   "NHWC": `[ batch, height, width, channels ]`
+//   "NCHW": `[ batch, channels, height, width ]`
+//   "NCHW_VECT_C":
+//       `qint8 [ batch, channels / 4, height, width, 4 ]`
+//
+// It is useful to consider the operation as transforming a 6-D Tensor.
+// e.g. for data_format = NHWC,
+//      Each element in the input tensor can be specified via 6 coordinates,
+//      ordered by decreasing memory layout significance as:
+//      n,iY,iX,bY,bX,oC  (where n=batch index, iX, iY means X or Y coordinates
+//                         within the input image, bX, bY means coordinates
+//                         within the output block, oC means output channels).
+//      The output would be the input transposed to the following layout:
+//      n,iY,bY,iX,bX,oC
+//
+// This operation is useful for resizing the activations between convolutions
+// (but keeping all data), e.g. instead of pooling. It is also useful for training
+// purely convolutional models.
+//
+// For example, given an input of shape `[1, 1, 1, 4]`, data_format = "NHWC" and
+// block_size = 2:
+//
+// ```
+// x = [[[[1, 2, 3, 4]]]]
+//
+// ```
+//
+// This operation will output a tensor of shape `[1, 2, 2, 1]`:
+//
+// ```
+//    [[[[1], [2]],
+//      [[3], [4]]]]
+// ```
+//
+// Here, the input has a batch of 1 and each batch element has shape `[1, 1, 4]`,
+// the corresponding output will have 2x2 elements and will have a depth of
+// 1 channel (1 = `4 / (block_size * block_size)`).
+// The output element shape is `[2, 2, 1]`.
+//
+// For an input tensor with larger depth, here of shape `[1, 1, 1, 12]`, e.g.
+//
+// ```
+// x = [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]
+// ```
+//
+// This operation, for block size of 2, will return the following tensor of shape
+// `[1, 2, 2, 3]`
+//
+// ```
+//    [[[[1, 2, 3], [4, 5, 6]],
+//      [[7, 8, 9], [10, 11, 12]]]]
+//
+// ```
+//
+// Similarly, for the following input of shape `[1 2 2 4]`, and a block size of 2:
+//
+// ```
+// x =  [[[[1, 2, 3, 4],
+//        [5, 6, 7, 8]],
+//       [[9, 10, 11, 12],
+//        [13, 14, 15, 16]]]]
+// ```
+//
+// the operator will return the following tensor of shape `[1 4 4 1]`:
+//
+// ```
+// x = [[[ [1],   [2],  [5],  [6]],
+//       [ [3],   [4],  [7],  [8]],
+//       [ [9],  [10], [13],  [14]],
+//       [ [11], [12], [15],  [16]]]]
+//
+// ```
+//
+// Arguments:
+//
+//	block_size: The size of the spatial block, same as in Space2Depth.
+func DepthToSpace(scope *Scope, input tf.Output, block_size int64, optional ...DepthToSpaceAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"block_size": block_size}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DepthToSpace",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Conv3DBackpropInputV2Attr is an optional argument to Conv3DBackpropInputV2.
+type Conv3DBackpropInputV2Attr func(optionalAttr)
+
+// Conv3DBackpropInputV2DataFormat sets the optional data_format attribute to value.
+//
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Conv3DBackpropInputV2Dilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 5.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes the gradients of 3-D convolution with respect to the input.
+//
+// Arguments:
+//	input_sizes: An integer vector representing the tensor shape of `input`,
+// where `input` is a 5-D
+// `[batch, depth, rows, cols, in_channels]` tensor.
+//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
+// `in_channels` must match between `input` and `filter`.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3DBackpropInputV2(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropInputV2Attr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Conv3DBackpropInputV2",
+		Input: []tf.Input{
+			input_sizes, filter, out_backprop,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Computes square root of x element-wise.
 //
 // I.e., \\(y = \sqrt{x} = x^{1/2}\\).
@@ -4472,102 +5253,6 @@ func RsqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
-// ReverseSequenceAttr is an optional argument to ReverseSequence.
-type ReverseSequenceAttr func(optionalAttr)
-
-// ReverseSequenceBatchDim sets the optional batch_dim attribute to value.
-//
-// value: The dimension along which reversal is performed.
-// If not specified, defaults to 0
-func ReverseSequenceBatchDim(value int64) ReverseSequenceAttr {
-	return func(m optionalAttr) {
-		m["batch_dim"] = value
-	}
-}
-
-// Reverses variable length slices.
-//
-// This op first slices `input` along the dimension `batch_dim`, and for each
-// slice `i`, reverses the first `seq_lengths[i]` elements along
-// the dimension `seq_dim`.
-//
-// The elements of `seq_lengths` must obey `seq_lengths[i] <= input.dims[seq_dim]`,
-// and `seq_lengths` must be a vector of length `input.dims[batch_dim]`.
-//
-// The output slice `i` along dimension `batch_dim` is then given by input
-// slice `i`, with the first `seq_lengths[i]` slices along dimension
-// `seq_dim` reversed.
-//
-// For example:
-//
-// ```
-// # Given this:
-// batch_dim = 0
-// seq_dim = 1
-// input.dims = (4, 8, ...)
-// seq_lengths = [7, 2, 3, 5]
-//
-// # then slices of input are reversed on seq_dim, but only up to seq_lengths:
-// output[0, 0:7, :, ...] = input[0, 7:0:-1, :, ...]
-// output[1, 0:2, :, ...] = input[1, 2:0:-1, :, ...]
-// output[2, 0:3, :, ...] = input[2, 3:0:-1, :, ...]
-// output[3, 0:5, :, ...] = input[3, 5:0:-1, :, ...]
-//
-// # while entries past seq_lens are copied through:
-// output[0, 7:, :, ...] = input[0, 7:, :, ...]
-// output[1, 2:, :, ...] = input[1, 2:, :, ...]
-// output[2, 3:, :, ...] = input[2, 3:, :, ...]
-// output[3, 2:, :, ...] = input[3, 2:, :, ...]
-// ```
-//
-// In contrast, if:
-//
-// ```
-// # Given this:
-// batch_dim = 2
-// seq_dim = 0
-// input.dims = (8, ?, 4, ...)
-// seq_lengths = [7, 2, 3, 5]
-//
-// # then slices of input are reversed on seq_dim, but only up to seq_lengths:
-// output[0:7, :, 0, :, ...] = input[7:0:-1, :, 0, :, ...]
-// output[0:2, :, 1, :, ...] = input[2:0:-1, :, 1, :, ...]
-// output[0:3, :, 2, :, ...] = input[3:0:-1, :, 2, :, ...]
-// output[0:5, :, 3, :, ...] = input[5:0:-1, :, 3, :, ...]
-//
-// # while entries past seq_lens are copied through:
-// output[7:, :, 0, :, ...] = input[7:, :, 0, :, ...]
-// output[2:, :, 1, :, ...] = input[2:, :, 1, :, ...]
-// output[3:, :, 2, :, ...] = input[3:, :, 2, :, ...]
-// output[2:, :, 3, :, ...] = input[2:, :, 3, :, ...]
-// ```
-//
-// Arguments:
-//	input: The input to reverse.
-//	seq_lengths: 1-D with length `input.dims(batch_dim)` and
-// `max(seq_lengths) <= input.dims(seq_dim)`
-//	seq_dim: The dimension which is partially reversed.
-//
-// Returns The partially reversed input. It has the same shape as `input`.
-func ReverseSequence(scope *Scope, input tf.Output, seq_lengths tf.Output, seq_dim int64, optional ...ReverseSequenceAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"seq_dim": seq_dim}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ReverseSequence",
-		Input: []tf.Input{
-			input, seq_lengths,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // DepthwiseConv2dNativeAttr is an optional argument to DepthwiseConv2dNative.
 type DepthwiseConv2dNativeAttr func(optionalAttr)
 
@@ -4717,45 +5402,47 @@ func ReaderRestoreStateV2(scope *Scope, reader_handle tf.Output, state tf.Output
 	return scope.AddOperation(opspec)
 }
 
-// TensorArrayGatherV3Attr is an optional argument to TensorArrayGatherV3.
-type TensorArrayGatherV3Attr func(optionalAttr)
+// MaxPoolGradAttr is an optional argument to MaxPoolGrad.
+type MaxPoolGradAttr func(optionalAttr)
 
-// TensorArrayGatherV3ElementShape sets the optional element_shape attribute to value.
+// MaxPoolGradDataFormat sets the optional data_format attribute to value.
 //
-// value: The expected shape of an element, if known. Used to
-// validate the shapes of TensorArray elements. If this shape is not
-// fully specified, gathering zero-size TensorArrays is an error.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayGatherV3ElementShape(value tf.Shape) TensorArrayGatherV3Attr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolGradDataFormat(value string) MaxPoolGradAttr {
 	return func(m optionalAttr) {
-		m["element_shape"] = value
+		m["data_format"] = value
 	}
 }
 
-// Gather specific elements from the TensorArray into output `value`.
-//
-// All elements selected by `indices` must have the same shape.
+// Computes gradients of the maxpooling function.
 //
 // Arguments:
-//	handle: The handle to a TensorArray.
-//	indices: The locations in the TensorArray from which to read tensor elements.
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	dtype: The type of the elem that is returned.
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns All of the elements in the TensorArray, concatenated along a new
-// axis (the new dimension 0).
-func TensorArrayGatherV3(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV3Attr) (value tf.Output) {
+// Returns Gradients w.r.t. the input to `max_pool`.
+func MaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayGatherV3",
+		Type: "MaxPoolGrad",
 		Input: []tf.Input{
-			handle, indices, flow_in,
+			orig_input, orig_output, grad,
 		},
 		Attrs: attrs,
 	}
@@ -4763,277 +5450,375 @@ func TensorArrayGatherV3(scope *Scope, handle tf.Output, indices tf.Output, flow
 	return op.Output(0)
 }
 
-// Converts each string in the input Tensor to its hash mod by a number of buckets.
-//
-// The hash function is deterministic on the content of the string within the
-// process and will never change. However, it is not suitable for cryptography.
-// This function may be used when CPU time is scarce and inputs are trusted or
-// unimportant. There is a risk of adversaries constructing inputs that all hash
-// to the same bucket. To prevent this problem, use a strong hash function with
-// `tf.string_to_hash_bucket_strong`.
-//
-// Arguments:
-//	input: The strings to assign a hash bucket.
-//	num_buckets: The number of buckets.
+// CropAndResizeAttr is an optional argument to CropAndResize.
+type CropAndResizeAttr func(optionalAttr)
+
+// CropAndResizeMethod sets the optional method attribute to value.
 //
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToHashBucketFast(scope *Scope, input tf.Output, num_buckets int64) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_buckets": num_buckets}
-	opspec := tf.OpSpec{
-		Type: "StringToHashBucketFast",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
+// value: A string specifying the interpolation method. Only 'bilinear' is
+// supported for now.
+// If not specified, defaults to "bilinear"
+func CropAndResizeMethod(value string) CropAndResizeAttr {
+	return func(m optionalAttr) {
+		m["method"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Returns the max of x and y (i.e. x > y ? x : y) element-wise.
+// CropAndResizeExtrapolationValue sets the optional extrapolation_value attribute to value.
 //
-// *NOTE*: `Maximum` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Maximum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Maximum",
-		Input: []tf.Input{
-			x, y,
-		},
+// value: Value used for extrapolation, when applicable.
+// If not specified, defaults to 0
+func CropAndResizeExtrapolationValue(value float32) CropAndResizeAttr {
+	return func(m optionalAttr) {
+		m["extrapolation_value"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Outputs all keys and values in the table.
+// Extracts crops from the input image tensor and bilinearly resizes them (possibly
 //
-// Arguments:
-//	table_handle: Handle to the table.
+// with aspect ratio change) to a common output size specified by `crop_size`. This
+// is more general than the `crop_to_bounding_box` op which extracts a fixed size
+// slice from the input image and does not allow resizing or aspect ratio change.
 //
+// Returns a tensor with `crops` from the input `image` at positions defined at the
+// bounding box locations in `boxes`. The cropped boxes are all resized (with
+// bilinear interpolation) to a fixed `size = [crop_height, crop_width]`. The
+// result is a 4-D tensor `[num_boxes, crop_height, crop_width, depth]`. The
+// resizing is corner aligned. In particular, if `boxes = [[0, 0, 1, 1]]`, the
+// method will give identical results to using `tf.image.resize_bilinear()`
+// with `align_corners=True`.
 //
+// Arguments:
+//	image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
+// Both `image_height` and `image_width` need to be positive.
+//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
+// specifies the coordinates of a box in the `box_ind[i]` image and is specified
+// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
+// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
+// `[0, 1]` interval of normalized image height is mapped to
+// `[0, image_height - 1]` in image height coordinates. We do allow `y1` > `y2`, in
+// which case the sampled crop is an up-down flipped version of the original
+// image. The width dimension is treated similarly. Normalized coordinates
+// outside the `[0, 1]` range are allowed, in which case we use
+// `extrapolation_value` to extrapolate the input image values.
+//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
+// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
+//	crop_size: A 1-D tensor of 2 elements, `size = [crop_height, crop_width]`. All
+// cropped image patches are resized to this size. The aspect ratio of the image
+// content is not preserved. Both `crop_height` and `crop_width` need to be
+// positive.
 //
-// Returns Vector of all keys present in the table.Tensor of all values in the table. Indexed in parallel with `keys`.
-func LookupTableExportV2(scope *Scope, table_handle tf.Output, Tkeys tf.DataType, Tvalues tf.DataType) (keys tf.Output, values tf.Output) {
+// Returns A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
+func CropAndResize(scope *Scope, image tf.Output, boxes tf.Output, box_ind tf.Output, crop_size tf.Output, optional ...CropAndResizeAttr) (crops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"Tkeys": Tkeys, "Tvalues": Tvalues}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "LookupTableExportV2",
+		Type: "CropAndResize",
 		Input: []tf.Input{
-			table_handle,
+			image, boxes, box_ind, crop_size,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Real-valued fast Fourier transform.
-//
-// Computes the 1-dimensional discrete Fourier transform of a real-valued signal
-// over the inner-most dimension of `input`.
+// Fills empty rows in the input 2-D `SparseTensor` with a default value.
 //
-// Since the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the
-// `fft_length / 2 + 1` unique components of the FFT: the zero-frequency term,
-// followed by the `fft_length / 2` positive-frequency terms.
+// The input `SparseTensor` is represented via the tuple of inputs
+// (`indices`, `values`, `dense_shape`).  The output `SparseTensor` has the
+// same `dense_shape` but with indices `output_indices` and values
+// `output_values`.
 //
-// Along the axis `RFFT` is computed on, if `fft_length` is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
+// This op inserts a single entry for every row that doesn't have any values.
+// The index is created as `[row, 0, ..., 0]` and the inserted value
+// is `default_value`.
 //
-// Arguments:
-//	input: A float32 tensor.
-//	fft_length: An int32 tensor of shape [1]. The FFT length.
+// For example, suppose `sp_input` has shape `[5, 6]` and non-empty values:
 //
-// Returns A complex64 tensor of the same rank as `input`. The inner-most
-//   dimension of `input` is replaced with the `fft_length / 2 + 1` unique
-//   frequency components of its 1D Fourier transform.
+//     [0, 1]: a
+//     [0, 3]: b
+//     [2, 0]: c
+//     [3, 1]: d
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.rfft
-// @end_compatibility
-func RFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Rows 1 and 4 are empty, so the output will be of shape `[5, 6]` with values:
+//
+//     [0, 1]: a
+//     [0, 3]: b
+//     [1, 0]: default_value
+//     [2, 0]: c
+//     [3, 1]: d
+//     [4, 0]: default_value
+//
+// The output `SparseTensor` will be in row-major order and will have the
+// same shape as the input.
+//
+// This op also returns an indicator vector shaped `[dense_shape[0]]` such that
+//
+//     empty_row_indicator[i] = True iff row i was an empty row.
+//
+// And a reverse index map vector shaped `[indices.shape[0]]` that is used during
+// backpropagation,
+//
+//     reverse_index_map[j] = out_j s.t. indices[j, :] == output_indices[out_j, :]
+//
+// Arguments:
+//	indices: 2-D. the indices of the sparse tensor.
+//	values: 1-D. the values of the sparse tensor.
+//	dense_shape: 1-D. the shape of the sparse tensor.
+//	default_value: 0-D. default value to insert into location `[row, 0, ..., 0]`
+//   for rows missing from the input sparse tensor.
+// output indices: 2-D. the indices of the filled sparse tensor.
+//
+// Returns 1-D. the values of the filled sparse tensor.1-D. whether the dense row was missing in the
+// input sparse tensor.1-D. a map from the input indices to the output indices.
+func SparseFillEmptyRows(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output, default_value tf.Output) (output_indices tf.Output, output_values tf.Output, empty_row_indicator tf.Output, reverse_index_map tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RFFT",
+		Type: "SparseFillEmptyRows",
 		Input: []tf.Input{
-			input, fft_length,
+			indices, values, dense_shape, default_value,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ComplexAttr is an optional argument to Complex.
-type ComplexAttr func(optionalAttr)
-
-// ComplexTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_COMPLEX64
-func ComplexTout(value tf.DataType) ComplexAttr {
-	return func(m optionalAttr) {
-		m["Tout"] = value
-	}
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// Converts two real numbers to a complex number.
+// Reverses specific dimensions of a tensor.
 //
-// Given a tensor `real` representing the real part of a complex number, and a
-// tensor `imag` representing the imaginary part of a complex number, this
-// operation returns complex numbers elementwise of the form \\(a + bj\\), where
-// *a* represents the `real` part and *b* represents the `imag` part.
+// Given a `tensor`, and a `bool` tensor `dims` representing the dimensions
+// of `tensor`, this operation reverses each dimension i of `tensor` where
+// `dims[i]` is `True`.
 //
-// The input tensors `real` and `imag` must have the same shape.
+// `tensor` can have up to 8 dimensions. The number of dimensions
+// of `tensor` must equal the number of elements in `dims`. In other words:
+//
+// `rank(tensor) = size(dims)`
 //
 // For example:
 //
 // ```
-// # tensor 'real' is [2.25, 3.25]
-// # tensor `imag` is [4.75, 5.75]
-// tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]
+// # tensor 't' is [[[[ 0,  1,  2,  3],
+// #                  [ 4,  5,  6,  7],
+// #                  [ 8,  9, 10, 11]],
+// #                 [[12, 13, 14, 15],
+// #                  [16, 17, 18, 19],
+// #                  [20, 21, 22, 23]]]]
+// # tensor 't' shape is [1, 2, 3, 4]
+//
+// # 'dims' is [False, False, False, True]
+// reverse(t, dims) ==> [[[[ 3,  2,  1,  0],
+//                         [ 7,  6,  5,  4],
+//                         [ 11, 10, 9, 8]],
+//                        [[15, 14, 13, 12],
+//                         [19, 18, 17, 16],
+//                         [23, 22, 21, 20]]]]
+//
+// # 'dims' is [False, True, False, False]
+// reverse(t, dims) ==> [[[[12, 13, 14, 15],
+//                         [16, 17, 18, 19],
+//                         [20, 21, 22, 23]
+//                        [[ 0,  1,  2,  3],
+//                         [ 4,  5,  6,  7],
+//                         [ 8,  9, 10, 11]]]]
+//
+// # 'dims' is [False, False, True, False]
+// reverse(t, dims) ==> [[[[8, 9, 10, 11],
+//                         [4, 5, 6, 7],
+//                         [0, 1, 2, 3]]
+//                        [[20, 21, 22, 23],
+//                         [16, 17, 18, 19],
+//                         [12, 13, 14, 15]]]]
 // ```
-func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAttr) (out tf.Output) {
+//
+// Arguments:
+//	tensor: Up to 8-D.
+//	dims: 1-D. The dimensions to reverse.
+//
+// Returns The same shape as `tensor`.
+func Reverse(scope *Scope, tensor tf.Output, dims tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Complex",
+		Type: "Reverse",
 		Input: []tf.Input{
-			real, imag,
+			tensor, dims,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ImagAttr is an optional argument to Imag.
-type ImagAttr func(optionalAttr)
-
-// ImagTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func ImagTout(value tf.DataType) ImagAttr {
-	return func(m optionalAttr) {
-		m["Tout"] = value
+// Computes log softmax activations.
+//
+// For each batch `i` and class `j` we have
+//
+//     logsoftmax[i, j] = logits[i, j] - log(sum(exp(logits[i])))
+//
+// Arguments:
+//	logits: 2-D with shape `[batch_size, num_classes]`.
+//
+// Returns Same shape as `logits`.
+func LogSoftmax(scope *Scope, logits tf.Output) (logsoftmax tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LogSoftmax",
+		Input: []tf.Input{
+			logits,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns the imaginary part of a complex number.
+// Computes the inverse permutation of a tensor.
 //
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// type `float` that is the imaginary part of each element in `input`. All
-// elements in `input` must be complex numbers of the form \\(a + bj\\), where *a*
-// is the real part and *b* is the imaginary part returned by this operation.
+// This operation computes the inverse of an index permutation. It takes a 1-D
+// integer tensor `x`, which represents the indices of a zero-based array, and
+// swaps each value with its index position. In other words, for an output tensor
+// `y` and an input tensor `x`, this operation computes the following:
+//
+// `y[x[i]] = i for i in [0, 1, ..., len(x) - 1]`
+//
+// The values must include 0. There can be no duplicate values or negative values.
 //
 // For example:
 //
 // ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.imag(input) ==> [4.75, 5.75]
+// # tensor `x` is [3, 4, 0, 2, 1]
+// invert_permutation(x) ==> [2, 4, 3, 0, 1]
 // ```
-func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output) {
+//
+// Arguments:
+//	x: 1-D.
+//
+// Returns 1-D.
+func InvertPermutation(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Imag",
+		Type: "InvertPermutation",
 		Input: []tf.Input{
-			input,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Compute the Hurwitz zeta function \\(\zeta(x, q)\\).
+// BiasAddGradAttr is an optional argument to BiasAddGrad.
+type BiasAddGradAttr func(optionalAttr)
+
+// BiasAddGradDataFormat sets the optional data_format attribute to value.
 //
-// The Hurwitz zeta function is defined as:
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the bias tensor will be added to the last dimension
+// of the value tensor.
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// The tensor will be added to "in_channels", the third-to-the-last
+//     dimension.
+// If not specified, defaults to "NHWC"
+func BiasAddGradDataFormat(value string) BiasAddGradAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// The backward operation for "BiasAdd" on the "bias" tensor.
 //
+// It accumulates all the values from out_backprop into the feature dimension.
+// For NHWC data format, the feature dimension is the last. For NCHW data format,
+// the feature dimension is the third-to-last.
 //
-// \\(\zeta(x, q) = \sum_{n=0}^{\infty} (q + n)^{-x}\\)
-func Zeta(scope *Scope, x tf.Output, q tf.Output) (z tf.Output) {
+// Arguments:
+//	out_backprop: Any number of dimensions.
+//
+// Returns 1-D with size the feature dimension of `out_backprop`.
+func BiasAddGrad(scope *Scope, out_backprop tf.Output, optional ...BiasAddGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Zeta",
+		Type: "BiasAddGrad",
 		Input: []tf.Input{
-			x, q,
+			out_backprop,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// LRNGradAttr is an optional argument to LRNGrad.
-type LRNGradAttr func(optionalAttr)
+// FusedBatchNormV2Attr is an optional argument to FusedBatchNormV2.
+type FusedBatchNormV2Attr func(optionalAttr)
 
-// LRNGradDepthRadius sets the optional depth_radius attribute to value.
+// FusedBatchNormV2Epsilon sets the optional epsilon attribute to value.
 //
-// value: A depth radius.
-// If not specified, defaults to 5
-func LRNGradDepthRadius(value int64) LRNGradAttr {
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormV2Epsilon(value float32) FusedBatchNormV2Attr {
 	return func(m optionalAttr) {
-		m["depth_radius"] = value
+		m["epsilon"] = value
 	}
 }
 
-// LRNGradBias sets the optional bias attribute to value.
+// FusedBatchNormV2DataFormat sets the optional data_format attribute to value.
 //
-// value: An offset (usually > 0 to avoid dividing by 0).
-// If not specified, defaults to 1
-func LRNGradBias(value float32) LRNGradAttr {
+// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
+// If not specified, defaults to "NHWC"
+func FusedBatchNormV2DataFormat(value string) FusedBatchNormV2Attr {
 	return func(m optionalAttr) {
-		m["bias"] = value
+		m["data_format"] = value
 	}
 }
 
-// LRNGradAlpha sets the optional alpha attribute to value.
+// FusedBatchNormV2IsTraining sets the optional is_training attribute to value.
 //
-// value: A scale factor, usually positive.
-// If not specified, defaults to 1
-func LRNGradAlpha(value float32) LRNGradAttr {
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
+// If not specified, defaults to true
+func FusedBatchNormV2IsTraining(value bool) FusedBatchNormV2Attr {
 	return func(m optionalAttr) {
-		m["alpha"] = value
+		m["is_training"] = value
 	}
 }
 
-// LRNGradBeta sets the optional beta attribute to value.
+// Batch normalization.
 //
-// value: An exponent.
-// If not specified, defaults to 0.5
-func LRNGradBeta(value float32) LRNGradAttr {
-	return func(m optionalAttr) {
-		m["beta"] = value
-	}
-}
-
-// Gradients for Local Response Normalization.
+// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
 //
 // Arguments:
-//	input_grads: 4-D with shape `[batch, height, width, channels]`.
-//	input_image: 4-D with shape `[batch, height, width, channels]`.
-//	output_image: 4-D with shape `[batch, height, width, channels]`.
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	offset: A 1D Tensor for offset, to shift to the normalized x.
+//	mean: A 1D Tensor for population mean. Used for inference only;
+// must be empty for training.
+//	variance: A 1D Tensor for population variance. Used for inference only;
+// must be empty for training.
 //
-// Returns The gradients for LRN.
-func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_image tf.Output, optional ...LRNGradAttr) (output tf.Output) {
+// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
+// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
+// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
+// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
+// in the cuDNN case), to be reused in the gradient computation.
+func FusedBatchNormV2(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormV2Attr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -5042,43 +5827,65 @@ func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LRNGrad",
+		Type: "FusedBatchNormV2",
 		Input: []tf.Input{
-			input_grads, input_image, output_image,
+			x, scale, offset, mean, variance,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+}
+
+// Transforms a Tensor into a serialized TensorProto proto.
+//
+// Arguments:
+//	tensor: A Tensor of type `T`.
+//
+// Returns A serialized TensorProto proto of the input tensor.
+func SerializeTensor(scope *Scope, tensor tf.Output) (serialized tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SerializeTensor",
+		Input: []tf.Input{
+			tensor,
+		},
+	}
+	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AnyAttr is an optional argument to Any.
-type AnyAttr func(optionalAttr)
+// MatrixSolveAttr is an optional argument to MatrixSolve.
+type MatrixSolveAttr func(optionalAttr)
 
-// AnyKeepDims sets the optional keep_dims attribute to value.
+// MatrixSolveAdjoint sets the optional adjoint attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
+// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
+// adjoint.
 // If not specified, defaults to false
-func AnyKeepDims(value bool) AnyAttr {
+func MatrixSolveAdjoint(value bool) MatrixSolveAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["adjoint"] = value
 	}
 }
 
-// Computes the "logical or" of elements across dimensions of a tensor.
+// Solves systems of linear equations.
 //
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// `Matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. `Rhs` is a tensor of shape `[..., M, K]`. The `output` is
+// a tensor shape `[..., M, K]`.  If `adjoint` is `False` then each output matrix
+// satisfies `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
+// If `adjoint` is `True` then each output matrix satisfies
+// `adjoint(matrix[..., :, :]) * output[..., :, :] = rhs[..., :, :]`.
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
+//	matrix: Shape is `[..., M, M]`.
+//	rhs: Shape is `[..., M, K]`.
 //
-// Returns The reduced tensor.
-func Any(scope *Scope, input tf.Output, axis tf.Output, optional ...AnyAttr) (output tf.Output) {
+// Returns Shape is `[..., M, K]`.
+func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixSolveAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -5087,9 +5894,9 @@ func Any(scope *Scope, input tf.Output, axis tf.Output, optional ...AnyAttr) (ou
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Any",
+		Type: "MatrixSolve",
 		Input: []tf.Input{
-			input, axis,
+			matrix, rhs,
 		},
 		Attrs: attrs,
 	}
@@ -5097,105 +5904,122 @@ func Any(scope *Scope, input tf.Output, axis tf.Output, optional ...AnyAttr) (ou
 	return op.Output(0)
 }
 
-// ResourceApplyFtrlAttr is an optional argument to ResourceApplyFtrl.
-type ResourceApplyFtrlAttr func(optionalAttr)
-
-// ResourceApplyFtrlUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyFtrlUseLocking(value bool) ResourceApplyFtrlAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
+// Computes acos of x element-wise.
+func Acos(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Acos",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Update '*var' according to the Ftrl-proximal scheme.
+// Real-valued fast Fourier transform.
 //
-// accum_new = accum + grad * grad
-// linear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
+// Computes the 1-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most dimension of `input`.
+//
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the
+// `fft_length / 2 + 1` unique components of the FFT: the zero-frequency term,
+// followed by the `fft_length / 2` positive-frequency terms.
+//
+// Along the axis `RFFT` is computed on, if `fft_length` is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regulariation. Must be a scalar.
-//	l2: L2 regulariation. Must be a scalar.
-//	lr_power: Scaling factor. Must be a scalar.
+//	input: A float32 tensor.
+//	fft_length: An int32 tensor of shape [1]. The FFT length.
 //
-// Returns the created operation.
-func ResourceApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlAttr) (o *tf.Operation) {
+// Returns A complex64 tensor of the same rank as `input`. The inner-most
+//   dimension of `input` is replaced with the `fft_length / 2 + 1` unique
+//   frequency components of its 1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.rfft
+// @end_compatibility
+func RFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyFtrl",
+		Type: "RFFT",
 		Input: []tf.Input{
-			var_, accum, linear, grad, lr, l1, l2, lr_power,
+			input, fft_length,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// RandomUniformAttr is an optional argument to RandomUniform.
-type RandomUniformAttr func(optionalAttr)
+// DepthwiseConv2dNativeBackpropFilterAttr is an optional argument to DepthwiseConv2dNativeBackpropFilter.
+type DepthwiseConv2dNativeBackpropFilterAttr func(optionalAttr)
 
-// RandomUniformSeed sets the optional seed attribute to value.
+// DepthwiseConv2dNativeBackpropFilterDataFormat sets the optional data_format attribute to value.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomUniformSeed(value int64) RandomUniformAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
+// If not specified, defaults to "NHWC"
+func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2dNativeBackpropFilterAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["data_format"] = value
 	}
 }
 
-// RandomUniformSeed2 sets the optional seed2 attribute to value.
+// DepthwiseConv2dNativeBackpropFilterDilations sets the optional dilations attribute to value.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomUniformSeed2(value int64) RandomUniformAttr {
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["dilations"] = value
 	}
 }
 
-// Outputs random values from a uniform distribution.
-//
-// The generated values follow a uniform distribution in the range `[0, 1)`. The
-// lower bound 0 is included in the range, while the upper bound 1 is excluded.
+// Computes the gradients of depthwise convolution with respect to the filter.
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
+//	input: 4-D with shape based on `data_format`.  For example, if
+// `data_format` is 'NHWC' then `input` is a 4-D `[batch, in_height,
+// in_width, in_channels]` tensor.
+//	filter_sizes: An integer vector representing the tensor shape of `filter`,
+// where `filter` is a 4-D
+// `[filter_height, filter_width, in_channels, depthwise_multiplier]` tensor.
+//	out_backprop: 4-D with shape  based on `data_format`.
+// For example, if `data_format` is 'NHWC' then
+// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution.
+//	padding: The type of padding algorithm to use.
 //
-// Returns A tensor of the specified shape filled with uniform random values.
-func RandomUniform(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomUniformAttr) (output tf.Output) {
+// Returns 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
+// the `filter` input of the convolution.
+func DepthwiseConv2dNativeBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropFilterAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomUniform",
+		Type: "DepthwiseConv2dNativeBackpropFilter",
 		Input: []tf.Input{
-			shape,
+			input, filter_sizes, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -5203,12 +6027,233 @@ func RandomUniform(scope *Scope, shape tf.Output, dtype tf.DataType, optional ..
 	return op.Output(0)
 }
 
-// AssertAttr is an optional argument to Assert.
-type AssertAttr func(optionalAttr)
+// LRNGradAttr is an optional argument to LRNGrad.
+type LRNGradAttr func(optionalAttr)
 
-// AssertSummarize sets the optional summarize attribute to value.
+// LRNGradDepthRadius sets the optional depth_radius attribute to value.
 //
-// value: Print this many entries of each tensor.
+// value: A depth radius.
+// If not specified, defaults to 5
+func LRNGradDepthRadius(value int64) LRNGradAttr {
+	return func(m optionalAttr) {
+		m["depth_radius"] = value
+	}
+}
+
+// LRNGradBias sets the optional bias attribute to value.
+//
+// value: An offset (usually > 0 to avoid dividing by 0).
+// If not specified, defaults to 1
+func LRNGradBias(value float32) LRNGradAttr {
+	return func(m optionalAttr) {
+		m["bias"] = value
+	}
+}
+
+// LRNGradAlpha sets the optional alpha attribute to value.
+//
+// value: A scale factor, usually positive.
+// If not specified, defaults to 1
+func LRNGradAlpha(value float32) LRNGradAttr {
+	return func(m optionalAttr) {
+		m["alpha"] = value
+	}
+}
+
+// LRNGradBeta sets the optional beta attribute to value.
+//
+// value: An exponent.
+// If not specified, defaults to 0.5
+func LRNGradBeta(value float32) LRNGradAttr {
+	return func(m optionalAttr) {
+		m["beta"] = value
+	}
+}
+
+// Gradients for Local Response Normalization.
+//
+// Arguments:
+//	input_grads: 4-D with shape `[batch, height, width, channels]`.
+//	input_image: 4-D with shape `[batch, height, width, channels]`.
+//	output_image: 4-D with shape `[batch, height, width, channels]`.
+//
+// Returns The gradients for LRN.
+func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_image tf.Output, optional ...LRNGradAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "LRNGrad",
+		Input: []tf.Input{
+			input_grads, input_image, output_image,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// AnyAttr is an optional argument to Any.
+type AnyAttr func(optionalAttr)
+
+// AnyKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func AnyKeepDims(value bool) AnyAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the "logical or" of elements across dimensions of a tensor.
+//
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
+//
+// Arguments:
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
+//
+// Returns The reduced tensor.
+func Any(scope *Scope, input tf.Output, axis tf.Output, optional ...AnyAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Any",
+		Input: []tf.Input{
+			input, axis,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceApplyFtrlAttr is an optional argument to ResourceApplyFtrl.
+type ResourceApplyFtrlAttr func(optionalAttr)
+
+// ResourceApplyFtrlUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyFtrlUseLocking(value bool) ResourceApplyFtrlAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the Ftrl-proximal scheme.
+//
+// accum_new = accum + grad * grad
+// linear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regulariation. Must be a scalar.
+//	l2: L2 regulariation. Must be a scalar.
+//	lr_power: Scaling factor. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyFtrl",
+		Input: []tf.Input{
+			var_, accum, linear, grad, lr, l1, l2, lr_power,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// RandomUniformAttr is an optional argument to RandomUniform.
+type RandomUniformAttr func(optionalAttr)
+
+// RandomUniformSeed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomUniformSeed(value int64) RandomUniformAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// RandomUniformSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomUniformSeed2(value int64) RandomUniformAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random values from a uniform distribution.
+//
+// The generated values follow a uniform distribution in the range `[0, 1)`. The
+// lower bound 0 is included in the range, while the upper bound 1 is excluded.
+//
+// Arguments:
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
+//
+// Returns A tensor of the specified shape filled with uniform random values.
+func RandomUniform(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomUniformAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RandomUniform",
+		Input: []tf.Input{
+			shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// AssertAttr is an optional argument to Assert.
+type AssertAttr func(optionalAttr)
+
+// AssertSummarize sets the optional summarize attribute to value.
+//
+// value: Print this many entries of each tensor.
 // If not specified, defaults to 3
 func AssertSummarize(value int64) AssertAttr {
 	return func(m optionalAttr) {
@@ -5332,79 +6377,22 @@ func SparseSplit(scope *Scope, split_dim tf.Output, indices tf.Output, values tf
 	return output_indices, output_values, output_shape
 }
 
-// Returns the truth value of (x < y) element-wise.
-//
-// *NOTE*: `Less` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Less(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Less",
-		Input: []tf.Input{
-			x, y,
-		},
+// RandomPoissonAttr is an optional argument to RandomPoisson.
+type RandomPoissonAttr func(optionalAttr)
+
+// RandomPoissonSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func RandomPoissonSeed(value int64) RandomPoissonAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// QuantizedReluXAttr is an optional argument to QuantizedReluX.
-type QuantizedReluXAttr func(optionalAttr)
-
-// QuantizedReluXOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedReluXOutType(value tf.DataType) QuantizedReluXAttr {
+// RandomPoissonSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func RandomPoissonSeed2(value int64) RandomPoissonAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Computes Quantized Rectified Linear X: `min(max(features, 0), max_value)`
-//
-// Arguments:
-//
-//
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
-//
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedReluX(scope *Scope, features tf.Output, max_value tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluXAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "QuantizedReluX",
-		Input: []tf.Input{
-			features, max_value, min_features, max_features,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// RandomPoissonAttr is an optional argument to RandomPoisson.
-type RandomPoissonAttr func(optionalAttr)
-
-// RandomPoissonSeed sets the optional seed attribute to value.
-// If not specified, defaults to 0
-func RandomPoissonSeed(value int64) RandomPoissonAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// RandomPoissonSeed2 sets the optional seed2 attribute to value.
-// If not specified, defaults to 0
-func RandomPoissonSeed2(value int64) RandomPoissonAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["seed2"] = value
 	}
 }
 
@@ -5503,67 +6491,108 @@ func IteratorSetStatsAggregator(scope *Scope, iterator_handle tf.Output, stats_a
 	return scope.AddOperation(opspec)
 }
 
-// Returns element-wise smallest integer in not less than x.
-func Ceil(scope *Scope, x tf.Output) (y tf.Output) {
+// DataFormatVecPermuteAttr is an optional argument to DataFormatVecPermute.
+type DataFormatVecPermuteAttr func(optionalAttr)
+
+// DataFormatVecPermuteSrcFormat sets the optional src_format attribute to value.
+//
+// value: source data format.
+// If not specified, defaults to "NHWC"
+func DataFormatVecPermuteSrcFormat(value string) DataFormatVecPermuteAttr {
+	return func(m optionalAttr) {
+		m["src_format"] = value
+	}
+}
+
+// DataFormatVecPermuteDstFormat sets the optional dst_format attribute to value.
+//
+// value: destination data format.
+// If not specified, defaults to "NCHW"
+func DataFormatVecPermuteDstFormat(value string) DataFormatVecPermuteAttr {
+	return func(m optionalAttr) {
+		m["dst_format"] = value
+	}
+}
+
+// Returns the permuted vector/tensor in the destination data format given the
+//
+// one in the source data format.
+//
+// Arguments:
+//	x: Vector of size 4 or Tensor of shape (4, 2) in source data format.
+//
+// Returns Vector of size 4 or Tensor of shape (4, 2) in destination data format.
+func DataFormatVecPermute(scope *Scope, x tf.Output, optional ...DataFormatVecPermuteAttr) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Ceil",
+		Type: "DataFormatVecPermute",
 		Input: []tf.Input{
 			x,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the number of elements in the given table.
-//
-// Arguments:
-//	table_handle: Handle to the table.
-//
-// Returns Scalar that contains number of elements in the table.
-func LookupTableSizeV2(scope *Scope, table_handle tf.Output) (size tf.Output) {
+// Computes tan of x element-wise.
+func Tan(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LookupTableSizeV2",
+		Type: "Tan",
 		Input: []tf.Input{
-			table_handle,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResizeBilinearGradAttr is an optional argument to ResizeBilinearGrad.
-type ResizeBilinearGradAttr func(optionalAttr)
+// ResourceSparseApplyFtrlAttr is an optional argument to ResourceSparseApplyFtrl.
+type ResourceSparseApplyFtrlAttr func(optionalAttr)
 
-// ResizeBilinearGradAlignCorners sets the optional align_corners attribute to value.
+// ResourceSparseApplyFtrlUseLocking sets the optional use_locking attribute to value.
 //
-// value: If true, rescale grads by (orig_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of grads and original_image. If false, rescale by
-// orig_height / height. Treat similarly the width dimension.
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
 // If not specified, defaults to false
-func ResizeBilinearGradAlignCorners(value bool) ResizeBilinearGradAttr {
+func ResourceSparseApplyFtrlUseLocking(value bool) ResourceSparseApplyFtrlAttr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Computes the gradient of bilinear interpolation.
+// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
+//
+// That is for rows we have grad for, we update var, accum and linear as follows:
+// accum_new = accum + grad * grad
+// linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
 //
 // Arguments:
-//	grads: 4-D with shape `[batch, height, width, channels]`.
-//	original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
-// The image tensor that was resized.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	lr_power: Scaling factor. Must be a scalar.
 //
-// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`.
-// Gradients with respect to the input image. Input image must have been
-// float or double.
-func ResizeBilinearGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBilinearGradAttr) (output tf.Output) {
+// Returns the created operation.
+func ResourceSparseApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -5572,12 +6601,30 @@ func ResizeBilinearGrad(scope *Scope, grads tf.Output, original_image tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeBilinearGrad",
+		Type: "ResourceSparseApplyFtrl",
 		Input: []tf.Input{
-			grads, original_image,
+			var_, accum, linear, grad, indices, lr, l1, l2, lr_power,
 		},
 		Attrs: attrs,
 	}
+	return scope.AddOperation(opspec)
+}
+
+// Returns which elements of x are Inf.
+//
+// @compatibility(numpy)
+// Equivalent to np.isinf
+// @end_compatibility
+func IsInf(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IsInf",
+		Input: []tf.Input{
+			x,
+		},
+	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
@@ -5707,6 +6754,95 @@ func RestoreSlice(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, s
 	return op.Output(0)
 }
 
+// ImagAttr is an optional argument to Imag.
+type ImagAttr func(optionalAttr)
+
+// ImagTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func ImagTout(value tf.DataType) ImagAttr {
+	return func(m optionalAttr) {
+		m["Tout"] = value
+	}
+}
+
+// Returns the imaginary part of a complex number.
+//
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// type `float` that is the imaginary part of each element in `input`. All
+// elements in `input` must be complex numbers of the form \\(a + bj\\), where *a*
+// is the real part and *b* is the imaginary part returned by this operation.
+//
+// For example:
+//
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.imag(input) ==> [4.75, 5.75]
+// ```
+func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Imag",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ComplexAttr is an optional argument to Complex.
+type ComplexAttr func(optionalAttr)
+
+// ComplexTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_COMPLEX64
+func ComplexTout(value tf.DataType) ComplexAttr {
+	return func(m optionalAttr) {
+		m["Tout"] = value
+	}
+}
+
+// Converts two real numbers to a complex number.
+//
+// Given a tensor `real` representing the real part of a complex number, and a
+// tensor `imag` representing the imaginary part of a complex number, this
+// operation returns complex numbers elementwise of the form \\(a + bj\\), where
+// *a* represents the `real` part and *b* represents the `imag` part.
+//
+// The input tensors `real` and `imag` must have the same shape.
+//
+// For example:
+//
+// ```
+// # tensor 'real' is [2.25, 3.25]
+// # tensor `imag` is [4.75, 5.75]
+// tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]
+// ```
+func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAttr) (out tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Complex",
+		Input: []tf.Input{
+			real, imag,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // UniqueWithCountsAttr is an optional argument to UniqueWithCounts.
 type UniqueWithCountsAttr func(optionalAttr)
 
@@ -6008,106 +7144,6 @@ func StringSplit(scope *Scope, input tf.Output, delimiter tf.Output, optional ..
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// WriteAudioSummaryAttr is an optional argument to WriteAudioSummary.
-type WriteAudioSummaryAttr func(optionalAttr)
-
-// WriteAudioSummaryMaxOutputs sets the optional max_outputs attribute to value.
-//
-// value: Max number of batch elements to generate audio for.
-// If not specified, defaults to 3
-//
-// REQUIRES: value >= 1
-func WriteAudioSummaryMaxOutputs(value int64) WriteAudioSummaryAttr {
-	return func(m optionalAttr) {
-		m["max_outputs"] = value
-	}
-}
-
-// Writes a `Summary` protocol buffer with audio.
-//
-// The summary has up to `max_outputs` summary values containing audio. The
-// audio is built from `tensor` which must be 3-D with shape `[batch_size,
-// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
-// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
-//
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
-//
-// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
-// *  If `max_outputs` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
-//
-// Arguments:
-//	writer: A handle to a summary writer.
-//	step: The step to write the summary for.
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 2-D of shape `[batch_size, frames]`.
-//	sample_rate: The sample rate of the signal in hertz.
-//
-// Returns the created operation.
-func WriteAudioSummary(scope *Scope, writer tf.Output, step tf.Output, tag tf.Output, tensor tf.Output, sample_rate tf.Output, optional ...WriteAudioSummaryAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "WriteAudioSummary",
-		Input: []tf.Input{
-			writer, step, tag, tensor, sample_rate,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// ProdAttr is an optional argument to Prod.
-type ProdAttr func(optionalAttr)
-
-// ProdKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func ProdKeepDims(value bool) ProdAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the product of elements across dimensions of a tensor.
-//
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
-//
-// Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
-//
-// Returns The reduced tensor.
-func Prod(scope *Scope, input tf.Output, axis tf.Output, optional ...ProdAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Prod",
-		Input: []tf.Input{
-			input, axis,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // ResizeBilinearAttr is an optional argument to ResizeBilinear.
 type ResizeBilinearAttr func(optionalAttr)
 
@@ -6275,6 +7311,170 @@ func ResourceScatterUpdate(scope *Scope, resource tf.Output, indices tf.Output,
 	return scope.AddOperation(opspec)
 }
 
+// AvgPoolGradAttr is an optional argument to AvgPoolGrad.
+type AvgPoolGradAttr func(optionalAttr)
+
+// AvgPoolGradDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func AvgPoolGradDataFormat(value string) AvgPoolGradAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Computes gradients of the average pooling function.
+//
+// Arguments:
+//	orig_input_shape: 1-D.  Shape of the original input to `avg_pool`.
+//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t.
+// the output of `avg_pool`.
+//	ksize: The size of the sliding window for each dimension of the input.
+//	strides: The stride of the sliding window for each dimension of the input.
+//	padding: The type of padding algorithm to use.
+//
+// Returns 4-D.  Gradients w.r.t. the input of `avg_pool`.
+func AvgPoolGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolGradAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "AvgPoolGrad",
+		Input: []tf.Input{
+			orig_input_shape, grad,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// StageClearAttr is an optional argument to StageClear.
+type StageClearAttr func(optionalAttr)
+
+// StageClearCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func StageClearCapacity(value int64) StageClearAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// StageClearMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func StageClearMemoryLimit(value int64) StageClearAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// StageClearContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func StageClearContainer(value string) StageClearAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// StageClearSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func StageClearSharedName(value string) StageClearAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op removes all elements in the underlying container.
+//
+// Returns the created operation.
+func StageClear(scope *Scope, dtypes []tf.DataType, optional ...StageClearAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StageClear",
+
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// ComputeAccidentalHitsAttr is an optional argument to ComputeAccidentalHits.
+type ComputeAccidentalHitsAttr func(optionalAttr)
+
+// ComputeAccidentalHitsSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func ComputeAccidentalHitsSeed(value int64) ComputeAccidentalHitsAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// ComputeAccidentalHitsSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func ComputeAccidentalHitsSeed2(value int64) ComputeAccidentalHitsAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Computes the ids of the positions in sampled_candidates that match true_labels.
+//
+// When doing log-odds NCE, the result of this op should be passed through a
+// SparseToDense op, then added to the logits of the sampled candidates. This has
+// the effect of 'removing' the sampled labels that match the true labels by
+// making the classifier sure that they are sampled labels.
+//
+// Arguments:
+//	true_classes: The true_classes output of UnpackSparseLabels.
+//	sampled_candidates: The sampled_candidates output of CandidateSampler.
+//	num_true: Number of true labels per context.
+//
+// Returns A vector of indices corresponding to rows of true_candidates.A vector of IDs of positions in sampled_candidates that match a true_label
+// for the row with the corresponding index in indices.A vector of the same length as indices and ids, in which each element
+// is -FLOAT_MAX.
+func ComputeAccidentalHits(scope *Scope, true_classes tf.Output, sampled_candidates tf.Output, num_true int64, optional ...ComputeAccidentalHitsAttr) (indices tf.Output, ids tf.Output, weights tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_true": num_true}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ComputeAccidentalHits",
+		Input: []tf.Input{
+			true_classes, sampled_candidates,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
 // CumsumAttr is an optional argument to Cumsum.
 type CumsumAttr func(optionalAttr)
 
@@ -6482,9 +7682,86 @@ func FixedLengthRecordReaderV2(scope *Scope, record_bytes int64, optional ...Fix
 	return op.Output(0)
 }
 
-// The gradient operator for the SparseAdd op.
+// Converts each string in the input Tensor to its hash mod by a number of buckets.
 //
-// The SparseAdd op calculates A + B, where A, B, and the sum are all represented
+// The hash function is deterministic on the content of the string within the
+// process.
+//
+// Note that the hash function may change from time to time.
+// This functionality will be deprecated and it's recommended to use
+// `tf.string_to_hash_bucket_fast()` or `tf.string_to_hash_bucket_strong()`.
+//
+// Arguments:
+//
+//	num_buckets: The number of buckets.
+//
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToHashBucket(scope *Scope, string_tensor tf.Output, num_buckets int64) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_buckets": num_buckets}
+	opspec := tf.OpSpec{
+		Type: "StringToHashBucket",
+		Input: []tf.Input{
+			string_tensor,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes gradients for the exponential linear (Elu) operation.
+//
+// Arguments:
+//	gradients: The backpropagated gradients to the corresponding Elu operation.
+//	outputs: The outputs of the corresponding Elu operation.
+//
+// Returns The gradients: `gradients * (outputs + 1)` if outputs < 0,
+// `gradients` otherwise.
+func EluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "EluGrad",
+		Input: []tf.Input{
+			gradients, outputs,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a dataset that contains `count` elements from the `input_dataset`.
+//
+// Arguments:
+//
+//	count: A scalar representing the number of elements from the `input_dataset`
+// that should be taken. A value of `-1` indicates that all of `input_dataset`
+// is taken.
+//
+//
+func TakeDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "TakeDataset",
+		Input: []tf.Input{
+			input_dataset, count,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// The gradient operator for the SparseAdd op.
+//
+// The SparseAdd op calculates A + B, where A, B, and the sum are all represented
 // as `SparseTensor` objects.  This op takes in the upstream gradient w.r.t.
 // non-empty values of the sum, and outputs the gradients w.r.t. the non-empty
 // values of A and B.
@@ -6594,145 +7871,6 @@ func StringToHashBucketStrong(scope *Scope, input tf.Output, num_buckets int64,
 	return op.Output(0)
 }
 
-// Generates values in an interval.
-//
-// A sequence of `num` evenly-spaced values are generated beginning at `start`.
-// If `num > 1`, the values in the sequence increase by `stop - start / num - 1`,
-// so that the last one is exactly `stop`.
-//
-// For example:
-//
-// ```
-// tf.linspace(10.0, 12.0, 3, name="linspace") => [ 10.0  11.0  12.0]
-// ```
-//
-// Arguments:
-//	start: First entry in the range.
-//	stop: Last entry in the range.
-//	num: Number of values to generate.
-//
-// Returns 1-D. The generated values.
-func LinSpace(scope *Scope, start tf.Output, stop tf.Output, num tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "LinSpace",
-		Input: []tf.Input{
-			start, stop, num,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// DestroyResourceOpAttr is an optional argument to DestroyResourceOp.
-type DestroyResourceOpAttr func(optionalAttr)
-
-// DestroyResourceOpIgnoreLookupError sets the optional ignore_lookup_error attribute to value.
-//
-// value: whether to ignore the error when the resource
-// doesn't exist.
-// If not specified, defaults to true
-func DestroyResourceOpIgnoreLookupError(value bool) DestroyResourceOpAttr {
-	return func(m optionalAttr) {
-		m["ignore_lookup_error"] = value
-	}
-}
-
-// Deletes the resource specified by the handle.
-//
-// All subsequent operations using the resource will result in a NotFound
-// error status.
-//
-// Arguments:
-//	resource: handle to the resource to delete.
-//
-// Returns the created operation.
-func DestroyResourceOp(scope *Scope, resource tf.Output, optional ...DestroyResourceOpAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "DestroyResourceOp",
-		Input: []tf.Input{
-			resource,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// SummaryWriterAttr is an optional argument to SummaryWriter.
-type SummaryWriterAttr func(optionalAttr)
-
-// SummaryWriterSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func SummaryWriterSharedName(value string) SummaryWriterAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// SummaryWriterContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func SummaryWriterContainer(value string) SummaryWriterAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// Returns a handle to be used to access a summary writer.
-//
-// The summary writer is an in-graph resource which can be used by ops to write
-// summaries to event files.
-//
-// Returns the summary writer resource. Scalar handle.
-func SummaryWriter(scope *Scope, optional ...SummaryWriterAttr) (writer tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "SummaryWriter",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes gradients for SparseSegmentMean.
-//
-// Returns tensor "output" with same shape as grad, except for dimension 0 whose
-// value is output_dim0.
-//
-// Arguments:
-//	grad: gradient propagated to the SparseSegmentMean op.
-//	indices: indices passed to the corresponding SparseSegmentMean op.
-//	segment_ids: segment_ids passed to the corresponding SparseSegmentMean op.
-//	output_dim0: dimension 0 of "data" passed to SparseSegmentMean op.
-func SparseSegmentMeanGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseSegmentMeanGrad",
-		Input: []tf.Input{
-			grad, indices, segment_ids, output_dim0,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Applies softmax to a batched N-D `SparseTensor`.
 //
 // The inputs represent an N-D SparseTensor  with logical shape `[..., B, C]`
@@ -6886,37 +8024,155 @@ func ResourceApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.O
 	return scope.AddOperation(opspec)
 }
 
-// CumprodAttr is an optional argument to Cumprod.
-type CumprodAttr func(optionalAttr)
+// Return the shape of s0 op s1 with broadcast.
+//
+// Given `s0` and `s1`, tensors that represent shapes, compute `r0`, the
+// broadcasted shape. `s0`, `s1` and `r0` are all integer vectors.
+func BroadcastArgs(scope *Scope, s0 tf.Output, s1 tf.Output) (r0 tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BroadcastArgs",
+		Input: []tf.Input{
+			s0, s1,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// CumprodExclusive sets the optional exclusive attribute to value.
+// DataFormatDimMapAttr is an optional argument to DataFormatDimMap.
+type DataFormatDimMapAttr func(optionalAttr)
+
+// DataFormatDimMapSrcFormat sets the optional src_format attribute to value.
 //
-// value: If `True`, perform exclusive cumprod.
-// If not specified, defaults to false
-func CumprodExclusive(value bool) CumprodAttr {
+// value: source data format.
+// If not specified, defaults to "NHWC"
+func DataFormatDimMapSrcFormat(value string) DataFormatDimMapAttr {
 	return func(m optionalAttr) {
-		m["exclusive"] = value
+		m["src_format"] = value
 	}
 }
 
-// CumprodReverse sets the optional reverse attribute to value.
+// DataFormatDimMapDstFormat sets the optional dst_format attribute to value.
 //
-// value: A `bool` (default: False).
-// If not specified, defaults to false
-func CumprodReverse(value bool) CumprodAttr {
+// value: destination data format.
+// If not specified, defaults to "NCHW"
+func DataFormatDimMapDstFormat(value string) DataFormatDimMapAttr {
 	return func(m optionalAttr) {
-		m["reverse"] = value
+		m["dst_format"] = value
 	}
 }
 
-// Compute the cumulative product of the tensor `x` along `axis`.
-//
-// By default, this op performs an inclusive cumprod, which means that the first
-// element of the input is identical to the first element of the output:
+// Returns the dimension index in the destination data format given the one in
 //
-// ```python
-// tf.cumprod([a, b, c])  # => [a, a * b, a * b * c]
-// ```
+// the source data format.
+//
+// Arguments:
+//	x: A Tensor with each element as a dimension index in source data format.
+// Must be in the range [-4, 4).
+//
+// Returns A Tensor with each element as a dimension index in destination data format.
+func DataFormatDimMap(scope *Scope, x tf.Output, optional ...DataFormatDimMapAttr) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DataFormatDimMap",
+		Input: []tf.Input{
+			x,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceApplyPowerSignAttr is an optional argument to ResourceApplyPowerSign.
+type ResourceApplyPowerSignAttr func(optionalAttr)
+
+// ResourceApplyPowerSignUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and m tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyPowerSignUseLocking(value bool) ResourceApplyPowerSignAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the AddSign update.
+//
+// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+// update <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g
+// variable <- variable - lr_t * update
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	logbase: Must be a scalar.
+//	sign_decay: Must be a scalar.
+//	beta: Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyPowerSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, logbase tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyPowerSignAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyPowerSign",
+		Input: []tf.Input{
+			var_, m, lr, logbase, sign_decay, beta, grad,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// CumprodAttr is an optional argument to Cumprod.
+type CumprodAttr func(optionalAttr)
+
+// CumprodExclusive sets the optional exclusive attribute to value.
+//
+// value: If `True`, perform exclusive cumprod.
+// If not specified, defaults to false
+func CumprodExclusive(value bool) CumprodAttr {
+	return func(m optionalAttr) {
+		m["exclusive"] = value
+	}
+}
+
+// CumprodReverse sets the optional reverse attribute to value.
+//
+// value: A `bool` (default: False).
+// If not specified, defaults to false
+func CumprodReverse(value bool) CumprodAttr {
+	return func(m optionalAttr) {
+		m["reverse"] = value
+	}
+}
+
+// Compute the cumulative product of the tensor `x` along `axis`.
+//
+// By default, this op performs an inclusive cumprod, which means that the first
+// element of the input is identical to the first element of the output:
+//
+// ```python
+// tf.cumprod([a, b, c])  # => [a, a * b, a * b * c]
+// ```
 //
 // By setting the `exclusive` kwarg to `True`, an exclusive cumprod is
 // performed instead:
@@ -7119,6 +8375,79 @@ func IFFT(scope *Scope, input tf.Output) (output tf.Output) {
 	return op.Output(0)
 }
 
+// Generates values in an interval.
+//
+// A sequence of `num` evenly-spaced values are generated beginning at `start`.
+// If `num > 1`, the values in the sequence increase by `stop - start / num - 1`,
+// so that the last one is exactly `stop`.
+//
+// For example:
+//
+// ```
+// tf.linspace(10.0, 12.0, 3, name="linspace") => [ 10.0  11.0  12.0]
+// ```
+//
+// Arguments:
+//	start: First entry in the range.
+//	stop: Last entry in the range.
+//	num: Number of values to generate.
+//
+// Returns 1-D. The generated values.
+func LinSpace(scope *Scope, start tf.Output, stop tf.Output, num tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LinSpace",
+		Input: []tf.Input{
+			start, stop, num,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// DestroyResourceOpAttr is an optional argument to DestroyResourceOp.
+type DestroyResourceOpAttr func(optionalAttr)
+
+// DestroyResourceOpIgnoreLookupError sets the optional ignore_lookup_error attribute to value.
+//
+// value: whether to ignore the error when the resource
+// doesn't exist.
+// If not specified, defaults to true
+func DestroyResourceOpIgnoreLookupError(value bool) DestroyResourceOpAttr {
+	return func(m optionalAttr) {
+		m["ignore_lookup_error"] = value
+	}
+}
+
+// Deletes the resource specified by the handle.
+//
+// All subsequent operations using the resource will result in a NotFound
+// error status.
+//
+// Arguments:
+//	resource: handle to the resource to delete.
+//
+// Returns the created operation.
+func DestroyResourceOp(scope *Scope, resource tf.Output, optional ...DestroyResourceOpAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DestroyResourceOp",
+		Input: []tf.Input{
+			resource,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
 // LRNAttr is an optional argument to LRN.
 type LRNAttr func(optionalAttr)
 
@@ -7214,27 +8543,6 @@ func ZipDataset(scope *Scope, input_datasets []tf.Output, output_types []tf.Data
 	return op.Output(0)
 }
 
-// Writes a `GraphDef` protocol buffer to a `SummaryWriter`.
-//
-// Arguments:
-//	writer: Handle of `SummaryWriter`.
-//	step: The step to write the summary for.
-//	tensor: A scalar string of the serialized tf.GraphDef proto.
-//
-// Returns the created operation.
-func WriteGraphSummary(scope *Scope, writer tf.Output, step tf.Output, tensor tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "WriteGraphSummary",
-		Input: []tf.Input{
-			writer, step, tensor,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
 // ResourceSparseApplyAdagradAttr is an optional argument to ResourceSparseApplyAdagrad.
 type ResourceSparseApplyAdagradAttr func(optionalAttr)
 
@@ -7372,6 +8680,65 @@ func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...Resi
 	return op.Output(0)
 }
 
+// Pads a tensor with zeros.
+//
+// This operation pads a `input` with zeros according to the `paddings` you
+// specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the
+// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
+// how many zeros to add before the contents of `input` in that dimension, and
+// `paddings[D, 1]` indicates how many zeros to add after the contents of `input`
+// in that dimension.
+//
+// The padded size of each dimension D of the output is:
+//
+// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
+//
+// For example:
+//
+// ```
+// # 't' is [[1, 1], [2, 2]]
+// # 'paddings' is [[1, 1], [2, 2]]
+// # rank of 't' is 2
+// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
+//                       [0, 0, 1, 1, 0, 0]
+//                       [0, 0, 2, 2, 0, 0]
+//                       [0, 0, 0, 0, 0, 0]]
+// ```
+func Pad(scope *Scope, input tf.Output, paddings tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Pad",
+		Input: []tf.Input{
+			input, paddings,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Checks whether a resource handle-based variable has been initialized.
+//
+// Arguments:
+//	resource: the input resource handle.
+//
+// Returns a scalar boolean which is true if the variable has been
+// initialized.
+func VarIsInitializedOp(scope *Scope, resource tf.Output) (is_initialized tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "VarIsInitializedOp",
+		Input: []tf.Input{
+			resource,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // StatelessRandomUniformAttr is an optional argument to StatelessRandomUniform.
 type StatelessRandomUniformAttr func(optionalAttr)
 
@@ -7416,11 +8783,43 @@ func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optio
 	return op.Output(0)
 }
 
-// AngleAttr is an optional argument to Angle.
-type AngleAttr func(optionalAttr)
-
-// AngleTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
+// Makes its input available to the next iteration.
+//
+// Arguments:
+//	data: The tensor to be made available to the next iteration.
+//
+// Returns The same tensor as `data`.
+func NextIteration(scope *Scope, data tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "NextIteration",
+		Input: []tf.Input{
+			data,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Output a fact about factorials.
+func Fact(scope *Scope) (fact tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Fact",
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// AngleAttr is an optional argument to Angle.
+type AngleAttr func(optionalAttr)
+
+// AngleTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
 func AngleTout(value tf.DataType) AngleAttr {
 	return func(m optionalAttr) {
 		m["Tout"] = value
@@ -7650,47 +9049,6 @@ func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Out
 	return scope.AddOperation(opspec)
 }
 
-// SizeAttr is an optional argument to Size.
-type SizeAttr func(optionalAttr)
-
-// SizeOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_INT32
-func SizeOutType(value tf.DataType) SizeAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Returns the size of a tensor.
-//
-// This operation returns an integer representing the number of elements in
-// `input`.
-//
-// For example:
-//
-// ```
-// # 't' is [[[1, 1,, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]]
-// size(t) ==> 12
-// ```
-func Size(scope *Scope, input tf.Output, optional ...SizeAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Size",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // ResourceScatterNdUpdateAttr is an optional argument to ResourceScatterNdUpdate.
 type ResourceScatterNdUpdateAttr func(optionalAttr)
 
@@ -7770,6 +9128,117 @@ func ResourceScatterNdUpdate(scope *Scope, ref tf.Output, indices tf.Output, upd
 	return scope.AddOperation(opspec)
 }
 
+// SqueezeAttr is an optional argument to Squeeze.
+type SqueezeAttr func(optionalAttr)
+
+// SqueezeAxis sets the optional axis attribute to value.
+//
+// value: If specified, only squeezes the dimensions listed. The dimension
+// index starts at 0. It is an error to squeeze a dimension that is not 1. Must
+// be in the range `[-rank(input), rank(input))`.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func SqueezeAxis(value []int64) SqueezeAttr {
+	return func(m optionalAttr) {
+		m["squeeze_dims"] = value
+	}
+}
+
+// Removes dimensions of size 1 from the shape of a tensor.
+//
+// Given a tensor `input`, this operation returns a tensor of the same type with
+// all dimensions of size 1 removed. If you don't want to remove all size 1
+// dimensions, you can remove specific size 1 dimensions by specifying
+// `axis`.
+//
+// For example:
+//
+// ```
+// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
+// shape(squeeze(t)) ==> [2, 3]
+// ```
+//
+// Or, to remove specific size 1 dimensions:
+//
+// ```
+// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
+// shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1]
+// ```
+//
+// Arguments:
+//	input: The `input` to squeeze.
+//
+// Returns Contains the same data as `input`, but has one or more dimensions of
+// size 1 removed.
+func Squeeze(scope *Scope, input tf.Output, optional ...SqueezeAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Squeeze",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceApplyAdadeltaAttr is an optional argument to ResourceApplyAdadelta.
+type ResourceApplyAdadeltaAttr func(optionalAttr)
+
+// ResourceApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
+//
+// value: If True, updating of the var, accum and update_accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyAdadeltaUseLocking(value bool) ResourceApplyAdadeltaAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the adadelta scheme.
+//
+// accum = rho() * accum + (1 - rho()) * grad.square();
+// update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad;
+// update_accum = rho() * update_accum + (1 - rho()) * update.square();
+// var -= update;
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	accum_update: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay factor. Must be a scalar.
+//	epsilon: Constant factor. Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdadeltaAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyAdadelta",
+		Input: []tf.Input{
+			var_, accum, accum_update, lr, rho, epsilon, grad,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
 // NonMaxSuppressionAttr is an optional argument to NonMaxSuppression.
 type NonMaxSuppressionAttr func(optionalAttr)
 
@@ -7879,43 +9348,47 @@ func SparseDenseCwiseMul(scope *Scope, sp_indices tf.Output, sp_values tf.Output
 	return op.Output(0)
 }
 
-// ResourceSparseApplyFtrlAttr is an optional argument to ResourceSparseApplyFtrl.
-type ResourceSparseApplyFtrlAttr func(optionalAttr)
+// ResourceSparseApplyRMSPropAttr is an optional argument to ResourceSparseApplyRMSProp.
+type ResourceSparseApplyRMSPropAttr func(optionalAttr)
 
-// ResourceSparseApplyFtrlUseLocking sets the optional use_locking attribute to value.
+// ResourceSparseApplyRMSPropUseLocking sets the optional use_locking attribute to value.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
+// value: If `True`, updating of the var, ms, and mom tensors is protected
 // by a lock; otherwise the behavior is undefined, but may exhibit less
 // contention.
 // If not specified, defaults to false
-func ResourceSparseApplyFtrlUseLocking(value bool) ResourceSparseApplyFtrlAttr {
+func ResourceSparseApplyRMSPropUseLocking(value bool) ResourceSparseApplyRMSPropAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
+// Update '*var' according to the RMSProp algorithm.
 //
-// That is for rows we have grad for, we update var, accum and linear as follows:
-// accum_new = accum + grad * grad
-// linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
+// Note that in dense implementation of this algorithm, ms and mom will
+// update even if the grad is zero, but in this sparse implementation, ms
+// and mom will not update in iterations during which the grad is zero.
+//
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+//
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
 //	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	lr_power: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
+//
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var, ms and mom.
 //
 // Returns the created operation.
-func ResourceSparseApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlAttr) (o *tf.Operation) {
+func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -7924,114 +9397,37 @@ func ResourceSparseApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, line
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyFtrl",
+		Type: "ResourceSparseApplyRMSProp",
 		Input: []tf.Input{
-			var_, accum, linear, grad, indices, lr, l1, l2, lr_power,
+			var_, ms, mom, lr, rho, momentum, epsilon, grad, indices,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Returns which elements of x are Inf.
+// Returns the truth value of (x > y) element-wise.
 //
-// @compatibility(numpy)
-// Equivalent to np.isinf
-// @end_compatibility
-func IsInf(scope *Scope, x tf.Output) (y tf.Output) {
+// *NOTE*: `Greater` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Greater(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IsInf",
+		Type: "Greater",
 		Input: []tf.Input{
-			x,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyRMSPropAttr is an optional argument to ResourceSparseApplyRMSProp.
-type ResourceSparseApplyRMSPropAttr func(optionalAttr)
+// SampleDistortedBoundingBoxAttr is an optional argument to SampleDistortedBoundingBox.
+type SampleDistortedBoundingBoxAttr func(optionalAttr)
 
-// ResourceSparseApplyRMSPropUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var, ms, and mom tensors is protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyRMSPropUseLocking(value bool) ResourceSparseApplyRMSPropAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the RMSProp algorithm.
-//
-// Note that in dense implementation of this algorithm, ms and mom will
-// update even if the grad is zero, but in this sparse implementation, ms
-// and mom will not update in iterations during which the grad is zero.
-//
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
-//
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var, ms and mom.
-//
-// Returns the created operation.
-func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyRMSPropAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyRMSProp",
-		Input: []tf.Input{
-			var_, ms, mom, lr, rho, momentum, epsilon, grad, indices,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Returns the truth value of (x > y) element-wise.
-//
-// *NOTE*: `Greater` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Greater(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Greater",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// SampleDistortedBoundingBoxAttr is an optional argument to SampleDistortedBoundingBox.
-type SampleDistortedBoundingBoxAttr func(optionalAttr)
-
-// SampleDistortedBoundingBoxSeed sets the optional seed attribute to value.
+// SampleDistortedBoundingBoxSeed sets the optional seed attribute to value.
 //
 // value: If either `seed` or `seed2` are set to non-zero, the random number
 // generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
@@ -8181,6 +9577,100 @@ func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_box
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
+// Converts each string in the input Tensor to its hash mod by a number of buckets.
+//
+// The hash function is deterministic on the content of the string within the
+// process and will never change. However, it is not suitable for cryptography.
+// This function may be used when CPU time is scarce and inputs are trusted or
+// unimportant. There is a risk of adversaries constructing inputs that all hash
+// to the same bucket. To prevent this problem, use a strong hash function with
+// `tf.string_to_hash_bucket_strong`.
+//
+// Arguments:
+//	input: The strings to assign a hash bucket.
+//	num_buckets: The number of buckets.
+//
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToHashBucketFast(scope *Scope, input tf.Output, num_buckets int64) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_buckets": num_buckets}
+	opspec := tf.OpSpec{
+		Type: "StringToHashBucketFast",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the max of x and y (i.e. x > y ? x : y) element-wise.
+//
+// *NOTE*: `Maximum` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Maximum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Maximum",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// TensorArrayGatherV3Attr is an optional argument to TensorArrayGatherV3.
+type TensorArrayGatherV3Attr func(optionalAttr)
+
+// TensorArrayGatherV3ElementShape sets the optional element_shape attribute to value.
+//
+// value: The expected shape of an element, if known. Used to
+// validate the shapes of TensorArray elements. If this shape is not
+// fully specified, gathering zero-size TensorArrays is an error.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayGatherV3ElementShape(value tf.Shape) TensorArrayGatherV3Attr {
+	return func(m optionalAttr) {
+		m["element_shape"] = value
+	}
+}
+
+// Gather specific elements from the TensorArray into output `value`.
+//
+// All elements selected by `indices` must have the same shape.
+//
+// Arguments:
+//	handle: The handle to a TensorArray.
+//	indices: The locations in the TensorArray from which to read tensor elements.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	dtype: The type of the elem that is returned.
+//
+// Returns All of the elements in the TensorArray, concatenated along a new
+// axis (the new dimension 0).
+func TensorArrayGatherV3(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV3Attr) (value tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorArrayGatherV3",
+		Input: []tf.Input{
+			handle, indices, flow_in,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Returns x / y element-wise for integer types.
 //
 // Truncation designates that negative numbers will round fractional quantities
@@ -8255,6 +9745,81 @@ func RestoreV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and
 	return tensors
 }
 
+// Creates a dataset that skips `count` elements from the `input_dataset`.
+//
+// Arguments:
+//
+//	count: A scalar representing the number of elements from the `input_dataset`
+// that should be skipped.  If count is -1, skips everything.
+//
+//
+func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "SkipDataset",
+		Input: []tf.Input{
+			input_dataset, count,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the maximum along segments of a tensor.
+//
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// Computes a tensor such that
+// \\(output_i = \max_j(data_j)\\) where `max` is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the max is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
+// </div>
+//
+// Arguments:
+//
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SegmentMax",
+		Input: []tf.Input{
+			data, segment_ids,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes hyperbolic tangent of `x` element-wise.
+func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Tanh",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Decode web-safe base64-encoded strings.
 //
 // Input may or may not have padding at the end. See EncodeBase64 for padding.
@@ -8379,14 +9944,144 @@ func Erf(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Reads the value of a variable.
-//
-// The tensor returned by this operation is immutable.
+// OneHotAttr is an optional argument to OneHot.
+type OneHotAttr func(optionalAttr)
+
+// OneHotAxis sets the optional axis attribute to value.
 //
-// The value returned by this operation is guaranteed to be influenced by all the
-// writes on which this operation depends directly or indirectly, and to not be
-// influenced by any of the writes which depend directly or indirectly on this
-// operation.
+// value: The axis to fill (default: -1, a new inner-most axis).
+// If not specified, defaults to -1
+func OneHotAxis(value int64) OneHotAttr {
+	return func(m optionalAttr) {
+		m["axis"] = value
+	}
+}
+
+// Returns a one-hot tensor.
+//
+// The locations represented by indices in `indices` take value `on_value`,
+// while all other locations take value `off_value`.
+//
+// If the input `indices` is rank `N`, the output will have rank `N+1`,
+// The new axis is created at dimension `axis` (default: the new axis is
+// appended at the end).
+//
+// If `indices` is a scalar the output shape will be a vector of length `depth`.
+//
+// If `indices` is a vector of length `features`, the output shape will be:
+// ```
+//   features x depth if axis == -1
+//   depth x features if axis == 0
+// ```
+//
+// If `indices` is a matrix (batch) with shape `[batch, features]`,
+// the output shape will be:
+// ```
+//   batch x features x depth if axis == -1
+//   batch x depth x features if axis == 1
+//   depth x batch x features if axis == 0
+// ```
+//
+//
+// Examples
+// =========
+//
+// Suppose that
+//
+// ```
+//   indices = [0, 2, -1, 1]
+//   depth = 3
+//   on_value = 5.0
+//   off_value = 0.0
+//   axis = -1
+// ```
+//
+// Then output is `[4 x 3]`:
+//
+//     ```output =
+//       [5.0 0.0 0.0]  // one_hot(0)
+//       [0.0 0.0 5.0]  // one_hot(2)
+//       [0.0 0.0 0.0]  // one_hot(-1)
+//       [0.0 5.0 0.0]  // one_hot(1)
+//     ```
+//
+// Suppose that
+//
+// ```
+//   indices = [0, 2, -1, 1]
+//   depth = 3
+//   on_value = 0.0
+//   off_value = 3.0
+//   axis = 0
+// ```
+//
+// Then output is `[3 x 4]`:
+//
+//     ```output =
+//       [0.0 3.0 3.0 3.0]
+//       [3.0 3.0 3.0 0.0]
+//       [3.0 3.0 3.0 3.0]
+//       [3.0 0.0 3.0 3.0]
+//     //  ^                one_hot(0)
+//     //      ^            one_hot(2)
+//     //          ^        one_hot(-1)
+//     //              ^    one_hot(1)
+//     ```
+// Suppose that
+//
+// ```
+//   indices = [[0, 2], [1, -1]]
+//   depth = 3
+//   on_value = 1.0
+//   off_value = 0.0
+//   axis = -1
+// ```
+//
+// Then output is `[2 x 2 x 3]`:
+//
+//     ```output =
+//       [
+//         [1.0, 0.0, 0.0]  // one_hot(0)
+//         [0.0, 0.0, 1.0]  // one_hot(2)
+//       ][
+//         [0.0, 1.0, 0.0]  // one_hot(1)
+//         [0.0, 0.0, 0.0]  // one_hot(-1)
+//       ]```
+//
+// Arguments:
+//	indices: A tensor of indices.
+//	depth: A scalar defining the depth of the one hot dimension.
+//	on_value: A scalar defining the value to fill in output when `indices[j] = i`.
+//	off_value: A scalar defining the value to fill in output when `indices[j] != i`.
+//
+// Returns The one-hot tensor.
+func OneHot(scope *Scope, indices tf.Output, depth tf.Output, on_value tf.Output, off_value tf.Output, optional ...OneHotAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "OneHot",
+		Input: []tf.Input{
+			indices, depth, on_value, off_value,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Reads the value of a variable.
+//
+// The tensor returned by this operation is immutable.
+//
+// The value returned by this operation is guaranteed to be influenced by all the
+// writes on which this operation depends directly or indirectly, and to not be
+// influenced by any of the writes which depend directly or indirectly on this
+// operation.
 //
 // Arguments:
 //	resource: handle to the resource in which to store the variable.
@@ -8572,6 +10267,121 @@ func IRFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Out
 	return op.Output(0)
 }
 
+// DecodeJpegAttr is an optional argument to DecodeJpeg.
+type DecodeJpegAttr func(optionalAttr)
+
+// DecodeJpegChannels sets the optional channels attribute to value.
+//
+// value: Number of color channels for the decoded image.
+// If not specified, defaults to 0
+func DecodeJpegChannels(value int64) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["channels"] = value
+	}
+}
+
+// DecodeJpegRatio sets the optional ratio attribute to value.
+//
+// value: Downscaling ratio.
+// If not specified, defaults to 1
+func DecodeJpegRatio(value int64) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["ratio"] = value
+	}
+}
+
+// DecodeJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
+//
+// value: If true use a slower but nicer upscaling of the
+// chroma planes (yuv420/422 only).
+// If not specified, defaults to true
+func DecodeJpegFancyUpscaling(value bool) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["fancy_upscaling"] = value
+	}
+}
+
+// DecodeJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
+//
+// value: If true try to recover an image from truncated input.
+// If not specified, defaults to false
+func DecodeJpegTryRecoverTruncated(value bool) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["try_recover_truncated"] = value
+	}
+}
+
+// DecodeJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
+//
+// value: The minimum required fraction of lines before a truncated
+// input is accepted.
+// If not specified, defaults to 1
+func DecodeJpegAcceptableFraction(value float32) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["acceptable_fraction"] = value
+	}
+}
+
+// DecodeJpegDctMethod sets the optional dct_method attribute to value.
+//
+// value: string specifying a hint about the algorithm used for
+// decompression.  Defaults to "" which maps to a system-specific
+// default.  Currently valid values are ["INTEGER_FAST",
+// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
+// jpeg library changes to a version that does not have that specific
+// option.)
+// If not specified, defaults to ""
+func DecodeJpegDctMethod(value string) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["dct_method"] = value
+	}
+}
+
+// Decode a JPEG-encoded image to a uint8 tensor.
+//
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
+//
+// Accepted values are:
+//
+// *   0: Use the number of channels in the JPEG-encoded image.
+// *   1: output a grayscale image.
+// *   3: output an RGB image.
+//
+// If needed, the JPEG-encoded image is transformed to match the requested number
+// of color channels.
+//
+// The attr `ratio` allows downscaling the image by an integer factor during
+// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
+// downscaling the image later.
+//
+//
+// This op also supports decoding PNGs and non-animated GIFs since the interface is
+// the same, though it is cleaner to use `tf.image.decode_image`.
+//
+// Arguments:
+//	contents: 0-D.  The JPEG-encoded image.
+//
+// Returns 3-D with shape `[height, width, channels]`..
+func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (image tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DecodeJpeg",
+		Input: []tf.Input{
+			contents,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Transforms a vector of brain.Example protos (as strings) into typed tensors.
 //
 // Arguments:
@@ -8691,325 +10501,360 @@ func VariableShape(scope *Scope, input tf.Output, optional ...VariableShapeAttr)
 	return op.Output(0)
 }
 
-// Fills empty rows in the input 2-D `SparseTensor` with a default value.
-//
-// The input `SparseTensor` is represented via the tuple of inputs
-// (`indices`, `values`, `dense_shape`).  The output `SparseTensor` has the
-// same `dense_shape` but with indices `output_indices` and values
-// `output_values`.
+// Computes softmax cross entropy cost and gradients to backpropagate.
 //
-// This op inserts a single entry for every row that doesn't have any values.
-// The index is created as `[row, 0, ..., 0]` and the inserted value
-// is `default_value`.
-//
-// For example, suppose `sp_input` has shape `[5, 6]` and non-empty values:
-//
-//     [0, 1]: a
-//     [0, 3]: b
-//     [2, 0]: c
-//     [3, 1]: d
-//
-// Rows 1 and 4 are empty, so the output will be of shape `[5, 6]` with values:
-//
-//     [0, 1]: a
-//     [0, 3]: b
-//     [1, 0]: default_value
-//     [2, 0]: c
-//     [3, 1]: d
-//     [4, 0]: default_value
-//
-// The output `SparseTensor` will be in row-major order and will have the
-// same shape as the input.
-//
-// This op also returns an indicator vector shaped `[dense_shape[0]]` such that
-//
-//     empty_row_indicator[i] = True iff row i was an empty row.
-//
-// And a reverse index map vector shaped `[indices.shape[0]]` that is used during
-// backpropagation,
+// Unlike `SoftmaxCrossEntropyWithLogits`, this operation does not accept
+// a matrix of label probabilities, but rather a single label per row
+// of features.  This label is considered to have probability 1.0 for the
+// given row.
 //
-//     reverse_index_map[j] = out_j s.t. indices[j, :] == output_indices[out_j, :]
+// Inputs are the logits, not probabilities.
 //
 // Arguments:
-//	indices: 2-D. the indices of the sparse tensor.
-//	values: 1-D. the values of the sparse tensor.
-//	dense_shape: 1-D. the shape of the sparse tensor.
-//	default_value: 0-D. default value to insert into location `[row, 0, ..., 0]`
-//   for rows missing from the input sparse tensor.
-// output indices: 2-D. the indices of the filled sparse tensor.
+//	features: batch_size x num_classes matrix
+//	labels: batch_size vector with values in [0, num_classes).
+// This is the label for the given minibatch entry.
 //
-// Returns 1-D. the values of the filled sparse tensor.1-D. whether the dense row was missing in the
-// input sparse tensor.1-D. a map from the input indices to the output indices.
-func SparseFillEmptyRows(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output, default_value tf.Output) (output_indices tf.Output, output_values tf.Output, empty_row_indicator tf.Output, reverse_index_map tf.Output) {
+// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
+func SparseSoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseFillEmptyRows",
+		Type: "SparseSoftmaxCrossEntropyWithLogits",
 		Input: []tf.Input{
-			indices, values, dense_shape, default_value,
+			features, labels,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
+	return op.Output(0), op.Output(1)
 }
 
-// Reverses specific dimensions of a tensor.
-//
-// Given a `tensor`, and a `bool` tensor `dims` representing the dimensions
-// of `tensor`, this operation reverses each dimension i of `tensor` where
-// `dims[i]` is `True`.
-//
-// `tensor` can have up to 8 dimensions. The number of dimensions
-// of `tensor` must equal the number of elements in `dims`. In other words:
-//
-// `rank(tensor) = size(dims)`
-//
-// For example:
-//
-// ```
-// # tensor 't' is [[[[ 0,  1,  2,  3],
-// #                  [ 4,  5,  6,  7],
-// #                  [ 8,  9, 10, 11]],
-// #                 [[12, 13, 14, 15],
-// #                  [16, 17, 18, 19],
-// #                  [20, 21, 22, 23]]]]
-// # tensor 't' shape is [1, 2, 3, 4]
-//
-// # 'dims' is [False, False, False, True]
-// reverse(t, dims) ==> [[[[ 3,  2,  1,  0],
-//                         [ 7,  6,  5,  4],
-//                         [ 11, 10, 9, 8]],
-//                        [[15, 14, 13, 12],
-//                         [19, 18, 17, 16],
-//                         [23, 22, 21, 20]]]]
-//
-// # 'dims' is [False, True, False, False]
-// reverse(t, dims) ==> [[[[12, 13, 14, 15],
-//                         [16, 17, 18, 19],
-//                         [20, 21, 22, 23]
-//                        [[ 0,  1,  2,  3],
-//                         [ 4,  5,  6,  7],
-//                         [ 8,  9, 10, 11]]]]
+// Fast Fourier transform.
 //
-// # 'dims' is [False, False, True, False]
-// reverse(t, dims) ==> [[[[8, 9, 10, 11],
-//                         [4, 5, 6, 7],
-//                         [0, 1, 2, 3]]
-//                        [[20, 21, 22, 23],
-//                         [16, 17, 18, 19],
-//                         [12, 13, 14, 15]]]]
-// ```
+// Computes the 1-dimensional discrete Fourier transform over the inner-most
+// dimension of `input`.
 //
 // Arguments:
-//	tensor: Up to 8-D.
-//	dims: 1-D. The dimensions to reverse.
+//	input: A complex64 tensor.
 //
-// Returns The same shape as `tensor`.
-func Reverse(scope *Scope, tensor tf.Output, dims tf.Output) (output tf.Output) {
+// Returns A complex64 tensor of the same shape as `input`. The inner-most
+//   dimension of `input` is replaced with its 1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.fft
+// @end_compatibility
+func FFT(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Reverse",
+		Type: "FFT",
 		Input: []tf.Input{
-			tensor, dims,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes log softmax activations.
-//
-// For each batch `i` and class `j` we have
-//
-//     logsoftmax[i, j] = logits[i, j] - log(sum(exp(logits[i])))
+// Transforms a serialized tensorflow.TensorProto proto into a Tensor.
 //
 // Arguments:
-//	logits: 2-D with shape `[batch_size, num_classes]`.
+//	serialized: A scalar string containing a serialized TensorProto proto.
+//	out_type: The type of the serialized tensor.  The provided type must match the
+// type of the serialized tensor and no implicit conversion will take place.
 //
-// Returns Same shape as `logits`.
-func LogSoftmax(scope *Scope, logits tf.Output) (logsoftmax tf.Output) {
+// Returns A Tensor of type `out_type`.
+func ParseTensor(scope *Scope, serialized tf.Output, out_type tf.DataType) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "LogSoftmax",
+		Type: "ParseTensor",
 		Input: []tf.Input{
-			logits,
+			serialized,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the inverse permutation of a tensor.
-//
-// This operation computes the inverse of an index permutation. It takes a 1-D
-// integer tensor `x`, which represents the indices of a zero-based array, and
-// swaps each value with its index position. In other words, for an output tensor
-// `y` and an input tensor `x`, this operation computes the following:
-//
-// `y[x[i]] = i for i in [0, 1, ..., len(x) - 1]`
-//
-// The values must include 0. There can be no duplicate values or negative values.
+// MaxPoolWithArgmaxAttr is an optional argument to MaxPoolWithArgmax.
+type MaxPoolWithArgmaxAttr func(optionalAttr)
+
+// MaxPoolWithArgmaxTargmax sets the optional Targmax attribute to value.
+// If not specified, defaults to DT_INT64
+func MaxPoolWithArgmaxTargmax(value tf.DataType) MaxPoolWithArgmaxAttr {
+	return func(m optionalAttr) {
+		m["Targmax"] = value
+	}
+}
+
+// Performs max pooling on the input and outputs both max values and indices.
 //
-// For example:
+// The indices in `argmax` are flattened, so that a maximum value at position
+// `[b, y, x, c]` becomes flattened index
+// `((b * height + y) * width + x) * channels + c`.
 //
-// ```
-// # tensor `x` is [3, 4, 0, 2, 1]
-// invert_permutation(x) ==> [2, 4, 3, 0, 1]
-// ```
+// The indices returned are always in `[0, height) x [0, width)` before flattening,
+// even if padding is involved and the mathematically correct answer is outside
+// (either negative or too large).  This is a bug, but fixing it is difficult to do
+// in a safe backwards compatible way, especially due to flattening.
 //
 // Arguments:
-//	x: 1-D.
+//	input: 4-D with shape `[batch, height, width, channels]`.  Input to pool over.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns 1-D.
-func InvertPermutation(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns The max pooled output tensor.4-D.  The flattened indices of the max values chosen for each output.
+func MaxPoolWithArgmax(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolWithArgmaxAttr) (output tf.Output, argmax tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "InvertPermutation",
+		Type: "MaxPoolWithArgmax",
 		Input: []tf.Input{
-			x,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Gradient op for `MirrorPad` op. This op folds a mirror-padded tensor.
-//
-// This operation folds the padded areas of `input` by `MirrorPad` according to the
-// `paddings` you specify. `paddings` must be the same as `paddings` argument
-// given to the corresponding `MirrorPad` op.
-//
-// The folded size of each dimension D of the output is:
-//
-// `input.dim_size(D) - paddings(D, 0) - paddings(D, 1)`
-//
-// For example:
+// ResourceSparseApplyAdagradDAAttr is an optional argument to ResourceSparseApplyAdagradDA.
+type ResourceSparseApplyAdagradDAAttr func(optionalAttr)
+
+// ResourceSparseApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
 //
-// ```
-// # 't' is [[1, 2, 3], [4, 5, 6], [7, 8, 9]].
-// # 'paddings' is [[0, 1]], [0, 1]].
-// # 'mode' is SYMMETRIC.
-// # rank of 't' is 2.
-// pad(t, paddings) ==> [[ 1,  5]
-//                       [11, 28]]
-// ```
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyAdagradDAUseLocking(value bool) ResourceSparseApplyAdagradDAAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update entries in '*var' and '*accum' according to the proximal adagrad scheme.
 //
 // Arguments:
-//	input: The input tensor to be folded.
-//	paddings: A two-column matrix specifying the padding sizes. The number of
-// rows must be the same as the rank of `input`.
-//	mode: The mode used in the `MirrorPad` op.
+//	var_: Should be from a Variable().
+//	gradient_accumulator: Should be from a Variable().
+//	gradient_squared_accumulator: Should be from a Variable().
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	lr: Learning rate. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	global_step: Training step number. Must be a scalar.
 //
-// Returns The folded tensor.
-func MirrorPadGrad(scope *Scope, input tf.Output, paddings tf.Output, mode string) (output tf.Output) {
+// Returns the created operation.
+func ResourceSparseApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceSparseApplyAdagradDAAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"mode": mode}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "MirrorPadGrad",
+		Type: "ResourceSparseApplyAdagradDA",
 		Input: []tf.Input{
-			input, paddings,
+			var_, gradient_accumulator, gradient_squared_accumulator, grad, indices, lr, l1, l2, global_step,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes softmax cross entropy cost and gradients to backpropagate.
+// EncodeJpegAttr is an optional argument to EncodeJpeg.
+type EncodeJpegAttr func(optionalAttr)
+
+// EncodeJpegFormat sets the optional format attribute to value.
 //
-// Unlike `SoftmaxCrossEntropyWithLogits`, this operation does not accept
-// a matrix of label probabilities, but rather a single label per row
-// of features.  This label is considered to have probability 1.0 for the
-// given row.
+// value: Per pixel image format.
+// If not specified, defaults to ""
+func EncodeJpegFormat(value string) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["format"] = value
+	}
+}
+
+// EncodeJpegQuality sets the optional quality attribute to value.
 //
-// Inputs are the logits, not probabilities.
+// value: Quality of the compression from 0 to 100 (higher is better and slower).
+// If not specified, defaults to 95
+func EncodeJpegQuality(value int64) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["quality"] = value
+	}
+}
+
+// EncodeJpegProgressive sets the optional progressive attribute to value.
 //
-// Arguments:
-//	features: batch_size x num_classes matrix
-//	labels: batch_size vector with values in [0, num_classes).
-// This is the label for the given minibatch entry.
+// value: If True, create a JPEG that loads progressively (coarse to fine).
+// If not specified, defaults to false
+func EncodeJpegProgressive(value bool) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["progressive"] = value
+	}
+}
+
+// EncodeJpegOptimizeSize sets the optional optimize_size attribute to value.
 //
-// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
-func SparseSoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: If True, spend CPU/RAM to reduce size with no quality change.
+// If not specified, defaults to false
+func EncodeJpegOptimizeSize(value bool) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["optimize_size"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "SparseSoftmaxCrossEntropyWithLogits",
-		Input: []tf.Input{
-			features, labels,
-		},
+}
+
+// EncodeJpegChromaDownsampling sets the optional chroma_downsampling attribute to value.
+//
+// value: See http://en.wikipedia.org/wiki/Chroma_subsampling.
+// If not specified, defaults to true
+func EncodeJpegChromaDownsampling(value bool) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["chroma_downsampling"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
 }
 
-// Fast Fourier transform.
+// EncodeJpegDensityUnit sets the optional density_unit attribute to value.
 //
-// Computes the 1-dimensional discrete Fourier transform over the inner-most
-// dimension of `input`.
+// value: Unit used to specify `x_density` and `y_density`:
+// pixels per inch (`'in'`) or centimeter (`'cm'`).
+// If not specified, defaults to "in"
+func EncodeJpegDensityUnit(value string) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["density_unit"] = value
+	}
+}
+
+// EncodeJpegXDensity sets the optional x_density attribute to value.
 //
-// Arguments:
-//	input: A complex64 tensor.
+// value: Horizontal pixels per density unit.
+// If not specified, defaults to 300
+func EncodeJpegXDensity(value int64) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["x_density"] = value
+	}
+}
+
+// EncodeJpegYDensity sets the optional y_density attribute to value.
 //
-// Returns A complex64 tensor of the same shape as `input`. The inner-most
-//   dimension of `input` is replaced with its 1D Fourier transform.
+// value: Vertical pixels per density unit.
+// If not specified, defaults to 300
+func EncodeJpegYDensity(value int64) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["y_density"] = value
+	}
+}
+
+// EncodeJpegXmpMetadata sets the optional xmp_metadata attribute to value.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.fft
-// @end_compatibility
-func FFT(scope *Scope, input tf.Output) (output tf.Output) {
+// value: If not empty, embed this XMP metadata in the image header.
+// If not specified, defaults to ""
+func EncodeJpegXmpMetadata(value string) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["xmp_metadata"] = value
+	}
+}
+
+// JPEG-encode an image.
+//
+// `image` is a 3-D uint8 Tensor of shape `[height, width, channels]`.
+//
+// The attr `format` can be used to override the color format of the encoded
+// output.  Values can be:
+//
+// *   `''`: Use a default format based on the number of channels in the image.
+// *   `grayscale`: Output a grayscale JPEG image.  The `channels` dimension
+//     of `image` must be 1.
+// *   `rgb`: Output an RGB JPEG image. The `channels` dimension
+//     of `image` must be 3.
+//
+// If `format` is not specified or is the empty string, a default format is picked
+// in function of the number of channels in `image`:
+//
+// *   1: Output a grayscale image.
+// *   3: Output an RGB image.
+//
+// Arguments:
+//	image: 3-D with shape `[height, width, channels]`.
+//
+// Returns 0-D. JPEG-encoded image.
+func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "FFT",
+		Type: "EncodeJpeg",
 		Input: []tf.Input{
-			input,
+			image,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyAdagradDAAttr is an optional argument to ResourceSparseApplyAdagradDA.
-type ResourceSparseApplyAdagradDAAttr func(optionalAttr)
+// MultinomialAttr is an optional argument to Multinomial.
+type MultinomialAttr func(optionalAttr)
 
-// ResourceSparseApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
+// MultinomialSeed sets the optional seed attribute to value.
 //
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceSparseApplyAdagradDAUseLocking(value bool) ResourceSparseApplyAdagradDAAttr {
+// value: If either seed or seed2 is set to be non-zero, the internal random number
+// generator is seeded by the given seed.  Otherwise, a random seed is used.
+// If not specified, defaults to 0
+func MultinomialSeed(value int64) MultinomialAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["seed"] = value
 	}
 }
 
-// Update entries in '*var' and '*accum' according to the proximal adagrad scheme.
+// MultinomialSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func MultinomialSeed2(value int64) MultinomialAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// MultinomialOutputDtype sets the optional output_dtype attribute to value.
+// If not specified, defaults to DT_INT64
+func MultinomialOutputDtype(value tf.DataType) MultinomialAttr {
+	return func(m optionalAttr) {
+		m["output_dtype"] = value
+	}
+}
+
+// Draws samples from a multinomial distribution.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	gradient_accumulator: Should be from a Variable().
-//	gradient_squared_accumulator: Should be from a Variable().
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	lr: Learning rate. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	global_step: Training step number. Must be a scalar.
+//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
+// represents the unnormalized log probabilities for all classes.
+//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
 //
-// Returns the created operation.
-func ResourceSparseApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceSparseApplyAdagradDAAttr) (o *tf.Operation) {
+// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
+// contains the drawn class labels with range `[0, num_classes)`.
+func Multinomial(scope *Scope, logits tf.Output, num_samples tf.Output, optional ...MultinomialAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -9018,13 +10863,14 @@ func ResourceSparseApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumul
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyAdagradDA",
+		Type: "Multinomial",
 		Input: []tf.Input{
-			var_, gradient_accumulator, gradient_squared_accumulator, grad, indices, lr, l1, l2, global_step,
+			logits, num_samples,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
 // Returns the truth value of NOT x element-wise.
@@ -9367,171 +11213,35 @@ func MutableHashTableOfTensorsV2(scope *Scope, key_dtype tf.DataType, value_dtyp
 	return op.Output(0)
 }
 
-// HashTableV2Attr is an optional argument to HashTableV2.
-type HashTableV2Attr func(optionalAttr)
-
-// HashTableV2Container sets the optional container attribute to value.
+// Inverse 2D fast Fourier transform.
 //
-// value: If non-empty, this table is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func HashTableV2Container(value string) HashTableV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// HashTableV2SharedName sets the optional shared_name attribute to value.
+// Computes the inverse 2-dimensional discrete Fourier transform over the
+// inner-most 2 dimensions of `input`.
 //
-// value: If non-empty, this table is shared under the given name across
-// multiple sessions.
-// If not specified, defaults to ""
-func HashTableV2SharedName(value string) HashTableV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+// Arguments:
+//	input: A complex64 tensor.
+//
+// Returns A complex64 tensor of the same shape as `input`. The inner-most 2
+//   dimensions of `input` are replaced with their inverse 2D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.ifft2
+// @end_compatibility
+func IFFT2D(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IFFT2D",
+		Input: []tf.Input{
+			input,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// HashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
-//
-// value: If true and shared_name is empty, the table is shared
-// using the node name.
-// If not specified, defaults to false
-func HashTableV2UseNodeNameSharing(value bool) HashTableV2Attr {
-	return func(m optionalAttr) {
-		m["use_node_name_sharing"] = value
-	}
-}
-
-// Creates a non-initialized hash table.
-//
-// This op creates a hash table, specifying the type of its keys and values.
-// Before using the table you will have to initialize it.  After initialization the
-// table will be immutable.
-//
-// Arguments:
-//	key_dtype: Type of the table keys.
-//	value_dtype: Type of the table values.
-//
-// Returns Handle to a table.
-func HashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...HashTableV2Attr) (table_handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "HashTableV2",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// MapUnstageNoKeyAttr is an optional argument to MapUnstageNoKey.
-type MapUnstageNoKeyAttr func(optionalAttr)
-
-// MapUnstageNoKeyCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapUnstageNoKeyCapacity(value int64) MapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// MapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapUnstageNoKeyMemoryLimit(value int64) MapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// MapUnstageNoKeyContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapUnstageNoKeyContainer(value string) MapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MapUnstageNoKeySharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapUnstageNoKeySharedName(value string) MapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op removes and returns a random (key, value)
-//
-// from the underlying container.   If the underlying container
-// does not contain elements, the op will block until it does.
-func MapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MapUnstageNoKey",
-		Input: []tf.Input{
-			indices,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	key = op.Output(idx)
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("MapUnstageNoKey", err)
-		return
-	}
-	return key, values
-}
-
-// Inverse 2D fast Fourier transform.
-//
-// Computes the inverse 2-dimensional discrete Fourier transform over the
-// inner-most 2 dimensions of `input`.
-//
-// Arguments:
-//	input: A complex64 tensor.
-//
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 2
-//   dimensions of `input` are replaced with their inverse 2D Fourier transform.
-//
-// @compatibility(numpy)
-// Equivalent to np.fft.ifft2
-// @end_compatibility
-func IFFT2D(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "IFFT2D",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Creates a tensor filled with a scalar value.
+// Creates a tensor filled with a scalar value.
 //
 // This operation creates a tensor of shape `dims` and fills it with `value`.
 //
@@ -9940,55 +11650,53 @@ func TopK(scope *Scope, input tf.Output, k int64, optional ...TopKAttr) (values
 	return op.Output(0), op.Output(1)
 }
 
-// Transforms a Tensor into a serialized TensorProto proto.
+// Compute the Hurwitz zeta function \\(\zeta(x, q)\\).
 //
-// Arguments:
-//	tensor: A Tensor of type `T`.
+// The Hurwitz zeta function is defined as:
 //
-// Returns A serialized TensorProto proto of the input tensor.
-func SerializeTensor(scope *Scope, tensor tf.Output) (serialized tf.Output) {
+//
+// \\(\zeta(x, q) = \sum_{n=0}^{\infty} (q + n)^{-x}\\)
+func Zeta(scope *Scope, x tf.Output, q tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SerializeTensor",
+		Type: "Zeta",
 		Input: []tf.Input{
-			tensor,
+			x, q,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MatrixSolveAttr is an optional argument to MatrixSolve.
-type MatrixSolveAttr func(optionalAttr)
+// ProdAttr is an optional argument to Prod.
+type ProdAttr func(optionalAttr)
 
-// MatrixSolveAdjoint sets the optional adjoint attribute to value.
+// ProdKeepDims sets the optional keep_dims attribute to value.
 //
-// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
-// adjoint.
+// value: If true, retain reduced dimensions with length 1.
 // If not specified, defaults to false
-func MatrixSolveAdjoint(value bool) MatrixSolveAttr {
+func ProdKeepDims(value bool) ProdAttr {
 	return func(m optionalAttr) {
-		m["adjoint"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Solves systems of linear equations.
+// Computes the product of elements across dimensions of a tensor.
 //
-// `Matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. `Rhs` is a tensor of shape `[..., M, K]`. The `output` is
-// a tensor shape `[..., M, K]`.  If `adjoint` is `False` then each output matrix
-// satisfies `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
-// If `adjoint` is `True` then each output matrix satisfies
-// `adjoint(matrix[..., :, :]) * output[..., :, :] = rhs[..., :, :]`.
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	matrix: Shape is `[..., M, M]`.
-//	rhs: Shape is `[..., M, K]`.
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-// Returns Shape is `[..., M, K]`.
-func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixSolveAttr) (output tf.Output) {
+// Returns The reduced tensor.
+func Prod(scope *Scope, input tf.Output, axis tf.Output, optional ...ProdAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -9997,9 +11705,9 @@ func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...Matr
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixSolve",
+		Type: "Prod",
 		Input: []tf.Input{
-			matrix, rhs,
+			input, axis,
 		},
 		Attrs: attrs,
 	}
@@ -10007,30 +11715,60 @@ func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...Matr
 	return op.Output(0)
 }
 
-// Looks up keys in a table, outputs the corresponding values.
+// FusedResizeAndPadConv2DAttr is an optional argument to FusedResizeAndPadConv2D.
+type FusedResizeAndPadConv2DAttr func(optionalAttr)
+
+// FusedResizeAndPadConv2DResizeAlignCorners sets the optional resize_align_corners attribute to value.
 //
-// The tensor `keys` must of the same type as the keys of the table.
-// The output `values` is of the type of the table values.
+// value: If true, rescale input by (new_height - 1) / (height - 1),
+// which exactly aligns the 4 corners of images and resized images. If false, rescale
+// by new_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
+func FusedResizeAndPadConv2DResizeAlignCorners(value bool) FusedResizeAndPadConv2DAttr {
+	return func(m optionalAttr) {
+		m["resize_align_corners"] = value
+	}
+}
+
+// Performs a resize and padding as a preprocess during a convolution.
 //
-// The scalar `default_value` is the value output for keys not present in the
-// table. It must also be of the same type as the table values.
+// It's often possible to do spatial transformations more efficiently as part of
+// the packing stage of a convolution, so this op allows for an optimized
+// implementation where these stages are fused together. This prevents the need to
+// write out the intermediate results as whole tensors, reducing memory pressure,
+// and we can get some latency gains by merging the transformation calculations.
+// The data_format attribute for Conv2D isn't supported by this op, and defaults to
+// 'NHWC' order.
+// Internally this op uses a single per-graph scratch buffer, which means that it
+// will block if multiple versions are being run in parallel. This is because this
+// operator is primarily an optimization to minimize memory usage.
 //
 // Arguments:
-//	table_handle: Handle to the table.
-//	keys: Any shape.  Keys to look up.
-//
+//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
+//	size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
+//	paddings: A two-column matrix specifying the padding sizes. The number of
+// rows must be the same as the rank of `input`.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.
 //
-// Returns Same shape as `keys`.  Values found in the table, or `default_values`
-// for missing keys.
-func LookupTableFindV2(scope *Scope, table_handle tf.Output, keys tf.Output, default_value tf.Output) (values tf.Output) {
+//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
+// of `input`. Must be in the same order as the dimension specified with format.
+//	padding: The type of padding algorithm to use.
+func FusedResizeAndPadConv2D(scope *Scope, input tf.Output, size tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string, optional ...FusedResizeAndPadConv2DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "LookupTableFindV2",
+		Type: "FusedResizeAndPadConv2D",
 		Input: []tf.Input{
-			table_handle, keys, default_value,
+			input, size, paddings, filter,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
@@ -10990,20 +12728,60 @@ func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_va
 	return op.Output(0)
 }
 
-// MinAttr is an optional argument to Min.
-type MinAttr func(optionalAttr)
-
-// MinKeepDims sets the optional keep_dims attribute to value.
+// Concatenates tensors along one dimension.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func MinKeepDims(value bool) MinAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the minimum of elements across dimensions of a tensor.
+// Arguments:
+//	values: List of `N` Tensors to concatenate. Their ranks and types must match,
+// and their sizes must match in all dimensions except `concat_dim`.
+//	axis: 0-D.  The dimension along which to concatenate.  Must be in the
+// range [-rank(values), rank(values)).
+//
+// Returns A `Tensor` with the concatenation of values stacked along the
+// `concat_dim` dimension.  This tensor's shape matches that of `values` except
+// in `concat_dim` where it has the sum of the sizes.
+func ConcatV2(scope *Scope, values []tf.Output, axis tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ConcatV2",
+		Input: []tf.Input{
+			tf.OutputList(values), axis,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Reads and outputs the entire contents of the input filename.
+func ReadFile(scope *Scope, filename tf.Output) (contents tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReadFile",
+		Input: []tf.Input{
+			filename,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MinAttr is an optional argument to Min.
+type MinAttr func(optionalAttr)
+
+// MinKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func MinKeepDims(value bool) MinAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the minimum of elements across dimensions of a tensor.
 //
 // Reduces `input` along the dimensions given in `axis`. Unless
 // `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
@@ -11053,256 +12831,252 @@ func Transpose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// DepthwiseConv2dNativeBackpropFilterAttr is an optional argument to DepthwiseConv2dNativeBackpropFilter.
-type DepthwiseConv2dNativeBackpropFilterAttr func(optionalAttr)
+// Computes sigmoid of `x` element-wise.
+//
+// Specifically, `y = 1 / (1 + exp(-x))`.
+func Sigmoid(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Sigmoid",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// DepthwiseConv2dNativeBackpropFilterDataFormat sets the optional data_format attribute to value.
+// FusedBatchNormAttr is an optional argument to FusedBatchNorm.
+type FusedBatchNormAttr func(optionalAttr)
+
+// FusedBatchNormEpsilon sets the optional epsilon attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormEpsilon(value float32) FusedBatchNormAttr {
+	return func(m optionalAttr) {
+		m["epsilon"] = value
+	}
+}
+
+// FusedBatchNormDataFormat sets the optional data_format attribute to value.
+//
+// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
 // If not specified, defaults to "NHWC"
-func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2dNativeBackpropFilterAttr {
+func FusedBatchNormDataFormat(value string) FusedBatchNormAttr {
 	return func(m optionalAttr) {
 		m["data_format"] = value
 	}
 }
 
-// DepthwiseConv2dNativeBackpropFilterDilations sets the optional dilations attribute to value.
+// FusedBatchNormIsTraining sets the optional is_training attribute to value.
 //
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
-// element on that dimension. The dimension order is determined by the value of
-// `data_format`, see above for details. Dilations in the batch and depth
-// dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr {
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
+// If not specified, defaults to true
+func FusedBatchNormIsTraining(value bool) FusedBatchNormAttr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["is_training"] = value
 	}
 }
 
-// Computes the gradients of depthwise convolution with respect to the filter.
+// Batch normalization.
+//
+// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
 //
 // Arguments:
-//	input: 4-D with shape based on `data_format`.  For example, if
-// `data_format` is 'NHWC' then `input` is a 4-D `[batch, in_height,
-// in_width, in_channels]` tensor.
-//	filter_sizes: An integer vector representing the tensor shape of `filter`,
-// where `filter` is a 4-D
-// `[filter_height, filter_width, in_channels, depthwise_multiplier]` tensor.
-//	out_backprop: 4-D with shape  based on `data_format`.
-// For example, if `data_format` is 'NHWC' then
-// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution.
-//	padding: The type of padding algorithm to use.
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	offset: A 1D Tensor for offset, to shift to the normalized x.
+//	mean: A 1D Tensor for population mean. Used for inference only;
+// must be empty for training.
+//	variance: A 1D Tensor for population variance. Used for inference only;
+// must be empty for training.
 //
-// Returns 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
-// the `filter` input of the convolution.
-func DepthwiseConv2dNativeBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropFilterAttr) (output tf.Output) {
+// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
+// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
+// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
+// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
+// in the cuDNN case), to be reused in the gradient computation.
+func FusedBatchNorm(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormAttr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DepthwiseConv2dNativeBackpropFilter",
+		Type: "FusedBatchNorm",
 		Input: []tf.Input{
-			input, filter_sizes, out_backprop,
+			x, scale, offset, mean, variance,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+}
+
+// RandomStandardNormalAttr is an optional argument to RandomStandardNormal.
+type RandomStandardNormalAttr func(optionalAttr)
+
+// RandomStandardNormalSeed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomStandardNormalSeed(value int64) RandomStandardNormalAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// RandomStandardNormalSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomStandardNormalSeed2(value int64) RandomStandardNormalAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
 }
 
-// Flushes the writer's unwritten events.
+// Outputs random values from a normal distribution.
+//
+// The generated values will have mean 0 and standard deviation 1.
 //
 // Arguments:
-//	writer: A handle to the summary writer resource.
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
 //
-// Returns the created operation.
-func FlushSummaryWriter(scope *Scope, writer tf.Output) (o *tf.Operation) {
+// Returns A tensor of the specified shape filled with random normal values.
+func RandomStandardNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomStandardNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "FlushSummaryWriter",
+		Type: "RandomStandardNormal",
 		Input: []tf.Input{
-			writer,
+			shape,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// QuantizeV2Attr is an optional argument to QuantizeV2.
-type QuantizeV2Attr func(optionalAttr)
-
-// QuantizeV2Mode sets the optional mode attribute to value.
-// If not specified, defaults to "MIN_COMBINED"
-func QuantizeV2Mode(value string) QuantizeV2Attr {
-	return func(m optionalAttr) {
-		m["mode"] = value
+// Component-wise divides a SparseTensor by a dense Tensor.
+//
+// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
+// the other direction.
+//
+// Arguments:
+//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	dense: `R`-D.  The dense Tensor operand.
+//
+// Returns 1-D.  The `N` values that are operated on.
+func SparseDenseCwiseDiv(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// QuantizeV2RoundMode sets the optional round_mode attribute to value.
-// If not specified, defaults to "HALF_AWAY_FROM_ZERO"
-func QuantizeV2RoundMode(value string) QuantizeV2Attr {
-	return func(m optionalAttr) {
-		m["round_mode"] = value
+	opspec := tf.OpSpec{
+		Type: "SparseDenseCwiseDiv",
+		Input: []tf.Input{
+			sp_indices, sp_values, sp_shape, dense,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Quantize the 'input' tensor of type float to 'output' tensor of type 'T'.
-//
-// [min_range, max_range] are scalar floats that specify the range for
-// the 'input' data. The 'mode' attribute controls exactly which calculations are
-// used to convert the float values to their quantized equivalents.  The
-// 'round_mode' attribute controls which rounding tie-breaking algorithm is used
-// when rounding float values to their quantized equivalents.
+// FractionalAvgPoolGradAttr is an optional argument to FractionalAvgPoolGrad.
+type FractionalAvgPoolGradAttr func(optionalAttr)
+
+// FractionalAvgPoolGradOverlapping sets the optional overlapping attribute to value.
 //
-// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
 //
-// ```
-// out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
-// if T == qint8, out[i] -= (range(T) + 1) / 2.0
-// ```
-// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
+// `index  0  1  2  3  4`
 //
-// *MIN_COMBINED Mode Example*
+// `value  20 5  16 3  7`
 //
-// Assume the input is type float and has a possible range of [0.0, 6.0] and the
-// output type is quint8 ([0, 255]). The min_range and max_range values should be
-// specified as 0.0 and 6.0. Quantizing from float to quint8 will multiply each
-// value of the input by 255/6 and cast to quint8.
-//
-// If the output type was qint8 ([-128, 127]), the operation will additionally
-// subtract each value by 128 prior to casting, so that the range of values aligns
-// with the range of qint8.
-//
-// If the mode is 'MIN_FIRST', then this approach is used:
-//
-// ```
-// num_discrete_values = 1 << (# of bits in T)
-// range_adjust = num_discrete_values / (num_discrete_values - 1)
-// range = (range_max - range_min) * range_adjust
-// range_scale = num_discrete_values / range
-// quantized = round(input * range_scale) - round(range_min * range_scale) +
-//   numeric_limits<T>::min()
-// quantized = max(quantized, numeric_limits<T>::min())
-// quantized = min(quantized, numeric_limits<T>::max())
-// ```
-//
-// The biggest difference between this and MIN_COMBINED is that the minimum range
-// is rounded first, before it's subtracted from the rounded value. With
-// MIN_COMBINED, a small bias is introduced where repeated iterations of quantizing
-// and dequantizing will introduce a larger and larger error.
-//
-// *SCALED mode Example*
-//
-// `SCALED` mode matches the quantization approach used in
-// `QuantizeAndDequantize{V2|V3}`.
-//
-// If the mode is `SCALED`, we do not use the full range of the output type,
-// choosing to elide the lowest possible value for symmetry (e.g., output range is
-// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
-// 0.
-//
-// We first find the range of values in our tensor. The
-// range we use is always centered on 0, so we find m such that
-// ```c++
-//   m = max(abs(input_min), abs(input_max))
-// ```
-//
-// Our input tensor range is then `[-m, m]`.
-//
-// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
-// If T is signed, this is
-// ```
-//   num_bits = sizeof(T) * 8
-//   [min_fixed, max_fixed] =
-//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
-// ```
-//
-// Otherwise, if T is unsigned, the fixed-point range is
-// ```
-//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
-// ```
-//
-// From this we compute our scaling factor, s:
-// ```c++
-//   s = (max_fixed - min_fixed) / (2 * m)
-// ```
-//
-// Now we can quantize the elements of our tensor:
-// ```c++
-// result = round(input * s)
-// ```
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [41/3, 26/3] for fractional avg pooling.
+// If not specified, defaults to false
+func FractionalAvgPoolGradOverlapping(value bool) FractionalAvgPoolGradAttr {
+	return func(m optionalAttr) {
+		m["overlapping"] = value
+	}
+}
+
+// Computes gradient of the FractionalAvgPool function.
 //
-// One thing to watch out for is that the operator may choose to adjust the
-// requested minimum and maximum values slightly during the quantization process,
-// so you should always use the output ports as the range for further calculations.
-// For example, if the requested minimum and maximum values are close to equal,
-// they will be separated by a small epsilon value to prevent ill-formed quantized
-// buffers from being created. Otherwise, you can end up with buffers where all the
-// quantized values map to the same float value, which causes problems for
-// operations that have to perform further calculations on them.
+// Unlike FractionalMaxPoolGrad, we don't need to find arg_max for
+// FractionalAvgPoolGrad, we just need to evenly back-propagate each element of
+// out_backprop to those indices that form the same pooling cell. Therefore, we
+// just need to know the shape of original input tensor, instead of the whole
+// tensor.
 //
 // Arguments:
+//	orig_input_tensor_shape: Original input tensor shape for `fractional_avg_pool`
+//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
+// w.r.t. the output of `fractional_avg_pool`.
+//	row_pooling_sequence: row pooling sequence, form pooling region with
+// col_pooling_sequence.
+//	col_pooling_sequence: column pooling sequence, form pooling region with
+// row_pooling sequence.
 //
-//	min_range: The minimum scalar value possibly produced for the input.
-//	max_range: The maximum scalar value possibly produced for the input.
-//
-//
-// Returns The quantized data produced from the float input.The actual minimum scalar value used for the output.The actual maximum scalar value used for the output.
-func QuantizeV2(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, T tf.DataType, optional ...QuantizeV2Attr) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+// Returns 4-D.  Gradients w.r.t. the input of `fractional_avg_pool`.
+func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalAvgPoolGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"T": T}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizeV2",
+		Type: "FractionalAvgPoolGrad",
 		Input: []tf.Input{
-			input, min_range, max_range,
+			orig_input_tensor_shape, out_backprop, row_pooling_sequence, col_pooling_sequence,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Component-wise divides a SparseTensor by a dense Tensor.
-//
-// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
-// the other direction.
+// Concatenates tensors along one dimension.
 //
 // Arguments:
-//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//	dense: `R`-D.  The dense Tensor operand.
+//	concat_dim: 0-D.  The dimension along which to concatenate.  Must be in the
+// range [0, rank(values)).
+//	values: The `N` Tensors to concatenate. Their ranks and types must match,
+// and their sizes must match in all dimensions except `concat_dim`.
 //
-// Returns 1-D.  The `N` values that are operated on.
-func SparseDenseCwiseDiv(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+// Returns A `Tensor` with the concatenation of values stacked along the
+// `concat_dim` dimension.  This tensor's shape matches that of `values` except
+// in `concat_dim` where it has the sum of the sizes.
+func Concat(scope *Scope, concat_dim tf.Output, values []tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseDenseCwiseDiv",
+		Type: "Concat",
 		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape, dense,
+			concat_dim, tf.OutputList(values),
 		},
 	}
 	op := scope.AddOperation(opspec)
@@ -11417,327 +13191,86 @@ func MaxPoolGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output,
 	return op.Output(0)
 }
 
-// Returns the truth value of (x >= y) element-wise.
+// Returns element-wise integer closest to x.
 //
-// *NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func GreaterEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// If the result is midway between two representable values,
+// the even representable is chosen.
+// For example:
+//
+// ```
+// rint(-1.5) ==> -2.0
+// rint(0.5000001) ==> 1.0
+// rint([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) ==> [-2., -2., -0., 0., 2., 2., 2.]
+// ```
+func Rint(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "GreaterEqual",
+		Type: "Rint",
 		Input: []tf.Input{
-			x, y,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Conv3DAttr is an optional argument to Conv3D.
-type Conv3DAttr func(optionalAttr)
+// OrderedMapUnstageNoKeyAttr is an optional argument to OrderedMapUnstageNoKey.
+type OrderedMapUnstageNoKeyAttr func(optionalAttr)
 
-// Conv3DDataFormat sets the optional data_format attribute to value.
+// OrderedMapUnstageNoKeyCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func Conv3DDataFormat(value string) Conv3DAttr {
+// REQUIRES: value >= 0
+func OrderedMapUnstageNoKeyCapacity(value int64) OrderedMapUnstageNoKeyAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["capacity"] = value
 	}
 }
 
-// Conv3DDilations sets the optional dilations attribute to value.
+// OrderedMapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// value: 1-D tensor of length 5.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
-func Conv3DDilations(value []int64) Conv3DAttr {
+// REQUIRES: value >= 0
+func OrderedMapUnstageNoKeyMemoryLimit(value int64) OrderedMapUnstageNoKeyAttr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["memory_limit"] = value
 	}
 }
 
-// Computes a 3-D convolution given 5-D `input` and `filter` tensors.
-//
-// In signal processing, cross-correlation is a measure of similarity of
-// two waveforms as a function of a time-lag applied to one of them. This
-// is also known as a sliding dot product or sliding inner-product.
-//
-// Our Conv3D implements a form of cross-correlation.
+// OrderedMapUnstageNoKeyContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func OrderedMapUnstageNoKeyContainer(value string) OrderedMapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// OrderedMapUnstageNoKeySharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func OrderedMapUnstageNoKeySharedName(value string) OrderedMapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op removes and returns the (key, value) element with the smallest
 //
-// Arguments:
-//	input: Shape `[batch, in_depth, in_height, in_width, in_channels]`.
-//	filter: Shape `[filter_depth, filter_height, filter_width, in_channels,
-// out_channels]`. `in_channels` must match between `input` and `filter`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv3DAttr) (output tf.Output) {
+// key from the underlying container.   If the underlying container
+// does not contain elements, the op will block until it does.
+func OrderedMapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv3D",
+		Type: "OrderedMapUnstageNoKey",
 		Input: []tf.Input{
-			input, filter,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Adds up a SparseTensor and a dense Tensor, using these special rules:
-//
-// (1) Broadcasts the dense side to have the same shape as the sparse side, if
-//     eligible;
-// (2) Then, only the dense values pointed to by the indices of the SparseTensor
-//     participate in the cwise addition.
-//
-// By these rules, the result is a logical SparseTensor with exactly the same
-// indices and shape, but possibly with different non-zero values.  The output of
-// this Op is the resultant non-zero values.
-//
-// Arguments:
-//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//	dense: `R`-D.  The dense Tensor operand.
-//
-// Returns 1-D.  The `N` values that are operated on.
-func SparseDenseCwiseAdd(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseDenseCwiseAdd",
-		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape, dense,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Read an element from the TensorArray into output `value`.
-//
-// Arguments:
-//	handle: The handle to a TensorArray.
-//
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	dtype: The type of the elem that is returned.
-//
-// Returns The tensor that is read from the TensorArray.
-func TensorArrayReadV3(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayReadV3",
-		Input: []tf.Input{
-			handle, index, flow_in,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// EncodePngAttr is an optional argument to EncodePng.
-type EncodePngAttr func(optionalAttr)
-
-// EncodePngCompression sets the optional compression attribute to value.
-//
-// value: Compression level.
-// If not specified, defaults to -1
-func EncodePngCompression(value int64) EncodePngAttr {
-	return func(m optionalAttr) {
-		m["compression"] = value
-	}
-}
-
-// PNG-encode an image.
-//
-// `image` is a 3-D uint8 or uint16 Tensor of shape `[height, width, channels]`
-// where `channels` is:
-//
-// *   1: for grayscale.
-// *   2: for grayscale + alpha.
-// *   3: for RGB.
-// *   4: for RGBA.
-//
-// The ZLIB compression level, `compression`, can be -1 for the PNG-encoder
-// default or a value from 0 to 9.  9 is the highest compression level, generating
-// the smallest output, but is slower.
-//
-// Arguments:
-//	image: 3-D with shape `[height, width, channels]`.
-//
-// Returns 0-D. PNG-encoded image.
-func EncodePng(scope *Scope, image tf.Output, optional ...EncodePngAttr) (contents tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "EncodePng",
-		Input: []tf.Input{
-			image,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// DataFormatVecPermuteAttr is an optional argument to DataFormatVecPermute.
-type DataFormatVecPermuteAttr func(optionalAttr)
-
-// DataFormatVecPermuteSrcFormat sets the optional src_format attribute to value.
-//
-// value: source data format.
-// If not specified, defaults to "NHWC"
-func DataFormatVecPermuteSrcFormat(value string) DataFormatVecPermuteAttr {
-	return func(m optionalAttr) {
-		m["src_format"] = value
-	}
-}
-
-// DataFormatVecPermuteDstFormat sets the optional dst_format attribute to value.
-//
-// value: destination data format.
-// If not specified, defaults to "NCHW"
-func DataFormatVecPermuteDstFormat(value string) DataFormatVecPermuteAttr {
-	return func(m optionalAttr) {
-		m["dst_format"] = value
-	}
-}
-
-// Returns the permuted vector/tensor in the destination data format given the
-//
-// one in the source data format.
-//
-// Arguments:
-//	x: Vector of size 4 or Tensor of shape (4, 2) in source data format.
-//
-// Returns Vector of size 4 or Tensor of shape (4, 2) in destination data format.
-func DataFormatVecPermute(scope *Scope, x tf.Output, optional ...DataFormatVecPermuteAttr) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "DataFormatVecPermute",
-		Input: []tf.Input{
-			x,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns element-wise integer closest to x.
-//
-// If the result is midway between two representable values,
-// the even representable is chosen.
-// For example:
-//
-// ```
-// rint(-1.5) ==> -2.0
-// rint(0.5000001) ==> 1.0
-// rint([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) ==> [-2., -2., -0., 0., 2., 2., 2.]
-// ```
-func Rint(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Rint",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// OrderedMapUnstageNoKeyAttr is an optional argument to OrderedMapUnstageNoKey.
-type OrderedMapUnstageNoKeyAttr func(optionalAttr)
-
-// OrderedMapUnstageNoKeyCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapUnstageNoKeyCapacity(value int64) OrderedMapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// OrderedMapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapUnstageNoKeyMemoryLimit(value int64) OrderedMapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// OrderedMapUnstageNoKeyContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func OrderedMapUnstageNoKeyContainer(value string) OrderedMapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// OrderedMapUnstageNoKeySharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func OrderedMapUnstageNoKeySharedName(value string) OrderedMapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op removes and returns the (key, value) element with the smallest
-//
-// key from the underlying container.   If the underlying container
-// does not contain elements, the op will block until it does.
-func OrderedMapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "OrderedMapUnstageNoKey",
-		Input: []tf.Input{
-			indices,
+			indices,
 		},
 		Attrs: attrs,
 	}
@@ -11909,137 +13442,118 @@ func RemoteFusedGraphExecute(scope *Scope, inputs []tf.Output, Toutputs []tf.Dat
 	return outputs
 }
 
-// ThreadUnsafeUnigramCandidateSamplerAttr is an optional argument to ThreadUnsafeUnigramCandidateSampler.
-type ThreadUnsafeUnigramCandidateSamplerAttr func(optionalAttr)
-
-// ThreadUnsafeUnigramCandidateSamplerSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func ThreadUnsafeUnigramCandidateSamplerSeed(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
+// SerializeManySparseAttr is an optional argument to SerializeManySparse.
+type SerializeManySparseAttr func(optionalAttr)
 
-// ThreadUnsafeUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+// SerializeManySparseOutType sets the optional out_type attribute to value.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func ThreadUnsafeUnigramCandidateSamplerSeed2(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
+// value: The `dtype` to use for serialization; the supported types are `string`
+// (default) and `variant`.
+// If not specified, defaults to DT_STRING
+func SerializeManySparseOutType(value tf.DataType) SerializeManySparseAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["out_type"] = value
 	}
 }
 
-// Generates labels for candidate sampling with a learned unigram distribution.
+// Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor` object.
 //
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
+// The `SparseTensor` must have rank `R` greater than 1, and the first dimension
+// is treated as the minibatch dimension.  Elements of the `SparseTensor`
+// must be sorted in increasing order of this first dimension.  The serialized
+// `SparseTensor` objects going into each row of `serialized_sparse` will have
+// rank `R-1`.
 //
-// For each batch, this op picks a single set of sampled candidate labels.
-//
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
+// The minibatch size `N` is extracted from `sparse_shape[0]`.
 //
 // Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//	range_max: The sampler will sample integers from the interval [0, range_max).
-//
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func ThreadUnsafeUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...ThreadUnsafeUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
+//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
+func SerializeManySparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeManySparseAttr) (serialized_sparse tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ThreadUnsafeUnigramCandidateSampler",
+		Type: "SerializeManySparse",
 		Input: []tf.Input{
-			true_classes,
+			sparse_indices, sparse_values, sparse_shape,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// MaxPoolV2Attr is an optional argument to MaxPoolV2.
-type MaxPoolV2Attr func(optionalAttr)
-
-// MaxPoolV2DataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolV2DataFormat(value string) MaxPoolV2Attr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
+	return op.Output(0)
 }
 
-// Performs max pooling on the input.
-//
-// Arguments:
-//	input: 4-D input to pool over.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
-//
-// Returns The max pooled output tensor.
-func MaxPoolV2(scope *Scope, input tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolV2Attr) (output tf.Output) {
+// Computes inverse hyperbolic cosine of x element-wise.
+func Acosh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolV2",
+		Type: "Acosh",
 		Input: []tf.Input{
-			input, ksize, strides,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Deprecated. Use TensorArrayReadV3
+// TensorArrayV2Attr is an optional argument to TensorArrayV2.
+type TensorArrayV2Attr func(optionalAttr)
+
+// TensorArrayV2ElementShape sets the optional element_shape attribute to value.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayV2ElementShape(value tf.Shape) TensorArrayV2Attr {
+	return func(m optionalAttr) {
+		m["element_shape"] = value
+	}
+}
+
+// TensorArrayV2DynamicSize sets the optional dynamic_size attribute to value.
+// If not specified, defaults to false
+func TensorArrayV2DynamicSize(value bool) TensorArrayV2Attr {
+	return func(m optionalAttr) {
+		m["dynamic_size"] = value
+	}
+}
+
+// TensorArrayV2ClearAfterRead sets the optional clear_after_read attribute to value.
+// If not specified, defaults to true
+func TensorArrayV2ClearAfterRead(value bool) TensorArrayV2Attr {
+	return func(m optionalAttr) {
+		m["clear_after_read"] = value
+	}
+}
+
+// TensorArrayV2TensorArrayName sets the optional tensor_array_name attribute to value.
+// If not specified, defaults to ""
+func TensorArrayV2TensorArrayName(value string) TensorArrayV2Attr {
+	return func(m optionalAttr) {
+		m["tensor_array_name"] = value
+	}
+}
+
+// Deprecated. Use TensorArrayV3
 //
-// DEPRECATED at GraphDef version 26: Use TensorArrayReadV3
-func TensorArrayReadV2(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArrayV3
+func TensorArrayV2(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayReadV2",
+		Type: "TensorArrayV2",
 		Input: []tf.Input{
-			handle, index, flow_in,
+			size,
 		},
 		Attrs: attrs,
 	}
@@ -12047,106 +13561,306 @@ func TensorArrayReadV2(scope *Scope, handle tf.Output, index tf.Output, flow_in
 	return op.Output(0)
 }
 
-// Does nothing. Serves as a control trigger for scheduling.
-//
-// Only useful as a placeholder for control edges.
+// DecodeCSVAttr is an optional argument to DecodeCSV.
+type DecodeCSVAttr func(optionalAttr)
+
+// DecodeCSVFieldDelim sets the optional field_delim attribute to value.
 //
-// Returns the created operation.
-func ControlTrigger(scope *Scope) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
+// value: char delimiter to separate fields in a record.
+// If not specified, defaults to ","
+func DecodeCSVFieldDelim(value string) DecodeCSVAttr {
+	return func(m optionalAttr) {
+		m["field_delim"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "ControlTrigger",
+}
+
+// DecodeCSVUseQuoteDelim sets the optional use_quote_delim attribute to value.
+//
+// value: If false, treats double quotation marks as regular
+// characters inside of the string fields (ignoring RFC 4180, Section 2,
+// Bullet 5).
+// If not specified, defaults to true
+func DecodeCSVUseQuoteDelim(value bool) DecodeCSVAttr {
+	return func(m optionalAttr) {
+		m["use_quote_delim"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// Batch normalization.
+// DecodeCSVNaValue sets the optional na_value attribute to value.
 //
-// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
+// value: Additional string to recognize as NA/NaN.
+// If not specified, defaults to ""
+func DecodeCSVNaValue(value string) DecodeCSVAttr {
+	return func(m optionalAttr) {
+		m["na_value"] = value
+	}
+}
+
+// Convert CSV records to tensors. Each column maps to one tensor.
 //
-// This op is deprecated. Prefer `tf.nn.batch_normalization`.
+// RFC 4180 format is expected for the CSV records.
+// (https://tools.ietf.org/html/rfc4180)
+// Note that we allow leading and trailing spaces with int or float field.
 //
 // Arguments:
-//	t: A 4D input Tensor.
-//	m: A 1D mean Tensor with size matching the last dimension of t.
-// This is the first output from tf.nn.moments,
-// or a saved moving average thereof.
-//	v: A 1D variance Tensor with size matching the last dimension of t.
-// This is the second output from tf.nn.moments,
-// or a saved moving average thereof.
-//	beta: A 1D beta Tensor with size matching the last dimension of t.
-// An offset to be added to the normalized tensor.
-//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
-// If "scale_after_normalization" is true, this tensor will be multiplied
-// with the normalized tensor.
-//	variance_epsilon: A small float number to avoid dividing by 0.
-//	scale_after_normalization: A bool indicating whether the resulted tensor
-// needs to be multiplied with gamma.
-func BatchNormWithGlobalNormalization(scope *Scope, t tf.Output, m tf.Output, v tf.Output, beta tf.Output, gamma tf.Output, variance_epsilon float32, scale_after_normalization bool) (result tf.Output) {
+//	records: Each string is a record/row in the csv and all records should have
+// the same format.
+//	record_defaults: One tensor per column of the input record, with either a
+// scalar default value for that column or empty if the column is required.
+//
+// Returns Each tensor will have the same shape as records.
+func DecodeCSV(scope *Scope, records tf.Output, record_defaults []tf.Output, optional ...DecodeCSVAttr) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "BatchNormWithGlobalNormalization",
+		Type: "DecodeCSV",
 		Input: []tf.Input{
-			t, m, v, beta, gamma,
+			records, tf.OutputList(record_defaults),
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("DecodeCSV", err)
+		return
+	}
+	return output
 }
 
-// MutableDenseHashTableV2Attr is an optional argument to MutableDenseHashTableV2.
-type MutableDenseHashTableV2Attr func(optionalAttr)
+// MapClearAttr is an optional argument to MapClear.
+type MapClearAttr func(optionalAttr)
 
-// MutableDenseHashTableV2Container sets the optional container attribute to value.
+// MapClearCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// value: If non-empty, this table is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func MutableDenseHashTableV2Container(value string) MutableDenseHashTableV2Attr {
+// REQUIRES: value >= 0
+func MapClearCapacity(value int64) MapClearAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["capacity"] = value
 	}
 }
 
-// MutableDenseHashTableV2SharedName sets the optional shared_name attribute to value.
+// MapClearMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// value: If non-empty, this table is shared under the given name across
-// multiple sessions.
-// If not specified, defaults to ""
-func MutableDenseHashTableV2SharedName(value string) MutableDenseHashTableV2Attr {
+// REQUIRES: value >= 0
+func MapClearMemoryLimit(value int64) MapClearAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["memory_limit"] = value
 	}
 }
 
-// MutableDenseHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
-// If not specified, defaults to false
-func MutableDenseHashTableV2UseNodeNameSharing(value bool) MutableDenseHashTableV2Attr {
+// MapClearContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapClearContainer(value string) MapClearAttr {
 	return func(m optionalAttr) {
-		m["use_node_name_sharing"] = value
+		m["container"] = value
 	}
 }
 
-// MutableDenseHashTableV2ValueShape sets the optional value_shape attribute to value.
-//
-// value: The shape of each value.
-// If not specified, defaults to <>
-func MutableDenseHashTableV2ValueShape(value tf.Shape) MutableDenseHashTableV2Attr {
+// MapClearSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapClearSharedName(value string) MapClearAttr {
 	return func(m optionalAttr) {
-		m["value_shape"] = value
+		m["shared_name"] = value
 	}
 }
 
-// MutableDenseHashTableV2InitialNumBuckets sets the optional initial_num_buckets attribute to value.
+// Op removes all elements in the underlying container.
 //
-// value: The initial number of hash table buckets. Must be a power
-// to 2.
+// Returns the created operation.
+func MapClear(scope *Scope, dtypes []tf.DataType, optional ...MapClearAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MapClear",
+
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// ThreadUnsafeUnigramCandidateSamplerAttr is an optional argument to ThreadUnsafeUnigramCandidateSampler.
+type ThreadUnsafeUnigramCandidateSamplerAttr func(optionalAttr)
+
+// ThreadUnsafeUnigramCandidateSamplerSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func ThreadUnsafeUnigramCandidateSamplerSeed(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// ThreadUnsafeUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func ThreadUnsafeUnigramCandidateSamplerSeed2(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Generates labels for candidate sampling with a learned unigram distribution.
+//
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
+//
+// For each batch, this op picks a single set of sampled candidate labels.
+//
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
+//
+// Arguments:
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to randomly sample.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//	range_max: The sampler will sample integers from the interval [0, range_max).
+//
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func ThreadUnsafeUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...ThreadUnsafeUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ThreadUnsafeUnigramCandidateSampler",
+		Input: []tf.Input{
+			true_classes,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// MaxPoolV2Attr is an optional argument to MaxPoolV2.
+type MaxPoolV2Attr func(optionalAttr)
+
+// MaxPoolV2DataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolV2DataFormat(value string) MaxPoolV2Attr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Performs max pooling on the input.
+//
+// Arguments:
+//	input: 4-D input to pool over.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The max pooled output tensor.
+func MaxPoolV2(scope *Scope, input tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolV2Attr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MaxPoolV2",
+		Input: []tf.Input{
+			input, ksize, strides,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MutableDenseHashTableV2Attr is an optional argument to MutableDenseHashTableV2.
+type MutableDenseHashTableV2Attr func(optionalAttr)
+
+// MutableDenseHashTableV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func MutableDenseHashTableV2Container(value string) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MutableDenseHashTableV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
+// If not specified, defaults to ""
+func MutableDenseHashTableV2SharedName(value string) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// MutableDenseHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+// If not specified, defaults to false
+func MutableDenseHashTableV2UseNodeNameSharing(value bool) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["use_node_name_sharing"] = value
+	}
+}
+
+// MutableDenseHashTableV2ValueShape sets the optional value_shape attribute to value.
+//
+// value: The shape of each value.
+// If not specified, defaults to <>
+func MutableDenseHashTableV2ValueShape(value tf.Shape) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["value_shape"] = value
+	}
+}
+
+// MutableDenseHashTableV2InitialNumBuckets sets the optional initial_num_buckets attribute to value.
+//
+// value: The initial number of hash table buckets. Must be a power
+// to 2.
 // If not specified, defaults to 131072
 func MutableDenseHashTableV2InitialNumBuckets(value int64) MutableDenseHashTableV2Attr {
 	return func(m optionalAttr) {
@@ -12396,119 +14110,106 @@ func TextLineDataset(scope *Scope, filenames tf.Output, compression_type tf.Outp
 	return op.Output(0)
 }
 
-// Checks whether a resource handle-based variable has been initialized.
+// Computes gradients for SparseSegmentMean.
 //
-// Arguments:
-//	resource: the input resource handle.
+// Returns tensor "output" with same shape as grad, except for dimension 0 whose
+// value is output_dim0.
 //
-// Returns a scalar boolean which is true if the variable has been
-// initialized.
-func VarIsInitializedOp(scope *Scope, resource tf.Output) (is_initialized tf.Output) {
+// Arguments:
+//	grad: gradient propagated to the SparseSegmentMean op.
+//	indices: indices passed to the corresponding SparseSegmentMean op.
+//	segment_ids: segment_ids passed to the corresponding SparseSegmentMean op.
+//	output_dim0: dimension 0 of "data" passed to SparseSegmentMean op.
+func SparseSegmentMeanGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "VarIsInitializedOp",
+		Type: "SparseSegmentMeanGrad",
 		Input: []tf.Input{
-			resource,
+			grad, indices, segment_ids, output_dim0,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Pads a tensor with zeros.
-//
-// This operation pads a `input` with zeros according to the `paddings` you
-// specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the
-// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
-// how many zeros to add before the contents of `input` in that dimension, and
-// `paddings[D, 1]` indicates how many zeros to add after the contents of `input`
-// in that dimension.
-//
-// The padded size of each dimension D of the output is:
-//
-// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
-//
-// For example:
+// Returns the truth value of (x >= y) element-wise.
 //
-// ```
-// # 't' is [[1, 1], [2, 2]]
-// # 'paddings' is [[1, 1], [2, 2]]
-// # rank of 't' is 2
-// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
-//                       [0, 0, 1, 1, 0, 0]
-//                       [0, 0, 2, 2, 0, 0]
-//                       [0, 0, 0, 0, 0, 0]]
-// ```
-func Pad(scope *Scope, input tf.Output, paddings tf.Output) (output tf.Output) {
+// *NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func GreaterEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Pad",
+		Type: "GreaterEqual",
 		Input: []tf.Input{
-			input, paddings,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SparseTensorDenseMatMulAttr is an optional argument to SparseTensorDenseMatMul.
-type SparseTensorDenseMatMulAttr func(optionalAttr)
+// Conv3DAttr is an optional argument to Conv3D.
+type Conv3DAttr func(optionalAttr)
 
-// SparseTensorDenseMatMulAdjointA sets the optional adjoint_a attribute to value.
+// Conv3DDataFormat sets the optional data_format attribute to value.
 //
-// value: Use the adjoint of A in the matrix multiply.  If A is complex, this
-// is transpose(conj(A)).  Otherwise it's transpose(A).
-// If not specified, defaults to false
-func SparseTensorDenseMatMulAdjointA(value bool) SparseTensorDenseMatMulAttr {
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func Conv3DDataFormat(value string) Conv3DAttr {
 	return func(m optionalAttr) {
-		m["adjoint_a"] = value
+		m["data_format"] = value
 	}
 }
 
-// SparseTensorDenseMatMulAdjointB sets the optional adjoint_b attribute to value.
+// Conv3DDilations sets the optional dilations attribute to value.
 //
-// value: Use the adjoint of B in the matrix multiply.  If B is complex, this
-// is transpose(conj(B)).  Otherwise it's transpose(B).
-// If not specified, defaults to false
-func SparseTensorDenseMatMulAdjointB(value bool) SparseTensorDenseMatMulAttr {
+// value: 1-D tensor of length 5.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DDilations(value []int64) Conv3DAttr {
 	return func(m optionalAttr) {
-		m["adjoint_b"] = value
+		m["dilations"] = value
 	}
 }
 
-// Multiply SparseTensor (of rank 2) "A" by dense matrix "B".
+// Computes a 3-D convolution given 5-D `input` and `filter` tensors.
 //
-// No validity checking is performed on the indices of A.  However, the following
-// input format is recommended for optimal behavior:
+// In signal processing, cross-correlation is a measure of similarity of
+// two waveforms as a function of a time-lag applied to one of them. This
+// is also known as a sliding dot product or sliding inner-product.
 //
-// if adjoint_a == false:
-//   A should be sorted in lexicographically increasing order.  Use SparseReorder
-//   if you're not sure.
-// if adjoint_a == true:
-//   A should be sorted in order of increasing dimension 1 (i.e., "column major"
-//   order instead of "row major" order).
+// Our Conv3D implements a form of cross-correlation.
 //
 // Arguments:
-//	a_indices: 2-D.  The `indices` of the `SparseTensor`, size `[nnz, 2]` Matrix.
-//	a_values: 1-D.  The `values` of the `SparseTensor`, size `[nnz]` Vector.
-//	a_shape: 1-D.  The `shape` of the `SparseTensor`, size `[2]` Vector.
-//	b: 2-D.  A dense Matrix.
-func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output, optional ...SparseTensorDenseMatMulAttr) (product tf.Output) {
+//	input: Shape `[batch, in_depth, in_height, in_width, in_channels]`.
+//	filter: Shape `[filter_depth, filter_height, filter_width, in_channels,
+// out_channels]`. `in_channels` must match between `input` and `filter`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv3DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseTensorDenseMatMul",
+		Type: "Conv3D",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b,
+			input, filter,
 		},
 		Attrs: attrs,
 	}
@@ -12516,103 +14217,57 @@ func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Outp
 	return op.Output(0)
 }
 
-// Deserialize and concatenate `SparseTensors` from a serialized minibatch.
-//
-// The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where
-// `N` is the minibatch size and the rows correspond to packed outputs of
-// `SerializeSparse`.  The ranks of the original `SparseTensor` objects
-// must all match.  When the final `SparseTensor` is created, it has rank one
-// higher than the ranks of the incoming `SparseTensor` objects
-// (they have been concatenated along a new row dimension).
-//
-// The output `SparseTensor` object's shape values for all dimensions but the
-// first are the max across the input `SparseTensor` objects' shape values
-// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
-// size.
-//
-// The input `SparseTensor` objects' indices are assumed ordered in
-// standard lexicographic order.  If this is not the case, after this
-// step run `SparseReorder` to restore index ordering.
-//
-// For example, if the serialized input is a `[2 x 3]` matrix representing two
-// original `SparseTensor` objects:
-//
-//     index = [ 0]
-//             [10]
-//             [20]
-//     values = [1, 2, 3]
-//     shape = [50]
-//
-// and
-//
-//     index = [ 2]
-//             [10]
-//     values = [4, 5]
-//     shape = [30]
+// Adds up a SparseTensor and a dense Tensor, using these special rules:
 //
-// then the final deserialized `SparseTensor` will be:
+// (1) Broadcasts the dense side to have the same shape as the sparse side, if
+//     eligible;
+// (2) Then, only the dense values pointed to by the indices of the SparseTensor
+//     participate in the cwise addition.
 //
-//     index = [0  0]
-//             [0 10]
-//             [0 20]
-//             [1  2]
-//             [1 10]
-//     values = [1, 2, 3, 4, 5]
-//     shape = [2 50]
+// By these rules, the result is a logical SparseTensor with exactly the same
+// indices and shape, but possibly with different non-zero values.  The output of
+// this Op is the resultant non-zero values.
 //
 // Arguments:
-//	serialized_sparse: 2-D, The `N` serialized `SparseTensor` objects.
-// Must have 3 columns.
-//	dtype: The `dtype` of the serialized `SparseTensor` objects.
-func DeserializeManySparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	dense: `R`-D.  The dense Tensor operand.
+//
+// Returns 1-D.  The `N` values that are operated on.
+func SparseDenseCwiseAdd(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "DeserializeManySparse",
+		Type: "SparseDenseCwiseAdd",
 		Input: []tf.Input{
-			serialized_sparse,
+			sp_indices, sp_values, sp_shape, dense,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// StringJoinAttr is an optional argument to StringJoin.
-type StringJoinAttr func(optionalAttr)
-
-// StringJoinSeparator sets the optional separator attribute to value.
+// Read an element from the TensorArray into output `value`.
 //
-// value: string, an optional join separator.
-// If not specified, defaults to ""
-func StringJoinSeparator(value string) StringJoinAttr {
-	return func(m optionalAttr) {
-		m["separator"] = value
-	}
-}
-
-// Joins the strings in the given list of string tensors into one tensor;
+// Arguments:
+//	handle: The handle to a TensorArray.
 //
-// with the given separator (default is an empty separator).
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	dtype: The type of the elem that is returned.
 //
-// Arguments:
-//	inputs: A list of string tensors.  The tensors must all have the same shape,
-// or be scalars.  Scalars may be mixed in; these will be broadcast to the shape
-// of non-scalar inputs.
-func StringJoin(scope *Scope, inputs []tf.Output, optional ...StringJoinAttr) (output tf.Output) {
+// Returns The tensor that is read from the TensorArray.
+func TensorArrayReadV3(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "StringJoin",
+		Type: "TensorArrayReadV3",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			handle, index, flow_in,
 		},
 		Attrs: attrs,
 	}
@@ -12620,131 +14275,139 @@ func StringJoin(scope *Scope, inputs []tf.Output, optional ...StringJoinAttr) (o
 	return op.Output(0)
 }
 
-// Returns immutable tensor from memory region.
-//
-// The current implementation memmaps the tensor from a file.
-//
-// Arguments:
-//	dtype: Type of the returned tensor.
-//	shape: Shape of the returned tensor.
-//	memory_region_name: Name of readonly memory region used by the tensor, see
-// NewReadOnlyMemoryRegionFromFile in tensorflow::Env.
-func ImmutableConst(scope *Scope, dtype tf.DataType, shape tf.Shape, memory_region_name string) (tensor tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype, "shape": shape, "memory_region_name": memory_region_name}
-	opspec := tf.OpSpec{
-		Type: "ImmutableConst",
+// QuantizeV2Attr is an optional argument to QuantizeV2.
+type QuantizeV2Attr func(optionalAttr)
 
-		Attrs: attrs,
+// QuantizeV2Mode sets the optional mode attribute to value.
+// If not specified, defaults to "MIN_COMBINED"
+func QuantizeV2Mode(value string) QuantizeV2Attr {
+	return func(m optionalAttr) {
+		m["mode"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Inverse real-valued fast Fourier transform.
-//
-// Computes the inverse 1-dimensional discrete Fourier transform of a real-valued
-// signal over the inner-most dimension of `input`.
-//
-// The inner-most dimension of `input` is assumed to be the result of `RFFT`: the
-// `fft_length / 2 + 1` unique components of the DFT of a real-valued signal. If
-// `fft_length` is not provided, it is computed from the size of the inner-most
-// dimension of `input` (`fft_length = 2 * (inner - 1)`). If the FFT length used to
-// compute `input` is odd, it should be provided since it cannot be inferred
-// properly.
-//
-// Along the axis `IRFFT` is computed on, if `fft_length / 2 + 1` is smaller
-// than the corresponding dimension of `input`, the dimension is cropped. If it is
-// larger, the dimension is padded with zeros.
-//
-// Arguments:
-//	input: A complex64 tensor.
-//	fft_length: An int32 tensor of shape [1]. The FFT length.
-//
-// Returns A float32 tensor of the same rank as `input`. The inner-most
-//   dimension of `input` is replaced with the `fft_length` samples of its inverse
-//   1D Fourier transform.
-//
-// @compatibility(numpy)
-// Equivalent to np.fft.irfft
-// @end_compatibility
-func IRFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "IRFFT",
-		Input: []tf.Input{
-			input, fft_length,
-		},
+// QuantizeV2RoundMode sets the optional round_mode attribute to value.
+// If not specified, defaults to "HALF_AWAY_FROM_ZERO"
+func QuantizeV2RoundMode(value string) QuantizeV2Attr {
+	return func(m optionalAttr) {
+		m["round_mode"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Concatenates a list of `SparseTensor` along the specified dimension.
+// Quantize the 'input' tensor of type float to 'output' tensor of type 'T'.
 //
-// Concatenation is with respect to the dense versions of these sparse tensors.
-// It is assumed that each input is a `SparseTensor` whose elements are ordered
-// along increasing dimension number.
+// [min_range, max_range] are scalar floats that specify the range for
+// the 'input' data. The 'mode' attribute controls exactly which calculations are
+// used to convert the float values to their quantized equivalents.  The
+// 'round_mode' attribute controls which rounding tie-breaking algorithm is used
+// when rounding float values to their quantized equivalents.
 //
-// All inputs' shapes must match, except for the concat dimension.  The
-// `indices`, `values`, and `shapes` lists must have the same length.
+// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
 //
-// The output shape is identical to the inputs', except along the concat
-// dimension, where it is the sum of the inputs' sizes along that dimension.
+// ```
+// out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
+// if T == qint8, out[i] -= (range(T) + 1) / 2.0
+// ```
+// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
 //
-// The output elements will be resorted to preserve the sort order along
-// increasing dimension number.
+// *MIN_COMBINED Mode Example*
 //
-// This op runs in `O(M log M)` time, where `M` is the total number of non-empty
-// values across all inputs. This is due to the need for an internal sort in
-// order to concatenate efficiently across an arbitrary dimension.
+// Assume the input is type float and has a possible range of [0.0, 6.0] and the
+// output type is quint8 ([0, 255]). The min_range and max_range values should be
+// specified as 0.0 and 6.0. Quantizing from float to quint8 will multiply each
+// value of the input by 255/6 and cast to quint8.
 //
-// For example, if `concat_dim = 1` and the inputs are
+// If the output type was qint8 ([-128, 127]), the operation will additionally
+// subtract each value by 128 prior to casting, so that the range of values aligns
+// with the range of qint8.
 //
-//     sp_inputs[0]: shape = [2, 3]
-//     [0, 2]: "a"
-//     [1, 0]: "b"
-//     [1, 1]: "c"
+// If the mode is 'MIN_FIRST', then this approach is used:
 //
-//     sp_inputs[1]: shape = [2, 4]
-//     [0, 1]: "d"
-//     [0, 2]: "e"
+// ```
+// num_discrete_values = 1 << (# of bits in T)
+// range_adjust = num_discrete_values / (num_discrete_values - 1)
+// range = (range_max - range_min) * range_adjust
+// range_scale = num_discrete_values / range
+// quantized = round(input * range_scale) - round(range_min * range_scale) +
+//   numeric_limits<T>::min()
+// quantized = max(quantized, numeric_limits<T>::min())
+// quantized = min(quantized, numeric_limits<T>::max())
+// ```
 //
-// then the output will be
+// The biggest difference between this and MIN_COMBINED is that the minimum range
+// is rounded first, before it's subtracted from the rounded value. With
+// MIN_COMBINED, a small bias is introduced where repeated iterations of quantizing
+// and dequantizing will introduce a larger and larger error.
 //
-//     shape = [2, 7]
-//     [0, 2]: "a"
-//     [0, 4]: "d"
-//     [0, 5]: "e"
-//     [1, 0]: "b"
-//     [1, 1]: "c"
+// *SCALED mode Example*
 //
-// Graphically this is equivalent to doing
+// `SCALED` mode matches the quantization approach used in
+// `QuantizeAndDequantize{V2|V3}`.
 //
-//     [    a] concat [  d e  ] = [    a   d e  ]
-//     [b c  ]        [       ]   [b c          ]
+// If the mode is `SCALED`, we do not use the full range of the output type,
+// choosing to elide the lowest possible value for symmetry (e.g., output range is
+// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
+// 0.
+//
+// We first find the range of values in our tensor. The
+// range we use is always centered on 0, so we find m such that
+// ```c++
+//   m = max(abs(input_min), abs(input_max))
+// ```
+//
+// Our input tensor range is then `[-m, m]`.
+//
+// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
+// If T is signed, this is
+// ```
+//   num_bits = sizeof(T) * 8
+//   [min_fixed, max_fixed] =
+//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
+// ```
+//
+// Otherwise, if T is unsigned, the fixed-point range is
+// ```
+//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
+// ```
+//
+// From this we compute our scaling factor, s:
+// ```c++
+//   s = (max_fixed - min_fixed) / (2 * m)
+// ```
+//
+// Now we can quantize the elements of our tensor:
+// ```c++
+// result = round(input * s)
+// ```
+//
+// One thing to watch out for is that the operator may choose to adjust the
+// requested minimum and maximum values slightly during the quantization process,
+// so you should always use the output ports as the range for further calculations.
+// For example, if the requested minimum and maximum values are close to equal,
+// they will be separated by a small epsilon value to prevent ill-formed quantized
+// buffers from being created. Otherwise, you can end up with buffers where all the
+// quantized values map to the same float value, which causes problems for
+// operations that have to perform further calculations on them.
 //
 // Arguments:
-//	indices: 2-D.  Indices of each input `SparseTensor`.
-//	values: 1-D.  Non-empty values of each `SparseTensor`.
-//	shapes: 1-D.  Shapes of each `SparseTensor`.
-//	concat_dim: Dimension to concatenate along. Must be in range [-rank, rank),
-// where rank is the number of dimensions in each input `SparseTensor`.
 //
-// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
-func SparseConcat(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, concat_dim int64) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+//	min_range: The minimum scalar value possibly produced for the input.
+//	max_range: The maximum scalar value possibly produced for the input.
+//
+//
+// Returns The quantized data produced from the float input.The actual minimum scalar value used for the output.The actual maximum scalar value used for the output.
+func QuantizeV2(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, T tf.DataType, optional ...QuantizeV2Attr) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"concat_dim": concat_dim}
+	attrs := map[string]interface{}{"T": T}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseConcat",
+		Type: "QuantizeV2",
 		Input: []tf.Input{
-			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes),
+			input, min_range, max_range,
 		},
 		Attrs: attrs,
 	}
@@ -12752,118 +14415,45 @@ func SparseConcat(scope *Scope, indices []tf.Output, values []tf.Output, shapes
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Generates sparse cross from a list of sparse and dense tensors.
-//
-// The op takes two lists, one of 2D `SparseTensor` and one of 2D `Tensor`, each
-// representing features of one feature column. It outputs a 2D `SparseTensor` with
-// the batchwise crosses of these features.
-//
-// For example, if the inputs are
-//
-//     inputs[0]: SparseTensor with shape = [2, 2]
-//     [0, 0]: "a"
-//     [1, 0]: "b"
-//     [1, 1]: "c"
-//
-//     inputs[1]: SparseTensor with shape = [2, 1]
-//     [0, 0]: "d"
-//     [1, 0]: "e"
-//
-//     inputs[2]: Tensor [["f"], ["g"]]
-//
-// then the output will be
-//
-//     shape = [2, 2]
-//     [0, 0]: "a_X_d_X_f"
-//     [1, 0]: "b_X_e_X_g"
-//     [1, 1]: "c_X_e_X_g"
-//
-// if hashed_output=true then the output will be
-//
-//     shape = [2, 2]
-//     [0, 0]: FingerprintCat64(
-//                 Fingerprint64("f"), FingerprintCat64(
-//                     Fingerprint64("d"), Fingerprint64("a")))
-//     [1, 0]: FingerprintCat64(
-//                 Fingerprint64("g"), FingerprintCat64(
-//                     Fingerprint64("e"), Fingerprint64("b")))
-//     [1, 1]: FingerprintCat64(
-//                 Fingerprint64("g"), FingerprintCat64(
-//                     Fingerprint64("e"), Fingerprint64("c")))
-//
-// Arguments:
-//	indices: 2-D.  Indices of each input `SparseTensor`.
-//	values: 1-D.   values of each `SparseTensor`.
-//	shapes: 1-D.   Shapes of each `SparseTensor`.
-//	dense_inputs: 2-D.    Columns represented by dense `Tensor`.
-//	hashed_output: If true, returns the hash of the cross instead of the string.
-// This will allow us avoiding string manipulations.
-//	num_buckets: It is used if hashed_output is true.
-// output = hashed_value%num_buckets if num_buckets > 0 else hashed_value.
-//	hash_key: Specify the hash_key that will be used by the `FingerprintCat64`
-// function to combine the crosses fingerprints.
-//
-//
+// Returns the truth value of (x < y) element-wise.
 //
-// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated or hashed
-// `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
-func SparseCross(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, dense_inputs []tf.Output, hashed_output bool, num_buckets int64, hash_key int64, out_type tf.DataType, internal_type tf.DataType) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+// *NOTE*: `Less` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Less(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"hashed_output": hashed_output, "num_buckets": num_buckets, "hash_key": hash_key, "out_type": out_type, "internal_type": internal_type}
 	opspec := tf.OpSpec{
-		Type: "SparseCross",
+		Type: "Less",
 		Input: []tf.Input{
-			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes), tf.OutputList(dense_inputs),
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// ListDiffAttr is an optional argument to ListDiff.
-type ListDiffAttr func(optionalAttr)
+// QuantizedReluXAttr is an optional argument to QuantizedReluX.
+type QuantizedReluXAttr func(optionalAttr)
 
-// ListDiffOutIdx sets the optional out_idx attribute to value.
-// If not specified, defaults to DT_INT32
-func ListDiffOutIdx(value tf.DataType) ListDiffAttr {
+// QuantizedReluXOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedReluXOutType(value tf.DataType) QuantizedReluXAttr {
 	return func(m optionalAttr) {
-		m["out_idx"] = value
+		m["out_type"] = value
 	}
 }
 
-// Computes the difference between two lists of numbers or strings.
-//
-// Given a list `x` and a list `y`, this operation returns a list `out` that
-// represents all values that are in `x` but not in `y`. The returned list `out`
-// is sorted in the same order that the numbers appear in `x` (duplicates are
-// preserved). This operation also returns a list `idx` that represents the
-// position of each `out` element in `x`. In other words:
-//
-// `out[i] = x[idx[i]] for i in [0, 1, ..., len(out) - 1]`
-//
-// For example, given this input:
-//
-// ```
-// x = [1, 2, 3, 4, 5, 6]
-// y = [1, 3, 5]
-// ```
+// Computes Quantized Rectified Linear X: `min(max(features, 0), max_value)`
 //
-// This operation would return:
+// Arguments:
 //
-// ```
-// out ==> [2, 4, 6]
-// idx ==> [1, 3, 5]
-// ```
 //
-// Arguments:
-//	x: 1-D. Values to keep.
-//	y: 1-D. Values to remove.
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
 //
-// Returns 1-D. Values present in `x` but not in `y`.1-D. Positions of `x` values preserved in `out`.
-func ListDiff(scope *Scope, x tf.Output, y tf.Output, optional ...ListDiffAttr) (out tf.Output, idx tf.Output) {
+// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
+func QuantizedReluX(scope *Scope, features tf.Output, max_value tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluXAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12872,108 +14462,72 @@ func ListDiff(scope *Scope, x tf.Output, y tf.Output, optional ...ListDiffAttr)
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ListDiff",
+		Type: "QuantizedReluX",
 		Input: []tf.Input{
-			x, y,
+			features, max_value, min_features, max_features,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`.
-//
-// This Op does not require `a_indices` be sorted in standard lexicographic order.
-//
-// Arguments:
-//	a_indices: 2-D.  The `indices` of the `SparseTensor`, with shape `[nnz, ndims]`.
-//	a_values: 1-D.  The `values` of the `SparseTensor`, with shape `[nnz]`.
-//	a_shape: 1-D.  The `shape` of the `SparseTensor`, with shape `[ndims]`.
-//	b: `ndims`-D Tensor.  With shape `a_shape`.
-func SparseTensorDenseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseTensorDenseAdd",
-		Input: []tf.Input{
-			a_indices, a_values, a_shape, b,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// SparseToSparseSetOperationAttr is an optional argument to SparseToSparseSetOperation.
-type SparseToSparseSetOperationAttr func(optionalAttr)
+// QuantizedConv2DAttr is an optional argument to QuantizedConv2D.
+type QuantizedConv2DAttr func(optionalAttr)
 
-// SparseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func SparseToSparseSetOperationValidateIndices(value bool) SparseToSparseSetOperationAttr {
+// QuantizedConv2DOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["out_type"] = value
 	}
 }
 
-// Applies set operation along last dimension of 2 `SparseTensor` inputs.
-//
-// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
-//
-// If `validate_indices` is `True`, `SparseToSparseSetOperation` validates the
-// order and range of `set1` and `set2` indices.
+// QuantizedConv2DDilations sets the optional dilations attribute to value.
 //
-// Input `set1` is a `SparseTensor` represented by `set1_indices`, `set1_values`,
-// and `set1_shape`. For `set1` ranked `n`, 1st `n-1` dimensions must be the same
-// as `set2`. Dimension `n` contains values in a set, duplicates are allowed but
-// ignored.
-//
-// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
-// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
-// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
-// ignored.
-//
-// If `validate_indices` is `True`, this op validates the order and range of `set1`
-// and `set2` indices.
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes a 2D convolution given quantized 4D input and filter tensors.
 //
-// Output `result` is a `SparseTensor` represented by `result_indices`,
-// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-// dimension contains the result of `set_operation` applied to the corresponding
-// `[0...n-1]` dimension of `set`.
+// The inputs are quantized tensors where the lowest value represents the real
+// number of the associated minimum, and the highest represents the maximum.
+// This means that you can only interpret the quantized output in the same way, by
+// taking the returned minimum and maximum values into account.
 //
 // Arguments:
-//	set1_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-// order.
-//	set1_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-// order.
-//	set1_shape: 1D `Tensor`, shape of a `SparseTensor`. `set1_shape[0...n-1]` must
-// be the same as `set2_shape[0...n-1]`, `set1_shape[n]` is the
-// max set size across `0...n-1` dimensions.
-//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
-// be the same as `set1_shape[0...n-1]`, `set2_shape[n]` is the
-// max set size across `0...n-1` dimensions.
 //
+//	filter: filter's input_depth dimension must match input's depth dimensions.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	min_filter: The float value that the lowest quantized filter value represents.
+//	max_filter: The float value that the highest quantized filter value represents.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
-// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
-// is the max result set size across all `0...n-1` dimensions.
-func SparseToSparseSetOperation(scope *Scope, set1_indices tf.Output, set1_values tf.Output, set1_shape tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...SparseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedConv2D(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedConv2DAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"set_operation": set_operation}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseToSparseSetOperation",
+		Type: "QuantizedConv2D",
 		Input: []tf.Input{
-			set1_indices, set1_values, set1_shape, set2_indices, set2_values, set2_shape,
+			input, filter, min_input, max_input, min_filter, max_filter,
 		},
 		Attrs: attrs,
 	}
@@ -12981,66 +14535,44 @@ func SparseToSparseSetOperation(scope *Scope, set1_indices tf.Output, set1_value
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes numerical negative value element-wise.
-//
-// I.e., \\(y = -x\\).
-func Neg(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Neg",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// FakeQuantWithMinMaxVarsAttr is an optional argument to FakeQuantWithMinMaxVars.
-type FakeQuantWithMinMaxVarsAttr func(optionalAttr)
-
-// FakeQuantWithMinMaxVarsNumBits sets the optional num_bits attribute to value.
-// If not specified, defaults to 8
-func FakeQuantWithMinMaxVarsNumBits(value int64) FakeQuantWithMinMaxVarsAttr {
-	return func(m optionalAttr) {
-		m["num_bits"] = value
-	}
-}
+// ResourceGatherAttr is an optional argument to ResourceGather.
+type ResourceGatherAttr func(optionalAttr)
 
-// FakeQuantWithMinMaxVarsNarrowRange sets the optional narrow_range attribute to value.
-// If not specified, defaults to false
-func FakeQuantWithMinMaxVarsNarrowRange(value bool) FakeQuantWithMinMaxVarsAttr {
+// ResourceGatherValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func ResourceGatherValidateIndices(value bool) ResourceGatherAttr {
 	return func(m optionalAttr) {
-		m["narrow_range"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// Fake-quantize the 'inputs' tensor of type float via global float scalars `min`
+// Gather slices from the variable pointed to by `resource` according to `indices`.
 //
-// and `max` to 'outputs' tensor of same shape as `inputs`.
+// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
+// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
 //
-// `[min; max]` define the clamping range for the `inputs` data.
-// `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
-// when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
-// then de-quantized and output as floats in `[min; max]` interval.
-// `num_bits` is the bitwidth of the quantization; between 2 and 8, inclusive.
+// ```python
+//     # Scalar indices
+//     output[:, ..., :] = params[indices, :, ... :]
 //
-// This operation has a gradient and thus allows for training `min` and `max`
-// values.
-func FakeQuantWithMinMaxVars(scope *Scope, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsAttr) (outputs tf.Output) {
+//     # Vector indices
+//     output[i, :, ..., :] = params[indices[i], :, ... :]
+//
+//     # Higher rank indices
+//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
+// ```
+func ResourceGather(scope *Scope, resource tf.Output, indices tf.Output, dtype tf.DataType, optional ...ResourceGatherAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FakeQuantWithMinMaxVars",
+		Type: "ResourceGather",
 		Input: []tf.Input{
-			inputs, min, max,
+			resource, indices,
 		},
 		Attrs: attrs,
 	}
@@ -13048,29 +14580,23 @@ func FakeQuantWithMinMaxVars(scope *Scope, inputs tf.Output, min tf.Output, max
 	return op.Output(0)
 }
 
-// Writes a `Summary` protocol buffer with a histogram.
-//
-// The generated
-// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
-// has one summary value containing a histogram for `values`.
+// Delete the TensorArray from its resource container.
 //
-// This op reports an `InvalidArgument` error if any value is not finite.
+// This enables the user to close and release the resource in the middle
+// of a step/run.
 //
 // Arguments:
-//	writer: A handle to a summary writer.
-//	step: The step to write the summary for.
-//	tag: Scalar.  Tag to use for the `Summary.Value`.
-//	values: Any shape. Values to use to build the histogram.
+//	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
 //
 // Returns the created operation.
-func WriteHistogramSummary(scope *Scope, writer tf.Output, step tf.Output, tag tf.Output, values tf.Output) (o *tf.Operation) {
+func TensorArrayCloseV3(scope *Scope, handle tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "WriteHistogramSummary",
+		Type: "TensorArrayCloseV3",
 		Input: []tf.Input{
-			writer, step, tag, values,
+			handle,
 		},
 	}
 	return scope.AddOperation(opspec)
@@ -13375,59 +14901,6 @@ func AllCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, n
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Returns the element-wise min of two SparseTensors.
-//
-// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
-//
-// Arguments:
-//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, in the canonical lexicographic ordering.
-//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
-//	a_shape: 1-D.  Shape of the input SparseTensor.
-//	b_indices: counterpart to `a_indices` for the other operand.
-//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
-//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
-//
-// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
-func SparseSparseMinimum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseSparseMinimum",
-		Input: []tf.Input{
-			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Constructs a tensor by tiling a given tensor.
-//
-// This operation creates a new tensor by replicating `input` `multiples` times.
-// The output tensor's i'th dimension has `input.dims(i) * multiples[i]` elements,
-// and the values of `input` are replicated `multiples[i]` times along the 'i'th
-// dimension. For example, tiling `[a b c d]` by `[2]` produces
-// `[a b c d a b c d]`.
-//
-// Arguments:
-//	input: 1-D or higher.
-//	multiples: 1-D. Length must be the same as the number of dimensions in `input`
-func Tile(scope *Scope, input tf.Output, multiples tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Tile",
-		Input: []tf.Input{
-			input, multiples,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Saves the input tensors to disk.
 //
 // The size of `tensor_names` must match the number of tensors in `data`. `data[i]`
@@ -13476,73 +14949,103 @@ func FloorMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
-// TakeManySparseFromTensorsMapAttr is an optional argument to TakeManySparseFromTensorsMap.
-type TakeManySparseFromTensorsMapAttr func(optionalAttr)
+// SparseTensorDenseMatMulAttr is an optional argument to SparseTensorDenseMatMul.
+type SparseTensorDenseMatMulAttr func(optionalAttr)
 
-// TakeManySparseFromTensorsMapContainer sets the optional container attribute to value.
+// SparseTensorDenseMatMulAdjointA sets the optional adjoint_a attribute to value.
 //
-// value: The container name for the `SparseTensorsMap` read by this op.
-// If not specified, defaults to ""
-func TakeManySparseFromTensorsMapContainer(value string) TakeManySparseFromTensorsMapAttr {
+// value: Use the adjoint of A in the matrix multiply.  If A is complex, this
+// is transpose(conj(A)).  Otherwise it's transpose(A).
+// If not specified, defaults to false
+func SparseTensorDenseMatMulAdjointA(value bool) SparseTensorDenseMatMulAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["adjoint_a"] = value
 	}
 }
 
-// TakeManySparseFromTensorsMapSharedName sets the optional shared_name attribute to value.
+// SparseTensorDenseMatMulAdjointB sets the optional adjoint_b attribute to value.
 //
-// value: The shared name for the `SparseTensorsMap` read by this op.
-// It should not be blank; rather the `shared_name` or unique Operation name
-// of the Op that created the original `SparseTensorsMap` should be used.
-// If not specified, defaults to ""
-func TakeManySparseFromTensorsMapSharedName(value string) TakeManySparseFromTensorsMapAttr {
+// value: Use the adjoint of B in the matrix multiply.  If B is complex, this
+// is transpose(conj(B)).  Otherwise it's transpose(B).
+// If not specified, defaults to false
+func SparseTensorDenseMatMulAdjointB(value bool) SparseTensorDenseMatMulAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["adjoint_b"] = value
 	}
 }
 
-// Read `SparseTensors` from a `SparseTensorsMap` and concatenate them.
-//
-// The input `sparse_handles` must be an `int64` matrix of shape `[N, 1]` where
-// `N` is the minibatch size and the rows correspond to the output handles of
-// `AddSparseToTensorsMap` or `AddManySparseToTensorsMap`.  The ranks of the
-// original `SparseTensor` objects that went into the given input ops must all
-// match.  When the final `SparseTensor` is created, it has rank one
-// higher than the ranks of the incoming `SparseTensor` objects
-// (they have been concatenated along a new row dimension on the left).
-//
-// The output `SparseTensor` object's shape values for all dimensions but the
-// first are the max across the input `SparseTensor` objects' shape values
-// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
-// size.
+// Multiply SparseTensor (of rank 2) "A" by dense matrix "B".
 //
-// The input `SparseTensor` objects' indices are assumed ordered in
-// standard lexicographic order.  If this is not the case, after this
-// step run `SparseReorder` to restore index ordering.
+// No validity checking is performed on the indices of A.  However, the following
+// input format is recommended for optimal behavior:
 //
-// For example, if the handles represent an input, which is a `[2, 3]` matrix
-// representing two original `SparseTensor` objects:
+// if adjoint_a == false:
+//   A should be sorted in lexicographically increasing order.  Use SparseReorder
+//   if you're not sure.
+// if adjoint_a == true:
+//   A should be sorted in order of increasing dimension 1 (i.e., "column major"
+//   order instead of "row major" order).
+//
+// Arguments:
+//	a_indices: 2-D.  The `indices` of the `SparseTensor`, size `[nnz, 2]` Matrix.
+//	a_values: 1-D.  The `values` of the `SparseTensor`, size `[nnz]` Vector.
+//	a_shape: 1-D.  The `shape` of the `SparseTensor`, size `[2]` Vector.
+//	b: 2-D.  A dense Matrix.
+func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output, optional ...SparseTensorDenseMatMulAttr) (product tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseTensorDenseMatMul",
+		Input: []tf.Input{
+			a_indices, a_values, a_shape, b,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Deserialize and concatenate `SparseTensors` from a serialized minibatch.
+//
+// The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where
+// `N` is the minibatch size and the rows correspond to packed outputs of
+// `SerializeSparse`.  The ranks of the original `SparseTensor` objects
+// must all match.  When the final `SparseTensor` is created, it has rank one
+// higher than the ranks of the incoming `SparseTensor` objects
+// (they have been concatenated along a new row dimension).
+//
+// The output `SparseTensor` object's shape values for all dimensions but the
+// first are the max across the input `SparseTensor` objects' shape values
+// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
+// size.
+//
+// The input `SparseTensor` objects' indices are assumed ordered in
+// standard lexicographic order.  If this is not the case, after this
+// step run `SparseReorder` to restore index ordering.
+//
+// For example, if the serialized input is a `[2 x 3]` matrix representing two
+// original `SparseTensor` objects:
 //
-// ```
 //     index = [ 0]
 //             [10]
 //             [20]
 //     values = [1, 2, 3]
 //     shape = [50]
-// ```
 //
 // and
 //
-// ```
 //     index = [ 2]
 //             [10]
 //     values = [4, 5]
 //     shape = [30]
-// ```
 //
-// then the final `SparseTensor` will be:
+// then the final deserialized `SparseTensor` will be:
 //
-// ```
 //     index = [0  0]
 //             [0 10]
 //             [0 20]
@@ -13550,27 +15053,20 @@ func TakeManySparseFromTensorsMapSharedName(value string) TakeManySparseFromTens
 //             [1 10]
 //     values = [1, 2, 3, 4, 5]
 //     shape = [2 50]
-// ```
 //
 // Arguments:
-//	sparse_handles: 1-D, The `N` serialized `SparseTensor` objects.
-// Shape: `[N]`.
-//	dtype: The `dtype` of the `SparseTensor` objects stored in the
-// `SparseTensorsMap`.
-//
-// Returns 2-D.  The `indices` of the minibatch `SparseTensor`.1-D.  The `values` of the minibatch `SparseTensor`.1-D.  The `shape` of the minibatch `SparseTensor`.
-func TakeManySparseFromTensorsMap(scope *Scope, sparse_handles tf.Output, dtype tf.DataType, optional ...TakeManySparseFromTensorsMapAttr) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+//	serialized_sparse: 2-D, The `N` serialized `SparseTensor` objects.
+// Must have 3 columns.
+//	dtype: The `dtype` of the serialized `SparseTensor` objects.
+func DeserializeManySparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "TakeManySparseFromTensorsMap",
+		Type: "DeserializeManySparse",
 		Input: []tf.Input{
-			sparse_handles,
+			serialized_sparse,
 		},
 		Attrs: attrs,
 	}
@@ -13578,502 +15074,597 @@ func TakeManySparseFromTensorsMap(scope *Scope, sparse_handles tf.Output, dtype
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Says whether the targets are in the top `K` predictions.
-//
-// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
-// prediction for the target class is among the top `k` predictions among
-// all predictions for example `i`. Note that the behavior of `InTopK` differs
-// from the `TopK` op in its handling of ties; if multiple classes have the
-// same prediction value and straddle the top-`k` boundary, all of those
-// classes are considered to be in the top `k`.
-//
-// More formally, let
+// StringJoinAttr is an optional argument to StringJoin.
+type StringJoinAttr func(optionalAttr)
+
+// StringJoinSeparator sets the optional separator attribute to value.
 //
-//   \\(predictions_i\\) be the predictions for all classes for example `i`,
-//   \\(targets_i\\) be the target class for example `i`,
-//   \\(out_i\\) be the output for example `i`,
+// value: string, an optional join separator.
+// If not specified, defaults to ""
+func StringJoinSeparator(value string) StringJoinAttr {
+	return func(m optionalAttr) {
+		m["separator"] = value
+	}
+}
+
+// Joins the strings in the given list of string tensors into one tensor;
 //
-// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
+// with the given separator (default is an empty separator).
 //
 // Arguments:
-//	predictions: A `batch_size` x `classes` tensor.
-//	targets: A `batch_size` vector of class ids.
-//	k: Number of top elements to look at for computing precision.
-//
-// Returns Computed precision at `k` as a `bool Tensor`.
-func InTopKV2(scope *Scope, predictions tf.Output, targets tf.Output, k tf.Output) (precision tf.Output) {
+//	inputs: A list of string tensors.  The tensors must all have the same shape,
+// or be scalars.  Scalars may be mixed in; these will be broadcast to the shape
+// of non-scalar inputs.
+func StringJoin(scope *Scope, inputs []tf.Output, optional ...StringJoinAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "InTopKV2",
+		Type: "StringJoin",
 		Input: []tf.Input{
-			predictions, targets, k,
+			tf.OutputList(inputs),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Assigns a new value to a variable.
+// Returns immutable tensor from memory region.
 //
-// Any ReadVariableOp with a control dependency on this op is guaranteed to return
-// this value or a subsequent newer value of the variable.
+// The current implementation memmaps the tensor from a file.
 //
 // Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	value: the value to set the new tensor to use.
-//
-// Returns the created operation.
-func AssignVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
+//	dtype: Type of the returned tensor.
+//	shape: Shape of the returned tensor.
+//	memory_region_name: Name of readonly memory region used by the tensor, see
+// NewReadOnlyMemoryRegionFromFile in tensorflow::Env.
+func ImmutableConst(scope *Scope, dtype tf.DataType, shape tf.Shape, memory_region_name string) (tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape, "memory_region_name": memory_region_name}
 	opspec := tf.OpSpec{
-		Type: "AssignVariableOp",
-		Input: []tf.Input{
-			resource, value,
-		},
+		Type: "ImmutableConst",
+
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns a tensor of ones with the same shape and type as x.
+// Inverse real-valued fast Fourier transform.
+//
+// Computes the inverse 1-dimensional discrete Fourier transform of a real-valued
+// signal over the inner-most dimension of `input`.
+//
+// The inner-most dimension of `input` is assumed to be the result of `RFFT`: the
+// `fft_length / 2 + 1` unique components of the DFT of a real-valued signal. If
+// `fft_length` is not provided, it is computed from the size of the inner-most
+// dimension of `input` (`fft_length = 2 * (inner - 1)`). If the FFT length used to
+// compute `input` is odd, it should be provided since it cannot be inferred
+// properly.
+//
+// Along the axis `IRFFT` is computed on, if `fft_length / 2 + 1` is smaller
+// than the corresponding dimension of `input`, the dimension is cropped. If it is
+// larger, the dimension is padded with zeros.
 //
 // Arguments:
-//	x: a tensor of type T.
+//	input: A complex64 tensor.
+//	fft_length: An int32 tensor of shape [1]. The FFT length.
 //
-// Returns a tensor of the same shape and type as x but filled with ones.
-func OnesLike(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns A float32 tensor of the same rank as `input`. The inner-most
+//   dimension of `input` is replaced with the `fft_length` samples of its inverse
+//   1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.irfft
+// @end_compatibility
+func IRFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "OnesLike",
+		Type: "IRFFT",
 		Input: []tf.Input{
-			x,
+			input, fft_length,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// The gradient of SparseFillEmptyRows.
+// Concatenates a list of `SparseTensor` along the specified dimension.
 //
-// Takes vectors reverse_index_map, shaped `[N]`, and grad_values,
-// shaped `[N_full]`, where `N_full >= N` and copies data into either
-// `d_values` or `d_default_value`.  Here `d_values` is shaped `[N]` and
-// `d_default_value` is a scalar.
+// Concatenation is with respect to the dense versions of these sparse tensors.
+// It is assumed that each input is a `SparseTensor` whose elements are ordered
+// along increasing dimension number.
 //
-//   d_values[j] = grad_values[reverse_index_map[j]]
-//   d_default_value = sum_{k : 0 .. N_full - 1} (
-//      grad_values[k] * 1{k not in reverse_index_map})
+// All inputs' shapes must match, except for the concat dimension.  The
+// `indices`, `values`, and `shapes` lists must have the same length.
+//
+// The output shape is identical to the inputs', except along the concat
+// dimension, where it is the sum of the inputs' sizes along that dimension.
+//
+// The output elements will be resorted to preserve the sort order along
+// increasing dimension number.
+//
+// This op runs in `O(M log M)` time, where `M` is the total number of non-empty
+// values across all inputs. This is due to the need for an internal sort in
+// order to concatenate efficiently across an arbitrary dimension.
+//
+// For example, if `concat_dim = 1` and the inputs are
+//
+//     sp_inputs[0]: shape = [2, 3]
+//     [0, 2]: "a"
+//     [1, 0]: "b"
+//     [1, 1]: "c"
+//
+//     sp_inputs[1]: shape = [2, 4]
+//     [0, 1]: "d"
+//     [0, 2]: "e"
+//
+// then the output will be
+//
+//     shape = [2, 7]
+//     [0, 2]: "a"
+//     [0, 4]: "d"
+//     [0, 5]: "e"
+//     [1, 0]: "b"
+//     [1, 1]: "c"
+//
+// Graphically this is equivalent to doing
+//
+//     [    a] concat [  d e  ] = [    a   d e  ]
+//     [b c  ]        [       ]   [b c          ]
 //
 // Arguments:
-//	reverse_index_map: 1-D.  The reverse index map from SparseFillEmptyRows.
-//	grad_values: 1-D.  The gradients from backprop.
+//	indices: 2-D.  Indices of each input `SparseTensor`.
+//	values: 1-D.  Non-empty values of each `SparseTensor`.
+//	shapes: 1-D.  Shapes of each `SparseTensor`.
+//	concat_dim: Dimension to concatenate along. Must be in range [-rank, rank),
+// where rank is the number of dimensions in each input `SparseTensor`.
 //
-// Returns 1-D.  The backprop into values.0-D.  The backprop into default_value.
-func SparseFillEmptyRowsGrad(scope *Scope, reverse_index_map tf.Output, grad_values tf.Output) (d_values tf.Output, d_default_value tf.Output) {
+// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
+func SparseConcat(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, concat_dim int64) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"concat_dim": concat_dim}
 	opspec := tf.OpSpec{
-		Type: "SparseFillEmptyRowsGrad",
+		Type: "SparseConcat",
 		Input: []tf.Input{
-			reverse_index_map, grad_values,
+			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes scaled exponential linear: `scale * alpha * (exp(features) - 1)`
+// Generates sparse cross from a list of sparse and dense tensors.
 //
-// if < 0, `scale * features` otherwise.
+// The op takes two lists, one of 2D `SparseTensor` and one of 2D `Tensor`, each
+// representing features of one feature column. It outputs a 2D `SparseTensor` with
+// the batchwise crosses of these features.
 //
-// See [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
-func Selu(scope *Scope, features tf.Output) (activations tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Selu",
-		Input: []tf.Input{
-			features,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// SetSizeAttr is an optional argument to SetSize.
-type SetSizeAttr func(optionalAttr)
-
-// SetSizeValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func SetSizeValidateIndices(value bool) SetSizeAttr {
-	return func(m optionalAttr) {
-		m["validate_indices"] = value
-	}
-}
-
-// Number of unique elements along last dimension of input `set`.
+// For example, if the inputs are
 //
-// Input `set` is a `SparseTensor` represented by `set_indices`, `set_values`,
-// and `set_shape`. The last dimension contains values in a set, duplicates are
-// allowed but ignored.
+//     inputs[0]: SparseTensor with shape = [2, 2]
+//     [0, 0]: "a"
+//     [1, 0]: "b"
+//     [1, 1]: "c"
 //
-// If `validate_indices` is `True`, this op validates the order and range of `set`
-// indices.
+//     inputs[1]: SparseTensor with shape = [2, 1]
+//     [0, 0]: "d"
+//     [1, 0]: "e"
+//
+//     inputs[2]: Tensor [["f"], ["g"]]
+//
+// then the output will be
+//
+//     shape = [2, 2]
+//     [0, 0]: "a_X_d_X_f"
+//     [1, 0]: "b_X_e_X_g"
+//     [1, 1]: "c_X_e_X_g"
+//
+// if hashed_output=true then the output will be
+//
+//     shape = [2, 2]
+//     [0, 0]: FingerprintCat64(
+//                 Fingerprint64("f"), FingerprintCat64(
+//                     Fingerprint64("d"), Fingerprint64("a")))
+//     [1, 0]: FingerprintCat64(
+//                 Fingerprint64("g"), FingerprintCat64(
+//                     Fingerprint64("e"), Fingerprint64("b")))
+//     [1, 1]: FingerprintCat64(
+//                 Fingerprint64("g"), FingerprintCat64(
+//                     Fingerprint64("e"), Fingerprint64("c")))
 //
 // Arguments:
-//	set_indices: 2D `Tensor`, indices of a `SparseTensor`.
-//	set_values: 1D `Tensor`, values of a `SparseTensor`.
-//	set_shape: 1D `Tensor`, shape of a `SparseTensor`.
+//	indices: 2-D.  Indices of each input `SparseTensor`.
+//	values: 1-D.   values of each `SparseTensor`.
+//	shapes: 1-D.   Shapes of each `SparseTensor`.
+//	dense_inputs: 2-D.    Columns represented by dense `Tensor`.
+//	hashed_output: If true, returns the hash of the cross instead of the string.
+// This will allow us avoiding string manipulations.
+//	num_buckets: It is used if hashed_output is true.
+// output = hashed_value%num_buckets if num_buckets > 0 else hashed_value.
+//	hash_key: Specify the hash_key that will be used by the `FingerprintCat64`
+// function to combine the crosses fingerprints.
 //
-// Returns For `set` ranked `n`, this is a `Tensor` with rank `n-1`, and the same 1st
-// `n-1` dimensions as `set`. Each value is the number of unique elements in
-// the corresponding `[0...n-1]` dimension of `set`.
-func SetSize(scope *Scope, set_indices tf.Output, set_values tf.Output, set_shape tf.Output, optional ...SetSizeAttr) (size tf.Output) {
+//
+//
+// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated or hashed
+// `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
+func SparseCross(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, dense_inputs []tf.Output, hashed_output bool, num_buckets int64, hash_key int64, out_type tf.DataType, internal_type tf.DataType) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"hashed_output": hashed_output, "num_buckets": num_buckets, "hash_key": hash_key, "out_type": out_type, "internal_type": internal_type}
 	opspec := tf.OpSpec{
-		Type: "SetSize",
+		Type: "SparseCross",
 		Input: []tf.Input{
-			set_indices, set_values, set_shape,
+			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes), tf.OutputList(dense_inputs),
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes the sign and the log of the absolute value of the determinant of
-//
-// one or more square matrices.
-//
-// The input is a tensor of shape `[N, M, M]` whose inner-most 2 dimensions
-// form square matrices. The outputs are two tensors containing the signs and
-// absolute values of the log determinants for all N input submatrices
-// `[..., :, :]` such that the determinant = sign*exp(log_abs_determinant).
-// The log_abs_determinant is computed as det(P)*sum(log(diag(LU))) where LU
-// is the LU decomposition of the input and P is the corresponding
-// permutation matrix.
+// Concatenates quantized tensors along one dimension.
 //
 // Arguments:
-//	input: Shape is `[N, M, M]`.
+//	concat_dim: 0-D.  The dimension along which to concatenate.  Must be in the
+// range [0, rank(values)).
+//	values: The `N` Tensors to concatenate. Their ranks and types must match,
+// and their sizes must match in all dimensions except `concat_dim`.
+//	input_mins: The minimum scalar values for each of the input tensors.
+//	input_maxes: The maximum scalar values for each of the input tensors.
 //
-// Returns The signs of the log determinants of the inputs. Shape is `[N]`.The logs of the absolute values of the determinants
-// of the N input matrices.  Shape is `[N]`.
-func LogMatrixDeterminant(scope *Scope, input tf.Output) (sign tf.Output, log_abs_determinant tf.Output) {
+// Returns A `Tensor` with the concatenation of values stacked along the
+// `concat_dim` dimension.  This tensor's shape matches that of `values` except
+// in `concat_dim` where it has the sum of the sizes.The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
+func QuantizedConcat(scope *Scope, concat_dim tf.Output, values []tf.Output, input_mins []tf.Output, input_maxes []tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LogMatrixDeterminant",
+		Type: "QuantizedConcat",
 		Input: []tf.Input{
-			input,
+			concat_dim, tf.OutputList(values), tf.OutputList(input_mins), tf.OutputList(input_maxes),
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// SumAttr is an optional argument to Sum.
-type SumAttr func(optionalAttr)
-
-// SumKeepDims sets the optional keep_dims attribute to value.
+// Slice a `SparseTensor` based on the `start` and `size`.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SumKeepDims(value bool) SumAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the sum of elements across dimensions of a tensor.
+// For example, if the input is
 //
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+//     input_tensor = shape = [2, 7]
+//     [    a   d e  ]
+//     [b c          ]
+//
+// Graphically the output tensors are:
+//
+//     sparse_slice([0, 0], [2, 4]) = shape = [2, 4]
+//     [    a  ]
+//     [b c    ]
+//
+//     sparse_slice([0, 4], [2, 3]) = shape = [2, 3]
+//     [ d e  ]
+//     [      ]
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
+//	indices: 2-D tensor represents the indices of the sparse tensor.
+//	values: 1-D tensor represents the values of the sparse tensor.
+//	shape: 1-D. tensor represents the shape of the sparse tensor.
+//	start: 1-D. tensor represents the start of the slice.
+//	size: 1-D. tensor represents the size of the slice.
+// output indices: A list of 1-D tensors represents the indices of the output
+// sparse tensors.
 //
-// Returns The reduced tensor.
-func Sum(scope *Scope, input tf.Output, axis tf.Output, optional ...SumAttr) (output tf.Output) {
+// Returns A list of 1-D tensors represents the values of the output sparse
+// tensors.A list of 1-D tensors represents the shape of the output sparse
+// tensors.
+func SparseSlice(scope *Scope, indices tf.Output, values tf.Output, shape tf.Output, start tf.Output, size tf.Output) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Sum",
+		Type: "SparseSlice",
 		Input: []tf.Input{
-			input, axis,
+			indices, values, shape, start, size,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Delete the tensor specified by its handle in the session.
+// Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`.
 //
-// Arguments:
-//	handle: The handle for a tensor stored in the session state.
+// This Op does not require `a_indices` be sorted in standard lexicographic order.
 //
-// Returns the created operation.
-func DeleteSessionTensor(scope *Scope, handle tf.Output) (o *tf.Operation) {
+// Arguments:
+//	a_indices: 2-D.  The `indices` of the `SparseTensor`, with shape `[nnz, ndims]`.
+//	a_values: 1-D.  The `values` of the `SparseTensor`, with shape `[nnz]`.
+//	a_shape: 1-D.  The `shape` of the `SparseTensor`, with shape `[ndims]`.
+//	b: `ndims`-D Tensor.  With shape `a_shape`.
+func SparseTensorDenseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "DeleteSessionTensor",
+		Type: "SparseTensorDenseAdd",
 		Input: []tf.Input{
-			handle,
+			a_indices, a_values, a_shape, b,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// L2 Loss.
-//
-// Computes half the L2 norm of a tensor without the `sqrt`:
+// Returns the set of files matching one or more glob patterns.
 //
-//     output = sum(t ** 2) / 2
+// Note that this routine only supports wildcard characters in the
+// basename portion of the pattern, not in the directory portion.
 //
 // Arguments:
-//	t: Typically 2-D, but may have any dimensions.
+//	pattern: Shell wildcard pattern(s). Scalar or vector of type string.
 //
-// Returns 0-D.
-func L2Loss(scope *Scope, t tf.Output) (output tf.Output) {
+// Returns A vector of matching filenames.
+func MatchingFiles(scope *Scope, pattern tf.Output) (filenames tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "L2Loss",
+		Type: "MatchingFiles",
 		Input: []tf.Input{
-			t,
+			pattern,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DenseToSparseSetOperationAttr is an optional argument to DenseToSparseSetOperation.
-type DenseToSparseSetOperationAttr func(optionalAttr)
+// MatrixSolveLsAttr is an optional argument to MatrixSolveLs.
+type MatrixSolveLsAttr func(optionalAttr)
 
-// DenseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
+// MatrixSolveLsFast sets the optional fast attribute to value.
 // If not specified, defaults to true
-func DenseToSparseSetOperationValidateIndices(value bool) DenseToSparseSetOperationAttr {
+func MatrixSolveLsFast(value bool) MatrixSolveLsAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["fast"] = value
 	}
 }
 
-// Applies set operation along last dimension of `Tensor` and `SparseTensor`.
-//
-// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+// Solves one or more linear least-squares problems.
 //
-// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
-// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
-// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
-// ignored.
+// `matrix` is a tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+// form real or complex matrices of size `[M, N]`. `Rhs` is a tensor of the same
+// type as `matrix` and shape `[..., M, K]`.
+// The output is a tensor shape `[..., N, K]` where each output matrix solves
+// each of the equations
+// `matrix[..., :, :]` * `output[..., :, :]` = `rhs[..., :, :]`
+// in the least squares sense.
 //
-// If `validate_indices` is `True`, this op validates the order and range of `set2`
-// indices.
+// We use the following notation for (complex) matrix and right-hand sides
+// in the batch:
 //
-// Output `result` is a `SparseTensor` represented by `result_indices`,
-// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-// dimension contains the result of `set_operation` applied to the corresponding
-// `[0...n-1]` dimension of `set`.
+// `matrix`=\\(A \in \mathbb{C}^{m \times n}\\),
+// `rhs`=\\(B  \in \mathbb{C}^{m \times k}\\),
+// `output`=\\(X  \in \mathbb{C}^{n \times k}\\),
+// `l2_regularizer`=\\(\lambda \in \mathbb{R}\\).
+//
+// If `fast` is `True`, then the solution is computed by solving the normal
+// equations using Cholesky decomposition. Specifically, if \\(m \ge n\\) then
+// \\(X = (A^H A + \lambda I)^{-1} A^H B\\), which solves the least-squares
+// problem \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||A Z - B||_F^2 +
+// \lambda ||Z||_F^2\\). If \\(m \lt n\\) then `output` is computed as
+// \\(X = A^H (A A^H + \lambda I)^{-1} B\\), which (for \\(\lambda = 0\\)) is the
+// minimum-norm solution to the under-determined linear system, i.e.
+// \\(X = \mathrm{argmin}_{Z \in \mathbb{C}^{n \times k} } ||Z||_F^2 \\),
+// subject to \\(A Z = B\\). Notice that the fast path is only numerically stable
+// when \\(A\\) is numerically full rank and has a condition number
+// \\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or\\(\lambda\\) is
+// sufficiently large.
+//
+// If `fast` is `False` an algorithm based on the numerically robust complete
+// orthogonal decomposition is used. This computes the minimum-norm
+// least-squares solution, even when \\(A\\) is rank deficient. This path is
+// typically 6-7 times slower than the fast path. If `fast` is `False` then
+// `l2_regularizer` is ignored.
 //
 // Arguments:
-//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
-// Dimension `n` contains values in a set, duplicates are allowed but ignored.
-//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
-// be the same as the 1st `n-1` dimensions of `set1`, `result_shape[n]` is the
-// max set size across `n-1` dimensions.
+//	matrix: Shape is `[..., M, N]`.
+//	rhs: Shape is `[..., M, K]`.
+//	l2_regularizer: Scalar tensor.
 //
+// @compatibility(numpy)
+// Equivalent to np.linalg.lstsq
+// @end_compatibility
 //
-// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
-// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
-// is the max result set size across all `0...n-1` dimensions.
-func DenseToSparseSetOperation(scope *Scope, set1 tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...DenseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+// Returns Shape is `[..., N, K]`.
+func MatrixSolveLs(scope *Scope, matrix tf.Output, rhs tf.Output, l2_regularizer tf.Output, optional ...MatrixSolveLsAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"set_operation": set_operation}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DenseToSparseSetOperation",
+		Type: "MatrixSolveLs",
 		Input: []tf.Input{
-			set1, set2_indices, set2_values, set2_shape,
+			matrix, rhs, l2_regularizer,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// FusedResizeAndPadConv2DAttr is an optional argument to FusedResizeAndPadConv2D.
-type FusedResizeAndPadConv2DAttr func(optionalAttr)
-
-// FusedResizeAndPadConv2DResizeAlignCorners sets the optional resize_align_corners attribute to value.
+// Elementwise computes the bitwise OR of `x` and `y`.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1),
-// which exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
-func FusedResizeAndPadConv2DResizeAlignCorners(value bool) FusedResizeAndPadConv2DAttr {
+// The result will have those bits set, that are set in `x`, `y` or both. The
+// computation is performed on the underlying representations of `x` and `y`.
+func BitwiseOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BitwiseOr",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// SparseToSparseSetOperationAttr is an optional argument to SparseToSparseSetOperation.
+type SparseToSparseSetOperationAttr func(optionalAttr)
+
+// SparseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func SparseToSparseSetOperationValidateIndices(value bool) SparseToSparseSetOperationAttr {
 	return func(m optionalAttr) {
-		m["resize_align_corners"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// Performs a resize and padding as a preprocess during a convolution.
+// Applies set operation along last dimension of 2 `SparseTensor` inputs.
 //
-// It's often possible to do spatial transformations more efficiently as part of
-// the packing stage of a convolution, so this op allows for an optimized
-// implementation where these stages are fused together. This prevents the need to
-// write out the intermediate results as whole tensors, reducing memory pressure,
-// and we can get some latency gains by merging the transformation calculations.
-// The data_format attribute for Conv2D isn't supported by this op, and defaults to
-// 'NHWC' order.
-// Internally this op uses a single per-graph scratch buffer, which means that it
-// will block if multiple versions are being run in parallel. This is because this
-// operator is primarily an optimization to minimize memory usage.
+// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+//
+// If `validate_indices` is `True`, `SparseToSparseSetOperation` validates the
+// order and range of `set1` and `set2` indices.
+//
+// Input `set1` is a `SparseTensor` represented by `set1_indices`, `set1_values`,
+// and `set1_shape`. For `set1` ranked `n`, 1st `n-1` dimensions must be the same
+// as `set2`. Dimension `n` contains values in a set, duplicates are allowed but
+// ignored.
+//
+// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
+// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
+// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
+// ignored.
+//
+// If `validate_indices` is `True`, this op validates the order and range of `set1`
+// and `set2` indices.
+//
+// Output `result` is a `SparseTensor` represented by `result_indices`,
+// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
+// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
+// dimension contains the result of `set_operation` applied to the corresponding
+// `[0...n-1]` dimension of `set`.
 //
 // Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
-//	size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
-//	paddings: A two-column matrix specifying the padding sizes. The number of
-// rows must be the same as the rank of `input`.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.
+//	set1_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+// order.
+//	set1_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+// order.
+//	set1_shape: 1D `Tensor`, shape of a `SparseTensor`. `set1_shape[0...n-1]` must
+// be the same as `set2_shape[0...n-1]`, `set1_shape[n]` is the
+// max set size across `0...n-1` dimensions.
+//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
+// be the same as `set1_shape[0...n-1]`, `set2_shape[n]` is the
+// max set size across `0...n-1` dimensions.
 //
-//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
-// of `input`. Must be in the same order as the dimension specified with format.
-//	padding: The type of padding algorithm to use.
-func FusedResizeAndPadConv2D(scope *Scope, input tf.Output, size tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string, optional ...FusedResizeAndPadConv2DAttr) (output tf.Output) {
+//
+// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
+// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
+// is the max result set size across all `0...n-1` dimensions.
+func SparseToSparseSetOperation(scope *Scope, set1_indices tf.Output, set1_values tf.Output, set1_shape tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...SparseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"set_operation": set_operation}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FusedResizeAndPadConv2D",
+		Type: "SparseToSparseSetOperation",
 		Input: []tf.Input{
-			input, size, paddings, filter,
+			set1_indices, set1_values, set1_shape, set2_indices, set2_values, set2_shape,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Subtracts a value from the current value of a variable.
-//
-// Any ReadVariableOp which depends directly or indirectly on this assign is
-// guaranteed to see the incremented value or a subsequent newer one.
-//
-// Outputs the incremented value, which can be used to totally order the
-// increments to this variable.
-//
-// Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	value: the value by which the variable will be incremented.
+// Computes numerical negative value element-wise.
 //
-// Returns the created operation.
-func AssignSubVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
+// I.e., \\(y = -x\\).
+func Neg(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AssignSubVariableOp",
+		Type: "Neg",
 		Input: []tf.Input{
-			resource, value,
+			x,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// RestoreAttr is an optional argument to Restore.
-type RestoreAttr func(optionalAttr)
+// FakeQuantWithMinMaxVarsAttr is an optional argument to FakeQuantWithMinMaxVars.
+type FakeQuantWithMinMaxVarsAttr func(optionalAttr)
 
-// RestorePreferredShard sets the optional preferred_shard attribute to value.
-//
-// value: Index of file to open first if multiple files match
-// `file_pattern`.
-// If not specified, defaults to -1
-func RestorePreferredShard(value int64) RestoreAttr {
+// FakeQuantWithMinMaxVarsNumBits sets the optional num_bits attribute to value.
+// If not specified, defaults to 8
+func FakeQuantWithMinMaxVarsNumBits(value int64) FakeQuantWithMinMaxVarsAttr {
 	return func(m optionalAttr) {
-		m["preferred_shard"] = value
+		m["num_bits"] = value
 	}
 }
 
-// Restores a tensor from checkpoint files.
-//
-// Reads a tensor stored in one or several files. If there are several files (for
-// instance because a tensor was saved as slices), `file_pattern` may contain
-// wildcard symbols (`*` and `?`) in the filename portion only, not in the
-// directory portion.
-//
-// If a `file_pattern` matches several files, `preferred_shard` can be used to hint
-// in which file the requested tensor is likely to be found. This op will first
-// open the file at index `preferred_shard` in the list of matching files and try
-// to restore tensors from that file.  Only if some tensors or tensor slices are
-// not found in that first file, then the Op opens all the files. Setting
-// `preferred_shard` to match the value passed as the `shard` input
-// of a matching `Save` Op may speed up Restore.  This attribute only affects
-// performance, not correctness.  The default value -1 means files are processed in
-// order.
+// FakeQuantWithMinMaxVarsNarrowRange sets the optional narrow_range attribute to value.
+// If not specified, defaults to false
+func FakeQuantWithMinMaxVarsNarrowRange(value bool) FakeQuantWithMinMaxVarsAttr {
+	return func(m optionalAttr) {
+		m["narrow_range"] = value
+	}
+}
+
+// Fake-quantize the 'inputs' tensor of type float via global float scalars `min`
 //
-// See also `RestoreSlice`.
+// and `max` to 'outputs' tensor of same shape as `inputs`.
 //
-// Arguments:
-//	file_pattern: Must have a single element. The pattern of the files from
-// which we read the tensor.
-//	tensor_name: Must have a single element. The name of the tensor to be
-// restored.
-//	dt: The type of the tensor to be restored.
+// `[min; max]` define the clamping range for the `inputs` data.
+// `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
+// when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
+// then de-quantized and output as floats in `[min; max]` interval.
+// `num_bits` is the bitwidth of the quantization; between 2 and 8, inclusive.
 //
-// Returns The restored tensor.
-func Restore(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, dt tf.DataType, optional ...RestoreAttr) (tensor tf.Output) {
+// This operation has a gradient and thus allows for training `min` and `max`
+// values.
+func FakeQuantWithMinMaxVars(scope *Scope, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsAttr) (outputs tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dt": dt}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Restore",
+		Type: "FakeQuantWithMinMaxVars",
 		Input: []tf.Input{
-			file_pattern, tensor_name,
+			inputs, min, max,
 		},
 		Attrs: attrs,
 	}
@@ -14081,293 +15672,340 @@ func Restore(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, dt tf.
 	return op.Output(0)
 }
 
-// QuantizedResizeBilinearAttr is an optional argument to QuantizedResizeBilinear.
-type QuantizedResizeBilinearAttr func(optionalAttr)
-
-// QuantizedResizeBilinearAlignCorners sets the optional align_corners attribute to value.
+// Returns the element-wise min of two SparseTensors.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
-func QuantizedResizeBilinearAlignCorners(value bool) QuantizedResizeBilinearAttr {
-	return func(m optionalAttr) {
-		m["align_corners"] = value
-	}
-}
-
-// Resize quantized `images` to `size` using quantized bilinear interpolation.
-//
-// Input images and output images must be quantized types.
+// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
 //
 // Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
-//
-//
+//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, in the canonical lexicographic ordering.
+//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
+//	a_shape: 1-D.  Shape of the input SparseTensor.
+//	b_indices: counterpart to `a_indices` for the other operand.
+//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
+//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func QuantizedResizeBilinear(scope *Scope, images tf.Output, size tf.Output, min tf.Output, max tf.Output, optional ...QuantizedResizeBilinearAttr) (resized_images tf.Output, out_min tf.Output, out_max tf.Output) {
+// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
+func SparseSparseMinimum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedResizeBilinear",
+		Type: "SparseSparseMinimum",
 		Input: []tf.Input{
-			images, size, min, max,
+			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0), op.Output(1)
 }
 
-// Computes the minimum along segments of a tensor.
+// TakeManySparseFromTensorsMapAttr is an optional argument to TakeManySparseFromTensorsMap.
+type TakeManySparseFromTensorsMapAttr func(optionalAttr)
+
+// TakeManySparseFromTensorsMapContainer sets the optional container attribute to value.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
+// value: The container name for the `SparseTensorsMap` read by this op.
+// If not specified, defaults to ""
+func TakeManySparseFromTensorsMapContainer(value string) TakeManySparseFromTensorsMapAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// TakeManySparseFromTensorsMapSharedName sets the optional shared_name attribute to value.
 //
-// Computes a tensor such that
-// \\(output_i = \min_j(data_j)\\) where `min` is over `j` such
-// that `segment_ids[j] == i`.
+// value: The shared name for the `SparseTensorsMap` read by this op.
+// It should not be blank; rather the `shared_name` or unique Operation name
+// of the Op that created the original `SparseTensorsMap` should be used.
+// If not specified, defaults to ""
+func TakeManySparseFromTensorsMapSharedName(value string) TakeManySparseFromTensorsMapAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Read `SparseTensors` from a `SparseTensorsMap` and concatenate them.
 //
-// If the min is empty for a given segment ID `i`, `output[i] = 0`.
+// The input `sparse_handles` must be an `int64` matrix of shape `[N, 1]` where
+// `N` is the minibatch size and the rows correspond to the output handles of
+// `AddSparseToTensorsMap` or `AddManySparseToTensorsMap`.  The ranks of the
+// original `SparseTensor` objects that went into the given input ops must all
+// match.  When the final `SparseTensor` is created, it has rank one
+// higher than the ranks of the incoming `SparseTensor` objects
+// (they have been concatenated along a new row dimension on the left).
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMin.png" alt>
-// </div>
+// The output `SparseTensor` object's shape values for all dimensions but the
+// first are the max across the input `SparseTensor` objects' shape values
+// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
+// size.
 //
-// Arguments:
+// The input `SparseTensor` objects' indices are assumed ordered in
+// standard lexicographic order.  If this is not the case, after this
+// step run `SparseReorder` to restore index ordering.
 //
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
+// For example, if the handles represent an input, which is a `[2, 3]` matrix
+// representing two original `SparseTensor` objects:
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMin(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// ```
+//     index = [ 0]
+//             [10]
+//             [20]
+//     values = [1, 2, 3]
+//     shape = [50]
+// ```
+//
+// and
+//
+// ```
+//     index = [ 2]
+//             [10]
+//     values = [4, 5]
+//     shape = [30]
+// ```
+//
+// then the final `SparseTensor` will be:
+//
+// ```
+//     index = [0  0]
+//             [0 10]
+//             [0 20]
+//             [1  2]
+//             [1 10]
+//     values = [1, 2, 3, 4, 5]
+//     shape = [2 50]
+// ```
+//
+// Arguments:
+//	sparse_handles: 1-D, The `N` serialized `SparseTensor` objects.
+// Shape: `[N]`.
+//	dtype: The `dtype` of the `SparseTensor` objects stored in the
+// `SparseTensorsMap`.
+//
+// Returns 2-D.  The `indices` of the minibatch `SparseTensor`.1-D.  The `values` of the minibatch `SparseTensor`.1-D.  The `shape` of the minibatch `SparseTensor`.
+func TakeManySparseFromTensorsMap(scope *Scope, sparse_handles tf.Output, dtype tf.DataType, optional ...TakeManySparseFromTensorsMapAttr) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SegmentMin",
+		Type: "TakeManySparseFromTensorsMap",
 		Input: []tf.Input{
-			data, segment_ids,
+			sparse_handles,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// SdcaOptimizerAttr is an optional argument to SdcaOptimizer.
-type SdcaOptimizerAttr func(optionalAttr)
+// MaxPoolAttr is an optional argument to MaxPool.
+type MaxPoolAttr func(optionalAttr)
 
-// SdcaOptimizerAdaptative sets the optional adaptative attribute to value.
+// MaxPoolDataFormat sets the optional data_format attribute to value.
 //
-// value: Whether to use Adapative SDCA for the inner loop.
-// If not specified, defaults to false
-func SdcaOptimizerAdaptative(value bool) SdcaOptimizerAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolDataFormat(value string) MaxPoolAttr {
 	return func(m optionalAttr) {
-		m["adaptative"] = value
+		m["data_format"] = value
 	}
 }
 
-// Distributed version of Stochastic Dual Coordinate Ascent (SDCA) optimizer for
-//
-// linear models with L1 + L2 regularization. As global optimization objective is
-// strongly-convex, the optimizer optimizes the dual objective at each step. The
-// optimizer applies each update one example at a time. Examples are sampled
-// uniformly, and the optimizer is learning rate free and enjoys linear convergence
-// rate.
-//
-// [Proximal Stochastic Dual Coordinate Ascent](http://arxiv.org/pdf/1211.2717v1.pdf).<br>
-// Shai Shalev-Shwartz, Tong Zhang. 2012
-//
-// $$Loss Objective = \sum f_{i} (wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|$$
-//
-// [Adding vs. Averaging in Distributed Primal-Dual Optimization](http://arxiv.org/abs/1502.03508).<br>
-// Chenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan,
-// Peter Richtarik, Martin Takac. 2015
-//
-// [Stochastic Dual Coordinate Ascent with Adaptive Probabilities](https://arxiv.org/abs/1502.08053).<br>
-// Dominik Csiba, Zheng Qu, Peter Richtarik. 2015
+// Performs max pooling on the input.
 //
 // Arguments:
-//	sparse_example_indices: a list of vectors which contain example indices.
-//	sparse_feature_indices: a list of vectors which contain feature indices.
-//	sparse_feature_values: a list of vectors which contains feature value
-// associated with each feature group.
-//	dense_features: a list of matrices which contains the dense feature values.
-//	example_weights: a vector which contains the weight associated with each
-// example.
-//	example_labels: a vector which contains the label/target associated with each
-// example.
-//	sparse_indices: a list of vectors where each value is the indices which has
-// corresponding weights in sparse_weights. This field maybe omitted for the
-// dense approach.
-//	sparse_weights: a list of vectors where each value is the weight associated with
-// a sparse feature group.
-//	dense_weights: a list of vectors where the values are the weights associated
-// with a dense feature group.
-//	example_state_data: a list of vectors containing the example state data.
-//	loss_type: Type of the primal loss. Currently SdcaSolver supports logistic,
-// squared and hinge losses.
-//	l1: Symmetric l1 regularization strength.
-//	l2: Symmetric l2 regularization strength.
-//	num_loss_partitions: Number of partitions of the global loss function.
-//	num_inner_iterations: Number of iterations per mini-batch.
+//	input: 4-D input to pool over.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns a list of vectors containing the updated example state
-// data.a list of vectors where each value is the delta
-// weights associated with a sparse feature group.a list of vectors where the values are the delta
-// weights associated with a dense feature group.
-func SdcaOptimizer(scope *Scope, sparse_example_indices []tf.Output, sparse_feature_indices []tf.Output, sparse_feature_values []tf.Output, dense_features []tf.Output, example_weights tf.Output, example_labels tf.Output, sparse_indices []tf.Output, sparse_weights []tf.Output, dense_weights []tf.Output, example_state_data tf.Output, loss_type string, l1 float32, l2 float32, num_loss_partitions int64, num_inner_iterations int64, optional ...SdcaOptimizerAttr) (out_example_state_data tf.Output, out_delta_sparse_weights []tf.Output, out_delta_dense_weights []tf.Output) {
+// Returns The max pooled output tensor.
+func MaxPool(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"loss_type": loss_type, "l1": l1, "l2": l2, "num_loss_partitions": num_loss_partitions, "num_inner_iterations": num_inner_iterations}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SdcaOptimizer",
+		Type: "MaxPool",
 		Input: []tf.Input{
-			tf.OutputList(sparse_example_indices), tf.OutputList(sparse_feature_indices), tf.OutputList(sparse_feature_values), tf.OutputList(dense_features), example_weights, example_labels, tf.OutputList(sparse_indices), tf.OutputList(sparse_weights), tf.OutputList(dense_weights), example_state_data,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Says whether the targets are in the top `K` predictions.
+//
+// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
+// prediction for the target class is among the top `k` predictions among
+// all predictions for example `i`. Note that the behavior of `InTopK` differs
+// from the `TopK` op in its handling of ties; if multiple classes have the
+// same prediction value and straddle the top-`k` boundary, all of those
+// classes are considered to be in the top `k`.
+//
+// More formally, let
+//
+//   \\(predictions_i\\) be the predictions for all classes for example `i`,
+//   \\(targets_i\\) be the target class for example `i`,
+//   \\(out_i\\) be the output for example `i`,
+//
+// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
+//
+// Arguments:
+//	predictions: A `batch_size` x `classes` tensor.
+//	targets: A `batch_size` vector of class ids.
+//	k: Number of top elements to look at for computing precision.
+//
+// Returns Computed precision at `k` as a `bool Tensor`.
+func InTopKV2(scope *Scope, predictions tf.Output, targets tf.Output, k tf.Output) (precision tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	out_example_state_data = op.Output(idx)
-	if out_delta_sparse_weights, idx, err = makeOutputList(op, idx, "out_delta_sparse_weights"); err != nil {
-		scope.UpdateErr("SdcaOptimizer", err)
-		return
+	opspec := tf.OpSpec{
+		Type: "InTopKV2",
+		Input: []tf.Input{
+			predictions, targets, k,
+		},
 	}
-	if out_delta_dense_weights, idx, err = makeOutputList(op, idx, "out_delta_dense_weights"); err != nil {
-		scope.UpdateErr("SdcaOptimizer", err)
-		return
-	}
-	return out_example_state_data, out_delta_sparse_weights, out_delta_dense_weights
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// SparseMatMulAttr is an optional argument to SparseMatMul.
-type SparseMatMulAttr func(optionalAttr)
-
-// SparseMatMulTransposeA sets the optional transpose_a attribute to value.
-// If not specified, defaults to false
-func SparseMatMulTransposeA(value bool) SparseMatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_a"] = value
+// Assigns a new value to a variable.
+//
+// Any ReadVariableOp with a control dependency on this op is guaranteed to return
+// this value or a subsequent newer value of the variable.
+//
+// Arguments:
+//	resource: handle to the resource in which to store the variable.
+//	value: the value to set the new tensor to use.
+//
+// Returns the created operation.
+func AssignVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// SparseMatMulTransposeB sets the optional transpose_b attribute to value.
-// If not specified, defaults to false
-func SparseMatMulTransposeB(value bool) SparseMatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_b"] = value
+	opspec := tf.OpSpec{
+		Type: "AssignVariableOp",
+		Input: []tf.Input{
+			resource, value,
+		},
 	}
+	return scope.AddOperation(opspec)
 }
 
-// SparseMatMulAIsSparse sets the optional a_is_sparse attribute to value.
-// If not specified, defaults to false
-func SparseMatMulAIsSparse(value bool) SparseMatMulAttr {
-	return func(m optionalAttr) {
-		m["a_is_sparse"] = value
+// Returns a tensor of ones with the same shape and type as x.
+//
+// Arguments:
+//	x: a tensor of type T.
+//
+// Returns a tensor of the same shape and type as x but filled with ones.
+func OnesLike(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// SparseMatMulBIsSparse sets the optional b_is_sparse attribute to value.
-// If not specified, defaults to false
-func SparseMatMulBIsSparse(value bool) SparseMatMulAttr {
-	return func(m optionalAttr) {
-		m["b_is_sparse"] = value
+	opspec := tf.OpSpec{
+		Type: "OnesLike",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Multiply matrix "a" by matrix "b".
+// The gradient of SparseFillEmptyRows.
 //
-// The inputs must be two-dimensional matrices and the inner dimension of "a" must
-// match the outer dimension of "b". This op is optimized for the case where at
-// least one of "a" or "b" is sparse. The breakeven for using this versus a dense
-// matrix multiply on one platform was 30% zero values in the sparse matrix.
+// Takes vectors reverse_index_map, shaped `[N]`, and grad_values,
+// shaped `[N_full]`, where `N_full >= N` and copies data into either
+// `d_values` or `d_default_value`.  Here `d_values` is shaped `[N]` and
+// `d_default_value` is a scalar.
 //
-// The gradient computation of this operation will only take advantage of sparsity
-// in the input gradient when that gradient comes from a Relu.
-func SparseMatMul(scope *Scope, a tf.Output, b tf.Output, optional ...SparseMatMulAttr) (product tf.Output) {
+//   d_values[j] = grad_values[reverse_index_map[j]]
+//   d_default_value = sum_{k : 0 .. N_full - 1} (
+//      grad_values[k] * 1{k not in reverse_index_map})
+//
+// Arguments:
+//	reverse_index_map: 1-D.  The reverse index map from SparseFillEmptyRows.
+//	grad_values: 1-D.  The gradients from backprop.
+//
+// Returns 1-D.  The backprop into values.0-D.  The backprop into default_value.
+func SparseFillEmptyRowsGrad(scope *Scope, reverse_index_map tf.Output, grad_values tf.Output) (d_values tf.Output, d_default_value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "SparseMatMul",
+		Type: "SparseFillEmptyRowsGrad",
 		Input: []tf.Input{
-			a, b,
+			reverse_index_map, grad_values,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Computes the power of one value to another.
+// Computes scaled exponential linear: `scale * alpha * (exp(features) - 1)`
 //
-// Given a tensor `x` and a tensor `y`, this operation computes \\(x^y\\) for
-// corresponding elements in `x` and `y`. For example:
+// if < 0, `scale * features` otherwise.
 //
-// ```
-// # tensor 'x' is [[2, 2]], [3, 3]]
-// # tensor 'y' is [[8, 16], [2, 3]]
-// tf.pow(x, y) ==> [[256, 65536], [9, 27]]
-// ```
-func Pow(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// See [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
+func Selu(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Pow",
+		Type: "Selu",
 		Input: []tf.Input{
-			x, y,
+			features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ShapeAttr is an optional argument to Shape.
-type ShapeAttr func(optionalAttr)
+// SetSizeAttr is an optional argument to SetSize.
+type SetSizeAttr func(optionalAttr)
 
-// ShapeOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_INT32
-func ShapeOutType(value tf.DataType) ShapeAttr {
+// SetSizeValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func SetSizeValidateIndices(value bool) SetSizeAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// Returns the shape of a tensor.
+// Number of unique elements along last dimension of input `set`.
 //
-// This operation returns a 1-D integer tensor representing the shape of `input`.
+// Input `set` is a `SparseTensor` represented by `set_indices`, `set_values`,
+// and `set_shape`. The last dimension contains values in a set, duplicates are
+// allowed but ignored.
 //
-// For example:
+// If `validate_indices` is `True`, this op validates the order and range of `set`
+// indices.
 //
-// ```
-// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
-// shape(t) ==> [2, 2, 3]
-// ```
-func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Output) {
+// Arguments:
+//	set_indices: 2D `Tensor`, indices of a `SparseTensor`.
+//	set_values: 1D `Tensor`, values of a `SparseTensor`.
+//	set_shape: 1D `Tensor`, shape of a `SparseTensor`.
+//
+// Returns For `set` ranked `n`, this is a `Tensor` with rank `n-1`, and the same 1st
+// `n-1` dimensions as `set`. Each value is the number of unique elements in
+// the corresponding `[0...n-1]` dimension of `set`.
+func SetSize(scope *Scope, set_indices tf.Output, set_values tf.Output, set_shape tf.Output, optional ...SetSizeAttr) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14376,9 +16014,9 @@ func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Outp
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Shape",
+		Type: "SetSize",
 		Input: []tf.Input{
-			input,
+			set_indices, set_values, set_shape,
 		},
 		Attrs: attrs,
 	}
@@ -14386,82 +16024,64 @@ func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Outp
 	return op.Output(0)
 }
 
-// Computes fingerprints of the input strings.
+// Computes the sign and the log of the absolute value of the determinant of
+//
+// one or more square matrices.
+//
+// The input is a tensor of shape `[N, M, M]` whose inner-most 2 dimensions
+// form square matrices. The outputs are two tensors containing the signs and
+// absolute values of the log determinants for all N input submatrices
+// `[..., :, :]` such that the determinant = sign*exp(log_abs_determinant).
+// The log_abs_determinant is computed as det(P)*sum(log(diag(LU))) where LU
+// is the LU decomposition of the input and P is the corresponding
+// permutation matrix.
 //
 // Arguments:
-//	input: vector of strings to compute fingerprints on.
+//	input: Shape is `[N, M, M]`.
 //
-// Returns a (N,2) shaped matrix where N is the number of elements in the input
-// vector. Each row contains the low and high parts of the fingerprint.
-func SdcaFprint(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns The signs of the log determinants of the inputs. Shape is `[N]`.The logs of the absolute values of the determinants
+// of the N input matrices.  Shape is `[N]`.
+func LogMatrixDeterminant(scope *Scope, input tf.Output) (sign tf.Output, log_abs_determinant tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SdcaFprint",
+		Type: "LogMatrixDeterminant",
 		Input: []tf.Input{
 			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// RandomPoissonV2Attr is an optional argument to RandomPoissonV2.
-type RandomPoissonV2Attr func(optionalAttr)
-
-// RandomPoissonV2Seed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomPoissonV2Seed(value int64) RandomPoissonV2Attr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
+// SumAttr is an optional argument to Sum.
+type SumAttr func(optionalAttr)
 
-// RandomPoissonV2Seed2 sets the optional seed2 attribute to value.
+// SumKeepDims sets the optional keep_dims attribute to value.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomPoissonV2Seed2(value int64) RandomPoissonV2Attr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// RandomPoissonV2Dtype sets the optional dtype attribute to value.
-// If not specified, defaults to DT_INT64
-func RandomPoissonV2Dtype(value tf.DataType) RandomPoissonV2Attr {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SumKeepDims(value bool) SumAttr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Outputs random values from the Poisson distribution(s) described by rate.
-//
-// This op uses two algorithms, depending on rate. If rate >= 10, then
-// the algorithm by Hormann is used to acquire samples via
-// transformation-rejection.
-// See http://www.sciencedirect.com/science/article/pii/0167668793909974.
+// Computes the sum of elements across dimensions of a tensor.
 //
-// Otherwise, Knuth's algorithm is used to acquire samples via multiplying uniform
-// random variables.
-// See Donald E. Knuth (1969). Seminumerical Algorithms. The Art of Computer
-// Programming, Volume 2. Addison Wesley
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	shape: 1-D integer tensor. Shape of independent samples to draw from each
-// distribution described by the shape parameters given in rate.
-//	rate: A tensor in which each scalar is a "rate" parameter describing the
-// associated poisson distribution.
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-// Returns A tensor with shape `shape + shape(rate)`. Each slice
-// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
-// `rate[i0, i1, ...iN]`.
-func RandomPoissonV2(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonV2Attr) (output tf.Output) {
+// Returns The reduced tensor.
+func Sum(scope *Scope, input tf.Output, axis tf.Output, optional ...SumAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14470,9 +16090,9 @@ func RandomPoissonV2(scope *Scope, shape tf.Output, rate tf.Output, optional ...
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomPoissonV2",
+		Type: "Sum",
 		Input: []tf.Input{
-			shape, rate,
+			input, axis,
 		},
 		Attrs: attrs,
 	}
@@ -14480,109 +16100,191 @@ func RandomPoissonV2(scope *Scope, shape tf.Output, rate tf.Output, optional ...
 	return op.Output(0)
 }
 
-// MatrixTriangularSolveAttr is an optional argument to MatrixTriangularSolve.
-type MatrixTriangularSolveAttr func(optionalAttr)
-
-// MatrixTriangularSolveLower sets the optional lower attribute to value.
+// Delete the tensor specified by its handle in the session.
 //
-// value: Boolean indicating whether the innermost matrices in `matrix` are
-// lower or upper triangular.
-// If not specified, defaults to true
-func MatrixTriangularSolveLower(value bool) MatrixTriangularSolveAttr {
-	return func(m optionalAttr) {
-		m["lower"] = value
+// Arguments:
+//	handle: The handle for a tensor stored in the session state.
+//
+// Returns the created operation.
+func DeleteSessionTensor(scope *Scope, handle tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "DeleteSessionTensor",
+		Input: []tf.Input{
+			handle,
+		},
 	}
+	return scope.AddOperation(opspec)
 }
 
-// MatrixTriangularSolveAdjoint sets the optional adjoint attribute to value.
+// L2 Loss.
 //
-// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
-//          adjoint.
+// Computes half the L2 norm of a tensor without the `sqrt`:
 //
-// @compatibility(numpy)
-// Equivalent to np.linalg.triangular_solve
-// @end_compatibility
-// If not specified, defaults to false
-func MatrixTriangularSolveAdjoint(value bool) MatrixTriangularSolveAttr {
+//     output = sum(t ** 2) / 2
+//
+// Arguments:
+//	t: Typically 2-D, but may have any dimensions.
+//
+// Returns 0-D.
+func L2Loss(scope *Scope, t tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "L2Loss",
+		Input: []tf.Input{
+			t,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// DenseToSparseSetOperationAttr is an optional argument to DenseToSparseSetOperation.
+type DenseToSparseSetOperationAttr func(optionalAttr)
+
+// DenseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func DenseToSparseSetOperationValidateIndices(value bool) DenseToSparseSetOperationAttr {
 	return func(m optionalAttr) {
-		m["adjoint"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// Solves systems of linear equations with upper or lower triangular matrices by
+// Applies set operation along last dimension of `Tensor` and `SparseTensor`.
 //
-// backsubstitution.
+// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
 //
-// `matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form
-// square matrices. If `lower` is `True` then the strictly upper triangular part
-// of each inner-most matrix is assumed to be zero and not accessed.
-// If `lower` is False then the strictly lower triangular part of each inner-most
-// matrix is assumed to be zero and not accessed.
-// `rhs` is a tensor of shape `[..., M, K]`.
+// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
+// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
+// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
+// ignored.
 //
-// The output is a tensor of shape `[..., M, K]`. If `adjoint` is
-// `True` then the innermost matrices in `output` satisfy matrix equations
-// `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
-// If `adjoint` is `False` then the strictly then the  innermost matrices in
-// `output` satisfy matrix equations
-// `adjoint(matrix[..., i, k]) * output[..., k, j] = rhs[..., i, j]`.
+// If `validate_indices` is `True`, this op validates the order and range of `set2`
+// indices.
+//
+// Output `result` is a `SparseTensor` represented by `result_indices`,
+// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
+// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
+// dimension contains the result of `set_operation` applied to the corresponding
+// `[0...n-1]` dimension of `set`.
 //
 // Arguments:
-//	matrix: Shape is `[..., M, M]`.
-//	rhs: Shape is `[..., M, K]`.
+//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
+// Dimension `n` contains values in a set, duplicates are allowed but ignored.
+//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
+// be the same as the 1st `n-1` dimensions of `set1`, `result_shape[n]` is the
+// max set size across `n-1` dimensions.
 //
-// Returns Shape is `[..., M, K]`.
-func MatrixTriangularSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixTriangularSolveAttr) (output tf.Output) {
+//
+// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
+// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
+// is the max result set size across all `0...n-1` dimensions.
+func DenseToSparseSetOperation(scope *Scope, set1 tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...DenseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"set_operation": set_operation}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixTriangularSolve",
+		Type: "DenseToSparseSetOperation",
 		Input: []tf.Input{
-			matrix, rhs,
+			set1, set2_indices, set2_values, set2_shape,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes inverse hyperbolic sine of x element-wise.
-func Asinh(scope *Scope, x tf.Output) (y tf.Output) {
+// Subtracts a value from the current value of a variable.
+//
+// Any ReadVariableOp which depends directly or indirectly on this assign is
+// guaranteed to see the incremented value or a subsequent newer one.
+//
+// Outputs the incremented value, which can be used to totally order the
+// increments to this variable.
+//
+// Arguments:
+//	resource: handle to the resource in which to store the variable.
+//	value: the value by which the variable will be incremented.
+//
+// Returns the created operation.
+func AssignSubVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Asinh",
+		Type: "AssignSubVariableOp",
 		Input: []tf.Input{
-			x,
+			resource, value,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Creates a dataset with a range of values. Corresponds to python's xrange.
+// RestoreAttr is an optional argument to Restore.
+type RestoreAttr func(optionalAttr)
+
+// RestorePreferredShard sets the optional preferred_shard attribute to value.
 //
-// Arguments:
-//	start: corresponds to start in python's xrange().
-//	stop: corresponds to stop in python's xrange().
-//	step: corresponds to step in python's xrange().
+// value: Index of file to open first if multiple files match
+// `file_pattern`.
+// If not specified, defaults to -1
+func RestorePreferredShard(value int64) RestoreAttr {
+	return func(m optionalAttr) {
+		m["preferred_shard"] = value
+	}
+}
+
+// Restores a tensor from checkpoint files.
 //
+// Reads a tensor stored in one or several files. If there are several files (for
+// instance because a tensor was saved as slices), `file_pattern` may contain
+// wildcard symbols (`*` and `?`) in the filename portion only, not in the
+// directory portion.
 //
-func RangeDataset(scope *Scope, start tf.Output, stop tf.Output, step tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// If a `file_pattern` matches several files, `preferred_shard` can be used to hint
+// in which file the requested tensor is likely to be found. This op will first
+// open the file at index `preferred_shard` in the list of matching files and try
+// to restore tensors from that file.  Only if some tensors or tensor slices are
+// not found in that first file, then the Op opens all the files. Setting
+// `preferred_shard` to match the value passed as the `shard` input
+// of a matching `Save` Op may speed up Restore.  This attribute only affects
+// performance, not correctness.  The default value -1 means files are processed in
+// order.
+//
+// See also `RestoreSlice`.
+//
+// Arguments:
+//	file_pattern: Must have a single element. The pattern of the files from
+// which we read the tensor.
+//	tensor_name: Must have a single element. The name of the tensor to be
+// restored.
+//	dt: The type of the tensor to be restored.
+//
+// Returns The restored tensor.
+func Restore(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, dt tf.DataType, optional ...RestoreAttr) (tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"dt": dt}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "RangeDataset",
+		Type: "Restore",
 		Input: []tf.Input{
-			start, stop, step,
+			file_pattern, tensor_name,
 		},
 		Attrs: attrs,
 	}
@@ -14590,212 +16292,229 @@ func RangeDataset(scope *Scope, start tf.Output, stop tf.Output, step tf.Output,
 	return op.Output(0)
 }
 
-// DepthwiseConv2dNativeBackpropInputAttr is an optional argument to DepthwiseConv2dNativeBackpropInput.
-type DepthwiseConv2dNativeBackpropInputAttr func(optionalAttr)
+// QuantizedResizeBilinearAttr is an optional argument to QuantizedResizeBilinear.
+type QuantizedResizeBilinearAttr func(optionalAttr)
 
-// DepthwiseConv2dNativeBackpropInputDataFormat sets the optional data_format attribute to value.
+// QuantizedResizeBilinearAlignCorners sets the optional align_corners attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dNativeBackpropInputAttr {
+// value: If true, rescale input by (new_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of images and resized images. If false, rescale
+// by new_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
+func QuantizedResizeBilinearAlignCorners(value bool) QuantizedResizeBilinearAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["align_corners"] = value
 	}
 }
 
-// DepthwiseConv2dNativeBackpropInputDilations sets the optional dilations attribute to value.
+// Resize quantized `images` to `size` using quantized bilinear interpolation.
 //
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
-// element on that dimension. The dimension order is determined by the value of
-// `data_format`, see above for details. Dilations in the batch and depth
-// dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes the gradients of depthwise convolution with respect to the input.
+// Input images and output images must be quantized types.
 //
 // Arguments:
-//	input_sizes: An integer vector representing the shape of `input`, based
-// on `data_format`.  For example, if `data_format` is 'NHWC' then
-//  `input` is a 4-D `[batch, height, width, channels]` tensor.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, depthwise_multiplier]`.
-//	out_backprop: 4-D with shape  based on `data_format`.
-// For example, if `data_format` is 'NHWC' then
-// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution.
-//	padding: The type of padding algorithm to use.
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-// Returns 4-D with shape according to `data_format`.  For example, if
-// `data_format` is 'NHWC', output shape is `[batch, in_height,
-// in_width, in_channels]`.  Gradient w.r.t. the input of the
-// convolution.
-func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropInputAttr) (output tf.Output) {
+//
+//
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func QuantizedResizeBilinear(scope *Scope, images tf.Output, size tf.Output, min tf.Output, max tf.Output, optional ...QuantizedResizeBilinearAttr) (resized_images tf.Output, out_min tf.Output, out_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DepthwiseConv2dNativeBackpropInput",
+		Type: "QuantizedResizeBilinear",
 		Input: []tf.Input{
-			input_sizes, filter, out_backprop,
+			images, size, min, max,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Adds sparse updates to the variable referenced by `resource`.
+// Computes the minimum along segments of a tensor.
 //
-// This operation computes
-//
-//     # Scalar indices
-//     ref[indices, ...] += updates[...]
-//
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] += updates[i, ...]
-//
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] += updates[i, ..., j, ...]
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
 //
-// Duplicate entries are handled correctly: if multiple `indices` reference
-// the same location, their contributions add.
+// Computes a tensor such that
+// \\(output_i = \min_j(data_j)\\) where `min` is over `j` such
+// that `segment_ids[j] == i`.
 //
-// Requires `updates.shape = indices.shape + ref.shape[1:]`.
+// If the min is empty for a given segment ID `i`, `output[i] = 0`.
 //
 // <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMin.png" alt>
 // </div>
 //
 // Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
 //
-// Returns the created operation.
-func ResourceScatterAdd(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentMin(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterAdd",
+		Type: "SegmentMin",
 		Input: []tf.Input{
-			resource, indices, updates,
+			data, segment_ids,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes the gradient for the inverse of `x` wrt its input.
+// SdcaOptimizerAttr is an optional argument to SdcaOptimizer.
+type SdcaOptimizerAttr func(optionalAttr)
+
+// SdcaOptimizerAdaptative sets the optional adaptative attribute to value.
 //
-// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
-// is the corresponding input gradient.
-func ReciprocalGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ReciprocalGrad",
-		Input: []tf.Input{
-			y, dy,
-		},
+// value: Whether to use Adapative SDCA for the inner loop.
+// If not specified, defaults to false
+func SdcaOptimizerAdaptative(value bool) SdcaOptimizerAttr {
+	return func(m optionalAttr) {
+		m["adaptative"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Returns the min of x and y (i.e. x < y ? x : y) element-wise.
+// Distributed version of Stochastic Dual Coordinate Ascent (SDCA) optimizer for
 //
-// *NOTE*: `Minimum` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Minimum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// linear models with L1 + L2 regularization. As global optimization objective is
+// strongly-convex, the optimizer optimizes the dual objective at each step. The
+// optimizer applies each update one example at a time. Examples are sampled
+// uniformly, and the optimizer is learning rate free and enjoys linear convergence
+// rate.
+//
+// [Proximal Stochastic Dual Coordinate Ascent](http://arxiv.org/pdf/1211.2717v1.pdf).<br>
+// Shai Shalev-Shwartz, Tong Zhang. 2012
+//
+// $$Loss Objective = \sum f_{i} (wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|$$
+//
+// [Adding vs. Averaging in Distributed Primal-Dual Optimization](http://arxiv.org/abs/1502.03508).<br>
+// Chenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan,
+// Peter Richtarik, Martin Takac. 2015
+//
+// [Stochastic Dual Coordinate Ascent with Adaptive Probabilities](https://arxiv.org/abs/1502.08053).<br>
+// Dominik Csiba, Zheng Qu, Peter Richtarik. 2015
+//
+// Arguments:
+//	sparse_example_indices: a list of vectors which contain example indices.
+//	sparse_feature_indices: a list of vectors which contain feature indices.
+//	sparse_feature_values: a list of vectors which contains feature value
+// associated with each feature group.
+//	dense_features: a list of matrices which contains the dense feature values.
+//	example_weights: a vector which contains the weight associated with each
+// example.
+//	example_labels: a vector which contains the label/target associated with each
+// example.
+//	sparse_indices: a list of vectors where each value is the indices which has
+// corresponding weights in sparse_weights. This field maybe omitted for the
+// dense approach.
+//	sparse_weights: a list of vectors where each value is the weight associated with
+// a sparse feature group.
+//	dense_weights: a list of vectors where the values are the weights associated
+// with a dense feature group.
+//	example_state_data: a list of vectors containing the example state data.
+//	loss_type: Type of the primal loss. Currently SdcaSolver supports logistic,
+// squared and hinge losses.
+//	l1: Symmetric l1 regularization strength.
+//	l2: Symmetric l2 regularization strength.
+//	num_loss_partitions: Number of partitions of the global loss function.
+//	num_inner_iterations: Number of iterations per mini-batch.
+//
+// Returns a list of vectors containing the updated example state
+// data.a list of vectors where each value is the delta
+// weights associated with a sparse feature group.a list of vectors where the values are the delta
+// weights associated with a dense feature group.
+func SdcaOptimizer(scope *Scope, sparse_example_indices []tf.Output, sparse_feature_indices []tf.Output, sparse_feature_values []tf.Output, dense_features []tf.Output, example_weights tf.Output, example_labels tf.Output, sparse_indices []tf.Output, sparse_weights []tf.Output, dense_weights []tf.Output, example_state_data tf.Output, loss_type string, l1 float32, l2 float32, num_loss_partitions int64, num_inner_iterations int64, optional ...SdcaOptimizerAttr) (out_example_state_data tf.Output, out_delta_sparse_weights []tf.Output, out_delta_dense_weights []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"loss_type": loss_type, "l1": l1, "l2": l2, "num_loss_partitions": num_loss_partitions, "num_inner_iterations": num_inner_iterations}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Minimum",
+		Type: "SdcaOptimizer",
 		Input: []tf.Input{
-			x, y,
+			tf.OutputList(sparse_example_indices), tf.OutputList(sparse_feature_indices), tf.OutputList(sparse_feature_values), tf.OutputList(dense_features), example_weights, example_labels, tf.OutputList(sparse_indices), tf.OutputList(sparse_weights), tf.OutputList(dense_weights), example_state_data,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	out_example_state_data = op.Output(idx)
+	if out_delta_sparse_weights, idx, err = makeOutputList(op, idx, "out_delta_sparse_weights"); err != nil {
+		scope.UpdateErr("SdcaOptimizer", err)
+		return
+	}
+	if out_delta_dense_weights, idx, err = makeOutputList(op, idx, "out_delta_dense_weights"); err != nil {
+		scope.UpdateErr("SdcaOptimizer", err)
+		return
+	}
+	return out_example_state_data, out_delta_sparse_weights, out_delta_dense_weights
 }
 
-// MfccAttr is an optional argument to Mfcc.
-type MfccAttr func(optionalAttr)
+// SparseMatMulAttr is an optional argument to SparseMatMul.
+type SparseMatMulAttr func(optionalAttr)
 
-// MfccUpperFrequencyLimit sets the optional upper_frequency_limit attribute to value.
-//
-// value: The highest frequency to use when calculating the
-// ceptstrum.
-// If not specified, defaults to 4000
-func MfccUpperFrequencyLimit(value float32) MfccAttr {
+// SparseMatMulTransposeA sets the optional transpose_a attribute to value.
+// If not specified, defaults to false
+func SparseMatMulTransposeA(value bool) SparseMatMulAttr {
 	return func(m optionalAttr) {
-		m["upper_frequency_limit"] = value
+		m["transpose_a"] = value
 	}
 }
 
-// MfccLowerFrequencyLimit sets the optional lower_frequency_limit attribute to value.
-//
-// value: The lowest frequency to use when calculating the
-// ceptstrum.
-// If not specified, defaults to 20
-func MfccLowerFrequencyLimit(value float32) MfccAttr {
+// SparseMatMulTransposeB sets the optional transpose_b attribute to value.
+// If not specified, defaults to false
+func SparseMatMulTransposeB(value bool) SparseMatMulAttr {
 	return func(m optionalAttr) {
-		m["lower_frequency_limit"] = value
+		m["transpose_b"] = value
 	}
 }
 
-// MfccFilterbankChannelCount sets the optional filterbank_channel_count attribute to value.
-//
-// value: Resolution of the Mel bank used internally.
-// If not specified, defaults to 40
-func MfccFilterbankChannelCount(value int64) MfccAttr {
+// SparseMatMulAIsSparse sets the optional a_is_sparse attribute to value.
+// If not specified, defaults to false
+func SparseMatMulAIsSparse(value bool) SparseMatMulAttr {
 	return func(m optionalAttr) {
-		m["filterbank_channel_count"] = value
+		m["a_is_sparse"] = value
 	}
 }
 
-// MfccDctCoefficientCount sets the optional dct_coefficient_count attribute to value.
-//
-// value: How many output channels to produce per time slice.
-// If not specified, defaults to 13
-func MfccDctCoefficientCount(value int64) MfccAttr {
+// SparseMatMulBIsSparse sets the optional b_is_sparse attribute to value.
+// If not specified, defaults to false
+func SparseMatMulBIsSparse(value bool) SparseMatMulAttr {
 	return func(m optionalAttr) {
-		m["dct_coefficient_count"] = value
+		m["b_is_sparse"] = value
 	}
 }
 
-// Transforms a spectrogram into a form that's useful for speech recognition.
+// Multiply matrix "a" by matrix "b".
 //
-// Mel Frequency Cepstral Coefficients are a way of representing audio data that's
-// been effective as an input feature for machine learning. They are created by
-// taking the spectrum of a spectrogram (a 'cepstrum'), and discarding some of the
-// higher frequencies that are less significant to the human ear. They have a long
-// history in the speech recognition world, and https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
-// is a good resource to learn more.
+// The inputs must be two-dimensional matrices and the inner dimension of "a" must
+// match the outer dimension of "b". This op is optimized for the case where at
+// least one of "a" or "b" is sparse. The breakeven for using this versus a dense
+// matrix multiply on one platform was 30% zero values in the sparse matrix.
 //
-// Arguments:
-//	spectrogram: Typically produced by the Spectrogram op, with magnitude_squared
-// set to true.
-//	sample_rate: How many samples per second the source audio used.
-func Mfcc(scope *Scope, spectrogram tf.Output, sample_rate tf.Output, optional ...MfccAttr) (output tf.Output) {
+// The gradient computation of this operation will only take advantage of sparsity
+// in the input gradient when that gradient comes from a Relu.
+func SparseMatMul(scope *Scope, a tf.Output, b tf.Output, optional ...SparseMatMulAttr) (product tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14804,9 +16523,9 @@ func Mfcc(scope *Scope, spectrogram tf.Output, sample_rate tf.Output, optional .
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Mfcc",
+		Type: "SparseMatMul",
 		Input: []tf.Input{
-			spectrogram, sample_rate,
+			a, b,
 		},
 		Attrs: attrs,
 	}
@@ -14814,125 +16533,146 @@ func Mfcc(scope *Scope, spectrogram tf.Output, sample_rate tf.Output, optional .
 	return op.Output(0)
 }
 
-// Returns the element-wise sum of a list of tensors.
-//
-// `tf.accumulate_n_v2` performs the same operation as `tf.add_n`, but does not
-// wait for all of its inputs to be ready before beginning to sum. This can
-// save memory if inputs are ready at different times, since minimum temporary
-// storage is proportional to the output size rather than the inputs size.
-//
-// Unlike the original `accumulate_n`, `accumulate_n_v2` is differentiable.
+// Computes the power of one value to another.
 //
-// Returns a `Tensor` of same shape and type as the elements of `inputs`.
+// Given a tensor `x` and a tensor `y`, this operation computes \\(x^y\\) for
+// corresponding elements in `x` and `y`. For example:
 //
-// Arguments:
-//	inputs: A list of `Tensor` objects, each with same shape and type.
-//	shape: Shape of elements of `inputs`.
-func AccumulateNV2(scope *Scope, inputs []tf.Output, shape tf.Shape) (sum tf.Output) {
+// ```
+// # tensor 'x' is [[2, 2]], [3, 3]]
+// # tensor 'y' is [[8, 16], [2, 3]]
+// tf.pow(x, y) ==> [[256, 65536], [9, 27]]
+// ```
+func Pow(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"shape": shape}
 	opspec := tf.OpSpec{
-		Type: "AccumulateNV2",
+		Type: "Pow",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Convert the quantized 'input' tensor into a lower-precision 'output', using the
+// ShapeAttr is an optional argument to Shape.
+type ShapeAttr func(optionalAttr)
+
+// ShapeOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_INT32
+func ShapeOutType(value tf.DataType) ShapeAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Returns the shape of a tensor.
 //
-// actual distribution of the values to maximize the usage of the lower bit depth
-// and adjusting the output min and max ranges accordingly.
+// This operation returns a 1-D integer tensor representing the shape of `input`.
 //
-// [input_min, input_max] are scalar floats that specify the range for the float
-// interpretation of the 'input' data. For example, if input_min is -1.0f and
-// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
-// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
-//
-// This operator tries to squeeze as much precision as possible into an output with
-// a lower bit depth by calculating the actual min and max values found in the
-// data. For example, maybe that quint16 input has no values lower than 16,384 and
-// none higher than 49,152. That means only half the range is actually needed, all
-// the float interpretations are between -0.5f and 0.5f, so if we want to compress
-// the data into a quint8 output, we can use that range rather than the theoretical
-// -1.0f to 1.0f that is suggested by the input min and max.
+// For example:
 //
-// In practice, this is most useful for taking output from operations like
-// QuantizedMatMul that can produce higher bit-depth outputs than their inputs and
-// may have large potential output ranges, but in practice have a distribution of
-// input values that only uses a small fraction of the possible range. By feeding
-// that output into this operator, we can reduce it from 32 bits down to 8 with
-// minimal loss of accuracy.
+// ```
+// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
+// shape(t) ==> [2, 2, 3]
+// ```
+func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Shape",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes fingerprints of the input strings.
 //
 // Arguments:
+//	input: vector of strings to compute fingerprints on.
 //
-//	input_min: The float value that the minimum quantized input value represents.
-//	input_max: The float value that the maximum quantized input value represents.
-//	out_type: The type of the output. Should be a lower bit depth than Tinput.
-//
-// Returns The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
-func QuantizeDownAndShrinkRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+// Returns a (N,2) shaped matrix where N is the number of elements in the input
+// vector. Each row contains the low and high parts of the fingerprint.
+func SdcaFprint(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "QuantizeDownAndShrinkRange",
+		Type: "SdcaFprint",
 		Input: []tf.Input{
-			input, input_min, input_max,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// RandomGammaAttr is an optional argument to RandomGamma.
-type RandomGammaAttr func(optionalAttr)
+// RandomPoissonV2Attr is an optional argument to RandomPoissonV2.
+type RandomPoissonV2Attr func(optionalAttr)
 
-// RandomGammaSeed sets the optional seed attribute to value.
+// RandomPoissonV2Seed sets the optional seed attribute to value.
 //
 // value: If either `seed` or `seed2` are set to be non-zero, the random number
 // generator is seeded by the given seed.  Otherwise, it is seeded by a
 // random seed.
 // If not specified, defaults to 0
-func RandomGammaSeed(value int64) RandomGammaAttr {
+func RandomPoissonV2Seed(value int64) RandomPoissonV2Attr {
 	return func(m optionalAttr) {
 		m["seed"] = value
 	}
 }
 
-// RandomGammaSeed2 sets the optional seed2 attribute to value.
+// RandomPoissonV2Seed2 sets the optional seed2 attribute to value.
 //
 // value: A second seed to avoid seed collision.
 // If not specified, defaults to 0
-func RandomGammaSeed2(value int64) RandomGammaAttr {
+func RandomPoissonV2Seed2(value int64) RandomPoissonV2Attr {
 	return func(m optionalAttr) {
 		m["seed2"] = value
 	}
 }
 
-// Outputs random values from the Gamma distribution(s) described by alpha.
+// RandomPoissonV2Dtype sets the optional dtype attribute to value.
+// If not specified, defaults to DT_INT64
+func RandomPoissonV2Dtype(value tf.DataType) RandomPoissonV2Attr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
+	}
+}
+
+// Outputs random values from the Poisson distribution(s) described by rate.
 //
-// This op uses the algorithm by Marsaglia et al. to acquire samples via
-// transformation-rejection from pairs of uniform and normal random variables.
-// See http://dl.acm.org/citation.cfm?id=358414
+// This op uses two algorithms, depending on rate. If rate >= 10, then
+// the algorithm by Hormann is used to acquire samples via
+// transformation-rejection.
+// See http://www.sciencedirect.com/science/article/pii/0167668793909974.
+//
+// Otherwise, Knuth's algorithm is used to acquire samples via multiplying uniform
+// random variables.
+// See Donald E. Knuth (1969). Seminumerical Algorithms. The Art of Computer
+// Programming, Volume 2. Addison Wesley
 //
 // Arguments:
 //	shape: 1-D integer tensor. Shape of independent samples to draw from each
-// distribution described by the shape parameters given in alpha.
-//	alpha: A tensor in which each scalar is a "shape" parameter describing the
-// associated gamma distribution.
+// distribution described by the shape parameters given in rate.
+//	rate: A tensor in which each scalar is a "rate" parameter describing the
+// associated poisson distribution.
 //
-// Returns A tensor with shape `shape + shape(alpha)`. Each slice
+// Returns A tensor with shape `shape + shape(rate)`. Each slice
 // `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
-// `alpha[i0, i1, ...iN]`. The dtype of the output matches the dtype of alpha.
-func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...RandomGammaAttr) (output tf.Output) {
+// `rate[i0, i1, ...iN]`.
+func RandomPoissonV2(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14941,9 +16681,9 @@ func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...Ran
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomGamma",
+		Type: "RandomPoissonV2",
 		Input: []tf.Input{
-			shape, alpha,
+			shape, rate,
 		},
 		Attrs: attrs,
 	}
@@ -14951,189 +16691,179 @@ func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...Ran
 	return op.Output(0)
 }
 
-// QuantizedConv2DAttr is an optional argument to QuantizedConv2D.
-type QuantizedConv2DAttr func(optionalAttr)
+// MatrixTriangularSolveAttr is an optional argument to MatrixTriangularSolve.
+type MatrixTriangularSolveAttr func(optionalAttr)
 
-// QuantizedConv2DOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr {
+// MatrixTriangularSolveLower sets the optional lower attribute to value.
+//
+// value: Boolean indicating whether the innermost matrices in `matrix` are
+// lower or upper triangular.
+// If not specified, defaults to true
+func MatrixTriangularSolveLower(value bool) MatrixTriangularSolveAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["lower"] = value
 	}
 }
 
-// QuantizedConv2DDilations sets the optional dilations attribute to value.
+// MatrixTriangularSolveAdjoint sets the optional adjoint attribute to value.
 //
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr {
+// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
+//          adjoint.
+//
+// @compatibility(numpy)
+// Equivalent to np.linalg.triangular_solve
+// @end_compatibility
+// If not specified, defaults to false
+func MatrixTriangularSolveAdjoint(value bool) MatrixTriangularSolveAttr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["adjoint"] = value
 	}
 }
 
-// Computes a 2D convolution given quantized 4D input and filter tensors.
+// Solves systems of linear equations with upper or lower triangular matrices by
 //
-// The inputs are quantized tensors where the lowest value represents the real
-// number of the associated minimum, and the highest represents the maximum.
-// This means that you can only interpret the quantized output in the same way, by
-// taking the returned minimum and maximum values into account.
+// backsubstitution.
 //
-// Arguments:
+// `matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form
+// square matrices. If `lower` is `True` then the strictly upper triangular part
+// of each inner-most matrix is assumed to be zero and not accessed.
+// If `lower` is False then the strictly lower triangular part of each inner-most
+// matrix is assumed to be zero and not accessed.
+// `rhs` is a tensor of shape `[..., M, K]`.
 //
-//	filter: filter's input_depth dimension must match input's depth dimensions.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	min_filter: The float value that the lowest quantized filter value represents.
-//	max_filter: The float value that the highest quantized filter value represents.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor.
-//	padding: The type of padding algorithm to use.
+// The output is a tensor of shape `[..., M, K]`. If `adjoint` is
+// `True` then the innermost matrices in `output` satisfy matrix equations
+// `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
+// If `adjoint` is `False` then the strictly then the  innermost matrices in
+// `output` satisfy matrix equations
+// `adjoint(matrix[..., i, k]) * output[..., k, j] = rhs[..., i, j]`.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedConv2D(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedConv2DAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+// Arguments:
+//	matrix: Shape is `[..., M, M]`.
+//	rhs: Shape is `[..., M, K]`.
+//
+// Returns Shape is `[..., M, K]`.
+func MatrixTriangularSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixTriangularSolveAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedConv2D",
+		Type: "MatrixTriangularSolve",
 		Input: []tf.Input{
-			input, filter, min_input, max_input, min_filter, max_filter,
+			matrix, rhs,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// ResourceGatherAttr is an optional argument to ResourceGather.
-type ResourceGatherAttr func(optionalAttr)
-
-// ResourceGatherValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func ResourceGatherValidateIndices(value bool) ResourceGatherAttr {
-	return func(m optionalAttr) {
-		m["validate_indices"] = value
-	}
+	return op.Output(0)
 }
 
-// Gather slices from the variable pointed to by `resource` according to `indices`.
-//
-// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
-// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
-//
-// ```python
-//     # Scalar indices
-//     output[:, ..., :] = params[indices, :, ... :]
-//
-//     # Vector indices
-//     output[i, :, ..., :] = params[indices[i], :, ... :]
-//
-//     # Higher rank indices
-//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
-// ```
-func ResourceGather(scope *Scope, resource tf.Output, indices tf.Output, dtype tf.DataType, optional ...ResourceGatherAttr) (output tf.Output) {
+// Computes inverse hyperbolic sine of x element-wise.
+func Asinh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceGather",
+		Type: "Asinh",
 		Input: []tf.Input{
-			resource, indices,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Delete the TensorArray from its resource container.
-//
-// This enables the user to close and release the resource in the middle
-// of a step/run.
+// Creates a dataset with a range of values. Corresponds to python's xrange.
 //
 // Arguments:
-//	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
+//	start: corresponds to start in python's xrange().
+//	stop: corresponds to stop in python's xrange().
+//	step: corresponds to step in python's xrange().
 //
-// Returns the created operation.
-func TensorArrayCloseV3(scope *Scope, handle tf.Output) (o *tf.Operation) {
+//
+func RangeDataset(scope *Scope, start tf.Output, stop tf.Output, step tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayCloseV3",
+		Type: "RangeDataset",
 		Input: []tf.Input{
-			handle,
+			start, stop, step,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// RandomUniformIntAttr is an optional argument to RandomUniformInt.
-type RandomUniformIntAttr func(optionalAttr)
+// DepthwiseConv2dNativeBackpropInputAttr is an optional argument to DepthwiseConv2dNativeBackpropInput.
+type DepthwiseConv2dNativeBackpropInputAttr func(optionalAttr)
 
-// RandomUniformIntSeed sets the optional seed attribute to value.
+// DepthwiseConv2dNativeBackpropInputDataFormat sets the optional data_format attribute to value.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomUniformIntSeed(value int64) RandomUniformIntAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
+// If not specified, defaults to "NHWC"
+func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dNativeBackpropInputAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["data_format"] = value
 	}
 }
 
-// RandomUniformIntSeed2 sets the optional seed2 attribute to value.
+// DepthwiseConv2dNativeBackpropInputDilations sets the optional dilations attribute to value.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomUniformIntSeed2(value int64) RandomUniformIntAttr {
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["dilations"] = value
 	}
 }
 
-// Outputs random integers from a uniform distribution.
-//
-// The generated values are uniform integers in the range `[minval, maxval)`.
-// The lower bound `minval` is included in the range, while the upper bound
-// `maxval` is excluded.
-//
-// The random integers are slightly biased unless `maxval - minval` is an exact
-// power of two.  The bias is small for values of `maxval - minval` significantly
-// smaller than the range of the output (either `2^32` or `2^64`).
+// Computes the gradients of depthwise convolution with respect to the input.
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	minval: 0-D.  Inclusive lower bound on the generated integers.
-//	maxval: 0-D.  Exclusive upper bound on the generated integers.
+//	input_sizes: An integer vector representing the shape of `input`, based
+// on `data_format`.  For example, if `data_format` is 'NHWC' then
+//  `input` is a 4-D `[batch, height, width, channels]` tensor.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, depthwise_multiplier]`.
+//	out_backprop: 4-D with shape  based on `data_format`.
+// For example, if `data_format` is 'NHWC' then
+// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution.
+//	padding: The type of padding algorithm to use.
 //
-// Returns A tensor of the specified shape filled with uniform random integers.
-func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf.Output, optional ...RandomUniformIntAttr) (output tf.Output) {
+// Returns 4-D with shape according to `data_format`.  For example, if
+// `data_format` is 'NHWC', output shape is `[batch, in_height,
+// in_width, in_channels]`.  Gradient w.r.t. the input of the
+// convolution.
+func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropInputAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomUniformInt",
+		Type: "DepthwiseConv2dNativeBackpropInput",
 		Input: []tf.Input{
-			shape, minval, maxval,
+			input_sizes, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -15141,98 +16871,79 @@ func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf
 	return op.Output(0)
 }
 
-// SkipgramAttr is an optional argument to Skipgram.
-type SkipgramAttr func(optionalAttr)
-
-// SkipgramWindowSize sets the optional window_size attribute to value.
+// Adds sparse updates to the variable referenced by `resource`.
 //
-// value: The number of words to predict to the left and right of the target.
-// If not specified, defaults to 5
-func SkipgramWindowSize(value int64) SkipgramAttr {
-	return func(m optionalAttr) {
-		m["window_size"] = value
-	}
-}
-
-// SkipgramMinCount sets the optional min_count attribute to value.
+// This operation computes
 //
-// value: The minimum number of word occurrences for it to be included in the
-// vocabulary.
-// If not specified, defaults to 5
-func SkipgramMinCount(value int64) SkipgramAttr {
-	return func(m optionalAttr) {
-		m["min_count"] = value
-	}
-}
-
-// SkipgramSubsample sets the optional subsample attribute to value.
+//     # Scalar indices
+//     ref[indices, ...] += updates[...]
 //
-// value: Threshold for word occurrence. Words that appear with higher
-// frequency will be randomly down-sampled. Set to 0 to disable.
-// If not specified, defaults to 0.001
-func SkipgramSubsample(value float32) SkipgramAttr {
-	return func(m optionalAttr) {
-		m["subsample"] = value
-	}
-}
-
-// Parses a text file and creates a batch of examples.
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] += updates[i, ...]
 //
-// DEPRECATED at GraphDef version 19: Moving word2vec into tensorflow_models/tutorials and deprecating its ops here as a result
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] += updates[i, ..., j, ...]
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions add.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
 //
 // Arguments:
-//	filename: The corpus's text file name.
-//	batch_size: The size of produced batch.
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
 //
-// Returns A vector of words in the corpus.Frequencies of words. Sorted in the non-ascending order.Number of words per epoch in the data file.The current epoch number.The total number of words processed so far.A vector of word ids.A vector of word ids.
-func Skipgram(scope *Scope, filename string, batch_size int64, optional ...SkipgramAttr) (vocab_word tf.Output, vocab_freq tf.Output, words_per_epoch tf.Output, current_epoch tf.Output, total_words_processed tf.Output, examples tf.Output, labels tf.Output) {
+// Returns the created operation.
+func ResourceScatterAdd(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"filename": filename, "batch_size": batch_size}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Skipgram",
-
-		Attrs: attrs,
+		Type: "ResourceScatterAdd",
+		Input: []tf.Input{
+			resource, indices, updates,
+		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4), op.Output(5), op.Output(6)
+	return scope.AddOperation(opspec)
 }
 
-// StringToNumberAttr is an optional argument to StringToNumber.
-type StringToNumberAttr func(optionalAttr)
-
-// StringToNumberOutType sets the optional out_type attribute to value.
+// Says whether the targets are in the top `K` predictions.
 //
-// value: The numeric type to interpret each string in `string_tensor` as.
-// If not specified, defaults to DT_FLOAT
-func StringToNumberOutType(value tf.DataType) StringToNumberAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Converts each string in the input Tensor to the specified numeric type.
+// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
+// prediction for the target class is among the top `k` predictions among
+// all predictions for example `i`. Note that the behavior of `InTopK` differs
+// from the `TopK` op in its handling of ties; if multiple classes have the
+// same prediction value and straddle the top-`k` boundary, all of those
+// classes are considered to be in the top `k`.
 //
-// (Note that int32 overflow results in an error while float overflow
-// results in a rounded value.)
+// More formally, let
 //
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToNumberAttr) (output tf.Output) {
+//   \\(predictions_i\\) be the predictions for all classes for example `i`,
+//   \\(targets_i\\) be the target class for example `i`,
+//   \\(out_i\\) be the output for example `i`,
+//
+// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
+//
+// Arguments:
+//	predictions: A `batch_size` x `classes` tensor.
+//	targets: A `batch_size` vector of class ids.
+//	k: Number of top elements to look at for computing precision.
+//
+// Returns Computed Precision at `k` as a `bool Tensor`.
+func InTopK(scope *Scope, predictions tf.Output, targets tf.Output, k int64) (precision tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"k": k}
 	opspec := tf.OpSpec{
-		Type: "StringToNumber",
+		Type: "InTopK",
 		Input: []tf.Input{
-			string_tensor,
+			predictions, targets,
 		},
 		Attrs: attrs,
 	}
@@ -15240,160 +16951,161 @@ func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToN
 	return op.Output(0)
 }
 
-// ResourceApplyFtrlV2Attr is an optional argument to ResourceApplyFtrlV2.
-type ResourceApplyFtrlV2Attr func(optionalAttr)
-
-// ResourceApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
+// Computes the gradient for the inverse of `x` wrt its input.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyFtrlV2UseLocking(value bool) ResourceApplyFtrlV2Attr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
+// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
+// is the corresponding input gradient.
+func ReciprocalGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReciprocalGrad",
+		Input: []tf.Input{
+			y, dy,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Update '*var' according to the Ftrl-proximal scheme.
-//
-// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
-// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
-// linear += grad_with_shrinkage +
-//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regulariation. Must be a scalar.
-//	l2: L2 shrinkage regulariation. Must be a scalar.
-//
-//	lr_power: Scaling factor. Must be a scalar.
+// Returns the min of x and y (i.e. x < y ? x : y) element-wise.
 //
-// Returns the created operation.
-func ResourceApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlV2Attr) (o *tf.Operation) {
+// *NOTE*: `Minimum` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Minimum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyFtrlV2",
+		Type: "Minimum",
 		Input: []tf.Input{
-			var_, accum, linear, grad, lr, l1, l2, l2_shrinkage, lr_power,
+			x, y,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// TruncatedNormalAttr is an optional argument to TruncatedNormal.
-type TruncatedNormalAttr func(optionalAttr)
-
-// TruncatedNormalSeed sets the optional seed attribute to value.
+// Returns the element-wise sum of a list of tensors.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func TruncatedNormalSeed(value int64) TruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// TruncatedNormalSeed2 sets the optional seed2 attribute to value.
+// `tf.accumulate_n_v2` performs the same operation as `tf.add_n`, but does not
+// wait for all of its inputs to be ready before beginning to sum. This can
+// save memory if inputs are ready at different times, since minimum temporary
+// storage is proportional to the output size rather than the inputs size.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func TruncatedNormalSeed2(value int64) TruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
+// Unlike the original `accumulate_n`, `accumulate_n_v2` is differentiable.
+//
+// Returns a `Tensor` of same shape and type as the elements of `inputs`.
+//
+// Arguments:
+//	inputs: A list of `Tensor` objects, each with same shape and type.
+//	shape: Shape of elements of `inputs`.
+func AccumulateNV2(scope *Scope, inputs []tf.Output, shape tf.Shape) (sum tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"shape": shape}
+	opspec := tf.OpSpec{
+		Type: "AccumulateNV2",
+		Input: []tf.Input{
+			tf.OutputList(inputs),
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Outputs random values from a truncated normal distribution.
+// Convert the quantized 'input' tensor into a lower-precision 'output', using the
 //
-// The generated values follow a normal distribution with mean 0 and standard
-// deviation 1, except that values whose magnitude is more than 2 standard
-// deviations from the mean are dropped and re-picked.
+// actual distribution of the values to maximize the usage of the lower bit depth
+// and adjusting the output min and max ranges accordingly.
+//
+// [input_min, input_max] are scalar floats that specify the range for the float
+// interpretation of the 'input' data. For example, if input_min is -1.0f and
+// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
+// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
+//
+// This operator tries to squeeze as much precision as possible into an output with
+// a lower bit depth by calculating the actual min and max values found in the
+// data. For example, maybe that quint16 input has no values lower than 16,384 and
+// none higher than 49,152. That means only half the range is actually needed, all
+// the float interpretations are between -0.5f and 0.5f, so if we want to compress
+// the data into a quint8 output, we can use that range rather than the theoretical
+// -1.0f to 1.0f that is suggested by the input min and max.
+//
+// In practice, this is most useful for taking output from operations like
+// QuantizedMatMul that can produce higher bit-depth outputs than their inputs and
+// may have large potential output ranges, but in practice have a distribution of
+// input values that only uses a small fraction of the possible range. By feeding
+// that output into this operator, we can reduce it from 32 bits down to 8 with
+// minimal loss of accuracy.
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
 //
-// Returns A tensor of the specified shape filled with random truncated normal
-// values.
-func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...TruncatedNormalAttr) (output tf.Output) {
+//	input_min: The float value that the minimum quantized input value represents.
+//	input_max: The float value that the maximum quantized input value represents.
+//	out_type: The type of the output. Should be a lower bit depth than Tinput.
+//
+// Returns The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
+func QuantizeDownAndShrinkRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "TruncatedNormal",
+		Type: "QuantizeDownAndShrinkRange",
 		Input: []tf.Input{
-			shape,
+			input, input_min, input_max,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// RandomShuffleAttr is an optional argument to RandomShuffle.
-type RandomShuffleAttr func(optionalAttr)
+// RandomGammaAttr is an optional argument to RandomGamma.
+type RandomGammaAttr func(optionalAttr)
 
-// RandomShuffleSeed sets the optional seed attribute to value.
+// RandomGammaSeed sets the optional seed attribute to value.
 //
 // value: If either `seed` or `seed2` are set to be non-zero, the random number
 // generator is seeded by the given seed.  Otherwise, it is seeded by a
 // random seed.
 // If not specified, defaults to 0
-func RandomShuffleSeed(value int64) RandomShuffleAttr {
+func RandomGammaSeed(value int64) RandomGammaAttr {
 	return func(m optionalAttr) {
 		m["seed"] = value
 	}
 }
 
-// RandomShuffleSeed2 sets the optional seed2 attribute to value.
+// RandomGammaSeed2 sets the optional seed2 attribute to value.
 //
 // value: A second seed to avoid seed collision.
 // If not specified, defaults to 0
-func RandomShuffleSeed2(value int64) RandomShuffleAttr {
+func RandomGammaSeed2(value int64) RandomGammaAttr {
 	return func(m optionalAttr) {
 		m["seed2"] = value
 	}
 }
 
-// Randomly shuffles a tensor along its first dimension.
-//
-//   The tensor is shuffled along dimension 0, such that each `value[j]` is mapped
-//   to one and only one `output[i]`. For example, a mapping that might occur for a
-//   3x2 tensor is:
+// Outputs random values from the Gamma distribution(s) described by alpha.
 //
-// ```
-// [[1, 2],       [[5, 6],
-//  [3, 4],  ==>   [1, 2],
-//  [5, 6]]        [3, 4]]
-// ```
+// This op uses the algorithm by Marsaglia et al. to acquire samples via
+// transformation-rejection from pairs of uniform and normal random variables.
+// See http://dl.acm.org/citation.cfm?id=358414
 //
 // Arguments:
-//	value: The tensor to be shuffled.
+//	shape: 1-D integer tensor. Shape of independent samples to draw from each
+// distribution described by the shape parameters given in alpha.
+//	alpha: A tensor in which each scalar is a "shape" parameter describing the
+// associated gamma distribution.
 //
-// Returns A tensor of same shape and type as `value`, shuffled along its first
-// dimension.
-func RandomShuffle(scope *Scope, value tf.Output, optional ...RandomShuffleAttr) (output tf.Output) {
+// Returns A tensor with shape `shape + shape(alpha)`. Each slice
+// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
+// `alpha[i0, i1, ...iN]`. The dtype of the output matches the dtype of alpha.
+func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...RandomGammaAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15402,9 +17114,9 @@ func RandomShuffle(scope *Scope, value tf.Output, optional ...RandomShuffleAttr)
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomShuffle",
+		Type: "RandomGamma",
 		Input: []tf.Input{
-			value,
+			shape, alpha,
 		},
 		Attrs: attrs,
 	}
@@ -15412,99 +17124,64 @@ func RandomShuffle(scope *Scope, value tf.Output, optional ...RandomShuffleAttr)
 	return op.Output(0)
 }
 
-// OrderedMapIncompleteSizeAttr is an optional argument to OrderedMapIncompleteSize.
-type OrderedMapIncompleteSizeAttr func(optionalAttr)
+// QuantizeAndDequantizeAttr is an optional argument to QuantizeAndDequantize.
+type QuantizeAndDequantizeAttr func(optionalAttr)
 
-// OrderedMapIncompleteSizeCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapIncompleteSizeCapacity(value int64) OrderedMapIncompleteSizeAttr {
+// QuantizeAndDequantizeSignedInput sets the optional signed_input attribute to value.
+// If not specified, defaults to true
+func QuantizeAndDequantizeSignedInput(value bool) QuantizeAndDequantizeAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["signed_input"] = value
 	}
 }
 
-// OrderedMapIncompleteSizeMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapIncompleteSizeMemoryLimit(value int64) OrderedMapIncompleteSizeAttr {
+// QuantizeAndDequantizeNumBits sets the optional num_bits attribute to value.
+// If not specified, defaults to 8
+func QuantizeAndDequantizeNumBits(value int64) QuantizeAndDequantizeAttr {
 	return func(m optionalAttr) {
-		m["memory_limit"] = value
+		m["num_bits"] = value
 	}
 }
 
-// OrderedMapIncompleteSizeContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func OrderedMapIncompleteSizeContainer(value string) OrderedMapIncompleteSizeAttr {
+// QuantizeAndDequantizeRangeGiven sets the optional range_given attribute to value.
+// If not specified, defaults to false
+func QuantizeAndDequantizeRangeGiven(value bool) QuantizeAndDequantizeAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["range_given"] = value
 	}
 }
 
-// OrderedMapIncompleteSizeSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func OrderedMapIncompleteSizeSharedName(value string) OrderedMapIncompleteSizeAttr {
+// QuantizeAndDequantizeInputMin sets the optional input_min attribute to value.
+// If not specified, defaults to 0
+func QuantizeAndDequantizeInputMin(value float32) QuantizeAndDequantizeAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op returns the number of incomplete elements in the underlying container.
-func OrderedMapIncompleteSize(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapIncompleteSizeAttr) (size tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "OrderedMapIncompleteSize",
-
-		Attrs: attrs,
+		m["input_min"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// DecodeRawAttr is an optional argument to DecodeRaw.
-type DecodeRawAttr func(optionalAttr)
-
-// DecodeRawLittleEndian sets the optional little_endian attribute to value.
-//
-// value: Whether the input `bytes` are in little-endian order.
-// Ignored for `out_type` values that are stored in a single byte like
-// `uint8`.
-// If not specified, defaults to true
-func DecodeRawLittleEndian(value bool) DecodeRawAttr {
+// QuantizeAndDequantizeInputMax sets the optional input_max attribute to value.
+// If not specified, defaults to 0
+func QuantizeAndDequantizeInputMax(value float32) QuantizeAndDequantizeAttr {
 	return func(m optionalAttr) {
-		m["little_endian"] = value
+		m["input_max"] = value
 	}
 }
 
-// Reinterpret the bytes of a string as a vector of numbers.
-//
-// Arguments:
-//	bytes: All the elements must have the same length.
-//
+// Use QuantizeAndDequantizeV2 instead.
 //
-// Returns A Tensor with one more dimension than the input `bytes`.  The
-// added dimension will have size equal to the length of the elements
-// of `bytes` divided by the number of bytes to represent `out_type`.
-func DecodeRaw(scope *Scope, bytes tf.Output, out_type tf.DataType, optional ...DecodeRawAttr) (output tf.Output) {
+// DEPRECATED at GraphDef version 22: Replaced by QuantizeAndDequantizeV2
+func QuantizeAndDequantize(scope *Scope, input tf.Output, optional ...QuantizeAndDequantizeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeRaw",
+		Type: "QuantizeAndDequantize",
 		Input: []tf.Input{
-			bytes,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -15512,149 +17189,180 @@ func DecodeRaw(scope *Scope, bytes tf.Output, out_type tf.DataType, optional ...
 	return op.Output(0)
 }
 
-// Copy a tensor setting everything outside a central band in each innermost matrix
-//
-// to zero.
-//
-// The `band` part is computed as follows:
-// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
-// tensor with the same shape where
-//
-// `band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`.
-//
-// The indicator function
+// Returns locations of nonzero / true values in a tensor.
 //
-// `in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) &&
-//                  (num_upper < 0 || (n-m) <= num_upper)`.
+// This operation returns the coordinates of true elements in `condition`. The
+// coordinates are returned in a 2-D tensor where the first dimension (rows)
+// represents the number of true elements, and the second dimension (columns)
+// represents the coordinates of the true elements. Keep in mind, the shape of
+// the output tensor can vary depending on how many true values there are in
+// `condition`. Indices are output in row-major order.
 //
 // For example:
 //
 // ```
-// # if 'input' is [[ 0,  1,  2, 3]
-//                  [-1,  0,  1, 2]
-//                  [-2, -1,  0, 1]
-//                  [-3, -2, -1, 0]],
-//
-// tf.matrix_band_part(input, 1, -1) ==> [[ 0,  1,  2, 3]
-//                                        [-1,  0,  1, 2]
-//                                        [ 0, -1,  0, 1]
-//                                        [ 0,  0, -1, 0]],
+// # 'input' tensor is [[True, False]
+// #                    [True, False]]
+// # 'input' has two true values, so output has two coordinates.
+// # 'input' has rank of 2, so coordinates have two indices.
+// where(input) ==> [[0, 0],
+//                   [1, 0]]
 //
-// tf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]
-//                                       [-1,  0,  1, 0]
-//                                       [-2, -1,  0, 1]
-//                                       [ 0, -2, -1, 0]]
-// ```
+// # `condition` tensor is [[[True, False]
+// #                     [True, False]]
+// #                    [[False, True]
+// #                     [False, True]]
+// #                    [[False, False]
+// #                     [False, True]]]
+// # 'input' has 5 true values, so output has 5 coordinates.
+// # 'input' has rank of 3, so coordinates have three indices.
+// where(input) ==> [[0, 0, 0],
+//                   [0, 1, 0],
+//                   [1, 0, 1],
+//                   [1, 1, 1],
+//                   [2, 1, 1]]
 //
-// Useful special cases:
+// # `condition` tensor is [[[1.5,  0.0]
+// #                     [-0.5, 0.0]]
+// #                    [[0.0,  0.25]
+// #                     [0.0,  0.75]]
+// #                    [[0.0,  0.0]
+// #                     [0.0,  0.01]]]
+// # 'input' has 5 nonzero values, so output has 5 coordinates.
+// # 'input' has rank of 3, so coordinates have three indices.
+// where(input) ==> [[0, 0, 0],
+//                   [0, 1, 0],
+//                   [1, 0, 1],
+//                   [1, 1, 1],
+//                   [2, 1, 1]]
 //
+// # `condition` tensor is [[[1.5 + 0.0j, 0.0  + 0.0j]
+// #                     [0.0 + 0.5j, 0.0  + 0.0j]]
+// #                    [[0.0 + 0.0j, 0.25 + 1.5j]
+// #                     [0.0 + 0.0j, 0.75 + 0.0j]]
+// #                    [[0.0 + 0.0j, 0.0  + 0.0j]
+// #                     [0.0 + 0.0j, 0.01 + 0.0j]]]
+// # 'input' has 5 nonzero magnitude values, so output has 5 coordinates.
+// # 'input' has rank of 3, so coordinates have three indices.
+// where(input) ==> [[0, 0, 0],
+//                   [0, 1, 0],
+//                   [1, 0, 1],
+//                   [1, 1, 1],
+//                   [2, 1, 1]]
 // ```
-//  tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.
-//  tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.
-//  tf.matrix_band_part(input, 0, 0) ==> Diagonal.
-// ```
-//
-// Arguments:
-//	input: Rank `k` tensor.
-//	num_lower: 0-D tensor. Number of subdiagonals to keep. If negative, keep entire
-// lower triangle.
-//	num_upper: 0-D tensor. Number of superdiagonals to keep. If negative, keep
-// entire upper triangle.
-//
-// Returns Rank `k` tensor of the same shape as input. The extracted banded tensor.
-func MatrixBandPart(scope *Scope, input tf.Output, num_lower tf.Output, num_upper tf.Output) (band tf.Output) {
+func Where(scope *Scope, condition tf.Output) (index tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixBandPart",
+		Type: "Where",
 		Input: []tf.Input{
-			input, num_lower, num_upper,
+			condition,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DecodeCompressedAttr is an optional argument to DecodeCompressed.
-type DecodeCompressedAttr func(optionalAttr)
+// QueueDequeueV2Attr is an optional argument to QueueDequeueV2.
+type QueueDequeueV2Attr func(optionalAttr)
 
-// DecodeCompressedCompressionType sets the optional compression_type attribute to value.
+// QueueDequeueV2TimeoutMs sets the optional timeout_ms attribute to value.
 //
-// value: A scalar containing either (i) the empty string (no
-// compression), (ii) "ZLIB", or (iii) "GZIP".
-// If not specified, defaults to ""
-func DecodeCompressedCompressionType(value string) DecodeCompressedAttr {
+// value: If the queue is empty, this operation will block for up to
+// timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueDequeueV2TimeoutMs(value int64) QueueDequeueV2Attr {
 	return func(m optionalAttr) {
-		m["compression_type"] = value
+		m["timeout_ms"] = value
 	}
 }
 
-// Decompress strings.
+// Dequeues a tuple of one or more tensors from the given queue.
 //
-// This op decompresses each element of the `bytes` input `Tensor`, which
-// is assumed to be compressed using the given `compression_type`.
+// This operation has k outputs, where k is the number of components
+// in the tuples stored in the given queue, and output i is the ith
+// component of the dequeued tuple.
 //
-// The `output` is a string `Tensor` of the same shape as `bytes`,
-// each element containing the decompressed data from the corresponding
-// element in `bytes`.
+// N.B. If the queue is empty, this operation will block until an element
+// has been dequeued (or 'timeout_ms' elapses, if specified).
 //
 // Arguments:
-//	bytes: A Tensor of string which is compressed.
+//	handle: The handle to a queue.
+//	component_types: The type of each component in a tuple.
 //
-// Returns A Tensor with the same shape as input `bytes`, uncompressed
-// from bytes.
-func DecodeCompressed(scope *Scope, bytes tf.Output, optional ...DecodeCompressedAttr) (output tf.Output) {
+// Returns One or more tensors that were dequeued as a tuple.
+func QueueDequeueV2(scope *Scope, handle tf.Output, component_types []tf.DataType, optional ...QueueDequeueV2Attr) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"component_types": component_types}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeCompressed",
+		Type: "QueueDequeueV2",
 		Input: []tf.Input{
-			bytes,
+			handle,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("QueueDequeueV2", err)
+		return
+	}
+	return components
 }
 
-// WholeFileReaderV2Attr is an optional argument to WholeFileReaderV2.
-type WholeFileReaderV2Attr func(optionalAttr)
+// RandomUniformIntAttr is an optional argument to RandomUniformInt.
+type RandomUniformIntAttr func(optionalAttr)
 
-// WholeFileReaderV2Container sets the optional container attribute to value.
+// RandomUniformIntSeed sets the optional seed attribute to value.
 //
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func WholeFileReaderV2Container(value string) WholeFileReaderV2Attr {
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomUniformIntSeed(value int64) RandomUniformIntAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["seed"] = value
 	}
 }
 
-// WholeFileReaderV2SharedName sets the optional shared_name attribute to value.
+// RandomUniformIntSeed2 sets the optional seed2 attribute to value.
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func WholeFileReaderV2SharedName(value string) WholeFileReaderV2Attr {
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomUniformIntSeed2(value int64) RandomUniformIntAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["seed2"] = value
 	}
 }
 
-// A Reader that outputs the entire contents of a file as a value.
+// Outputs random integers from a uniform distribution.
 //
-// To use, enqueue filenames in a Queue.  The output of ReaderRead will
-// be a filename (key) and the contents of that file (value).
+// The generated values are uniform integers in the range `[minval, maxval)`.
+// The lower bound `minval` is included in the range, while the upper bound
+// `maxval` is excluded.
 //
-// Returns The handle to reference the Reader.
-func WholeFileReaderV2(scope *Scope, optional ...WholeFileReaderV2Attr) (reader_handle tf.Output) {
+// The random integers are slightly biased unless `maxval - minval` is an exact
+// power of two.  The bias is small for values of `maxval - minval` significantly
+// smaller than the range of the output (either `2^32` or `2^64`).
+//
+// Arguments:
+//	shape: The shape of the output tensor.
+//	minval: 0-D.  Inclusive lower bound on the generated integers.
+//	maxval: 0-D.  Exclusive upper bound on the generated integers.
+//
+// Returns A tensor of the specified shape filled with uniform random integers.
+func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf.Output, optional ...RandomUniformIntAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15663,165 +17371,280 @@ func WholeFileReaderV2(scope *Scope, optional ...WholeFileReaderV2Attr) (reader_
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "WholeFileReaderV2",
-
+		Type: "RandomUniformInt",
+		Input: []tf.Input{
+			shape, minval, maxval,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Transforms a tf.Example proto (as a string) into typed tensors.
+// SkipgramAttr is an optional argument to Skipgram.
+type SkipgramAttr func(optionalAttr)
+
+// SkipgramWindowSize sets the optional window_size attribute to value.
+//
+// value: The number of words to predict to the left and right of the target.
+// If not specified, defaults to 5
+func SkipgramWindowSize(value int64) SkipgramAttr {
+	return func(m optionalAttr) {
+		m["window_size"] = value
+	}
+}
+
+// SkipgramMinCount sets the optional min_count attribute to value.
+//
+// value: The minimum number of word occurrences for it to be included in the
+// vocabulary.
+// If not specified, defaults to 5
+func SkipgramMinCount(value int64) SkipgramAttr {
+	return func(m optionalAttr) {
+		m["min_count"] = value
+	}
+}
+
+// SkipgramSubsample sets the optional subsample attribute to value.
+//
+// value: Threshold for word occurrence. Words that appear with higher
+// frequency will be randomly down-sampled. Set to 0 to disable.
+// If not specified, defaults to 0.001
+func SkipgramSubsample(value float32) SkipgramAttr {
+	return func(m optionalAttr) {
+		m["subsample"] = value
+	}
+}
+
+// Parses a text file and creates a batch of examples.
+//
+// DEPRECATED at GraphDef version 19: Moving word2vec into tensorflow_models/tutorials and deprecating its ops here as a result
 //
 // Arguments:
-//	serialized: A vector containing a batch of binary serialized Example protos.
-//	dense_defaults: A list of Tensors (some may be empty), whose length matches
-// the length of `dense_keys`. dense_defaults[j] provides default values
-// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
-// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
-// The input type is inferred from dense_defaults[j], even when it's empty.
-// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
-// then the shape of dense_defaults[j] must match that of dense_shapes[j].
-// If dense_shapes[j] has an undefined major dimension (variable strides dense
-// feature), dense_defaults[j] must contain a single element:
-// the padding element.
-//	num_sparse: The number of sparse features to be parsed from the example. This
-// must match the lengths of `sparse_keys` and `sparse_types`.
-//	sparse_keys: A list of `num_sparse` strings.
-// The keys expected in the Examples' features associated with sparse values.
-//	dense_keys: The keys expected in the Examples' features associated with dense
-// values.
-//	sparse_types: A list of `num_sparse` types; the data types of data in each
-// Feature given in sparse_keys.
-// Currently the ParseSingleExample op supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-//	dense_shapes: The shapes of data in each Feature given in dense_keys.
-// The length of this list must match the length of `dense_keys`.  The
-// number of elements in the Feature corresponding to dense_key[j] must
-// always equal dense_shapes[j].NumEntries().  If dense_shapes[j] ==
-// (D0, D1, ..., DN) then the shape of output Tensor dense_values[j]
-// will be (D0, D1, ..., DN): In the case dense_shapes[j] = (-1, D1,
-// ..., DN), the shape of the output Tensor dense_values[j] will be (M,
-// D1, .., DN), where M is the number of blocks of elements of length
-// D1 * .... * DN, in the input.
-func ParseSingleExample(scope *Scope, serialized tf.Output, dense_defaults []tf.Output, num_sparse int64, sparse_keys []string, dense_keys []string, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
+//	filename: The corpus's text file name.
+//	batch_size: The size of produced batch.
+//
+// Returns A vector of words in the corpus.Frequencies of words. Sorted in the non-ascending order.Number of words per epoch in the data file.The current epoch number.The total number of words processed so far.A vector of word ids.A vector of word ids.
+func Skipgram(scope *Scope, filename string, batch_size int64, optional ...SkipgramAttr) (vocab_word tf.Output, vocab_freq tf.Output, words_per_epoch tf.Output, current_epoch tf.Output, total_words_processed tf.Output, examples tf.Output, labels tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_sparse": num_sparse, "sparse_keys": sparse_keys, "dense_keys": dense_keys, "sparse_types": sparse_types, "dense_shapes": dense_shapes}
+	attrs := map[string]interface{}{"filename": filename, "batch_size": batch_size}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ParseSingleExample",
-		Input: []tf.Input{
-			serialized, tf.OutputList(dense_defaults),
-		},
+		Type: "Skipgram",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4), op.Output(5), op.Output(6)
+}
+
+// StringToNumberAttr is an optional argument to StringToNumber.
+type StringToNumberAttr func(optionalAttr)
+
+// StringToNumberOutType sets the optional out_type attribute to value.
+//
+// value: The numeric type to interpret each string in `string_tensor` as.
+// If not specified, defaults to DT_FLOAT
+func StringToNumberOutType(value tf.DataType) StringToNumberAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
 	}
-	var idx int
-	var err error
-	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
-		scope.UpdateErr("ParseSingleExample", err)
+}
+
+// Converts each string in the input Tensor to the specified numeric type.
+//
+// (Note that int32 overflow results in an error while float overflow
+// results in a rounded value.)
+//
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToNumberAttr) (output tf.Output) {
+	if scope.Err() != nil {
 		return
 	}
-	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
-		scope.UpdateErr("ParseSingleExample", err)
-		return
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
 	}
-	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseSingleExample", err)
-		return
+	opspec := tf.OpSpec{
+		Type: "StringToNumber",
+		Input: []tf.Input{
+			string_tensor,
+		},
+		Attrs: attrs,
 	}
-	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
-		scope.UpdateErr("ParseSingleExample", err)
-		return
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceApplyFtrlV2Attr is an optional argument to ResourceApplyFtrlV2.
+type ResourceApplyFtrlV2Attr func(optionalAttr)
+
+// ResourceApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyFtrlV2UseLocking(value bool) ResourceApplyFtrlV2Attr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
 	}
-	return sparse_indices, sparse_values, sparse_shapes, dense_values
 }
 
-// Computes acos of x element-wise.
-func Acos(scope *Scope, x tf.Output) (y tf.Output) {
+// Update '*var' according to the Ftrl-proximal scheme.
+//
+// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
+// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
+// linear += grad_with_shrinkage +
+//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regulariation. Must be a scalar.
+//	l2: L2 shrinkage regulariation. Must be a scalar.
+//
+//	lr_power: Scaling factor. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Acos",
+		Type: "ResourceApplyFtrlV2",
 		Input: []tf.Input{
-			x,
+			var_, accum, linear, grad, lr, l1, l2, l2_shrinkage, lr_power,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// MaxPoolWithArgmaxAttr is an optional argument to MaxPoolWithArgmax.
-type MaxPoolWithArgmaxAttr func(optionalAttr)
+// TruncatedNormalAttr is an optional argument to TruncatedNormal.
+type TruncatedNormalAttr func(optionalAttr)
 
-// MaxPoolWithArgmaxTargmax sets the optional Targmax attribute to value.
-// If not specified, defaults to DT_INT64
-func MaxPoolWithArgmaxTargmax(value tf.DataType) MaxPoolWithArgmaxAttr {
+// TruncatedNormalSeed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func TruncatedNormalSeed(value int64) TruncatedNormalAttr {
 	return func(m optionalAttr) {
-		m["Targmax"] = value
+		m["seed"] = value
 	}
 }
 
-// Performs max pooling on the input and outputs both max values and indices.
+// TruncatedNormalSeed2 sets the optional seed2 attribute to value.
 //
-// The indices in `argmax` are flattened, so that a maximum value at position
-// `[b, y, x, c]` becomes flattened index
-// `((b * height + y) * width + x) * channels + c`.
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func TruncatedNormalSeed2(value int64) TruncatedNormalAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random values from a truncated normal distribution.
 //
-// The indices returned are always in `[0, height) x [0, width)` before flattening,
-// even if padding is involved and the mathematically correct answer is outside
-// (either negative or too large).  This is a bug, but fixing it is difficult to do
-// in a safe backwards compatible way, especially due to flattening.
+// The generated values follow a normal distribution with mean 0 and standard
+// deviation 1, except that values whose magnitude is more than 2 standard
+// deviations from the mean are dropped and re-picked.
 //
 // Arguments:
-//	input: 4-D with shape `[batch, height, width, channels]`.  Input to pool over.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
 //
-// Returns The max pooled output tensor.4-D.  The flattened indices of the max values chosen for each output.
-func MaxPoolWithArgmax(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolWithArgmaxAttr) (output tf.Output, argmax tf.Output) {
+// Returns A tensor of the specified shape filled with random truncated normal
+// values.
+func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...TruncatedNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolWithArgmax",
+		Type: "TruncatedNormal",
 		Input: []tf.Input{
-			input,
+			shape,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Transforms a serialized tensorflow.TensorProto proto into a Tensor.
+// RandomShuffleAttr is an optional argument to RandomShuffle.
+type RandomShuffleAttr func(optionalAttr)
+
+// RandomShuffleSeed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomShuffleSeed(value int64) RandomShuffleAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// RandomShuffleSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomShuffleSeed2(value int64) RandomShuffleAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Randomly shuffles a tensor along its first dimension.
+//
+//   The tensor is shuffled along dimension 0, such that each `value[j]` is mapped
+//   to one and only one `output[i]`. For example, a mapping that might occur for a
+//   3x2 tensor is:
+//
+// ```
+// [[1, 2],       [[5, 6],
+//  [3, 4],  ==>   [1, 2],
+//  [5, 6]]        [3, 4]]
+// ```
 //
 // Arguments:
-//	serialized: A scalar string containing a serialized TensorProto proto.
-//	out_type: The type of the serialized tensor.  The provided type must match the
-// type of the serialized tensor and no implicit conversion will take place.
+//	value: The tensor to be shuffled.
 //
-// Returns A Tensor of type `out_type`.
-func ParseTensor(scope *Scope, serialized tf.Output, out_type tf.DataType) (output tf.Output) {
+// Returns A tensor of same shape and type as `value`, shuffled along its first
+// dimension.
+func RandomShuffle(scope *Scope, value tf.Output, optional ...RandomShuffleAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ParseTensor",
+		Type: "RandomShuffle",
 		Input: []tf.Input{
-			serialized,
+			value,
 		},
 		Attrs: attrs,
 	}
@@ -15829,49 +17652,47 @@ func ParseTensor(scope *Scope, serialized tf.Output, out_type tf.DataType) (outp
 	return op.Output(0)
 }
 
-// MapClearAttr is an optional argument to MapClear.
-type MapClearAttr func(optionalAttr)
+// OrderedMapIncompleteSizeAttr is an optional argument to OrderedMapIncompleteSize.
+type OrderedMapIncompleteSizeAttr func(optionalAttr)
 
-// MapClearCapacity sets the optional capacity attribute to value.
+// OrderedMapIncompleteSizeCapacity sets the optional capacity attribute to value.
 // If not specified, defaults to 0
 //
 // REQUIRES: value >= 0
-func MapClearCapacity(value int64) MapClearAttr {
+func OrderedMapIncompleteSizeCapacity(value int64) OrderedMapIncompleteSizeAttr {
 	return func(m optionalAttr) {
 		m["capacity"] = value
 	}
 }
 
-// MapClearMemoryLimit sets the optional memory_limit attribute to value.
+// OrderedMapIncompleteSizeMemoryLimit sets the optional memory_limit attribute to value.
 // If not specified, defaults to 0
 //
 // REQUIRES: value >= 0
-func MapClearMemoryLimit(value int64) MapClearAttr {
+func OrderedMapIncompleteSizeMemoryLimit(value int64) OrderedMapIncompleteSizeAttr {
 	return func(m optionalAttr) {
 		m["memory_limit"] = value
 	}
 }
 
-// MapClearContainer sets the optional container attribute to value.
+// OrderedMapIncompleteSizeContainer sets the optional container attribute to value.
 // If not specified, defaults to ""
-func MapClearContainer(value string) MapClearAttr {
+func OrderedMapIncompleteSizeContainer(value string) OrderedMapIncompleteSizeAttr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// MapClearSharedName sets the optional shared_name attribute to value.
+// OrderedMapIncompleteSizeSharedName sets the optional shared_name attribute to value.
 // If not specified, defaults to ""
-func MapClearSharedName(value string) MapClearAttr {
+func OrderedMapIncompleteSizeSharedName(value string) OrderedMapIncompleteSizeAttr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// Op removes all elements in the underlying container.
-//
-// Returns the created operation.
-func MapClear(scope *Scope, dtypes []tf.DataType, optional ...MapClearAttr) (o *tf.Operation) {
+// Op returns the number of incomplete elements in the underlying container.
+func OrderedMapIncompleteSize(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapIncompleteSizeAttr) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15880,167 +17701,244 @@ func MapClear(scope *Scope, dtypes []tf.DataType, optional ...MapClearAttr) (o *
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MapClear",
+		Type: "OrderedMapIncompleteSize",
 
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// DecodeCSVAttr is an optional argument to DecodeCSV.
-type DecodeCSVAttr func(optionalAttr)
+// DecodeRawAttr is an optional argument to DecodeRaw.
+type DecodeRawAttr func(optionalAttr)
 
-// DecodeCSVFieldDelim sets the optional field_delim attribute to value.
+// DecodeRawLittleEndian sets the optional little_endian attribute to value.
 //
-// value: char delimiter to separate fields in a record.
-// If not specified, defaults to ","
-func DecodeCSVFieldDelim(value string) DecodeCSVAttr {
-	return func(m optionalAttr) {
-		m["field_delim"] = value
-	}
-}
-
-// DecodeCSVUseQuoteDelim sets the optional use_quote_delim attribute to value.
-//
-// value: If false, treats double quotation marks as regular
-// characters inside of the string fields (ignoring RFC 4180, Section 2,
-// Bullet 5).
+// value: Whether the input `bytes` are in little-endian order.
+// Ignored for `out_type` values that are stored in a single byte like
+// `uint8`.
 // If not specified, defaults to true
-func DecodeCSVUseQuoteDelim(value bool) DecodeCSVAttr {
-	return func(m optionalAttr) {
-		m["use_quote_delim"] = value
-	}
-}
-
-// DecodeCSVNaValue sets the optional na_value attribute to value.
-//
-// value: Additional string to recognize as NA/NaN.
-// If not specified, defaults to ""
-func DecodeCSVNaValue(value string) DecodeCSVAttr {
+func DecodeRawLittleEndian(value bool) DecodeRawAttr {
 	return func(m optionalAttr) {
-		m["na_value"] = value
+		m["little_endian"] = value
 	}
 }
 
-// Convert CSV records to tensors. Each column maps to one tensor.
-//
-// RFC 4180 format is expected for the CSV records.
-// (https://tools.ietf.org/html/rfc4180)
-// Note that we allow leading and trailing spaces with int or float field.
+// Reinterpret the bytes of a string as a vector of numbers.
 //
 // Arguments:
-//	records: Each string is a record/row in the csv and all records should have
-// the same format.
-//	record_defaults: One tensor per column of the input record, with either a
-// scalar default value for that column or empty if the column is required.
+//	bytes: All the elements must have the same length.
 //
-// Returns Each tensor will have the same shape as records.
-func DecodeCSV(scope *Scope, records tf.Output, record_defaults []tf.Output, optional ...DecodeCSVAttr) (output []tf.Output) {
+//
+// Returns A Tensor with one more dimension than the input `bytes`.  The
+// added dimension will have size equal to the length of the elements
+// of `bytes` divided by the number of bytes to represent `out_type`.
+func DecodeRaw(scope *Scope, bytes tf.Output, out_type tf.DataType, optional ...DecodeRawAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"out_type": out_type}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeCSV",
+		Type: "DecodeRaw",
 		Input: []tf.Input{
-			records, tf.OutputList(record_defaults),
+			bytes,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("DecodeCSV", err)
-		return
-	}
-	return output
+	return op.Output(0)
 }
 
-// Returns the rank of a tensor.
+// Copy a tensor setting everything outside a central band in each innermost matrix
 //
-// This operation returns an integer representing the rank of `input`.
+// to zero.
+//
+// The `band` part is computed as follows:
+// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
+// tensor with the same shape where
+//
+// `band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`.
+//
+// The indicator function
+//
+// `in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) &&
+//                  (num_upper < 0 || (n-m) <= num_upper)`.
 //
 // For example:
 //
 // ```
-// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
-// # shape of tensor 't' is [2, 2, 3]
-// rank(t) ==> 3
+// # if 'input' is [[ 0,  1,  2, 3]
+//                  [-1,  0,  1, 2]
+//                  [-2, -1,  0, 1]
+//                  [-3, -2, -1, 0]],
+//
+// tf.matrix_band_part(input, 1, -1) ==> [[ 0,  1,  2, 3]
+//                                        [-1,  0,  1, 2]
+//                                        [ 0, -1,  0, 1]
+//                                        [ 0,  0, -1, 0]],
+//
+// tf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]
+//                                       [-1,  0,  1, 0]
+//                                       [-2, -1,  0, 1]
+//                                       [ 0, -2, -1, 0]]
 // ```
 //
-// **Note**: The rank of a tensor is not the same as the rank of a matrix. The rank
-// of a tensor is the number of indices required to uniquely select each element
-// of the tensor. Rank is also known as "order", "degree", or "ndims."
-func Rank(scope *Scope, input tf.Output) (output tf.Output) {
+// Useful special cases:
+//
+// ```
+//  tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.
+//  tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.
+//  tf.matrix_band_part(input, 0, 0) ==> Diagonal.
+// ```
+//
+// Arguments:
+//	input: Rank `k` tensor.
+//	num_lower: 0-D tensor. Number of subdiagonals to keep. If negative, keep entire
+// lower triangle.
+//	num_upper: 0-D tensor. Number of superdiagonals to keep. If negative, keep
+// entire upper triangle.
+//
+// Returns Rank `k` tensor of the same shape as input. The extracted banded tensor.
+func MatrixBandPart(scope *Scope, input tf.Output, num_lower tf.Output, num_upper tf.Output) (band tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Rank",
+		Type: "MatrixBandPart",
 		Input: []tf.Input{
-			input,
+			input, num_lower, num_upper,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Output a fact about factorials.
-func Fact(scope *Scope) (fact tf.Output) {
-	if scope.Err() != nil {
-		return
+// QuantizedMatMulAttr is an optional argument to QuantizedMatMul.
+type QuantizedMatMulAttr func(optionalAttr)
+
+// QuantizedMatMulToutput sets the optional Toutput attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedMatMulToutput(value tf.DataType) QuantizedMatMulAttr {
+	return func(m optionalAttr) {
+		m["Toutput"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "Fact",
+}
+
+// QuantizedMatMulTransposeA sets the optional transpose_a attribute to value.
+//
+// value: If true, `a` is transposed before multiplication.
+// If not specified, defaults to false
+func QuantizedMatMulTransposeA(value bool) QuantizedMatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_a"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Makes its input available to the next iteration.
+// QuantizedMatMulTransposeB sets the optional transpose_b attribute to value.
+//
+// value: If true, `b` is transposed before multiplication.
+// If not specified, defaults to false
+func QuantizedMatMulTransposeB(value bool) QuantizedMatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_b"] = value
+	}
+}
+
+// QuantizedMatMulTactivation sets the optional Tactivation attribute to value.
+//
+// value: The type of output produced by activation function
+// following this operation.
+// If not specified, defaults to DT_QUINT8
+func QuantizedMatMulTactivation(value tf.DataType) QuantizedMatMulAttr {
+	return func(m optionalAttr) {
+		m["Tactivation"] = value
+	}
+}
+
+// Perform a quantized matrix multiplication of  `a` by the matrix `b`.
+//
+// The inputs must be two-dimensional matrices and the inner dimension of
+// `a` (after being transposed if `transpose_a` is non-zero) must match the
+// outer dimension of `b` (after being transposed if `transposed_b` is
+// non-zero).
 //
 // Arguments:
-//	data: The tensor to be made available to the next iteration.
+//	a: Must be a two-dimensional tensor.
+//	b: Must be a two-dimensional tensor.
+//	min_a: The float value that the lowest quantized `a` value represents.
+//	max_a: The float value that the highest quantized `a` value represents.
+//	min_b: The float value that the lowest quantized `b` value represents.
+//	max_b: The float value that the highest quantized `b` value represents.
 //
-// Returns The same tensor as `data`.
-func NextIteration(scope *Scope, data tf.Output) (output tf.Output) {
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedMatMul(scope *Scope, a tf.Output, b tf.Output, min_a tf.Output, max_a tf.Output, min_b tf.Output, max_b tf.Output, optional ...QuantizedMatMulAttr) (out tf.Output, min_out tf.Output, max_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "NextIteration",
+		Type: "QuantizedMatMul",
 		Input: []tf.Input{
-			data,
+			a, b, min_a, max_a, min_b, max_b,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Creates a dataset that skips `count` elements from the `input_dataset`.
+// Does nothing. Serves as a control trigger for scheduling.
 //
-// Arguments:
+// Only useful as a placeholder for control edges.
 //
-//	count: A scalar representing the number of elements from the `input_dataset`
-// that should be skipped.  If count is -1, skips everything.
+// Returns the created operation.
+func ControlTrigger(scope *Scope) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ControlTrigger",
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Batch normalization.
 //
+// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
 //
-func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// This op is deprecated. Prefer `tf.nn.batch_normalization`.
+//
+// Arguments:
+//	t: A 4D input Tensor.
+//	m: A 1D mean Tensor with size matching the last dimension of t.
+// This is the first output from tf.nn.moments,
+// or a saved moving average thereof.
+//	v: A 1D variance Tensor with size matching the last dimension of t.
+// This is the second output from tf.nn.moments,
+// or a saved moving average thereof.
+//	beta: A 1D beta Tensor with size matching the last dimension of t.
+// An offset to be added to the normalized tensor.
+//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
+// If "scale_after_normalization" is true, this tensor will be multiplied
+// with the normalized tensor.
+//	variance_epsilon: A small float number to avoid dividing by 0.
+//	scale_after_normalization: A bool indicating whether the resulted tensor
+// needs to be multiplied with gamma.
+func BatchNormWithGlobalNormalization(scope *Scope, t tf.Output, m tf.Output, v tf.Output, beta tf.Output, gamma tf.Output, variance_epsilon float32, scale_after_normalization bool) (result tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
 	opspec := tf.OpSpec{
-		Type: "SkipDataset",
+		Type: "BatchNormWithGlobalNormalization",
 		Input: []tf.Input{
-			input_dataset, count,
+			t, m, v, beta, gamma,
 		},
 		Attrs: attrs,
 	}
@@ -16048,432 +17946,386 @@ func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_
 	return op.Output(0)
 }
 
-// Computes hyperbolic tangent of `x` element-wise.
-func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
+// Deprecated. Use TensorArrayReadV3
+//
+// DEPRECATED at GraphDef version 26: Use TensorArrayReadV3
+func TensorArrayReadV2(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "Tanh",
+		Type: "TensorArrayReadV2",
 		Input: []tf.Input{
-			x,
+			handle, index, flow_in,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the maximum along segments of a tensor.
-//
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
-//
-// Computes a tensor such that
-// \\(output_i = \max_j(data_j)\\) where `max` is over `j` such
-// that `segment_ids[j] == i`.
-//
-// If the max is empty for a given segment ID `i`, `output[i] = 0`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
-// </div>
+// QuantizedMulAttr is an optional argument to QuantizedMul.
+type QuantizedMulAttr func(optionalAttr)
+
+// QuantizedMulToutput sets the optional Toutput attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedMulToutput(value tf.DataType) QuantizedMulAttr {
+	return func(m optionalAttr) {
+		m["Toutput"] = value
+	}
+}
+
+// Returns x * y element-wise, working on quantized buffers.
 //
 // Arguments:
 //
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+//	min_x: The float value that the lowest quantized `x` value represents.
+//	max_x: The float value that the highest quantized `x` value represents.
+//	min_y: The float value that the lowest quantized `y` value represents.
+//	max_y: The float value that the highest quantized `y` value represents.
+//
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+//
+// *NOTE*: `QuantizedMul` supports limited forms of broadcasting. More about
+// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func QuantizedMul(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedMulAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SegmentMax",
+		Type: "QuantizedMul",
 		Input: []tf.Input{
-			data, segment_ids,
+			x, y, min_x, max_x, min_y, max_y,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// AvgPoolGradAttr is an optional argument to AvgPoolGrad.
-type AvgPoolGradAttr func(optionalAttr)
+// QuantizedAddAttr is an optional argument to QuantizedAdd.
+type QuantizedAddAttr func(optionalAttr)
 
-// AvgPoolGradDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func AvgPoolGradDataFormat(value string) AvgPoolGradAttr {
+// QuantizedAddToutput sets the optional Toutput attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedAddToutput(value tf.DataType) QuantizedAddAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["Toutput"] = value
 	}
 }
 
-// Computes gradients of the average pooling function.
+// Returns x + y element-wise, working on quantized buffers.
 //
 // Arguments:
-//	orig_input_shape: 1-D.  Shape of the original input to `avg_pool`.
-//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t.
-// the output of `avg_pool`.
-//	ksize: The size of the sliding window for each dimension of the input.
-//	strides: The stride of the sliding window for each dimension of the input.
-//	padding: The type of padding algorithm to use.
 //
-// Returns 4-D.  Gradients w.r.t. the input of `avg_pool`.
-func AvgPoolGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolGradAttr) (output tf.Output) {
+//
+//	min_x: The float value that the lowest quantized `x` value represents.
+//	max_x: The float value that the highest quantized `x` value represents.
+//	min_y: The float value that the lowest quantized `y` value represents.
+//	max_y: The float value that the highest quantized `y` value represents.
+//
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+//
+// *NOTE*: `QuantizedAdd` supports limited forms of broadcasting. More about
+// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func QuantizedAdd(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedAddAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AvgPoolGrad",
+		Type: "QuantizedAdd",
 		Input: []tf.Input{
-			orig_input_shape, grad,
+			x, y, min_x, max_x, min_y, max_y,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// StageClearAttr is an optional argument to StageClear.
-type StageClearAttr func(optionalAttr)
+// MfccAttr is an optional argument to Mfcc.
+type MfccAttr func(optionalAttr)
 
-// StageClearCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// MfccUpperFrequencyLimit sets the optional upper_frequency_limit attribute to value.
 //
-// REQUIRES: value >= 0
-func StageClearCapacity(value int64) StageClearAttr {
+// value: The highest frequency to use when calculating the
+// ceptstrum.
+// If not specified, defaults to 4000
+func MfccUpperFrequencyLimit(value float32) MfccAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["upper_frequency_limit"] = value
 	}
 }
 
-// StageClearMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// MfccLowerFrequencyLimit sets the optional lower_frequency_limit attribute to value.
 //
-// REQUIRES: value >= 0
-func StageClearMemoryLimit(value int64) StageClearAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// StageClearContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func StageClearContainer(value string) StageClearAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// StageClearSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func StageClearSharedName(value string) StageClearAttr {
+// value: The lowest frequency to use when calculating the
+// ceptstrum.
+// If not specified, defaults to 20
+func MfccLowerFrequencyLimit(value float32) MfccAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op removes all elements in the underlying container.
-//
-// Returns the created operation.
-func StageClear(scope *Scope, dtypes []tf.DataType, optional ...StageClearAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StageClear",
-
-		Attrs: attrs,
+		m["lower_frequency_limit"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// ComputeAccidentalHitsAttr is an optional argument to ComputeAccidentalHits.
-type ComputeAccidentalHitsAttr func(optionalAttr)
-
-// ComputeAccidentalHitsSeed sets the optional seed attribute to value.
+// MfccFilterbankChannelCount sets the optional filterbank_channel_count attribute to value.
 //
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func ComputeAccidentalHitsSeed(value int64) ComputeAccidentalHitsAttr {
+// value: Resolution of the Mel bank used internally.
+// If not specified, defaults to 40
+func MfccFilterbankChannelCount(value int64) MfccAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["filterbank_channel_count"] = value
 	}
 }
 
-// ComputeAccidentalHitsSeed2 sets the optional seed2 attribute to value.
+// MfccDctCoefficientCount sets the optional dct_coefficient_count attribute to value.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func ComputeAccidentalHitsSeed2(value int64) ComputeAccidentalHitsAttr {
+// value: How many output channels to produce per time slice.
+// If not specified, defaults to 13
+func MfccDctCoefficientCount(value int64) MfccAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["dct_coefficient_count"] = value
 	}
 }
 
-// Computes the ids of the positions in sampled_candidates that match true_labels.
+// Transforms a spectrogram into a form that's useful for speech recognition.
 //
-// When doing log-odds NCE, the result of this op should be passed through a
-// SparseToDense op, then added to the logits of the sampled candidates. This has
-// the effect of 'removing' the sampled labels that match the true labels by
-// making the classifier sure that they are sampled labels.
+// Mel Frequency Cepstral Coefficients are a way of representing audio data that's
+// been effective as an input feature for machine learning. They are created by
+// taking the spectrum of a spectrogram (a 'cepstrum'), and discarding some of the
+// higher frequencies that are less significant to the human ear. They have a long
+// history in the speech recognition world, and https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
+// is a good resource to learn more.
 //
 // Arguments:
-//	true_classes: The true_classes output of UnpackSparseLabels.
-//	sampled_candidates: The sampled_candidates output of CandidateSampler.
-//	num_true: Number of true labels per context.
-//
-// Returns A vector of indices corresponding to rows of true_candidates.A vector of IDs of positions in sampled_candidates that match a true_label
-// for the row with the corresponding index in indices.A vector of the same length as indices and ids, in which each element
-// is -FLOAT_MAX.
-func ComputeAccidentalHits(scope *Scope, true_classes tf.Output, sampled_candidates tf.Output, num_true int64, optional ...ComputeAccidentalHitsAttr) (indices tf.Output, ids tf.Output, weights tf.Output) {
+//	spectrogram: Typically produced by the Spectrogram op, with magnitude_squared
+// set to true.
+//	sample_rate: How many samples per second the source audio used.
+func Mfcc(scope *Scope, spectrogram tf.Output, sample_rate tf.Output, optional ...MfccAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ComputeAccidentalHits",
+		Type: "Mfcc",
 		Input: []tf.Input{
-			true_classes, sampled_candidates,
+			spectrogram, sample_rate,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Computes sigmoid of `x` element-wise.
+// Given a quantized tensor described by (input, input_min, input_max), outputs a
 //
-// Specifically, `y = 1 / (1 + exp(-x))`.
-func Sigmoid(scope *Scope, x tf.Output) (y tf.Output) {
+// range that covers the actual values present in that tensor.  This op is
+// typically used to produce the requested_output_min and requested_output_max for
+// Requantize.
+//
+// Arguments:
+//
+//	input_min: The float value that the minimum quantized input value represents.
+//	input_max: The float value that the maximum quantized input value represents.
+//
+// Returns The computed min output.the computed max output.
+func RequantizationRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output) (output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Sigmoid",
+		Type: "RequantizationRange",
 		Input: []tf.Input{
-			x,
+			input, input_min, input_max,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// RandomStandardNormalAttr is an optional argument to RandomStandardNormal.
-type RandomStandardNormalAttr func(optionalAttr)
+// MapPeekAttr is an optional argument to MapPeek.
+type MapPeekAttr func(optionalAttr)
 
-// RandomStandardNormalSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
+// MapPeekCapacity sets the optional capacity attribute to value.
 // If not specified, defaults to 0
-func RandomStandardNormalSeed(value int64) RandomStandardNormalAttr {
+//
+// REQUIRES: value >= 0
+func MapPeekCapacity(value int64) MapPeekAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["capacity"] = value
 	}
 }
 
-// RandomStandardNormalSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
+// MapPeekMemoryLimit sets the optional memory_limit attribute to value.
 // If not specified, defaults to 0
-func RandomStandardNormalSeed2(value int64) RandomStandardNormalAttr {
+//
+// REQUIRES: value >= 0
+func MapPeekMemoryLimit(value int64) MapPeekAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["memory_limit"] = value
 	}
 }
 
-// Outputs random values from a normal distribution.
-//
-// The generated values will have mean 0 and standard deviation 1.
-//
-// Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
+// MapPeekContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapPeekContainer(value string) MapPeekAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MapPeekSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapPeekSharedName(value string) MapPeekAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op peeks at the values at the specified key.  If the
 //
-// Returns A tensor of the specified shape filled with random normal values.
-func RandomStandardNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomStandardNormalAttr) (output tf.Output) {
+// underlying container does not contain this key
+// this op will block until it does.
+func MapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...MapPeekAttr) (values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomStandardNormal",
+		Type: "MapPeek",
 		Input: []tf.Input{
-			shape,
+			key, indices,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// FusedBatchNormAttr is an optional argument to FusedBatchNorm.
-type FusedBatchNormAttr func(optionalAttr)
-
-// FusedBatchNormEpsilon sets the optional epsilon attribute to value.
-//
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormEpsilon(value float32) FusedBatchNormAttr {
-	return func(m optionalAttr) {
-		m["epsilon"] = value
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// FusedBatchNormDataFormat sets the optional data_format attribute to value.
-//
-// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormDataFormat(value string) FusedBatchNormAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
+	var idx int
+	var err error
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("MapPeek", err)
+		return
 	}
+	return values
 }
 
-// FusedBatchNormIsTraining sets the optional is_training attribute to value.
+// Looks up keys in a table, outputs the corresponding values.
 //
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormIsTraining(value bool) FusedBatchNormAttr {
-	return func(m optionalAttr) {
-		m["is_training"] = value
-	}
-}
-
-// Batch normalization.
+// The tensor `keys` must of the same type as the keys of the table.
+// The output `values` is of the type of the table values.
 //
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+// The scalar `default_value` is the value output for keys not present in the
+// table. It must also be of the same type as the table values.
 //
 // Arguments:
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	offset: A 1D Tensor for offset, to shift to the normalized x.
-//	mean: A 1D Tensor for population mean. Used for inference only;
-// must be empty for training.
-//	variance: A 1D Tensor for population variance. Used for inference only;
-// must be empty for training.
+//	table_handle: Handle to the table.
+//	keys: Any shape.  Keys to look up.
 //
-// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
-// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
-// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
-// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
-// in the cuDNN case), to be reused in the gradient computation.
-func FusedBatchNorm(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormAttr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
+//
+// Returns Same shape as `keys`.  Values found in the table, or `default_values`
+// for missing keys.
+func LookupTableFindV2(scope *Scope, table_handle tf.Output, keys tf.Output, default_value tf.Output) (values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "FusedBatchNorm",
+		Type: "LookupTableFindV2",
 		Input: []tf.Input{
-			x, scale, offset, mean, variance,
+			table_handle, keys, default_value,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return op.Output(0)
 }
 
-// Computes tan of x element-wise.
-func Tan(scope *Scope, x tf.Output) (y tf.Output) {
+// Bucketizes 'input' based on 'boundaries'.
+//
+// For example, if the inputs are
+//     boundaries = [0, 10, 100]
+//     input = [[-5, 10000]
+//              [150,   10]
+//              [5,    100]]
+//
+// then the output will be
+//     output = [[0, 3]
+//               [3, 2]
+//               [1, 3]]
+//
+// Arguments:
+//	input: Any shape of Tensor contains with int or float type.
+//	boundaries: A sorted list of floats gives the boundary of the buckets.
+//
+// Returns Same shape with 'input', each value of input replaced with bucket index.
+//
+// @compatibility(numpy)
+// Equivalent to np.digitize.
+// @end_compatibility
+func Bucketize(scope *Scope, input tf.Output, boundaries []float32) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"boundaries": boundaries}
 	opspec := tf.OpSpec{
-		Type: "Tan",
+		Type: "Bucketize",
 		Input: []tf.Input{
-			x,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// FusedBatchNormV2Attr is an optional argument to FusedBatchNormV2.
-type FusedBatchNormV2Attr func(optionalAttr)
+// EncodePngAttr is an optional argument to EncodePng.
+type EncodePngAttr func(optionalAttr)
 
-// FusedBatchNormV2Epsilon sets the optional epsilon attribute to value.
+// EncodePngCompression sets the optional compression attribute to value.
 //
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormV2Epsilon(value float32) FusedBatchNormV2Attr {
+// value: Compression level.
+// If not specified, defaults to -1
+func EncodePngCompression(value int64) EncodePngAttr {
 	return func(m optionalAttr) {
-		m["epsilon"] = value
+		m["compression"] = value
 	}
 }
 
-// FusedBatchNormV2DataFormat sets the optional data_format attribute to value.
+// PNG-encode an image.
 //
-// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormV2DataFormat(value string) FusedBatchNormV2Attr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// FusedBatchNormV2IsTraining sets the optional is_training attribute to value.
+// `image` is a 3-D uint8 or uint16 Tensor of shape `[height, width, channels]`
+// where `channels` is:
 //
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormV2IsTraining(value bool) FusedBatchNormV2Attr {
-	return func(m optionalAttr) {
-		m["is_training"] = value
-	}
-}
-
-// Batch normalization.
+// *   1: for grayscale.
+// *   2: for grayscale + alpha.
+// *   3: for RGB.
+// *   4: for RGBA.
 //
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+// The ZLIB compression level, `compression`, can be -1 for the PNG-encoder
+// default or a value from 0 to 9.  9 is the highest compression level, generating
+// the smallest output, but is slower.
 //
 // Arguments:
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	offset: A 1D Tensor for offset, to shift to the normalized x.
-//	mean: A 1D Tensor for population mean. Used for inference only;
-// must be empty for training.
-//	variance: A 1D Tensor for population variance. Used for inference only;
-// must be empty for training.
+//	image: 3-D with shape `[height, width, channels]`.
 //
-// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
-// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
-// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
-// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
-// in the cuDNN case), to be reused in the gradient computation.
-func FusedBatchNormV2(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormV2Attr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
+// Returns 0-D. PNG-encoded image.
+func EncodePng(scope *Scope, image tf.Output, optional ...EncodePngAttr) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -16482,387 +18334,303 @@ func FusedBatchNormV2(scope *Scope, x tf.Output, scale tf.Output, offset tf.Outp
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FusedBatchNormV2",
+		Type: "EncodePng",
 		Input: []tf.Input{
-			x, scale, offset, mean, variance,
+			image,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return op.Output(0)
 }
 
-// MultinomialAttr is an optional argument to Multinomial.
-type MultinomialAttr func(optionalAttr)
-
-// MultinomialSeed sets the optional seed attribute to value.
+// Updates the table to associates keys with values.
 //
-// value: If either seed or seed2 is set to be non-zero, the internal random number
-// generator is seeded by the given seed.  Otherwise, a random seed is used.
-// If not specified, defaults to 0
-func MultinomialSeed(value int64) MultinomialAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// MultinomialSeed2 sets the optional seed2 attribute to value.
+// The tensor `keys` must be of the same type as the keys of the table.
+// The tensor `values` must be of the type of the table values.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func MultinomialSeed2(value int64) MultinomialAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
+// Arguments:
+//	table_handle: Handle to the table.
+//	keys: Any shape.  Keys to look up.
+//	values: Values to associate with keys.
+//
+// Returns the created operation.
+func LookupTableInsertV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// MultinomialOutputDtype sets the optional output_dtype attribute to value.
-// If not specified, defaults to DT_INT64
-func MultinomialOutputDtype(value tf.DataType) MultinomialAttr {
-	return func(m optionalAttr) {
-		m["output_dtype"] = value
+	opspec := tf.OpSpec{
+		Type: "LookupTableInsertV2",
+		Input: []tf.Input{
+			table_handle, keys, values,
+		},
 	}
+	return scope.AddOperation(opspec)
 }
 
-// Draws samples from a multinomial distribution.
+// Returns element-wise smallest integer in not less than x.
+func Ceil(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Ceil",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the number of elements in the given table.
 //
 // Arguments:
-//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
-// represents the unnormalized log probabilities for all classes.
-//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
+//	table_handle: Handle to the table.
 //
-// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
-// contains the drawn class labels with range `[0, num_classes)`.
-func Multinomial(scope *Scope, logits tf.Output, num_samples tf.Output, optional ...MultinomialAttr) (output tf.Output) {
+// Returns Scalar that contains number of elements in the table.
+func LookupTableSizeV2(scope *Scope, table_handle tf.Output) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Multinomial",
+		Type: "LookupTableSizeV2",
 		Input: []tf.Input{
-			logits, num_samples,
+			table_handle,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// EncodeJpegAttr is an optional argument to EncodeJpeg.
-type EncodeJpegAttr func(optionalAttr)
+// ResizeBilinearGradAttr is an optional argument to ResizeBilinearGrad.
+type ResizeBilinearGradAttr func(optionalAttr)
 
-// EncodeJpegFormat sets the optional format attribute to value.
+// ResizeBilinearGradAlignCorners sets the optional align_corners attribute to value.
 //
-// value: Per pixel image format.
-// If not specified, defaults to ""
-func EncodeJpegFormat(value string) EncodeJpegAttr {
+// value: If true, rescale grads by (orig_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of grads and original_image. If false, rescale by
+// orig_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
+func ResizeBilinearGradAlignCorners(value bool) ResizeBilinearGradAttr {
 	return func(m optionalAttr) {
-		m["format"] = value
+		m["align_corners"] = value
 	}
 }
 
-// EncodeJpegQuality sets the optional quality attribute to value.
+// Computes the gradient of bilinear interpolation.
 //
-// value: Quality of the compression from 0 to 100 (higher is better and slower).
-// If not specified, defaults to 95
-func EncodeJpegQuality(value int64) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["quality"] = value
-	}
-}
-
-// EncodeJpegProgressive sets the optional progressive attribute to value.
+// Arguments:
+//	grads: 4-D with shape `[batch, height, width, channels]`.
+//	original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
+// The image tensor that was resized.
 //
-// value: If True, create a JPEG that loads progressively (coarse to fine).
-// If not specified, defaults to false
-func EncodeJpegProgressive(value bool) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["progressive"] = value
+// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`.
+// Gradients with respect to the input image. Input image must have been
+// float or double.
+func ResizeBilinearGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBilinearGradAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
 	}
+	opspec := tf.OpSpec{
+		Type: "ResizeBilinearGrad",
+		Input: []tf.Input{
+			grads, original_image,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// EncodeJpegOptimizeSize sets the optional optimize_size attribute to value.
+// Outputs all keys and values in the table.
 //
-// value: If True, spend CPU/RAM to reduce size with no quality change.
-// If not specified, defaults to false
-func EncodeJpegOptimizeSize(value bool) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["optimize_size"] = value
+// Arguments:
+//	table_handle: Handle to the table.
+//
+//
+//
+// Returns Vector of all keys present in the table.Tensor of all values in the table. Indexed in parallel with `keys`.
+func LookupTableExportV2(scope *Scope, table_handle tf.Output, Tkeys tf.DataType, Tvalues tf.DataType) (keys tf.Output, values tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"Tkeys": Tkeys, "Tvalues": Tvalues}
+	opspec := tf.OpSpec{
+		Type: "LookupTableExportV2",
+		Input: []tf.Input{
+			table_handle,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
 }
 
-// EncodeJpegChromaDownsampling sets the optional chroma_downsampling attribute to value.
+// Replaces the contents of the table with the specified keys and values.
 //
-// value: See http://en.wikipedia.org/wiki/Chroma_subsampling.
-// If not specified, defaults to true
-func EncodeJpegChromaDownsampling(value bool) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["chroma_downsampling"] = value
+// The tensor `keys` must be of the same type as the keys of the table.
+// The tensor `values` must be of the type of the table values.
+//
+// Arguments:
+//	table_handle: Handle to the table.
+//	keys: Any shape.  Keys to look up.
+//	values: Values to associate with keys.
+//
+// Returns the created operation.
+func LookupTableImportV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LookupTableImportV2",
+		Input: []tf.Input{
+			table_handle, keys, values,
+		},
 	}
+	return scope.AddOperation(opspec)
 }
 
-// EncodeJpegDensityUnit sets the optional density_unit attribute to value.
+// MapUnstageNoKeyAttr is an optional argument to MapUnstageNoKey.
+type MapUnstageNoKeyAttr func(optionalAttr)
+
+// MapUnstageNoKeyCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// value: Unit used to specify `x_density` and `y_density`:
-// pixels per inch (`'in'`) or centimeter (`'cm'`).
-// If not specified, defaults to "in"
-func EncodeJpegDensityUnit(value string) EncodeJpegAttr {
+// REQUIRES: value >= 0
+func MapUnstageNoKeyCapacity(value int64) MapUnstageNoKeyAttr {
 	return func(m optionalAttr) {
-		m["density_unit"] = value
+		m["capacity"] = value
 	}
 }
 
-// EncodeJpegXDensity sets the optional x_density attribute to value.
+// MapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// value: Horizontal pixels per density unit.
-// If not specified, defaults to 300
-func EncodeJpegXDensity(value int64) EncodeJpegAttr {
+// REQUIRES: value >= 0
+func MapUnstageNoKeyMemoryLimit(value int64) MapUnstageNoKeyAttr {
 	return func(m optionalAttr) {
-		m["x_density"] = value
+		m["memory_limit"] = value
 	}
 }
 
-// EncodeJpegYDensity sets the optional y_density attribute to value.
-//
-// value: Vertical pixels per density unit.
-// If not specified, defaults to 300
-func EncodeJpegYDensity(value int64) EncodeJpegAttr {
+// MapUnstageNoKeyContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapUnstageNoKeyContainer(value string) MapUnstageNoKeyAttr {
 	return func(m optionalAttr) {
-		m["y_density"] = value
+		m["container"] = value
 	}
 }
 
-// EncodeJpegXmpMetadata sets the optional xmp_metadata attribute to value.
-//
-// value: If not empty, embed this XMP metadata in the image header.
+// MapUnstageNoKeySharedName sets the optional shared_name attribute to value.
 // If not specified, defaults to ""
-func EncodeJpegXmpMetadata(value string) EncodeJpegAttr {
+func MapUnstageNoKeySharedName(value string) MapUnstageNoKeyAttr {
 	return func(m optionalAttr) {
-		m["xmp_metadata"] = value
+		m["shared_name"] = value
 	}
 }
 
-// JPEG-encode an image.
-//
-// `image` is a 3-D uint8 Tensor of shape `[height, width, channels]`.
-//
-// The attr `format` can be used to override the color format of the encoded
-// output.  Values can be:
-//
-// *   `''`: Use a default format based on the number of channels in the image.
-// *   `grayscale`: Output a grayscale JPEG image.  The `channels` dimension
-//     of `image` must be 1.
-// *   `rgb`: Output an RGB JPEG image. The `channels` dimension
-//     of `image` must be 3.
-//
-// If `format` is not specified or is the empty string, a default format is picked
-// in function of the number of channels in `image`:
-//
-// *   1: Output a grayscale image.
-// *   3: Output an RGB image.
-//
-// Arguments:
-//	image: 3-D with shape `[height, width, channels]`.
+// Op removes and returns a random (key, value)
 //
-// Returns 0-D. JPEG-encoded image.
-func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (contents tf.Output) {
+// from the underlying container.   If the underlying container
+// does not contain elements, the op will block until it does.
+func MapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "EncodeJpeg",
+		Type: "MapUnstageNoKey",
 		Input: []tf.Input{
-			image,
+			indices,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	key = op.Output(idx)
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("MapUnstageNoKey", err)
+		return
+	}
+	return key, values
 }
 
-// MaxPoolGradAttr is an optional argument to MaxPoolGrad.
-type MaxPoolGradAttr func(optionalAttr)
+// HashTableV2Attr is an optional argument to HashTableV2.
+type HashTableV2Attr func(optionalAttr)
 
-// MaxPoolGradDataFormat sets the optional data_format attribute to value.
+// HashTableV2Container sets the optional container attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolGradDataFormat(value string) MaxPoolGradAttr {
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func HashTableV2Container(value string) HashTableV2Attr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["container"] = value
 	}
 }
 
-// Computes gradients of the maxpooling function.
+// HashTableV2SharedName sets the optional shared_name attribute to value.
 //
-// Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
-//
-// Returns Gradients w.r.t. the input to `max_pool`.
-func MaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MaxPoolGrad",
-		Input: []tf.Input{
-			orig_input, orig_output, grad,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// CropAndResizeAttr is an optional argument to CropAndResize.
-type CropAndResizeAttr func(optionalAttr)
-
-// CropAndResizeMethod sets the optional method attribute to value.
-//
-// value: A string specifying the interpolation method. Only 'bilinear' is
-// supported for now.
-// If not specified, defaults to "bilinear"
-func CropAndResizeMethod(value string) CropAndResizeAttr {
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
+// If not specified, defaults to ""
+func HashTableV2SharedName(value string) HashTableV2Attr {
 	return func(m optionalAttr) {
-		m["method"] = value
+		m["shared_name"] = value
 	}
 }
 
-// CropAndResizeExtrapolationValue sets the optional extrapolation_value attribute to value.
+// HashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
 //
-// value: Value used for extrapolation, when applicable.
-// If not specified, defaults to 0
-func CropAndResizeExtrapolationValue(value float32) CropAndResizeAttr {
+// value: If true and shared_name is empty, the table is shared
+// using the node name.
+// If not specified, defaults to false
+func HashTableV2UseNodeNameSharing(value bool) HashTableV2Attr {
 	return func(m optionalAttr) {
-		m["extrapolation_value"] = value
+		m["use_node_name_sharing"] = value
 	}
 }
 
-// Extracts crops from the input image tensor and bilinearly resizes them (possibly
-//
-// with aspect ratio change) to a common output size specified by `crop_size`. This
-// is more general than the `crop_to_bounding_box` op which extracts a fixed size
-// slice from the input image and does not allow resizing or aspect ratio change.
+// Creates a non-initialized hash table.
 //
-// Returns a tensor with `crops` from the input `image` at positions defined at the
-// bounding box locations in `boxes`. The cropped boxes are all resized (with
-// bilinear interpolation) to a fixed `size = [crop_height, crop_width]`. The
-// result is a 4-D tensor `[num_boxes, crop_height, crop_width, depth]`. The
-// resizing is corner aligned. In particular, if `boxes = [[0, 0, 1, 1]]`, the
-// method will give identical results to using `tf.image.resize_bilinear()`
-// with `align_corners=True`.
+// This op creates a hash table, specifying the type of its keys and values.
+// Before using the table you will have to initialize it.  After initialization the
+// table will be immutable.
 //
 // Arguments:
-//	image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
-// Both `image_height` and `image_width` need to be positive.
-//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
-// specifies the coordinates of a box in the `box_ind[i]` image and is specified
-// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
-// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
-// `[0, 1]` interval of normalized image height is mapped to
-// `[0, image_height - 1]` in image height coordinates. We do allow `y1` > `y2`, in
-// which case the sampled crop is an up-down flipped version of the original
-// image. The width dimension is treated similarly. Normalized coordinates
-// outside the `[0, 1]` range are allowed, in which case we use
-// `extrapolation_value` to extrapolate the input image values.
-//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
-// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
-//	crop_size: A 1-D tensor of 2 elements, `size = [crop_height, crop_width]`. All
-// cropped image patches are resized to this size. The aspect ratio of the image
-// content is not preserved. Both `crop_height` and `crop_width` need to be
-// positive.
+//	key_dtype: Type of the table keys.
+//	value_dtype: Type of the table values.
 //
-// Returns A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
-func CropAndResize(scope *Scope, image tf.Output, boxes tf.Output, box_ind tf.Output, crop_size tf.Output, optional ...CropAndResizeAttr) (crops tf.Output) {
+// Returns Handle to a table.
+func HashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...HashTableV2Attr) (table_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CropAndResize",
-		Input: []tf.Input{
-			image, boxes, box_ind, crop_size,
-		},
+		Type: "HashTableV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyPowerSignAttr is an optional argument to ResourceApplyPowerSign.
-type ResourceApplyPowerSignAttr func(optionalAttr)
-
-// ResourceApplyPowerSignUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and m tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyPowerSignUseLocking(value bool) ResourceApplyPowerSignAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the AddSign update.
-//
-// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-// update <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g
-// variable <- variable - lr_t * update
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	logbase: Must be a scalar.
-//	sign_decay: Must be a scalar.
-//	beta: Must be a scalar.
-//	grad: The gradient.
-//
-// Returns the created operation.
-func ResourceApplyPowerSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, logbase tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyPowerSignAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyPowerSign",
-		Input: []tf.Input{
-			var_, m, lr, logbase, sign_decay, beta, grad,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
 // MutableHashTableV2Attr is an optional argument to MutableHashTableV2.
 type MutableHashTableV2Attr func(optionalAttr)
 
@@ -16927,31 +18695,159 @@ func MutableHashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.Data
 	return op.Output(0)
 }
 
-// Deprecated. Disallowed in GraphDef version >= 2.
+// DequantizeAttr is an optional argument to Dequantize.
+type DequantizeAttr func(optionalAttr)
+
+// DequantizeMode sets the optional mode attribute to value.
+// If not specified, defaults to "MIN_COMBINED"
+func DequantizeMode(value string) DequantizeAttr {
+	return func(m optionalAttr) {
+		m["mode"] = value
+	}
+}
+
+// Dequantize the 'input' tensor into a float Tensor.
 //
-// DEPRECATED at GraphDef version 2: Use AdjustContrastv2 instead
-func AdjustContrast(scope *Scope, images tf.Output, contrast_factor tf.Output, min_value tf.Output, max_value tf.Output) (output tf.Output) {
+// [min_range, max_range] are scalar floats that specify the range for
+// the 'input' data. The 'mode' attribute controls exactly which calculations are
+// used to convert the float values to their quantized equivalents.
+//
+// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
+//
+// ```
+// if T == qint8, in[i] += (range(T) + 1)/ 2.0
+// out[i] = min_range + (in[i]* (max_range - min_range) / range(T))
+// ```
+// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
+//
+// *MIN_COMBINED Mode Example*
+//
+// If the input comes from a QuantizedRelu6, the output type is
+// quint8 (range of 0-255) but the possible range of QuantizedRelu6 is
+// 0-6.  The min_range and max_range values are therefore 0.0 and 6.0.
+// Dequantize on quint8 will take each value, cast to float, and multiply
+// by 6 / 255.
+// Note that if quantizedtype is qint8, the operation will additionally add
+// each value by 128 prior to casting.
+//
+// If the mode is 'MIN_FIRST', then this approach is used:
+//
+// ```c++
+// num_discrete_values = 1 << (# of bits in T)
+// range_adjust = num_discrete_values / (num_discrete_values - 1)
+// range = (range_max - range_min) * range_adjust
+// range_scale = range / num_discrete_values
+// const double offset_input = static_cast<double>(input) - lowest_quantized;
+// result = range_min + ((input - numeric_limits<T>::min()) * range_scale)
+// ```
+//
+// *SCALED mode Example*
+//
+// `SCALED` mode matches the quantization approach used in
+// `QuantizeAndDequantize{V2|V3}`.
+//
+// If the mode is `SCALED`, we do not use the full range of the output type,
+// choosing to elide the lowest possible value for symmetry (e.g., output range is
+// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
+// 0.
+//
+// We first find the range of values in our tensor. The
+// range we use is always centered on 0, so we find m such that
+// ```c++
+//   m = max(abs(input_min), abs(input_max))
+// ```
+//
+// Our input tensor range is then `[-m, m]`.
+//
+// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
+// If T is signed, this is
+// ```
+//   num_bits = sizeof(T) * 8
+//   [min_fixed, max_fixed] =
+//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
+// ```
+//
+// Otherwise, if T is unsigned, the fixed-point range is
+// ```
+//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
+// ```
+//
+// From this we compute our scaling factor, s:
+// ```c++
+//   s = (2 * m) / (max_fixed - min_fixed)
+// ```
+//
+// Now we can dequantize the elements of our tensor:
+// ```c++
+// result = input * s
+// ```
+//
+// Arguments:
+//
+//	min_range: The minimum scalar value possibly produced for the input.
+//	max_range: The maximum scalar value possibly produced for the input.
+func Dequantize(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, optional ...DequantizeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "AdjustContrast",
+		Type: "Dequantize",
 		Input: []tf.Input{
-			images, contrast_factor, min_value, max_value,
+			input, min_range, max_range,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Table initializer that takes two tensors for keys and values respectively.
-//
-// Arguments:
-//	table_handle: Handle to a table which will be initialized.
-//	keys: Keys of type Tkey.
-//	values: Values of type Tval.
+// Flips all bits elementwise.
 //
-// Returns the created operation.
+// The result will have exactly those bits set, that are not set in `x`. The
+// computation is performed on the underlying representation of x.
+func Invert(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Invert",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Deprecated. Disallowed in GraphDef version >= 2.
+//
+// DEPRECATED at GraphDef version 2: Use AdjustContrastv2 instead
+func AdjustContrast(scope *Scope, images tf.Output, contrast_factor tf.Output, min_value tf.Output, max_value tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "AdjustContrast",
+		Input: []tf.Input{
+			images, contrast_factor, min_value, max_value,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Table initializer that takes two tensors for keys and values respectively.
+//
+// Arguments:
+//	table_handle: Handle to a table which will be initialized.
+//	keys: Keys of type Tkey.
+//	values: Values of type Tval.
+//
+// Returns the created operation.
 func InitializeTableV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
@@ -17533,30 +19429,6 @@ func AddSparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values
 	return op.Output(0)
 }
 
-// Writes a `Summary` protocol buffer with scalar values.
-//
-// The input `tag` and `value` must have the scalars.
-//
-// Arguments:
-//	writer: A handle to a summary writer.
-//	step: The step to write the summary for.
-//	tag: Tag for the summary.
-//	value: Value for the summary.
-//
-// Returns the created operation.
-func WriteScalarSummary(scope *Scope, writer tf.Output, step tf.Output, tag tf.Output, value tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "WriteScalarSummary",
-		Input: []tf.Input{
-			writer, step, tag, value,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
 // Computes the matrix exponential of one or more square matrices:
 //
 // exp(A) = \sum_{n=0}^\infty A^n/n!
@@ -17890,101 +19762,6 @@ func AdjustSaturation(scope *Scope, images tf.Output, scale tf.Output) (output t
 	return op.Output(0)
 }
 
-// Elementwise computes the bitwise OR of `x` and `y`.
-//
-// The result will have those bits set, that are set in `x`, `y` or both. The
-// computation is performed on the underlying representations of `x` and `y`.
-func BitwiseOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BitwiseOr",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// MatrixSolveLsAttr is an optional argument to MatrixSolveLs.
-type MatrixSolveLsAttr func(optionalAttr)
-
-// MatrixSolveLsFast sets the optional fast attribute to value.
-// If not specified, defaults to true
-func MatrixSolveLsFast(value bool) MatrixSolveLsAttr {
-	return func(m optionalAttr) {
-		m["fast"] = value
-	}
-}
-
-// Solves one or more linear least-squares problems.
-//
-// `matrix` is a tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form real or complex matrices of size `[M, N]`. `Rhs` is a tensor of the same
-// type as `matrix` and shape `[..., M, K]`.
-// The output is a tensor shape `[..., N, K]` where each output matrix solves
-// each of the equations
-// `matrix[..., :, :]` * `output[..., :, :]` = `rhs[..., :, :]`
-// in the least squares sense.
-//
-// We use the following notation for (complex) matrix and right-hand sides
-// in the batch:
-//
-// `matrix`=\\(A \in \mathbb{C}^{m \times n}\\),
-// `rhs`=\\(B  \in \mathbb{C}^{m \times k}\\),
-// `output`=\\(X  \in \mathbb{C}^{n \times k}\\),
-// `l2_regularizer`=\\(\lambda \in \mathbb{R}\\).
-//
-// If `fast` is `True`, then the solution is computed by solving the normal
-// equations using Cholesky decomposition. Specifically, if \\(m \ge n\\) then
-// \\(X = (A^H A + \lambda I)^{-1} A^H B\\), which solves the least-squares
-// problem \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||A Z - B||_F^2 +
-// \lambda ||Z||_F^2\\). If \\(m \lt n\\) then `output` is computed as
-// \\(X = A^H (A A^H + \lambda I)^{-1} B\\), which (for \\(\lambda = 0\\)) is the
-// minimum-norm solution to the under-determined linear system, i.e.
-// \\(X = \mathrm{argmin}_{Z \in \mathbb{C}^{n \times k} } ||Z||_F^2 \\),
-// subject to \\(A Z = B\\). Notice that the fast path is only numerically stable
-// when \\(A\\) is numerically full rank and has a condition number
-// \\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or\\(\lambda\\) is
-// sufficiently large.
-//
-// If `fast` is `False` an algorithm based on the numerically robust complete
-// orthogonal decomposition is used. This computes the minimum-norm
-// least-squares solution, even when \\(A\\) is rank deficient. This path is
-// typically 6-7 times slower than the fast path. If `fast` is `False` then
-// `l2_regularizer` is ignored.
-//
-// Arguments:
-//	matrix: Shape is `[..., M, N]`.
-//	rhs: Shape is `[..., M, K]`.
-//	l2_regularizer: Scalar tensor.
-//
-// @compatibility(numpy)
-// Equivalent to np.linalg.lstsq
-// @end_compatibility
-//
-// Returns Shape is `[..., N, K]`.
-func MatrixSolveLs(scope *Scope, matrix tf.Output, rhs tf.Output, l2_regularizer tf.Output, optional ...MatrixSolveLsAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MatrixSolveLs",
-		Input: []tf.Input{
-			matrix, rhs, l2_regularizer,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // SvdAttr is an optional argument to Svd.
 type SvdAttr func(optionalAttr)
 
@@ -18343,254 +20120,223 @@ func ShardedFilename(scope *Scope, basename tf.Output, shard tf.Output, num_shar
 	return op.Output(0)
 }
 
-// Generate a glob pattern matching all sharded file names.
-func ShardedFilespec(scope *Scope, basename tf.Output, num_shards tf.Output) (filename tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ShardedFilespec",
-		Input: []tf.Input{
-			basename, num_shards,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// TextLineReaderV2Attr is an optional argument to TextLineReaderV2.
-type TextLineReaderV2Attr func(optionalAttr)
-
-// TextLineReaderV2SkipHeaderLines sets the optional skip_header_lines attribute to value.
+// BatchToSpace for N-D tensors of type T.
 //
-// value: Number of lines to skip from the beginning of every file.
-// If not specified, defaults to 0
-func TextLineReaderV2SkipHeaderLines(value int64) TextLineReaderV2Attr {
-	return func(m optionalAttr) {
-		m["skip_header_lines"] = value
-	}
-}
-
-// TextLineReaderV2Container sets the optional container attribute to value.
+// This operation reshapes the "batch" dimension 0 into `M + 1` dimensions of shape
+// `block_shape + [batch]`, interleaves these blocks back into the grid defined by
+// the spatial dimensions `[1, ..., M]`, to obtain a result with the same rank as
+// the input.  The spatial dimensions of this intermediate result are then
+// optionally cropped according to `crops` to produce the output.  This is the
+// reverse of SpaceToBatch.  See below for a precise description.
 //
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func TextLineReaderV2Container(value string) TextLineReaderV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// TextLineReaderV2SharedName sets the optional shared_name attribute to value.
+// Arguments:
+//	input: N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,
+// where spatial_shape has M dimensions.
+//	block_shape: 1-D with shape `[M]`, all values must be >= 1.
+//	crops: 2-D with shape `[M, 2]`, all values must be >= 0.
+//   `crops[i] = [crop_start, crop_end]` specifies the amount to crop from input
+//   dimension `i + 1`, which corresponds to spatial dimension `i`.  It is
+//   required that
+//   `crop_start[i] + crop_end[i] <= block_shape[i] * input_shape[i + 1]`.
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func TextLineReaderV2SharedName(value string) TextLineReaderV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// A Reader that outputs the lines of a file delimited by '\n'.
+// This operation is equivalent to the following steps:
 //
-// Returns The handle to reference the Reader.
-func TextLineReaderV2(scope *Scope, optional ...TextLineReaderV2Attr) (reader_handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "TextLineReaderV2",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// LoadAndRemapMatrixAttr is an optional argument to LoadAndRemapMatrix.
-type LoadAndRemapMatrixAttr func(optionalAttr)
-
-// LoadAndRemapMatrixMaxRowsInMemory sets the optional max_rows_in_memory attribute to value.
+// 1. Reshape `input` to `reshaped` of shape:
+//      [block_shape[0], ..., block_shape[M-1],
+//       batch / prod(block_shape),
+//       input_shape[1], ..., input_shape[N-1]]
 //
-// value: The maximum number of rows to load from the checkpoint at
-// once. If less than or equal to 0, the entire matrix will be loaded into
-// memory. Setting this arg trades increased disk reads for lower memory usage.
-// If not specified, defaults to -1
-func LoadAndRemapMatrixMaxRowsInMemory(value int64) LoadAndRemapMatrixAttr {
-	return func(m optionalAttr) {
-		m["max_rows_in_memory"] = value
-	}
-}
-
-// Loads a 2-D (matrix) `Tensor` with name `old_tensor_name` from the checkpoint
+// 2. Permute dimensions of `reshaped` to produce `permuted` of shape
+//      [batch / prod(block_shape),
 //
-// at `ckpt_path` and potentially reorders its rows and columns using the
-// specified remappings.
+//       input_shape[1], block_shape[0],
+//       ...,
+//       input_shape[M], block_shape[M-1],
 //
-// Most users should use one of the wrapper initializers (such as
-// `tf.contrib.framework.load_and_remap_matrix_initializer`) instead of this
-// function directly.
+//       input_shape[M+1], ..., input_shape[N-1]]
 //
-// The remappings are 1-D tensors with the following properties:
+// 3. Reshape `permuted` to produce `reshaped_permuted` of shape
+//      [batch / prod(block_shape),
 //
-// * `row_remapping` must have exactly `num_rows` entries. Row `i` of the output
-//   matrix will be initialized from the row corresponding to index
-//   `row_remapping[i]` in the old `Tensor` from the checkpoint.
-// * `col_remapping` must have either 0 entries (indicating that no column
-//   reordering is needed) or `num_cols` entries. If specified, column `j` of the
-//   output matrix will be initialized from the column corresponding to index
-//   `col_remapping[j]` in the old `Tensor` from the checkpoint.
-// * A value of -1 in either of the remappings signifies a "missing" entry. In that
-//   case, values from the `initializing_values` tensor will be used to fill that
-//   missing row or column. If `row_remapping` has `r` missing entries and
-//   `col_remapping` has `c` missing entries, then the following condition must be
-//   true:
+//       input_shape[1] * block_shape[0],
+//       ...,
+//       input_shape[M] * block_shape[M-1],
 //
-// `(r * num_cols) + (c * num_rows) - (r * c) == len(initializing_values)`
+//       input_shape[M+1],
+//       ...,
+//       input_shape[N-1]]
 //
-// The remapping tensors can be generated using the GenerateVocabRemapping op.
+// 4. Crop the start and end of dimensions `[1, ..., M]` of
+//    `reshaped_permuted` according to `crops` to produce the output of shape:
+//      [batch / prod(block_shape),
 //
-// As an example, with row_remapping = [1, 0, -1], col_remapping = [0, 2, -1],
-// initializing_values = [0.5, -0.5, 0.25, -0.25, 42], and w(i, j) representing
-// the value from row i, column j of the old tensor in the checkpoint, the output
-// matrix will look like the following:
+//       input_shape[1] * block_shape[0] - crops[0,0] - crops[0,1],
+//       ...,
+//       input_shape[M] * block_shape[M-1] - crops[M-1,0] - crops[M-1,1],
 //
-// [[w(1, 0),  w(1, 2),  0.5],
-//  [w(0, 0),  w(0, 2), -0.5],
-//  [0.25,    -0.25,      42]]
+//       input_shape[M+1], ..., input_shape[N-1]]
 //
-// Arguments:
-//	ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`) from
-// which the old matrix `Tensor` will be loaded.
-//	old_tensor_name: Name of the 2-D `Tensor` to load from checkpoint.
-//	row_remapping: An int `Tensor` of row remappings (generally created by
-// `generate_vocab_remapping`).  Even if no row remapping is needed, this must
-// still be an index-valued Tensor (e.g. [0, 1, 2, ...]), or a shifted
-// index-valued `Tensor` (e.g. [8, 9, 10, ...], for partitioned `Variables`).
-//	col_remapping: An int `Tensor` of column remappings (generally created by
-// `generate_vocab_remapping`).  May be a size-0 `Tensor` if only row remapping
-// is to be done (e.g. column ordering is the same).
-//	initializing_values: A float `Tensor` containing  values to fill in for cells
-// in the output matrix that are not loaded from the checkpoint. Length must be
-// exactly the same as the number of missing / new cells.
-//	num_rows: Number of rows (length of the 1st dimension) in the output matrix.
-//	num_cols: Number of columns (length of the 2nd dimension) in the output matrix.
+// Some examples:
 //
-// Returns Output matrix containing existing values loaded from the
-// checkpoint, and with any missing values filled in from initializing_values.
-func LoadAndRemapMatrix(scope *Scope, ckpt_path tf.Output, old_tensor_name tf.Output, row_remapping tf.Output, col_remapping tf.Output, initializing_values tf.Output, num_rows int64, num_cols int64, optional ...LoadAndRemapMatrixAttr) (output_matrix tf.Output) {
+// (1) For the following input of shape `[4, 1, 1, 1]`, `block_shape = [2, 2]`, and
+//     `crops = [[0, 0], [0, 0]]`:
+//
+// ```
+// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
+// ```
+//
+// The output tensor has shape `[1, 2, 2, 1]` and value:
+//
+// ```
+// x = [[[[1], [2]], [[3], [4]]]]
+// ```
+//
+// (2) For the following input of shape `[4, 1, 1, 3]`, `block_shape = [2, 2]`, and
+//     `crops = [[0, 0], [0, 0]]`:
+//
+// ```
+// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
+// ```
+//
+// The output tensor has shape `[1, 2, 2, 3]` and value:
+//
+// ```
+// x = [[[[1, 2, 3], [4, 5, 6]],
+//       [[7, 8, 9], [10, 11, 12]]]]
+// ```
+//
+// (3) For the following input of shape `[4, 2, 2, 1]`, `block_shape = [2, 2]`, and
+//     `crops = [[0, 0], [0, 0]]`:
+//
+// ```
+// x = [[[[1], [3]], [[9], [11]]],
+//      [[[2], [4]], [[10], [12]]],
+//      [[[5], [7]], [[13], [15]]],
+//      [[[6], [8]], [[14], [16]]]]
+// ```
+//
+// The output tensor has shape `[1, 4, 4, 1]` and value:
+//
+// ```
+// x = [[[1],   [2],  [3],  [4]],
+//      [[5],   [6],  [7],  [8]],
+//      [[9],  [10], [11],  [12]],
+//      [[13], [14], [15],  [16]]]
+// ```
+//
+// (4) For the following input of shape `[8, 1, 3, 1]`, `block_shape = [2, 2]`, and
+//     `crops = [[0, 0], [2, 0]]`:
+//
+// ```
+// x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
+//      [[[0], [2], [4]]], [[[0], [10], [12]]],
+//      [[[0], [5], [7]]], [[[0], [13], [15]]],
+//      [[[0], [6], [8]]], [[[0], [14], [16]]]]
+// ```
+//
+// The output tensor has shape `[2, 2, 4, 1]` and value:
+//
+// ```
+// x = [[[[1],   [2],  [3],  [4]],
+//       [[5],   [6],  [7],  [8]]],
+//      [[[9],  [10], [11],  [12]],
+//       [[13], [14], [15],  [16]]]]
+// ```
+func BatchToSpaceND(scope *Scope, input tf.Output, block_shape tf.Output, crops tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_rows": num_rows, "num_cols": num_cols}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "LoadAndRemapMatrix",
+		Type: "BatchToSpaceND",
 		Input: []tf.Input{
-			ckpt_path, old_tensor_name, row_remapping, col_remapping, initializing_values,
+			input, block_shape, crops,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TFRecordReaderV2Attr is an optional argument to TFRecordReaderV2.
-type TFRecordReaderV2Attr func(optionalAttr)
+// UnpackAttr is an optional argument to Unpack.
+type UnpackAttr func(optionalAttr)
 
-// TFRecordReaderV2Container sets the optional container attribute to value.
+// UnpackAxis sets the optional axis attribute to value.
 //
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func TFRecordReaderV2Container(value string) TFRecordReaderV2Attr {
+// value: Dimension along which to unpack.  Negative values wrap around, so the
+// valid range is `[-R, R)`.
+// If not specified, defaults to 0
+func UnpackAxis(value int64) UnpackAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["axis"] = value
 	}
 }
 
-// TFRecordReaderV2SharedName sets the optional shared_name attribute to value.
+// Unpacks a given dimension of a rank-`R` tensor into `num` rank-`(R-1)` tensors.
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func TFRecordReaderV2SharedName(value string) TFRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// TFRecordReaderV2CompressionType sets the optional compression_type attribute to value.
-// If not specified, defaults to ""
-func TFRecordReaderV2CompressionType(value string) TFRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["compression_type"] = value
-	}
-}
-
-// A Reader that outputs the records from a TensorFlow Records file.
+// Unpacks `num` tensors from `value` by chipping it along the `axis` dimension.
+// For example, given a tensor of shape `(A, B, C, D)`;
 //
-// Returns The handle to reference the Reader.
-func TFRecordReaderV2(scope *Scope, optional ...TFRecordReaderV2Attr) (reader_handle tf.Output) {
+// If `axis == 0` then the i'th tensor in `output` is the slice `value[i, :, :, :]`
+//   and each tensor in `output` will have shape `(B, C, D)`. (Note that the
+//   dimension unpacked along is gone, unlike `split`).
+//
+// If `axis == 1` then the i'th tensor in `output` is the slice `value[:, i, :, :]`
+//   and each tensor in `output` will have shape `(A, C, D)`.
+// Etc.
+//
+// This is the opposite of `pack`.
+//
+// Arguments:
+//	value: 1-D or higher, with `axis` dimension size equal to `num`.
+//
+//
+// Returns The list of tensors unpacked from `value`.
+func Unpack(scope *Scope, value tf.Output, num int64, optional ...UnpackAttr) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num": num}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TFRecordReaderV2",
-
+		Type: "Unpack",
+		Input: []tf.Input{
+			value,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// QuantizeAndDequantizeV3Attr is an optional argument to QuantizeAndDequantizeV3.
-type QuantizeAndDequantizeV3Attr func(optionalAttr)
-
-// QuantizeAndDequantizeV3SignedInput sets the optional signed_input attribute to value.
-// If not specified, defaults to true
-func QuantizeAndDequantizeV3SignedInput(value bool) QuantizeAndDequantizeV3Attr {
-	return func(m optionalAttr) {
-		m["signed_input"] = value
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// QuantizeAndDequantizeV3RangeGiven sets the optional range_given attribute to value.
-// If not specified, defaults to true
-func QuantizeAndDequantizeV3RangeGiven(value bool) QuantizeAndDequantizeV3Attr {
-	return func(m optionalAttr) {
-		m["range_given"] = value
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("Unpack", err)
+		return
 	}
+	return output
 }
 
-// Quantizes then dequantizes a tensor.
+// Increments variable pointed to by 'resource' until it reaches 'limit'.
 //
-// This is almost identical to QuantizeAndDequantizeV2, except that num_bits is a
-// tensor, so its value can change during training.
-func QuantizeAndDequantizeV3(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, num_bits tf.Output, optional ...QuantizeAndDequantizeV3Attr) (output tf.Output) {
+// Arguments:
+//	resource: Should be from a scalar `Variable` node.
+//	limit: If incrementing ref would bring it above limit, instead generates an
+// 'OutOfRange' error.
+//
+//
+// Returns A copy of the input before increment. If nothing else modifies the
+// input, the values produced will all be distinct.
+func ResourceCountUpTo(scope *Scope, resource tf.Output, limit int64, T tf.DataType) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"limit": limit, "T": T}
 	opspec := tf.OpSpec{
-		Type: "QuantizeAndDequantizeV3",
+		Type: "ResourceCountUpTo",
 		Input: []tf.Input{
-			input, input_min, input_max, num_bits,
+			resource,
 		},
 		Attrs: attrs,
 	}
@@ -18598,38 +20344,79 @@ func QuantizeAndDequantizeV3(scope *Scope, input tf.Output, input_min tf.Output,
 	return op.Output(0)
 }
 
-// IdentityReaderV2Attr is an optional argument to IdentityReaderV2.
-type IdentityReaderV2Attr func(optionalAttr)
-
-// IdentityReaderV2Container sets the optional container attribute to value.
+// Delete the stack from its resource container.
+//
+// Arguments:
+//	handle: The handle to a stack.
+//
+// Returns the created operation.
+func StackCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "StackCloseV2",
+		Input: []tf.Input{
+			handle,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Generate a glob pattern matching all sharded file names.
+func ShardedFilespec(scope *Scope, basename tf.Output, num_shards tf.Output) (filename tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ShardedFilespec",
+		Input: []tf.Input{
+			basename, num_shards,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// TextLineReaderV2Attr is an optional argument to TextLineReaderV2.
+type TextLineReaderV2Attr func(optionalAttr)
+
+// TextLineReaderV2SkipHeaderLines sets the optional skip_header_lines attribute to value.
+//
+// value: Number of lines to skip from the beginning of every file.
+// If not specified, defaults to 0
+func TextLineReaderV2SkipHeaderLines(value int64) TextLineReaderV2Attr {
+	return func(m optionalAttr) {
+		m["skip_header_lines"] = value
+	}
+}
+
+// TextLineReaderV2Container sets the optional container attribute to value.
 //
 // value: If non-empty, this reader is placed in the given container.
 // Otherwise, a default container is used.
 // If not specified, defaults to ""
-func IdentityReaderV2Container(value string) IdentityReaderV2Attr {
+func TextLineReaderV2Container(value string) TextLineReaderV2Attr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// IdentityReaderV2SharedName sets the optional shared_name attribute to value.
+// TextLineReaderV2SharedName sets the optional shared_name attribute to value.
 //
 // value: If non-empty, this reader is named in the given bucket
 // with this shared_name. Otherwise, the node name is used instead.
 // If not specified, defaults to ""
-func IdentityReaderV2SharedName(value string) IdentityReaderV2Attr {
+func TextLineReaderV2SharedName(value string) TextLineReaderV2Attr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// A Reader that outputs the queued work as both the key and value.
-//
-// To use, enqueue strings in a Queue.  ReaderRead will take the front
-// work string and output (work, work).
+// A Reader that outputs the lines of a file delimited by '\n'.
 //
 // Returns The handle to reference the Reader.
-func IdentityReaderV2(scope *Scope, optional ...IdentityReaderV2Attr) (reader_handle tf.Output) {
+func TextLineReaderV2(scope *Scope, optional ...TextLineReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -18638,7 +20425,7 @@ func IdentityReaderV2(scope *Scope, optional ...IdentityReaderV2Attr) (reader_ha
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "IdentityReaderV2",
+		Type: "TextLineReaderV2",
 
 		Attrs: attrs,
 	}
@@ -18646,133 +20433,366 @@ func IdentityReaderV2(scope *Scope, optional ...IdentityReaderV2Attr) (reader_ha
 	return op.Output(0)
 }
 
-// ResourceApplyGradientDescentAttr is an optional argument to ResourceApplyGradientDescent.
-type ResourceApplyGradientDescentAttr func(optionalAttr)
+// LoadAndRemapMatrixAttr is an optional argument to LoadAndRemapMatrix.
+type LoadAndRemapMatrixAttr func(optionalAttr)
 
-// ResourceApplyGradientDescentUseLocking sets the optional use_locking attribute to value.
+// LoadAndRemapMatrixMaxRowsInMemory sets the optional max_rows_in_memory attribute to value.
 //
-// value: If `True`, the subtraction will be protected by a lock;
-// otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyGradientDescentUseLocking(value bool) ResourceApplyGradientDescentAttr {
+// value: The maximum number of rows to load from the checkpoint at
+// once. If less than or equal to 0, the entire matrix will be loaded into
+// memory. Setting this arg trades increased disk reads for lower memory usage.
+// If not specified, defaults to -1
+func LoadAndRemapMatrixMaxRowsInMemory(value int64) LoadAndRemapMatrixAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["max_rows_in_memory"] = value
 	}
 }
 
-// Update '*var' by subtracting 'alpha' * 'delta' from it.
+// Loads a 2-D (matrix) `Tensor` with name `old_tensor_name` from the checkpoint
+//
+// at `ckpt_path` and potentially reorders its rows and columns using the
+// specified remappings.
+//
+// Most users should use one of the wrapper initializers (such as
+// `tf.contrib.framework.load_and_remap_matrix_initializer`) instead of this
+// function directly.
+//
+// The remappings are 1-D tensors with the following properties:
+//
+// * `row_remapping` must have exactly `num_rows` entries. Row `i` of the output
+//   matrix will be initialized from the row corresponding to index
+//   `row_remapping[i]` in the old `Tensor` from the checkpoint.
+// * `col_remapping` must have either 0 entries (indicating that no column
+//   reordering is needed) or `num_cols` entries. If specified, column `j` of the
+//   output matrix will be initialized from the column corresponding to index
+//   `col_remapping[j]` in the old `Tensor` from the checkpoint.
+// * A value of -1 in either of the remappings signifies a "missing" entry. In that
+//   case, values from the `initializing_values` tensor will be used to fill that
+//   missing row or column. If `row_remapping` has `r` missing entries and
+//   `col_remapping` has `c` missing entries, then the following condition must be
+//   true:
+//
+// `(r * num_cols) + (c * num_rows) - (r * c) == len(initializing_values)`
+//
+// The remapping tensors can be generated using the GenerateVocabRemapping op.
+//
+// As an example, with row_remapping = [1, 0, -1], col_remapping = [0, 2, -1],
+// initializing_values = [0.5, -0.5, 0.25, -0.25, 42], and w(i, j) representing
+// the value from row i, column j of the old tensor in the checkpoint, the output
+// matrix will look like the following:
+//
+// [[w(1, 0),  w(1, 2),  0.5],
+//  [w(0, 0),  w(0, 2), -0.5],
+//  [0.25,    -0.25,      42]]
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	alpha: Scaling factor. Must be a scalar.
-//	delta: The change.
+//	ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`) from
+// which the old matrix `Tensor` will be loaded.
+//	old_tensor_name: Name of the 2-D `Tensor` to load from checkpoint.
+//	row_remapping: An int `Tensor` of row remappings (generally created by
+// `generate_vocab_remapping`).  Even if no row remapping is needed, this must
+// still be an index-valued Tensor (e.g. [0, 1, 2, ...]), or a shifted
+// index-valued `Tensor` (e.g. [8, 9, 10, ...], for partitioned `Variables`).
+//	col_remapping: An int `Tensor` of column remappings (generally created by
+// `generate_vocab_remapping`).  May be a size-0 `Tensor` if only row remapping
+// is to be done (e.g. column ordering is the same).
+//	initializing_values: A float `Tensor` containing  values to fill in for cells
+// in the output matrix that are not loaded from the checkpoint. Length must be
+// exactly the same as the number of missing / new cells.
+//	num_rows: Number of rows (length of the 1st dimension) in the output matrix.
+//	num_cols: Number of columns (length of the 2nd dimension) in the output matrix.
 //
-// Returns the created operation.
-func ResourceApplyGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, delta tf.Output, optional ...ResourceApplyGradientDescentAttr) (o *tf.Operation) {
+// Returns Output matrix containing existing values loaded from the
+// checkpoint, and with any missing values filled in from initializing_values.
+func LoadAndRemapMatrix(scope *Scope, ckpt_path tf.Output, old_tensor_name tf.Output, row_remapping tf.Output, col_remapping tf.Output, initializing_values tf.Output, num_rows int64, num_cols int64, optional ...LoadAndRemapMatrixAttr) (output_matrix tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_rows": num_rows, "num_cols": num_cols}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyGradientDescent",
+		Type: "LoadAndRemapMatrix",
 		Input: []tf.Input{
-			var_, alpha, delta,
+			ckpt_path, old_tensor_name, row_remapping, col_remapping, initializing_values,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns the next record (key, value pair) produced by a Reader.
-//
-// Will dequeue from the input queue if necessary (e.g. when the
-// Reader needs to start reading from a new file since it has finished
-// with the previous file).
-//
-// Arguments:
-//	reader_handle: Handle to a Reader.
-//	queue_handle: Handle to a Queue, with string work items.
+// TFRecordReaderV2Attr is an optional argument to TFRecordReaderV2.
+type TFRecordReaderV2Attr func(optionalAttr)
+
+// TFRecordReaderV2Container sets the optional container attribute to value.
 //
-// Returns A scalar.A scalar.
-func ReaderReadV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output) (key tf.Output, value tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ReaderReadV2",
-		Input: []tf.Input{
-			reader_handle, queue_handle,
-		},
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func TFRecordReaderV2Container(value string) TFRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
 }
 
-// Returns up to `num_records` (key, value) pairs produced by a Reader.
-//
-// Will dequeue from the input queue if necessary (e.g. when the
-// Reader needs to start reading from a new file since it has finished
-// with the previous file).
-// It may return less than `num_records` even before the last batch.
-//
-// Arguments:
-//	reader_handle: Handle to a `Reader`.
-//	queue_handle: Handle to a `Queue`, with string work items.
-//	num_records: number of records to read from `Reader`.
+// TFRecordReaderV2SharedName sets the optional shared_name attribute to value.
 //
-// Returns A 1-D tensor.A 1-D tensor.
-func ReaderReadUpToV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output, num_records tf.Output) (keys tf.Output, values tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func TFRecordReaderV2SharedName(value string) TFRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "ReaderReadUpToV2",
-		Input: []tf.Input{
-			reader_handle, queue_handle, num_records,
-		},
+}
+
+// TFRecordReaderV2CompressionType sets the optional compression_type attribute to value.
+// If not specified, defaults to ""
+func TFRecordReaderV2CompressionType(value string) TFRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["compression_type"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
 }
 
-// Restore a Reader to its initial clean state.
-//
-// Arguments:
-//	reader_handle: Handle to a Reader.
+// A Reader that outputs the records from a TensorFlow Records file.
 //
-// Returns the created operation.
-func ReaderResetV2(scope *Scope, reader_handle tf.Output) (o *tf.Operation) {
+// Returns The handle to reference the Reader.
+func TFRecordReaderV2(scope *Scope, optional ...TFRecordReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ReaderResetV2",
-		Input: []tf.Input{
-			reader_handle,
-		},
+		Type: "TFRecordReaderV2",
+
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// ResourceApplyAdamAttr is an optional argument to ResourceApplyAdam.
-type ResourceApplyAdamAttr func(optionalAttr)
+// QuantizeAndDequantizeV3Attr is an optional argument to QuantizeAndDequantizeV3.
+type QuantizeAndDequantizeV3Attr func(optionalAttr)
 
-// ResourceApplyAdamUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var, m, and v tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyAdamUseLocking(value bool) ResourceApplyAdamAttr {
+// QuantizeAndDequantizeV3SignedInput sets the optional signed_input attribute to value.
+// If not specified, defaults to true
+func QuantizeAndDequantizeV3SignedInput(value bool) QuantizeAndDequantizeV3Attr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["signed_input"] = value
 	}
 }
 
-// ResourceApplyAdamUseNesterov sets the optional use_nesterov attribute to value.
+// QuantizeAndDequantizeV3RangeGiven sets the optional range_given attribute to value.
+// If not specified, defaults to true
+func QuantizeAndDequantizeV3RangeGiven(value bool) QuantizeAndDequantizeV3Attr {
+	return func(m optionalAttr) {
+		m["range_given"] = value
+	}
+}
+
+// Quantizes then dequantizes a tensor.
+//
+// This is almost identical to QuantizeAndDequantizeV2, except that num_bits is a
+// tensor, so its value can change during training.
+func QuantizeAndDequantizeV3(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, num_bits tf.Output, optional ...QuantizeAndDequantizeV3Attr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QuantizeAndDequantizeV3",
+		Input: []tf.Input{
+			input, input_min, input_max, num_bits,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// IdentityReaderV2Attr is an optional argument to IdentityReaderV2.
+type IdentityReaderV2Attr func(optionalAttr)
+
+// IdentityReaderV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func IdentityReaderV2Container(value string) IdentityReaderV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// IdentityReaderV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func IdentityReaderV2SharedName(value string) IdentityReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A Reader that outputs the queued work as both the key and value.
+//
+// To use, enqueue strings in a Queue.  ReaderRead will take the front
+// work string and output (work, work).
+//
+// Returns The handle to reference the Reader.
+func IdentityReaderV2(scope *Scope, optional ...IdentityReaderV2Attr) (reader_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "IdentityReaderV2",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceApplyGradientDescentAttr is an optional argument to ResourceApplyGradientDescent.
+type ResourceApplyGradientDescentAttr func(optionalAttr)
+
+// ResourceApplyGradientDescentUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, the subtraction will be protected by a lock;
+// otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyGradientDescentUseLocking(value bool) ResourceApplyGradientDescentAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' by subtracting 'alpha' * 'delta' from it.
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	alpha: Scaling factor. Must be a scalar.
+//	delta: The change.
+//
+// Returns the created operation.
+func ResourceApplyGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, delta tf.Output, optional ...ResourceApplyGradientDescentAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyGradientDescent",
+		Input: []tf.Input{
+			var_, alpha, delta,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Returns the next record (key, value pair) produced by a Reader.
+//
+// Will dequeue from the input queue if necessary (e.g. when the
+// Reader needs to start reading from a new file since it has finished
+// with the previous file).
+//
+// Arguments:
+//	reader_handle: Handle to a Reader.
+//	queue_handle: Handle to a Queue, with string work items.
+//
+// Returns A scalar.A scalar.
+func ReaderReadV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output) (key tf.Output, value tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReaderReadV2",
+		Input: []tf.Input{
+			reader_handle, queue_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Returns up to `num_records` (key, value) pairs produced by a Reader.
+//
+// Will dequeue from the input queue if necessary (e.g. when the
+// Reader needs to start reading from a new file since it has finished
+// with the previous file).
+// It may return less than `num_records` even before the last batch.
+//
+// Arguments:
+//	reader_handle: Handle to a `Reader`.
+//	queue_handle: Handle to a `Queue`, with string work items.
+//	num_records: number of records to read from `Reader`.
+//
+// Returns A 1-D tensor.A 1-D tensor.
+func ReaderReadUpToV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output, num_records tf.Output) (keys tf.Output, values tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReaderReadUpToV2",
+		Input: []tf.Input{
+			reader_handle, queue_handle, num_records,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Restore a Reader to its initial clean state.
+//
+// Arguments:
+//	reader_handle: Handle to a Reader.
+//
+// Returns the created operation.
+func ReaderResetV2(scope *Scope, reader_handle tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReaderResetV2",
+		Input: []tf.Input{
+			reader_handle,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// ResourceApplyAdamAttr is an optional argument to ResourceApplyAdam.
+type ResourceApplyAdamAttr func(optionalAttr)
+
+// ResourceApplyAdamUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var, m, and v tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyAdamUseLocking(value bool) ResourceApplyAdamAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// ResourceApplyAdamUseNesterov sets the optional use_nesterov attribute to value.
 //
 // value: If `True`, uses the nesterov update.
 // If not specified, defaults to false
@@ -18841,29 +20861,6 @@ func GetSessionHandleV2(scope *Scope, value tf.Output) (handle tf.Output) {
 	return op.Output(0)
 }
 
-// Returns the set of files matching one or more glob patterns.
-//
-// Note that this routine only supports wildcard characters in the
-// basename portion of the pattern, not in the directory portion.
-//
-// Arguments:
-//	pattern: Shell wildcard pattern(s). Scalar or vector of type string.
-//
-// Returns A vector of matching filenames.
-func MatchingFiles(scope *Scope, pattern tf.Output) (filenames tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "MatchingFiles",
-		Input: []tf.Input{
-			pattern,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // ResizeBicubicGradAttr is an optional argument to ResizeBicubicGrad.
 type ResizeBicubicGradAttr func(optionalAttr)
 
@@ -18994,138 +20991,23 @@ func ResizeNearestNeighborGrad(scope *Scope, grads tf.Output, size tf.Output, op
 	return op.Output(0)
 }
 
-// DecodeJpegAttr is an optional argument to DecodeJpeg.
-type DecodeJpegAttr func(optionalAttr)
+// ExtractJpegShapeAttr is an optional argument to ExtractJpegShape.
+type ExtractJpegShapeAttr func(optionalAttr)
 
-// DecodeJpegChannels sets the optional channels attribute to value.
+// ExtractJpegShapeOutputType sets the optional output_type attribute to value.
 //
-// value: Number of color channels for the decoded image.
-// If not specified, defaults to 0
-func DecodeJpegChannels(value int64) DecodeJpegAttr {
+// value: (Optional) The output type of the operation (int32 or int64).
+// Defaults to int32.
+// If not specified, defaults to DT_INT32
+func ExtractJpegShapeOutputType(value tf.DataType) ExtractJpegShapeAttr {
 	return func(m optionalAttr) {
-		m["channels"] = value
+		m["output_type"] = value
 	}
 }
 
-// DecodeJpegRatio sets the optional ratio attribute to value.
+// Extract the shape information of a JPEG-encoded image.
 //
-// value: Downscaling ratio.
-// If not specified, defaults to 1
-func DecodeJpegRatio(value int64) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["ratio"] = value
-	}
-}
-
-// DecodeJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
-//
-// value: If true use a slower but nicer upscaling of the
-// chroma planes (yuv420/422 only).
-// If not specified, defaults to true
-func DecodeJpegFancyUpscaling(value bool) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["fancy_upscaling"] = value
-	}
-}
-
-// DecodeJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
-//
-// value: If true try to recover an image from truncated input.
-// If not specified, defaults to false
-func DecodeJpegTryRecoverTruncated(value bool) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["try_recover_truncated"] = value
-	}
-}
-
-// DecodeJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
-//
-// value: The minimum required fraction of lines before a truncated
-// input is accepted.
-// If not specified, defaults to 1
-func DecodeJpegAcceptableFraction(value float32) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["acceptable_fraction"] = value
-	}
-}
-
-// DecodeJpegDctMethod sets the optional dct_method attribute to value.
-//
-// value: string specifying a hint about the algorithm used for
-// decompression.  Defaults to "" which maps to a system-specific
-// default.  Currently valid values are ["INTEGER_FAST",
-// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
-// jpeg library changes to a version that does not have that specific
-// option.)
-// If not specified, defaults to ""
-func DecodeJpegDctMethod(value string) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["dct_method"] = value
-	}
-}
-
-// Decode a JPEG-encoded image to a uint8 tensor.
-//
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
-//
-// Accepted values are:
-//
-// *   0: Use the number of channels in the JPEG-encoded image.
-// *   1: output a grayscale image.
-// *   3: output an RGB image.
-//
-// If needed, the JPEG-encoded image is transformed to match the requested number
-// of color channels.
-//
-// The attr `ratio` allows downscaling the image by an integer factor during
-// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
-// downscaling the image later.
-//
-//
-// This op also supports decoding PNGs and non-animated GIFs since the interface is
-// the same, though it is cleaner to use `tf.image.decode_image`.
-//
-// Arguments:
-//	contents: 0-D.  The JPEG-encoded image.
-//
-// Returns 3-D with shape `[height, width, channels]`..
-func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (image tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "DecodeJpeg",
-		Input: []tf.Input{
-			contents,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ExtractJpegShapeAttr is an optional argument to ExtractJpegShape.
-type ExtractJpegShapeAttr func(optionalAttr)
-
-// ExtractJpegShapeOutputType sets the optional output_type attribute to value.
-//
-// value: (Optional) The output type of the operation (int32 or int64).
-// Defaults to int32.
-// If not specified, defaults to DT_INT32
-func ExtractJpegShapeOutputType(value tf.DataType) ExtractJpegShapeAttr {
-	return func(m optionalAttr) {
-		m["output_type"] = value
-	}
-}
-
-// Extract the shape information of a JPEG-encoded image.
-//
-// This op only parses the image header, so it is much faster than DecodeJpeg.
+// This op only parses the image header, so it is much faster than DecodeJpeg.
 //
 // Arguments:
 //	contents: 0-D. The JPEG-encoded image.
@@ -19724,6 +21606,61 @@ func Iterator(scope *Scope, shared_name string, container string, output_types [
 	return op.Output(0)
 }
 
+// CropAndResizeGradImageAttr is an optional argument to CropAndResizeGradImage.
+type CropAndResizeGradImageAttr func(optionalAttr)
+
+// CropAndResizeGradImageMethod sets the optional method attribute to value.
+//
+// value: A string specifying the interpolation method. Only 'bilinear' is
+// supported for now.
+// If not specified, defaults to "bilinear"
+func CropAndResizeGradImageMethod(value string) CropAndResizeGradImageAttr {
+	return func(m optionalAttr) {
+		m["method"] = value
+	}
+}
+
+// Computes the gradient of the crop_and_resize op wrt the input image tensor.
+//
+// Arguments:
+//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
+//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
+// specifies the coordinates of a box in the `box_ind[i]` image and is specified
+// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
+// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
+// `[0, 1]` interval of normalized image height is mapped to
+// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
+// which case the sampled crop is an up-down flipped version of the original
+// image. The width dimension is treated similarly. Normalized coordinates
+// outside the `[0, 1]` range are allowed, in which case we use
+// `extrapolation_value` to extrapolate the input image values.
+//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
+// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
+//	image_size: A 1-D tensor with value `[batch, image_height, image_width, depth]`
+// containing the original image size. Both `image_height` and `image_width` need
+// to be positive.
+//
+//
+// Returns A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
+func CropAndResizeGradImage(scope *Scope, grads tf.Output, boxes tf.Output, box_ind tf.Output, image_size tf.Output, T tf.DataType, optional ...CropAndResizeGradImageAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"T": T}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "CropAndResizeGradImage",
+		Input: []tf.Input{
+			grads, boxes, box_ind, image_size,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // ShuffleDatasetAttr is an optional argument to ShuffleDataset.
 type ShuffleDatasetAttr func(optionalAttr)
 
@@ -20591,47 +22528,6 @@ func CacheDataset(scope *Scope, input_dataset tf.Output, filename tf.Output, out
 	return op.Output(0)
 }
 
-// PlaceholderAttr is an optional argument to Placeholder.
-type PlaceholderAttr func(optionalAttr)
-
-// PlaceholderShape sets the optional shape attribute to value.
-//
-// value: (Optional) The shape of the tensor. If the shape has 0 dimensions, the
-// shape is unconstrained.
-// If not specified, defaults to <unknown_rank:true >
-func PlaceholderShape(value tf.Shape) PlaceholderAttr {
-	return func(m optionalAttr) {
-		m["shape"] = value
-	}
-}
-
-// A placeholder op for a value that will be fed into the computation.
-//
-// N.B. This operation will fail with an error if it is executed. It is
-// intended as a way to represent a value that will always be fed, and to
-// provide attrs that enable the fed value to be checked at runtime.
-//
-// Arguments:
-//	dtype: The type of elements in the tensor.
-//
-// Returns A placeholder tensor that must be replaced using the feed mechanism.
-func Placeholder(scope *Scope, dtype tf.DataType, optional ...PlaceholderAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Placeholder",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Creates a dataset that executes a SQL query and emits rows of the result set.
 //
 // Arguments:
@@ -20681,111 +22577,40 @@ func FixedLengthRecordDataset(scope *Scope, filenames tf.Output, header_bytes tf
 	return op.Output(0)
 }
 
-// Slice a `SparseTensor` based on the `start` and `size`.
-//
-// For example, if the input is
-//
-//     input_tensor = shape = [2, 7]
-//     [    a   d e  ]
-//     [b c          ]
-//
-// Graphically the output tensors are:
+// Gradients for batch normalization.
 //
-//     sparse_slice([0, 0], [2, 4]) = shape = [2, 4]
-//     [    a  ]
-//     [b c    ]
+// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
 //
-//     sparse_slice([0, 4], [2, 3]) = shape = [2, 3]
-//     [ d e  ]
-//     [      ]
+// This op is deprecated. See `tf.nn.batch_normalization`.
 //
 // Arguments:
-//	indices: 2-D tensor represents the indices of the sparse tensor.
-//	values: 1-D tensor represents the values of the sparse tensor.
-//	shape: 1-D. tensor represents the shape of the sparse tensor.
-//	start: 1-D. tensor represents the start of the slice.
-//	size: 1-D. tensor represents the size of the slice.
-// output indices: A list of 1-D tensors represents the indices of the output
-// sparse tensors.
+//	t: A 4D input Tensor.
+//	m: A 1D mean Tensor with size matching the last dimension of t.
+// This is the first output from tf.nn.moments,
+// or a saved moving average thereof.
+//	v: A 1D variance Tensor with size matching the last dimension of t.
+// This is the second output from tf.nn.moments,
+// or a saved moving average thereof.
+//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
+// If "scale_after_normalization" is true, this Tensor will be multiplied
+// with the normalized Tensor.
+//	backprop: 4D backprop Tensor.
+//	variance_epsilon: A small float number to avoid dividing by 0.
+//	scale_after_normalization: A bool indicating whether the resulted tensor
+// needs to be multiplied with gamma.
 //
-// Returns A list of 1-D tensors represents the values of the output sparse
-// tensors.A list of 1-D tensors represents the shape of the output sparse
-// tensors.
-func SparseSlice(scope *Scope, indices tf.Output, values tf.Output, shape tf.Output, start tf.Output, size tf.Output) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+// Returns 4D backprop tensor for input.1D backprop tensor for mean.1D backprop tensor for variance.1D backprop tensor for beta.1D backprop tensor for gamma.
+func BatchNormWithGlobalNormalizationGrad(scope *Scope, t tf.Output, m tf.Output, v tf.Output, gamma tf.Output, backprop tf.Output, variance_epsilon float32, scale_after_normalization bool) (dx tf.Output, dm tf.Output, dv tf.Output, db tf.Output, dg tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
 	opspec := tf.OpSpec{
-		Type: "SparseSlice",
+		Type: "BatchNormWithGlobalNormalizationGrad",
 		Input: []tf.Input{
-			indices, values, shape, start, size,
+			t, m, v, gamma, backprop,
 		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Concatenates quantized tensors along one dimension.
-//
-// Arguments:
-//	concat_dim: 0-D.  The dimension along which to concatenate.  Must be in the
-// range [0, rank(values)).
-//	values: The `N` Tensors to concatenate. Their ranks and types must match,
-// and their sizes must match in all dimensions except `concat_dim`.
-//	input_mins: The minimum scalar values for each of the input tensors.
-//	input_maxes: The maximum scalar values for each of the input tensors.
-//
-// Returns A `Tensor` with the concatenation of values stacked along the
-// `concat_dim` dimension.  This tensor's shape matches that of `values` except
-// in `concat_dim` where it has the sum of the sizes.The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
-func QuantizedConcat(scope *Scope, concat_dim tf.Output, values []tf.Output, input_mins []tf.Output, input_maxes []tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "QuantizedConcat",
-		Input: []tf.Input{
-			concat_dim, tf.OutputList(values), tf.OutputList(input_mins), tf.OutputList(input_maxes),
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Gradients for batch normalization.
-//
-// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
-//
-// This op is deprecated. See `tf.nn.batch_normalization`.
-//
-// Arguments:
-//	t: A 4D input Tensor.
-//	m: A 1D mean Tensor with size matching the last dimension of t.
-// This is the first output from tf.nn.moments,
-// or a saved moving average thereof.
-//	v: A 1D variance Tensor with size matching the last dimension of t.
-// This is the second output from tf.nn.moments,
-// or a saved moving average thereof.
-//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
-// If "scale_after_normalization" is true, this Tensor will be multiplied
-// with the normalized Tensor.
-//	backprop: 4D backprop Tensor.
-//	variance_epsilon: A small float number to avoid dividing by 0.
-//	scale_after_normalization: A bool indicating whether the resulted tensor
-// needs to be multiplied with gamma.
-//
-// Returns 4D backprop tensor for input.1D backprop tensor for mean.1D backprop tensor for variance.1D backprop tensor for beta.1D backprop tensor for gamma.
-func BatchNormWithGlobalNormalizationGrad(scope *Scope, t tf.Output, m tf.Output, v tf.Output, gamma tf.Output, backprop tf.Output, variance_epsilon float32, scale_after_normalization bool) (dx tf.Output, dm tf.Output, dv tf.Output, db tf.Output, dg tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
-	opspec := tf.OpSpec{
-		Type: "BatchNormWithGlobalNormalizationGrad",
-		Input: []tf.Input{
-			t, m, v, gamma, backprop,
-		},
-		Attrs: attrs,
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
@@ -21048,48 +22873,6 @@ func IteratorToStringHandle(scope *Scope, resource_handle tf.Output) (string_han
 	return op.Output(0)
 }
 
-// ShapeNAttr is an optional argument to ShapeN.
-type ShapeNAttr func(optionalAttr)
-
-// ShapeNOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_INT32
-func ShapeNOutType(value tf.DataType) ShapeNAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Returns shape of tensors.
-//
-// This operation returns N 1-D integer tensors representing shape of `input[i]s`.
-func ShapeN(scope *Scope, input []tf.Output, optional ...ShapeNAttr) (output []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ShapeN",
-		Input: []tf.Input{
-			tf.OutputList(input),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("ShapeN", err)
-		return
-	}
-	return output
-}
-
 // IteratorFromStringHandleAttr is an optional argument to IteratorFromStringHandle.
 type IteratorFromStringHandleAttr func(optionalAttr)
 
@@ -21700,181 +23483,184 @@ func InvGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
-// StridedSliceAttr is an optional argument to StridedSlice.
-type StridedSliceAttr func(optionalAttr)
+// PriorityQueueV2Attr is an optional argument to PriorityQueueV2.
+type PriorityQueueV2Attr func(optionalAttr)
 
-// StridedSliceBeginMask sets the optional begin_mask attribute to value.
+// PriorityQueueV2ComponentTypes sets the optional component_types attribute to value.
 //
-// value: a bitmask where a bit i being 1 means to ignore the begin
-// value and instead use the largest interval possible. At runtime
-// begin[i] will be replaced with `[0, n-1) if `stride[i] > 0` or
-// `[-1, n-1]` if `stride[i] < 0`
-// If not specified, defaults to 0
-func StridedSliceBeginMask(value int64) StridedSliceAttr {
+// value: The type of each component in a value.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func PriorityQueueV2ComponentTypes(value []tf.DataType) PriorityQueueV2Attr {
 	return func(m optionalAttr) {
-		m["begin_mask"] = value
+		m["component_types"] = value
 	}
 }
 
-// StridedSliceEndMask sets the optional end_mask attribute to value.
+// PriorityQueueV2Capacity sets the optional capacity attribute to value.
 //
-// value: analogous to `begin_mask`
-// If not specified, defaults to 0
-func StridedSliceEndMask(value int64) StridedSliceAttr {
+// value: The upper bound on the number of elements in this queue.
+// Negative numbers mean no limit.
+// If not specified, defaults to -1
+func PriorityQueueV2Capacity(value int64) PriorityQueueV2Attr {
 	return func(m optionalAttr) {
-		m["end_mask"] = value
+		m["capacity"] = value
 	}
 }
 
-// StridedSliceEllipsisMask sets the optional ellipsis_mask attribute to value.
+// PriorityQueueV2Container sets the optional container attribute to value.
 //
-// value: a bitmask where bit `i` being 1 means the `i`th
-// position is actually an ellipsis. One bit at most can be 1.
-// If `ellipsis_mask == 0`, then an implicit ellipsis mask of `1 << (m+1)`
-// is provided. This means that `foo[3:5] == foo[3:5, ...]`. An ellipsis
-// implicitly creates as many range specifications as necessary to fully
-// specify the sliced range for every dimension. For example for a 4-dimensional
-// tensor `foo` the slice `foo[2, ..., 5:8]` implies `foo[2, :, :, 5:8]`.
-// If not specified, defaults to 0
-func StridedSliceEllipsisMask(value int64) StridedSliceAttr {
+// value: If non-empty, this queue is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func PriorityQueueV2Container(value string) PriorityQueueV2Attr {
 	return func(m optionalAttr) {
-		m["ellipsis_mask"] = value
+		m["container"] = value
 	}
 }
 
-// StridedSliceNewAxisMask sets the optional new_axis_mask attribute to value.
+// PriorityQueueV2SharedName sets the optional shared_name attribute to value.
 //
-// value: a bitmask where bit `i` being 1 means the `i`th
-// specification creates a new shape 1 dimension. For example
-// `foo[:4, tf.newaxis, :2]` would produce a shape `(4, 1, 2)` tensor.
-// If not specified, defaults to 0
-func StridedSliceNewAxisMask(value int64) StridedSliceAttr {
+// value: If non-empty, this queue will be shared under the given name
+// across multiple sessions.
+// If not specified, defaults to ""
+func PriorityQueueV2SharedName(value string) PriorityQueueV2Attr {
 	return func(m optionalAttr) {
-		m["new_axis_mask"] = value
+		m["shared_name"] = value
 	}
 }
 
-// StridedSliceShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
+// A queue that produces elements sorted by the first component value.
 //
-// value: a bitmask where bit `i` implies that the `i`th
-// specification should shrink the dimensionality. begin and end
-// must imply a slice of size 1 in the dimension. For example in
-// python one might do `foo[:, 3, :]` which would result in
-// `shrink_axis_mask` being 2.
+// Note that the PriorityQueue requires the first component of any element
+// to be a scalar int64, in addition to the other elements declared by
+// component_types.  Therefore calls to Enqueue and EnqueueMany (resp. Dequeue
+// and DequeueMany) on a PriorityQueue will all require (resp. output) one extra
+// entry in their input (resp. output) lists.
+//
+// Arguments:
+//	shapes: The shape of each component in a value. The length of this attr must
+// be either 0 or the same as the length of component_types. If the length of
+// this attr is 0, the shapes of queue elements are not constrained, and
+// only one element may be dequeued at a time.
+//
+// Returns The handle to the queue.
+func PriorityQueueV2(scope *Scope, shapes []tf.Shape, optional ...PriorityQueueV2Attr) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"shapes": shapes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "PriorityQueueV2",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// UnstageAttr is an optional argument to Unstage.
+type UnstageAttr func(optionalAttr)
+
+// UnstageCapacity sets the optional capacity attribute to value.
 // If not specified, defaults to 0
-func StridedSliceShrinkAxisMask(value int64) StridedSliceAttr {
+//
+// REQUIRES: value >= 0
+func UnstageCapacity(value int64) UnstageAttr {
 	return func(m optionalAttr) {
-		m["shrink_axis_mask"] = value
+		m["capacity"] = value
 	}
 }
 
-// Return a strided slice from `input`.
-//
-// Note, most python users will want to use the Python `Tensor.__getitem__`
-// or `Variable.__getitem__` rather than this op directly.
-//
-// The goal of this op is to produce a new tensor with a subset of
-// the elements from the `n` dimensional `input` tensor. The subset is chosen using
-// a sequence of `m` sparse range specifications encoded into the arguments
-// of this function. Note, in some cases
-// `m` could be equal to `n`, but this need not be the case. Each
-// range specification entry can be one of the following:
+// UnstageMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// - An ellipsis (...). Ellipses are used to imply zero or more
-//   dimensions of full-dimension selection and are produced using
-//   `ellipsis_mask`. For example, `foo[...]` is the identity slice.
+// REQUIRES: value >= 0
+func UnstageMemoryLimit(value int64) UnstageAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// UnstageContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func UnstageContainer(value string) UnstageAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// UnstageSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func UnstageSharedName(value string) UnstageAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op is similar to a lightweight Dequeue.
 //
-// - A new axis. This is used to insert a new shape=1 dimension and is
-//   produced using `new_axis_mask`. For example, `foo[:, ...]` where
-//   `foo` is shape `(3, 4)` produces a `(1, 3, 4)` tensor.
-//
-//
-// - A range `begin:end:stride`. This is used to specify how much to choose from
-//   a given dimension. `stride` can be any integer but 0.  `begin` is an integer
-//   which represents the index of the first value to select while `end` represents
-//   the index of the last value to select. The number of values selected in each
-//   dimension is `end - begin` if `stride > 0` and `begin - end` if `stride < 0`.
-//   `begin` and `end` can be negative where `-1` is the last element, `-2` is
-//   the second to last. `begin_mask` controls whether to replace the explicitly
-//   given `begin` with an implicit effective value of `0` if `stride > 0` and
-//   `-1` if `stride < 0`. `end_mask` is analogous but produces the number
-//   required to create the largest open interval. For example, given a shape
-//   `(3,)` tensor `foo[:]`, the effective `begin` and `end` are `0` and `3`. Do
-//   not assume this is equivalent to `foo[0:-1]` which has an effective `begin`
-//   and `end` of `0` and `2`. Another example is `foo[-2::-1]` which reverses the
-//   first dimension of a tensor while dropping the last two (in the original
-//   order elements). For example `foo = [1,2,3,4]; foo[-2::-1]` is `[4,3]`.
-//
-// - A single index. This is used to keep only elements that have a given
-//   index. For example (`foo[2, :]` on a shape `(5,6)` tensor produces a
-//   shape `(6,)` tensor. This is encoded in `begin` and `end` and
-//   `shrink_axis_mask`.
-//
-// Each conceptual range specification is encoded in the op's argument. This
-// encoding is best understand by considering a non-trivial example. In
-// particular,
-// `foo[1, 2:4, None, ..., :-3:-1, :]` will be encoded as
-//
-// ```
-// begin = [1, 2, x, x, 0, x] # x denotes don't care (usually 0)
-// end = [2, 4, x, x, -3, x]
-// strides = [1, 1, x, x, -1, 1]
-// begin_mask = 1<<4 | 1 << 5 = 48
-// end_mask = 1<<5 = 32
-// ellipsis_mask = 1<<3 = 8
-// new_axis_mask = 1<<2 4
-// shrink_axis_mask = 1<<0
-// ```
-//
-// In this case if `foo.shape` is (5, 5, 5, 5, 5, 5) the final shape of
-// the slice becomes (2, 1, 5, 5, 2, 5).
-// Let us walk step by step through each argument specification.
-//
-// 1.  The first argument in the example slice is turned into `begin = 1` and
-// `end = begin + 1 = 2`. To disambiguate from the original spec `2:4` we
-// also set the appropriate bit in `shrink_axis_mask`.
-//
-// 2. `2:4` is contributes 2, 4, 1 to begin, end, and stride. All masks have
-// zero bits contributed.
-//
-// 3. None is a synonym for `tf.newaxis`. This means insert a dimension of size 1
-// dimension in the final shape. Dummy values are contributed to begin,
-// end and stride, while the new_axis_mask bit is set.
-//
-// 4. `...` grab the full ranges from as many dimensions as needed to
-// fully specify a slice for every dimension of the input shape.
+// The basic functionality is similar to dequeue with many fewer
+// capabilities and options.  This Op is optimized for performance.
+func Unstage(scope *Scope, dtypes []tf.DataType, optional ...UnstageAttr) (values []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Unstage",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("Unstage", err)
+		return
+	}
+	return values
+}
+
+// QueueEnqueueV2Attr is an optional argument to QueueEnqueueV2.
+type QueueEnqueueV2Attr func(optionalAttr)
+
+// QueueEnqueueV2TimeoutMs sets the optional timeout_ms attribute to value.
 //
-// 5. `:-3:-1` shows the use of negative indices. A negative index `i` associated
-// with a dimension that has shape `s` is converted to a positive index
-// `s + i`. So `-1` becomes `s-1` (i.e. the last element). This conversion
-// is done internally so begin, end and strides receive x, -3, and -1.
-// The appropriate begin_mask bit is set to indicate the start range is the
-// full range (ignoring the x).
+// value: If the queue is full, this operation will block for up to
+// timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueEnqueueV2TimeoutMs(value int64) QueueEnqueueV2Attr {
+	return func(m optionalAttr) {
+		m["timeout_ms"] = value
+	}
+}
+
+// Enqueues a tuple of one or more tensors in the given queue.
 //
-// 6. `:` indicates that the entire contents of the corresponding dimension
-// is selected. This is equivalent to `::` or `0::1`. begin, end, and strides
-// receive 0, 0, and 1, respectively. The appropriate bits in `begin_mask` and
-// `end_mask` are also set.
+// The components input has k elements, which correspond to the components of
+// tuples stored in the given queue.
 //
-// *Requirements*:
-//   `0 != strides[i] for i in [0, m)`
-//   `ellipsis_mask must be a power of two (only one ellipsis)`
+// N.B. If the queue is full, this operation will block until the given
+// element has been enqueued (or 'timeout_ms' elapses, if specified).
 //
 // Arguments:
+//	handle: The handle to a queue.
+//	components: One or more tensors from which the enqueued tensors should be taken.
 //
-//	begin: `begin[k]` specifies the offset into the `k`th range specification.
-// The exact dimension this corresponds to will be determined by context.
-// Out-of-bounds values will be silently clamped. If the `k`th bit of
-// `begin_mask` then `begin[k]` is ignored and the full range of the
-// appropriate dimension is used instead. Negative values causes indexing
-// to start from the highest element e.g. If `foo==[1,2,3]` then `foo[-1]==3`.
-//	end: `end[i]` is like `begin` with the exception that `end_mask` is
-// used to determine full ranges.
-//	strides: `strides[i]` specifies the increment in the `i`th specification
-// after extracting a given element. Negative indices will reverse
-// the original order. Out or range values are
-// clamped to `[0,dim[i]) if slice[i]>0` or `[-1,dim[i]-1] if slice[i] < 0`
-func StridedSlice(scope *Scope, input tf.Output, begin tf.Output, end tf.Output, strides tf.Output, optional ...StridedSliceAttr) (output tf.Output) {
+// Returns the created operation.
+func QueueEnqueueV2(scope *Scope, handle tf.Output, components []tf.Output, optional ...QueueEnqueueV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -21883,186 +23669,107 @@ func StridedSlice(scope *Scope, input tf.Output, begin tf.Output, end tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StridedSlice",
+		Type: "QueueEnqueueV2",
 		Input: []tf.Input{
-			input, begin, end, strides,
+			handle, tf.OutputList(components),
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// PriorityQueueV2Attr is an optional argument to PriorityQueueV2.
-type PriorityQueueV2Attr func(optionalAttr)
-
-// PriorityQueueV2ComponentTypes sets the optional component_types attribute to value.
-//
-// value: The type of each component in a value.
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func PriorityQueueV2ComponentTypes(value []tf.DataType) PriorityQueueV2Attr {
-	return func(m optionalAttr) {
-		m["component_types"] = value
-	}
-}
+// QueueDequeueManyV2Attr is an optional argument to QueueDequeueManyV2.
+type QueueDequeueManyV2Attr func(optionalAttr)
 
-// PriorityQueueV2Capacity sets the optional capacity attribute to value.
+// QueueDequeueManyV2TimeoutMs sets the optional timeout_ms attribute to value.
 //
-// value: The upper bound on the number of elements in this queue.
-// Negative numbers mean no limit.
+// value: If the queue has fewer than n elements, this operation
+// will block for up to timeout_ms milliseconds.
+// Note: This option is not supported yet.
 // If not specified, defaults to -1
-func PriorityQueueV2Capacity(value int64) PriorityQueueV2Attr {
+func QueueDequeueManyV2TimeoutMs(value int64) QueueDequeueManyV2Attr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["timeout_ms"] = value
 	}
 }
 
-// PriorityQueueV2Container sets the optional container attribute to value.
+// Dequeues `n` tuples of one or more tensors from the given queue.
 //
-// value: If non-empty, this queue is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func PriorityQueueV2Container(value string) PriorityQueueV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// PriorityQueueV2SharedName sets the optional shared_name attribute to value.
+// If the queue is closed and there are fewer than `n` elements, then an
+// OutOfRange error is returned.
 //
-// value: If non-empty, this queue will be shared under the given name
-// across multiple sessions.
-// If not specified, defaults to ""
-func PriorityQueueV2SharedName(value string) PriorityQueueV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// A queue that produces elements sorted by the first component value.
+// This operation concatenates queue-element component tensors along the
+// 0th dimension to make a single component tensor.  All of the components
+// in the dequeued tuple will have size `n` in the 0th dimension.
 //
-// Note that the PriorityQueue requires the first component of any element
-// to be a scalar int64, in addition to the other elements declared by
-// component_types.  Therefore calls to Enqueue and EnqueueMany (resp. Dequeue
-// and DequeueMany) on a PriorityQueue will all require (resp. output) one extra
-// entry in their input (resp. output) lists.
+// This operation has `k` outputs, where `k` is the number of components in
+// the tuples stored in the given queue, and output `i` is the ith
+// component of the dequeued tuple.
+//
+// N.B. If the queue is empty, this operation will block until `n` elements
+// have been dequeued (or 'timeout_ms' elapses, if specified).
 //
 // Arguments:
-//	shapes: The shape of each component in a value. The length of this attr must
-// be either 0 or the same as the length of component_types. If the length of
-// this attr is 0, the shapes of queue elements are not constrained, and
-// only one element may be dequeued at a time.
+//	handle: The handle to a queue.
+//	n: The number of tuples to dequeue.
+//	component_types: The type of each component in a tuple.
 //
-// Returns The handle to the queue.
-func PriorityQueueV2(scope *Scope, shapes []tf.Shape, optional ...PriorityQueueV2Attr) (handle tf.Output) {
+// Returns One or more tensors that were dequeued as a tuple.
+func QueueDequeueManyV2(scope *Scope, handle tf.Output, n tf.Output, component_types []tf.DataType, optional ...QueueDequeueManyV2Attr) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"shapes": shapes}
+	attrs := map[string]interface{}{"component_types": component_types}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "PriorityQueueV2",
-
+		Type: "QueueDequeueManyV2",
+		Input: []tf.Input{
+			handle, n,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("QueueDequeueManyV2", err)
+		return
+	}
+	return components
 }
 
-// UnstageAttr is an optional argument to Unstage.
-type UnstageAttr func(optionalAttr)
+// EncodeBase64Attr is an optional argument to EncodeBase64.
+type EncodeBase64Attr func(optionalAttr)
 
-// UnstageCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// EncodeBase64Pad sets the optional pad attribute to value.
 //
-// REQUIRES: value >= 0
-func UnstageCapacity(value int64) UnstageAttr {
+// value: Bool whether padding is applied at the ends.
+// If not specified, defaults to false
+func EncodeBase64Pad(value bool) EncodeBase64Attr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["pad"] = value
 	}
 }
 
-// UnstageMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// Encode strings into web-safe base64 format.
 //
-// REQUIRES: value >= 0
-func UnstageMemoryLimit(value int64) UnstageAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// UnstageContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func UnstageContainer(value string) UnstageAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// UnstageSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func UnstageSharedName(value string) UnstageAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op is similar to a lightweight Dequeue.
-//
-// The basic functionality is similar to dequeue with many fewer
-// capabilities and options.  This Op is optimized for performance.
-func Unstage(scope *Scope, dtypes []tf.DataType, optional ...UnstageAttr) (values []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Unstage",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("Unstage", err)
-		return
-	}
-	return values
-}
-
-// ArgMaxAttr is an optional argument to ArgMax.
-type ArgMaxAttr func(optionalAttr)
-
-// ArgMaxOutputType sets the optional output_type attribute to value.
-// If not specified, defaults to DT_INT64
-func ArgMaxOutputType(value tf.DataType) ArgMaxAttr {
-	return func(m optionalAttr) {
-		m["output_type"] = value
-	}
-}
-
-// Returns the index with the largest value across dimensions of a tensor.
+// Refer to the following article for more information on base64 format:
+// en.wikipedia.org/wiki/Base64. Base64 strings may have padding with '=' at the
+// end so that the encoded has length multiple of 4. See Padding section of the
+// link above.
 //
-// Note that in case of ties the identity of the return value is not guaranteed.
+// Web-safe means that the encoder uses - and _ instead of + and /.
 //
 // Arguments:
+//	input: Strings to be encoded.
 //
-//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
-// Describes which dimension of the input Tensor to reduce across. For vectors,
-// use dimension = 0.
-func ArgMax(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMaxAttr) (output tf.Output) {
+// Returns Input strings encoded in base64.
+func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -22071,9 +23778,9 @@ func ArgMax(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgM
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ArgMax",
+		Type: "EncodeBase64",
 		Input: []tf.Input{
-			input, dimension,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -22081,106 +23788,77 @@ func ArgMax(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgM
 	return op.Output(0)
 }
 
-// ResourceStridedSliceAssignAttr is an optional argument to ResourceStridedSliceAssign.
-type ResourceStridedSliceAssignAttr func(optionalAttr)
-
-// ResourceStridedSliceAssignBeginMask sets the optional begin_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignBeginMask(value int64) ResourceStridedSliceAssignAttr {
-	return func(m optionalAttr) {
-		m["begin_mask"] = value
-	}
-}
-
-// ResourceStridedSliceAssignEndMask sets the optional end_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignEndMask(value int64) ResourceStridedSliceAssignAttr {
-	return func(m optionalAttr) {
-		m["end_mask"] = value
-	}
-}
-
-// ResourceStridedSliceAssignEllipsisMask sets the optional ellipsis_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignEllipsisMask(value int64) ResourceStridedSliceAssignAttr {
-	return func(m optionalAttr) {
-		m["ellipsis_mask"] = value
-	}
-}
-
-// ResourceStridedSliceAssignNewAxisMask sets the optional new_axis_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignNewAxisMask(value int64) ResourceStridedSliceAssignAttr {
-	return func(m optionalAttr) {
-		m["new_axis_mask"] = value
+// Deprecated. Use TensorArrayCloseV3
+//
+// DEPRECATED at GraphDef version 26: Use TensorArrayCloseV3
+//
+// Returns the created operation.
+func TensorArrayCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// ResourceStridedSliceAssignShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignShrinkAxisMask(value int64) ResourceStridedSliceAssignAttr {
-	return func(m optionalAttr) {
-		m["shrink_axis_mask"] = value
+	opspec := tf.OpSpec{
+		Type: "TensorArrayCloseV2",
+		Input: []tf.Input{
+			handle,
+		},
 	}
+	return scope.AddOperation(opspec)
 }
 
-// Assign `value` to the sliced l-value reference of `ref`.
+// Forwards the value of an available tensor from `inputs` to `output`.
 //
-// The values of `value` are assigned to the positions in the variable
-// `ref` that are selected by the slice parameters. The slice parameters
-// `begin, `end`, `strides`, etc. work exactly as in `StridedSlice`.
+// `Merge` waits for at least one of the tensors in `inputs` to become available.
+// It is usually combined with `Switch` to implement branching.
 //
-// NOTE this op currently does not support broadcasting and so `value`'s
-// shape must be exactly the shape produced by the slice of `ref`.
+// `Merge` forwards the first tensor to become available to `output`, and sets
+// `value_index` to its index in `inputs`.
 //
-// Returns the created operation.
-func ResourceStridedSliceAssign(scope *Scope, ref tf.Output, begin tf.Output, end tf.Output, strides tf.Output, value tf.Output, optional ...ResourceStridedSliceAssignAttr) (o *tf.Operation) {
+// Arguments:
+//	inputs: The input tensors, exactly one of which will become available.
+//
+// Returns Will be set to the available input tensor.The index of the chosen input tensor in `inputs`.
+func Merge(scope *Scope, inputs []tf.Output) (output tf.Output, value_index tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceStridedSliceAssign",
+		Type: "Merge",
 		Input: []tf.Input{
-			ref, begin, end, strides, value,
+			tf.OutputList(inputs),
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
 }
 
-// QueueEnqueueV2Attr is an optional argument to QueueEnqueueV2.
-type QueueEnqueueV2Attr func(optionalAttr)
+// QueueCloseV2Attr is an optional argument to QueueCloseV2.
+type QueueCloseV2Attr func(optionalAttr)
 
-// QueueEnqueueV2TimeoutMs sets the optional timeout_ms attribute to value.
+// QueueCloseV2CancelPendingEnqueues sets the optional cancel_pending_enqueues attribute to value.
 //
-// value: If the queue is full, this operation will block for up to
-// timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueEnqueueV2TimeoutMs(value int64) QueueEnqueueV2Attr {
+// value: If true, all pending enqueue requests that are
+// blocked on the given queue will be canceled.
+// If not specified, defaults to false
+func QueueCloseV2CancelPendingEnqueues(value bool) QueueCloseV2Attr {
 	return func(m optionalAttr) {
-		m["timeout_ms"] = value
+		m["cancel_pending_enqueues"] = value
 	}
 }
 
-// Enqueues a tuple of one or more tensors in the given queue.
-//
-// The components input has k elements, which correspond to the components of
-// tuples stored in the given queue.
+// Closes the given queue.
 //
-// N.B. If the queue is full, this operation will block until the given
-// element has been enqueued (or 'timeout_ms' elapses, if specified).
+// This operation signals that no more elements will be enqueued in the
+// given queue. Subsequent Enqueue(Many) operations will fail.
+// Subsequent Dequeue(Many) operations will continue to succeed if
+// sufficient elements remain in the queue. Subsequent Dequeue(Many)
+// operations that would block will fail immediately.
 //
 // Arguments:
 //	handle: The handle to a queue.
-//	components: One or more tensors from which the enqueued tensors should be taken.
 //
 // Returns the created operation.
-func QueueEnqueueV2(scope *Scope, handle tf.Output, components []tf.Output, optional ...QueueEnqueueV2Attr) (o *tf.Operation) {
+func QueueCloseV2(scope *Scope, handle tf.Output, optional ...QueueCloseV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -22189,191 +23867,154 @@ func QueueEnqueueV2(scope *Scope, handle tf.Output, components []tf.Output, opti
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueEnqueueV2",
+		Type: "QueueCloseV2",
 		Input: []tf.Input{
-			handle, tf.OutputList(components),
+			handle,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// QueueDequeueManyV2Attr is an optional argument to QueueDequeueManyV2.
-type QueueDequeueManyV2Attr func(optionalAttr)
-
-// QueueDequeueManyV2TimeoutMs sets the optional timeout_ms attribute to value.
-//
-// value: If the queue has fewer than n elements, this operation
-// will block for up to timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueDequeueManyV2TimeoutMs(value int64) QueueDequeueManyV2Attr {
-	return func(m optionalAttr) {
-		m["timeout_ms"] = value
+// Computes inverse hyperbolic tangent of x element-wise.
+func Atanh(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Atanh",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Dequeues `n` tuples of one or more tensors from the given queue.
-//
-// If the queue is closed and there are fewer than `n` elements, then an
-// OutOfRange error is returned.
-//
-// This operation concatenates queue-element component tensors along the
-// 0th dimension to make a single component tensor.  All of the components
-// in the dequeued tuple will have size `n` in the 0th dimension.
-//
-// This operation has `k` outputs, where `k` is the number of components in
-// the tuples stored in the given queue, and output `i` is the ith
-// component of the dequeued tuple.
+// Returns true if queue is closed.
 //
-// N.B. If the queue is empty, this operation will block until `n` elements
-// have been dequeued (or 'timeout_ms' elapses, if specified).
+// This operation returns true if the queue is closed and false if the queue
+// is open.
 //
 // Arguments:
 //	handle: The handle to a queue.
-//	n: The number of tuples to dequeue.
-//	component_types: The type of each component in a tuple.
-//
-// Returns One or more tensors that were dequeued as a tuple.
-func QueueDequeueManyV2(scope *Scope, handle tf.Output, n tf.Output, component_types []tf.DataType, optional ...QueueDequeueManyV2Attr) (components []tf.Output) {
+func QueueIsClosedV2(scope *Scope, handle tf.Output) (is_closed tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QueueDequeueManyV2",
+		Type: "QueueIsClosedV2",
 		Input: []tf.Input{
-			handle, n,
+			handle,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("QueueDequeueManyV2", err)
-		return
-	}
-	return components
+	return op.Output(0)
 }
 
-// EncodeBase64Attr is an optional argument to EncodeBase64.
-type EncodeBase64Attr func(optionalAttr)
-
-// EncodeBase64Pad sets the optional pad attribute to value.
+// Returns the batched diagonal part of a batched tensor.
 //
-// value: Bool whether padding is applied at the ends.
-// If not specified, defaults to false
-func EncodeBase64Pad(value bool) EncodeBase64Attr {
-	return func(m optionalAttr) {
-		m["pad"] = value
-	}
-}
-
-// Encode strings into web-safe base64 format.
+// This operation returns a tensor with the `diagonal` part
+// of the batched `input`. The `diagonal` part is computed as follows:
 //
-// Refer to the following article for more information on base64 format:
-// en.wikipedia.org/wiki/Base64. Base64 strings may have padding with '=' at the
-// end so that the encoded has length multiple of 4. See Padding section of the
-// link above.
+// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
+// tensor of rank `k - 1` with dimensions `[I, J, K, ..., min(M, N)]` where:
 //
-// Web-safe means that the encoder uses - and _ instead of + and /.
+// `diagonal[i, j, k, ..., n] = input[i, j, k, ..., n, n]`.
+//
+// The input must be at least a matrix.
+//
+// For example:
+//
+// ```
+// # 'input' is [[[1, 0, 0, 0]
+//                [0, 2, 0, 0]
+//                [0, 0, 3, 0]
+//                [0, 0, 0, 4]],
+//               [[5, 0, 0, 0]
+//                [0, 6, 0, 0]
+//                [0, 0, 7, 0]
+//                [0, 0, 0, 8]]]
+//
+// and input.shape = (2, 4, 4)
+//
+// tf.matrix_diag_part(input) ==> [[1, 2, 3, 4], [5, 6, 7, 8]]
+//
+// which has shape (2, 4)
+// ```
 //
 // Arguments:
-//	input: Strings to be encoded.
+//	input: Rank `k` tensor where `k >= 2`.
 //
-// Returns Input strings encoded in base64.
-func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (output tf.Output) {
+// Returns The extracted diagonal(s) having shape
+// `diagonal.shape = input.shape[:-2] + [min(input.shape[-2:])]`.
+func MatrixDiagPart(scope *Scope, input tf.Output) (diagonal tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "EncodeBase64",
+		Type: "MatrixDiagPart",
 		Input: []tf.Input{
 			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Deprecated. Use TensorArrayCloseV3
-//
-// DEPRECATED at GraphDef version 26: Use TensorArrayCloseV3
+// Computes the absolute value of a tensor.
 //
-// Returns the created operation.
-func TensorArrayCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
+// Given a tensor `x`, this operation returns a tensor containing the absolute
+// value of each element in `x`. For example, if x is an input element and y is
+// an output element, this operation computes \\(y = |x|\\).
+func Abs(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayCloseV2",
+		Type: "Abs",
 		Input: []tf.Input{
-			handle,
+			x,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// CropAndResizeGradImageAttr is an optional argument to CropAndResizeGradImage.
-type CropAndResizeGradImageAttr func(optionalAttr)
+// StackV2Attr is an optional argument to StackV2.
+type StackV2Attr func(optionalAttr)
 
-// CropAndResizeGradImageMethod sets the optional method attribute to value.
+// StackV2StackName sets the optional stack_name attribute to value.
 //
-// value: A string specifying the interpolation method. Only 'bilinear' is
-// supported for now.
-// If not specified, defaults to "bilinear"
-func CropAndResizeGradImageMethod(value string) CropAndResizeGradImageAttr {
+// value: Overrides the name used for the temporary stack resource. Default
+// value is the name of the 'Stack' op (which is guaranteed unique).
+// If not specified, defaults to ""
+func StackV2StackName(value string) StackV2Attr {
 	return func(m optionalAttr) {
-		m["method"] = value
+		m["stack_name"] = value
 	}
 }
 
-// Computes the gradient of the crop_and_resize op wrt the input image tensor.
+// A stack that produces elements in first-in last-out order.
 //
 // Arguments:
-//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
-//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
-// specifies the coordinates of a box in the `box_ind[i]` image and is specified
-// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
-// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
-// `[0, 1]` interval of normalized image height is mapped to
-// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
-// which case the sampled crop is an up-down flipped version of the original
-// image. The width dimension is treated similarly. Normalized coordinates
-// outside the `[0, 1]` range are allowed, in which case we use
-// `extrapolation_value` to extrapolate the input image values.
-//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
-// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
-//	image_size: A 1-D tensor with value `[batch, image_height, image_width, depth]`
-// containing the original image size. Both `image_height` and `image_width` need
-// to be positive.
-//
+//	max_size: The maximum size of the stack if non-negative. If negative, the stack
+// size is unlimited.
+//	elem_type: The type of the elements on the stack.
 //
-// Returns A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
-func CropAndResizeGradImage(scope *Scope, grads tf.Output, boxes tf.Output, box_ind tf.Output, image_size tf.Output, T tf.DataType, optional ...CropAndResizeGradImageAttr) (output tf.Output) {
+// Returns The handle to the stack.
+func StackV2(scope *Scope, max_size tf.Output, elem_type tf.DataType, optional ...StackV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"T": T}
+	attrs := map[string]interface{}{"elem_type": elem_type}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CropAndResizeGradImage",
+		Type: "StackV2",
 		Input: []tf.Input{
-			grads, boxes, box_ind, image_size,
+			max_size,
 		},
 		Attrs: attrs,
 	}
@@ -22381,355 +24022,77 @@ func CropAndResizeGradImage(scope *Scope, grads tf.Output, boxes tf.Output, box_
 	return op.Output(0)
 }
 
-// Reads and outputs the entire contents of the input filename.
-func ReadFile(scope *Scope, filename tf.Output) (contents tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ReadFile",
-		Input: []tf.Input{
-			filename,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
+// OrderedMapStageAttr is an optional argument to OrderedMapStage.
+type OrderedMapStageAttr func(optionalAttr)
 
-// Concatenates tensors along one dimension.
+// OrderedMapStageCapacity sets the optional capacity attribute to value.
 //
-// Arguments:
-//	values: List of `N` Tensors to concatenate. Their ranks and types must match,
-// and their sizes must match in all dimensions except `concat_dim`.
-//	axis: 0-D.  The dimension along which to concatenate.  Must be in the
-// range [-rank(values), rank(values)).
+// value: Maximum number of elements in the Staging Area. If > 0, inserts
+// on the container will block when the capacity is reached.
+// If not specified, defaults to 0
 //
-// Returns A `Tensor` with the concatenation of values stacked along the
-// `concat_dim` dimension.  This tensor's shape matches that of `values` except
-// in `concat_dim` where it has the sum of the sizes.
-func ConcatV2(scope *Scope, values []tf.Output, axis tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ConcatV2",
-		Input: []tf.Input{
-			tf.OutputList(values), axis,
-		},
+// REQUIRES: value >= 0
+func OrderedMapStageCapacity(value int64) OrderedMapStageAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Forwards the value of an available tensor from `inputs` to `output`.
-//
-// `Merge` waits for at least one of the tensors in `inputs` to become available.
-// It is usually combined with `Switch` to implement branching.
-//
-// `Merge` forwards the first tensor to become available to `output`, and sets
-// `value_index` to its index in `inputs`.
-//
-// Arguments:
-//	inputs: The input tensors, exactly one of which will become available.
+// OrderedMapStageMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// Returns Will be set to the available input tensor.The index of the chosen input tensor in `inputs`.
-func Merge(scope *Scope, inputs []tf.Output) (output tf.Output, value_index tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Merge",
-		Input: []tf.Input{
-			tf.OutputList(inputs),
-		},
+// REQUIRES: value >= 0
+func OrderedMapStageMemoryLimit(value int64) OrderedMapStageAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
 }
 
-// QueueCloseV2Attr is an optional argument to QueueCloseV2.
-type QueueCloseV2Attr func(optionalAttr)
+// OrderedMapStageContainer sets the optional container attribute to value.
+//
+// value: If non-empty, this queue is placed in the given container. Otherwise,
+// a default container is used.
+// If not specified, defaults to ""
+func OrderedMapStageContainer(value string) OrderedMapStageAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
 
-// QueueCloseV2CancelPendingEnqueues sets the optional cancel_pending_enqueues attribute to value.
+// OrderedMapStageSharedName sets the optional shared_name attribute to value.
 //
-// value: If true, all pending enqueue requests that are
-// blocked on the given queue will be canceled.
-// If not specified, defaults to false
-func QueueCloseV2CancelPendingEnqueues(value bool) QueueCloseV2Attr {
+// value: It is necessary to match this name to the matching Unstage Op.
+// If not specified, defaults to ""
+func OrderedMapStageSharedName(value string) OrderedMapStageAttr {
 	return func(m optionalAttr) {
-		m["cancel_pending_enqueues"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Closes the given queue.
+// Stage (key, values) in the underlying container which behaves like a ordered
 //
-// This operation signals that no more elements will be enqueued in the
-// given queue. Subsequent Enqueue(Many) operations will fail.
-// Subsequent Dequeue(Many) operations will continue to succeed if
-// sufficient elements remain in the queue. Subsequent Dequeue(Many)
-// operations that would block will fail immediately.
+// associative container.   Elements are ordered by key.
 //
 // Arguments:
-//	handle: The handle to a queue.
+//	key: int64
+//
+//	values: a list of tensors
+// dtypes A list of data types that inserted values should adhere to.
+//
 //
 // Returns the created operation.
-func QueueCloseV2(scope *Scope, handle tf.Output, optional ...QueueCloseV2Attr) (o *tf.Operation) {
+func OrderedMapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output, dtypes []tf.DataType, optional ...OrderedMapStageAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueCloseV2",
+		Type: "OrderedMapStage",
 		Input: []tf.Input{
-			handle,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Computes inverse hyperbolic tangent of x element-wise.
-func Atanh(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Atanh",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns true if queue is closed.
-//
-// This operation returns true if the queue is closed and false if the queue
-// is open.
-//
-// Arguments:
-//	handle: The handle to a queue.
-func QueueIsClosedV2(scope *Scope, handle tf.Output) (is_closed tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "QueueIsClosedV2",
-		Input: []tf.Input{
-			handle,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns the batched diagonal part of a batched tensor.
-//
-// This operation returns a tensor with the `diagonal` part
-// of the batched `input`. The `diagonal` part is computed as follows:
-//
-// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
-// tensor of rank `k - 1` with dimensions `[I, J, K, ..., min(M, N)]` where:
-//
-// `diagonal[i, j, k, ..., n] = input[i, j, k, ..., n, n]`.
-//
-// The input must be at least a matrix.
-//
-// For example:
-//
-// ```
-// # 'input' is [[[1, 0, 0, 0]
-//                [0, 2, 0, 0]
-//                [0, 0, 3, 0]
-//                [0, 0, 0, 4]],
-//               [[5, 0, 0, 0]
-//                [0, 6, 0, 0]
-//                [0, 0, 7, 0]
-//                [0, 0, 0, 8]]]
-//
-// and input.shape = (2, 4, 4)
-//
-// tf.matrix_diag_part(input) ==> [[1, 2, 3, 4], [5, 6, 7, 8]]
-//
-// which has shape (2, 4)
-// ```
-//
-// Arguments:
-//	input: Rank `k` tensor where `k >= 2`.
-//
-// Returns The extracted diagonal(s) having shape
-// `diagonal.shape = input.shape[:-2] + [min(input.shape[-2:])]`.
-func MatrixDiagPart(scope *Scope, input tf.Output) (diagonal tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "MatrixDiagPart",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the absolute value of a tensor.
-//
-// Given a tensor `x`, this operation returns a tensor containing the absolute
-// value of each element in `x`. For example, if x is an input element and y is
-// an output element, this operation computes \\(y = |x|\\).
-func Abs(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Abs",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Flushes and closes the summary writer.
-//
-// Also removes it from the resource manager. To reopen, use another
-// CreateSummaryFileWriter op.
-//
-// Arguments:
-//	writer: A handle to the summary writer resource.
-//
-// Returns the created operation.
-func CloseSummaryWriter(scope *Scope, writer tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "CloseSummaryWriter",
-		Input: []tf.Input{
-			writer,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// StackV2Attr is an optional argument to StackV2.
-type StackV2Attr func(optionalAttr)
-
-// StackV2StackName sets the optional stack_name attribute to value.
-//
-// value: Overrides the name used for the temporary stack resource. Default
-// value is the name of the 'Stack' op (which is guaranteed unique).
-// If not specified, defaults to ""
-func StackV2StackName(value string) StackV2Attr {
-	return func(m optionalAttr) {
-		m["stack_name"] = value
-	}
-}
-
-// A stack that produces elements in first-in last-out order.
-//
-// Arguments:
-//	max_size: The maximum size of the stack if non-negative. If negative, the stack
-// size is unlimited.
-//	elem_type: The type of the elements on the stack.
-//
-// Returns The handle to the stack.
-func StackV2(scope *Scope, max_size tf.Output, elem_type tf.DataType, optional ...StackV2Attr) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"elem_type": elem_type}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StackV2",
-		Input: []tf.Input{
-			max_size,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// OrderedMapStageAttr is an optional argument to OrderedMapStage.
-type OrderedMapStageAttr func(optionalAttr)
-
-// OrderedMapStageCapacity sets the optional capacity attribute to value.
-//
-// value: Maximum number of elements in the Staging Area. If > 0, inserts
-// on the container will block when the capacity is reached.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapStageCapacity(value int64) OrderedMapStageAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// OrderedMapStageMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapStageMemoryLimit(value int64) OrderedMapStageAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// OrderedMapStageContainer sets the optional container attribute to value.
-//
-// value: If non-empty, this queue is placed in the given container. Otherwise,
-// a default container is used.
-// If not specified, defaults to ""
-func OrderedMapStageContainer(value string) OrderedMapStageAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// OrderedMapStageSharedName sets the optional shared_name attribute to value.
-//
-// value: It is necessary to match this name to the matching Unstage Op.
-// If not specified, defaults to ""
-func OrderedMapStageSharedName(value string) OrderedMapStageAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Stage (key, values) in the underlying container which behaves like a ordered
-//
-// associative container.   Elements are ordered by key.
-//
-// Arguments:
-//	key: int64
-//
-//	values: a list of tensors
-// dtypes A list of data types that inserted values should adhere to.
-//
-//
-// Returns the created operation.
-func OrderedMapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output, dtypes []tf.DataType, optional ...OrderedMapStageAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "OrderedMapStage",
-		Input: []tf.Input{
-			key, indices, tf.OutputList(values),
+			key, indices, tf.OutputList(values),
 		},
 		Attrs: attrs,
 	}
@@ -22850,15 +24213,62 @@ func FusedBatchNormGradV2(scope *Scope, y_backprop tf.Output, x tf.Output, scale
 	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// Creates a TensorArray for storing the gradients of values in the given handle.
+// DecodeCompressedAttr is an optional argument to DecodeCompressed.
+type DecodeCompressedAttr func(optionalAttr)
+
+// DecodeCompressedCompressionType sets the optional compression_type attribute to value.
 //
-// If the given TensorArray gradient already exists, returns a reference to it.
+// value: A scalar containing either (i) the empty string (no
+// compression), (ii) "ZLIB", or (iii) "GZIP".
+// If not specified, defaults to ""
+func DecodeCompressedCompressionType(value string) DecodeCompressedAttr {
+	return func(m optionalAttr) {
+		m["compression_type"] = value
+	}
+}
+
+// Decompress strings.
 //
-// Locks the size of the original TensorArray by disabling its dynamic size flag.
+// This op decompresses each element of the `bytes` input `Tensor`, which
+// is assumed to be compressed using the given `compression_type`.
 //
-// **A note about the input flow_in:**
+// The `output` is a string `Tensor` of the same shape as `bytes`,
+// each element containing the decompressed data from the corresponding
+// element in `bytes`.
 //
-// The handle flow_in forces the execution of the gradient lookup to occur
+// Arguments:
+//	bytes: A Tensor of string which is compressed.
+//
+// Returns A Tensor with the same shape as input `bytes`, uncompressed
+// from bytes.
+func DecodeCompressed(scope *Scope, bytes tf.Output, optional ...DecodeCompressedAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DecodeCompressed",
+		Input: []tf.Input{
+			bytes,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a TensorArray for storing the gradients of values in the given handle.
+//
+// If the given TensorArray gradient already exists, returns a reference to it.
+//
+// Locks the size of the original TensorArray by disabling its dynamic size flag.
+//
+// **A note about the input flow_in:**
+//
+// The handle flow_in forces the execution of the gradient lookup to occur
 // only after certain other operations have occurred.  For example, when
 // the forward TensorArray is dynamically sized, writes to this TensorArray
 // may resize the object.  The gradient TensorArray is statically sized based
@@ -24114,194 +25524,6 @@ func StagePeek(scope *Scope, index tf.Output, dtypes []tf.DataType, optional ...
 	return values
 }
 
-// Conv3DBackpropInputV2Attr is an optional argument to Conv3DBackpropInputV2.
-type Conv3DBackpropInputV2Attr func(optionalAttr)
-
-// Conv3DBackpropInputV2DataFormat sets the optional data_format attribute to value.
-//
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Conv3DBackpropInputV2Dilations sets the optional dilations attribute to value.
-//
-// value: 1-D tensor of length 5.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
-func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes the gradients of 3-D convolution with respect to the input.
-//
-// Arguments:
-//	input_sizes: An integer vector representing the tensor shape of `input`,
-// where `input` is a 5-D
-// `[batch, depth, rows, cols, in_channels]` tensor.
-//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
-// `in_channels` must match between `input` and `filter`.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3DBackpropInputV2(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropInputV2Attr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropInputV2",
-		Input: []tf.Input{
-			input_sizes, filter, out_backprop,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// DepthToSpaceAttr is an optional argument to DepthToSpace.
-type DepthToSpaceAttr func(optionalAttr)
-
-// DepthToSpaceDataFormat sets the optional data_format attribute to value.
-// If not specified, defaults to "NHWC"
-func DepthToSpaceDataFormat(value string) DepthToSpaceAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// DepthToSpace for tensors of type T.
-//
-// Rearranges data from depth into blocks of spatial data.
-// This is the reverse transformation of SpaceToDepth. More specifically,
-// this op outputs a copy of the input tensor where values from the `depth`
-// dimension are moved in spatial blocks to the `height` and `width` dimensions.
-// The attr `block_size` indicates the input block size and how the data is moved.
-//
-//   * Chunks of data of size `block_size * block_size` from depth are rearranged
-//     into non-overlapping blocks of size `block_size x block_size`
-//   * The width the output tensor is `input_depth * block_size`, whereas the
-//     height is `input_height * block_size`.
-//   * The Y, X coordinates within each block of the output image are determined
-//     by the high order component of the input channel index.
-//   * The depth of the input tensor must be divisible by
-//     `block_size * block_size`.
-//
-// The `data_format` attr specifies the layout of the input and output tensors
-// with the following options:
-//   "NHWC": `[ batch, height, width, channels ]`
-//   "NCHW": `[ batch, channels, height, width ]`
-//   "NCHW_VECT_C":
-//       `qint8 [ batch, channels / 4, height, width, 4 ]`
-//
-// It is useful to consider the operation as transforming a 6-D Tensor.
-// e.g. for data_format = NHWC,
-//      Each element in the input tensor can be specified via 6 coordinates,
-//      ordered by decreasing memory layout significance as:
-//      n,iY,iX,bY,bX,oC  (where n=batch index, iX, iY means X or Y coordinates
-//                         within the input image, bX, bY means coordinates
-//                         within the output block, oC means output channels).
-//      The output would be the input transposed to the following layout:
-//      n,iY,bY,iX,bX,oC
-//
-// This operation is useful for resizing the activations between convolutions
-// (but keeping all data), e.g. instead of pooling. It is also useful for training
-// purely convolutional models.
-//
-// For example, given an input of shape `[1, 1, 1, 4]`, data_format = "NHWC" and
-// block_size = 2:
-//
-// ```
-// x = [[[[1, 2, 3, 4]]]]
-//
-// ```
-//
-// This operation will output a tensor of shape `[1, 2, 2, 1]`:
-//
-// ```
-//    [[[[1], [2]],
-//      [[3], [4]]]]
-// ```
-//
-// Here, the input has a batch of 1 and each batch element has shape `[1, 1, 4]`,
-// the corresponding output will have 2x2 elements and will have a depth of
-// 1 channel (1 = `4 / (block_size * block_size)`).
-// The output element shape is `[2, 2, 1]`.
-//
-// For an input tensor with larger depth, here of shape `[1, 1, 1, 12]`, e.g.
-//
-// ```
-// x = [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]
-// ```
-//
-// This operation, for block size of 2, will return the following tensor of shape
-// `[1, 2, 2, 3]`
-//
-// ```
-//    [[[[1, 2, 3], [4, 5, 6]],
-//      [[7, 8, 9], [10, 11, 12]]]]
-//
-// ```
-//
-// Similarly, for the following input of shape `[1 2 2 4]`, and a block size of 2:
-//
-// ```
-// x =  [[[[1, 2, 3, 4],
-//        [5, 6, 7, 8]],
-//       [[9, 10, 11, 12],
-//        [13, 14, 15, 16]]]]
-// ```
-//
-// the operator will return the following tensor of shape `[1 4 4 1]`:
-//
-// ```
-// x = [[[ [1],   [2],  [5],  [6]],
-//       [ [3],   [4],  [7],  [8]],
-//       [ [9],  [10], [13],  [14]],
-//       [ [11], [12], [15],  [16]]]]
-//
-// ```
-//
-// Arguments:
-//
-//	block_size: The size of the spatial block, same as in Space2Depth.
-func DepthToSpace(scope *Scope, input tf.Output, block_size int64, optional ...DepthToSpaceAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"block_size": block_size}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "DepthToSpace",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // MapStageAttr is an optional argument to MapStage.
 type MapStageAttr func(optionalAttr)
 
@@ -24690,37 +25912,152 @@ func OrderedMapSize(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapSi
 	return op.Output(0)
 }
 
-// CTCLossAttr is an optional argument to CTCLoss.
-type CTCLossAttr func(optionalAttr)
+// ShapeNAttr is an optional argument to ShapeN.
+type ShapeNAttr func(optionalAttr)
 
-// CTCLossPreprocessCollapseRepeated sets the optional preprocess_collapse_repeated attribute to value.
-//
-// value: Scalar, if true then repeated labels are
-// collapsed prior to the CTC calculation.
-// If not specified, defaults to false
-func CTCLossPreprocessCollapseRepeated(value bool) CTCLossAttr {
+// ShapeNOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_INT32
+func ShapeNOutType(value tf.DataType) ShapeNAttr {
 	return func(m optionalAttr) {
-		m["preprocess_collapse_repeated"] = value
+		m["out_type"] = value
 	}
 }
 
-// CTCLossCtcMergeRepeated sets the optional ctc_merge_repeated attribute to value.
+// Returns shape of tensors.
 //
-// value: Scalar.  If set to false, *during* CTC calculation
-// repeated non-blank labels will not be merged and are interpreted as
-// individual labels.  This is a simplified version of CTC.
-// If not specified, defaults to true
-func CTCLossCtcMergeRepeated(value bool) CTCLossAttr {
-	return func(m optionalAttr) {
-		m["ctc_merge_repeated"] = value
+// This operation returns N 1-D integer tensors representing shape of `input[i]s`.
+func ShapeN(scope *Scope, input []tf.Output, optional ...ShapeNAttr) (output []tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// CTCLossIgnoreLongerOutputsThanInputs sets the optional ignore_longer_outputs_than_inputs attribute to value.
-//
-// value: Scalar. If set to true, during CTC
-// calculation, items that have longer output sequences than input sequences
-// are skipped: they don't contribute to the loss term and have zero-gradient.
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ShapeN",
+		Input: []tf.Input{
+			tf.OutputList(input),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("ShapeN", err)
+		return
+	}
+	return output
+}
+
+// UniformCandidateSamplerAttr is an optional argument to UniformCandidateSampler.
+type UniformCandidateSamplerAttr func(optionalAttr)
+
+// UniformCandidateSamplerSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func UniformCandidateSamplerSeed(value int64) UniformCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// UniformCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func UniformCandidateSamplerSeed2(value int64) UniformCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Generates labels for candidate sampling with a uniform distribution.
+//
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
+//
+// For each batch, this op picks a single set of sampled candidate labels.
+//
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
+//
+// Arguments:
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to randomly sample.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//	range_max: The sampler will sample integers from the interval [0, range_max).
+//
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func UniformCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...UniformCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "UniformCandidateSampler",
+		Input: []tf.Input{
+			true_classes,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// CTCLossAttr is an optional argument to CTCLoss.
+type CTCLossAttr func(optionalAttr)
+
+// CTCLossPreprocessCollapseRepeated sets the optional preprocess_collapse_repeated attribute to value.
+//
+// value: Scalar, if true then repeated labels are
+// collapsed prior to the CTC calculation.
+// If not specified, defaults to false
+func CTCLossPreprocessCollapseRepeated(value bool) CTCLossAttr {
+	return func(m optionalAttr) {
+		m["preprocess_collapse_repeated"] = value
+	}
+}
+
+// CTCLossCtcMergeRepeated sets the optional ctc_merge_repeated attribute to value.
+//
+// value: Scalar.  If set to false, *during* CTC calculation
+// repeated non-blank labels will not be merged and are interpreted as
+// individual labels.  This is a simplified version of CTC.
+// If not specified, defaults to true
+func CTCLossCtcMergeRepeated(value bool) CTCLossAttr {
+	return func(m optionalAttr) {
+		m["ctc_merge_repeated"] = value
+	}
+}
+
+// CTCLossIgnoreLongerOutputsThanInputs sets the optional ignore_longer_outputs_than_inputs attribute to value.
+//
+// value: Scalar. If set to true, during CTC
+// calculation, items that have longer output sequences than input sequences
+// are skipped: they don't contribute to the loss term and have zero-gradient.
 // If not specified, defaults to false
 func CTCLossIgnoreLongerOutputsThanInputs(value bool) CTCLossAttr {
 	return func(m optionalAttr) {
@@ -24972,336 +26309,49 @@ func Snapshot(scope *Scope, input tf.Output) (output tf.Output) {
 	return op.Output(0)
 }
 
-// Scatter `updates` into a new (initially zero) tensor according to `indices`.
-//
-// Creates a new tensor by applying sparse `updates` to individual
-// values or slices within a zero tensor of the given `shape` according to
-// indices.  This operator is the inverse of the @{tf.gather_nd} operator which
-// extracts values or slices from a given tensor.
-//
-// **WARNING**: The order in which updates are applied is nondeterministic, so the
-// output will be nondeterministic if `indices` contains duplicates.
-//
-// `indices` is an integer tensor containing indices into a new tensor of shape
-// `shape`.  The last dimension of `indices` can be at most the rank of `shape`:
-//
-//     indices.shape[-1] <= shape.rank
-//
-// The last dimension of `indices` corresponds to indices into elements
-// (if `indices.shape[-1] = shape.rank`) or slices
-// (if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of
-// `shape`.  `updates` is a tensor with shape
-//
-//     indices.shape[:-1] + shape[indices.shape[-1]:]
-//
-// The simplest form of scatter is to insert individual elements in a tensor by
-// index. For example, say we want to insert 4 scattered elements in a rank-1
-// tensor with 8 elements.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/ScatterNd1.png" alt>
-// </div>
-//
-// In Python, this scatter operation would look like this:
-//
-// ```python
-//     indices = tf.constant([[4], [3], [1], [7]])
-//     updates = tf.constant([9, 10, 11, 12])
-//     shape = tf.constant([8])
-//     scatter = tf.scatter_nd(indices, updates, shape)
-//     with tf.Session() as sess:
-//       print(sess.run(scatter))
-// ```
-//
-// The resulting tensor would look like this:
-//
-//     [0, 11, 0, 10, 9, 0, 0, 12]
-//
-// We can also, insert entire slices of a higher rank tensor all at once. For
-// example, if we wanted to insert two slices in the first dimension of a
-// rank-3 tensor with two matrices of new values.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/ScatterNd2.png" alt>
-// </div>
-//
-// In Python, this scatter operation would look like this:
-//
-// ```python
-//     indices = tf.constant([[0], [2]])
-//     updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6],
-//                             [7, 7, 7, 7], [8, 8, 8, 8]],
-//                            [[5, 5, 5, 5], [6, 6, 6, 6],
-//                             [7, 7, 7, 7], [8, 8, 8, 8]]])
-//     shape = tf.constant([4, 4, 4])
-//     scatter = tf.scatter_nd(indices, updates, shape)
-//     with tf.Session() as sess:
-//       print(sess.run(scatter))
-// ```
+// AbortAttr is an optional argument to Abort.
+type AbortAttr func(optionalAttr)
+
+// AbortErrorMsg sets the optional error_msg attribute to value.
 //
-// The resulting tensor would look like this:
+// value: A string which is the message associated with the exception.
+// If not specified, defaults to ""
+func AbortErrorMsg(value string) AbortAttr {
+	return func(m optionalAttr) {
+		m["error_msg"] = value
+	}
+}
+
+// AbortExitWithoutError sets the optional exit_without_error attribute to value.
+// If not specified, defaults to false
+func AbortExitWithoutError(value bool) AbortAttr {
+	return func(m optionalAttr) {
+		m["exit_without_error"] = value
+	}
+}
+
+// Raise a exception to abort the process when called.
 //
-//     [[[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
-//      [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
-//      [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
-//      [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]
+// If exit_without_error is true, the process will exit normally,
+// otherwise it will exit with a SIGABORT signal.
 //
-// Arguments:
-//	indices: Index tensor.
-//	updates: Updates to scatter into output.
-//	shape: 1-D. The shape of the resulting tensor.
+// Returns nothing but an exception.
 //
-// Returns A new tensor with the given shape and updates applied according
-// to the indices.
-func ScatterNd(scope *Scope, indices tf.Output, updates tf.Output, shape tf.Output) (output tf.Output) {
+// Returns the created operation.
+func Abort(scope *Scope, optional ...AbortAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "ScatterNd",
-		Input: []tf.Input{
-			indices, updates, shape,
-		},
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// SpaceToDepthAttr is an optional argument to SpaceToDepth.
-type SpaceToDepthAttr func(optionalAttr)
+	opspec := tf.OpSpec{
+		Type: "Abort",
 
-// SpaceToDepthDataFormat sets the optional data_format attribute to value.
-// If not specified, defaults to "NHWC"
-func SpaceToDepthDataFormat(value string) SpaceToDepthAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
+		Attrs: attrs,
 	}
-}
-
-// SpaceToDepth for tensors of type T.
-//
-// Rearranges blocks of spatial data, into depth. More specifically,
-// this op outputs a copy of the input tensor where values from the `height`
-// and `width` dimensions are moved to the `depth` dimension.
-// The attr `block_size` indicates the input block size.
-//
-//   * Non-overlapping blocks of size `block_size x block size` are rearranged
-//     into depth at each location.
-//   * The depth of the output tensor is `block_size * block_size * input_depth`.
-//   * The Y, X coordinates within each block of the input become the high order
-//     component of the output channel index.
-//   * The input tensor's height and width must be divisible by block_size.
-//
-// The `data_format` attr specifies the layout of the input and output tensors
-// with the following options:
-//   "NHWC": `[ batch, height, width, channels ]`
-//   "NCHW": `[ batch, channels, height, width ]`
-//   "NCHW_VECT_C":
-//       `qint8 [ batch, channels / 4, height, width, 4 ]`
-//
-// It is useful to consider the operation as transforming a 6-D Tensor.
-// e.g. for data_format = NHWC,
-//      Each element in the input tensor can be specified via 6 coordinates,
-//      ordered by decreasing memory layout significance as:
-//      n,oY,bY,oX,bX,iC  (where n=batch index, oX, oY means X or Y coordinates
-//                         within the output image, bX, bY means coordinates
-//                         within the input block, iC means input channels).
-//      The output would be a transpose to the following layout:
-//      n,oY,oX,bY,bX,iC
-//
-// This operation is useful for resizing the activations between convolutions
-// (but keeping all data), e.g. instead of pooling. It is also useful for training
-// purely convolutional models.
-//
-// For example, given an input of shape `[1, 2, 2, 1]`, data_format = "NHWC" and
-// block_size = 2:
-//
-// ```
-// x = [[[[1], [2]],
-//       [[3], [4]]]]
-// ```
-//
-// This operation will output a tensor of shape `[1, 1, 1, 4]`:
-//
-// ```
-// [[[[1, 2, 3, 4]]]]
-// ```
-//
-// Here, the input has a batch of 1 and each batch element has shape `[2, 2, 1]`,
-// the corresponding output will have a single element (i.e. width and height are
-// both 1) and will have a depth of 4 channels (1 * block_size * block_size).
-// The output element shape is `[1, 1, 4]`.
-//
-// For an input tensor with larger depth, here of shape `[1, 2, 2, 3]`, e.g.
-//
-// ```
-// x = [[[[1, 2, 3], [4, 5, 6]],
-//       [[7, 8, 9], [10, 11, 12]]]]
-// ```
-//
-// This operation, for block_size of 2, will return the following tensor of shape
-// `[1, 1, 1, 12]`
-//
-// ```
-// [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]
-// ```
-//
-// Similarly, for the following input of shape `[1 4 4 1]`, and a block size of 2:
-//
-// ```
-// x = [[[[1],   [2],  [5],  [6]],
-//       [[3],   [4],  [7],  [8]],
-//       [[9],  [10], [13],  [14]],
-//       [[11], [12], [15],  [16]]]]
-// ```
-//
-// the operator will return the following tensor of shape `[1 2 2 4]`:
-//
-// ```
-// x = [[[[1, 2, 3, 4],
-//        [5, 6, 7, 8]],
-//       [[9, 10, 11, 12],
-//        [13, 14, 15, 16]]]]
-// ```
-//
-// Arguments:
-//
-//	block_size: The size of the spatial block.
-func SpaceToDepth(scope *Scope, input tf.Output, block_size int64, optional ...SpaceToDepthAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"block_size": block_size}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "SpaceToDepth",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// AbortAttr is an optional argument to Abort.
-type AbortAttr func(optionalAttr)
-
-// AbortErrorMsg sets the optional error_msg attribute to value.
-//
-// value: A string which is the message associated with the exception.
-// If not specified, defaults to ""
-func AbortErrorMsg(value string) AbortAttr {
-	return func(m optionalAttr) {
-		m["error_msg"] = value
-	}
-}
-
-// AbortExitWithoutError sets the optional exit_without_error attribute to value.
-// If not specified, defaults to false
-func AbortExitWithoutError(value bool) AbortAttr {
-	return func(m optionalAttr) {
-		m["exit_without_error"] = value
-	}
-}
-
-// Raise a exception to abort the process when called.
-//
-// If exit_without_error is true, the process will exit normally,
-// otherwise it will exit with a SIGABORT signal.
-//
-// Returns nothing but an exception.
-//
-// Returns the created operation.
-func Abort(scope *Scope, optional ...AbortAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Abort",
-
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// UniformCandidateSamplerAttr is an optional argument to UniformCandidateSampler.
-type UniformCandidateSamplerAttr func(optionalAttr)
-
-// UniformCandidateSamplerSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func UniformCandidateSamplerSeed(value int64) UniformCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// UniformCandidateSamplerSeed2 sets the optional seed2 attribute to value.
-//
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func UniformCandidateSamplerSeed2(value int64) UniformCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Generates labels for candidate sampling with a uniform distribution.
-//
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
-//
-// For each batch, this op picks a single set of sampled candidate labels.
-//
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
-//
-// Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//	range_max: The sampler will sample integers from the interval [0, range_max).
-//
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func UniformCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...UniformCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "UniformCandidateSampler",
-		Input: []tf.Input{
-			true_classes,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return scope.AddOperation(opspec)
 }
 
 // FixedUnigramCandidateSamplerAttr is an optional argument to FixedUnigramCandidateSampler.
@@ -25463,31 +26513,149 @@ func FixedUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Elementwise computes the bitwise AND of `x` and `y`.
+// WholeFileReaderV2Attr is an optional argument to WholeFileReaderV2.
+type WholeFileReaderV2Attr func(optionalAttr)
+
+// WholeFileReaderV2Container sets the optional container attribute to value.
 //
-// The result will have those bits set, that are set in both `x` and `y`. The
-// computation is performed on the underlying representations of `x` and `y`.
-func BitwiseAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BitwiseAnd",
-		Input: []tf.Input{
-			x, y,
-		},
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func WholeFileReaderV2Container(value string) WholeFileReaderV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Elementwise computes the bitwise left-shift of `x` and `y`.
+// WholeFileReaderV2SharedName sets the optional shared_name attribute to value.
 //
-// If `y` is negative, or greater than or equal to the width of `x` in bits the
-// result is implementation defined.
-func LeftShift(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func WholeFileReaderV2SharedName(value string) WholeFileReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A Reader that outputs the entire contents of a file as a value.
+//
+// To use, enqueue filenames in a Queue.  The output of ReaderRead will
+// be a filename (key) and the contents of that file (value).
+//
+// Returns The handle to reference the Reader.
+func WholeFileReaderV2(scope *Scope, optional ...WholeFileReaderV2Attr) (reader_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "WholeFileReaderV2",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Transforms a tf.Example proto (as a string) into typed tensors.
+//
+// Arguments:
+//	serialized: A vector containing a batch of binary serialized Example protos.
+//	dense_defaults: A list of Tensors (some may be empty), whose length matches
+// the length of `dense_keys`. dense_defaults[j] provides default values
+// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
+// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
+// The input type is inferred from dense_defaults[j], even when it's empty.
+// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
+// then the shape of dense_defaults[j] must match that of dense_shapes[j].
+// If dense_shapes[j] has an undefined major dimension (variable strides dense
+// feature), dense_defaults[j] must contain a single element:
+// the padding element.
+//	num_sparse: The number of sparse features to be parsed from the example. This
+// must match the lengths of `sparse_keys` and `sparse_types`.
+//	sparse_keys: A list of `num_sparse` strings.
+// The keys expected in the Examples' features associated with sparse values.
+//	dense_keys: The keys expected in the Examples' features associated with dense
+// values.
+//	sparse_types: A list of `num_sparse` types; the data types of data in each
+// Feature given in sparse_keys.
+// Currently the ParseSingleExample op supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+//	dense_shapes: The shapes of data in each Feature given in dense_keys.
+// The length of this list must match the length of `dense_keys`.  The
+// number of elements in the Feature corresponding to dense_key[j] must
+// always equal dense_shapes[j].NumEntries().  If dense_shapes[j] ==
+// (D0, D1, ..., DN) then the shape of output Tensor dense_values[j]
+// will be (D0, D1, ..., DN): In the case dense_shapes[j] = (-1, D1,
+// ..., DN), the shape of the output Tensor dense_values[j] will be (M,
+// D1, .., DN), where M is the number of blocks of elements of length
+// D1 * .... * DN, in the input.
+func ParseSingleExample(scope *Scope, serialized tf.Output, dense_defaults []tf.Output, num_sparse int64, sparse_keys []string, dense_keys []string, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_sparse": num_sparse, "sparse_keys": sparse_keys, "dense_keys": dense_keys, "sparse_types": sparse_types, "dense_shapes": dense_shapes}
+	opspec := tf.OpSpec{
+		Type: "ParseSingleExample",
+		Input: []tf.Input{
+			serialized, tf.OutputList(dense_defaults),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
+		scope.UpdateErr("ParseSingleExample", err)
+		return
+	}
+	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
+		scope.UpdateErr("ParseSingleExample", err)
+		return
+	}
+	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseSingleExample", err)
+		return
+	}
+	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
+		scope.UpdateErr("ParseSingleExample", err)
+		return
+	}
+	return sparse_indices, sparse_values, sparse_shapes, dense_values
+}
+
+// Elementwise computes the bitwise AND of `x` and `y`.
+//
+// The result will have those bits set, that are set in both `x` and `y`. The
+// computation is performed on the underlying representations of `x` and `y`.
+func BitwiseAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BitwiseAnd",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Elementwise computes the bitwise left-shift of `x` and `y`.
+//
+// If `y` is negative, or greater than or equal to the width of `x` in bits the
+// result is implementation defined.
+func LeftShift(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
 	opspec := tf.OpSpec{
 		Type: "LeftShift",
@@ -25917,31 +27085,6 @@ func ParallelConcat(scope *Scope, values []tf.Output, shape tf.Shape) (output tf
 	return op.Output(0)
 }
 
-// Concatenates tensors along one dimension.
-//
-// Arguments:
-//	concat_dim: 0-D.  The dimension along which to concatenate.  Must be in the
-// range [0, rank(values)).
-//	values: The `N` Tensors to concatenate. Their ranks and types must match,
-// and their sizes must match in all dimensions except `concat_dim`.
-//
-// Returns A `Tensor` with the concatenation of values stacked along the
-// `concat_dim` dimension.  This tensor's shape matches that of `values` except
-// in `concat_dim` where it has the sum of the sizes.
-func Concat(scope *Scope, concat_dim tf.Output, values []tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Concat",
-		Input: []tf.Input{
-			concat_dim, tf.OutputList(values),
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Compute the lower regularized incomplete Gamma function `Q(a, x)`.
 //
 // The lower regularized incomplete Gamma function is defined as:
@@ -26131,129 +27274,122 @@ func ZerosLike(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Flips all bits elementwise.
+// QuantizedInstanceNormAttr is an optional argument to QuantizedInstanceNorm.
+type QuantizedInstanceNormAttr func(optionalAttr)
+
+// QuantizedInstanceNormOutputRangeGiven sets the optional output_range_given attribute to value.
 //
-// The result will have exactly those bits set, that are not set in `x`. The
-// computation is performed on the underlying representation of x.
-func Invert(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: If True, `given_y_min` and `given_y_min`
+// and `given_y_max` are used as the output range. Otherwise,
+// the implementation computes the output range.
+// If not specified, defaults to false
+func QuantizedInstanceNormOutputRangeGiven(value bool) QuantizedInstanceNormAttr {
+	return func(m optionalAttr) {
+		m["output_range_given"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "Invert",
-		Input: []tf.Input{
-			x,
-		},
+}
+
+// QuantizedInstanceNormGivenYMin sets the optional given_y_min attribute to value.
+//
+// value: Output in `y_min` if `output_range_given` is True.
+// If not specified, defaults to 0
+func QuantizedInstanceNormGivenYMin(value float32) QuantizedInstanceNormAttr {
+	return func(m optionalAttr) {
+		m["given_y_min"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// DequantizeAttr is an optional argument to Dequantize.
-type DequantizeAttr func(optionalAttr)
+// QuantizedInstanceNormGivenYMax sets the optional given_y_max attribute to value.
+//
+// value: Output in `y_max` if `output_range_given` is True.
+// If not specified, defaults to 0
+func QuantizedInstanceNormGivenYMax(value float32) QuantizedInstanceNormAttr {
+	return func(m optionalAttr) {
+		m["given_y_max"] = value
+	}
+}
 
-// DequantizeMode sets the optional mode attribute to value.
-// If not specified, defaults to "MIN_COMBINED"
-func DequantizeMode(value string) DequantizeAttr {
+// QuantizedInstanceNormVarianceEpsilon sets the optional variance_epsilon attribute to value.
+//
+// value: A small float number to avoid dividing by 0.
+// If not specified, defaults to 1e-05
+func QuantizedInstanceNormVarianceEpsilon(value float32) QuantizedInstanceNormAttr {
 	return func(m optionalAttr) {
-		m["mode"] = value
+		m["variance_epsilon"] = value
 	}
 }
 
-// Dequantize the 'input' tensor into a float Tensor.
+// QuantizedInstanceNormMinSeparation sets the optional min_separation attribute to value.
 //
-// [min_range, max_range] are scalar floats that specify the range for
-// the 'input' data. The 'mode' attribute controls exactly which calculations are
-// used to convert the float values to their quantized equivalents.
+// value: Minimum value of `y_max - y_min`
+// If not specified, defaults to 0.001
+func QuantizedInstanceNormMinSeparation(value float32) QuantizedInstanceNormAttr {
+	return func(m optionalAttr) {
+		m["min_separation"] = value
+	}
+}
+
+// Quantized Instance normalization.
 //
-// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
+// Arguments:
+//	x: A 4D input Tensor.
+//	x_min: The value represented by the lowest quantized input.
+//	x_max: The value represented by the highest quantized input.
 //
-// ```
-// if T == qint8, in[i] += (range(T) + 1)/ 2.0
-// out[i] = min_range + (in[i]* (max_range - min_range) / range(T))
-// ```
-// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
+// Returns A 4D Tensor.The value represented by the lowest quantized output.The value represented by the highest quantized output.
+func QuantizedInstanceNorm(scope *Scope, x tf.Output, x_min tf.Output, x_max tf.Output, optional ...QuantizedInstanceNormAttr) (y tf.Output, y_min tf.Output, y_max tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QuantizedInstanceNorm",
+		Input: []tf.Input{
+			x, x_min, x_max,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Returns the diagonal part of the tensor.
 //
-// *MIN_COMBINED Mode Example*
+// This operation returns a tensor with the `diagonal` part
+// of the `input`. The `diagonal` part is computed as follows:
 //
-// If the input comes from a QuantizedRelu6, the output type is
-// quint8 (range of 0-255) but the possible range of QuantizedRelu6 is
-// 0-6.  The min_range and max_range values are therefore 0.0 and 6.0.
-// Dequantize on quint8 will take each value, cast to float, and multiply
-// by 6 / 255.
-// Note that if quantizedtype is qint8, the operation will additionally add
-// each value by 128 prior to casting.
-//
-// If the mode is 'MIN_FIRST', then this approach is used:
-//
-// ```c++
-// num_discrete_values = 1 << (# of bits in T)
-// range_adjust = num_discrete_values / (num_discrete_values - 1)
-// range = (range_max - range_min) * range_adjust
-// range_scale = range / num_discrete_values
-// const double offset_input = static_cast<double>(input) - lowest_quantized;
-// result = range_min + ((input - numeric_limits<T>::min()) * range_scale)
-// ```
-//
-// *SCALED mode Example*
-//
-// `SCALED` mode matches the quantization approach used in
-// `QuantizeAndDequantize{V2|V3}`.
-//
-// If the mode is `SCALED`, we do not use the full range of the output type,
-// choosing to elide the lowest possible value for symmetry (e.g., output range is
-// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
-// 0.
-//
-// We first find the range of values in our tensor. The
-// range we use is always centered on 0, so we find m such that
-// ```c++
-//   m = max(abs(input_min), abs(input_max))
-// ```
-//
-// Our input tensor range is then `[-m, m]`.
+// Assume `input` has dimensions `[D1,..., Dk, D1,..., Dk]`, then the output is a
+// tensor of rank `k` with dimensions `[D1,..., Dk]` where:
 //
-// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
-// If T is signed, this is
-// ```
-//   num_bits = sizeof(T) * 8
-//   [min_fixed, max_fixed] =
-//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
-// ```
+// `diagonal[i1,..., ik] = input[i1, ..., ik, i1,..., ik]`.
 //
-// Otherwise, if T is unsigned, the fixed-point range is
-// ```
-//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
-// ```
+// For example:
 //
-// From this we compute our scaling factor, s:
-// ```c++
-//   s = (2 * m) / (max_fixed - min_fixed)
 // ```
+// # 'input' is [[1, 0, 0, 0]
+//               [0, 2, 0, 0]
+//               [0, 0, 3, 0]
+//               [0, 0, 0, 4]]
 //
-// Now we can dequantize the elements of our tensor:
-// ```c++
-// result = input * s
+// tf.diag_part(input) ==> [1, 2, 3, 4]
 // ```
 //
 // Arguments:
+//	input: Rank k tensor where k is even and not zero.
 //
-//	min_range: The minimum scalar value possibly produced for the input.
-//	max_range: The maximum scalar value possibly produced for the input.
-func Dequantize(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, optional ...DequantizeAttr) (output tf.Output) {
+// Returns The extracted diagonal.
+func DiagPart(scope *Scope, input tf.Output) (diagonal tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Dequantize",
+		Type: "DiagPart",
 		Input: []tf.Input{
-			input, min_range, max_range,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
@@ -26527,1621 +27663,115 @@ func GatherNd(scope *Scope, params tf.Output, indices tf.Output) (output tf.Outp
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
-
-// Eagerly executes a python function to compute func(input)->output. The
-//
-// semantics of the input, output, and attributes are the same as those for
-// PyFunc.
-func EagerPyFunc(scope *Scope, input []tf.Output, token string, Tout []tf.DataType) (output []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"token": token, "Tout": Tout}
-	opspec := tf.OpSpec{
-		Type: "EagerPyFunc",
-		Input: []tf.Input{
-			tf.OutputList(input),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("EagerPyFunc", err)
-		return
-	}
-	return output
-}
-
-// Stops gradient computation.
-//
-// When executed in a graph, this op outputs its input tensor as-is.
-//
-// When building ops to compute gradients, this op prevents the contribution of
-// its inputs to be taken into account.  Normally, the gradient generator adds ops
-// to a graph to compute the derivatives of a specified 'loss' by recursively
-// finding out inputs that contributed to its computation.  If you insert this op
-// in the graph it inputs are masked from the gradient generator.  They are not
-// taken into account for computing gradients.
-//
-// This is useful any time you want to compute a value with TensorFlow but need
-// to pretend that the value was a constant. Some examples include:
-//
-// *  The *EM* algorithm where the *M-step* should not involve backpropagation
-//    through the output of the *E-step*.
-// *  Contrastive divergence training of Boltzmann machines where, when
-//    differentiating the energy function, the training must not backpropagate
-//    through the graph that generated the samples from the model.
-// *  Adversarial training, where no backprop should happen through the adversarial
-//    example generation process.
-func StopGradient(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "StopGradient",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes asin of x element-wise.
-func Asin(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Asin",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// PreventGradientAttr is an optional argument to PreventGradient.
-type PreventGradientAttr func(optionalAttr)
-
-// PreventGradientMessage sets the optional message attribute to value.
-//
-// value: Will be printed in the error when anyone tries to differentiate
-// this operation.
-// If not specified, defaults to ""
-func PreventGradientMessage(value string) PreventGradientAttr {
-	return func(m optionalAttr) {
-		m["message"] = value
-	}
-}
-
-// An identity op that triggers an error if a gradient is requested.
-//
-// When executed in a graph, this op outputs its input tensor as-is.
-//
-// When building ops to compute gradients, the TensorFlow gradient system
-// will return an error when trying to lookup the gradient of this op,
-// because no gradient must ever be registered for this function.  This
-// op exists to prevent subtle bugs from silently returning unimplemented
-// gradients in some corner cases.
-//
-// Arguments:
-//	input: any tensor.
-//
-// Returns the same input tensor.
-func PreventGradient(scope *Scope, input tf.Output, optional ...PreventGradientAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "PreventGradient",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Checks a tensor for NaN and Inf values.
-//
-// When run, reports an `InvalidArgument` error if `tensor` has any values
-// that are not a number (NaN) or infinity (Inf). Otherwise, passes `tensor` as-is.
-//
-// Arguments:
-//
-//	message: Prefix of the error message.
-func CheckNumerics(scope *Scope, tensor tf.Output, message string) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"message": message}
-	opspec := tf.OpSpec{
-		Type: "CheckNumerics",
-		Input: []tf.Input{
-			tensor,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Shuffle dimensions of x according to a permutation and conjugate the result.
-//
-// The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
-//   `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
-//   `y[i,j,k,...,s,t,u] == conj(x[perm[i], perm[j], perm[k],...,perm[s], perm[t], perm[u]])`
-func ConjugateTranspose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ConjugateTranspose",
-		Input: []tf.Input{
-			x, perm,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// UniqueV2Attr is an optional argument to UniqueV2.
-type UniqueV2Attr func(optionalAttr)
-
-// UniqueV2OutIdx sets the optional out_idx attribute to value.
-// If not specified, defaults to DT_INT32
-func UniqueV2OutIdx(value tf.DataType) UniqueV2Attr {
-	return func(m optionalAttr) {
-		m["out_idx"] = value
-	}
-}
-
-// Finds unique elements in a 1-D tensor.
-//
-// This operation returns a tensor `y` containing all of the unique elements of `x`
-// sorted in the same order that they occur in `x`. This operation also returns a
-// tensor `idx` the same size as `x` that contains the index of each value of `x`
-// in the unique output `y`. In other words:
-//
-// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
-//
-// For example:
-//
-// ```
-// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
-// y, idx = unique(x)
-// y ==> [1, 2, 4, 7, 8]
-// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
-// ```
-//
-// Arguments:
-//	x: A `Tensor`.
-//	axis: A `Tensor` of type `int64` (default: 0). The axis of the Tensor to
-// find the unique elements.
-//
-// Returns A `Tensor`. Unique elements along the `axis` of `Tensor` x.A 1-D Tensor. Has the same type as x that contains the index of each
-// value of x in the output y.
-func UniqueV2(scope *Scope, x tf.Output, axis tf.Output, optional ...UniqueV2Attr) (y tf.Output, idx tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "UniqueV2",
-		Input: []tf.Input{
-			x, axis,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Return a slice from 'input'.
-//
-// The output tensor is a tensor with dimensions described by 'size'
-// whose values are extracted from 'input' starting at the offsets in
-// 'begin'.
-//
-// *Requirements*:
-//   0 <= begin[i] <= begin[i] + size[i] <= Di  for i in [0, n)
-//
-// Arguments:
-//
-//	begin: begin[i] specifies the offset into the 'i'th dimension of
-// 'input' to slice from.
-//	size: size[i] specifies the number of elements of the 'i'th dimension
-// of 'input' to slice. If size[i] is -1, all remaining elements in dimension
-// i are included in the slice (i.e. this is equivalent to setting
-// size[i] = input.dim_size(i) - begin[i]).
-func Slice(scope *Scope, input tf.Output, begin tf.Output, size tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Slice",
-		Input: []tf.Input{
-			input, begin, size,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// StridedSliceGradAttr is an optional argument to StridedSliceGrad.
-type StridedSliceGradAttr func(optionalAttr)
-
-// StridedSliceGradBeginMask sets the optional begin_mask attribute to value.
-// If not specified, defaults to 0
-func StridedSliceGradBeginMask(value int64) StridedSliceGradAttr {
-	return func(m optionalAttr) {
-		m["begin_mask"] = value
-	}
-}
-
-// StridedSliceGradEndMask sets the optional end_mask attribute to value.
-// If not specified, defaults to 0
-func StridedSliceGradEndMask(value int64) StridedSliceGradAttr {
-	return func(m optionalAttr) {
-		m["end_mask"] = value
-	}
-}
-
-// StridedSliceGradEllipsisMask sets the optional ellipsis_mask attribute to value.
-// If not specified, defaults to 0
-func StridedSliceGradEllipsisMask(value int64) StridedSliceGradAttr {
-	return func(m optionalAttr) {
-		m["ellipsis_mask"] = value
-	}
-}
-
-// StridedSliceGradNewAxisMask sets the optional new_axis_mask attribute to value.
-// If not specified, defaults to 0
-func StridedSliceGradNewAxisMask(value int64) StridedSliceGradAttr {
-	return func(m optionalAttr) {
-		m["new_axis_mask"] = value
-	}
-}
-
-// StridedSliceGradShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
-// If not specified, defaults to 0
-func StridedSliceGradShrinkAxisMask(value int64) StridedSliceGradAttr {
-	return func(m optionalAttr) {
-		m["shrink_axis_mask"] = value
-	}
-}
-
-// Returns the gradient of `StridedSlice`.
-//
-// Since `StridedSlice` cuts out pieces of its `input` which is size
-// `shape`, its gradient will have the same shape (which is passed here
-// as `shape`). The gradient will be zero in any element that the slice
-// does not select.
-//
-// Arguments are the same as StridedSliceGrad with the exception that
-// `dy` is the input gradient to be propagated and `shape` is the
-// shape of `StridedSlice`'s `input`.
-func StridedSliceGrad(scope *Scope, shape tf.Output, begin tf.Output, end tf.Output, strides tf.Output, dy tf.Output, optional ...StridedSliceGradAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StridedSliceGrad",
-		Input: []tf.Input{
-			shape, begin, end, strides, dy,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns the gradient of `Tile`.
-//
-// DEPRECATED at GraphDef version 3: TileGrad has been replaced with reduce_sum
-//
-// Since `Tile` takes an input and repeats the input `multiples` times
-// along each dimension, `TileGrad` takes in `multiples` and aggregates
-// each repeated tile of `input` into `output`.
-func TileGrad(scope *Scope, input tf.Output, multiples tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TileGrad",
-		Input: []tf.Input{
-			input, multiples,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// DataFormatDimMapAttr is an optional argument to DataFormatDimMap.
-type DataFormatDimMapAttr func(optionalAttr)
-
-// DataFormatDimMapSrcFormat sets the optional src_format attribute to value.
-//
-// value: source data format.
-// If not specified, defaults to "NHWC"
-func DataFormatDimMapSrcFormat(value string) DataFormatDimMapAttr {
-	return func(m optionalAttr) {
-		m["src_format"] = value
-	}
-}
-
-// DataFormatDimMapDstFormat sets the optional dst_format attribute to value.
-//
-// value: destination data format.
-// If not specified, defaults to "NCHW"
-func DataFormatDimMapDstFormat(value string) DataFormatDimMapAttr {
-	return func(m optionalAttr) {
-		m["dst_format"] = value
-	}
-}
-
-// Returns the dimension index in the destination data format given the one in
-//
-// the source data format.
-//
-// Arguments:
-//	x: A Tensor with each element as a dimension index in source data format.
-// Must be in the range [-4, 4).
-//
-// Returns A Tensor with each element as a dimension index in destination data format.
-func DataFormatDimMap(scope *Scope, x tf.Output, optional ...DataFormatDimMapAttr) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "DataFormatDimMap",
-		Input: []tf.Input{
-			x,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Return the shape of s0 op s1 with broadcast.
-//
-// Given `s0` and `s1`, tensors that represent shapes, compute `r0`, the
-// broadcasted shape. `s0`, `s1` and `r0` are all integer vectors.
-func BroadcastArgs(scope *Scope, s0 tf.Output, s1 tf.Output) (r0 tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BroadcastArgs",
-		Input: []tf.Input{
-			s0, s1,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Return the reduction indices for computing gradients of s0 op s1 with broadcast.
-//
-// This is typically used by gradient computations for a broadcasting operation.
-func BroadcastGradientArgs(scope *Scope, s0 tf.Output, s1 tf.Output) (r0 tf.Output, r1 tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BroadcastGradientArgs",
-		Input: []tf.Input{
-			s0, s1,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Pads a tensor with mirrored values.
-//
-// This operation pads a `input` with mirrored values according to the `paddings`
-// you specify. `paddings` is an integer tensor with shape `[n, 2]`, where n is
-// the rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
-// how many values to add before the contents of `input` in that dimension, and
-// `paddings[D, 1]` indicates how many values to add after the contents of `input`
-// in that dimension. Both `paddings[D, 0]` and `paddings[D, 1]` must be no greater
-// than `input.dim_size(D)` (or `input.dim_size(D) - 1`) if `copy_border` is true
-// (if false, respectively).
-//
-// The padded size of each dimension D of the output is:
-//
-// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
-//
-// For example:
-//
-// ```
-// # 't' is [[1, 2, 3], [4, 5, 6]].
-// # 'paddings' is [[1, 1]], [2, 2]].
-// # 'mode' is SYMMETRIC.
-// # rank of 't' is 2.
-// pad(t, paddings) ==> [[2, 1, 1, 2, 3, 3, 2]
-//                       [2, 1, 1, 2, 3, 3, 2]
-//                       [5, 4, 4, 5, 6, 6, 5]
-//                       [5, 4, 4, 5, 6, 6, 5]]
-// ```
-//
-// Arguments:
-//	input: The input tensor to be padded.
-//	paddings: A two-column matrix specifying the padding sizes. The number of
-// rows must be the same as the rank of `input`.
-//	mode: Either `REFLECT` or `SYMMETRIC`. In reflect mode the padded regions
-// do not include the borders, while in symmetric mode the padded regions
-// do include the borders. For example, if `input` is `[1, 2, 3]` and `paddings`
-// is `[0, 2]`, then the output is `[1, 2, 3, 2, 1]` in reflect mode, and
-// it is `[1, 2, 3, 3, 2]` in symmetric mode.
-//
-// Returns The padded tensor.
-func MirrorPad(scope *Scope, input tf.Output, paddings tf.Output, mode string) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"mode": mode}
-	opspec := tf.OpSpec{
-		Type: "MirrorPad",
-		Input: []tf.Input{
-			input, paddings,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// A placeholder op for a value that will be fed into the computation.
-//
-// DEPRECATED at GraphDef version 23: Placeholder now behaves the same as PlaceholderV2.
-//
-// N.B. This operation will fail with an error if it is executed. It is
-// intended as a way to represent a value that will always be fed, and to
-// provide attrs that enable the fed value to be checked at runtime.
-//
-// Arguments:
-//	dtype: The type of elements in the tensor.
-//	shape: The shape of the tensor. The shape can be any partially-specified
-// shape.  To be unconstrained, pass in a shape with unknown rank.
-//
-// Returns A placeholder tensor that must be replaced using the feed mechanism.
-func PlaceholderV2(scope *Scope, dtype tf.DataType, shape tf.Shape) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
-	opspec := tf.OpSpec{
-		Type: "PlaceholderV2",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceApplyAdadeltaAttr is an optional argument to ResourceApplyAdadelta.
-type ResourceApplyAdadeltaAttr func(optionalAttr)
-
-// ResourceApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
-//
-// value: If True, updating of the var, accum and update_accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyAdadeltaUseLocking(value bool) ResourceApplyAdadeltaAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the adadelta scheme.
-//
-// accum = rho() * accum + (1 - rho()) * grad.square();
-// update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad;
-// update_accum = rho() * update_accum + (1 - rho()) * update.square();
-// var -= update;
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	accum_update: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay factor. Must be a scalar.
-//	epsilon: Constant factor. Must be a scalar.
-//	grad: The gradient.
-//
-// Returns the created operation.
-func ResourceApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdadeltaAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdadelta",
-		Input: []tf.Input{
-			var_, accum, accum_update, lr, rho, epsilon, grad,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// SqueezeAttr is an optional argument to Squeeze.
-type SqueezeAttr func(optionalAttr)
-
-// SqueezeAxis sets the optional axis attribute to value.
-//
-// value: If specified, only squeezes the dimensions listed. The dimension
-// index starts at 0. It is an error to squeeze a dimension that is not 1. Must
-// be in the range `[-rank(input), rank(input))`.
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func SqueezeAxis(value []int64) SqueezeAttr {
-	return func(m optionalAttr) {
-		m["squeeze_dims"] = value
-	}
-}
-
-// Removes dimensions of size 1 from the shape of a tensor.
-//
-// Given a tensor `input`, this operation returns a tensor of the same type with
-// all dimensions of size 1 removed. If you don't want to remove all size 1
-// dimensions, you can remove specific size 1 dimensions by specifying
-// `axis`.
-//
-// For example:
-//
-// ```
-// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
-// shape(squeeze(t)) ==> [2, 3]
-// ```
-//
-// Or, to remove specific size 1 dimensions:
-//
-// ```
-// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
-// shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1]
-// ```
-//
-// Arguments:
-//	input: The `input` to squeeze.
-//
-// Returns Contains the same data as `input`, but has one or more dimensions of
-// size 1 removed.
-func Squeeze(scope *Scope, input tf.Output, optional ...SqueezeAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Squeeze",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// SpaceToBatch for N-D tensors of type T.
-//
-// This operation divides "spatial" dimensions `[1, ..., M]` of the input into a
-// grid of blocks of shape `block_shape`, and interleaves these blocks with the
-// "batch" dimension (0) such that in the output, the spatial dimensions
-// `[1, ..., M]` correspond to the position within the grid, and the batch
-// dimension combines both the position within a spatial block and the original
-// batch position.  Prior to division into blocks, the spatial dimensions of the
-// input are optionally zero padded according to `paddings`.  See below for a
-// precise description.
-//
-// Arguments:
-//	input: N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,
-// where spatial_shape has `M` dimensions.
-//	block_shape: 1-D with shape `[M]`, all values must be >= 1.
-//	paddings: 2-D with shape `[M, 2]`, all values must be >= 0.
-//   `paddings[i] = [pad_start, pad_end]` specifies the padding for input dimension
-//   `i + 1`, which corresponds to spatial dimension `i`.  It is required that
-//   `block_shape[i]` divides `input_shape[i + 1] + pad_start + pad_end`.
-//
-// This operation is equivalent to the following steps:
-//
-// 1. Zero-pad the start and end of dimensions `[1, ..., M]` of the
-//    input according to `paddings` to produce `padded` of shape `padded_shape`.
-//
-// 2. Reshape `padded` to `reshaped_padded` of shape:
-//
-//      [batch] +
-//      [padded_shape[1] / block_shape[0],
-//        block_shape[0],
-//       ...,
-//       padded_shape[M] / block_shape[M-1],
-//       block_shape[M-1]] +
-//      remaining_shape
-//
-// 3. Permute dimensions of `reshaped_padded` to produce
-//    `permuted_reshaped_padded` of shape:
-//
-//      block_shape +
-//      [batch] +
-//      [padded_shape[1] / block_shape[0],
-//       ...,
-//       padded_shape[M] / block_shape[M-1]] +
-//      remaining_shape
-//
-// 4. Reshape `permuted_reshaped_padded` to flatten `block_shape` into the batch
-//    dimension, producing an output tensor of shape:
-//
-//      [batch * prod(block_shape)] +
-//      [padded_shape[1] / block_shape[0],
-//       ...,
-//       padded_shape[M] / block_shape[M-1]] +
-//      remaining_shape
-//
-// Some examples:
-//
-// (1) For the following input of shape `[1, 2, 2, 1]`, `block_shape = [2, 2]`, and
-//     `paddings = [[0, 0], [0, 0]]`:
-//
-// ```
-// x = [[[[1], [2]], [[3], [4]]]]
-// ```
-//
-// The output tensor has shape `[4, 1, 1, 1]` and value:
-//
-// ```
-// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
-// ```
-//
-// (2) For the following input of shape `[1, 2, 2, 3]`, `block_shape = [2, 2]`, and
-//     `paddings = [[0, 0], [0, 0]]`:
-//
-// ```
-// x = [[[[1, 2, 3], [4, 5, 6]],
-//       [[7, 8, 9], [10, 11, 12]]]]
-// ```
-//
-// The output tensor has shape `[4, 1, 1, 3]` and value:
-//
-// ```
-// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
-// ```
-//
-// (3) For the following input of shape `[1, 4, 4, 1]`, `block_shape = [2, 2]`, and
-//     `paddings = [[0, 0], [0, 0]]`:
-//
-// ```
-// x = [[[[1],   [2],  [3],  [4]],
-//       [[5],   [6],  [7],  [8]],
-//       [[9],  [10], [11],  [12]],
-//       [[13], [14], [15],  [16]]]]
-// ```
-//
-// The output tensor has shape `[4, 2, 2, 1]` and value:
-//
-// ```
-// x = [[[[1], [3]], [[9], [11]]],
-//      [[[2], [4]], [[10], [12]]],
-//      [[[5], [7]], [[13], [15]]],
-//      [[[6], [8]], [[14], [16]]]]
-// ```
-//
-// (4) For the following input of shape `[2, 2, 4, 1]`, block_shape = `[2, 2]`, and
-//     paddings = `[[0, 0], [2, 0]]`:
-//
-// ```
-// x = [[[[1],   [2],  [3],  [4]],
-//       [[5],   [6],  [7],  [8]]],
-//      [[[9],  [10], [11],  [12]],
-//       [[13], [14], [15],  [16]]]]
-// ```
-//
-// The output tensor has shape `[8, 1, 3, 1]` and value:
-//
-// ```
-// x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
-//      [[[0], [2], [4]]], [[[0], [10], [12]]],
-//      [[[0], [5], [7]]], [[[0], [13], [15]]],
-//      [[[0], [6], [8]]], [[[0], [14], [16]]]]
-// ```
-//
-// Among others, this operation is useful for reducing atrous convolution into
-// regular convolution.
-func SpaceToBatchND(scope *Scope, input tf.Output, block_shape tf.Output, paddings tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SpaceToBatchND",
-		Input: []tf.Input{
-			input, block_shape, paddings,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// QuantizeAndDequantizeV2Attr is an optional argument to QuantizeAndDequantizeV2.
-type QuantizeAndDequantizeV2Attr func(optionalAttr)
-
-// QuantizeAndDequantizeV2SignedInput sets the optional signed_input attribute to value.
-//
-// value: If the quantization is signed or unsigned.
-// If not specified, defaults to true
-func QuantizeAndDequantizeV2SignedInput(value bool) QuantizeAndDequantizeV2Attr {
-	return func(m optionalAttr) {
-		m["signed_input"] = value
-	}
-}
-
-// QuantizeAndDequantizeV2NumBits sets the optional num_bits attribute to value.
-//
-// value: The bitwidth of the quantization.
-// If not specified, defaults to 8
-func QuantizeAndDequantizeV2NumBits(value int64) QuantizeAndDequantizeV2Attr {
-	return func(m optionalAttr) {
-		m["num_bits"] = value
-	}
-}
-
-// QuantizeAndDequantizeV2RangeGiven sets the optional range_given attribute to value.
-//
-// value: If the range is given or should be computed from the tensor.
-// If not specified, defaults to false
-func QuantizeAndDequantizeV2RangeGiven(value bool) QuantizeAndDequantizeV2Attr {
-	return func(m optionalAttr) {
-		m["range_given"] = value
-	}
-}
-
-// Quantizes then dequantizes a tensor.
-//
-// This op simulates the precision loss from the quantized forward pass by:
-// 1. Quantizing the tensor to fixed point numbers, which should match the target
-//    quantization method when it is used in inference.
-// 2. Dequantizing it back to floating point numbers for the following ops, most
-//    likely matmul.
-//
-// There are different ways to quantize. This version does not use the full range
-// of the output type, choosing to elide the lowest possible value for symmetry
-// (e.g., output range is -127 to 127, not -128 to 127 for signed 8 bit
-// quantization), so that 0.0 maps to 0.
-//
-// To perform this op, we first find the range of values in our tensor. The range
-// we use is always centered on 0, so we find m such that
-//
-// 1. m = max(abs(input_min), abs(input_max)) if range_given is true,
-// 2. m = max(abs(min_elem(input)), abs(max_elem(input))) otherwise.
-//
-// Our input tensor range is then [-m, m].
-//
-// Next, we choose our fixed-point quantization buckets, [min_fixed, max_fixed].
-// If signed_input is true, this is
-//
-//   [min_fixed, max_fixed ] =
-//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1].
-//
-// Otherwise, if signed_input is false, the fixed-point range is
-//
-//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1].
-//
-// From this we compute our scaling factor, s:
-//
-//   s = (max_fixed - min_fixed) / (2 * m).
-//
-// Now we can quantize and dequantize the elements of our tensor.  An element e
-// is transformed into e':
-//
-//   e' = (e * s).round_to_nearest() / s.
-//
-// Note that we have a different number of buckets in the signed vs. unsigned
-// cases.  For example, if num_bits == 8, we get 254 buckets in the signed case
-// vs. 255 in the unsigned case.
-//
-// For example, suppose num_bits = 8 and m = 1.  Then
-//
-//   [min_fixed, max_fixed] = [-127, 127], and
-//   s = (127 + 127) / 2 = 127.
-//
-// Given the vector {-1, -0.5, 0, 0.3}, this is quantized to
-// {-127, -63, 0, 38}, and dequantized to {-1, -63.0/127, 0, 38.0/127}.
-//
-// Arguments:
-//	input: Tensor to quantize and then dequantize.
-//	input_min: If range_given, this is the min of the range, otherwise this input
-// will be ignored.
-//	input_max: If range_given, this is the max of the range, otherwise this input
-// will be ignored.
-func QuantizeAndDequantizeV2(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, optional ...QuantizeAndDequantizeV2Attr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "QuantizeAndDequantizeV2",
-		Input: []tf.Input{
-			input, input_min, input_max,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// SpaceToBatch for 4-D tensors of type T.
-//
-// This is a legacy version of the more general SpaceToBatchND.
-//
-// Zero-pads and then rearranges (permutes) blocks of spatial data into batch.
-// More specifically, this op outputs a copy of the input tensor where values from
-// the `height` and `width` dimensions are moved to the `batch` dimension. After
-// the zero-padding, both `height` and `width` of the input must be divisible by the
-// block size.
-//
-// Arguments:
-//	input: 4-D with shape `[batch, height, width, depth]`.
-//	paddings: 2-D tensor of non-negative integers with shape `[2, 2]`. It specifies
-//   the padding of the input with zeros across the spatial dimensions as follows:
-//
-//       paddings = [[pad_top, pad_bottom], [pad_left, pad_right]]
-//
-//   The effective spatial dimensions of the zero-padded input tensor will be:
-//
-//       height_pad = pad_top + height + pad_bottom
-//       width_pad = pad_left + width + pad_right
-//
-// The attr `block_size` must be greater than one. It indicates the block size.
-//
-//   * Non-overlapping blocks of size `block_size x block size` in the height and
-//     width dimensions are rearranged into the batch dimension at each location.
-//   * The batch of the output tensor is `batch * block_size * block_size`.
-//   * Both height_pad and width_pad must be divisible by block_size.
-//
-// The shape of the output will be:
-//
-//     [batch*block_size*block_size, height_pad/block_size, width_pad/block_size,
-//      depth]
-//
-// Some examples:
-//
-// (1) For the following input of shape `[1, 2, 2, 1]` and block_size of 2:
-//
-// ```
-// x = [[[[1], [2]], [[3], [4]]]]
-// ```
-//
-// The output tensor has shape `[4, 1, 1, 1]` and value:
-//
-// ```
-// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
-// ```
-//
-// (2) For the following input of shape `[1, 2, 2, 3]` and block_size of 2:
-//
-// ```
-// x = [[[[1, 2, 3], [4, 5, 6]],
-//       [[7, 8, 9], [10, 11, 12]]]]
-// ```
-//
-// The output tensor has shape `[4, 1, 1, 3]` and value:
-//
-// ```
-// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
-// ```
-//
-// (3) For the following input of shape `[1, 4, 4, 1]` and block_size of 2:
-//
-// ```
-// x = [[[[1],   [2],  [3],  [4]],
-//       [[5],   [6],  [7],  [8]],
-//       [[9],  [10], [11],  [12]],
-//       [[13], [14], [15],  [16]]]]
-// ```
-//
-// The output tensor has shape `[4, 2, 2, 1]` and value:
-//
-// ```
-// x = [[[[1], [3]], [[9], [11]]],
-//      [[[2], [4]], [[10], [12]]],
-//      [[[5], [7]], [[13], [15]]],
-//      [[[6], [8]], [[14], [16]]]]
-// ```
-//
-// (4) For the following input of shape `[2, 2, 4, 1]` and block_size of 2:
-//
-// ```
-// x = [[[[1],   [2],  [3],  [4]],
-//       [[5],   [6],  [7],  [8]]],
-//      [[[9],  [10], [11],  [12]],
-//       [[13], [14], [15],  [16]]]]
-// ```
-//
-// The output tensor has shape `[8, 1, 2, 1]` and value:
-//
-// ```
-// x = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],
-//      [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]
-// ```
-//
-// Among others, this operation is useful for reducing atrous convolution into
-// regular convolution.
-//
-func SpaceToBatch(scope *Scope, input tf.Output, paddings tf.Output, block_size int64) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"block_size": block_size}
-	opspec := tf.OpSpec{
-		Type: "SpaceToBatch",
-		Input: []tf.Input{
-			input, paddings,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// UnpackAttr is an optional argument to Unpack.
-type UnpackAttr func(optionalAttr)
-
-// UnpackAxis sets the optional axis attribute to value.
-//
-// value: Dimension along which to unpack.  Negative values wrap around, so the
-// valid range is `[-R, R)`.
-// If not specified, defaults to 0
-func UnpackAxis(value int64) UnpackAttr {
-	return func(m optionalAttr) {
-		m["axis"] = value
-	}
-}
-
-// Unpacks a given dimension of a rank-`R` tensor into `num` rank-`(R-1)` tensors.
-//
-// Unpacks `num` tensors from `value` by chipping it along the `axis` dimension.
-// For example, given a tensor of shape `(A, B, C, D)`;
-//
-// If `axis == 0` then the i'th tensor in `output` is the slice `value[i, :, :, :]`
-//   and each tensor in `output` will have shape `(B, C, D)`. (Note that the
-//   dimension unpacked along is gone, unlike `split`).
-//
-// If `axis == 1` then the i'th tensor in `output` is the slice `value[:, i, :, :]`
-//   and each tensor in `output` will have shape `(A, C, D)`.
-// Etc.
-//
-// This is the opposite of `pack`.
-//
-// Arguments:
-//	value: 1-D or higher, with `axis` dimension size equal to `num`.
-//
-//
-// Returns The list of tensors unpacked from `value`.
-func Unpack(scope *Scope, value tf.Output, num int64, optional ...UnpackAttr) (output []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num": num}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Unpack",
-		Input: []tf.Input{
-			value,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("Unpack", err)
-		return
-	}
-	return output
-}
-
-// Increments variable pointed to by 'resource' until it reaches 'limit'.
-//
-// Arguments:
-//	resource: Should be from a scalar `Variable` node.
-//	limit: If incrementing ref would bring it above limit, instead generates an
-// 'OutOfRange' error.
-//
-//
-// Returns A copy of the input before increment. If nothing else modifies the
-// input, the values produced will all be distinct.
-func ResourceCountUpTo(scope *Scope, resource tf.Output, limit int64, T tf.DataType) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"limit": limit, "T": T}
-	opspec := tf.OpSpec{
-		Type: "ResourceCountUpTo",
-		Input: []tf.Input{
-			resource,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Delete the stack from its resource container.
-//
-// Arguments:
-//	handle: The handle to a stack.
-//
-// Returns the created operation.
-func StackCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "StackCloseV2",
-		Input: []tf.Input{
-			handle,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// BatchToSpace for N-D tensors of type T.
-//
-// This operation reshapes the "batch" dimension 0 into `M + 1` dimensions of shape
-// `block_shape + [batch]`, interleaves these blocks back into the grid defined by
-// the spatial dimensions `[1, ..., M]`, to obtain a result with the same rank as
-// the input.  The spatial dimensions of this intermediate result are then
-// optionally cropped according to `crops` to produce the output.  This is the
-// reverse of SpaceToBatch.  See below for a precise description.
-//
-// Arguments:
-//	input: N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,
-// where spatial_shape has M dimensions.
-//	block_shape: 1-D with shape `[M]`, all values must be >= 1.
-//	crops: 2-D with shape `[M, 2]`, all values must be >= 0.
-//   `crops[i] = [crop_start, crop_end]` specifies the amount to crop from input
-//   dimension `i + 1`, which corresponds to spatial dimension `i`.  It is
-//   required that
-//   `crop_start[i] + crop_end[i] <= block_shape[i] * input_shape[i + 1]`.
-//
-// This operation is equivalent to the following steps:
-//
-// 1. Reshape `input` to `reshaped` of shape:
-//      [block_shape[0], ..., block_shape[M-1],
-//       batch / prod(block_shape),
-//       input_shape[1], ..., input_shape[N-1]]
-//
-// 2. Permute dimensions of `reshaped` to produce `permuted` of shape
-//      [batch / prod(block_shape),
-//
-//       input_shape[1], block_shape[0],
-//       ...,
-//       input_shape[M], block_shape[M-1],
-//
-//       input_shape[M+1], ..., input_shape[N-1]]
-//
-// 3. Reshape `permuted` to produce `reshaped_permuted` of shape
-//      [batch / prod(block_shape),
-//
-//       input_shape[1] * block_shape[0],
-//       ...,
-//       input_shape[M] * block_shape[M-1],
-//
-//       input_shape[M+1],
-//       ...,
-//       input_shape[N-1]]
-//
-// 4. Crop the start and end of dimensions `[1, ..., M]` of
-//    `reshaped_permuted` according to `crops` to produce the output of shape:
-//      [batch / prod(block_shape),
-//
-//       input_shape[1] * block_shape[0] - crops[0,0] - crops[0,1],
-//       ...,
-//       input_shape[M] * block_shape[M-1] - crops[M-1,0] - crops[M-1,1],
-//
-//       input_shape[M+1], ..., input_shape[N-1]]
-//
-// Some examples:
-//
-// (1) For the following input of shape `[4, 1, 1, 1]`, `block_shape = [2, 2]`, and
-//     `crops = [[0, 0], [0, 0]]`:
-//
-// ```
-// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
-// ```
-//
-// The output tensor has shape `[1, 2, 2, 1]` and value:
-//
-// ```
-// x = [[[[1], [2]], [[3], [4]]]]
-// ```
-//
-// (2) For the following input of shape `[4, 1, 1, 3]`, `block_shape = [2, 2]`, and
-//     `crops = [[0, 0], [0, 0]]`:
-//
-// ```
-// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
-// ```
-//
-// The output tensor has shape `[1, 2, 2, 3]` and value:
-//
-// ```
-// x = [[[[1, 2, 3], [4, 5, 6]],
-//       [[7, 8, 9], [10, 11, 12]]]]
-// ```
-//
-// (3) For the following input of shape `[4, 2, 2, 1]`, `block_shape = [2, 2]`, and
-//     `crops = [[0, 0], [0, 0]]`:
-//
-// ```
-// x = [[[[1], [3]], [[9], [11]]],
-//      [[[2], [4]], [[10], [12]]],
-//      [[[5], [7]], [[13], [15]]],
-//      [[[6], [8]], [[14], [16]]]]
-// ```
-//
-// The output tensor has shape `[1, 4, 4, 1]` and value:
-//
-// ```
-// x = [[[1],   [2],  [3],  [4]],
-//      [[5],   [6],  [7],  [8]],
-//      [[9],  [10], [11],  [12]],
-//      [[13], [14], [15],  [16]]]
-// ```
-//
-// (4) For the following input of shape `[8, 1, 3, 1]`, `block_shape = [2, 2]`, and
-//     `crops = [[0, 0], [2, 0]]`:
-//
-// ```
-// x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
-//      [[[0], [2], [4]]], [[[0], [10], [12]]],
-//      [[[0], [5], [7]]], [[[0], [13], [15]]],
-//      [[[0], [6], [8]]], [[[0], [14], [16]]]]
-// ```
-//
-// The output tensor has shape `[2, 2, 4, 1]` and value:
-//
-// ```
-// x = [[[[1],   [2],  [3],  [4]],
-//       [[5],   [6],  [7],  [8]]],
-//      [[[9],  [10], [11],  [12]],
-//       [[13], [14], [15],  [16]]]]
-// ```
-func BatchToSpaceND(scope *Scope, input tf.Output, block_shape tf.Output, crops tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BatchToSpaceND",
-		Input: []tf.Input{
-			input, block_shape, crops,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Extract `patches` from `images` and put them in the "depth" output dimension.
-//
-// Arguments:
-//	images: 4-D Tensor with shape `[batch, in_rows, in_cols, depth]`.
-//	ksizes: The size of the sliding window for each dimension of `images`.
-//	strides: 1-D of length 4. How far the centers of two consecutive patches are in
-// the images. Must be: `[1, stride_rows, stride_cols, 1]`.
-//	rates: 1-D of length 4. Must be: `[1, rate_rows, rate_cols, 1]`. This is the
-// input stride, specifying how far two consecutive patch samples are in the
-// input. Equivalent to extracting patches with
-// `patch_sizes_eff = patch_sizes + (patch_sizes - 1) * (rates - 1)`, followed by
-// subsampling them spatially by a factor of `rates`. This is equivalent to
-// `rate` in dilated (a.k.a. Atrous) convolutions.
-//	padding: The type of padding algorithm to use.
-//
-// We specify the size-related attributes as:
-//
-// ```python
-//       ksizes = [1, ksize_rows, ksize_cols, 1]
-//       strides = [1, strides_rows, strides_cols, 1]
-//       rates = [1, rates_rows, rates_cols, 1]
-// ```
-//
-// Returns 4-D Tensor with shape `[batch, out_rows, out_cols, ksize_rows *
-// ksize_cols * depth]` containing image patches with size
-// `ksize_rows x ksize_cols x depth` vectorized in the "depth" dimension. Note
-// `out_rows` and `out_cols` are the dimensions of the output patches.
-func ExtractImagePatches(scope *Scope, images tf.Output, ksizes []int64, strides []int64, rates []int64, padding string) (patches tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"ksizes": ksizes, "strides": strides, "rates": rates, "padding": padding}
-	opspec := tf.OpSpec{
-		Type: "ExtractImagePatches",
-		Input: []tf.Input{
-			images,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Bitcasts a tensor from one type to another without copying data.
-//
-// Given a tensor `input`, this operation returns a tensor that has the same buffer
-// data as `input` with datatype `type`.
-//
-// If the input datatype `T` is larger than the output datatype `type` then the
-// shape changes from [...] to [..., sizeof(`T`)/sizeof(`type`)].
-//
-// If `T` is smaller than `type`, the operator requires that the rightmost
-// dimension be equal to sizeof(`type`)/sizeof(`T`). The shape then goes from
-// [..., sizeof(`type`)/sizeof(`T`)] to [...].
-//
-// *NOTE*: Bitcast is implemented as a low-level cast, so machines with different
-// endian orderings will give different results.
-func Bitcast(scope *Scope, input tf.Output, type_ tf.DataType) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"type": type_}
-	opspec := tf.OpSpec{
-		Type: "Bitcast",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// OneHotAttr is an optional argument to OneHot.
-type OneHotAttr func(optionalAttr)
-
-// OneHotAxis sets the optional axis attribute to value.
-//
-// value: The axis to fill (default: -1, a new inner-most axis).
-// If not specified, defaults to -1
-func OneHotAxis(value int64) OneHotAttr {
-	return func(m optionalAttr) {
-		m["axis"] = value
-	}
-}
-
-// Returns a one-hot tensor.
-//
-// The locations represented by indices in `indices` take value `on_value`,
-// while all other locations take value `off_value`.
-//
-// If the input `indices` is rank `N`, the output will have rank `N+1`,
-// The new axis is created at dimension `axis` (default: the new axis is
-// appended at the end).
-//
-// If `indices` is a scalar the output shape will be a vector of length `depth`.
-//
-// If `indices` is a vector of length `features`, the output shape will be:
-// ```
-//   features x depth if axis == -1
-//   depth x features if axis == 0
-// ```
-//
-// If `indices` is a matrix (batch) with shape `[batch, features]`,
-// the output shape will be:
-// ```
-//   batch x features x depth if axis == -1
-//   batch x depth x features if axis == 1
-//   depth x batch x features if axis == 0
-// ```
-//
-//
-// Examples
-// =========
-//
-// Suppose that
-//
-// ```
-//   indices = [0, 2, -1, 1]
-//   depth = 3
-//   on_value = 5.0
-//   off_value = 0.0
-//   axis = -1
-// ```
-//
-// Then output is `[4 x 3]`:
-//
-//     ```output =
-//       [5.0 0.0 0.0]  // one_hot(0)
-//       [0.0 0.0 5.0]  // one_hot(2)
-//       [0.0 0.0 0.0]  // one_hot(-1)
-//       [0.0 5.0 0.0]  // one_hot(1)
-//     ```
-//
-// Suppose that
-//
-// ```
-//   indices = [0, 2, -1, 1]
-//   depth = 3
-//   on_value = 0.0
-//   off_value = 3.0
-//   axis = 0
-// ```
-//
-// Then output is `[3 x 4]`:
-//
-//     ```output =
-//       [0.0 3.0 3.0 3.0]
-//       [3.0 3.0 3.0 0.0]
-//       [3.0 3.0 3.0 3.0]
-//       [3.0 0.0 3.0 3.0]
-//     //  ^                one_hot(0)
-//     //      ^            one_hot(2)
-//     //          ^        one_hot(-1)
-//     //              ^    one_hot(1)
-//     ```
-// Suppose that
-//
-// ```
-//   indices = [[0, 2], [1, -1]]
-//   depth = 3
-//   on_value = 1.0
-//   off_value = 0.0
-//   axis = -1
-// ```
-//
-// Then output is `[2 x 2 x 3]`:
-//
-//     ```output =
-//       [
-//         [1.0, 0.0, 0.0]  // one_hot(0)
-//         [0.0, 0.0, 1.0]  // one_hot(2)
-//       ][
-//         [0.0, 1.0, 0.0]  // one_hot(1)
-//         [0.0, 0.0, 0.0]  // one_hot(-1)
-//       ]```
-//
-// Arguments:
-//	indices: A tensor of indices.
-//	depth: A scalar defining the depth of the one hot dimension.
-//	on_value: A scalar defining the value to fill in output when `indices[j] = i`.
-//	off_value: A scalar defining the value to fill in output when `indices[j] != i`.
-//
-// Returns The one-hot tensor.
-func OneHot(scope *Scope, indices tf.Output, depth tf.Output, on_value tf.Output, off_value tf.Output, optional ...OneHotAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "OneHot",
-		Input: []tf.Input{
-			indices, depth, on_value, off_value,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// QueueDequeueV2Attr is an optional argument to QueueDequeueV2.
-type QueueDequeueV2Attr func(optionalAttr)
-
-// QueueDequeueV2TimeoutMs sets the optional timeout_ms attribute to value.
-//
-// value: If the queue is empty, this operation will block for up to
-// timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueDequeueV2TimeoutMs(value int64) QueueDequeueV2Attr {
-	return func(m optionalAttr) {
-		m["timeout_ms"] = value
-	}
-}
-
-// Dequeues a tuple of one or more tensors from the given queue.
-//
-// This operation has k outputs, where k is the number of components
-// in the tuples stored in the given queue, and output i is the ith
-// component of the dequeued tuple.
-//
-// N.B. If the queue is empty, this operation will block until an element
-// has been dequeued (or 'timeout_ms' elapses, if specified).
-//
-// Arguments:
-//	handle: The handle to a queue.
-//	component_types: The type of each component in a tuple.
-//
-// Returns One or more tensors that were dequeued as a tuple.
-func QueueDequeueV2(scope *Scope, handle tf.Output, component_types []tf.DataType, optional ...QueueDequeueV2Attr) (components []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"component_types": component_types}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "QueueDequeueV2",
-		Input: []tf.Input{
-			handle,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("QueueDequeueV2", err)
-		return
-	}
-	return components
-}
-
-// Returns locations of nonzero / true values in a tensor.
-//
-// This operation returns the coordinates of true elements in `condition`. The
-// coordinates are returned in a 2-D tensor where the first dimension (rows)
-// represents the number of true elements, and the second dimension (columns)
-// represents the coordinates of the true elements. Keep in mind, the shape of
-// the output tensor can vary depending on how many true values there are in
-// `condition`. Indices are output in row-major order.
-//
-// For example:
-//
-// ```
-// # 'input' tensor is [[True, False]
-// #                    [True, False]]
-// # 'input' has two true values, so output has two coordinates.
-// # 'input' has rank of 2, so coordinates have two indices.
-// where(input) ==> [[0, 0],
-//                   [1, 0]]
-//
-// # `condition` tensor is [[[True, False]
-// #                     [True, False]]
-// #                    [[False, True]
-// #                     [False, True]]
-// #                    [[False, False]
-// #                     [False, True]]]
-// # 'input' has 5 true values, so output has 5 coordinates.
-// # 'input' has rank of 3, so coordinates have three indices.
-// where(input) ==> [[0, 0, 0],
-//                   [0, 1, 0],
-//                   [1, 0, 1],
-//                   [1, 1, 1],
-//                   [2, 1, 1]]
-//
-// # `condition` tensor is [[[1.5,  0.0]
-// #                     [-0.5, 0.0]]
-// #                    [[0.0,  0.25]
-// #                     [0.0,  0.75]]
-// #                    [[0.0,  0.0]
-// #                     [0.0,  0.01]]]
-// # 'input' has 5 nonzero values, so output has 5 coordinates.
-// # 'input' has rank of 3, so coordinates have three indices.
-// where(input) ==> [[0, 0, 0],
-//                   [0, 1, 0],
-//                   [1, 0, 1],
-//                   [1, 1, 1],
-//                   [2, 1, 1]]
-//
-// # `condition` tensor is [[[1.5 + 0.0j, 0.0  + 0.0j]
-// #                     [0.0 + 0.5j, 0.0  + 0.0j]]
-// #                    [[0.0 + 0.0j, 0.25 + 1.5j]
-// #                     [0.0 + 0.0j, 0.75 + 0.0j]]
-// #                    [[0.0 + 0.0j, 0.0  + 0.0j]
-// #                     [0.0 + 0.0j, 0.01 + 0.0j]]]
-// # 'input' has 5 nonzero magnitude values, so output has 5 coordinates.
-// # 'input' has rank of 3, so coordinates have three indices.
-// where(input) ==> [[0, 0, 0],
-//                   [0, 1, 0],
-//                   [1, 0, 1],
-//                   [1, 1, 1],
-//                   [2, 1, 1]]
-// ```
-func Where(scope *Scope, condition tf.Output) (index tf.Output) {
+
+// Eagerly executes a python function to compute func(input)->output. The
+//
+// semantics of the input, output, and attributes are the same as those for
+// PyFunc.
+func EagerPyFunc(scope *Scope, input []tf.Output, token string, Tout []tf.DataType) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"token": token, "Tout": Tout}
 	opspec := tf.OpSpec{
-		Type: "Where",
+		Type: "EagerPyFunc",
 		Input: []tf.Input{
-			condition,
+			tf.OutputList(input),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// QuantizeAndDequantizeAttr is an optional argument to QuantizeAndDequantize.
-type QuantizeAndDequantizeAttr func(optionalAttr)
-
-// QuantizeAndDequantizeSignedInput sets the optional signed_input attribute to value.
-// If not specified, defaults to true
-func QuantizeAndDequantizeSignedInput(value bool) QuantizeAndDequantizeAttr {
-	return func(m optionalAttr) {
-		m["signed_input"] = value
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// QuantizeAndDequantizeNumBits sets the optional num_bits attribute to value.
-// If not specified, defaults to 8
-func QuantizeAndDequantizeNumBits(value int64) QuantizeAndDequantizeAttr {
-	return func(m optionalAttr) {
-		m["num_bits"] = value
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("EagerPyFunc", err)
+		return
 	}
+	return output
 }
 
-// QuantizeAndDequantizeRangeGiven sets the optional range_given attribute to value.
-// If not specified, defaults to false
-func QuantizeAndDequantizeRangeGiven(value bool) QuantizeAndDequantizeAttr {
-	return func(m optionalAttr) {
-		m["range_given"] = value
+// Stops gradient computation.
+//
+// When executed in a graph, this op outputs its input tensor as-is.
+//
+// When building ops to compute gradients, this op prevents the contribution of
+// its inputs to be taken into account.  Normally, the gradient generator adds ops
+// to a graph to compute the derivatives of a specified 'loss' by recursively
+// finding out inputs that contributed to its computation.  If you insert this op
+// in the graph it inputs are masked from the gradient generator.  They are not
+// taken into account for computing gradients.
+//
+// This is useful any time you want to compute a value with TensorFlow but need
+// to pretend that the value was a constant. Some examples include:
+//
+// *  The *EM* algorithm where the *M-step* should not involve backpropagation
+//    through the output of the *E-step*.
+// *  Contrastive divergence training of Boltzmann machines where, when
+//    differentiating the energy function, the training must not backpropagate
+//    through the graph that generated the samples from the model.
+// *  Adversarial training, where no backprop should happen through the adversarial
+//    example generation process.
+func StopGradient(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "StopGradient",
+		Input: []tf.Input{
+			input,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// QuantizeAndDequantizeInputMin sets the optional input_min attribute to value.
-// If not specified, defaults to 0
-func QuantizeAndDequantizeInputMin(value float32) QuantizeAndDequantizeAttr {
-	return func(m optionalAttr) {
-		m["input_min"] = value
+// Computes asin of x element-wise.
+func Asin(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Asin",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// QuantizeAndDequantizeInputMax sets the optional input_max attribute to value.
-// If not specified, defaults to 0
-func QuantizeAndDequantizeInputMax(value float32) QuantizeAndDequantizeAttr {
+// PreventGradientAttr is an optional argument to PreventGradient.
+type PreventGradientAttr func(optionalAttr)
+
+// PreventGradientMessage sets the optional message attribute to value.
+//
+// value: Will be printed in the error when anyone tries to differentiate
+// this operation.
+// If not specified, defaults to ""
+func PreventGradientMessage(value string) PreventGradientAttr {
 	return func(m optionalAttr) {
-		m["input_max"] = value
+		m["message"] = value
 	}
 }
 
-// Use QuantizeAndDequantizeV2 instead.
+// An identity op that triggers an error if a gradient is requested.
 //
-// DEPRECATED at GraphDef version 22: Replaced by QuantizeAndDequantizeV2
-func QuantizeAndDequantize(scope *Scope, input tf.Output, optional ...QuantizeAndDequantizeAttr) (output tf.Output) {
+// When executed in a graph, this op outputs its input tensor as-is.
+//
+// When building ops to compute gradients, the TensorFlow gradient system
+// will return an error when trying to lookup the gradient of this op,
+// because no gradient must ever be registered for this function.  This
+// op exists to prevent subtle bugs from silently returning unimplemented
+// gradients in some corner cases.
+//
+// Arguments:
+//	input: any tensor.
+//
+// Returns the same input tensor.
+func PreventGradient(scope *Scope, input tf.Output, optional ...PreventGradientAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -28150,7 +27780,7 @@ func QuantizeAndDequantize(scope *Scope, input tf.Output, optional ...QuantizeAn
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizeAndDequantize",
+		Type: "PreventGradient",
 		Input: []tf.Input{
 			input,
 		},
@@ -28160,109 +27790,86 @@ func QuantizeAndDequantize(scope *Scope, input tf.Output, optional ...QuantizeAn
 	return op.Output(0)
 }
 
-// Returns the diagonal part of the tensor.
-//
-// This operation returns a tensor with the `diagonal` part
-// of the `input`. The `diagonal` part is computed as follows:
-//
-// Assume `input` has dimensions `[D1,..., Dk, D1,..., Dk]`, then the output is a
-// tensor of rank `k` with dimensions `[D1,..., Dk]` where:
-//
-// `diagonal[i1,..., ik] = input[i1, ..., ik, i1,..., ik]`.
-//
-// For example:
-//
-// ```
-// # 'input' is [[1, 0, 0, 0]
-//               [0, 2, 0, 0]
-//               [0, 0, 3, 0]
-//               [0, 0, 0, 4]]
+// Checks a tensor for NaN and Inf values.
 //
-// tf.diag_part(input) ==> [1, 2, 3, 4]
-// ```
+// When run, reports an `InvalidArgument` error if `tensor` has any values
+// that are not a number (NaN) or infinity (Inf). Otherwise, passes `tensor` as-is.
 //
 // Arguments:
-//	input: Rank k tensor where k is even and not zero.
 //
-// Returns The extracted diagonal.
-func DiagPart(scope *Scope, input tf.Output) (diagonal tf.Output) {
+//	message: Prefix of the error message.
+func CheckNumerics(scope *Scope, tensor tf.Output, message string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"message": message}
 	opspec := tf.OpSpec{
-		Type: "DiagPart",
+		Type: "CheckNumerics",
 		Input: []tf.Input{
-			input,
+			tensor,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizedInstanceNormAttr is an optional argument to QuantizedInstanceNorm.
-type QuantizedInstanceNormAttr func(optionalAttr)
-
-// QuantizedInstanceNormOutputRangeGiven sets the optional output_range_given attribute to value.
+// Shuffle dimensions of x according to a permutation and conjugate the result.
 //
-// value: If True, `given_y_min` and `given_y_min`
-// and `given_y_max` are used as the output range. Otherwise,
-// the implementation computes the output range.
-// If not specified, defaults to false
-func QuantizedInstanceNormOutputRangeGiven(value bool) QuantizedInstanceNormAttr {
-	return func(m optionalAttr) {
-		m["output_range_given"] = value
+// The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
+//   `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
+//   `y[i,j,k,...,s,t,u] == conj(x[perm[i], perm[j], perm[k],...,perm[s], perm[t], perm[u]])`
+func ConjugateTranspose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// QuantizedInstanceNormGivenYMin sets the optional given_y_min attribute to value.
-//
-// value: Output in `y_min` if `output_range_given` is True.
-// If not specified, defaults to 0
-func QuantizedInstanceNormGivenYMin(value float32) QuantizedInstanceNormAttr {
-	return func(m optionalAttr) {
-		m["given_y_min"] = value
+	opspec := tf.OpSpec{
+		Type: "ConjugateTranspose",
+		Input: []tf.Input{
+			x, perm,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// QuantizedInstanceNormGivenYMax sets the optional given_y_max attribute to value.
-//
-// value: Output in `y_max` if `output_range_given` is True.
-// If not specified, defaults to 0
-func QuantizedInstanceNormGivenYMax(value float32) QuantizedInstanceNormAttr {
-	return func(m optionalAttr) {
-		m["given_y_max"] = value
-	}
-}
+// UniqueV2Attr is an optional argument to UniqueV2.
+type UniqueV2Attr func(optionalAttr)
 
-// QuantizedInstanceNormVarianceEpsilon sets the optional variance_epsilon attribute to value.
-//
-// value: A small float number to avoid dividing by 0.
-// If not specified, defaults to 1e-05
-func QuantizedInstanceNormVarianceEpsilon(value float32) QuantizedInstanceNormAttr {
+// UniqueV2OutIdx sets the optional out_idx attribute to value.
+// If not specified, defaults to DT_INT32
+func UniqueV2OutIdx(value tf.DataType) UniqueV2Attr {
 	return func(m optionalAttr) {
-		m["variance_epsilon"] = value
+		m["out_idx"] = value
 	}
 }
 
-// QuantizedInstanceNormMinSeparation sets the optional min_separation attribute to value.
+// Finds unique elements in a 1-D tensor.
 //
-// value: Minimum value of `y_max - y_min`
-// If not specified, defaults to 0.001
-func QuantizedInstanceNormMinSeparation(value float32) QuantizedInstanceNormAttr {
-	return func(m optionalAttr) {
-		m["min_separation"] = value
-	}
-}
-
-// Quantized Instance normalization.
+// This operation returns a tensor `y` containing all of the unique elements of `x`
+// sorted in the same order that they occur in `x`. This operation also returns a
+// tensor `idx` the same size as `x` that contains the index of each value of `x`
+// in the unique output `y`. In other words:
+//
+// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
+//
+// For example:
+//
+// ```
+// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
+// y, idx = unique(x)
+// y ==> [1, 2, 4, 7, 8]
+// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
+// ```
 //
 // Arguments:
-//	x: A 4D input Tensor.
-//	x_min: The value represented by the lowest quantized input.
-//	x_max: The value represented by the highest quantized input.
+//	x: A `Tensor`.
+//	axis: A `Tensor` of type `int64` (default: 0). The axis of the Tensor to
+// find the unique elements.
 //
-// Returns A 4D Tensor.The value represented by the lowest quantized output.The value represented by the highest quantized output.
-func QuantizedInstanceNorm(scope *Scope, x tf.Output, x_min tf.Output, x_max tf.Output, optional ...QuantizedInstanceNormAttr) (y tf.Output, y_min tf.Output, y_max tf.Output) {
+// Returns A `Tensor`. Unique elements along the `axis` of `Tensor` x.A 1-D Tensor. Has the same type as x that contains the index of each
+// value of x in the output y.
+func UniqueV2(scope *Scope, x tf.Output, axis tf.Output, optional ...UniqueV2Attr) (y tf.Output, idx tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -28271,12 +27878,12 @@ func QuantizedInstanceNorm(scope *Scope, x tf.Output, x_min tf.Output, x_max tf.
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedInstanceNorm",
+		Type: "UniqueV2",
 		Input: []tf.Input{
-			x, x_min, x_max,
+			x, axis,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0), op.Output(1)
 }
diff --git a/tensorflow/java/BUILD b/tensorflow/java/BUILD
index 9dee1aa72bf0d76ee35931f1e852bfd22556a540..1be4c838f3526bcdf32d8bda5a1ada776a8c1b21 100644
--- a/tensorflow/java/BUILD
+++ b/tensorflow/java/BUILD
@@ -311,9 +311,11 @@ tf_cc_test(
     srcs = [
         "src/gen/cc/source_writer_test.cc",
     ],
+    data = [
+        "src/gen/resources/test.snippet.java",
+    ],
     deps = [
         ":java_op_gen_lib",
-        "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
     ],
@@ -385,15 +387,3 @@ genrule(
     cmd = "cp $< $@",
     output_to_bindir = 1,
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/java/maven/libtensorflow/pom.xml b/tensorflow/java/maven/libtensorflow/pom.xml
index 7f3a83b195308b3261049c2845d018280dd61601..c99d04869a714c95e78db6f14caab515a175cb38 100644
--- a/tensorflow/java/maven/libtensorflow/pom.xml
+++ b/tensorflow/java/maven/libtensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.7.0-rc0</version>
+    <version>1.7.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow</artifactId>
diff --git a/tensorflow/java/maven/libtensorflow_jni/pom.xml b/tensorflow/java/maven/libtensorflow_jni/pom.xml
index cc436ff8409a5a17d69dbe0fa0ce9c6333349987..4561c2c8ade2826f779ff20c2ae1702fc97fa797 100644
--- a/tensorflow/java/maven/libtensorflow_jni/pom.xml
+++ b/tensorflow/java/maven/libtensorflow_jni/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.7.0-rc0</version>
+    <version>1.7.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow_jni</artifactId>
diff --git a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
index 47f678382aad910559395cd0b304321c30c92ede..82a2b8e7694d15b7de921c1005ece30236d755ee 100644
--- a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
+++ b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.7.0-rc0</version>
+    <version>1.7.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow_jni_gpu</artifactId>
diff --git a/tensorflow/java/maven/pom.xml b/tensorflow/java/maven/pom.xml
index 42d32810a263df7ba1044e1ff3e7c4e8124d478e..4c1ec0cc8032009e7b206537dd15f1fedece4855 100644
--- a/tensorflow/java/maven/pom.xml
+++ b/tensorflow/java/maven/pom.xml
@@ -6,7 +6,7 @@
   <modelVersion>4.0.0</modelVersion>
   <groupId>org.tensorflow</groupId>
   <artifactId>parentpom</artifactId>
-  <version>1.7.0-rc0</version>
+  <version>1.7.0</version>
   <packaging>pom</packaging>
 
   <url>https://www.tensorflow.org</url>
diff --git a/tensorflow/java/maven/proto/pom.xml b/tensorflow/java/maven/proto/pom.xml
index 463893ce62dc7e72f4cd1c909ae56d097d060035..fcd8236bad315319cb1b8b57cf6ec5eb5f277705 100644
--- a/tensorflow/java/maven/proto/pom.xml
+++ b/tensorflow/java/maven/proto/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.7.0-rc0</version>
+    <version>1.7.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>proto</artifactId>
diff --git a/tensorflow/java/maven/tensorflow/pom.xml b/tensorflow/java/maven/tensorflow/pom.xml
index 60e7f3c199f1451a054f6a779a63a98ff428bd87..241581713ad9b129d8df4a297e9e4a3e712117fc 100644
--- a/tensorflow/java/maven/tensorflow/pom.xml
+++ b/tensorflow/java/maven/tensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.7.0-rc0</version>
+    <version>1.7.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>tensorflow</artifactId>
diff --git a/tensorflow/java/src/gen/cc/java_defs.h b/tensorflow/java/src/gen/cc/java_defs.h
index 615cdc165b36abdc3cf5e717ddb8b385367c067f..59f8beaee78a2f40f6743ca10f72435e757db090 100644
--- a/tensorflow/java/src/gen/cc/java_defs.h
+++ b/tensorflow/java/src/gen/cc/java_defs.h
@@ -17,10 +17,7 @@ limitations under the License.
 #define TENSORFLOW_JAVA_SRC_GEN_CC_JAVA_DEFS_H_
 
 #include <string>
-#include <vector>
-#include <deque>
-
-#include "tensorflow/core/platform/env.h"
+#include <list>
 
 namespace tensorflow {
 namespace java {
@@ -104,17 +101,17 @@ class Type {
     description_ = description;
     return *this;
   }
-  const std::vector<Type>& parameters() const { return parameters_; }
+  const std::list<Type>& parameters() const { return parameters_; }
   Type& add_parameter(const Type& parameter) {
     parameters_.push_back(parameter);
     return *this;
   }
-  const std::vector<Annotation>& annotations() const { return annotations_; }
+  const std::list<Annotation>& annotations() const { return annotations_; }
   Type& add_annotation(const Annotation& annotation) {
     annotations_.push_back(annotation);
     return *this;
   }
-  const std::deque<Type>& supertypes() const { return supertypes_; }
+  const std::list<Type>& supertypes() const { return supertypes_; }
   Type& add_supertype(const Type& type) {
     if (type.kind_ == CLASS) {
       supertypes_.push_front(type);  // keep superclass at the front of the list
@@ -141,9 +138,9 @@ class Type {
   string name_;
   string package_;
   string description_;
-  std::vector<Type> parameters_;
-  std::vector<Annotation> annotations_;
-  std::deque<Type> supertypes_;
+  std::list<Type> parameters_;
+  std::list<Annotation> annotations_;
+  std::list<Type> supertypes_;
 };
 
 // Definition of a Java annotation
@@ -223,16 +220,12 @@ class Method {
     return_description_ = description;
     return *this;
   }
-  const std::vector<Variable>& arguments() const { return arguments_; }
-  Method& add_arguments(const std::vector<Variable>& args) {
-    arguments_.insert(arguments_.cend(), args.cbegin(), args.cend());
-    return *this;
-  }
+  const std::list<Variable>& arguments() const { return arguments_; }
   Method& add_argument(const Variable& var) {
     arguments_.push_back(var);
     return *this;
   }
-  const std::vector<Annotation>& annotations() const { return annotations_; }
+  const std::list<Annotation>& annotations() const { return annotations_; }
   Method& add_annotation(const Annotation& annotation) {
     annotations_.push_back(annotation);
     return *this;
@@ -244,29 +237,13 @@ class Method {
   bool constructor_;
   string description_;
   string return_description_;
-  std::vector<Variable> arguments_;
-  std::vector<Annotation> annotations_;
+  std::list<Variable> arguments_;
+  std::list<Annotation> annotations_;
 
   Method(const string& name, const Type& return_type, bool constructor)
     : name_(name), return_type_(return_type), constructor_(constructor) {}
 };
 
-// A piece of code to read from a file.
-class Snippet {
- public:
-  static Snippet Create(const string& fname, Env* env = Env::Default()) {
-    return Snippet(fname, env);
-  }
-  const string& data() const { return data_; }
-
- private:
-  string data_;
-
-  Snippet(const string& fname, Env* env) {
-    TF_CHECK_OK(ReadFileToString(env, fname, &data_));
-  }
-};
-
 }  // namespace java
 }  // namespace tensorflow
 
diff --git a/tensorflow/java/src/gen/cc/source_writer.cc b/tensorflow/java/src/gen/cc/source_writer.cc
index 2da81f2911e60be6a47ac13fe8be6142fa283780..214999af9a6f9ee244d336a64830238e6b7ea872 100644
--- a/tensorflow/java/src/gen/cc/source_writer.cc
+++ b/tensorflow/java/src/gen/cc/source_writer.cc
@@ -14,49 +14,318 @@ limitations under the License.
 ==============================================================================*/
 
 #include <string>
+#include <algorithm>
+#include <deque>
 
 #include "tensorflow/java/src/gen/cc/source_writer.h"
 
 namespace tensorflow {
+namespace java {
 
-SourceWriter& SourceWriter::Append(const StringPiece& str) {
-  if (!str.empty()) {
-    if (newline_) {
-      DoAppend(left_margin_ + line_prefix_);
-      newline_ = false;
-    }
-    DoAppend(str);
-  }
+SourceWriter::SourceWriter() {
+  // push an empty generic namespace at start, for simplification
+  generic_namespaces_.push(new GenericNamespace());
+}
+
+SourceWriter& SourceWriter::Indent(int tab) {
+  left_margin_.resize(
+      std::max(static_cast<int>(left_margin_.size() + tab), 0), ' ');
+  return *this;
+}
+
+SourceWriter& SourceWriter::Prefix(const char* line_prefix) {
+  line_prefix_ = line_prefix;
   return *this;
 }
 
-SourceWriter& SourceWriter::Write(const string& str) {
+SourceWriter& SourceWriter::Write(const StringPiece& str) {
   size_t line_pos = 0;
   do {
     size_t start_pos = line_pos;
     line_pos = str.find('\n', start_pos);
     if (line_pos != string::npos) {
       ++line_pos;
-      Append(StringPiece(str.data() + start_pos, line_pos - start_pos));
+      Append(str.substr(start_pos, line_pos - start_pos));
       newline_ = true;
     } else {
-      Append(StringPiece(str.data() + start_pos, str.size() - start_pos));
+      Append(str.substr(start_pos, str.size() - start_pos));
     }
   } while (line_pos != string::npos && line_pos < str.size());
 
   return *this;
 }
 
+SourceWriter& SourceWriter::WriteFromFile(const string& fname, Env* env) {
+  string data_;
+  TF_CHECK_OK(ReadFileToString(env, fname, &data_));
+  return Write(data_);
+}
+
+SourceWriter& SourceWriter::Append(const StringPiece& str) {
+  if (!str.empty()) {
+    if (newline_) {
+      DoAppend(left_margin_ + line_prefix_);
+      newline_ = false;
+    }
+    DoAppend(str);
+  }
+  return *this;
+}
+
+SourceWriter& SourceWriter::AppendType(const Type& type) {
+  if (type.kind() == Type::Kind::GENERIC && type.name().empty()) {
+    Append("?");
+  } else {
+    Append(type.name());
+  }
+  if (!type.parameters().empty()) {
+    Append("<");
+    for (const Type& t : type.parameters()) {
+      if (&t != &type.parameters().front()) {
+        Append(", ");
+      }
+      AppendType(t);
+    }
+    Append(">");
+  }
+  return *this;
+}
+
 SourceWriter& SourceWriter::EndLine() {
   Append("\n");
   newline_ = true;
   return *this;
 }
 
-SourceWriter& SourceWriter::Indent(int tab) {
-  left_margin_.resize(std::max(static_cast<int>(left_margin_.size() + tab), 0),
-                      ' ');
+SourceWriter& SourceWriter::BeginMethod(const Method& method, int modifiers) {
+  GenericNamespace* generic_namespace = PushGenericNamespace(modifiers);
+  if (!method.constructor()) {
+    generic_namespace->Visit(method.return_type());
+  }
+  for (const Variable& v : method.arguments()) {
+    generic_namespace->Visit(v.type());
+  }
+  EndLine();
+  WriteDoc(method.description(), method.return_description(),
+      &method.arguments());
+  if (!method.annotations().empty()) {
+    WriteAnnotations(method.annotations());
+  }
+  WriteModifiers(modifiers);
+  if (!generic_namespace->declared_types().empty()) {
+    WriteGenerics(generic_namespace->declared_types());
+    Append(" ");
+  }
+  if (!method.constructor()) {
+    AppendType(method.return_type()).Append(" ");
+  }
+  Append(method.name()).Append("(");
+  for (const Variable& v : method.arguments()) {
+    if (&v != &method.arguments().front()) {
+      Append(", ");
+    }
+    AppendType(v.type()).Append(v.variadic() ? "... " : " ").Append(v.name());
+  }
+  return Append(")").BeginBlock();
+}
+
+SourceWriter& SourceWriter::EndMethod() {
+  EndBlock();
+  PopGenericNamespace();
   return *this;
 }
 
+SourceWriter& SourceWriter::BeginType(const Type& type,
+    const std::list<Type>* dependencies, int modifiers) {
+  if (!type.package().empty()) {
+    Append("package ").Append(type.package()).Append(";").EndLine();
+  }
+  if (dependencies != nullptr && !dependencies->empty()) {
+    TypeImporter type_importer(type.package());
+    for (const Type& t : *dependencies) {
+      type_importer.Visit(t);
+    }
+    EndLine();
+    for (const string& s : type_importer.imports()) {
+      Append("import ").Append(s).Append(";").EndLine();
+    }
+  }
+  return BeginInnerType(type, modifiers);
+}
+
+SourceWriter& SourceWriter::BeginInnerType(const Type& type, int modifiers) {
+  GenericNamespace* generic_namespace = PushGenericNamespace(modifiers);
+  generic_namespace->Visit(type);
+  EndLine();
+  WriteDoc(type.description());
+  if (!type.annotations().empty()) {
+    WriteAnnotations(type.annotations());
+  }
+  WriteModifiers(modifiers);
+  CHECK_EQ(Type::Kind::CLASS, type.kind()) << ": Not supported yet";
+  Append("class ").Append(type.name());
+  if (!generic_namespace->declared_types().empty()) {
+    WriteGenerics(generic_namespace->declared_types());
+  }
+  if (!type.supertypes().empty()) {
+    bool first_interface = true;
+    for (const Type& t : type.supertypes()) {
+      if (t.kind() == Type::CLASS) {  // superclass is always first in list
+        Append(" extends ");
+      } else if (first_interface) {
+        Append(" implements ");
+        first_interface = false;
+      } else {
+        Append(", ");
+      }
+      AppendType(t);
+    }
+  }
+  return BeginBlock();
+}
+
+SourceWriter& SourceWriter::EndType() {
+  EndBlock();
+  PopGenericNamespace();
+  return *this;
+}
+
+SourceWriter& SourceWriter::WriteFields(const std::list<Variable>& fields,
+    int modifiers) {
+  EndLine();
+  for (const Variable& v : fields) {
+    WriteModifiers(modifiers);
+    AppendType(v.type()).Append(" ").Append(v.name()).Append(";");
+    EndLine();
+  }
+  return *this;
+}
+
+SourceWriter& SourceWriter::WriteModifiers(int modifiers) {
+  if (modifiers & PUBLIC) {
+    Append("public ");
+  } else if (modifiers & PROTECTED) {
+    Append("protected ");
+  } else if (modifiers & PRIVATE) {
+    Append("private ");
+  }
+  if (modifiers & STATIC) {
+    Append("static ");
+  }
+  if (modifiers & FINAL) {
+    Append("final ");
+  }
+  return *this;
+}
+
+SourceWriter& SourceWriter::WriteDoc(const string& description,
+    const string& return_description, const std::list<Variable>* parameters) {
+  if (description.empty() && return_description.empty()
+      && (parameters == nullptr || parameters->empty())) {
+    return *this;  // no doc to write
+  }
+  bool do_line_break = false;
+  Append("/**").EndLine().Prefix(" * ");
+  if (!description.empty()) {
+    Write(description).EndLine();
+    do_line_break = true;
+  }
+  if (parameters != nullptr && !parameters->empty()) {
+    if (do_line_break) {
+      EndLine();
+      do_line_break = false;
+    }
+    for (const Variable& v : *parameters) {
+      Append("@param ").Append(v.name());
+      if (!v.description().empty()) {
+        Append(" ").Write(v.description());
+      }
+      EndLine();
+    }
+  }
+  if (!return_description.empty()) {
+    if (do_line_break) {
+      EndLine();
+      do_line_break = false;
+    }
+    Append("@return ").Write(return_description).EndLine();
+  }
+  return Prefix("").Append(" **/").EndLine();
+}
+
+SourceWriter& SourceWriter::WriteAnnotations(
+    const std::list<Annotation>& annotations) {
+  for (const Annotation& a : annotations) {
+    Append("@" + a.name());
+    if (!a.attributes().empty()) {
+      Append("(").Append(a.attributes()).Append(")");
+    }
+    EndLine();
+  }
+  return *this;
+}
+
+SourceWriter& SourceWriter::WriteGenerics(
+    const std::list<const Type*>& generics) {
+  Append("<");
+  for (const Type* pt : generics) {
+    if (pt != generics.front()) {
+      Append(", ");
+    }
+    Append(pt->name());
+    if (!pt->supertypes().empty()) {
+      Append(" extends ").AppendType(pt->supertypes().front());
+    }
+  }
+  return Append(">");
+}
+
+SourceWriter::GenericNamespace* SourceWriter::PushGenericNamespace(
+    int modifiers) {
+  GenericNamespace* generic_namespace;
+  if (modifiers & STATIC) {
+    generic_namespace = new GenericNamespace();
+  } else {
+    generic_namespace = new GenericNamespace(generic_namespaces_.top());
+  }
+  generic_namespaces_.push(generic_namespace);
+  return generic_namespace;
+}
+
+void SourceWriter::PopGenericNamespace() {
+  GenericNamespace* generic_namespace = generic_namespaces_.top();
+  generic_namespaces_.pop();
+  delete generic_namespace;
+}
+
+void SourceWriter::TypeVisitor::Visit(const Type& type) {
+  DoVisit(type);
+  for (const Type& t : type.parameters()) {
+    DoVisit(t);
+  }
+  for (const Annotation& t : type.annotations()) {
+    DoVisit(t);
+  }
+  for (const Type& t : type.supertypes()) {
+    DoVisit(t);
+  }
+}
+
+void SourceWriter::GenericNamespace::DoVisit(const Type& type) {
+  // ignore non-generic parameters, wildcards and generics already declared
+  if (type.kind() == Type::GENERIC
+      && !type.IsWildcard()
+      && generic_names_.find(type.name()) == generic_names_.end()) {
+    declared_types_.push_back(&type);
+    generic_names_.insert(type.name());
+  }
+}
+
+void SourceWriter::TypeImporter::DoVisit(const Type& type) {
+  if (!type.package().empty() && type.package() != current_package_) {
+    imports_.insert(type.package() + '.' + type.name());
+  }
+}
+
+}  // namespace java
 }  // namespace tensorflow
diff --git a/tensorflow/java/src/gen/cc/source_writer.h b/tensorflow/java/src/gen/cc/source_writer.h
index bff26eb185db0cf933632f33f916b87d8a757edd..6abe13b5d217b30d826d013e14a590eeb91719fb 100644
--- a/tensorflow/java/src/gen/cc/source_writer.h
+++ b/tensorflow/java/src/gen/cc/source_writer.h
@@ -17,45 +17,23 @@ limitations under the License.
 #define TENSORFLOW_JAVA_SRC_GEN_CC_SOURCE_WRITER_H_
 
 #include <string>
+#include <stack>
+#include <list>
+#include <set>
 
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/java/src/gen/cc/java_defs.h"
 
 namespace tensorflow {
+namespace java {
 
-// A utility class for writing source code, normally generated at
-// compile-time.
-//
-// Source writers are language-agnostic and therefore only expose generic
-// methods common to most languages. Extend or wrap this class to implement
-// language-specific features.
-//
-// Note: if you are looking to reuse this class for generating code in another
-// language than Java, please do by moving it at the '//tensorflow/core/lib/io'
-// level.
+// A class for writing Java source code.
 class SourceWriter {
  public:
+  SourceWriter();
   virtual ~SourceWriter() = default;
 
-  // Returns true if the writer is at the beginnig of a new line
-  bool newline() const { return newline_; }
-
-  // Appends a piece of code or text.
-  //
-  // It is expected that no newline character is present in the data provided,
-  // otherwise Write() must be used.
-  SourceWriter& Append(const StringPiece& str);
-
-  // Writes a block of code or text.
-  //
-  // The data might potentially contain newline characters, therefore it will
-  // be scanned to ensure that each line is indented and prefixed properly,
-  // making it a bit slower than Append().
-  SourceWriter& Write(const string& text);
-
-  // Appends a newline character and start writing on a new line.
-  SourceWriter& EndLine();
-
   // Indents following lines with white spaces.
   //
   // Indentation is cumulative, i.e. the provided tabulation is added to the
@@ -75,18 +53,166 @@ class SourceWriter {
   // Indent(2)->Prefix("//") will result in prefixing lines with "  //".
   //
   // An empty value ("") will remove any line prefix that was previously set.
-  SourceWriter& Prefix(const char* line_prefix) {
-    line_prefix_ = line_prefix;
-    return *this;
+  SourceWriter& Prefix(const char* line_prefix);
+
+  // Writes a source code snippet.
+  //
+  // The data might potentially contain newline characters, therefore it will
+  // be scanned to ensure that each line is indented and prefixed properly,
+  // making it a bit slower than Append().
+  SourceWriter& Write(const StringPiece& text);
+
+  // Writes a source code snippet read from a file.
+  //
+  // All lines of the file at the provided path will be read and written back
+  // to the output of this writer in regard of its current attributes (e.g.
+  // the indentation, prefix, etc.)
+  SourceWriter& WriteFromFile(const string& fname, Env* env = Env::Default());
+
+  // Appends a piece of source code.
+  //
+  // It is expected that no newline character is present in the data provided,
+  // otherwise Write() must be used.
+  SourceWriter& Append(const StringPiece& str);
+
+  // Appends a type to the current line.
+  //
+  // The type is written in its simple form (i.e. not prefixed by its package)
+  // and followed by any parameter types it has enclosed in brackets (<>).
+  SourceWriter& AppendType(const Type& type);
+
+  // Appends a newline character.
+  //
+  // Data written after calling this method will start on a new line, in respect
+  // of the current indentation.
+  SourceWriter& EndLine();
+
+  // Begins a block of source code.
+  //
+  // This method appends a new opening brace to the current data and indent the
+  // next lines according to Google Java Style Guide. The block can optionally
+  // be preceded by an expression (e.g. Append("if(true)").BeginBlock();)
+  SourceWriter& BeginBlock() {
+    return Append(newline_ ? "{" : " {").EndLine().Indent(2);
+  }
+
+  // Ends the current block of source code.
+  //
+  // This method appends a new closing brace to the current data and outdent the
+  // next lines back to the margin used before BeginBlock() was invoked.
+  SourceWriter& EndBlock() {
+    return Indent(-2).Append("}").EndLine();
   }
 
+  // Begins to write a method.
+  //
+  // This method outputs the signature of the Java method from the data passed
+  // in the 'method' parameter and starts a new block. Additionnal modifiers can
+  // also be passed in parameter to define the accesses and the scope of this
+  // method.
+  SourceWriter& BeginMethod(const Method& method, int modifiers = 0);
+
+  // Ends the current method.
+  //
+  // This method ends the block of code that has begun when invoking
+  // BeginMethod() prior to this.
+  SourceWriter& EndMethod();
+
+  // Begins to write the main type of a source file.
+  //
+  // This method outputs the declaration of the Java type from the data passed
+  // in the 'type' parameter and starts a new block. Additionnal modifiers can
+  // also be passed in parameter to define the accesses and the scope of this
+  // type.
+  //
+  // If not null, all types found in the 'dependencies' list will be imported
+  // before declaring the new type.
+  SourceWriter& BeginType(const Type& clazz,
+      const std::list<Type>* dependencies, int modifiers = 0);
+
+  // Begins to write a new inner type.
+  //
+  // This method outputs the declaration of the Java type from the data passed
+  // in the 'type' parameter and starts a new block. Additionnal modifiers can
+  // also be passed in parameter to define the accesses and the scope of this
+  // type.
+  SourceWriter& BeginInnerType(const Type& type, int modifiers = 0);
+
+  // Ends the current type.
+  //
+  // This method ends the block of code that has begun when invoking
+  // BeginType() or BeginInnerType() prior to this.
+  SourceWriter& EndType();
+
+  // Writes a list of variables as fields of a type.
+  //
+  // This method must be called within the definition of a type (see BeginType()
+  // or BeginInnerType()). Additional modifiers can also be passed in parameter
+  // to define the accesses and the scope of those fields.
+  SourceWriter& WriteFields(const std::list<Variable>& fields,
+      int modifiers = 0);
+
  protected:
   virtual void DoAppend(const StringPiece& str) = 0;
 
  private:
+  // A utility base class for visiting elements of a type.
+  class TypeVisitor {
+   public:
+    virtual ~TypeVisitor() = default;
+    void Visit(const Type& type);
+
+   protected:
+    virtual void DoVisit(const Type& type) = 0;
+  };
+
+  // A utility class for keeping track of declared generics in a given scope.
+  class GenericNamespace : public TypeVisitor {
+   public:
+    GenericNamespace() = default;
+    explicit GenericNamespace(const GenericNamespace* parent)
+      : generic_names_(parent->generic_names_) {}
+    std::list<const Type*> declared_types() {
+      return declared_types_;
+    }
+   protected:
+    virtual void DoVisit(const Type& type);
+
+   private:
+    std::list<const Type*> declared_types_;
+    std::set<string> generic_names_;
+  };
+
+  // A utility class for collecting a list of import statements to declare.
+  class TypeImporter : public TypeVisitor {
+   public:
+    explicit TypeImporter(const string& current_package)
+      : current_package_(current_package) {}
+    virtual ~TypeImporter() = default;
+    const std::set<string> imports() {
+      return imports_;
+    }
+   protected:
+    virtual void DoVisit(const Type& type);
+
+   private:
+    string current_package_;
+    std::set<string> imports_;
+  };
+
   string left_margin_;
   string line_prefix_;
   bool newline_ = true;
+  std::stack<GenericNamespace*> generic_namespaces_;
+
+  SourceWriter& WriteModifiers(int modifiers);
+  SourceWriter& WriteDoc(const string& description,
+    const string& return_description = "",
+    const std::list<Variable>* parameters = nullptr);
+  SourceWriter& WriteAnnotations(const std::list<Annotation>& annotations);
+  SourceWriter& WriteGenerics(const std::list<const Type*>& generics);
+  GenericNamespace* PushGenericNamespace(int modifiers);
+  void PopGenericNamespace();
 };
 
 // A writer that outputs source code into a file.
@@ -128,6 +254,7 @@ class SourceBufferWriter : public SourceWriter {
   string* buffer_;
 };
 
+}  // namespace java
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_JAVA_SRC_GEN_CC_SOURCE_WRITER_H_
diff --git a/tensorflow/java/src/gen/cc/source_writer_test.cc b/tensorflow/java/src/gen/cc/source_writer_test.cc
index e9738957548184726395c4e6634ba12a5a9a0109..6926a5a411d070e25f2382c72589d879d3ca2180 100644
--- a/tensorflow/java/src/gen/cc/source_writer_test.cc
+++ b/tensorflow/java/src/gen/cc/source_writer_test.cc
@@ -13,11 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/java/src/gen/cc/source_writer.h"
+#include <list>
+
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/java/src/gen/cc/java_defs.h"
+#include "tensorflow/java/src/gen/cc/source_writer.h"
 
 namespace tensorflow {
+namespace java {
 namespace {
 
 TEST(AppendTest, SingleLineText) {
@@ -211,5 +215,366 @@ TEST(MarginTest, EmptyPrefix) {
   ASSERT_STREQ(expected, writer.str().data());
 }
 
+TEST(StreamTest, BlocksAndLines) {
+  SourceBufferWriter writer;
+
+  writer.Append("int i = 0;").EndLine()
+        .Append("int j = 10;").EndLine()
+        .Append("if (true)")
+        .BeginBlock()
+          .Append("int aLongWayToTen = 0;").EndLine()
+          .Append("while (++i <= j)")
+          .BeginBlock()
+            .Append("++aLongWayToTen;").EndLine()
+          .EndBlock()
+        .EndBlock();
+
+  const char* expected =
+      "int i = 0;\n"
+      "int j = 10;\n"
+      "if (true) {\n"
+      "  int aLongWayToTen = 0;\n"
+      "  while (++i <= j) {\n"
+      "    ++aLongWayToTen;\n"
+      "  }\n"
+      "}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(StreamTest, Types) {
+  SourceBufferWriter writer;
+  Type generic = Type::Generic("T").add_supertype(Type::Class("Number"));
+
+  writer.AppendType(Type::Int()).Append(", ")
+        .AppendType(Type::Class("String")).Append(", ")
+        .AppendType(generic).Append(", ")
+        .AppendType(Type::ListOf(generic)).Append(", ")
+        .AppendType(Type::ListOf(Type::IterableOf(generic))).Append(", ")
+        .AppendType(Type::ListOf(Type::Generic()));
+
+  const char* expected =
+      "int, String, T, List<T>, List<Iterable<T>>, List<?>";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(StreamTest, FileSnippet) {
+  SourceBufferWriter writer;
+  const string& fname = "tensorflow/java/src/gen/resources/test.snippet.java";
+
+  writer.WriteFromFile(fname)
+        .BeginBlock()
+        .WriteFromFile(fname)
+        .EndBlock();
+
+  const char* expected =
+      "// Here is a little snippet\n"
+      "System.out.println(\"Hello!\");\n"
+      "{\n"
+      "  // Here is a little snippet\n"
+      "  System.out.println(\"Hello!\");\n"
+      "}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteType, SimpleClass) {
+  SourceBufferWriter writer;
+  Type clazz = Type::Class("Test", "org.tensorflow");
+
+  writer.BeginType(clazz, nullptr, PUBLIC).EndType();
+
+  const char* expected =
+      "package org.tensorflow;\n\n"
+      "public class Test {\n}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteType, SimpleClassWithDependencies) {
+  SourceBufferWriter writer;
+  Type clazz = Type::Class("Test", "org.tensorflow");
+  std::list<Type> deps;
+  deps.push_back(Type::Class("TypeA", "org.test.sub"));
+  deps.push_back(Type::Class("TypeA", "org.test.sub"));  // a second time
+  deps.push_back(Type::Class("TypeB", "org.other"));
+  deps.push_back(Type::Class("SamePackageType", "org.tensorflow"));
+  deps.push_back(Type::Class("NoPackageType"));
+
+  writer.BeginType(clazz, &deps, PUBLIC).EndType();
+
+  const char* expected =
+      "package org.tensorflow;\n\n"
+      "import org.other.TypeB;\n"
+      "import org.test.sub.TypeA;\n\n"
+      "public class Test {\n}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteType, AnnotatedAndDocumentedClass) {
+  SourceBufferWriter writer;
+  Type clazz = Type::Class("Test", "org.tensorflow");
+  clazz.description("This class has a\n<p>\nmultiline description.");
+  clazz.add_annotation(Annotation::Create("Bean"));
+  clazz.add_annotation(Annotation::Create("SuppressWarnings")
+      .attributes("\"rawtypes\""));
+
+  writer.BeginType(clazz, nullptr, PUBLIC).EndType();
+
+  const char* expected =
+      "package org.tensorflow;\n\n"
+      "/**\n"
+      " * This class has a\n"
+      " * <p>\n"
+      " * multiline description.\n"
+      " **/\n"
+      "@Bean\n"
+      "@SuppressWarnings(\"rawtypes\")\n"
+      "public class Test {\n}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteType, ParameterizedClass) {
+  SourceBufferWriter writer;
+  Type clazz = Type::Class("Test", "org.tensorflow");
+  clazz.add_parameter(Type::Generic("T"));
+  clazz.add_parameter(Type::Generic("U").add_supertype(Type::Class("Number")));
+
+  writer.BeginType(clazz, nullptr, PUBLIC).EndType();
+
+  const char* expected =
+      "package org.tensorflow;\n\n"
+      "public class Test<T, U extends Number> {\n}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteType, ParameterizedClassAndSupertypes) {
+  SourceBufferWriter writer;
+  Type clazz = Type::Class("Test", "org.tensorflow");
+  Type type_t = Type::Generic("T");
+  clazz.add_parameter(type_t);
+  Type type_u = Type::Generic("U").add_supertype(Type::Class("Number"));
+  clazz.add_parameter(type_u);
+  clazz.add_supertype(Type::Interface("Parametrizable").add_parameter(type_u));
+  clazz.add_supertype(Type::Interface("Runnable"));
+  clazz.add_supertype(Type::Class("SuperTest").add_parameter(type_t));
+
+  writer.BeginType(clazz, nullptr, PUBLIC).EndType();
+
+  const char* expected =
+      "package org.tensorflow;\n\n"
+      "public class Test<T, U extends Number>"
+      " extends SuperTest<T> implements Parametrizable<U>, Runnable {\n}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteType, ParameterizedClassFields) {
+  SourceBufferWriter writer;
+  Type clazz = Type::Class("Test", "org.tensorflow");
+  Type type_t = Type::Generic("T").add_supertype(Type::Class("Number"));
+  clazz.add_parameter(type_t);
+  std::list<Variable> static_fields;
+  static_fields.push_back(Variable::Create("field1", Type::Class("String")));
+  std::list<Variable> member_fields;
+  member_fields.push_back(Variable::Create("field2", Type::Class("String")));
+  member_fields.push_back(Variable::Create("field3", type_t));
+
+  writer.BeginType(clazz, nullptr, PUBLIC)
+          .WriteFields(static_fields, STATIC | PUBLIC | FINAL)
+          .WriteFields(member_fields, PRIVATE)
+        .EndType();
+
+  const char* expected =
+      "package org.tensorflow;\n\n"
+      "public class Test<T extends Number> {\n"
+      "  \n"
+      "  public static final String field1;\n"
+      "  \n"
+      "  private String field2;\n"
+      "  private T field3;\n"
+      "}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteType, SimpleInnerClass) {
+  SourceBufferWriter writer;
+  Type clazz = Type::Class("Test", "org.tensorflow");
+  Type inner_class = Type::Class("InnerTest");
+
+  writer.BeginType(clazz, nullptr, PUBLIC)
+          .BeginInnerType(inner_class, PUBLIC)
+          .EndType()
+        .EndType();
+
+  const char* expected =
+      "package org.tensorflow;\n\n"
+      "public class Test {\n"
+      "  \n"
+      "  public class InnerTest {\n"
+      "  }\n"
+      "}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteType, StaticParameterizedInnerClass) {
+  SourceBufferWriter writer;
+  Type clazz = Type::Class("Test", "org.tensorflow");
+  Type type_t = Type::Generic("T").add_supertype(Type::Class("Number"));
+  clazz.add_parameter(type_t);
+  Type inner_class = Type::Class("InnerTest");
+  inner_class.add_parameter(type_t);
+
+  writer.BeginType(clazz, nullptr, PUBLIC)
+          .BeginInnerType(inner_class, PUBLIC | STATIC)
+          .EndType()
+        .EndType();
+
+  const char* expected =
+      "package org.tensorflow;\n\n"
+      "public class Test<T extends Number> {\n"
+      "  \n"
+      "  public static class InnerTest<T extends Number> {\n"
+      "  }\n"
+      "}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteMethod, SimpleMethod) {
+  SourceBufferWriter writer;
+  Type clazz = Type::Class("Test", "org.tensorflow");
+  Method method = Method::Create("doNothing", Type::Void());
+
+  writer.BeginType(clazz, nullptr, PUBLIC)
+          .BeginMethod(method, PUBLIC).EndMethod()
+        .EndType();
+
+  const char* expected =
+      "package org.tensorflow;\n\n"
+      "public class Test {\n"
+      "  \n"
+      "  public void doNothing() {\n"
+      "  }\n"
+      "}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteMethod, AnnotatedAndDocumentedMethod) {
+  SourceBufferWriter writer;
+  Type clazz = Type::Class("Test", "org.tensorflow");
+  Method method = Method::Create("doNothing", Type::Void());
+  method.description("This method has a\n<p>\nmultiline description.");
+  method.add_annotation(Annotation::Create("Override"));
+  method.add_annotation(Annotation::Create("SuppressWarnings")
+      .attributes("\"rawtypes\""));
+
+  writer.BeginType(clazz, nullptr, PUBLIC)
+          .BeginMethod(method, PUBLIC).EndMethod()
+        .EndType();
+
+  const char* expected =
+      "package org.tensorflow;\n\n"
+      "public class Test {\n"
+      "  \n"
+      "  /**\n"
+      "   * This method has a\n"
+      "   * <p>\n"
+      "   * multiline description.\n"
+      "   **/\n"
+      "  @Override\n"
+      "  @SuppressWarnings(\"rawtypes\")\n"
+      "  public void doNothing() {\n"
+      "  }\n"
+      "}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteMethod, DocumentedMethodWithArguments) {
+  SourceBufferWriter writer;
+  Type clazz = Type::Class("Test", "org.tensorflow");
+  Method method = Method::Create("boolToInt", Type::Int());
+  method.description("Converts a boolean to an int");
+  method.return_description("int value for this boolean");
+  method.add_argument(Variable::Create("b", Type::Boolean()));
+  Variable reverse = Variable::Create("reverse", Type::Boolean());
+  reverse.description("if true, value is reversed");
+  method.add_argument(reverse);
+
+  writer.BeginType(clazz, nullptr, PUBLIC)
+          .BeginMethod(method, PUBLIC)
+            .Append("if (b && !reverse)")
+            .BeginBlock()
+              .Append("return 1;").EndLine()
+            .EndBlock()
+          .Append("return 0;").EndLine()
+          .EndMethod()
+        .EndType();
+
+  const char* expected =
+      "package org.tensorflow;\n\n"
+      "public class Test {\n"
+      "  \n"
+      "  /**\n"
+      "   * Converts a boolean to an int\n"
+      "   * \n"
+      "   * @param b\n"
+      "   * @param reverse if true, value is reversed\n"
+      "   * @return int value for this boolean\n"
+      "   **/\n"
+      "  public int boolToInt(boolean b, boolean reverse) {\n"
+      "    if (b && !reverse) {\n"
+      "      return 1;\n"
+      "    }\n"
+      "    return 0;\n"
+      "  }\n"
+      "}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteMethod, ParameterizedMethod) {
+  SourceBufferWriter writer;
+  Type clazz = Type::Class("Test", "org.tensorflow");
+  Type type_t = Type::Generic("T").add_supertype(Type::Class("Number"));
+  clazz.add_parameter(type_t);
+  Method method = Method::Create("doNothing", type_t);
+
+  writer.BeginType(clazz, nullptr, PUBLIC)
+          .BeginMethod(method, PUBLIC)
+            .Append("return null;").EndLine()
+          .EndMethod()
+        .EndType();
+
+  const char* expected =
+      "package org.tensorflow;\n\n"
+      "public class Test<T extends Number> {\n"
+      "  \n"
+      "  public T doNothing() {\n"
+      "    return null;\n"
+      "  }\n"
+      "}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteMethod, StaticParameterizedMethod) {
+  SourceBufferWriter writer;
+  Type clazz = Type::Class("Test", "org.tensorflow");
+  Type type_t = Type::Generic("T").add_supertype(Type::Class("Number"));
+  clazz.add_parameter(type_t);
+  Method method = Method::Create("doNothing", type_t);
+
+  writer.BeginType(clazz, nullptr, PUBLIC)
+          .BeginMethod(method, PUBLIC | STATIC)
+            .Append("return null;").EndLine()
+          .EndMethod()
+        .EndType();
+
+  const char* expected =
+      "package org.tensorflow;\n\n"
+      "public class Test<T extends Number> {\n"
+      "  \n"
+      "  public static <T extends Number> T doNothing() {\n"
+      "    return null;\n"
+      "  }\n"
+      "}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
 }  // namespace
+}  // namespace java
 }  // namespace tensorflow
diff --git a/tensorflow/java/src/gen/resources/test.snippet.java b/tensorflow/java/src/gen/resources/test.snippet.java
new file mode 100644
index 0000000000000000000000000000000000000000..5e412a9aef436bb73a4d013d1b698b75ad9fbab4
--- /dev/null
+++ b/tensorflow/java/src/gen/resources/test.snippet.java
@@ -0,0 +1,2 @@
+// Here is a little snippet
+System.out.println("Hello!");
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 8b65b3f057651a0fa80443ca650cb98d1f182542..9dad747ac0bf4623cd81adc9c88d2d36d3f04920 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -28,6 +28,7 @@ load("//tensorflow:tensorflow.bzl", "py_tests")
 load("//tensorflow:tensorflow.bzl", "tf_py_build_info_genrule")
 load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc")
 load("//tensorflow:tensorflow.bzl", "tf_cc_shared_object")
+load("//tensorflow:tensorflow.bzl", "tf_native_cc_binary")
 load("//tensorflow:tensorflow.bzl", "tf_custom_op_library_additional_deps_impl")
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 load("//tensorflow:tensorflow.bzl", "cuda_py_tests")
@@ -59,14 +60,29 @@ py_library(
         "//tensorflow/tools/api/generator:__pkg__",
         "//tensorflow/tools/quantization:__pkg__",  # TODO(b/34059704): remove when fixed
     ],
+    deps = [
+        ":no_contrib",
+        "//tensorflow/contrib:contrib_py",
+    ],
+)
+
+py_library(
+    name = "no_contrib",
+    srcs = ["__init__.py"],
+    srcs_version = "PY2AND3",
+    visibility = [
+        "//tensorflow:__pkg__",
+    ],
     deps = [
         ":array_ops",
         ":bitwise_ops",
+        ":boosted_trees_ops",
         ":check_ops",
         ":client",
         ":client_testlib",
         ":confusion_matrix",
         ":control_flow_ops",
+        ":cudnn_rnn_ops_gen",
         ":errors",
         ":framework",
         ":framework_for_generated_wrappers",
@@ -107,7 +123,6 @@ py_library(
         ":training",
         ":util",
         ":weights_broadcast_ops",
-        "//tensorflow/contrib:contrib_py",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/data",
         "//tensorflow/python/estimator:estimator_py",
@@ -271,6 +286,17 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "py_exception_registry",
+    srcs = ["lib/core/py_exception_registry.cc"],
+    hdrs = ["lib/core/py_exception_registry.h"],
+    deps = [
+        "//tensorflow/c:c_api",
+        "//tensorflow/core:lib",
+        "//util/python:python_headers",
+    ],
+)
+
 cc_library(
     name = "kernel_registry",
     srcs = ["util/kernel_registry.cc"],
@@ -287,6 +313,7 @@ cc_library(
     srcs = ["util/util.cc"],
     hdrs = ["util/util.h"],
     deps = [
+        ":safe_ptr",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//util/python:python_headers",
@@ -400,6 +427,7 @@ tf_cc_shared_object(
             "-lm",
         ],
         "//tensorflow:darwin": [],
+        "//tensorflow:windows": [],
     }),
     deps = [
         "//tensorflow/core:framework_headers_lib",
@@ -1032,6 +1060,11 @@ cuda_py_tests(
         "//third_party/py/numpy",
         "//tensorflow/core:protos_all_py",
     ],
+    shard_count = 10,
+    tags = [
+        "noasan",
+        "optonly",
+    ],
 )
 
 py_test(
@@ -1048,7 +1081,7 @@ py_test(
 
 py_test(
     name = "framework_importer_test",
-    size = "medium",
+    size = "large",
     srcs = ["framework/importer_test.py"],
     main = "framework/importer_test.py",
     srcs_version = "PY2AND3",
@@ -1355,6 +1388,20 @@ tf_gen_op_wrapper_private_py(
     ],
 )
 
+tf_gen_op_wrapper_private_py(
+    name = "boosted_trees_ops_gen",
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/core:boosted_trees_ops_op_lib",
+    ],
+)
+
+tf_gen_op_wrapper_private_py(
+    name = "summary_ops_gen",
+    visibility = ["//tensorflow:__subpackages__"],
+    deps = ["//tensorflow/core:summary_ops_op_lib"],
+)
+
 tf_gen_op_wrapper_private_py(
     name = "audio_ops_gen",
     require_shape_functions = True,
@@ -1364,6 +1411,13 @@ tf_gen_op_wrapper_private_py(
     ],
 )
 
+tf_gen_op_wrapper_private_py(
+    name = "cudnn_rnn_ops_gen",
+    visibility = [
+        "//tensorflow:__subpackages__",
+    ],
+)
+
 tf_gen_op_wrapper_private_py(
     name = "candidate_sampling_ops_gen",
     visibility = ["//learning/brain/python/ops:__pkg__"],
@@ -1591,6 +1645,19 @@ py_library(
     ],
 )
 
+py_library(
+    name = "boosted_trees_ops",
+    srcs = ["ops/boosted_trees_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":boosted_trees_ops_gen",
+        ":framework",
+        ":ops",
+        ":training",
+        "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_py",
+    ],
+)
+
 py_library(
     name = "sets",
     srcs = [
@@ -1800,6 +1867,7 @@ py_library(
         ":platform",
         ":spectral_grad",
         ":util",
+        "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:tape",
         "//third_party/py/numpy",
@@ -2583,6 +2651,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":user_ops_gen",
+        ":util",
         "@six_archive//:six",
     ],
 )
@@ -2850,9 +2919,11 @@ py_library(
         ":client",
         ":control_flow_ops",
         ":data_flow_ops",
+        ":device",
         ":errors",
         ":framework",
         ":framework_for_generated_wrappers",
+        ":framework_ops",
         ":gradients",
         ":init_ops",
         ":io_ops",
@@ -2877,6 +2948,8 @@ py_library(
         ":variable_scope",
         ":variables",
         "//tensorflow/python/eager:backprop",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/ops/losses",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -2906,6 +2979,18 @@ py_test(
     ],
 )
 
+py_test(
+    name = "distribute_test",
+    size = "small",
+    srcs = ["training/distribute_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":client_testlib",
+        ":training",
+        ":variable_scope",
+    ],
+)
+
 py_test(
     name = "evaluation_test",
     size = "small",
@@ -3094,6 +3179,8 @@ tf_proto_library(
     srcs = ["framework/cpp_shape_inference.proto"],
     cc_api_version = 2,
     protodeps = tf_additional_all_protos(),
+    # TODO(b/74620627): remove when _USE_C_SHAPES is removed
+    visibility = ["//tensorflow:internal"],
 )
 
 py_test(
@@ -3178,6 +3265,7 @@ cuda_py_tests(
         ":client_testlib",
         ":framework_test_lib",
         ":platform_test",
+        "//tensorflow/core:protos_all_py",
     ],
 )
 
@@ -3237,6 +3325,7 @@ tf_py_wrap_cc(
         "grappler/model_analyzer.i",
         "grappler/tf_optimizer.i",
         "lib/core/bfloat16.i",
+        "lib/core/py_exception_registry.i",
         "lib/core/py_func.i",
         "lib/core/strings.i",
         "lib/io/file_io.i",
@@ -3268,6 +3357,7 @@ tf_py_wrap_cc(
         ":kernel_registry",
         ":numpy_lib",
         ":safe_ptr",
+        ":py_exception_registry",
         ":py_func_lib",
         ":py_record_reader_lib",
         ":py_record_writer_lib",
@@ -3307,7 +3397,7 @@ tf_py_wrap_cc(
 
 # Build a cc_binary from tf_custom_op_library_additional_deps_impl,
 # it contains all object code from its dependencies.
-cc_binary(
+tf_native_cc_binary(
     name = "tf_custom_op_library_additional_deps.so",
     linkshared = 1,
     linkstatic = 1,
@@ -3694,6 +3784,7 @@ cuda_py_test(
         ":math_ops",
         "//tensorflow/core:protos_all_py",
     ],
+    tags = ["no_windows"],
 )
 
 cuda_py_test(
@@ -3925,6 +4016,7 @@ py_test(
     srcs = ["training/saver_large_partitioned_variable_test.py"],
     srcs_version = "PY2AND3",
     tags = [
+        "no_windows",
         "noasan",  # http://b/30782289
         "notsan",  # http://b/30782289
     ],
@@ -4152,6 +4244,7 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
+        ":constant_op",
         ":errors",
         ":framework",
         ":framework_for_generated_wrappers",
@@ -4162,6 +4255,7 @@ py_library(
         ":pywrap_tensorflow",
         ":summary_op_util",
         ":summary_ops",
+        ":summary_ops_gen",
         ":util",
         "//tensorflow/python/eager:context",
         "//third_party/py/numpy",
@@ -4389,18 +4483,6 @@ py_test(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 cuda_py_test(
     name = "accumulate_n_benchmark",
     size = "large",
@@ -4702,6 +4784,7 @@ py_test(
         ":client_testlib",
         ":framework_for_generated_wrappers",
         ":math_ops",
+        ":tf_item",
         ":tf_optimizer",
         "//tensorflow/core:protos_all_py",
         "//third_party/py/numpy",
@@ -4762,6 +4845,29 @@ py_test(
     ],
 )
 
+cuda_py_test(
+    name = "constant_folding_test",
+    size = "medium",
+    srcs = [
+        "grappler/constant_folding_test.py",
+    ],
+    additional_deps = [
+        ":client_testlib",
+        ":framework_for_generated_wrappers",
+        ":array_ops",
+        ":control_flow_ops",
+        ":dtypes",
+        ":functional_ops",
+        ":math_ops",
+        ":ops",
+        "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
+    ],
+    tags = [
+        "grappler",
+    ],
+)
+
 cuda_py_test(
     name = "layout_optimizer_test",
     size = "medium",
diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py
index 5a9cd7531db6482313005feebaadf953c6da3bb4..ab1d01a8351d63544b2c612ad228515d48975aca 100644
--- a/tensorflow/python/__init__.py
+++ b/tensorflow/python/__init__.py
@@ -98,6 +98,12 @@ from tensorflow.python.summary import summary
 from tensorflow.python.user_ops import user_ops
 from tensorflow.python.util import compat
 
+# Import boosted trees ops to make sure the ops are registered (but unused).
+from tensorflow.python.ops import gen_boosted_trees_ops as _gen_boosted_trees_ops
+
+# Import cudnn rnn ops to make sure their ops are registered.
+from tensorflow.python.ops import gen_cudnn_rnn_ops as _
+
 
 # Import the names from python/training.py as train.Name.
 from tensorflow.python.training import training as train
diff --git a/tensorflow/python/client/device_lib.i b/tensorflow/python/client/device_lib.i
index 51c04584a5492e13f5fead627685954d4f810dfa..944e855cee2ab9da7a4a801d1b993bec4d8ebc55 100644
--- a/tensorflow/python/client/device_lib.i
+++ b/tensorflow/python/client/device_lib.i
@@ -15,19 +15,39 @@ limitations under the License.
 
 %include "tensorflow/python/platform/base.i"
 
+%typemap(in) const tensorflow::ConfigProto& (tensorflow::ConfigProto temp) {
+  char* c_string;
+  Py_ssize_t py_size;
+  if (PyBytes_AsStringAndSize($input, &c_string, &py_size) == -1) {
+    // Python has raised an error (likely TypeError or UnicodeEncodeError).
+    SWIG_fail;
+  }
+
+  if (!temp.ParseFromString(string(c_string, py_size))) {
+    PyErr_SetString(
+        PyExc_TypeError,
+        "The ConfigProto could not be parsed as a valid protocol buffer");
+    SWIG_fail;
+  }
+  $1 = &temp;
+}
+
 %{
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/framework/device_attributes.pb.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/protobuf/config.pb.h"
 #include "tensorflow/core/public/session_options.h"
 
 namespace tensorflow {
 namespace swig {
 
-static std::vector<string> ListDevices(TF_Status* out_status) {
+static std::vector<string> ListDevicesWithSessionConfig(
+    const tensorflow::ConfigProto& config, TF_Status* out_status) {
   std::vector<string> output;
   SessionOptions options;
+  options.config = config;
   std::vector<Device*> devices;
   Status status = DeviceFactory::AddDevices(
       options, "" /* name_prefix */, &devices);
@@ -35,7 +55,8 @@ static std::vector<string> ListDevices(TF_Status* out_status) {
     Set_TF_Status_from_Status(out_status, status);
   }
 
-  std::vector<std::unique_ptr<Device>> device_holder(devices.begin(), devices.end());
+  std::vector<std::unique_ptr<Device>> device_holder(devices.begin(),
+                                                     devices.end());
 
   for (const Device* device : devices) {
     const DeviceAttributes& attr = device->attributes();
@@ -53,6 +74,11 @@ static std::vector<string> ListDevices(TF_Status* out_status) {
   return output;
 }
 
+std::vector<string> ListDevices(TF_Status* out_status) {
+  tensorflow::ConfigProto session_config;
+  return ListDevicesWithSessionConfig(session_config, out_status);
+}
+
 }  // namespace swig
 }  // namespace tensorflow
 
@@ -62,21 +88,28 @@ static std::vector<string> ListDevices(TF_Status* out_status) {
 
 %unignore tensorflow;
 %unignore tensorflow::swig;
+%unignore tensorflow::swig::ListDevicesWithSessionConfig;
 %unignore tensorflow::swig::ListDevices;
 
 // Wrap this function
 namespace tensorflow {
 namespace swig {
 std::vector<string> ListDevices(TF_Status* out_status);
+static std::vector<string> ListDevicesWithSessionConfig(
+    const tensorflow::ConfigProto& config, TF_Status* out_status);
 }  // namespace swig
 }  // namespace tensorflow
 
 %insert("python") %{
-def list_devices():
+def list_devices(session_config=None):
   from tensorflow.python.framework import errors
 
   with errors.raise_exception_on_not_ok_status() as status:
-    return ListDevices(status)
+    if session_config:
+      return ListDevicesWithSessionConfig(session_config.SerializeToString(),
+                                          status)
+    else:
+      return ListDevices(status)
 %}
 
 %unignoreall
diff --git a/tensorflow/python/client/device_lib.py b/tensorflow/python/client/device_lib.py
index ad430cbae5a42a388cc8c41bf8be9db253aa92f2..9d90d5395e288e5988c60df64b9d962f5cccc22a 100644
--- a/tensorflow/python/client/device_lib.py
+++ b/tensorflow/python/client/device_lib.py
@@ -22,9 +22,12 @@ from tensorflow.core.framework import device_attributes_pb2
 from tensorflow.python import pywrap_tensorflow
 
 
-def list_local_devices():
+def list_local_devices(session_config=None):
   """List the available devices available in the local process.
 
+  Args:
+    session_config: a session config proto or None to use the default config.
+
   Returns:
     A list of `DeviceAttribute` protocol buffers.
   """
@@ -33,4 +36,7 @@ def list_local_devices():
     m.ParseFromString(pb_str)
     return m
 
-  return [_convert(s) for s in pywrap_tensorflow.list_devices()]
+  return [
+      _convert(s)
+      for s in pywrap_tensorflow.list_devices(session_config=session_config)
+  ]
diff --git a/tensorflow/python/client/device_lib_test.py b/tensorflow/python/client/device_lib_test.py
index aaf41626ab0078489026036d2b838f33a893a540..fec41f50b6c130704d587d6c7b80297c95183005 100644
--- a/tensorflow/python/client/device_lib_test.py
+++ b/tensorflow/python/client/device_lib_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import device_lib
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import googletest
@@ -31,6 +32,10 @@ class DeviceLibTest(test_util.TensorFlowTestCase):
     self.assertGreater(len(devices), 0)
     self.assertEqual(devices[0].device_type, "CPU")
 
+    devices = device_lib.list_local_devices(config_pb2.ConfigProto())
+    self.assertGreater(len(devices), 0)
+    self.assertEqual(devices[0].device_type, "CPU")
+
     # GPU test
     if test.is_gpu_available():
       self.assertGreater(len(devices), 1)
diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py
index 924d62992a6f7c6e39b73752530eab70c6b5ce91..4c84d78f2e11922e4819e45aaee79374c8c5ec34 100644
--- a/tensorflow/python/client/session.py
+++ b/tensorflow/python/client/session.py
@@ -21,12 +21,12 @@ from __future__ import print_function
 import functools
 import re
 import threading
+import warnings
 
 import numpy as np
 
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python import pywrap_tensorflow as tf_session
-from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import device
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
@@ -628,14 +628,12 @@ class BaseSession(SessionInterface):
     self._session = None
     opts = tf_session.TF_NewSessionOptions(target=self._target, config=config)
     try:
-      with errors.raise_exception_on_not_ok_status() as status:
-        if self._created_with_new_api:
-          # pylint: disable=protected-access
-          self._session = tf_session.TF_NewSession(self._graph._c_graph, opts,
-                                                   status)
-          # pylint: enable=protected-access
-        else:
-          self._session = tf_session.TF_NewDeprecatedSession(opts, status)
+      if self._created_with_new_api:
+        # pylint: disable=protected-access
+        self._session = tf_session.TF_NewSession(self._graph._c_graph, opts)
+        # pylint: enable=protected-access
+      else:
+        self._session = tf_session.TF_NewDeprecatedSession(opts)
     finally:
       tf_session.TF_DeleteSessionOptions(opts)
 
@@ -662,22 +660,20 @@ class BaseSession(SessionInterface):
     Returns:
       A list of devices in the session.
     """
-    with errors.raise_exception_on_not_ok_status() as status:
-      if self._created_with_new_api:
-        raw_device_list = tf_session.TF_SessionListDevices(
-            self._session, status)
-      else:
-        raw_device_list = tf_session.TF_DeprecatedSessionListDevices(
-            self._session, status)
-      device_list = []
-      size = tf_session.TF_DeviceListCount(raw_device_list)
-      for i in range(size):
-        name = tf_session.TF_DeviceListName(raw_device_list, i, status)
-        device_type = tf_session.TF_DeviceListType(raw_device_list, i, status)
-        memory = tf_session.TF_DeviceListMemoryBytes(raw_device_list, i, status)
-        device_list.append(_DeviceAttributes(name, device_type, memory))
-      tf_session.TF_DeleteDeviceList(raw_device_list)
-      return device_list
+    if self._created_with_new_api:
+      raw_device_list = tf_session.TF_SessionListDevices(self._session)
+    else:
+      raw_device_list = tf_session.TF_DeprecatedSessionListDevices(
+          self._session)
+    device_list = []
+    size = tf_session.TF_DeviceListCount(raw_device_list)
+    for i in range(size):
+      name = tf_session.TF_DeviceListName(raw_device_list, i)
+      device_type = tf_session.TF_DeviceListType(raw_device_list, i)
+      memory = tf_session.TF_DeviceListMemoryBytes(raw_device_list, i)
+      device_list.append(_DeviceAttributes(name, device_type, memory))
+    tf_session.TF_DeleteDeviceList(raw_device_list)
+    return device_list
 
   def close(self):
     """Closes this session.
@@ -691,15 +687,13 @@ class BaseSession(SessionInterface):
     if self._created_with_new_api:
       if self._session and not self._closed:
         self._closed = True
-        with errors.raise_exception_on_not_ok_status() as status:
-          tf_session.TF_CloseSession(self._session, status)
+        tf_session.TF_CloseSession(self._session)
 
     else:
       with self._extend_lock:
         if self._opened and not self._closed:
           self._closed = True
-          with errors.raise_exception_on_not_ok_status() as status:
-            tf_session.TF_CloseDeprecatedSession(self._session, status)
+          tf_session.TF_CloseDeprecatedSession(self._session)
 
   def __del__(self):
     # cleanly ignore all exceptions
@@ -709,11 +703,10 @@ class BaseSession(SessionInterface):
       pass
     if self._session is not None:
       try:
-        status = c_api_util.ScopedTFStatus()
         if self._created_with_new_api:
-          tf_session.TF_DeleteSession(self._session, status)
+          tf_session.TF_DeleteSession(self._session)
         else:
-          tf_session.TF_DeleteDeprecatedSession(self._session, status)
+          tf_session.TF_DeleteDeprecatedSession(self._session)
       except AttributeError:
         # At shutdown, `c_api_util` or `tf_session` may have been garbage
         # collected, causing the above method calls to fail. In this case,
@@ -888,6 +881,8 @@ class BaseSession(SessionInterface):
       Either a single value if `fetches` is a single graph element, or
       a list of values if `fetches` is a list, or a dictionary with the
       same keys as `fetches` if that is a dictionary (described above).
+      Order in which `fetches` operations are evaluated inside the call
+      is undefined.
 
     Raises:
       RuntimeError: If this `Session` is in an invalid state (e.g. has been
@@ -1028,11 +1023,11 @@ class BaseSession(SessionInterface):
     # Set up a graph with feeds and fetches for partial run.
     def _setup_fn(session, feed_list, fetch_list, target_list):
       self._extend_graph()
-      with errors.raise_exception_on_not_ok_status() as status:
-        if self._created_with_new_api:
-          return tf_session.TF_SessionPRunSetup_wrapper(
-              session, feed_list, fetch_list, target_list, status)
-        else:
+      if self._created_with_new_api:
+        return tf_session.TF_SessionPRunSetup_wrapper(
+            session, feed_list, fetch_list, target_list)
+      else:
+        with errors.raise_exception_on_not_ok_status() as status:
           return tf_session.TF_PRunSetup(session, feed_list, fetch_list,
                                          target_list, status)
 
@@ -1342,8 +1337,7 @@ class BaseSession(SessionInterface):
   def _extend_graph(self):
     if self._created_with_new_api:
       with self._graph._lock:  # pylint: disable=protected-access
-        with errors.raise_exception_on_not_ok_status() as status:
-          tf_session.ExtendSession(self._session, status)
+        tf_session.ExtendSession(self._session)
     else:
       # Ensure any changes to the graph are reflected in the runtime.
       with self._extend_lock:
@@ -1409,25 +1403,82 @@ class BaseSession(SessionInterface):
 
   def _call_tf_sessionrun(self, options, feed_dict, fetch_list, target_list,
                           run_metadata):
-    with errors.raise_exception_on_not_ok_status() as status:
-      if self._created_with_new_api:
-        return tf_session.TF_SessionRun_wrapper(
-            self._session, options, feed_dict, fetch_list, target_list,
-            run_metadata, status)
-      else:
+    if self._created_with_new_api:
+      return tf_session.TF_SessionRun_wrapper(
+          self._session, options, feed_dict, fetch_list, target_list,
+          run_metadata)
+    else:
+      with errors.raise_exception_on_not_ok_status() as status:
         return tf_session.TF_Run(
             self._session, options, feed_dict, fetch_list, target_list,
             status, run_metadata)
 
   def _call_tf_sessionprun(self, handle, feed_dict, fetch_list):
-    with errors.raise_exception_on_not_ok_status() as status:
-      if self._created_with_new_api:
-        return tf_session.TF_SessionPRun_wrapper(
-            self._session, handle, feed_dict, fetch_list, status)
-      else:
+    if self._created_with_new_api:
+      return tf_session.TF_SessionPRun_wrapper(
+          self._session, handle, feed_dict, fetch_list)
+    else:
+      with errors.raise_exception_on_not_ok_status() as status:
         return tf_session.TF_PRun(
             self._session, handle, feed_dict, fetch_list, status)
 
+  # pylint: disable=protected-access
+  class _Callable(object):
+    """Experimental wrapper for the C++ `Session::MakeCallable()` API."""
+
+    def __init__(self, session, callable_options):
+      self._session = session
+      self._handle = None
+      options_ptr = tf_session.TF_NewBufferFromString(
+          compat.as_bytes(callable_options.SerializeToString()))
+      try:
+        with errors.raise_exception_on_not_ok_status() as status:
+          if session._created_with_new_api:
+            self._handle = tf_session.TF_SessionMakeCallable(
+                session._session, options_ptr, status)
+          else:
+            self._handle = tf_session.TF_DeprecatedSessionMakeCallable(
+                session._session, options_ptr, status)
+      finally:
+        tf_session.TF_DeleteBuffer(options_ptr)
+
+    def __call__(self, *args):
+      # TODO(b/74355905): Support argument and return value nested structures,
+      # and tensor-like objects such as SparseTensors.
+      with errors.raise_exception_on_not_ok_status() as status:
+        if self._session._created_with_new_api:
+          return tf_session.TF_SessionRunCallable(
+              self._session._session, self._handle, args, status, None)
+        else:
+          return tf_session.TF_DeprecatedSessionRunCallable(
+              self._session._session, self._handle, args, status, None)
+
+    def __del__(self):
+      if self._handle is not None:
+        with errors.raise_exception_on_not_ok_status() as status:
+          if self._session._created_with_new_api:
+            tf_session.TF_SessionReleaseCallable(
+                self._session._session, self._handle, status)
+          else:
+            tf_session.TF_DeprecatedSessionReleaseCallable(
+                self._session._session, self._handle, status)
+  # pylint: enable=protected-access
+
+  # TODO(b/74355905): Reimplement `Session.make_callable()` using this method
+  # where possible.
+  def _make_callable_from_options(self, callable_options):
+    """Returns a handle to a "callable" with the given options.
+
+    Args:
+      callable_options: A `CallableOptions` protocol buffer message describing
+        the computation that will be performed by the callable.
+
+    Returns:
+      A handle to the new callable.
+    """
+    self._extend_graph()
+    return BaseSession._Callable(self, callable_options)
+
 
 @tf_export('Session')
 class Session(BaseSession):
@@ -1624,6 +1675,9 @@ class InteractiveSession(BaseSession):
   ```
   """
 
+  _count_lock = threading.Lock()
+  _active_session_count = 0  # GUARDED_BY(_count_lock)
+
   def __init__(self, target='', graph=None, config=None):
     """Creates a new interactive TensorFlow session.
 
@@ -1652,6 +1706,19 @@ class InteractiveSession(BaseSession):
     config.graph_options.place_pruned_graph = True
 
     super(InteractiveSession, self).__init__(target, graph, config)
+    with InteractiveSession._count_lock:
+      if InteractiveSession._active_session_count > 0:
+        warnings.warn('An interactive session is already active. This can '
+                      'cause out-of-memory errors in some cases. You must '
+                      'explicitly call `InteractiveSession.close()` to release '
+                      'resources held by the other session(s).')
+      InteractiveSession._active_session_count += 1
+    # NOTE(mrry): We do not use `Session._closed` here because it has unhelpful
+    # semantics (in particular, it is not set to true if `Session.close()` is
+    # called on a session that has not been "opened" by running a step) and we
+    # cannot change those semantics without breaking existing code.
+    self._explicitly_closed = False
+
     self._default_session = self.as_default()
     self._default_session.enforce_nesting = False
     self._default_session.__enter__()
@@ -1664,6 +1731,14 @@ class InteractiveSession(BaseSession):
   def close(self):
     """Closes an `InteractiveSession`."""
     super(InteractiveSession, self).close()
+    with InteractiveSession._count_lock:
+      if not self._explicitly_closed:
+        InteractiveSession._active_session_count -= 1
+        self._explicitly_closed = True
+      else:
+        return
     if self._explicit_graph is not None:
       self._default_graph.__exit__(None, None, None)
+      self._default_graph = None
     self._default_session.__exit__(None, None, None)
+    self._default_session = None
diff --git a/tensorflow/python/client/session_list_devices_test.py b/tensorflow/python/client/session_list_devices_test.py
index 5a7413c12e9db92cb85d54a69602753ff6476425..38a3acb2dc304968915e84c8054621e441294e61 100644
--- a/tensorflow/python/client/session_list_devices_test.py
+++ b/tensorflow/python/client/session_list_devices_test.py
@@ -23,7 +23,6 @@ from tensorflow.core.protobuf import cluster_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python import pywrap_tensorflow as tf_session
 from tensorflow.python.client import session
-from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
@@ -42,21 +41,13 @@ class SessionListDevicesTestMethods(object):
 
   def testInvalidDeviceNumber(self):
     opts = tf_session.TF_NewSessionOptions()
-    with errors.raise_exception_on_not_ok_status() as status:
-      c_session = tf_session.TF_NewSession(
-          ops.get_default_graph()._c_graph, opts, status)
-      raw_device_list = tf_session.TF_SessionListDevices(
-          c_session, status)
+    c_session = tf_session.TF_NewSession(ops.get_default_graph()._c_graph, opts)
+    raw_device_list = tf_session.TF_SessionListDevices(c_session)
     size = tf_session.TF_DeviceListCount(raw_device_list)
-    # Test that invalid device numbers return -1 rather than a Swig-wrapped
-    # pointer.
-    status_no_exception = c_api_util.ScopedTFStatus()
-    memory = tf_session.TF_DeviceListMemoryBytes(
-        raw_device_list, size, status_no_exception)
-    self.assertEqual(memory, -1)
+    with self.assertRaises(errors.InvalidArgumentError):
+      tf_session.TF_DeviceListMemoryBytes(raw_device_list, size)
     tf_session.TF_DeleteDeviceList(raw_device_list)
-    with errors.raise_exception_on_not_ok_status() as status:
-      tf_session.TF_CloseSession(c_session, status)
+    tf_session.TF_CloseSession(c_session)
 
   def testListDevicesGrpcSession(self):
     server = server_lib.Server.create_local_server()
diff --git a/tensorflow/python/client/session_test.py b/tensorflow/python/client/session_test.py
index ccd7a5117ae05fb51d6c5b3e3e81c9aba5a26d50..92497272c66b5c3be36aba75b9e3b7f3d99b062d 100644
--- a/tensorflow/python/client/session_test.py
+++ b/tensorflow/python/client/session_test.py
@@ -22,13 +22,13 @@ import os
 import sys
 import threading
 import time
+import warnings
 
 import numpy as np
 import six
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.core.framework import attr_value_pb2
-from tensorflow.core.framework import types_pb2
 from tensorflow.core.lib.core import error_codes_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
@@ -62,10 +62,13 @@ from tensorflow.python.util import compat
 ops.RegisterShape('ConstructionFails')(common_shapes.unknown_shape)
 
 
-# TODO(skyewm): reenable when this works with _USE_C_SHAPES=False
-# @test_util.with_c_api
+@test_util.with_c_api
 class SessionTest(test_util.TensorFlowTestCase):
 
+  def setUp(self):
+    super(SessionTest, self).setUp()
+    warnings.simplefilter('always')
+
   def testUseExistingGraph(self):
     with ops.Graph().as_default() as g, ops.device('/cpu:0'):
       a = constant_op.constant(6.0, shape=[1, 1])
@@ -190,12 +193,10 @@ class SessionTest(test_util.TensorFlowTestCase):
       a = constant_op.constant(0.0, shape=[2, 3])
       # NOTE(mrry): The original_op is nonsense, but used here to test that the
       #   errors are reported correctly.
-      # pylint: disable=protected-access
       with sess.graph._original_op(a.op):
         b = array_ops.identity(a, name='id')
       with sess.graph._original_op(b.op):
         c = array_ops.placeholder(dtypes.float32)
-      # pylint: enable=protected-access
 
       def exc_predicate(e):
         return (e.op == c.op and e.op._original_op == b.op and
@@ -1193,6 +1194,33 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([[24.0]], e.eval())
       sess.close()
 
+  def testMultipleInteractiveSessionsWarning(self):
+    # Reinitialize the global state to ensure that the expected warnings will
+    # be emitted.
+    session.InteractiveSession._active_session_count = 0  # pylint: disable=protected-access
+
+    sess = session.InteractiveSession()
+    sess.run(constant_op.constant(4.0))  # Run so that the session is "opened".
+    sess.close()
+    # Opening and closing interactive sessions serially should not warn.
+    with warnings.catch_warnings(record=True) as w:
+      sess = session.InteractiveSession()
+      sess.close()
+    self.assertEqual(0, len(w))
+
+    with warnings.catch_warnings(record=True) as w:
+      sess = session.InteractiveSession()
+    self.assertEqual(0, len(w))
+    with warnings.catch_warnings(record=True) as w:
+      sess2 = session.InteractiveSession()
+    self.assertEqual(1, len(w))
+    self.assertTrue('An interactive session is already active. This can cause '
+                    'out-of-memory errors in some cases. You must explicitly '
+                    'call `InteractiveSession.close()` to release resources '
+                    'held by the other session(s).' in str(w[0].message))
+    sess2.close()
+    sess.close()
+
   def testInteractivePlacePrunedGraph(self):
     sess = session.InteractiveSession()
 
@@ -1343,6 +1371,18 @@ class SessionTest(test_util.TensorFlowTestCase):
                               run_metadata=run_metadata))
       self.assertGreater(len(run_metadata.step_stats.dev_stats), 0)
 
+  def testOptimizedMakeCallable(self):
+    with session.Session() as sess:
+      ph = array_ops.placeholder(dtypes.float32)
+      a = math_ops.add(ph, 1.0)
+      callable_opts = config_pb2.CallableOptions()
+      callable_opts.feed.append(ph.name)
+      callable_opts.fetch.append(a.name)
+      for _ in range(3):
+        callable_fn = sess._make_callable_from_options(callable_opts)
+        for _ in range(5):
+          self.assertEqual([2.0], callable_fn(np.array(1.0, dtype=np.float32)))
+
   def testFeedError(self):
     with session.Session() as sess:
       feed_t = array_ops.placeholder(dtype=dtypes.float32)
@@ -1785,8 +1825,8 @@ class SessionTest(test_util.TensorFlowTestCase):
     # Ensure that errors from building the graph get propagated.
     data = array_ops.placeholder(dtypes.float32, shape=[])
     # pylint: disable=protected-access
-    enter_1 = gen_control_flow_ops._enter(data, 'foo_1', False)
-    enter_2 = gen_control_flow_ops._enter(data, 'foo_2', False)
+    enter_1 = gen_control_flow_ops.enter(data, 'foo_1', False)
+    enter_2 = gen_control_flow_ops.enter(data, 'foo_2', False)
     # pylint: enable=protected-access
     res = math_ops.add(enter_1, enter_2)
     with self.assertRaisesOpError('has inputs from different frames'):
@@ -1855,144 +1895,5 @@ class SessionTest(test_util.TensorFlowTestCase):
         sess.run(a, feed_dict={a: 1})
 
 
-class GraphMutationTest(test_util.TensorFlowTestCase):
-
-  def setUp(self):
-    self._original_use_c_api_value = ops._USE_C_API
-    ops._USE_C_API = True
-    super(GraphMutationTest, self).setUp()
-
-  def tearDown(self):
-    ops._USE_C_API = self._original_use_c_api_value
-    super(GraphMutationTest, self).tearDown()
-
-  def testUpdateInputAfterRunning(self):
-    with ops.Graph().as_default() as g:
-      a = constant_op.constant(1.0)
-      b = constant_op.constant(2.0)
-      c = a + b
-
-    with session.Session(graph=g) as sess:
-      self.assertAllEqual(3.0, sess.run(c))
-      c.op._update_input(1, a)  # pylint: disable=protected-access
-      with self.assertRaisesRegexp(
-          errors.FailedPreconditionError,
-          'add.*was changed by updating input tensor after it was run'):
-        sess.run(c)
-
-      # Check that running the graph with a new session is fine
-      with session.Session(graph=g) as sess2:
-        self.assertAllEqual(2.0, sess2.run(c))
-
-  def testSetDeviceAfterRunning(self):
-    with ops.Graph().as_default() as g:
-      a = constant_op.constant(1.0)
-      b = constant_op.constant(2.0)
-      c = a + b
-
-    with session.Session(graph=g) as sess:
-      self.assertAllEqual(3.0, sess.run(c))
-      c.op._set_device('/cpu:0')  # pylint: disable=protected-access
-      with self.assertRaisesRegexp(
-          errors.FailedPreconditionError,
-          'add.*was changed by setting device after it was run'):
-        sess.run(c)
-
-  def testSetAttrAfterRunning(self):
-    with ops.Graph().as_default() as g:
-      a = constant_op.constant(1.0, dtype=dtypes.float32)
-      b = math_ops.cast(a, dtypes.float64)
-
-    with session.Session(graph=g) as sess:
-      self.assertAllEqual(1.0, sess.run(b))
-      b.op._set_attr('DstT', attr_value_pb2.AttrValue(type=types_pb2.DT_FLOAT))
-      with self.assertRaisesRegexp(
-          errors.FailedPreconditionError,
-          'Cast.*was changed by setting attribute after it was run'):
-        sess.run(b)
-
-  def testRunModifyRun(self):
-    with ops.Graph().as_default() as g:
-      a = constant_op.constant(1.0)
-      b = constant_op.constant(2.0)
-      c = a + b
-
-      with session.Session(graph=g) as sess:
-        self.assertAllEqual(3.0, sess.run(c))
-
-        d = b + c
-        d.op._update_input(0, a)  # pylint: disable=protected-access
-        self.assertAllEqual(3.0, sess.run(c))
-        self.assertAllEqual(4.0, sess.run(d))
-
-  def testRunModifyRunTwoSessions(self):
-    with ops.Graph().as_default() as g:
-      a = constant_op.constant(1.0)
-      b = constant_op.constant(2.0)
-      c = a + b
-
-      with session.Session(graph=g) as sess1:
-        with session.Session(graph=g) as sess2:
-          self.assertAllEqual(3.0, sess1.run(c))
-          self.assertAllEqual(3.0, sess2.run(c))
-
-          d = b + c
-          d.op._update_input(0, a)  # pylint: disable=protected-access
-          self.assertAllEqual(3.0, sess2.run(c))
-          self.assertAllEqual(4.0, sess2.run(d))
-
-          d.op._update_input(0, b)  # pylint: disable=protected-access
-          self.assertAllEqual(3.0, sess1.run(c))
-          self.assertAllEqual(5.0, sess1.run(d))
-
-          with self.assertRaisesRegexp(
-              errors.FailedPreconditionError,
-              'add.*was changed by updating input tensor after it was run'):
-            sess2.run(c)
-
-  def testTwoSessionsOneRunBeforeModification(self):
-    with ops.Graph().as_default() as g, ops.device('/cpu:0'):
-      a = constant_op.constant(1.0)
-      b = constant_op.constant(2.0)
-      c = a + b
-
-    with session.Session(graph=g) as sess1:
-      with session.Session(graph=g) as sess2:
-        sess1.run(c)
-
-        c.op._set_device('/cpu:0')  # pylint: disable=protected-access
-
-        with self.assertRaisesRegexp(
-            errors.FailedPreconditionError,
-            'add.*was changed by setting device after it was run'):
-          sess1.run(c)
-
-        # sess2 was not run before modification
-        self.assertAllEqual(3.0, sess2.run(c))
-
-  def testTwoSessionsBothRunBeforeModification(self):
-    with ops.Graph().as_default() as g, ops.device('/cpu:0'):
-      a = constant_op.constant(1.0)
-      b = constant_op.constant(2.0)
-      c = a + b
-
-    with session.Session(graph=g) as sess1:
-      with session.Session(graph=g) as sess2:
-        sess1.run(c)
-        sess2.run(c)
-
-        c.op._set_device('/cpu:0')  # pylint: disable=protected-access
-
-        with self.assertRaisesRegexp(
-            errors.FailedPreconditionError,
-            'add.*was changed by setting device after it was run'):
-          sess1.run(c)
-
-        with self.assertRaisesRegexp(
-            errors.FailedPreconditionError,
-            'add.*was changed by setting device after it was run'):
-          sess2.run(c)
-
-
 if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/python/client/tf_session.i b/tensorflow/python/client/tf_session.i
index e88fc0c01a8bb7534f47e2a0389965c102bbad7b..b82182d5d3690e4601b4fe8423cef972139f2283 100644
--- a/tensorflow/python/client/tf_session.i
+++ b/tensorflow/python/client/tf_session.i
@@ -18,11 +18,11 @@ limitations under the License.
 %{
 
 #include "tensorflow/c/python_api.h"
-#include "tensorflow/python/client/tf_session_helper.h"
 #include "tensorflow/core/framework/session_state.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/public/version.h"
+#include "tensorflow/python/client/tf_session_helper.h"
 
 // Helper function to convert a Python list of Tensors to a C++ vector of
 // TF_Outputs.
@@ -72,7 +72,7 @@ void PyInt64ListToVector(PyObject* py_int_seq, std::vector<int64_t>* vec) {
   int size = PySequence_Fast_GET_SIZE(py_int_seq);
   for (int i = 0; i < size; ++i) {
     PyObject* item = PySequence_Fast_GET_ITEM(py_int_seq, i);
-    vec->push_back(PyInt_AsLong(item));
+    vec->push_back(PyLong_AsLongLong(item));
   }
 }
 
@@ -157,6 +157,25 @@ tensorflow::ImportNumpy();
   }
 }
 
+// We use TF_OperationGetControlOutputs_wrapper instead of
+// TF_OperationGetControlOutputs
+%ignore TF_OperationGetControlOutputs;
+%unignore TF_OperationGetControlOutputs_wrapper;
+// See comment for "%noexception TF_SessionRun_wrapper;"
+%noexception TF_OperationGetControlOutputs_wrapper;
+
+// Build a Python list of TF_Operation* and return it.
+%typemap(out) std::vector<TF_Operation*> tensorflow::TF_OperationGetControlOutputs_wrapper {
+  $result = PyList_New($1.size());
+  if (!$result) {
+    SWIG_exception_fail(SWIG_MemoryError, "$symname: couldn't create list");
+  }
+
+  for (size_t i = 0; i < $1.size(); ++i) {
+    PyList_SET_ITEM($result, i, CreateWrappedTFOperation($1[i]));
+  }
+}
+
 %ignore TF_OperationOutputConsumers;
 %unignore TF_OperationOutputConsumers_wrapper;
 // See comment for "%noexception TF_SessionRun_wrapper;"
@@ -419,6 +438,30 @@ TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper{
   $result = new_result;
 }
 
+%typemap(in, numinputs=0) int64_t* out_handle (int64_t out_handle) {
+  $1 = &out_handle;
+}
+
+%typemap(argout) int64_t* out_handle {
+  $result = PyLong_FromLongLong(*$1);
+}
+
+%typemap(in) int64_t handle {
+  if (!PyLong_Check($input)) {
+    SWIG_exception_fail(
+        SWIG_TypeError,
+        tensorflow::strings::Printf(
+            "Expected a python long for conversion to callable handle but got %s",
+            Py_TYPE($input)->tp_name).c_str());
+  }
+  $1 = PyLong_AsLongLong($input);
+}
+
+// Override default py3 behavior of attempting to encode into Unicode.
+%typemap(out) std::string tensorflow::ResourceHandleShapeAndType {
+  $result = PyBytes_FromStringAndSize($1.data(), $1.size());
+}
+
 // TODO(skyewm): SWIG emits a warning for the const char* in TF_WhileParams,
 // skip for now
 %ignore TF_WhileParams;
@@ -452,6 +495,17 @@ TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper{
 // See comment for "%noexception TF_SessionRun_wrapper;"
 %noexception TF_SessionPRun_wrapper;
 
+%unignore TF_DeprecatedSessionMakeCallable;
+%unignore TF_SessionMakeCallable;
+%unignore TF_DeprecatedSessionRunCallable;
+%unignore TF_SessionRunCallable;
+%unignore TF_DeprecatedSessionReleaseCallable;
+%unignore TF_SessionReleaseCallable;
+
+// See comment for "%noexception TF_SessionRun_wrapper;"
+%noexception TF_DeprecatedSessionRunCallable;
+%noexception TF_SessionRunCallable;
+
 %rename("_TF_SetTarget") TF_SetTarget;
 %rename("_TF_SetConfig") TF_SetConfig;
 %rename("_TF_NewSessionOptions") TF_NewSessionOptions;
@@ -469,9 +523,8 @@ TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper{
       _TF_SetTarget(opts, target)
     if config is not None:
       from tensorflow.python.framework import errors
-      with errors.raise_exception_on_not_ok_status() as status:
-        config_str = config.SerializeToString()
-        _TF_SetConfig(opts, config_str, status)
+      config_str = config.SerializeToString()
+      _TF_SetConfig(opts, config_str)
     return opts
 %}
 
@@ -723,6 +776,7 @@ def TF_Reset(target, containers=None, config=None):
 %unignore TF_TryEvaluateConstant_wrapper;
 %noexception TF_TryEvaluateConstant_wrapper;
 %unignore ExtendSession;
+%unignore ResourceHandleShapeAndType;
 
 %include "tensorflow/python/client/tf_session_helper.h"
 
diff --git a/tensorflow/python/client/tf_session_helper.cc b/tensorflow/python/client/tf_session_helper.cc
index a8ab91749a86749a1eef25e2674634334682d0f3..b48d758e4a051b01db09121921f95050a8c4c3d1 100644
--- a/tensorflow/python/client/tf_session_helper.cc
+++ b/tensorflow/python/client/tf_session_helper.cc
@@ -155,6 +155,156 @@ void TF_Run_wrapper(TF_DeprecatedSession* session, const TF_Buffer* run_options,
   ClearDecrefCache();
 }
 
+namespace {
+void MakeCallableHelper(tensorflow::Session* session,
+                        const TF_Buffer* callable_options, int64_t* out_handle,
+                        TF_Status* out_status) {
+  tensorflow::CallableOptions callable_options_proto;
+  if (callable_options != nullptr &&
+      !callable_options_proto.ParseFromArray(callable_options->data,
+                                             callable_options->length)) {
+    Set_TF_Status_from_Status(
+        out_status,
+        errors::InvalidArgument("Unparseable CallableOptions proto"));
+    return;
+  }
+  tensorflow::Session::CallableHandle handle;
+  Status s = session->MakeCallable(callable_options_proto, &handle);
+  if (!s.ok()) {
+    Set_TF_Status_from_Status(out_status, s);
+    return;
+  }
+  *out_handle = handle;
+}
+}  // namespace
+
+void TF_DeprecatedSessionMakeCallable(TF_DeprecatedSession* session,
+                                      const TF_Buffer* callable_options,
+                                      int64_t* out_handle,
+                                      TF_Status* out_status) {
+  MakeCallableHelper(session->session, callable_options, out_handle,
+                     out_status);
+}
+void TF_SessionMakeCallable(TF_Session* session,
+                            const TF_Buffer* callable_options,
+                            int64_t* out_handle, TF_Status* out_status) {
+  MakeCallableHelper(session->session, callable_options, out_handle,
+                     out_status);
+}
+
+namespace {
+void RunCallableHelper(tensorflow::Session* session, int64_t handle,
+                       PyObject* feed_values, TF_Status* out_status,
+                       PyObjectVector* out_values, TF_Buffer* run_metadata) {
+  // Convert feed values to a vector of tensorflow::Tensor objects.
+  std::vector<Tensor> input_tensors;
+  Status s;
+  {
+    feed_values =
+        PySequence_Fast(feed_values, "feed_values must be a sequence");
+    if (feed_values == nullptr) return;
+    Safe_PyObjectPtr feed_values_holder(make_safe(feed_values));
+    Py_ssize_t len = PySequence_Fast_GET_SIZE(feed_values);
+    input_tensors.reserve(len);
+    for (Py_ssize_t i = 0; i < len; ++i) {
+      PyObject* elem = PySequence_Fast_GET_ITEM(feed_values, i);
+      if (!elem) {
+        Set_TF_Status_from_Status(
+            out_status, errors::Internal("Could not get feed value ", i));
+        return;
+      }
+      Tensor t;
+      s = NdarrayToTensor(elem, &t);
+      if (!s.ok()) {
+        Set_TF_Status_from_Status(out_status, s);
+        return;
+      }
+      input_tensors.push_back(std::move(t));
+    }
+  }
+
+  // Allocate a RunMetadata protobuf object to receive the metadata,
+  // if the caller is expecting any.
+  std::unique_ptr<RunMetadata> run_metadata_proto;
+  if (run_metadata != nullptr) {
+    run_metadata_proto.reset(new RunMetadata);
+  }
+
+  // Run the callable.
+  std::vector<Tensor> output_tensors;
+  Py_BEGIN_ALLOW_THREADS;
+  s = session->RunCallable(handle, input_tensors, &output_tensors,
+                           run_metadata_proto.get());
+  Py_END_ALLOW_THREADS;
+
+  if (!s.ok()) {
+    Set_TF_Status_from_Status(out_status, s);
+    return;
+  }
+
+  // If requested, serialize the RunMetadata to pass it back to the caller.
+  if (run_metadata != nullptr) {
+    s = MessageToBuffer(*run_metadata_proto, run_metadata);
+    if (!s.ok()) {
+      Set_TF_Status_from_Status(out_status, s);
+      return;
+    }
+  }
+
+  // Convert results to NumPy arrays. Since this can fail, stage the
+  // results via a safe container that takes care of decreasing the
+  // reference count on failure.
+  std::vector<Safe_PyObjectPtr> py_outputs_safe;
+  py_outputs_safe.reserve(output_tensors.size());
+  for (const Tensor& output : output_tensors) {
+    PyObject* py_array;
+    s = TensorToNdarray(output, &py_array);
+    if (!s.ok()) {
+      Set_TF_Status_from_Status(out_status, s);
+      return;
+    }
+    py_outputs_safe.push_back(make_safe(py_array));
+  }
+
+  // If we reach this point, we have successfully built a list of objects
+  // so we can release them from the safe container.
+  out_values->reserve(py_outputs_safe.size());
+  for (auto& output : py_outputs_safe) {
+    out_values->push_back(output.release());
+  }
+}
+}  // namespace
+
+void TF_DeprecatedSessionRunCallable(TF_DeprecatedSession* session,
+                                     int64_t handle, PyObject* feed_values,
+                                     TF_Status* out_status,
+                                     PyObjectVector* out_values,
+                                     TF_Buffer* run_metadata) {
+  RunCallableHelper(session->session, handle, feed_values, out_status,
+                    out_values, run_metadata);
+  ClearDecrefCache();
+}
+void TF_SessionRunCallable(TF_Session* session, int64_t handle,
+                           PyObject* feed_values, TF_Status* out_status,
+                           PyObjectVector* out_values,
+                           TF_Buffer* run_metadata) {
+  RunCallableHelper(session->session, handle, feed_values, out_status,
+                    out_values, run_metadata);
+  ClearDecrefCache();
+}
+
+void TF_DeprecatedSessionReleaseCallable(TF_DeprecatedSession* session,
+                                         int64_t handle,
+                                         TF_Status* out_status) {
+  Set_TF_Status_from_Status(out_status,
+                            session->session->ReleaseCallable(handle));
+}
+void TF_SessionReleaseCallable(TF_Session* session, int64_t handle,
+                               TF_Status* out_status) {
+  Set_TF_Status_from_Status(out_status,
+                            session->session->ReleaseCallable(handle));
+}
+
 // Wrapper for TF_PRunSetup that converts the arguments to appropriate types.
 // If *out_status is OK, the caller becomes the owner of *out_handle.
 void TF_PRunSetup_wrapper(TF_DeprecatedSession* session,
@@ -400,6 +550,15 @@ std::vector<TF_Operation*> TF_OperationGetControlInputs_wrapper(
   return control_inputs;
 }
 
+std::vector<TF_Operation*> TF_OperationGetControlOutputs_wrapper(
+    TF_Operation* oper) {
+  std::vector<TF_Operation*> control_outputs(
+      TF_OperationNumControlOutputs(oper));
+  TF_OperationGetControlOutputs(oper, control_outputs.data(),
+                                control_outputs.size());
+  return control_outputs;
+}
+
 std::vector<const char*> TF_OperationOutputConsumers_wrapper(
     TF_Output oper_out) {
   int num_consumers = TF_OperationOutputNumConsumers(oper_out);
diff --git a/tensorflow/python/client/tf_session_helper.h b/tensorflow/python/client/tf_session_helper.h
index 83318dc178f6da3828a8dc41e81b7fc3e2e19e22..d2b4abc476ea79aa1afc2cf480b887f97903ab6b 100644
--- a/tensorflow/python/client/tf_session_helper.h
+++ b/tensorflow/python/client/tf_session_helper.h
@@ -59,6 +59,31 @@ void TF_Run_wrapper(TF_DeprecatedSession* session, const TF_Buffer* run_options,
                     const NameVector& target_nodes, TF_Status* out_status,
                     PyObjectVector* out_values, TF_Buffer* run_outputs);
 
+// Python wrappers for the `Session::MakeCallable()` API.
+void TF_DeprecatedSessionMakeCallable(TF_DeprecatedSession* session,
+                                      const TF_Buffer* callable_options,
+                                      int64_t* out_handle,
+                                      TF_Status* out_status);
+void TF_SessionMakeCallable(TF_Session* session,
+                            const TF_Buffer* callable_options,
+                            int64_t* out_handle, TF_Status* out_status);
+
+// Python wrappers for the `Session::RunCallable()` API.
+void TF_DeprecatedSessionRunCallable(TF_DeprecatedSession* session,
+                                     int64_t handle, PyObject* feed_values,
+                                     TF_Status* out_status,
+                                     PyObjectVector* out_values,
+                                     TF_Buffer* run_metadata);
+void TF_SessionRunCallable(TF_Session* session, int64_t handle,
+                           PyObject* feed_values, TF_Status* out_status,
+                           PyObjectVector* out_values, TF_Buffer* run_metadata);
+
+// Python wrappers for the `Session::ReleaseCallable()` API.
+void TF_DeprecatedSessionReleaseCallable(TF_DeprecatedSession* session,
+                                         int64_t handle, TF_Status* out_status);
+void TF_SessionReleaseCallable(TF_Session* session, int64_t handle,
+                               TF_Status* out_status);
+
 // Set up the graph with the intended feeds and fetches for partial run.
 // *out_handle is owned by the caller.
 //
@@ -111,8 +136,7 @@ string EqualAttrValueWrapper(const string& actual, const string& expected);
 //
 // If shape is unknown, sets unknown_shape to true.
 tensorflow::gtl::InlinedVector<int64_t, 6> TF_GraphGetTensorShapeHelper(
-    TF_Graph* graph, TF_Output output, TF_Status* out_status,
-    bool* unknown_shape);
+    TF_Graph* graph, TF_Output output, TF_Status* status, bool* unknown_shape);
 
 // Runs the graph associated with the session starting with the supplied inputs.
 // On success, `py_outputs` is populated with a numpy ndarray for each output
@@ -124,7 +148,7 @@ void TF_SessionRun_wrapper(TF_Session* session, const TF_Buffer* run_options,
                            const std::vector<PyObject*>& input_ndarrays,
                            const std::vector<TF_Output>& outputs,
                            const std::vector<TF_Operation*>& targets,
-                           TF_Buffer* run_metadata, TF_Status* out_status,
+                           TF_Buffer* run_metadata, TF_Status* status,
                            std::vector<PyObject*>* py_outputs);
 
 // Set up the graph with the intended feeds (inputs) and fetches (output) for
@@ -140,8 +164,7 @@ void TF_SessionPRunSetup_wrapper(TF_Session* session,
                                  const std::vector<TF_Output>& inputs,
                                  const std::vector<TF_Output>& outputs,
                                  const std::vector<TF_Operation*>& targets,
-                                 const char** out_handle,
-                                 TF_Status* out_status);
+                                 const char** out_handle, TF_Status* status);
 
 // Continue to run the graph with additional feeds and fetches. The
 // execution state is uniquely identified by the handle.
@@ -157,7 +180,7 @@ void TF_SessionPRun_wrapper(TF_Session* session, const char* handle,
                             const std::vector<TF_Output>& inputs,
                             const std::vector<PyObject*>& input_ndarrays,
                             const std::vector<TF_Output>& outputs,
-                            TF_Status* out_status,
+                            TF_Status* status,
                             std::vector<PyObject*>* py_outputs);
 
 // Retrieves the inputs of this operation.
@@ -167,6 +190,10 @@ std::vector<TF_Output> GetOperationInputs(TF_Operation* oper);
 std::vector<TF_Operation*> TF_OperationGetControlInputs_wrapper(
     TF_Operation* oper);
 
+// Retrieves the control outputs of this operation.
+std::vector<TF_Operation*> TF_OperationGetControlOutputs_wrapper(
+    TF_Operation* oper);
+
 // Retrieves the op names of the consumers of `oper_out`. The returned strings
 // have the lifetime of the underlying TF_Graph.
 std::vector<const char*> TF_OperationOutputConsumers_wrapper(
@@ -179,7 +206,7 @@ TF_Function* TF_GraphToFunction_wrapper(
     const std::vector<TF_Operation*>* opers,
     const std::vector<TF_Output>& inputs, const std::vector<TF_Output>& outputs,
     const NameVector& output_names, const TF_FunctionOptions* opts,
-    const char* description, TF_Status* out_status);
+    const char* description, TF_Status* status);
 
 // Set the shapes and types for the output's handle.
 //
diff --git a/tensorflow/python/client/timeline_test.py b/tensorflow/python/client/timeline_test.py
index 9641b8b7f2735e2e0477aec59edd539e999fa969..c046e9cfd45d7d7677a1dbab0a7168e526c89bca 100644
--- a/tensorflow/python/client/timeline_test.py
+++ b/tensorflow/python/client/timeline_test.py
@@ -24,6 +24,7 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.client import timeline
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
@@ -155,9 +156,10 @@ class TimelineTest(test.TestCase):
     ctf = step_analysis.chrome_trace.format_to_string()
     self._validateTrace(ctf)
     maximums = step_analysis.allocator_maximums
-    self.assertTrue('cpu' in maximums)
+    cpuname = 'mklcpu' if test_util.IsMklEnabled() else 'cpu'
+    self.assertTrue(cpuname in maximums)
     cpu_max = maximums[
-        'cuda_host_bfc'] if 'cuda_host_bfc' in maximums else maximums['cpu']
+        'cuda_host_bfc'] if 'cuda_host_bfc' in maximums else maximums[cpuname]
     # At least num1 + num2, both float32s (4 bytes each)
     self.assertGreater(cpu_max.num_bytes, 8)
     self.assertGreater(cpu_max.timestamp, 0)
diff --git a/tensorflow/python/data/BUILD b/tensorflow/python/data/BUILD
index b5bee36dcdfd463056d0e883acb3c701509b1eee..3e08c1587e3e0df70e3cd5be58d24103c4a78339 100644
--- a/tensorflow/python/data/BUILD
+++ b/tensorflow/python/data/BUILD
@@ -15,15 +15,3 @@ py_library(
         "//tensorflow/python/data/ops:readers",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/data/kernel_tests/BUILD b/tensorflow/python/data/kernel_tests/BUILD
index 8b8adefa65a5c54d40bc28d8f50953513cfd3605..ed0c11e6c117dcbb810fd3acfc484128ed3519fa 100644
--- a/tensorflow/python/data/kernel_tests/BUILD
+++ b/tensorflow/python/data/kernel_tests/BUILD
@@ -367,15 +367,3 @@ tf_py_test(
         "no_windows",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/data/kernel_tests/cache_dataset_op_test.py b/tensorflow/python/data/kernel_tests/cache_dataset_op_test.py
index 02720a2e985914d3a6774dc6f64d1316890c46bf..25269dc810ae2e3107f8b5317496a35a8ff59d0c 100644
--- a/tensorflow/python/data/kernel_tests/cache_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/cache_dataset_op_test.py
@@ -297,6 +297,21 @@ class MemoryCacheDatasetTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(i2.get_next())
 
+  def testCacheTakeRepeat(self):
+    dataset = dataset_ops.Dataset.range(10).cache().take(5).repeat(2)
+    itr = dataset.make_one_shot_iterator()
+    n = itr.get_next()
+
+    expected_values = [0, 1, 2, 3, 4, 0, 1, 2, 3, 4]
+
+    with self.test_session() as sess:
+      for i, expected in enumerate(expected_values):
+        self.assertEqual(expected, sess.run(n),
+                         "Unexpected value at index %s" % i)
+
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(itr.get_next())
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/kernel_tests/iterator_ops_test.py b/tensorflow/python/data/kernel_tests/iterator_ops_test.py
index 4a14a915bdb33f1ac6e8fc1839b32bc81fa8de05..0af282a02475384cb2d0f8e273324d6211e1b50d 100644
--- a/tensorflow/python/data/kernel_tests/iterator_ops_test.py
+++ b/tensorflow/python/data/kernel_tests/iterator_ops_test.py
@@ -28,6 +28,7 @@ from tensorflow.python.client import session
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.data.ops import readers
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -717,6 +718,14 @@ class IteratorTest(test.TestCase):
       self.assertTrue(
           iterator_ops.GET_NEXT_CALL_WARNING_MESSAGE in str(warning.message))
 
+  def testEagerIteratorAsync(self):
+    with context.eager_mode(), context.execution_mode(context.ASYNC):
+      val = 0
+      dataset = dataset_ops.Dataset.range(10)
+      for foo in dataset:
+        self.assertEqual(val, foo.numpy())
+        val += 1
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/ops/BUILD b/tensorflow/python/data/ops/BUILD
index 3119ab003794cb9bc0c748dfeb47597e0877f5fd..fa2e86eab18b0b97ea01a96e309b0ea82d91b267 100644
--- a/tensorflow/python/data/ops/BUILD
+++ b/tensorflow/python/data/ops/BUILD
@@ -59,15 +59,3 @@ py_library(
         "//tensorflow/python/eager:context",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index 390ce852b1337bf3ae387660e4b83b0b678f737f..8729e085a32f6df87ba9feb515ccfac6a105cfef 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -563,7 +563,7 @@ class Dataset(object):
 
     Args:
       buffer_size: A `tf.int64` scalar `tf.Tensor`, representing the
-        maximum number elements that will be buffered when prefetching.
+        maximum number of elements that will be buffered when prefetching.
 
     Returns:
       Dataset: A `Dataset`.
@@ -1950,47 +1950,13 @@ class FlatMapDataset(Dataset):
     return self._output_types
 
 
-class InterleaveDataset(Dataset):
+class InterleaveDataset(FlatMapDataset):
   """A `Dataset` that maps a function over its input and interleaves the result.
   """
 
   def __init__(self, input_dataset, map_func, cycle_length, block_length):
     """See `Dataset.interleave()` for details."""
-    super(InterleaveDataset, self).__init__()
-    self._input_dataset = input_dataset
-
-    @function.Defun(*nest.flatten(
-        sparse.as_dense_types(input_dataset.output_types,
-                              input_dataset.output_classes)))
-    def tf_map_func(*args):
-      """A wrapper for Defun that facilitates shape inference."""
-      # Pass in shape information from the input_dataset.
-      dense_shapes = sparse.as_dense_shapes(input_dataset.output_shapes,
-                                            input_dataset.output_classes)
-      for arg, shape in zip(args, nest.flatten(dense_shapes)):
-        arg.set_shape(shape)
-
-      nested_args = nest.pack_sequence_as(input_dataset.output_types, args)
-      nested_args = sparse.deserialize_sparse_tensors(
-          nested_args, input_dataset.output_types, input_dataset.output_shapes,
-          input_dataset.output_classes)
-      if _should_unpack_args(nested_args):
-        dataset = map_func(*nested_args)
-      else:
-        dataset = map_func(nested_args)
-
-      if not isinstance(dataset, Dataset):
-        raise TypeError("`map_func` must return a `Dataset` object.")
-
-      self._output_classes = dataset.output_classes
-      self._output_types = dataset.output_types
-      self._output_shapes = dataset.output_shapes
-
-      return dataset._as_variant_tensor()  # pylint: disable=protected-access
-
-    self._map_func = tf_map_func
-    self._map_func.add_to_graph(ops.get_default_graph())
-
+    super(InterleaveDataset, self).__init__(input_dataset, map_func)
     self._cycle_length = ops.convert_to_tensor(
         cycle_length, dtype=dtypes.int64, name="cycle_length")
     self._block_length = ops.convert_to_tensor(
@@ -1999,27 +1965,15 @@ class InterleaveDataset(Dataset):
   def _as_variant_tensor(self):
     return gen_dataset_ops.interleave_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        self._map_func.captured_inputs,
+        self._map_func.captured_inputs,  # pylint: disable=protected-access
         self._cycle_length,
         self._block_length,
-        f=self._map_func,
+        f=self._map_func,  # pylint: disable=protected-access
         output_types=nest.flatten(
             sparse.as_dense_types(self.output_types, self.output_classes)),
         output_shapes=nest.flatten(
             sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
 
-  @property
-  def output_classes(self):
-    return self._output_classes
-
-  @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_types(self):
-    return self._output_types
-
 
 class FilterDataset(Dataset):
   """A `Dataset` that filters its input according to a predicate function."""
@@ -2089,6 +2043,8 @@ class PrefetchDataset(Dataset):
     """See `Dataset.prefetch()` for details."""
     super(PrefetchDataset, self).__init__()
     self._input_dataset = input_dataset
+    if buffer_size is None:
+      buffer_size = -1  # This is the sentinel for auto-tuning.
     self._buffer_size = ops.convert_to_tensor(
         buffer_size, dtype=dtypes.int64, name="buffer_size")
 
diff --git a/tensorflow/python/data/ops/iterator_ops.py b/tensorflow/python/data/ops/iterator_ops.py
index d79b9d6011b6ebd00a47d572165cdbba8a31bd32..0c76afd29d4626be9120c059d60218daab5cc0ac 100644
--- a/tensorflow/python/data/ops/iterator_ops.py
+++ b/tensorflow/python/data/ops/iterator_ops.py
@@ -488,23 +488,27 @@ class EagerIterator(object):
   def _next_internal(self):
     """Returns a nested structure of `tf.Tensor`s containing the next element.
     """
-    with ops.device(self._device):
-      # TODO(ashankar): Consider removing this ops.device() contextmanager
-      # and instead mimic ops placement in graphs: Operations on resource
-      # handles execute on the same device as where the resource is placed.
-      # NOTE(mrry): Here we use the "_sync" variant of `iterator_get_next`
-      # because in eager mode this code will run synchronously on the calling
-      # thread. Therefore we do not need to make a defensive context switch
-      # to a background thread, and can achieve a small constant performance
-      # boost by invoking the iterator synchronously.
-      ret = gen_dataset_ops.iterator_get_next_sync(
-          self._resource,
-          output_types=self._flat_output_types,
-          output_shapes=self._flat_output_shapes)
-
-    return sparse.deserialize_sparse_tensors(
-        nest.pack_sequence_as(self._output_types, ret), self._output_types,
-        self._output_shapes, self._output_classes)
+    # This runs in sync mode as iterators use an error status to communicate
+    # that there is no more data to iterate over.
+    # TODO(b/77291417): Fix
+    with context.execution_mode(context.SYNC):
+      with ops.device(self._device):
+        # TODO(ashankar): Consider removing this ops.device() contextmanager
+        # and instead mimic ops placement in graphs: Operations on resource
+        # handles execute on the same device as where the resource is placed.
+        # NOTE(mrry): Here we use the "_sync" variant of `iterator_get_next`
+        # because in eager mode this code will run synchronously on the calling
+        # thread. Therefore we do not need to make a defensive context switch
+        # to a background thread, and can achieve a small constant performance
+        # boost by invoking the iterator synchronously.
+        ret = gen_dataset_ops.iterator_get_next_sync(
+            self._resource,
+            output_types=self._flat_output_types,
+            output_shapes=self._flat_output_shapes)
+
+      return sparse.deserialize_sparse_tensors(
+          nest.pack_sequence_as(self._output_types, ret), self._output_types,
+          self._output_shapes, self._output_classes)
 
   def next(self):
     """Returns a nested structure of `tf.Tensor`s containing the next element.
diff --git a/tensorflow/python/data/ops/readers.py b/tensorflow/python/data/ops/readers.py
index 6c493d8163b051b2e724335923d7b4c721523083..fe033f5546498d57dd98289d2cda1a8bbb1c7822 100644
--- a/tensorflow/python/data/ops/readers.py
+++ b/tensorflow/python/data/ops/readers.py
@@ -22,7 +22,6 @@ from tensorflow.python.data.util import convert
 from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import sparse
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
@@ -121,51 +120,14 @@ class _TFRecordDataset(dataset_ops.Dataset):
     return dtypes.string
 
 
-class ParallelInterleaveDataset(dataset_ops.Dataset):
+class ParallelInterleaveDataset(dataset_ops.InterleaveDataset):
   """A `Dataset` that maps a function over its input and flattens the result."""
 
   def __init__(self, input_dataset, map_func, cycle_length, block_length,
                sloppy, buffer_output_elements, prefetch_input_elements):
     """See `tf.contrib.data.parallel_interleave()` for details."""
-    super(ParallelInterleaveDataset, self).__init__()
-    self._input_dataset = input_dataset
-
-    @function.Defun(*nest.flatten(
-        sparse.as_dense_types(input_dataset.output_types,
-                              input_dataset.output_classes)))
-    def tf_map_func(*args):
-      """A wrapper for Defun that facilitates shape inference."""
-      # Pass in shape information from the input_dataset.
-      dense_shapes = sparse.as_dense_shapes(input_dataset.output_shapes,
-                                            input_dataset.output_classes)
-      for arg, shape in zip(args, nest.flatten(dense_shapes)):
-        arg.set_shape(shape)
-
-      nested_args = nest.pack_sequence_as(input_dataset.output_types, args)
-      nested_args = sparse.deserialize_sparse_tensors(
-          nested_args, input_dataset.output_types, input_dataset.output_shapes,
-          input_dataset.output_classes)
-      if dataset_ops._should_unpack_args(nested_args):  # pylint: disable=protected-access
-        dataset = map_func(*nested_args)
-      else:
-        dataset = map_func(nested_args)
-
-      if not isinstance(dataset, dataset_ops.Dataset):
-        raise TypeError("`map_func` must return a `Dataset` object.")
-
-      self._output_classes = dataset.output_classes
-      self._output_types = dataset.output_types
-      self._output_shapes = dataset.output_shapes
-
-      return dataset._as_variant_tensor()  # pylint: disable=protected-access
-
-    self._map_func = tf_map_func
-    self._map_func.add_to_graph(ops.get_default_graph())
-
-    self._cycle_length = ops.convert_to_tensor(
-        cycle_length, dtype=dtypes.int64, name="cycle_length")
-    self._block_length = ops.convert_to_tensor(
-        block_length, dtype=dtypes.int64, name="block_length")
+    super(ParallelInterleaveDataset, self).__init__(input_dataset, map_func,
+                                                    cycle_length, block_length)
     self._sloppy = ops.convert_to_tensor(
         sloppy, dtype=dtypes.bool, name="sloppy")
     self._buffer_output_elements = convert.optional_param_to_tensor(
@@ -178,8 +140,9 @@ class ParallelInterleaveDataset(dataset_ops.Dataset):
         argument_default=2 * cycle_length)
 
   def _as_variant_tensor(self):
+    # pylint: disable=protected-access
     return gen_dataset_ops.parallel_interleave_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+        self._input_dataset._as_variant_tensor(),
         self._map_func.captured_inputs,
         self._cycle_length,
         self._block_length,
@@ -191,18 +154,7 @@ class ParallelInterleaveDataset(dataset_ops.Dataset):
             sparse.as_dense_types(self.output_types, self.output_classes)),
         output_shapes=nest.flatten(
             sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
-
-  @property
-  def output_classes(self):
-    return self._output_classes
-
-  @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_types(self):
-    return self._output_types
+    # pylint: enable=protected-access
 
 
 @tf_export("data.TFRecordDataset")
diff --git a/tensorflow/python/data/util/BUILD b/tensorflow/python/data/util/BUILD
index b1bdbdab37b63667b475c732df7a47d9e57f2b19..0fc32d51b9fe581a54519139f3bf12118f8f4028 100644
--- a/tensorflow/python/data/util/BUILD
+++ b/tensorflow/python/data/util/BUILD
@@ -109,15 +109,3 @@ py_test(
         "//tensorflow/python:util",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/debug/BUILD b/tensorflow/python/debug/BUILD
index c60f6923900fb1c43e6028c844260ee2aa981ff2..250b4b1b6ab983c8073b5de3d2d29d02a50c71a8 100644
--- a/tensorflow/python/debug/BUILD
+++ b/tensorflow/python/debug/BUILD
@@ -921,6 +921,7 @@ py_test(
     size = "small",
     srcs = ["cli/profile_analyzer_cli_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":debugger_cli_common",
         ":profile_analyzer_cli",
@@ -1096,15 +1097,3 @@ sh_test(
         ":offline_analyzer",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/debug/cli/analyzer_cli.py b/tensorflow/python/debug/cli/analyzer_cli.py
index 156afdfd4c44f2f1a07ffdd1e68ad48bbbe31cba..9a47cd12b47b35d0a85cfc1a211fdfee7cfa25bc 100644
--- a/tensorflow/python/debug/cli/analyzer_cli.py
+++ b/tensorflow/python/debug/cli/analyzer_cli.py
@@ -185,6 +185,15 @@ class DebugAnalyzer(object):
         type=str,
         default="",
         help="List only Tensors passing the filter of the specified name")
+    ap.add_argument(
+        "-fenn",
+        "--filter_exclude_node_names",
+        dest="filter_exclude_node_names",
+        type=str,
+        default="",
+        help="When applying the tensor filter, exclude node with names "
+        "matching the regular expression. Applicable only if --tensor_filter "
+        "or -f is used.")
     ap.add_argument(
         "-n",
         "--node_name_filter",
@@ -484,6 +493,10 @@ class DebugAnalyzer(object):
 
     Returns:
       Output text lines as a RichTextLines object.
+
+    Raises:
+      ValueError: If `--filter_exclude_node_names` is used without `-f` or
+        `--tensor_filter` being used.
     """
 
     # TODO(cais): Add annotations of substrings for dumped tensor names, to
@@ -520,8 +533,15 @@ class DebugAnalyzer(object):
         _add_main_menu(output, node_name=None, enable_list_tensors=False)
         return output
 
-      data_to_show = self._debug_dump.find(filter_callable)
+      data_to_show = self._debug_dump.find(
+          filter_callable,
+          exclude_node_names=parsed.filter_exclude_node_names)
     else:
+      if parsed.filter_exclude_node_names:
+        raise ValueError(
+            "The flag --filter_exclude_node_names is valid only when "
+            "the flag -f or --tensor_filter is used.")
+
       data_to_show = self._debug_dump.dumped_tensor_data
 
     # TODO(cais): Implement filter by lambda on tensor value.
diff --git a/tensorflow/python/debug/cli/analyzer_cli_test.py b/tensorflow/python/debug/cli/analyzer_cli_test.py
index 6b110fda9eba301f298e84b63d091bb300549bee..55231954d1c8ea987bbf87755dfde83d5efd03f0 100644
--- a/tensorflow/python/debug/cli/analyzer_cli_test.py
+++ b/tensorflow/python/debug/cli/analyzer_cli_test.py
@@ -820,6 +820,32 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
         op_type_regex="(Add|MatMul)")
     check_main_menu(self, out, list_tensors_enabled=False)
 
+  def testListTensorWithFilterAndNodeNameExclusionWorks(self):
+    # First, create and register the filter.
+    def is_2x1_vector(datum, tensor):
+      del datum  # Unused.
+      return list(tensor.shape) == [2, 1]
+    self._analyzer.add_tensor_filter("is_2x1_vector", is_2x1_vector)
+
+    # Use shorthand alias for the command prefix.
+    out = self._registry.dispatch_command(
+        "lt", ["-f", "is_2x1_vector", "--filter_exclude_node_names", ".*v.*"])
+
+    # If the --filter_exclude_node_names were not used, then the matching
+    # tensors would be:
+    #   - simple_mul_add/v:0
+    #   - simple_mul_add/v/read:0
+    #   - simple_mul_add/matmul:0
+    #   - simple_mul_add/add:0
+    #
+    # With the --filter_exclude_node_names option, only the last two should
+    # show up in the result.
+    assert_listed_tensors(
+        self,
+        out, ["simple_mul_add/matmul:0", "simple_mul_add/add:0"],
+        ["MatMul", "Add"], tensor_filter_name="is_2x1_vector")
+    check_main_menu(self, out, list_tensors_enabled=False)
+
   def testListTensorsFilterNanOrInf(self):
     """Test register and invoke a tensor filter."""
 
diff --git a/tensorflow/python/debug/cli/curses_ui.py b/tensorflow/python/debug/cli/curses_ui.py
index bb52f9051250625836b0d7a0f8e30265d9b34e92..f66cefb427c9ccfa0769655415193e8d2535e53c 100644
--- a/tensorflow/python/debug/cli/curses_ui.py
+++ b/tensorflow/python/debug/cli/curses_ui.py
@@ -1185,6 +1185,22 @@ class CursesUI(base_ui.BaseUI):
       self._main_menu = None
       self._main_menu_pad = None
 
+  def _pad_line_end_with_whitespace(self, pad, row, line_end_x):
+    """Pad the whitespace at the end of a line with the default color pair.
+
+    Prevents spurious color pairs from appearing at the end of the lines in
+    certain text terimnals.
+
+    Args:
+      pad: The curses pad object to operate on.
+      row: (`int`) row index.
+      line_end_x: (`int`) column index of the end of the line (beginning of
+        the whitespace).
+    """
+    if line_end_x < self._max_x - 2:
+      pad.addstr(row, line_end_x, " " * (self._max_x - 3 - line_end_x),
+                 self._default_color_pair)
+
   def _screen_add_line_to_output_pad(self, pad, row, txt, color_segments=None):
     """Render a line in a text pad.
 
@@ -1208,6 +1224,7 @@ class CursesUI(base_ui.BaseUI):
 
     if not color_segments:
       pad.addstr(row, 0, txt, self._default_color_pair)
+      self._pad_line_end_with_whitespace(pad, row, len(txt))
       return
 
     if not isinstance(color_segments, list):
@@ -1248,6 +1265,8 @@ class CursesUI(base_ui.BaseUI):
     for segment, color_pair in zip(all_segments, all_color_pairs):
       if segment[1] < self._max_x:
         pad.addstr(row, segment[0], txt[segment[0]:segment[1]], color_pair)
+    if all_segments:
+      self._pad_line_end_with_whitespace(pad, row, all_segments[-1][1])
 
   def _screen_scroll_output_pad(self, pad, viewport_top, viewport_left,
                                 screen_location_top, screen_location_left,
diff --git a/tensorflow/python/debug/lib/debug_data.py b/tensorflow/python/debug/lib/debug_data.py
index 8d355aa27f6fa10a1889420a9087800be12a81ce..8a65ad087b3002d8ad93f3a64f48715d26ff62d8 100644
--- a/tensorflow/python/debug/lib/debug_data.py
+++ b/tensorflow/python/debug/lib/debug_data.py
@@ -23,6 +23,7 @@ import glob
 import json
 import os
 import platform
+import re
 
 import numpy as np
 import six
@@ -1411,7 +1412,11 @@ class DebugDumpDir(object):
 
     return self._watch_key_to_datum[device_name].get(debug_watch_key, [])
 
-  def find(self, predicate, first_n=0, device_name=None):
+  def find(self,
+           predicate,
+           first_n=0,
+           device_name=None,
+           exclude_node_names=None):
     """Find dumped tensor data by a certain predicate.
 
     Args:
@@ -1430,17 +1435,24 @@ class DebugDumpDir(object):
         time order) for which the predicate returns True. To return all the
         `DebugTensotDatum` instances, let first_n be <= 0.
       device_name: optional device name.
+      exclude_node_names: Optional regular expression to exclude nodes with
+        names matching the regular expression.
 
     Returns:
       A list of all `DebugTensorDatum` objects in this `DebugDumpDir` object
        for which predicate returns True, sorted in ascending order of the
        timestamp.
     """
+    if exclude_node_names:
+      exclude_node_names = re.compile(exclude_node_names)
 
     matched_data = []
     for device in (self._dump_tensor_data if device_name is None
                    else (self._dump_tensor_data[device_name],)):
       for datum in self._dump_tensor_data[device]:
+        if exclude_node_names and exclude_node_names.match(datum.node_name):
+          continue
+
         if predicate(datum, datum.get_tensor()):
           matched_data.append(datum)
 
diff --git a/tensorflow/python/debug/lib/session_debug_testlib.py b/tensorflow/python/debug/lib/session_debug_testlib.py
index f4fac1401918ccacd38aae5ad2ef8d686c9204b9..070d9c4cd7094c81b18192e75885ae6dd6729cbf 100644
--- a/tensorflow/python/debug/lib/session_debug_testlib.py
+++ b/tensorflow/python/debug/lib/session_debug_testlib.py
@@ -669,6 +669,55 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
       self.assertEqual(1, len(first_bad_datum))
       self.assertEqual(x_name, first_bad_datum[0].node_name)
 
+  def testFindInfOrNanWithOpNameExclusion(self):
+    with session.Session() as sess:
+      u_name = "testFindInfOrNanWithOpNameExclusion/u"
+      v_name = "testFindInfOrNanWithOpNameExclusion/v"
+      w_name = "testFindInfOrNanWithOpNameExclusion/w"
+      x_name = "testFindInfOrNanWithOpNameExclusion/x"
+      y_name = "testFindInfOrNanWithOpNameExclusion/y"
+      z_name = "testFindInfOrNanWithOpNameExclusion/z"
+
+      u_init = constant_op.constant([2.0, 4.0])
+      u = variables.Variable(u_init, name=u_name)
+      v_init = constant_op.constant([2.0, 1.0])
+      v = variables.Variable(v_init, name=v_name)
+
+      # Expected output: [0.0, 3.0]
+      w = math_ops.subtract(u, v, name=w_name)
+
+      # Expected output: [inf, 1.3333]
+      x = math_ops.div(u, w, name=x_name)
+
+      # Expected output: [nan, 4.0]
+      y = math_ops.multiply(w, x, name=y_name)
+
+      z = math_ops.multiply(y, y, name=z_name)
+
+      u.initializer.run()
+      v.initializer.run()
+
+      _, dump = self._debug_run_and_get_dump(
+          sess, z,
+          expected_partition_graph_count=self._expected_partition_graph_count)
+
+      # Find all "offending tensors".
+      bad_data = dump.find(debug_data.has_inf_or_nan,
+                           exclude_node_names=".*/x$")
+
+      # Verify that the nodes with bad values are caught through running find
+      # on the debug dump.
+      self.assertEqual(2, len(bad_data))
+      # Assert that the node `x` should have been excluded.
+      self.assertEqual(y_name, bad_data[0].node_name)
+      self.assertEqual(z_name, bad_data[1].node_name)
+
+      first_bad_datum = dump.find(
+          debug_data.has_inf_or_nan, first_n=1, exclude_node_names=".*/x$")
+
+      self.assertEqual(1, len(first_bad_datum))
+      self.assertEqual(y_name, first_bad_datum[0].node_name)
+
   def _session_run_for_graph_structure_lookup(self):
     with session.Session(config=no_rewrite_session_config()) as sess:
       u_name = "testDumpGraphStructureLookup/u"
diff --git a/tensorflow/python/debug/wrappers/local_cli_wrapper.py b/tensorflow/python/debug/wrappers/local_cli_wrapper.py
index 1465cb72950c8fa6a453ebd4290bbf6382173ff8..c8625655e51a43a222addedd4beecdd3515d7fb6 100644
--- a/tensorflow/python/debug/wrappers/local_cli_wrapper.py
+++ b/tensorflow/python/debug/wrappers/local_cli_wrapper.py
@@ -115,6 +115,7 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
     #   unavailable (i.e., is None), the run-start CLI will be launched to ask
     #   the user. This is the case, e.g., right before the first run starts.
     self._active_tensor_filter = None
+    self._active_filter_exclude_node_names = None
     self._active_tensor_filter_run_start_response = None
     self._run_through_times = 1
     self._skip_debug = False
@@ -148,6 +149,15 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
         type=str,
         default="",
         help="Run until a tensor in the graph passes the specified filter.")
+    ap.add_argument(
+        "-fenn",
+        "--filter_exclude_node_names",
+        dest="filter_exclude_node_names",
+        type=str,
+        default="",
+        help="When applying the tensor filter, exclude node with names "
+        "matching the regular expression. Applicable only if --tensor_filter "
+        "or -f is used.")
     ap.add_argument(
         "--node_name_filter",
         dest="node_name_filter",
@@ -324,9 +334,11 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
       debug_dump.set_python_graph(self._sess.graph)
 
       passed_filter = None
+      passed_filter_exclude_node_names = None
       if self._active_tensor_filter:
         if not debug_dump.find(
-            self._tensor_filters[self._active_tensor_filter], first_n=1):
+            self._tensor_filters[self._active_tensor_filter], first_n=1,
+            exclude_node_names=self._active_filter_exclude_node_names):
           # No dumped tensor passes the filter in this run. Clean up the dump
           # directory and move on.
           self._remove_dump_root()
@@ -334,10 +346,14 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
         else:
           # Some dumped tensor(s) from this run passed the filter.
           passed_filter = self._active_tensor_filter
+          passed_filter_exclude_node_names = (
+              self._active_filter_exclude_node_names)
           self._active_tensor_filter = None
+          self._active_filter_exclude_node_names = None
 
       self._prep_debug_cli_for_run_end(
-          debug_dump, request.tf_error, passed_filter)
+          debug_dump, request.tf_error, passed_filter,
+          passed_filter_exclude_node_names)
 
       self._run_start_response = self._launch_cli()
 
@@ -358,7 +374,11 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
     if os.path.isdir(self._dump_root):
       shutil.rmtree(self._dump_root)
 
-  def _prep_debug_cli_for_run_end(self, debug_dump, tf_error, passed_filter):
+  def _prep_debug_cli_for_run_end(self,
+                                  debug_dump,
+                                  tf_error,
+                                  passed_filter,
+                                  passed_filter_exclude_node_names):
     """Prepare (but not launch) CLI for run-end, with debug dump from the run.
 
     Args:
@@ -368,6 +388,9 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
         (if any).
       passed_filter: (None or str) Name of the tensor filter that just passed
         and caused the preparation of this run-end CLI (if any).
+      passed_filter_exclude_node_names: (None or str) Regular expression used
+        with the tensor filter to exclude ops with names matching the regular
+        expresssion.
     """
 
     if tf_error:
@@ -383,6 +406,9 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
       if passed_filter is not None:
         # Some dumped tensor(s) from this run passed the filter.
         self._init_command = "lt -f %s" % passed_filter
+        if passed_filter_exclude_node_names:
+          self._init_command += (" --filter_exclude_node_names %s" %
+                                 passed_filter_exclude_node_names)
         self._title_color = "red_on_white"
 
     self._run_cli = analyzer_cli.create_analyzer_ui(
@@ -496,6 +522,11 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
     parsed.op_type_filter = parsed.op_type_filter or None
     parsed.tensor_dtype_filter = parsed.tensor_dtype_filter or None
 
+    if parsed.filter_exclude_node_names and not parsed.till_filter_pass:
+      raise ValueError(
+          "The --filter_exclude_node_names (or -feon) flag is valid only if "
+          "the --till_filter_pass (or -f) flag is used.")
+
     if parsed.profile:
       raise debugger_cli_common.CommandLineExit(
           exit_token=framework.OnRunStartResponse(
@@ -525,6 +556,8 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
       if parsed.till_filter_pass in self._tensor_filters:
         action = framework.OnRunStartAction.DEBUG_RUN
         self._active_tensor_filter = parsed.till_filter_pass
+        self._active_filter_exclude_node_names = (
+            parsed.filter_exclude_node_names)
         self._active_tensor_filter_run_start_response = run_start_response
       else:
         # Handle invalid filter name.
diff --git a/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py b/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
index 490812c96d83791cdc20c56f16c968f1a1851af8..b06fa26a935b42709575f8e400e0bda951ffbbc7 100644
--- a/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
+++ b/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
@@ -87,7 +87,11 @@ class LocalCLIDebuggerWrapperSessionForTest(
   def _prep_cli_for_run_start(self):
     pass
 
-  def _prep_debug_cli_for_run_end(self, debug_dump, tf_error, passed_filter):
+  def _prep_debug_cli_for_run_end(self,
+                                  debug_dump,
+                                  tf_error,
+                                  passed_filter,
+                                  passed_filter_exclude_op_names):
     self.observers["debug_dumps"].append(debug_dump)
     self.observers["tf_errors"].append(tf_error)
 
@@ -451,6 +455,36 @@ class LocalCLIDebugWrapperSessionTest(test_util.TensorFlowTestCase):
     self.assertEqual(2, len(wrapped_sess.observers["debug_dumps"]))
     self.assertEqual([None, None], wrapped_sess.observers["tf_errors"])
 
+  def testRunTillFilterPassesWithExcludeOpNames(self):
+    wrapped_sess = LocalCLIDebuggerWrapperSessionForTest(
+        [["run", "-f", "greater_than_twelve",
+          "--filter_exclude_node_names", "inc_v.*"],
+         ["run"], ["run"]],
+        self.sess,
+        dump_root=self._tmp_dir)
+
+    def greater_than_twelve(datum, tensor):
+      del datum  # Unused.
+      return tensor > 12.0
+
+    # Verify that adding the same tensor filter more than once is tolerated
+    # (i.e., as if it were added only once).
+    wrapped_sess.add_tensor_filter("greater_than_twelve", greater_than_twelve)
+
+    # run five times.
+    wrapped_sess.run(self.inc_v)
+    wrapped_sess.run(self.inc_v)
+    wrapped_sess.run(self.inc_v)
+    wrapped_sess.run(self.inc_v)
+
+    self.assertAllClose(14.0, self.sess.run(self.v))
+
+    self.assertEqual([1], wrapped_sess.observers["run_start_cli_run_numbers"])
+
+    # Due to the --filter_exclude_op_names flag, the run-end CLI should show up
+    # not after run 3, but after run 4.
+    self.assertEqual([4], wrapped_sess.observers["run_end_cli_run_numbers"])
+
   def testRunTillFilterPassesWorksInConjunctionWithOtherNodeNameFilter(self):
     """Test that --.*_filter flags work in conjunction with -f.
 
diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD
index 5bedf9c6fdfcf630a72ef8a34bca38417b738d4f..8c0d3feeceab1bf29c1dabc668176a6ef7806421 100644
--- a/tensorflow/python/eager/BUILD
+++ b/tensorflow/python/eager/BUILD
@@ -105,6 +105,7 @@ cuda_py_test(
         ":test",
         "//tensorflow/python:embedding_ops",
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:resource_variable_ops",
@@ -397,21 +398,6 @@ py_test(
     ],
 )
 
-# -----------------------------------------------------------------------------
-# Google-internal targets.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 py_library(
     name = "imperative_grad",
     srcs = ["imperative_grad.py"],
diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py
index 88de1a951ffd85b4ca97a084c8616d804dc10b45..92774d4d50e00c85599ceaef1cc99bb062bd3ce3 100644
--- a/tensorflow/python/eager/backprop.py
+++ b/tensorflow/python/eager/backprop.py
@@ -31,7 +31,6 @@ from tensorflow.python.eager import imperative_grad
 from tensorflow.python.eager import tape
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
@@ -40,6 +39,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_inspect
+from tensorflow.python.util.tf_export import tf_export
 
 
 _op_attr_type_cache = {}
@@ -49,12 +49,10 @@ def op_attr_type(op_type, attr_name):
   try:
     return _op_attr_type_cache[(op_type, attr_name)]
   except KeyError:
-    with errors.raise_exception_on_not_ok_status() as status:
-      h = context.context()._handle  # pylint: disable=protected-access
-      attr_type = pywrap_tensorflow.TFE_OpNameGetAttrType(
-          h, op_type, attr_name, status)
-    _op_attr_type_cache[(op_type, attr_name)] = attr_type
-    return attr_type
+    h = context.context()._handle  # pylint: disable=protected-access
+    attr_type = pywrap_tensorflow.TFE_OpNameGetAttrType(h, op_type, attr_name)
+  _op_attr_type_cache[(op_type, attr_name)] = attr_type
+  return attr_type
 
 
 def make_attr(attr_type, value):
@@ -85,6 +83,14 @@ class _MockOp(object):
         return make_attr(typ, self.attrs[i + 1])
     raise KeyError(attr)
 
+  def _get_control_flow_context(self):
+    raise NotImplementedError(
+        "tf.GradientTape.gradients() does not support graph control flow "
+        "operations like tf.cond or tf.while at this time. Use tf.gradients() "
+        "instead. If you need this feature, please file a feature request at "
+        "https://github.com/tensorflow/tensorflow/issues/new"
+    )
+
 
 def _magic_gradient_function(op_name, attr_tuple, num_inputs,
                              inputs, outputs, out_grads):
@@ -162,8 +168,8 @@ def implicit_val_and_grad(f):
   """Returns a function which differentiates f with respect to variables.
 
   The wrapped function returns the value and the gradient of f when called with
-  the same arguments. The gradient is with respect to all TFE variables which
-  have `variable.watch()` called on them by f.
+  the same arguments. The gradient is with respect to all trainable TFE
+  variables accessed by `f`.
 
   This function is useful when the exact set of variables to differentiate with
   is not known ahead of time.
@@ -240,8 +246,8 @@ def implicit_grad(f):
   """Returns a function which differentiates f with respect to variables.
 
   The wrapped function returns the gradient of f when called with the same
-  arguments. The gradient is with respect to all TFE variables which have
-  `variable.watch()` called on them by f.
+  arguments. The gradient is with respect to all trainable TFE variables
+  accessed by `f`.
 
   This function is useful when the exact set of variables to differentiate with
   is not known ahead of time.
@@ -637,64 +643,62 @@ _default_vspace = imperative_grad.VSpace(
     ones=_ones)
 
 
+def _handle_or_self(x):
+  """If x is ResourceVariable, return its handle, else x."""
+  if isinstance(x, resource_variable_ops.ResourceVariable):
+    x = x.handle
+  return x
+
+
+@tf_export("GradientTape")
 class GradientTape(object):
-  """Records operations to use to compute gradients.
+  """Record operations for automatic differentiation.
 
-  Operations are recorded if:
-    - they happen in code marked by this context manager
-    - at least one of their inputs is being watched
+  Operations are recorded if they are executed within this context manager and
+  at least one of their inputs is being "watched".
 
-  Outputs of recorded operations are watched. Variables are automatically
-  watched and tensors can be manually watched by calling the watch method on the
-  context manager.
+  Trainable variables (created by `tf.contrib.eager.Variable` or
+  @{tf.get_variable}, trainable=True is default in both cases) are automatically
+  watched. Tensors can be manually watched by invoking the `watch` method on
+  this context manager.
 
-  Example usage:
+  For example, consider the function `y = x * x`. The gradient at `x = 3.0` can
+  be computed as:
 
   ```python
+  x = tf.constant(3.)
   with tfe.GradientTape() as g:
-    x = tf.constant(3.0)
     g.watch(x)
     y = x * x
-  grad = g.gradient(y, [x])[0]
-  assert grad.numpy() == 6.0
+  grad = g.gradient(y, [x])[0] # Will compute to 6.0
   ```
 
-  It is possible to use GradientTapes to compute higher-order derivatives as
-  follows:
+  GradientTapes can be nested to compute higher-order derivatives. For example,
 
   ```python
+  x = tf.constant(3.0)
   with tfe.GradientTape() as g:
-    x = tf.constant(3.0)
-    g.watch(x)
-    y = x * x
     with tfe.GradientTape() as gg:
-      gg.watch(y)
-      z = 2 * y
-    inner_grad = gg.gradient(z, [y])[0]
-    assert inner_grad.numpy() == 2
-    y = y + inner_grad
-  grad = g.gradient(y, [x])[0]
-  assert grad.numpy() == 6.0
+      gg.watch(x)
+      y = x * x
+    dy_dx = gg.gradient(y, [x])[0]     # Will compute to 6.0
+  d2y_dx2 = g.gradient(dy_dx, [x])[0]  # Will compute to 2.0
   ```
 
   By default, the resources held by a GradientTape are released as soon as
-  GradientTape.gradient() method is called. However, if one need to compute
-  multiple gradients over the same computation, she can create a persistent
-  GradientTape. Persistent tapes allow multiple calls to the gradient() method
-  and release resources when the tape object is destructed.
-
-  Example usage:
+  GradientTape.gradient() method is called. To compute multiple gradients over
+  the same computation, create a persistent gradient tape. This allows multiple
+  calls to the gradient() method as resources are released when the tape object
+  is garbage collected. For example:
 
   ```python
+  x = tf.constant(3.0)
   with tfe.GradientTape(persistent=True) as g:
-    x = tf.constant(3.0)
     g.watch(x)
     y = x * x
     z = y * y
-  dz_dx = g.gradient(z, [x])[0]
-  assert dz_dx.numpy() == 108.0   # 4*x^3 at x = 3
-  dy_dx = g.gradient(y, [x])[0]
-  assert dy_dx.numpy() == 6.0
+  dy_dx = g.gradient(z, [x])[0]  # 6.0
+  dz_dx = g.gradient(y, [x])[0]  # 108.0 (4*x^3 at x = 3)
   del g  # Drop the reference to the tape
   """
 
@@ -703,8 +707,8 @@ class GradientTape(object):
 
     Args:
       persistent: Boolean controlling whether a persistent gradient tape
-        is created. Must be True or False.
-
+        is created. False by default, which means at most one call can
+        be made to the gradient() method on this object.
     """
     self._tape = None
     self._persistent = persistent
@@ -720,12 +724,10 @@ class GradientTape(object):
     """Ensures that `tensor` is being traced by this tape.
 
     Args:
-      tensor: a Tensor or Variable a list of Tensors or Variables.
+      tensor: a Tensor or list of Tensors.
     """
     for t in nest.flatten(tensor):
-      if isinstance(t, resource_variable_ops.ResourceVariable):
-        t = t.handle
-      tape.watch(t)
+      tape.watch(_handle_or_self(t))
 
   def watched_variables(self):
     # Sorting variables by id, which is monotonically increasing in construction
@@ -735,33 +737,37 @@ class GradientTape(object):
                        key=lambda v: v.handle._id))  # pylint: disable=protected-access
 
   def gradient(self, target, sources, output_gradients=None):
-    """Computes the gradient using information traced by the tape.
+    """Computes the gradient using operations recorded in context of this tape.
 
     Args:
-      target: the tensor to be differentiated.
-      sources: a list of Tensors or Variables, the target will be
-       differentiated with respect to the sources.
+      target: Tensor to be differentiated.
+      sources: a list or nested structure of Tensors or Variables. `target`
+        will be differentiated against elements in `sources`.
       output_gradients: a list of gradients, one for each element of
-       target. Defaults to None.
+        target. Defaults to None.
 
     Returns:
-      a list of Tensors (or IndexedSlices, or None), one for each element in
-      `sources`.
+      a list or nested structure of Tensors (or IndexedSlices, or None),
+      one for each element in `sources`. Returned structure is the same as
+      the structure of `sources`.
 
     Raises:
       RuntimeError: if called inside the context of the tape, or if called more
-       than once.
+       than once on a non-persistent tape.
     """
     if self._tape is None:
       raise RuntimeError("GradientTape.gradient can only be called once "
                          "on non-persistent tapes, and "
                          "only when the context manager has exited.")
-    sources = [x.handle if isinstance(x, resource_variable_ops.ResourceVariable)
-               else x
-               for x in sources]
-    grad = imperative_grad.imperative_grad(
-        _default_vspace, self._tape, [target], sources,
+    flat_sources = nest.flatten(sources)
+    flat_sources = [_handle_or_self(x) for x in flat_sources]
+
+    flat_grad = imperative_grad.imperative_grad(
+        _default_vspace, self._tape, [target], flat_sources,
         output_gradients=output_gradients)
+
     if not self._persistent:
       self._tape = None
+
+    grad = nest.pack_sequence_as(sources, flat_grad)
     return grad
diff --git a/tensorflow/python/eager/backprop_test.py b/tensorflow/python/eager/backprop_test.py
index 07a2155d24bd9e00340273b3af36712216f6e8be..991b4dbe7a688c8f6dc6420b6d6b7f7158d6bf86 100644
--- a/tensorflow/python/eager/backprop_test.py
+++ b/tensorflow/python/eager/backprop_test.py
@@ -31,6 +31,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import custom_gradient
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import gradients
@@ -195,6 +196,19 @@ class BackpropTest(test.TestCase):
     g, = backprop.gradients_function(loss, [0])(logits, labels)
     self.assertAllEqual(g.numpy(), [[-0.5, 0.5]])
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testGradientWithinTapeBlock(self):
+    v1 = resource_variable_ops.ResourceVariable(1.)
+    self.evaluate(v1.initializer)
+    with backprop.GradientTape() as t:
+      loss = 2 * v1
+      with self.assertRaises(RuntimeError):
+        t.gradient(loss, [v1])
+    with backprop.GradientTape(persistent=True) as t:
+      loss = 2 * v1
+      grad = t.gradient(loss, [v1])
+    self.assertAllEqual(self.evaluate(grad[0]), 2.0)
+
   @test_util.assert_no_new_tensors
   def testSecondGrad(self):
 
@@ -356,6 +370,54 @@ class BackpropTest(test.TestCase):
     self.assertEqual(backprop.implicit_grad(f)()[0][0], None)
 
   @test_util.assert_no_new_tensors
+  @test_util.run_in_graph_and_eager_modes()
+  def testGradientTapeRepeatedSource(self):
+    with backprop.GradientTape(persistent=False) as g:
+      x = constant_op.constant(3.0)
+      g.watch(x)
+      y = 2 * x
+    grad = g.gradient(target=y, sources=[x, x])
+    self.assertEqual(self.evaluate(grad), [2.0, 2.0])
+
+  @test_util.assert_no_new_tensors
+  @test_util.run_in_graph_and_eager_modes()
+  def testPersistentGradientTapeRepeatedSource(self):
+    with backprop.GradientTape(persistent=True) as g:
+      x = constant_op.constant(3.0)
+      y = constant_op.constant(5.0)
+      g.watch(x)
+      g.watch(y)
+      z = x * x + x * y
+    grad = g.gradient(target=z, sources=[x, x])
+    self.assertEqual(self.evaluate(grad), [11.0, 11.0])
+    grad = g.gradient(target=z, sources=[y, x])
+    self.assertEqual(self.evaluate(grad), [3.0, 11.0])
+
+  @test_util.assert_no_new_tensors
+  @test_util.run_in_graph_and_eager_modes()
+  def testGradientTapeStructure(self):
+    with backprop.GradientTape(persistent=True) as g:
+      # Using different constant values because constant tensors are
+      # cached, leading to a different gradient then what one might expect.
+      x1 = constant_op.constant(3.0)
+      x2 = constant_op.constant(3.1)
+      x3 = constant_op.constant(3.2)
+      g.watch(x1)
+      g.watch(x2)
+      g.watch(x3)
+      y = x1  + 2 * x2  + 3 * x3
+    self.assertEqual(self.evaluate(g.gradient(y, x1)), [1.0])
+    self.assertEqual(self.evaluate(g.gradient(y, (x1,))), (1.0,))
+    self.assertEqual(self.evaluate(g.gradient(y, (x1, x2))), (1.0, 2.0))
+    self.assertEqual(self.evaluate(g.gradient(y, [(x1, x2), (x2, x3)])),
+                     [(1.0, 2.0), (2.0, 3.0)])
+    self.assertEqual(self.evaluate(g.gradient(y, (x1, x2, [x1, x3]))),
+                     (1.0, 2.0, [1.0, 3.0]))
+    self.assertEqual(self.evaluate(g.gradient(y, [x1, {'x2': x2, 'x3': x3}])),
+                     [1.0, {'x2': 2.0, 'x3': 3.0}])
+
+  @test_util.assert_no_new_tensors
+  @test_util.run_in_graph_and_eager_modes()
   def testGradientTape(self):
     with backprop.GradientTape() as g:
       x = constant_op.constant(3.0)
@@ -365,10 +427,53 @@ class BackpropTest(test.TestCase):
         gg.watch(y)
         z = 2 * y
       inner_grad = gg.gradient(z, [y])[0]
-      self.assertEqual(inner_grad.numpy(), 2.0)
+      self.assertEqual(self.evaluate(inner_grad), 2.0)
       y += inner_grad
     grad = g.gradient(y, [x])[0]
-    self.assertEqual(grad.numpy(), 6.0)
+    self.assertEqual(self.evaluate(grad), 6.0)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testGradientTapeWithCond(self):
+    x = constant_op.constant(3.0)
+
+    def true_fn():
+      return x
+
+    def false_fn():
+      return x * x
+
+    with backprop.GradientTape() as g:
+      g.watch(x)
+      y = control_flow_ops.cond(x < x, true_fn, false_fn)
+
+    if not context.executing_eagerly():
+      with self.assertRaisesRegexp(NotImplementedError, 'tf.gradients'):
+        dy = g.gradient(y, [x])[0]
+    else:
+      dy = g.gradient(y, [x])[0]
+      self.assertEqual(self.evaluate(dy), 6.0)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testGradientTapeWithWhileLoop(self):
+    i = constant_op.constant(1)
+    x = constant_op.constant(2.)
+
+    def cond(i, _):
+      return i < 3
+
+    def body(i, x):
+      return i + 1, x * 2
+
+    with backprop.GradientTape() as g:
+      g.watch([x])
+      _, y = control_flow_ops.while_loop(cond, body, [i, x])
+
+    if not context.executing_eagerly():
+      with self.assertRaisesRegexp(NotImplementedError, 'tf.gradients'):
+        dy = g.gradient(y, [x])[0]
+    else:
+      dy = g.gradient(y, [x])[0]
+      self.assertEqual(self.evaluate(dy), 4.0)
 
   @test_util.assert_no_new_tensors
   def testGradientTapeGradientCalledMultipleTimes(self):
@@ -383,6 +488,7 @@ class BackpropTest(test.TestCase):
       g.gradient(y, [x])
 
   @test_util.assert_no_new_tensors
+  @test_util.run_in_graph_and_eager_modes()
   def testPersistentTape(self):
     with backprop.GradientTape(persistent=True) as g:
       x = constant_op.constant(3.0)
@@ -390,12 +496,13 @@ class BackpropTest(test.TestCase):
       y = x * x
       z = y * y
     dz_dx = g.gradient(z, [x])[0]
-    self.assertEqual(dz_dx.numpy(), 4*3*3*3)
+    self.assertEqual(self.evaluate(dz_dx), 4 * 3 * 3 * 3)
     dy_dx = g.gradient(y, [x])[0]
-    self.assertEqual(dy_dx.numpy(), 2*3)
+    self.assertEqual(self.evaluate(dy_dx), 2 * 3)
     del g
 
   @test_util.assert_no_new_tensors
+  @test_util.run_in_graph_and_eager_modes()
   def testPersistentNestedTape(self):
     with backprop.GradientTape(persistent=True) as g:
       x = constant_op.constant(3.0)
@@ -406,22 +513,24 @@ class BackpropTest(test.TestCase):
         z = 2 * y
       for _ in range(2):
         inner_grad = gg.gradient(z, [y])[0]
-        self.assertEqual(inner_grad.numpy(), 2.0)
+        self.assertEqual(self.evaluate(inner_grad), 2.0)
       y += inner_grad
       del gg
     grad = g.gradient(y, [x])[0]
-    self.assertEqual(grad.numpy(), 6.0)
+    self.assertEqual(self.evaluate(grad), 6.0)
     grad = g.gradient(z, [x])[0]
-    self.assertEqual(grad.numpy(), 12.0)
+    self.assertEqual(self.evaluate(grad), 12.0)
     del g
 
   @test_util.assert_no_new_tensors
+  @test_util.run_in_graph_and_eager_modes()
   def testGradientTapeVariable(self):
     v = resource_variable_ops.ResourceVariable(1.0, name='v')
+    self.evaluate(v.initializer)
     with backprop.GradientTape() as g:
       y = v * v
     grad = g.gradient(y, [v])[0]
-    self.assertAllEqual(grad, 2.0)
+    self.assertAllEqual(self.evaluate(grad), 2.0)
 
   @test_util.assert_no_new_tensors
   def testEmptyParamsForValueAndGradFunction(self):
diff --git a/tensorflow/python/eager/benchmarks_test.py b/tensorflow/python/eager/benchmarks_test.py
index 4255677a68836decce1aaf8d4d7d2df2bf86315b..7ad37058fd92533fb84be408fddd6cbd65fc833a 100644
--- a/tensorflow/python/eager/benchmarks_test.py
+++ b/tensorflow/python/eager/benchmarks_test.py
@@ -55,7 +55,7 @@ def c_tfe_py_fastpath_execute(a,
                               transpose_b=False,
                               name=None):
   ctx = context.context()
-  assert ctx.in_eager_mode(
+  assert ctx.executing_eagerly(
   ), "The prototype doesn't contain C code for graph construction"
   try:
     return pywrap_tensorflow.TFE_Py_FastPathExecute(
@@ -82,16 +82,24 @@ class MicroBenchmarks(test.Benchmark):
     self._num_iters_2_by_2 = 30000
     self._num_iters_100_by_784 = 1000
 
-  def _run(self, func, num_iters):
+  def _run(self, func, num_iters, execution_mode=None):
     # call func to maybe warm up the GPU
-    func()
-    start = time.time()
-    for _ in xrange(num_iters):
+    ctx = context.context()
+    with ctx.execution_mode(execution_mode):
       func()
-    end = time.time()
-    mean_us = (end - start) * 1e6 / num_iters
-    self.report_benchmark(iters=num_iters, wall_time=mean_us,
-                          extras={"examples_per_sec": num_iters/(end-start)})
+      if execution_mode == context.ASYNC:
+        ctx.async_wait()
+      start = time.time()
+      for _ in xrange(num_iters):
+        func()
+      if execution_mode == context.ASYNC:
+        ctx.async_wait()
+      end = time.time()
+      mean_us = (end - start) * 1e6 / num_iters
+      self.report_benchmark(
+          iters=num_iters,
+          wall_time=mean_us,
+          extras={"examples_per_sec": num_iters / (end - start)})
 
   def benchmark_create_np_array(self):
     func = lambda: np.array([3.0])
@@ -193,6 +201,9 @@ class MicroBenchmarks(test.Benchmark):
     m = self._m_2
     self._run(lambda: gen_array_ops.identity(m), 30000)
 
+  def benchmark_slowpath_tf_identity(self):
+    self._run(lambda: gen_array_ops.identity(1), 30000)
+
   def benchmark_tfe_py_execute_identity(self):
     m = self._m_2
     ctx_handle = context.context()._handle
@@ -236,9 +247,10 @@ class MicroBenchmarks(test.Benchmark):
     func = lambda: np.dot(a, b)
     self._run(func, num_iters)
 
-  def _benchmark_tf_matmul(self, m, transpose_b, num_iters):
+  def _benchmark_tf_matmul(self, m, transpose_b, num_iters,
+                           execution_mode=None):
     func = lambda: math_ops.matmul(m, m, transpose_b=transpose_b)
-    self._run(func, num_iters)
+    self._run(func, num_iters, execution_mode=execution_mode)
 
   def _benchmark_gen_math_ops_matmul(self, m, transpose_b, num_iters):
     def func():
@@ -267,10 +279,14 @@ class MicroBenchmarks(test.Benchmark):
 
     self._run(func, num_iters)
 
-  def _benchmark_defun_matmul(self, m, transpose_b, num_iters):
+  def _benchmark_defun_matmul(self,
+                              m,
+                              transpose_b,
+                              num_iters,
+                              execution_mode=None):
     f = function.defun(math_ops.matmul)
     func = lambda: f(m, m, transpose_b)
-    self._run(func, num_iters)
+    self._run(func, num_iters, execution_mode=execution_mode)
 
   def _benchmark_read_variable(self, m, num_iters):
     self._run(m.value, num_iters)
@@ -301,6 +317,15 @@ class MicroBenchmarks(test.Benchmark):
       self._benchmark_tf_matmul(
           m, transpose_b=False, num_iters=self._num_iters_2_by_2)
 
+  def benchmark_tf_matmul_2_by_2_CPU_async(self):
+    with context.device(CPU):
+      m = self._m_2_by_2.cpu()
+      self._benchmark_tf_matmul(
+          m,
+          transpose_b=False,
+          num_iters=self._num_iters_2_by_2,
+          execution_mode=context.ASYNC)
+
   def benchmark_gen_math_ops_matmul_2_by_2_CPU(self):
     with context.device(CPU):
       m = self._m_2_by_2.cpu()
@@ -325,6 +350,15 @@ class MicroBenchmarks(test.Benchmark):
       self._benchmark_defun_matmul(
           m, transpose_b=False, num_iters=self._num_iters_2_by_2)
 
+  def benchmark_defun_matmul_2_by_2_CPU_async(self):
+    with context.device(CPU):
+      m = self._m_2_by_2.cpu()
+      self._benchmark_defun_matmul(
+          m,
+          transpose_b=False,
+          num_iters=self._num_iters_2_by_2,
+          execution_mode=context.ASYNC)
+
   def benchmark_tf_matmul_2_by_2_GPU(self):
     if not context.num_gpus():
       return
@@ -333,6 +367,17 @@ class MicroBenchmarks(test.Benchmark):
       self._benchmark_tf_matmul(
           m, transpose_b=False, num_iters=self._num_iters_2_by_2)
 
+  def benchmark_tf_matmul_2_by_2_GPU_async(self):
+    if not context.num_gpus():
+      return
+    with context.device(GPU):
+      m = self._m_2_by_2.gpu()
+      self._benchmark_tf_matmul(
+          m,
+          transpose_b=False,
+          num_iters=self._num_iters_2_by_2,
+          execution_mode=context.ASYNC)
+
   def benchmark_gen_math_ops_matmul_2_by_2_GPU(self):
     if not context.num_gpus():
       return
@@ -357,6 +402,17 @@ class MicroBenchmarks(test.Benchmark):
       self._benchmark_defun_matmul(
           m, transpose_b=False, num_iters=self._num_iters_2_by_2)
 
+  def benchmark_defun_matmul_2_by_2_GPU_async(self):
+    if not context.num_gpus():
+      return
+    with context.device(GPU):
+      m = self._m_2_by_2.gpu()
+      self._benchmark_defun_matmul(
+          m,
+          transpose_b=False,
+          num_iters=self._num_iters_2_by_2,
+          execution_mode=context.ASYNC)
+
   # Benchmarks for AA.T, A of dimension 100 by 784.
   def benchmark_np_matmul_100_by_784(self):
     self._benchmark_np_matmul(
@@ -370,6 +426,15 @@ class MicroBenchmarks(test.Benchmark):
       self._benchmark_tf_matmul(
           m, transpose_b=True, num_iters=self._num_iters_100_by_784)
 
+  def benchmark_tf_matmul_100_by_784_CPU_async(self):
+    with context.device(CPU):
+      m = self._m_100_by_784.cpu()
+      self._benchmark_tf_matmul(
+          m,
+          transpose_b=True,
+          num_iters=self._num_iters_100_by_784,
+          execution_mode=context.ASYNC)
+
   def benchmark_gen_math_ops_matmul_100_by_784_CPU(self):
     with context.device(CPU):
       m = self._m_100_by_784.cpu()
@@ -402,6 +467,17 @@ class MicroBenchmarks(test.Benchmark):
       self._benchmark_tf_matmul(
           m, transpose_b=True, num_iters=self._num_iters_100_by_784)
 
+  def benchmark_tf_matmul_100_by_784_GPU_async(self):
+    if not context.num_gpus():
+      return
+    with context.device(GPU):
+      m = self._m_100_by_784.gpu()
+      self._benchmark_tf_matmul(
+          m,
+          transpose_b=True,
+          num_iters=self._num_iters_100_by_784,
+          execution_mode=context.ASYNC)
+
   def benchmark_gen_math_ops_matmul_100_by_784_GPU(self):
     if not context.num_gpus():
       return
diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py
index 87d3ed880a88e040f02b957da99a094ec2d783ef..9e146f021e813886b42ca72b07122b485901a24b 100644
--- a/tensorflow/python/eager/context.py
+++ b/tensorflow/python/eager/context.py
@@ -28,7 +28,6 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import device as pydev
-from tensorflow.python.framework import errors
 from tensorflow.python.util import compat
 from tensorflow.python.util import is_in_graph_mode
 from tensorflow.python.util import tf_contextlib
@@ -53,6 +52,8 @@ DEVICE_PLACEMENT_WARN = pywrap_tensorflow.TFE_DEVICE_PLACEMENT_WARN
 DEVICE_PLACEMENT_SILENT = pywrap_tensorflow.TFE_DEVICE_PLACEMENT_SILENT
 DEVICE_PLACEMENT_SILENT_FOR_INT32 = (
     pywrap_tensorflow.TFE_DEVICE_PLACEMENT_SILENT_FOR_INT32)
+SYNC = 0
+ASYNC = 1
 
 
 class _TensorCache(object):
@@ -84,29 +85,41 @@ class _EagerContext(threading.local):
     self.device_spec = pydev.DeviceSpec.from_string("")
     self.device_name = self.device_spec.to_string()
     self.mode = _default_mode
+    self.is_eager = _default_mode == EAGER_MODE
     self.scope_name = ""
     self.recording_summaries = False
     self.summary_writer_resource = None
     self.scalar_cache = {}
     self.ones_rank_cache = _TensorCache()
+    self.execution_mode = None
 
 
-ContextStackEntry = collections.namedtuple(
-    "ContextStackEntry", ["is_building_function", "enter_context_fn"])
+ContextSwitch = collections.namedtuple(
+    "ContextSwitch", ["is_building_function", "enter_context_fn"])
 
 
-class ContextStack(threading.local):
+# `_ContextSwitchStack` is a `threading.local` to match the semantics of
+# ``DefaultGraphStack`, which is also a `threading.local`.
+class _ContextSwitchStack(threading.local):
   """A thread-local stack of context switches."""
 
-  def __init__(self):
-    super(ContextStack, self).__init__()
+  def __init__(self, eager):
+    super(_ContextSwitchStack, self).__init__()
     self.stack = []
+    if eager:
+      # Initialize the stack with a pointer to enter the eager context; this
+      # ensures that the fact that eager execution was enabled is propagated
+      # across threads, since (1) `enable_eager_execution` modifies a
+      # process-level flag (`_default_mode`) and (2) `__init__` is called each
+      # time a threading.local object is used in a separate thread.
+      self.push(is_building_function=False, enter_context_fn=eager_mode)
 
   def push(self, is_building_function, enter_context_fn):
     """Push metadata about a context switch onto the stack.
 
     A context switch can take one of two forms: installing a graph as the
-    default graph, or entering the eager context.
+    default graph, or entering the eager context. For each context switch,
+    we record whether or not the entered context is building a function.
 
     Args:
       is_building_function: (bool.) Whether the context is building a function.
@@ -115,7 +128,7 @@ class ContextStack(threading.local):
     """
 
     self.stack.append(
-        ContextStackEntry(is_building_function, enter_context_fn))
+        ContextSwitch(is_building_function, enter_context_fn))
 
   def pop(self):
     """Pop the stack."""
@@ -123,34 +136,49 @@ class ContextStack(threading.local):
     self.stack.pop()
 
 
-context_stack = ContextStack()
-
-
 # TODO(agarwal): rename to EagerContext / EagerRuntime ?
 # TODO(agarwal): consider keeping the corresponding Graph here.
 class Context(object):
   """Environment in which eager operations execute."""
 
-  def __init__(self, config=None, device_policy=None):
+  # TODO(agarwal): create and link in some documentation for `execution_mode`.
+  # pylint: disable=redefined-outer-name
+  def __init__(self, config=None, device_policy=None, execution_mode=None):
     """Creates a new Context.
 
     Args:
       config: (Optional.) A `ConfigProto` protocol buffer with configuration
-       options for the Context. Note that a lot of these options may be
-       currently unimplemented or irrelevant when eager execution is enabled.
+        options for the Context. Note that a lot of these options may be
+        currently unimplemented or irrelevant when eager execution is enabled.
       device_policy: (Optional.) What policy to use when trying to run an
-       operation on a device with inputs which are not on that device.
-       Valid values:
-         tfe.DEVICE_PLACEMENT_EXPLICIT: raises an error if the placement is not
-           correct.
-         tfe.DEVICE_PLACEMENT_WARN: copies the tensors which are not on the
+         operation on a device with inputs which are not on that device.
+         When set to None, an appropriate value will be picked automatically.
+         The value picked may change between TensorFlow releases.
+
+         Defaults to tf.contrib.eager.DEVICE_PLACEMENT_SILENT_FOR_INT32.
+         Valid values:
+         - tfe.DEVICE_PLACEMENT_EXPLICIT: raises an error if the placement is
+           not correct.
+         - tfe.DEVICE_PLACEMENT_WARN: copies the tensors which are not on the
            right device but raises a warning.
-         tfe.DEVICE_PLACEMENT_SILENT: silently copies the tensors. This might
+         - tfe.DEVICE_PLACEMENT_SILENT: silently copies the tensors. This might
            hide performance problems.
-         tfe.DEVICE_PLACEMENT_SILENT_FOR_INT32: silently copies int32 tensors,
+         - tfe.DEVICE_PLACEMENT_SILENT_FOR_INT32: silently copies int32 tensors,
            raising errors on the other ones.
+      execution_mode: (Optional.) Policy controlling how operations dispatched
+        are actually executed. When set to None, an appropriate value will be
+        picked automatically. The value picked may change between TensorFlow
+        releases.
+        Valid values:
+        - tf.contrib.eager.SYNC: executes each operation synchronously.
+        - tf.contrib.eager.ASYNC: executes each operation asynchronously. These
+          operations may return "non-ready" handles.
+
+    Raises:
+     ValueError: If execution_mode is not valid.
     """
     self._eager_context = _EagerContext()
+    self._context_switches = _ContextSwitchStack(self.executing_eagerly())
     self._context_handle = None
     self._context_devices = None
     self._post_execution_callbacks = []
@@ -158,6 +186,14 @@ class Context(object):
     self._seed = None
     self._initialize_lock = threading.Lock()
     self._device_policy = device_policy
+    if execution_mode not in (None, SYNC, ASYNC):
+      raise ValueError(
+          "execution_mode should be None/SYNC/ASYNC. Got %s" % execution_mode)
+    if execution_mode is None:
+      execution_mode = SYNC
+    self._execution_mode = execution_mode
+
+  # pylint: enable=redefined-outer-name
 
   def _set_global_seed(self, seed):
     """Set a global eager mode seed for random ops."""
@@ -187,32 +223,27 @@ class Context(object):
       assert self._context_devices is None
       opts = pywrap_tensorflow.TFE_NewContextOptions()
       try:
-        with errors.raise_exception_on_not_ok_status() as status:
-          if self._config is not None:
-            config_str = self._config.SerializeToString()
-            pywrap_tensorflow.TFE_ContextOptionsSetConfig(
-                opts, config_str, len(config_str), status)
-          if self._device_policy is not None:
-            pywrap_tensorflow.TFE_ContextOptionsSetDevicePlacementPolicy(
-                opts, self._device_policy)
-          self._context_handle = pywrap_tensorflow.TFE_NewContext(opts, status)
+        if self._config is not None:
+          config_str = self._config.SerializeToString()
+          pywrap_tensorflow.TFE_ContextOptionsSetConfig(opts, config_str)
+        if self._device_policy is not None:
+          pywrap_tensorflow.TFE_ContextOptionsSetDevicePlacementPolicy(
+              opts, self._device_policy)
+        if self._execution_mode == ASYNC:
+          pywrap_tensorflow.TFE_ContextOptionsSetAsync(opts, True)
+        self._context_handle = pywrap_tensorflow.TFE_NewContext(opts)
       finally:
         pywrap_tensorflow.TFE_DeleteContextOptions(opts)
       # Store list of devices
       self._context_devices = []
-      with errors.raise_exception_on_not_ok_status() as status:
-        device_list = pywrap_tensorflow.TFE_ContextListDevices(
-            self._context_handle, status)
+      device_list = pywrap_tensorflow.TFE_ContextListDevices(
+          self._context_handle)
       try:
         self._num_gpus = 0
         for i in range(pywrap_tensorflow.TF_DeviceListCount(device_list)):
-          with errors.raise_exception_on_not_ok_status() as status:
-            dev_name = pywrap_tensorflow.TF_DeviceListName(
-                device_list, i, status)
+          dev_name = pywrap_tensorflow.TF_DeviceListName(device_list, i)
           self._context_devices.append(pydev.canonical_name(dev_name))
-          with errors.raise_exception_on_not_ok_status() as status:
-            dev_type = pywrap_tensorflow.TF_DeviceListType(
-                device_list, i, status)
+          dev_type = pywrap_tensorflow.TF_DeviceListType(device_list, i)
           if dev_type == "GPU":
             self._num_gpus += 1
 
@@ -249,21 +280,28 @@ class Context(object):
 
   @tf_contextlib.contextmanager
   def _mode(self, mode):
+    """A context manager to allow setting the mode to EAGER/GRAPH."""
     ctx = self._eager_context
     old_mode = ctx.mode
+    old_is_eager = ctx.is_eager
     ctx.mode = mode
+    ctx.is_eager = mode == EAGER_MODE
     if mode == EAGER_MODE:
-      context_stack.push(False, eager_mode)
+      # Entering graph mode does not provide us with sufficient information to
+      # record a context switch; graph-based context switches are only logged
+      # when a graph is registered as the default graph.
+      self.context_switches.push(False, eager_mode)
     try:
       yield
     finally:
+      ctx.is_eager = old_is_eager
       ctx.mode = old_mode
       if mode == EAGER_MODE:
-        context_stack.pop()
+        self.context_switches.pop()
 
   def executing_eagerly(self):
     """Returns True if current thread has eager executing enabled."""
-    return self._eager_context.mode == EAGER_MODE
+    return self._eager_context.is_eager
 
   def scalar_cache(self):
     """Per-device cache for scalars."""
@@ -356,6 +394,40 @@ class Context(object):
     """List of the names of devices available to execute operations."""
     return self._devices
 
+  def get_execution_mode(self):
+    mode = self._eager_context.execution_mode
+    if mode is None:
+      mode = self._execution_mode
+    return mode
+
+  def set_execution_mode(self, mode):
+    """Sets execution mode for current thread."""
+    if mode not in (None, SYNC, ASYNC):
+      raise ValueError(
+          "Execution mode should be None/SYNC/ASYNC. Got %s" % mode)
+    if mode is None:
+      mode = SYNC
+    self._eager_context.execution_mode = mode
+    pywrap_tensorflow.TFE_ContextSetAsyncForThread(self._handle, mode == ASYNC)
+
+  @tf_contextlib.contextmanager
+  def execution_mode(self, mode):
+    """Context manager for setting execution mode for current thread."""
+    old_mode = self.get_execution_mode()
+    try:
+      self.set_execution_mode(mode)
+      yield
+    finally:
+      self.set_execution_mode(old_mode)
+
+  def async_wait(self):
+    """Waits for ops dispatched in ASYNC mode to finish."""
+    pywrap_tensorflow.TFE_ContextAsyncWait(self._handle)
+
+  def async_clear_error(self):
+    """Clears errors raised during ASYNC execution."""
+    pywrap_tensorflow.TFE_ContextAsyncClearError(self._handle)
+
   def num_gpus(self):
     """The number of GPUs available to execute operations."""
     self._initialize_handle_and_devices()
@@ -370,11 +442,9 @@ class Context(object):
     Args:
       fn: A wrapped TF_Function (returned from TF_GraphToFunction_wrapper).
     """
-    with errors.raise_exception_on_not_ok_status() as status:
-      pywrap_tensorflow.TFE_ContextAddFunction(
-          self._handle,  # pylint: disable=protected-access
-          fn,
-          status)
+    pywrap_tensorflow.TFE_ContextAddFunction(
+        self._handle,  # pylint: disable=protected-access
+        fn)
 
   def add_function_def(self, fdef):
     """Add a function definition to the context.
@@ -386,12 +456,10 @@ class Context(object):
       fdef: A FunctionDef protocol buffer message.
     """
     fdef_string = fdef.SerializeToString()
-    with errors.raise_exception_on_not_ok_status() as status:
-      pywrap_tensorflow.TFE_ContextAddFunctionDef(
-          self._handle,  # pylint: disable=protected-access
-          fdef_string,
-          len(fdef_string),
-          status)
+    pywrap_tensorflow.TFE_ContextAddFunctionDef(
+        self._handle,  # pylint: disable=protected-access
+        fdef_string,
+        len(fdef_string))
 
   def add_post_execution_callback(self, callback):
     """Add a post-execution callback to the context.
@@ -434,23 +502,19 @@ class Context(object):
     To retrieve the accumulated metadata call context.export_run_metadata()
     and to stop tracing call context.disable_run_metadata().
     """
-    if not self._context_handle:
-      self._initialize_handle_and_devices()
-    pywrap_tensorflow.TFE_ContextEnableRunMetadata(self._context_handle)
+    pywrap_tensorflow.TFE_ContextEnableRunMetadata(self._handle)
 
   @tf_contextlib.contextmanager
   def device_policy(self, policy):
-    if not self._context_handle:
-      self._initialize_handle_and_devices()
-    old = pywrap_tensorflow.TFE_ContextGetDevicePlacementPolicy(
-        self._context_handle)
+    handle = self._handle
+    old = pywrap_tensorflow.TFE_ContextGetDevicePlacementPolicy(handle)
     pywrap_tensorflow.TFE_ContextSetThreadLocalDevicePlacementPolicy(
-        self._handle, policy)
+        handle, policy)
     try:
       yield
     finally:
       pywrap_tensorflow.TFE_ContextSetThreadLocalDevicePlacementPolicy(
-          self._handle, old)
+          handle, old)
 
   def disable_run_metadata(self):
     """Disables tracing of op execution via RunMetadata."""
@@ -470,14 +534,18 @@ class Context(object):
     if not self._context_handle:
       return None
     with c_api_util.tf_buffer() as buffer_:
-      with errors.raise_exception_on_not_ok_status() as status:
-        pywrap_tensorflow.TFE_ContextExportRunMetadata(
-            self._context_handle, buffer_, status)
+      pywrap_tensorflow.TFE_ContextExportRunMetadata(
+          self._context_handle, buffer_)
       proto_data = pywrap_tensorflow.TF_GetBuffer(buffer_)
     run_metadata = config_pb2.RunMetadata()
     run_metadata.ParseFromString(compat.as_bytes(proto_data))
     return run_metadata
 
+  @property
+  def context_switches(self):
+    """Returns a stack of context switches."""
+    return self._context_switches
+
 _context = None
 _context_lock = threading.Lock()
 
@@ -496,6 +564,10 @@ def context():
   return _context
 
 
+def context_safe():
+  return _context
+
+
 # TODO(agarwal): remove this.
 def get_default_context():
   """Same as context."""
@@ -595,6 +667,26 @@ def list_devices():
   return context().devices()
 
 
+def set_execution_mode(mode):
+  """Sets execution mode for the current thread."""
+  context().set_execution_mode(mode)
+
+
+def execution_mode(mode):
+  """Context manager for setting execution mode for current thread."""
+  return context().execution_mode(mode)
+
+
+def async_wait():
+  """Waits for ops dispatched in ASYNC mode to finish."""
+  return context().async_wait()
+
+
+def async_clear_error():
+  """Clears errors raised during ASYNC execution mode."""
+  return context().async_clear_error()
+
+
 def num_gpus():
   """Get the number of available GPU devices.
 
diff --git a/tensorflow/python/eager/core_test.py b/tensorflow/python/eager/core_test.py
index 61c5526d484b08010a96b0a3de283098aef7a90a..3fabe7060e980423268eb6f52ab4043cc4a4847c 100644
--- a/tensorflow/python/eager/core_test.py
+++ b/tensorflow/python/eager/core_test.py
@@ -63,6 +63,16 @@ class TFETest(test_util.TensorFlowTestCase):
     ctx.scope_name = 'foo'
     self.assertEqual('foo', ctx.scope_name)
 
+    self.assertEqual(context.SYNC, ctx.get_execution_mode())
+    ctx.set_execution_mode(context.ASYNC)
+    self.assertEqual(context.ASYNC, ctx.get_execution_mode())
+    ctx.set_execution_mode(context.SYNC)
+    self.assertEqual(context.SYNC, ctx.get_execution_mode())
+    with ctx.execution_mode(context.ASYNC):
+      self.assertEqual(context.ASYNC, ctx.get_execution_mode())
+    ctx.set_execution_mode(context.SYNC)
+    self.assertEqual(context.SYNC, ctx.get_execution_mode())
+
     self.assertIsNone(ctx.summary_writer_resource)
     ctx.summary_writer_resource = 'mock'
     self.assertEqual('mock', ctx.summary_writer_resource)
@@ -87,6 +97,14 @@ class TFETest(test_util.TensorFlowTestCase):
     self.assertTrue(has_cpu_device)
     del ctx
 
+  def testAsyncBasic(self):
+    ctx = context.Context(execution_mode=context.ASYNC)
+    has_cpu_device = False
+    for x in ctx.devices():
+      has_cpu_device = has_cpu_device or 'CPU' in x
+    self.assertTrue(has_cpu_device)
+    del ctx
+
   def testRunMetadata(self):
     context.enable_run_metadata()
     t = constant_op.constant(1.0)
@@ -98,8 +116,7 @@ class TFETest(test_util.TensorFlowTestCase):
     cpu_stats = step_stats.dev_stats[0]
     self.assertEqual('/job:localhost/replica:0/task:0/device:CPU:0',
                      cpu_stats.device)
-    self.assertEqual(len(cpu_stats.node_stats), 1)
-    self.assertEqual(cpu_stats.node_stats[0].node_name, 'Add')
+    self.assertGreaterEqual(len(cpu_stats.node_stats), 1)
 
   def testShouldCopy(self):
     if not context.context().num_gpus():
@@ -113,19 +130,18 @@ class TFETest(test_util.TensorFlowTestCase):
     # available, when no device is explicitly provided)
     self.assertEqual(y.device, '/job:localhost/replica:0/task:0/device:CPU:0')
 
-  def testContextStackContainsEagerMode(self):
-    # Eager execution has been enabled, and no other context
-    # switch has occurred, so `context_stack` should contain
-    # exactly one entry.
-    self.assertEqual(len(context.context_stack.stack), 1)
-    stack_entry = context.context_stack.stack[0]
+  def testContextSwitchStackContainsEagerMode(self):
+    # Eager execution has been enabled, and no other context switch has
+    # occurred, so `context_switches` should contain exactly one entry.
+    self.assertEqual(len(context.context().context_switches.stack), 1)
+    switch = context.context().context_switches.stack[0]
 
     # The entry should log that eager mode was entered.
-    self.assertIs(stack_entry.enter_context_fn, context.eager_mode)
+    self.assertIs(switch.enter_context_fn, context.eager_mode)
 
     # It is not possible to build a graph function when eager execution
     # is enabled; the stack entry should reflect this fact.
-    self.assertFalse(stack_entry.is_building_function)
+    self.assertFalse(switch.is_building_function)
 
   def testInt32GPU(self):
     if not context.context().num_gpus():
@@ -208,6 +224,23 @@ class TFETest(test_util.TensorFlowTestCase):
     with self.assertRaises(RuntimeError):
       x.gpu(context.context().num_gpus() + 1)
 
+  def testCopyBetweenDevicesAsync(self):
+    if not context.context().num_gpus():
+      self.skipTest('No GPUs found')
+    with context.execution_mode(context.ASYNC):
+      x = constant_op.constant([[1., 2.], [3., 4.]])
+      x = x.cpu()
+      x = x.gpu()
+      x = x.gpu()
+      x = x.cpu()
+      context.async_wait()
+
+    # Invalid device
+    with self.assertRaises(RuntimeError):
+      x.gpu(context.context().num_gpus() + 1)
+      context.async_wait()
+    context.async_clear_error()
+
   def testCopyScope(self):
     if not context.context().num_gpus():
       self.skipTest('No GPUs found')
@@ -248,6 +281,29 @@ class TFETest(test_util.TensorFlowTestCase):
         attrs=('T', three.dtype.as_datatype_enum))[0]
     self.assertAllEqual(15, product)
 
+  def testExecuteBasicAsync(self):
+    with context.execution_mode(context.ASYNC):
+      three = constant_op.constant(3)
+      five = constant_op.constant(5)
+      product = execute(
+          b'Mul',
+          num_outputs=1,
+          inputs=[three, five],
+          attrs=('T', three.dtype.as_datatype_enum))[0]
+      self.assertAllEqual(15, product)
+    # Error: Invalid arguments
+    context.set_execution_mode(context.ASYNC)
+    with self.assertRaises(errors.InvalidArgumentError):
+      execute(
+          b'MatMul',
+          num_outputs=1,
+          inputs=[three, five],
+          attrs=('transpose_a', False, 'transpose_b', False, 'T',
+                 three.dtype.as_datatype_enum))
+      context.async_wait()
+    context.async_clear_error()
+    context.set_execution_mode(context.SYNC)
+
   def testExecuteTooManyNumOutputs(self):
     # num_outputs provided is 50, but only one output is produced.
     product = execute(
@@ -601,10 +657,11 @@ class SendRecvTest(test_util.TensorFlowTestCase):
     with ops.device('GPU:0'):
       t0 = constant_op.constant(1.0)
       self._send(t0, 't0', self.cpu_device)
-    self.assertAllEqual(
-        self._recv(dtypes.float32, 't0', gpu_device_name),
-        1.0)
-    self._send(constant_op.constant(2.0), 't1', gpu_device_name)
+    with ops.device('cpu:0'):
+      self.assertAllEqual(
+          self._recv(dtypes.float32, 't0', gpu_device_name),
+          1.0)
+      self._send(constant_op.constant(2.0), 't1', gpu_device_name)
     with ops.device('GPU:0'):
       self.assertAllEqual(
           self._recv(dtypes.float32, 't1', self.cpu_device),
diff --git a/tensorflow/python/eager/execution_callbacks.py b/tensorflow/python/eager/execution_callbacks.py
index 535361498a9dd33003d0479051e97d7ff2553067..9a082596535f51e5a4fb6cc2a11a4dd8a422ed44 100644
--- a/tensorflow/python/eager/execution_callbacks.py
+++ b/tensorflow/python/eager/execution_callbacks.py
@@ -253,7 +253,7 @@ def add_execution_callback(callback):
       `f(op_type, op_name, attrs, inputs, outputs)`.
       `op_type` is the type of the operation that was just executed (e.g.,
         `MatMul`).
-      `op_name` is the name of the operation that has was just executed. This
+      `op_name` is the name of the operation that was just executed. This
         name is set by the client who created the operation and can be `None` if
         it is unset.
       `attrs` contains the attributes of the operation as a `tuple` of
diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index 343012e552592a6f8bb1255118add3e938aa443c..711eddcec1dec72862c5d170a5ba6d98b10cbcc3 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -34,7 +34,6 @@ from tensorflow.python.eager.graph_only_ops import graph_placeholder
 from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes as dtypes_module
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -79,14 +78,10 @@ def capture_value(tensor_map, value, dtype, name):
         ranks = [len(s.dim) if not s.unknown_rank else -1 for s in shapes]
         shapes = [[d.size for d in s.dim]
                   if not s.unknown_rank else None for s in shapes]
-        with errors.raise_exception_on_not_ok_status() as status:
-          pywrap_tensorflow.TF_GraphSetOutputHandleShapesAndTypes_wrapper(
-              captured_value._op._graph._c_graph,  # pylint: disable=protected-access
-              captured_value._as_tf_output(),  # pylint: disable=protected-access
-              shapes,
-              ranks,
-              types,
-              status)
+        pywrap_tensorflow.TF_GraphSetOutputHandleShapesAndTypes_wrapper(
+            captured_value._op._graph._c_graph,  # pylint: disable=protected-access
+            captured_value._as_tf_output(),  # pylint: disable=protected-access
+            shapes, ranks, types)
 
     tensor_map[ops.tensor_id(value)] = (value, captured_value)
   else:
@@ -275,23 +270,20 @@ class _EagerDefinedFunction(object):
       inputs: the tensors in the graph to be used as inputs to the function
       outputs: the tensors in the graph which will be outputs to the function
     """
-    with errors.raise_exception_on_not_ok_status() as status:
-      fn = pywrap_tensorflow.TF_GraphToFunction_wrapper(
-          graph._c_graph,  # pylint: disable=protected-access
-          compat.as_str(name),
-          False,
-          [o._c_op for o in operations],  # pylint: disable=protected-access
-          [t._as_tf_output() for t in inputs],  # pylint: disable=protected-access
-          [t._as_tf_output() for t in outputs],  # pylint: disable=protected-access
-          [],
-          None,
-          compat.as_str(""),
-          status)
+    fn = pywrap_tensorflow.TF_GraphToFunction_wrapper(
+        graph._c_graph,  # pylint: disable=protected-access
+        compat.as_str(name),
+        False,
+        [o._c_op for o in operations],  # pylint: disable=protected-access
+        [t._as_tf_output() for t in inputs],  # pylint: disable=protected-access
+        [t._as_tf_output() for t in outputs],  # pylint: disable=protected-access
+        [],
+        None,
+        compat.as_str(""))
     # TODO(apassos) avoid creating a FunctionDef (specially to grab the
     # signature, but also in general it's nice not to depend on it.
     with c_api_util.tf_buffer() as buffer_:
-      with errors.raise_exception_on_not_ok_status() as status:
-        pywrap_tensorflow.TF_FunctionToFunctionDef(fn, buffer_, status)
+      pywrap_tensorflow.TF_FunctionToFunctionDef(fn, buffer_)
       proto_data = pywrap_tensorflow.TF_GetBuffer(buffer_)
     function_def = function_pb2.FunctionDef()
     function_def.ParseFromString(compat.as_bytes(proto_data))
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py
index b9cde16867d498d73715535f028a5eb2bea97ea6..9af197981bde309160781fa5821152962e5383bb 100644
--- a/tensorflow/python/eager/function_test.py
+++ b/tensorflow/python/eager/function_test.py
@@ -26,7 +26,6 @@ from tensorflow.python.eager import tape
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import function as tf_function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
@@ -37,6 +36,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
+from tensorflow.python.training import gradient_descent
 
 
 class FunctionTest(test.TestCase):
@@ -376,23 +376,23 @@ class FunctionTest(test.TestCase):
     self.assertAllEqual(f(constant_op.constant(1.0)), 2.0)
 
   def testGradientOfGatherWithDefun(self):
+    with ops.device('cpu:0'):
+      v = resource_variable_ops.ResourceVariable([0.0, 1.0, 2.0])
 
-    v = resource_variable_ops.ResourceVariable([0.0, 1.0, 2.0])
+      def sum_gather():
+        return math_ops.reduce_sum(array_ops.gather(v, [1, 2]))
 
-    def sum_gather():
-      return math_ops.reduce_sum(array_ops.gather(v, [1, 2]))
+      grad_fn = backprop.implicit_grad(sum_gather)
+      gradient = grad_fn()
+      defun_grad_fn = backprop.implicit_grad(function.defun(sum_gather))
+      defun_gradient = defun_grad_fn()
+      self.assertEqual(len(gradient), len(defun_gradient))
 
-    grad_fn = backprop.implicit_grad(sum_gather)
-    gradient = grad_fn()
-    defun_grad_fn = backprop.implicit_grad(function.defun(sum_gather))
-    defun_gradient = defun_grad_fn()
-    self.assertEqual(len(gradient), len(defun_gradient))
-
-    gradient = gradient[0][0]
-    defun_gradient = defun_gradient[0][0]
-    self.assertAllEqual(gradient.values, defun_gradient.values)
-    self.assertAllEqual(gradient.indices, defun_gradient.indices)
-    self.assertAllEqual(gradient.dense_shape, defun_gradient.dense_shape)
+      gradient = gradient[0][0]
+      defun_gradient = defun_gradient[0][0]
+      self.assertAllEqual(gradient.values, defun_gradient.values)
+      self.assertAllEqual(gradient.indices, defun_gradient.indices)
+      self.assertAllEqual(gradient.dense_shape, defun_gradient.dense_shape)
 
   def testReturningIndexedSlicesWithDefun(self):
 
@@ -475,9 +475,7 @@ class FunctionTest(test.TestCase):
     reshape = function.defun(array_ops.reshape)
     value = constant_op.constant([1., 2.])
     shape = constant_op.constant([2, 1]).gpu()
-    with self.assertRaises(errors.InvalidArgumentError):
-      with ops.device('gpu:0'):
-        reshape(value, shape)
+    reshape(value, shape)  # No error is raised
 
   def testDifferentiableFunctionNoneOutputs(self):
 
@@ -762,6 +760,37 @@ class AutomaticControlDependenciesTest(test.TestCase):
 
       self.assertAllEqual(f().eval(), 4.0)
 
+  def testOptimizerInDefun(self):
+    def loss(v):
+      return v**2
+
+    optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=1.0)
+
+    @function.defun
+    def train():
+      v = resource_variable_ops.ResourceVariable(1.0)
+      grad = backprop.implicit_grad(loss)(v)
+      optimizer.apply_gradients(grad)
+      return v.read_value()
+
+    value = train()
+    self.assertEqual(value.numpy(), -1.0)
+
+  def testOptimizerInDefunWithCapturedVariable(self):
+    v = resource_variable_ops.ResourceVariable(1.0)
+    def loss():
+      return v**2
+
+    optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=1.0)
+
+    @function.defun
+    def train():
+      grad = backprop.implicit_grad(loss)()
+      optimizer.apply_gradients(grad)
+
+    train()
+    self.assertEqual(v.numpy(), -1.0)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/eager/imperative_grad.py b/tensorflow/python/eager/imperative_grad.py
index 837cad974ac6555ef2b13d1a1a5e0e5f5166b01d..000152855d1a90f32936cca40c10f00c2df863a5 100644
--- a/tensorflow/python/eager/imperative_grad.py
+++ b/tensorflow/python/eager/imperative_grad.py
@@ -21,7 +21,6 @@ from __future__ import print_function
 import collections
 
 from tensorflow.python import pywrap_tensorflow
-from tensorflow.python.framework import errors
 
 
 VSpace = collections.namedtuple(
@@ -60,6 +59,5 @@ def imperative_grad(
      or if only non-differentiable functions of the source were used in the
      computation of target.
   """
-  with errors.raise_exception_on_not_ok_status() as status:
-    return pywrap_tensorflow.TFE_Py_TapeGradient(
-        tape._tape, vspace, target, sources, output_gradients, status)  # pylint: disable=protected-access
+  return pywrap_tensorflow.TFE_Py_TapeGradient(
+      tape._tape, vspace, target, sources, output_gradients)  # pylint: disable=protected-access
diff --git a/tensorflow/python/eager/ops_test.py b/tensorflow/python/eager/ops_test.py
index f70c7544d6c9e8095e95d0629b94384bc1cbe35b..fc76ede4c502ae8b554c925a921e419bf003c40c 100644
--- a/tensorflow/python/eager/ops_test.py
+++ b/tensorflow/python/eager/ops_test.py
@@ -17,6 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import threading
 import numpy as np
 
 from tensorflow.core.protobuf import config_pb2
@@ -376,6 +377,22 @@ class OpsTest(test_util.TensorFlowTestCase):
   def testNoOpIsNone(self):
     self.assertTrue(control_flow_ops.no_op() is None)
 
+  def testEagerContextPreservedAcrossThreads(self):
+    def init_fn():
+      self.assertTrue(context.executing_eagerly())
+      with ops.init_scope():
+        self.assertTrue(context.executing_eagerly())
+        context_switches = context.context().context_switches
+        self.assertEqual(len(context_switches.stack), 1)
+        self.assertFalse(context_switches.stack[0].is_building_function)
+        self.assertEqual(context_switches.stack[0].enter_context_fn,
+                         context.eager_mode)
+
+    self.assertTrue(context.executing_eagerly())
+    t1 = threading.Thread(target=init_fn)
+    t1.start()
+    t1.join()
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/eager/python_eager_op_gen.cc b/tensorflow/python/eager/python_eager_op_gen.cc
index c2ce8efd7f70c6ba93b6d444f88ddbb9aa51ccdb..9afab0077b666b36d77abea5a7d8c444b6400812 100644
--- a/tensorflow/python/eager/python_eager_op_gen.cc
+++ b/tensorflow/python/eager/python_eager_op_gen.cc
@@ -117,7 +117,7 @@ class GenEagerPythonOp : public python_op_gen_internal::GenPythonOp {
                    const string& function_name)
       : python_op_gen_internal::GenPythonOp(op_def, api_def, function_name) {
     op_name_ = function_name_;
-    op_name_.Consume("_");
+    str_util::ConsumePrefix(&op_name_, "_");
   }
   ~GenEagerPythonOp() override {}
 
@@ -366,8 +366,8 @@ string GenEagerPythonOp::Code() {
 void GenEagerPythonOp::HandleGraphMode(const string& function_setup) {
   // Handle graph-mode case
   strings::StrAppend(&result_,
-                     "  _ctx = _context.context()\n"
-                     "  if not _ctx.executing_eagerly():\n",
+                     "  _ctx = _context._context\n"
+                     "  if _ctx is None or not _ctx._eager_context.is_eager:\n",
                      function_setup,
                      "    _, _, _op = _op_def_lib._apply_op_helper(\n");
   AddBodyNoReturn("        ");
@@ -492,7 +492,7 @@ bool GenEagerPythonOp::GetEagerFunctionSetup(const string& indentation,
       strings::StrAppend(function_setup, indentation, "  ", attr_api_name,
                          " = ", default_value, "\n");
     }
-    if (attr_type.starts_with("list(")) {
+    if (str_util::StartsWith(attr_type, "list(")) {
       ExpectListArg(indentation, attr_api_name, function_setup);
     }
 
@@ -683,13 +683,14 @@ bool GenEagerPythonOp::AddEagerFallbackCode(
     return true;
   }
 
-  AddDefLine(strings::StrCat(function_name_, kEagerFallbackSuffix), parameters);
+  AddDefLine(strings::StrCat(function_name_, kEagerFallbackSuffix),
+             strings::StrCat(parameters, ", ctx=None"));
   strings::StrAppend(
       &result_, "  r\"\"\"This is the slowpath function for Eager mode.\n");
   strings::StrAppend(&result_, "  This is for function ", function_name_,
                      "\n  \"\"\"\n");
 
-  strings::StrAppend(&result_, "  _ctx = _context.context()\n");
+  strings::StrAppend(&result_, "  _ctx = ctx if ctx else _context.context()\n");
 
   string function_setup;
   if (!GetEagerFunctionSetup("  ", &function_setup)) {
@@ -712,9 +713,9 @@ bool GenEagerPythonOp::AddEagerFallbackCode(
 }
 
 void GenEagerPythonOp::AddEagerFastPathExecute() {
-  string fastpath_execute_params =
-      strings::StrCat("_ctx._handle, _ctx.device_name, \"", op_def_.name(),
-                      "\", ", "name, _ctx._post_execution_callbacks");
+  string fastpath_execute_params = strings::StrCat(
+      "_ctx._context_handle, _ctx._eager_context.device_name, \"",
+      op_def_.name(), "\", ", "name, _ctx._post_execution_callbacks");
   string fallback_params;
 
   for (int i = 0; i < api_def_.in_arg_size(); i++) {
@@ -755,6 +756,8 @@ void GenEagerPythonOp::AddEagerFastPathExecute() {
   strings::StrAppend(&result_, "      ", "return _result\n");
 
   // Handle fallback.
+  if (!fallback_params.empty()) strings::StrAppend(&fallback_params, ", ");
+  strings::StrAppend(&fallback_params, "ctx=_ctx");
   strings::StrAppend(&result_, "    ", "except _core._FallbackException:\n");
   strings::StrAppend(
       &result_, "      ", "return ", function_name_, kEagerFallbackSuffix,
diff --git a/tensorflow/python/eager/pywrap_tensor.cc b/tensorflow/python/eager/pywrap_tensor.cc
index 105c09e81fefc09bca630534cbe044027dac860b..519814b979e00dd7c9df41eacbe1edc02c9d88e8 100644
--- a/tensorflow/python/eager/pywrap_tensor.cc
+++ b/tensorflow/python/eager/pywrap_tensor.cc
@@ -163,7 +163,7 @@ PyObject* PyIntFromDataType(TF_DataType l) {
 
 extern "C" {
 
-static const int kMaxEagerTensorParentSize = 32;
+static const int kMaxEagerTensorParentSize = 64;
 
 // TODO(agarwal): store context handle in EagerTensor.
 typedef struct EagerTensor {
diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index fe9785dc667e948c760946d41faa9188574a2180..d99bd0b0ffe5ef8042b3b869f82a9952f1985abc 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -1013,12 +1013,13 @@ static tensorflow::eager::TapeTensor TapeTensorFromTensor(PyObject* tensor) {
     TFE_TensorHandle* t = EagerTensor_Handle(tensor);
     tensorflow::int64 id = EagerTensor_id(tensor);
     const tensorflow::Tensor* tensor = nullptr;
-    const tensorflow::Status status = t->Tensor(&tensor);
+    const tensorflow::Status status = t->handle->Tensor(&tensor);
     if (MaybeRaiseExceptionFromStatus(status, nullptr)) {
-      return tensorflow::eager::TapeTensor{id, t->dtype,
+      return tensorflow::eager::TapeTensor{id, t->handle->dtype,
                                            tensorflow::TensorShape({})};
     } else {
-      return tensorflow::eager::TapeTensor{id, t->dtype, tensor->shape()};
+      return tensorflow::eager::TapeTensor{id, t->handle->dtype,
+                                           tensor->shape()};
     }
   }
   tensorflow::int64 id = FastTensorId(tensor);
@@ -1323,6 +1324,16 @@ std::vector<PyObject*> MakeTensorList(PyObject* tensors) {
 PyObject* TFE_Py_TapeGradient(PyObject* tape, PyObject* vspace,
                               PyObject* target, PyObject* sources,
                               PyObject* output_gradients, TF_Status* status) {
+  TFE_Py_Tape* tape_obj = reinterpret_cast<TFE_Py_Tape*>(tape);
+  if (!tape_obj->tape->IsPersistent()) {
+    auto* tape_set = GetTapeSet();
+    if (tape_set->find(tape_obj) != tape_set->end()) {
+      PyErr_SetString(PyExc_RuntimeError,
+                      "Trying to call tape.gradient on a non-persistent tape "
+                      "while it is still active.");
+      return nullptr;
+    }
+  }
   PyVSpace c_vspace(vspace);
   if (!c_vspace.Initialize().ok()) {
     return nullptr;
@@ -1348,7 +1359,6 @@ PyObject* TFE_Py_TapeGradient(PyObject* tape, PyObject* vspace,
       Py_INCREF(tensor);
     }
   }
-  TFE_Py_Tape* tape_obj = reinterpret_cast<TFE_Py_Tape*>(tape);
   std::vector<PyObject*> result;
   status->status = tape_obj->tape->ComputeGradient(
       c_vspace, target_vec, sources_vec, outgrad_vec, &result);
@@ -1362,11 +1372,15 @@ PyObject* TFE_Py_TapeGradient(PyObject* tape, PyObject* vspace,
   }
   if (!result.empty()) {
     PyObject* py_result = PyList_New(result.size());
+    tensorflow::gtl::FlatSet<PyObject*> seen_results(result.size());
     for (int i = 0; i < result.size(); ++i) {
       if (result[i] == nullptr) {
         Py_INCREF(Py_None);
         result[i] = Py_None;
+      } else if (seen_results.find(result[i]) != seen_results.end()) {
+        Py_INCREF(result[i]);
       }
+      seen_results.insert(result[i]);
       PyList_SET_ITEM(py_result, i, reinterpret_cast<PyObject*>(result[i]));
     }
     return py_result;
@@ -1395,16 +1409,33 @@ bool CheckInputsOk(PyObject* seq, int start_index,
     PyObject* item = PyTuple_GET_ITEM(seq, i + start_index);
     if (!op_def.input_arg(i).number_attr().empty() ||
         !op_def.input_arg(i).type_list_attr().empty()) {
-      // This item should be a list input.
-      if (!PyList_Check(item)) return false;
-      for (Py_ssize_t j = 0; j < PyList_Size(item); j++) {
-        PyObject* inner_item = PyList_GET_ITEM(item, j);
+      // This item should be a seq input.
+      if (!PySequence_Check(item)) {
+        VLOG(1) << "Falling back to slow path for Op \"" << op_def.name()
+                << "\", Input \"" << op_def.input_arg(i).name()
+                << "\" since we expected a sequence, but got "
+                << item->ob_type->tp_name;
+        return false;
+      }
+      for (Py_ssize_t j = 0; j < PySequence_Fast_GET_SIZE(item); j++) {
+        PyObject* inner_item = PySequence_Fast_GET_ITEM(item, j);
         if (!EagerTensor_CheckExact(inner_item) &&
             !CheckResourceVariable(inner_item)) {
+          VLOG(1)
+              << "Falling back to slow path for Op \"" << op_def.name()
+              << "\", Input \"" << op_def.input_arg(i).name() << "\", Index "
+              << j
+              << " since we expected an EagerTensor/ResourceVariable, but got "
+              << inner_item->ob_type->tp_name;
           return false;
         }
       }
     } else if (!EagerTensor_CheckExact(item) && !CheckResourceVariable(item)) {
+      VLOG(1)
+          << "Falling back to slow path for Op \"" << op_def.name()
+          << "\", Input \"" << op_def.input_arg(i).name()
+          << "\" since we expected an EagerTensor/ResourceVariable, but got "
+          << item->ob_type->tp_name;
       return false;
     }
   }
@@ -1716,11 +1747,11 @@ const char* GetDeviceName(PyObject* py_device_name) {
   return nullptr;
 }
 
-bool RaiseIfNotPyList(PyObject* list, const string& attr_name) {
-  if (!PyList_Check(list)) {
+bool RaiseIfNotPySequence(PyObject* seq, const string& attr_name) {
+  if (!PySequence_Check(seq)) {
     PyErr_SetString(PyExc_TypeError,
-                    Printf("expected a list for attr %s, got %s instead",
-                           attr_name.data(), list->ob_type->tp_name)
+                    Printf("expected a sequence for attr %s, got %s instead",
+                           attr_name.data(), seq->ob_type->tp_name)
                         .data());
 
     return false;
@@ -1813,6 +1844,15 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
 
   op_exec_info.ctx = reinterpret_cast<TFE_Context*>(
       PyCapsule_GetPointer(PyTuple_GET_ITEM(args, 0), nullptr));
+
+  if (op_exec_info.ctx == nullptr) {
+    // The context hasn't been initialized. It will be in the slow path.
+    RaiseFallbackException(
+        "This function does not handle the case of the path where "
+        "all inputs are not already EagerTensors.");
+    return nullptr;
+  }
+
   op_exec_info.device_name = GetDeviceName(PyTuple_GET_ITEM(args, 1));
   op_exec_info.op_name = PyTuple_GET_ITEM(args, 2);
   op_exec_info.op_def = GetOpDef(op_exec_info.op_name);
@@ -1884,6 +1924,9 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
                               py_attr_value, &attr_list_sizes, status);
 
         if (TF_GetCode(status) != TF_OK) {
+          VLOG(1) << "Falling back to slow path for Op \"" << op_def->name()
+                  << "\" since we are unable to set the value for attr \""
+                  << attr.name() << "\" due to: " << TF_Message(status);
           RaiseFallbackException(TF_Message(status));
           return nullptr;
         }
@@ -1930,8 +1973,8 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
         PyTuple_GET_ITEM(args, kFastPathExecuteInputStartIndex + i);
     if (!input_arg.number_attr().empty()) {
       // The item is a homogeneous list.
-      if (!RaiseIfNotPyList(input, input_arg.number_attr())) return nullptr;
-      Py_ssize_t len = PyList_Size(input);
+      if (!RaiseIfNotPySequence(input, input_arg.number_attr())) return nullptr;
+      Py_ssize_t len = PySequence_Fast_GET_SIZE(input);
 
       TFE_OpSetAttrInt(op, input_arg.number_attr().data(), len);
       if (op_exec_info.run_callbacks) {
@@ -1943,15 +1986,15 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
 
       if (len > 0) {
         // First item adds the type attr.
-        if (!AddInputToOp(op_exec_info, PyList_GET_ITEM(input, 0), &input_arg,
-                          flattened_attrs.get(), flattened_inputs.get(), op,
-                          status)) {
+        if (!AddInputToOp(op_exec_info, PySequence_Fast_GET_ITEM(input, 0),
+                          &input_arg, flattened_attrs.get(),
+                          flattened_inputs.get(), op, status)) {
           return nullptr;
         }
 
         for (Py_ssize_t j = 1; j < len; j++) {
           // Since the list is homogeneous, we don't need to re-add the attr.
-          if (!AddInputToOp(op_exec_info, PyList_GET_ITEM(input, j),
+          if (!AddInputToOp(op_exec_info, PySequence_Fast_GET_ITEM(input, j),
                             nullptr /* input_arg */,
                             nullptr /* flattened_attrs */,
                             flattened_inputs.get(), op, status)) {
@@ -1961,16 +2004,18 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
       }
     } else if (!input_arg.type_list_attr().empty()) {
       // The item is a heterogeneous list.
-      if (!RaiseIfNotPyList(input, input_arg.type_list_attr())) return nullptr;
+      if (!RaiseIfNotPySequence(input, input_arg.type_list_attr())) {
+        return nullptr;
+      }
       const string& attr_name = input_arg.type_list_attr();
-      Py_ssize_t len = PyList_Size(input);
+      Py_ssize_t len = PySequence_Fast_GET_SIZE(input);
       tensorflow::gtl::InlinedVector<TF_DataType, 4> attr_value(len);
       PyObject* py_attr_value = nullptr;
       if (op_exec_info.run_callbacks) {
         py_attr_value = PyTuple_New(len);
       }
       for (Py_ssize_t j = 0; j < len; j++) {
-        PyObject* py_input = PyList_GET_ITEM(input, j);
+        PyObject* py_input = PySequence_Fast_GET_ITEM(input, j);
         tensorflow::Safe_PyObjectPtr py_eager_tensor;
         if (!ConvertToTensor(op_exec_info, py_input, &py_eager_tensor,
                              status)) {
diff --git a/tensorflow/python/estimator/BUILD b/tensorflow/python/estimator/BUILD
index 04fcbb0e874bcc5583cd5f607de3a3764f4ba6b0..5d8b19223f000862aa46ad3a60796ae68bdec2f9 100644
--- a/tensorflow/python/estimator/BUILD
+++ b/tensorflow/python/estimator/BUILD
@@ -9,24 +9,13 @@ licenses(["notice"])  # Apache 2.0
 load("//tensorflow:tensorflow.bzl", "py_test")
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 py_library(
     name = "estimator_py",
     srcs = ["estimator_lib.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":baseline",
+        ":boosted_trees",
         ":dnn",
         ":dnn_linear_combined",
         ":estimator",
@@ -251,6 +240,53 @@ py_test(
     ],
 )
 
+py_library(
+    name = "boosted_trees",
+    srcs = ["canned/boosted_trees.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":estimator",
+        ":head",
+        ":model_fn",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:boosted_trees_ops",
+        "//tensorflow/python:data_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:lookup_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:summary",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/feature_column",
+        "//tensorflow/python/ops/losses",
+    ],
+)
+
+py_test(
+    name = "boosted_trees_test",
+    size = "medium",
+    srcs = ["canned/boosted_trees_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":boosted_trees",
+        "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:resources",
+        "//tensorflow/python:training",
+        "//tensorflow/python/estimator:numpy_io",
+        "//tensorflow/python/feature_column",
+    ],
+)
+
 py_library(
     name = "dnn",
     srcs = ["canned/dnn.py"],
@@ -265,7 +301,6 @@ py_library(
         "//tensorflow/python:nn",
         "//tensorflow/python:partitioned_variables",
         "//tensorflow/python:summary",
-        "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/feature_column",
         "//tensorflow/python/ops/losses",
@@ -617,6 +652,7 @@ py_library(
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:string_ops",
         "//tensorflow/python:summary",
+        "//tensorflow/python:training",
         "//tensorflow/python:weights_broadcast_ops",
         "//tensorflow/python/feature_column",
         "//tensorflow/python/ops/losses",
@@ -929,5 +965,7 @@ cuda_py_test(
     ],
     tags = [
         "multi_gpu",
+        "noasan",  # flaky time outs
+        "notsan",  # flaky
     ],
 )
diff --git a/tensorflow/python/estimator/canned/baseline_test.py b/tensorflow/python/estimator/canned/baseline_test.py
index 96639e88ea4a07e14121049d78f07e03fcb22156..7bf2e62da9c4598c28ad38825aac2031c9d51905 100644
--- a/tensorflow/python/estimator/canned/baseline_test.py
+++ b/tensorflow/python/estimator/canned/baseline_test.py
@@ -42,13 +42,13 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import parsing_ops
-from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.summary.writer import writer_cache
 from tensorflow.python.training import checkpoint_utils
+from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.training import input as input_lib
 from tensorflow.python.training import optimizer
 from tensorflow.python.training import queue_runner
@@ -482,7 +482,7 @@ class BaselineRegressorTrainingTest(test.TestCase):
       self.assertEquals(0, loss.shape.ndims)
       if expected_loss is None:
         if global_step is not None:
-          return state_ops.assign_add(global_step, 1).op
+          return distribute_lib.increment_var(global_step)
         return control_flow_ops.no_op()
       assert_loss = assert_close(
           math_ops.to_float(expected_loss, name='expected'),
@@ -490,7 +490,7 @@ class BaselineRegressorTrainingTest(test.TestCase):
           name='assert_loss')
       with ops.control_dependencies((assert_loss,)):
         if global_step is not None:
-          return state_ops.assign_add(global_step, 1).op
+          return distribute_lib.increment_var(global_step)
         return control_flow_ops.no_op()
 
     mock_optimizer = test.mock.NonCallableMock(
@@ -685,13 +685,13 @@ class BaselineClassifierTrainingTest(test.TestCase):
       # Verify loss. We can't check the value directly, so we add an assert op.
       self.assertEquals(0, loss.shape.ndims)
       if expected_loss is None:
-        return state_ops.assign_add(global_step, 1).op
+        return distribute_lib.increment_var(global_step)
       assert_loss = assert_close(
           math_ops.to_float(expected_loss, name='expected'),
           loss,
           name='assert_loss')
       with ops.control_dependencies((assert_loss,)):
-        return state_ops.assign_add(global_step, 1).op
+        return distribute_lib.increment_var(global_step)
 
     mock_optimizer = test.mock.NonCallableMock(
         spec=optimizer.Optimizer,
@@ -1071,6 +1071,8 @@ class BaselineClassifierEvaluationTest(test.TestCase):
           ops.GraphKeys.GLOBAL_STEP: 100,
           metric_keys.MetricKeys.LOSS_MEAN: 1.3133,
           metric_keys.MetricKeys.ACCURACY: 0.,
+          metric_keys.MetricKeys.PRECISION: 0.,
+          metric_keys.MetricKeys.RECALL: 0.,
           metric_keys.MetricKeys.PREDICTION_MEAN: 0.2689,
           metric_keys.MetricKeys.LABEL_MEAN: 1.,
           metric_keys.MetricKeys.ACCURACY_BASELINE: 1,
@@ -1132,6 +1134,8 @@ class BaselineClassifierEvaluationTest(test.TestCase):
           ops.GraphKeys.GLOBAL_STEP: 100,
           metric_keys.MetricKeys.LOSS_MEAN: expected_loss / 2,
           metric_keys.MetricKeys.ACCURACY: 0.5,
+          metric_keys.MetricKeys.PRECISION: 0.,
+          metric_keys.MetricKeys.RECALL: 0.,
           metric_keys.MetricKeys.PREDICTION_MEAN: 0.2689,
           metric_keys.MetricKeys.LABEL_MEAN: 0.5,
           metric_keys.MetricKeys.ACCURACY_BASELINE: 0.5,
@@ -1207,6 +1211,8 @@ class BaselineClassifierEvaluationTest(test.TestCase):
           ops.GraphKeys.GLOBAL_STEP: 100,
           metric_keys.MetricKeys.LOSS_MEAN: loss_mean,
           metric_keys.MetricKeys.ACCURACY: 2. / (1. + 2.),
+          metric_keys.MetricKeys.PRECISION: 0.,
+          metric_keys.MetricKeys.RECALL: 0.,
           metric_keys.MetricKeys.PREDICTION_MEAN: predictions_mean,
           metric_keys.MetricKeys.LABEL_MEAN: label_mean,
           metric_keys.MetricKeys.ACCURACY_BASELINE: (
@@ -1542,4 +1548,3 @@ class BaselineLogitFnTest(test.TestCase):
 
 if __name__ == '__main__':
   test.main()
-
diff --git a/tensorflow/python/estimator/canned/boosted_trees.py b/tensorflow/python/estimator/canned/boosted_trees.py
new file mode 100644
index 0000000000000000000000000000000000000000..500ea03ea7fef9c60b9f36f1d04f1f4c337371e8
--- /dev/null
+++ b/tensorflow/python/estimator/canned/boosted_trees.py
@@ -0,0 +1,736 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Estimator classes for BoostedTrees."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+from tensorflow.python.estimator import estimator
+from tensorflow.python.estimator import model_fn
+from tensorflow.python.estimator.canned import head as head_lib
+from tensorflow.python.feature_column import feature_column as feature_column_lib
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import boosted_trees_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import lookup_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops.losses import losses
+from tensorflow.python.summary import summary
+from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import session_run_hook
+from tensorflow.python.training import training_util
+from tensorflow.python.util.tf_export import tf_export
+
+_TreeHParams = collections.namedtuple(
+    'TreeHParams',
+    ['n_trees', 'max_depth', 'learning_rate', 'l1', 'l2', 'tree_complexity'])
+
+_HOLD_FOR_MULTI_CLASS_SUPPORT = object()
+_HOLD_FOR_MULTI_DIM_SUPPORT = object()
+
+
+def _get_transformed_features(features, feature_columns):
+  """Gets the transformed features from features/feature_columns pair.
+
+  Args:
+    features: a dicionary of name to Tensor.
+    feature_columns: a list/set of tf.feature_column.
+
+  Returns:
+    result_features: a list of the transformed features, sorted by the name.
+    num_buckets: the maximum number of buckets across bucketized_columns.
+
+  Raises:
+    ValueError: when unsupported features/columns are tried.
+  """
+  num_buckets = 1
+  # pylint:disable=protected-access
+  for fc in feature_columns:
+    if isinstance(fc, feature_column_lib._BucketizedColumn):
+      # N boundaries creates (N+1) buckets.
+      num_buckets = max(num_buckets, len(fc.boundaries) + 1)
+    else:
+      raise ValueError('For now, only bucketized_column is supported but '
+                       'got: {}'.format(fc))
+  transformed = feature_column_lib._transform_features(features,
+                                                       feature_columns)
+  # pylint:enable=protected-access
+  result_features = []
+  for column in sorted(transformed, key=lambda tc: tc.name):
+    source_name = column.source_column.name
+    squeezed_tensor = array_ops.squeeze(transformed[column], axis=1)
+    if len(squeezed_tensor.shape) > 1:
+      raise ValueError('For now, only supports features equivalent to rank 1 '
+                       'but column `{}` got: {}'.format(
+                           source_name, features[source_name].shape))
+    result_features.append(squeezed_tensor)
+  return result_features, num_buckets
+
+
+def _keep_as_local_variable(tensor, name=None):
+  """Stores a tensor as a local Variable for faster read."""
+  return variable_scope.variable(
+      initial_value=tensor,
+      trainable=False,
+      collections=[ops.GraphKeys.LOCAL_VARIABLES],
+      validate_shape=False,
+      name=name)
+
+
+class _CacheTrainingStatesUsingHashTable(object):
+  """Caching logits, etc. using MutableHashTable."""
+
+  def __init__(self, example_ids, logits_dimension):
+    """Creates a cache with the given configuration.
+
+    It maintains a MutableDenseHashTable for all values.
+    The API lookup() and insert() would have those specs,
+      tree_ids: shape=[batch_size], dtype=int32
+      node_ids: shape=[batch_size], dtype=int32
+      logits: shape=[batch_size, logits_dimension], dtype=float32
+    However in the MutableDenseHashTable, ids are bitcasted into float32 and
+    all values are concatenated as a single tensor (of float32).
+
+    Hence conversion happens internally before inserting to the HashTable and
+    after lookup from it.
+
+    Args:
+      example_ids: a Rank 1 tensor to be used as a key of the cache.
+      logits_dimension: a constant (int) for the dimension of logits.
+
+    Raises:
+      ValueError: if example_ids is other than int64 or string.
+    """
+    if dtypes.as_dtype(dtypes.int64).is_compatible_with(example_ids.dtype):
+      empty_key = -1 << 62
+    elif dtypes.as_dtype(dtypes.string).is_compatible_with(example_ids.dtype):
+      empty_key = ''
+    else:
+      raise ValueError('Unsupported example_id_feature dtype %s.',
+                       example_ids.dtype)
+    # Cache holds latest <tree_id, node_id, logits> for each example.
+    # tree_id and node_id are both int32 but logits is a float32.
+    # To reduce the overhead, we store all of them together as float32 and
+    # bitcast the ids to int32.
+    self._table_ref = lookup_ops.mutable_dense_hash_table_v2(
+        empty_key=empty_key, value_dtype=dtypes.float32, value_shape=[3])
+    self._example_ids = example_ids
+    self._logits_dimension = logits_dimension
+
+  def lookup(self):
+    """Returns cached_tree_ids, cached_node_ids, cached_logits."""
+    cached_tree_ids, cached_node_ids, cached_logits = array_ops.split(
+        lookup_ops.lookup_table_find_v2(
+            self._table_ref, self._example_ids, default_value=[0.0, 0.0, 0.0]),
+        [1, 1, self._logits_dimension],
+        axis=1)
+    cached_tree_ids = array_ops.squeeze(
+        array_ops.bitcast(cached_tree_ids, dtypes.int32))
+    cached_node_ids = array_ops.squeeze(
+        array_ops.bitcast(cached_node_ids, dtypes.int32))
+    return (cached_tree_ids, cached_node_ids, cached_logits)
+
+  def insert(self, tree_ids, node_ids, logits):
+    """Inserts values and returns the op."""
+    insert_op = lookup_ops.lookup_table_insert_v2(
+        self._table_ref, self._example_ids,
+        array_ops.concat(
+            [
+                array_ops.expand_dims(
+                    array_ops.bitcast(tree_ids, dtypes.float32), 1),
+                array_ops.expand_dims(
+                    array_ops.bitcast(node_ids, dtypes.float32), 1),
+                logits,
+            ],
+            axis=1,
+            name='value_concat_for_cache_insert'))
+    return insert_op
+
+
+class _CacheTrainingStatesUsingVariables(object):
+  """Caching logits, etc. using Variables."""
+
+  def __init__(self, batch_size, logits_dimension):
+    """Creates a cache with the given configuration.
+
+    It maintains three variables, tree_ids, node_ids, logits, for caching.
+      tree_ids: shape=[batch_size], dtype=int32
+      node_ids: shape=[batch_size], dtype=int32
+      logits: shape=[batch_size, logits_dimension], dtype=float32
+
+    Note, this can be used only with in-memory data setting.
+
+    Args:
+      batch_size: `int`, the size of the cache.
+      logits_dimension: a constant (int) for the dimension of logits.
+    """
+    self._logits_dimension = logits_dimension
+    self._tree_ids = _keep_as_local_variable(
+        array_ops.zeros([batch_size], dtype=dtypes.int32),
+        name='tree_ids_cache')
+    self._node_ids = _keep_as_local_variable(
+        array_ops.zeros([batch_size], dtype=dtypes.int32),
+        name='node_ids_cache')
+    self._logits = _keep_as_local_variable(
+        array_ops.zeros([batch_size, logits_dimension], dtype=dtypes.float32),
+        name='logits_cache')
+
+  def lookup(self):
+    """Returns cached_tree_ids, cached_node_ids, cached_logits."""
+    return (self._tree_ids, self._node_ids, self._logits)
+
+  def insert(self, tree_ids, node_ids, logits):
+    """Inserts values and returns the op."""
+    return control_flow_ops.group(
+        [
+            self._tree_ids.assign(tree_ids),
+            self._node_ids.assign(node_ids),
+            self._logits.assign(logits)
+        ],
+        name='cache_insert')
+
+
+class StopAtAttemptsHook(session_run_hook.SessionRunHook):
+  """Hook that requests stop at the number of trees."""
+
+  def __init__(self, num_finalized_trees_tensor, num_attempted_layers_tensor,
+               max_trees, max_depth):
+    self._num_finalized_trees_tensor = num_finalized_trees_tensor
+    self._num_attempted_layers_tensor = num_attempted_layers_tensor
+    self._max_trees = max_trees
+    self._max_depth = max_depth
+
+  def before_run(self, run_context):
+    return session_run_hook.SessionRunArgs(
+        [self._num_finalized_trees_tensor, self._num_attempted_layers_tensor])
+
+  def after_run(self, run_context, run_values):
+    num_finalized_trees, num_attempted_layers = run_values.results
+    if (num_finalized_trees >= self._max_trees or
+        1.0 * num_attempted_layers / self._max_depth > 2 * self._max_trees):
+      run_context.request_stop()
+
+
+class StopAtNumTreesHook(session_run_hook.SessionRunHook):
+  """Hook that requests stop at the number of trees."""
+
+  def __init__(self, num_trees_tensor, max_trees):
+    self._num_trees_tensor = num_trees_tensor
+    self._max_trees = max_trees
+
+  def before_run(self, run_context):
+    return session_run_hook.SessionRunArgs(self._num_trees_tensor)
+
+  def after_run(self, run_context, run_values):
+    num_trees = run_values.results
+    if num_trees > self._max_trees:
+      run_context.request_stop()
+
+
+def _bt_model_fn(
+    features,
+    labels,
+    mode,
+    head,
+    feature_columns,
+    tree_hparams,
+    n_batches_per_layer,
+    config,
+    closed_form_grad_and_hess_fn=None,
+    example_id_column_name=None,
+    # TODO(youngheek): replace this later using other options.
+    train_in_memory=False,
+    name='boosted_trees'):
+  """Gradient Boosted Trees model_fn.
+
+  Args:
+    features: dict of `Tensor`.
+    labels: `Tensor` of shape [batch_size, 1] or [batch_size] labels of
+      dtype `int32` or `int64` in the range `[0, n_classes)`.
+    mode: Defines whether this is training, evaluation or prediction.
+      See `ModeKeys`.
+    head: A `head_lib._Head` instance.
+    feature_columns: Iterable of `feature_column._FeatureColumn` model inputs.
+    tree_hparams: TODO. collections.namedtuple for hyper parameters.
+    n_batches_per_layer: A `Tensor` of `int64`. Each layer is built after at
+      least n_batches_per_layer accumulations.
+    config: `RunConfig` object to configure the runtime settings.
+    closed_form_grad_and_hess_fn: a function that accepts logits and labels
+      and returns gradients and hessians. By default, they are created by
+      tf.gradients() from the loss.
+    example_id_column_name: Name of the feature for a unique ID per example.
+      Currently experimental -- not exposed to public API.
+    train_in_memory: `bool`, when true, it assumes the dataset is in memory,
+      i.e., input_fn should return the entire dataset as a single batch, and
+      also n_batches_per_layer should be set as 1.
+    name: Name to use for the model.
+
+  Returns:
+      An `EstimatorSpec` instance.
+
+  Raises:
+    ValueError: mode or params are invalid, or features has the wrong type.
+  """
+  is_single_machine = (config.num_worker_replicas <= 1)
+  if train_in_memory:
+    assert n_batches_per_layer == 1, (
+        'When train_in_memory is enabled, input_fn should return the entire '
+        'dataset as a single batch, and n_batches_per_layer should be set as '
+        '1.')
+  worker_device = control_flow_ops.no_op().device
+  # maximum number of splits possible in the whole tree =2^(D-1)-1
+  # TODO(youngheek): perhaps storage could be optimized by storing stats with
+  # the dimension max_splits_per_layer, instead of max_splits (for the entire
+  # tree).
+  max_splits = (1 << tree_hparams.max_depth) - 1
+  with ops.name_scope(name) as name:
+    # Prepare.
+    global_step = training_util.get_or_create_global_step()
+    input_feature_list, num_buckets = _get_transformed_features(
+        features, feature_columns)
+    if train_in_memory and mode == model_fn.ModeKeys.TRAIN:
+      input_feature_list = [
+          _keep_as_local_variable(feature) for feature in input_feature_list
+      ]
+    num_features = len(input_feature_list)
+
+    cache = None
+    if mode == model_fn.ModeKeys.TRAIN:
+      if train_in_memory and is_single_machine:  # maybe just train_in_memory?
+        batch_size = array_ops.shape(input_feature_list[0])[0]
+        cache = _CacheTrainingStatesUsingVariables(batch_size,
+                                                   head.logits_dimension)
+      elif example_id_column_name:
+        example_ids = features[example_id_column_name]
+        cache = _CacheTrainingStatesUsingHashTable(example_ids,
+                                                   head.logits_dimension)
+
+    # Create Ensemble resources.
+    if is_single_machine:
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(name=name)
+      local_tree_ensemble = tree_ensemble
+      ensemble_reload = control_flow_ops.no_op()
+    else:
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(name=name)
+      with ops.device(worker_device):
+        local_tree_ensemble = boosted_trees_ops.TreeEnsemble(
+            name=name + '_local', is_local=True)
+      # TODO(soroush): Do partial updates if this becomes a bottleneck.
+      ensemble_reload = local_tree_ensemble.deserialize(
+          *tree_ensemble.serialize())
+
+    # Create logits.
+    if mode != model_fn.ModeKeys.TRAIN:
+      logits = boosted_trees_ops.predict(
+          tree_ensemble_handle=local_tree_ensemble.resource_handle,
+          bucketized_features=input_feature_list,
+          logits_dimension=head.logits_dimension,
+          max_depth=tree_hparams.max_depth)
+    else:
+      if cache:
+        cached_tree_ids, cached_node_ids, cached_logits = cache.lookup()
+      else:
+        # Always start from the beginning when no cache is set up.
+        batch_size = array_ops.shape(input_feature_list[0])[0]
+        cached_tree_ids, cached_node_ids, cached_logits = (
+            array_ops.zeros([batch_size], dtype=dtypes.int32),
+            array_ops.zeros([batch_size], dtype=dtypes.int32),
+            array_ops.zeros(
+                [batch_size, head.logits_dimension], dtype=dtypes.float32))
+      with ops.control_dependencies([ensemble_reload]):
+        (stamp_token, num_trees, num_finalized_trees,
+         num_attempted_layers) = local_tree_ensemble.get_states()
+        summary.scalar('ensemble/num_trees', num_trees)
+        summary.scalar('ensemble/num_finalized_trees', num_finalized_trees)
+        summary.scalar('ensemble/num_attempted_layers', num_attempted_layers)
+
+        partial_logits, tree_ids, node_ids = boosted_trees_ops.training_predict(
+            tree_ensemble_handle=local_tree_ensemble.resource_handle,
+            cached_tree_ids=cached_tree_ids,
+            cached_node_ids=cached_node_ids,
+            bucketized_features=input_feature_list,
+            logits_dimension=head.logits_dimension,
+            max_depth=tree_hparams.max_depth)
+      logits = cached_logits + partial_logits
+
+    # Create training graph.
+    def _train_op_fn(loss):
+      """Run one training iteration."""
+      train_op = []
+      if cache:
+        train_op.append(cache.insert(tree_ids, node_ids, logits))
+      if closed_form_grad_and_hess_fn:
+        gradients, hessians = closed_form_grad_and_hess_fn(logits, labels)
+      else:
+        gradients = gradients_impl.gradients(loss, logits, name='Gradients')[0]
+        hessians = gradients_impl.gradients(
+            gradients, logits, name='Hessians')[0]
+      stats_summary_list = [
+          array_ops.squeeze(
+              boosted_trees_ops.make_stats_summary(
+                  node_ids=node_ids,
+                  gradients=gradients,
+                  hessians=hessians,
+                  bucketized_features_list=[input_feature_list[f]],
+                  max_splits=max_splits,
+                  num_buckets=num_buckets),
+              axis=0) for f in range(num_features)
+      ]
+
+      def grow_tree_from_stats_summaries(stats_summary_list):
+        """Updates ensemble based on the best gains from stats summaries."""
+        (node_ids_per_feature, gains_list, thresholds_list,
+         left_node_contribs_list, right_node_contribs_list) = (
+             boosted_trees_ops.calculate_best_gains_per_feature(
+                 node_id_range=array_ops.stack([
+                     math_ops.reduce_min(node_ids),
+                     math_ops.reduce_max(node_ids)
+                 ]),
+                 stats_summary_list=stats_summary_list,
+                 l1=tree_hparams.l1,
+                 l2=tree_hparams.l2,
+                 tree_complexity=tree_hparams.tree_complexity,
+                 max_splits=max_splits))
+        grow_op = boosted_trees_ops.update_ensemble(
+            # Confirm if local_tree_ensemble or tree_ensemble should be used.
+            tree_ensemble.resource_handle,
+            feature_ids=math_ops.range(0, num_features, dtype=dtypes.int32),
+            node_ids=node_ids_per_feature,
+            gains=gains_list,
+            thresholds=thresholds_list,
+            left_node_contribs=left_node_contribs_list,
+            right_node_contribs=right_node_contribs_list,
+            learning_rate=tree_hparams.learning_rate,
+            max_depth=tree_hparams.max_depth,
+            pruning_mode=boosted_trees_ops.PruningMode.NO_PRUNING)
+        return grow_op
+
+      if train_in_memory and is_single_machine:
+        train_op.append(distribute_lib.increment_var(global_step))
+        train_op.append(grow_tree_from_stats_summaries(stats_summary_list))
+      else:
+        summary_accumulator = data_flow_ops.ConditionalAccumulator(
+            dtype=dtypes.float32,
+            # The stats consist of gradients and hessians (the last dimension).
+            shape=[num_features, max_splits, num_buckets, 2],
+            shared_name='stats_summary_accumulator')
+        apply_grad = summary_accumulator.apply_grad(
+            array_ops.stack(stats_summary_list, axis=0), stamp_token)
+
+        def grow_tree_from_accumulated_summaries_fn():
+          """Updates the tree with the best layer from accumulated summaries."""
+          # Take out the accumulated summaries from the accumulator and grow.
+          stats_summary_list = array_ops.unstack(
+              summary_accumulator.take_grad(1), axis=0)
+          grow_op = grow_tree_from_stats_summaries(stats_summary_list)
+          return grow_op
+
+        with ops.control_dependencies([apply_grad]):
+          train_op.append(distribute_lib.increment_var(global_step))
+          if config.is_chief:
+            train_op.append(
+                control_flow_ops.cond(
+                    math_ops.greater_equal(
+                        summary_accumulator.num_accumulated(),
+                        n_batches_per_layer),
+                    grow_tree_from_accumulated_summaries_fn,
+                    control_flow_ops.no_op,
+                    name='wait_until_n_batches_accumulated'))
+
+      return control_flow_ops.group(train_op, name='train_op')
+
+  estimator_spec = head.create_estimator_spec(
+      features=features,
+      mode=mode,
+      labels=labels,
+      train_op_fn=_train_op_fn,
+      logits=logits)
+  if mode == model_fn.ModeKeys.TRAIN:
+    # Add an early stop hook.
+    estimator_spec = estimator_spec._replace(
+        training_hooks=estimator_spec.training_hooks +
+        (StopAtNumTreesHook(num_trees, tree_hparams.n_trees),))
+  return estimator_spec
+
+
+def _create_classification_head(n_classes,
+                                weight_column=None,
+                                label_vocabulary=None):
+  """Creates a classification head. Refer to canned.head for details on args."""
+  # TODO(nponomareva): Support multi-class cases.
+  if n_classes == 2:
+    # pylint: disable=protected-access
+    return head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+        weight_column=weight_column,
+        label_vocabulary=label_vocabulary,
+        loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
+    # pylint: enable=protected-access
+  else:
+    raise ValueError('For now only binary classification is supported.'
+                     'n_classes given as {}'.format(n_classes))
+
+
+def _create_classification_head_and_closed_form(n_classes, weight_column,
+                                                label_vocabulary):
+  """Creates a head for classifier and the closed form gradients/hessians."""
+  head = _create_classification_head(n_classes, weight_column, label_vocabulary)
+  if n_classes == 2 and weight_column is None and label_vocabulary is None:
+    # Use the closed-form gradients/hessians for 2 class.
+    def _grad_and_hess_for_logloss(logits, labels):
+      # TODO(youngheek): add weights handling.
+      predictions = math_ops.reciprocal(math_ops.exp(-logits) + 1.0)
+      normalizer = math_ops.reciprocal(
+          math_ops.cast(array_ops.size(predictions), dtypes.float32))
+      gradients = (predictions - labels) * normalizer
+      hessians = predictions * (1.0 - predictions) * normalizer
+      return gradients, hessians
+
+    closed_form = _grad_and_hess_for_logloss
+  else:
+    closed_form = None
+  return (head, closed_form)
+
+
+def _create_regression_head(label_dimension, weight_column=None):
+  if label_dimension != 1:
+    raise ValueError('For now only 1 dimension regression is supported.'
+                     'label_dimension given as {}'.format(label_dimension))
+  # pylint: disable=protected-access
+  return head_lib._regression_head_with_mean_squared_error_loss(
+      label_dimension=label_dimension,
+      weight_column=weight_column,
+      loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
+  # pylint: enable=protected-access
+
+
+@tf_export('estimator.BoostedTreesClassifier')
+class BoostedTreesClassifier(estimator.Estimator):
+  """A Classifier for Tensorflow Boosted Trees models."""
+
+  def __init__(
+      self,
+      feature_columns,
+      n_batches_per_layer,
+      model_dir=None,
+      n_classes=_HOLD_FOR_MULTI_CLASS_SUPPORT,
+      weight_column=None,
+      label_vocabulary=None,
+      n_trees=100,
+      max_depth=6,
+      learning_rate=0.1,
+      l1_regularization=0.,
+      l2_regularization=0.,
+      tree_complexity=0.,
+      config=None):
+    """Initializes a `BoostedTreesClassifier` instance.
+
+    Example:
+
+    ```python
+    bucketized_feature_1 = bucketized_column(
+      numeric_column('feature_1'), BUCKET_BOUNDARIES_1)
+    bucketized_feature_2 = bucketized_column(
+      numeric_column('feature_2'), BUCKET_BOUNDARIES_2)
+
+    classifier = estimator.BoostedTreesClassifier(
+        feature_columns=[bucketized_feature_1, bucketized_feature_2],
+        n_trees=100,
+        ... <some other params>
+    )
+
+    def input_fn_train():
+      ...
+      return dataset
+
+    classifier.train(input_fn=input_fn_train)
+
+    def input_fn_eval():
+      ...
+      return dataset
+
+    metrics = classifier.evaluate(input_fn=input_fn_eval)
+    ```
+
+    Args:
+      feature_columns: An iterable containing all the feature columns used by
+        the model. All items in the set should be instances of classes derived
+        from `FeatureColumn`.
+      n_batches_per_layer: the number of batches to collect statistics per
+        layer.
+      model_dir: Directory to save model parameters, graph and etc. This can
+        also be used to load checkpoints from the directory into a estimator
+        to continue training a previously saved model.
+      n_classes: number of label classes. Default is binary classification.
+        Multiclass support is not yet implemented.
+      weight_column: A string or a `_NumericColumn` created by
+        `tf.feature_column.numeric_column` defining feature column representing
+        weights. It is used to downweight or boost examples during training. It
+        will be multiplied by the loss of the example. If it is a string, it is
+        used as a key to fetch weight tensor from the `features`. If it is a
+        `_NumericColumn`, raw tensor is fetched by key `weight_column.key`,
+        then weight_column.normalizer_fn is applied on it to get weight tensor.
+      label_vocabulary: A list of strings represents possible label values. If
+        given, labels must be string type and have any value in
+        `label_vocabulary`. If it is not given, that means labels are
+        already encoded as integer or float within [0, 1] for `n_classes=2` and
+        encoded as integer values in {0, 1,..., n_classes-1} for `n_classes`>2 .
+        Also there will be errors if vocabulary is not provided and labels are
+        string.
+      n_trees: number trees to be created.
+      max_depth: maximum depth of the tree to grow.
+      learning_rate: shrinkage parameter to be used when a tree added to the
+        model.
+      l1_regularization: regularization multiplier applied to the absolute
+        weights of the tree leafs.
+      l2_regularization: regularization multiplier applied to the square weights
+        of the tree leafs.
+      tree_complexity: regularization factor to penalize trees with more leaves.
+      config: `RunConfig` object to configure the runtime settings.
+
+    Raises:
+      ValueError: when wrong arguments are given or unsupported functionalities
+         are requested.
+    """
+    # TODO(nponomareva): Support multi-class cases.
+    if n_classes == _HOLD_FOR_MULTI_CLASS_SUPPORT:
+      n_classes = 2
+    head, closed_form = _create_classification_head_and_closed_form(
+        n_classes, weight_column, label_vocabulary=label_vocabulary)
+
+    # HParams for the model.
+    tree_hparams = _TreeHParams(
+        n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
+        tree_complexity)
+
+    def _model_fn(features, labels, mode, config):
+      return _bt_model_fn(  # pylint: disable=protected-access
+          features,
+          labels,
+          mode,
+          head,
+          feature_columns,
+          tree_hparams,
+          n_batches_per_layer,
+          config,
+          closed_form_grad_and_hess_fn=closed_form)
+
+    super(BoostedTreesClassifier, self).__init__(
+        model_fn=_model_fn, model_dir=model_dir, config=config)
+
+
+@tf_export('estimator.BoostedTreesRegressor')
+class BoostedTreesRegressor(estimator.Estimator):
+  """A Regressor for Tensorflow Boosted Trees models."""
+
+  def __init__(
+      self,
+      feature_columns,
+      n_batches_per_layer,
+      model_dir=None,
+      label_dimension=_HOLD_FOR_MULTI_DIM_SUPPORT,
+      weight_column=None,
+      n_trees=100,
+      max_depth=6,
+      learning_rate=0.1,
+      l1_regularization=0.,
+      l2_regularization=0.,
+      tree_complexity=0.,
+      config=None):
+    """Initializes a `BoostedTreesRegressor` instance.
+
+    Example:
+
+    ```python
+    bucketized_feature_1 = bucketized_column(
+      numeric_column('feature_1'), BUCKET_BOUNDARIES_1)
+    bucketized_feature_2 = bucketized_column(
+      numeric_column('feature_2'), BUCKET_BOUNDARIES_2)
+
+    regressor = estimator.BoostedTreesRegressor(
+        feature_columns=[bucketized_feature_1, bucketized_feature_2],
+        n_trees=100,
+        ... <some other params>
+    )
+
+    def input_fn_train():
+      ...
+      return dataset
+
+    regressor.train(input_fn=input_fn_train)
+
+    def input_fn_eval():
+      ...
+      return dataset
+
+    metrics = regressor.evaluate(input_fn=input_fn_eval)
+    ```
+
+    Args:
+      feature_columns: An iterable containing all the feature columns used by
+        the model. All items in the set should be instances of classes derived
+        from `FeatureColumn`.
+      n_batches_per_layer: the number of batches to collect statistics per
+        layer.
+      model_dir: Directory to save model parameters, graph and etc. This can
+        also be used to load checkpoints from the directory into a estimator
+        to continue training a previously saved model.
+      label_dimension: Number of regression targets per example.
+        Multi-dimensional support is not yet implemented.
+      weight_column: A string or a `_NumericColumn` created by
+        `tf.feature_column.numeric_column` defining feature column representing
+        weights. It is used to downweight or boost examples during training. It
+        will be multiplied by the loss of the example. If it is a string, it is
+        used as a key to fetch weight tensor from the `features`. If it is a
+        `_NumericColumn`, raw tensor is fetched by key `weight_column.key`,
+        then weight_column.normalizer_fn is applied on it to get weight tensor.
+      n_trees: number trees to be created.
+      max_depth: maximum depth of the tree to grow.
+      learning_rate: shrinkage parameter to be used when a tree added to the
+        model.
+      l1_regularization: regularization multiplier applied to the absolute
+        weights of the tree leafs.
+      l2_regularization: regularization multiplier applied to the square weights
+        of the tree leafs.
+      tree_complexity: regularization factor to penalize trees with more leaves.
+      config: `RunConfig` object to configure the runtime settings.
+
+    Raises:
+      ValueError: when wrong arguments are given or unsupported functionalities
+         are requested.
+    """
+    # TODO(nponomareva): Extend it to multi-dimension cases.
+    if label_dimension == _HOLD_FOR_MULTI_DIM_SUPPORT:
+      label_dimension = 1
+    head = _create_regression_head(label_dimension, weight_column)
+
+    # HParams for the model.
+    tree_hparams = _TreeHParams(
+        n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
+        tree_complexity)
+
+    def _model_fn(features, labels, mode, config):
+      return _bt_model_fn(  # pylint: disable=protected-access
+          features, labels, mode, head, feature_columns, tree_hparams,
+          n_batches_per_layer, config)
+
+    super(BoostedTreesRegressor, self).__init__(
+        model_fn=_model_fn, model_dir=model_dir, config=config)
diff --git a/tensorflow/python/estimator/canned/boosted_trees_test.py b/tensorflow/python/estimator/canned/boosted_trees_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..01e5cc7a5d6eb0a26fd47f7d7f9bfb566520b246
--- /dev/null
+++ b/tensorflow/python/estimator/canned/boosted_trees_test.py
@@ -0,0 +1,799 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests boosted_trees estimators and model_fn."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.core.kernels.boosted_trees import boosted_trees_pb2
+from tensorflow.python.estimator import model_fn
+from tensorflow.python.estimator import run_config
+from tensorflow.python.estimator.canned import boosted_trees
+from tensorflow.python.estimator.inputs import numpy_io
+from tensorflow.python.feature_column import feature_column
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import gen_boosted_trees_ops
+from tensorflow.python.ops import resources
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import googletest
+from tensorflow.python.training import checkpoint_utils
+
+NUM_FEATURES = 3
+
+BUCKET_BOUNDARIES = [-2., .5, 12.]  # Boundaries for all the features.
+INPUT_FEATURES = np.array(
+    [
+        [12.5, 1.0, -2.001, -2.0001, -1.999],  # feature_0 quantized:[3,2,0,0,1]
+        [2.0, -3.0, 0.5, 0.0, 0.4995],         # feature_1 quantized:[2,0,2,1,1]
+        [3.0, 20.0, 50.0, -100.0, 102.75],     # feature_2 quantized:[2,3,3,0,3]
+    ],
+    dtype=np.float32)
+CLASSIFICATION_LABELS = [[0.], [1.], [1.], [0.], [0.]]
+REGRESSION_LABELS = [[1.5], [0.3], [0.2], [2.], [5.]]
+FEATURES_DICT = {'f_%d' % i: INPUT_FEATURES[i] for i in range(NUM_FEATURES)}
+
+# EXAMPLE_ID is not exposed to Estimator yet, but supported at model_fn level.
+EXAMPLE_IDS = np.array([0, 1, 2, 3, 4], dtype=np.int64)
+EXAMPLE_ID_COLUMN = '__example_id__'
+
+
+def _make_train_input_fn(is_classification):
+  """Makes train input_fn for classification/regression."""
+
+  def _input_fn():
+    features = dict(FEATURES_DICT)
+    features[EXAMPLE_ID_COLUMN] = constant_op.constant(EXAMPLE_IDS)
+    if is_classification:
+      labels = CLASSIFICATION_LABELS
+    else:
+      labels = REGRESSION_LABELS
+    return features, labels
+
+  return _input_fn
+
+
+class BoostedTreesClassifierTest(test_util.TensorFlowTestCase):
+
+  def setUp(self):
+    self._feature_columns = {
+        feature_column.bucketized_column(
+            feature_column.numeric_column('f_%d' % i, dtype=dtypes.float32),
+            BUCKET_BOUNDARIES)
+        for i in range(NUM_FEATURES)
+    }
+
+  def _assert_checkpoint(self, model_dir, expected_global_step):
+    self.assertEqual(expected_global_step,
+                     checkpoint_utils.load_variable(model_dir,
+                                                    ops.GraphKeys.GLOBAL_STEP))
+
+  def testTrainAndEvaluateBinaryClassifier(self):
+    input_fn = _make_train_input_fn(is_classification=True)
+
+    est = boosted_trees.BoostedTreesClassifier(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=1,
+        max_depth=5)
+
+    # It will stop after 5 steps because of the max depth and num trees.
+    num_steps = 100
+    # Train for a few steps, and validate final checkpoint.
+    est.train(input_fn, steps=num_steps)
+    self._assert_checkpoint(est.model_dir, 6)
+    eval_res = est.evaluate(input_fn=input_fn, steps=1)
+    self.assertAllClose(eval_res['accuracy'], 1.0)
+
+  def testInferBinaryClassifier(self):
+    train_input_fn = _make_train_input_fn(is_classification=True)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.BoostedTreesClassifier(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=1,
+        max_depth=5)
+
+    # It will stop after 5 steps because of the max depth and num trees.
+    num_steps = 100
+    # Train for a few steps, and validate final checkpoint.
+    est.train(train_input_fn, steps=num_steps)
+
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertEquals(5, len(predictions))
+    # All labels are correct.
+    self.assertAllClose([0], predictions[0]['class_ids'])
+    self.assertAllClose([1], predictions[1]['class_ids'])
+    self.assertAllClose([1], predictions[2]['class_ids'])
+    self.assertAllClose([0], predictions[3]['class_ids'])
+    self.assertAllClose([0], predictions[4]['class_ids'])
+
+
+class BoostedTreesRegressionTest(test_util.TensorFlowTestCase):
+
+  def setUp(self):
+    self._feature_columns = {
+        feature_column.bucketized_column(
+            feature_column.numeric_column('f_%d' % i, dtype=dtypes.float32),
+            BUCKET_BOUNDARIES)
+        for i in range(NUM_FEATURES)
+    }
+
+  def _assert_checkpoint(self, model_dir, expected_global_step):
+    self.assertEqual(expected_global_step,
+                     checkpoint_utils.load_variable(model_dir,
+                                                    ops.GraphKeys.GLOBAL_STEP))
+
+  def testTrainAndEvaluateRegressor(self):
+    input_fn = _make_train_input_fn(is_classification=False)
+
+    est = boosted_trees.BoostedTreesRegressor(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=2,
+        max_depth=5)
+
+    # It will stop after 10 steps because of the max depth and num trees.
+    num_steps = 100
+    # Train for a few steps, and validate final checkpoint.
+    est.train(input_fn, steps=num_steps)
+    self._assert_checkpoint(est.model_dir, 11)
+    eval_res = est.evaluate(input_fn=input_fn, steps=1)
+    self.assertAllClose(eval_res['average_loss'], 0.913176)
+
+  def testInferRegressor(self):
+    train_input_fn = _make_train_input_fn(is_classification=False)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.BoostedTreesRegressor(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=1,
+        max_depth=5)
+
+    # It will stop after 5 steps because of the max depth and num trees.
+    num_steps = 100
+    # Train for a few steps, and validate final checkpoint.
+    est.train(train_input_fn, steps=num_steps)
+    self._assert_checkpoint(est.model_dir, 6)
+
+    predictions = list(est.predict(input_fn=predict_input_fn))
+
+    self.assertEquals(5, len(predictions))
+    self.assertAllClose([0.703549], predictions[0]['predictions'])
+    self.assertAllClose([0.266539], predictions[1]['predictions'])
+    self.assertAllClose([0.256479], predictions[2]['predictions'])
+    self.assertAllClose([1.088732], predictions[3]['predictions'])
+    self.assertAllClose([1.901732], predictions[4]['predictions'])
+
+
+class ModelFnTests(test_util.TensorFlowTestCase):
+  """Tests bt_model_fn including unexposed internal functionalities."""
+
+  def setUp(self):
+    self._feature_columns = {
+        feature_column.bucketized_column(
+            feature_column.numeric_column('f_%d' % i, dtype=dtypes.float32),
+            BUCKET_BOUNDARIES) for i in range(NUM_FEATURES)
+    }
+    self._tree_hparams = boosted_trees._TreeHParams(  # pylint:disable=protected-access
+        n_trees=2,
+        max_depth=2,
+        learning_rate=0.1,
+        l1=0.,
+        l2=0.01,
+        tree_complexity=0.)
+
+  def _get_expected_ensembles_for_classification(self):
+    first_round = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 2
+              threshold: 2
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 0.387675
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.181818
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.0625
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+        }
+        """
+    second_round = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 2
+              threshold: 2
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 0.387675
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 3
+              left_id: 3
+              right_id: 4
+            }
+            metadata {
+              gain: 0.0
+              original_leaf {
+                scalar: -0.181818
+              }
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 0
+              left_id: 5
+              right_id: 6
+            }
+            metadata {
+              gain: 0.105518
+              original_leaf {
+                scalar: 0.0625
+              }
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.348397
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.181818
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.224091
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.056815
+            }
+          }
+        }
+        trees {
+          nodes {
+            leaf {
+              scalar: 0.0
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 2
+          is_finalized: true
+        }
+        tree_metadata {
+          num_layers_grown: 0
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 2
+        }
+        """
+    third_round = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 2
+              threshold: 2
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 0.387675
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 3
+              left_id: 3
+              right_id: 4
+            }
+            metadata {
+              gain: 0.0
+              original_leaf {
+                scalar: -0.181818
+              }
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 0
+              left_id: 5
+              right_id: 6
+            }
+            metadata {
+              gain: 0.105518
+              original_leaf {
+                scalar: 0.0625
+              }
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.348397
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.181818
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.224091
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.056815
+            }
+          }
+        }
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 0
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 0.287131
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.162042
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.086986
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 2
+          is_finalized: true
+        }
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 2
+          num_layers_attempted: 3
+        }
+        """
+    return (first_round, second_round, third_round)
+
+  def _get_expected_ensembles_for_regression(self):
+    first_round = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 1
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 1.169714
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.241322
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.083951
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+        }
+        """
+    second_round = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 1
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 1.169714
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 1
+              left_id: 3
+              right_id: 4
+            }
+            metadata {
+              gain: 2.673407
+              original_leaf {
+                scalar: 0.241322
+              }
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 0
+              left_id: 5
+              right_id: 6
+            }
+            metadata {
+              gain: 0.324102
+              original_leaf {
+                scalar: 0.083951
+              }
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.563167
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.247047
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.095273
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.222102
+            }
+          }
+        }
+        trees {
+          nodes {
+            leaf {
+              scalar: 0.0
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 2
+          is_finalized: true
+        }
+        tree_metadata {
+          num_layers_grown: 0
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 2
+        }
+        """
+    third_round = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 1
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 1.169714
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 1
+              left_id: 3
+              right_id: 4
+            }
+            metadata {
+              gain: 2.673407
+              original_leaf {
+                scalar: 0.241322
+              }
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 0
+              left_id: 5
+              right_id: 6
+            }
+            metadata {
+              gain: 0.324102
+              original_leaf {
+                scalar: 0.083951
+              }
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.563167
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.247047
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.095273
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.222102
+            }
+          }
+        }
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 0
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 0.981026
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.005166
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.180281
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 2
+          is_finalized: true
+        }
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 2
+          num_layers_attempted: 3
+        }
+        """
+    return (first_round, second_round, third_round)
+
+  def _get_train_op_and_ensemble(self, head, config, is_classification,
+                                 train_in_memory):
+    """Calls bt_model_fn() and returns the train_op and ensemble_serialzed."""
+    features, labels = _make_train_input_fn(is_classification)()
+    estimator_spec = boosted_trees._bt_model_fn(  # pylint:disable=protected-access
+        features=features,
+        labels=labels,
+        mode=model_fn.ModeKeys.TRAIN,
+        head=head,
+        feature_columns=self._feature_columns,
+        tree_hparams=self._tree_hparams,
+        example_id_column_name=EXAMPLE_ID_COLUMN,
+        n_batches_per_layer=1,
+        config=config,
+        train_in_memory=train_in_memory)
+    resources.initialize_resources(resources.shared_resources()).run()
+    variables.global_variables_initializer().run()
+    variables.local_variables_initializer().run()
+
+    # Gets the train_op and serialized proto of the ensemble.
+    shared_resources = resources.shared_resources()
+    self.assertEqual(1, len(shared_resources))
+    train_op = estimator_spec.train_op
+    with ops.control_dependencies([train_op]):
+      _, ensemble_serialized = (
+          gen_boosted_trees_ops.boosted_trees_serialize_ensemble(
+              shared_resources[0].handle))
+    return train_op, ensemble_serialized
+
+  def testTrainClassifierInMemory(self):
+    ops.reset_default_graph()
+    expected_first, expected_second, expected_third = (
+        self._get_expected_ensembles_for_classification())
+    with self.test_session() as sess:
+      # Train with train_in_memory mode.
+      with sess.graph.as_default():
+        train_op, ensemble_serialized = self._get_train_op_and_ensemble(
+            boosted_trees._create_classification_head(n_classes=2),
+            run_config.RunConfig(),
+            is_classification=True,
+            train_in_memory=True)
+      _, serialized = sess.run([train_op, ensemble_serialized])
+      # Validate the trained ensemble.
+      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      ensemble_proto.ParseFromString(serialized)
+      self.assertProtoEquals(expected_first, ensemble_proto)
+
+      # Run one more time and validate the trained ensemble.
+      _, serialized = sess.run([train_op, ensemble_serialized])
+      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      ensemble_proto.ParseFromString(serialized)
+      self.assertProtoEquals(expected_second, ensemble_proto)
+
+      # Third round training and validation.
+      _, serialized = sess.run([train_op, ensemble_serialized])
+      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      ensemble_proto.ParseFromString(serialized)
+      self.assertProtoEquals(expected_third, ensemble_proto)
+
+  def testTrainClassifierNonInMemory(self):
+    ops.reset_default_graph()
+    expected_first, expected_second, expected_third = (
+        self._get_expected_ensembles_for_classification())
+    with self.test_session() as sess:
+      # Train without train_in_memory mode.
+      with sess.graph.as_default():
+        train_op, ensemble_serialized = self._get_train_op_and_ensemble(
+            boosted_trees._create_classification_head(n_classes=2),
+            run_config.RunConfig(),
+            is_classification=True,
+            train_in_memory=False)
+      _, serialized = sess.run([train_op, ensemble_serialized])
+      # Validate the trained ensemble.
+      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      ensemble_proto.ParseFromString(serialized)
+      self.assertProtoEquals(expected_first, ensemble_proto)
+
+      # Run one more time and validate the trained ensemble.
+      _, serialized = sess.run([train_op, ensemble_serialized])
+      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      ensemble_proto.ParseFromString(serialized)
+      self.assertProtoEquals(expected_second, ensemble_proto)
+
+      # Third round training and validation.
+      _, serialized = sess.run([train_op, ensemble_serialized])
+      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      ensemble_proto.ParseFromString(serialized)
+      self.assertProtoEquals(expected_third, ensemble_proto)
+
+  def testTrainRegressorInMemory(self):
+    ops.reset_default_graph()
+    expected_first, expected_second, expected_third = (
+        self._get_expected_ensembles_for_regression())
+    with self.test_session() as sess:
+      # Train with train_in_memory mode.
+      with sess.graph.as_default():
+        train_op, ensemble_serialized = self._get_train_op_and_ensemble(
+            boosted_trees._create_regression_head(label_dimension=1),
+            run_config.RunConfig(),
+            is_classification=False,
+            train_in_memory=True)
+      _, serialized = sess.run([train_op, ensemble_serialized])
+      # Validate the trained ensemble.
+      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      ensemble_proto.ParseFromString(serialized)
+      self.assertProtoEquals(expected_first, ensemble_proto)
+
+      # Run one more time and validate the trained ensemble.
+      _, serialized = sess.run([train_op, ensemble_serialized])
+      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      ensemble_proto.ParseFromString(serialized)
+      self.assertProtoEquals(expected_second, ensemble_proto)
+
+      # Third round training and validation.
+      _, serialized = sess.run([train_op, ensemble_serialized])
+      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      ensemble_proto.ParseFromString(serialized)
+      self.assertProtoEquals(expected_third, ensemble_proto)
+
+  def testTrainRegressorNonInMemory(self):
+    ops.reset_default_graph()
+    expected_first, expected_second, expected_third = (
+        self._get_expected_ensembles_for_regression())
+    with self.test_session() as sess:
+      # Train without train_in_memory mode.
+      with sess.graph.as_default():
+        train_op, ensemble_serialized = self._get_train_op_and_ensemble(
+            boosted_trees._create_regression_head(label_dimension=1),
+            run_config.RunConfig(),
+            is_classification=False,
+            train_in_memory=False)
+      _, serialized = sess.run([train_op, ensemble_serialized])
+      # Validate the trained ensemble.
+      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      ensemble_proto.ParseFromString(serialized)
+      self.assertProtoEquals(expected_first, ensemble_proto)
+
+      # Run one more time and validate the trained ensemble.
+      _, serialized = sess.run([train_op, ensemble_serialized])
+      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      ensemble_proto.ParseFromString(serialized)
+      self.assertProtoEquals(expected_second, ensemble_proto)
+
+      # Third round training and validation.
+      _, serialized = sess.run([train_op, ensemble_serialized])
+      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      ensemble_proto.ParseFromString(serialized)
+      self.assertProtoEquals(expected_third, ensemble_proto)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/estimator/canned/dnn.py b/tensorflow/python/estimator/canned/dnn.py
index 7043da8de036e5be27d223271c37e065d9ffbcdd..6382622e0b5c72e5d3fcd9b9c6863968a425b86f 100644
--- a/tensorflow/python/estimator/canned/dnn.py
+++ b/tensorflow/python/estimator/canned/dnn.py
@@ -32,7 +32,6 @@ from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops.losses import losses
 from tensorflow.python.summary import summary
-from tensorflow.python.training import training_util
 from tensorflow.python.util.tf_export import tf_export
 
 # The default learning rate of 0.05 is a historical artifact of the initial
@@ -183,17 +182,11 @@ def _dnn_model_fn(features,
         input_layer_partitioner=input_layer_partitioner)
     logits = logit_fn(features=features, mode=mode)
 
-    def _train_op_fn(loss):
-      """Returns the op to optimize the loss."""
-      return optimizer.minimize(
-          loss,
-          global_step=training_util.get_global_step())
-
     return head.create_estimator_spec(
         features=features,
         mode=mode,
         labels=labels,
-        train_op_fn=_train_op_fn,
+        optimizer=optimizer,
         logits=logits)
 
 
diff --git a/tensorflow/python/estimator/canned/dnn_linear_combined.py b/tensorflow/python/estimator/canned/dnn_linear_combined.py
index 6d0fb96057ee93964ee3571bae3b878faad88882..f47706db2fc5f9baa38a36790832a958d5098587 100644
--- a/tensorflow/python/estimator/canned/dnn_linear_combined.py
+++ b/tensorflow/python/estimator/canned/dnn_linear_combined.py
@@ -31,10 +31,10 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import partitioned_variables
-from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops.losses import losses
 from tensorflow.python.summary import summary
+from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.training import sync_replicas_optimizer
 from tensorflow.python.training import training_util
 from tensorflow.python.util.tf_export import tf_export
@@ -215,8 +215,7 @@ def _dnn_linear_combined_model_fn(features,
 
     train_op = control_flow_ops.group(*train_ops)
     with ops.control_dependencies([train_op]):
-      with ops.colocate_with(global_step):
-        return state_ops.assign_add(global_step, 1)
+      return distribute_lib.increment_var(global_step)
 
   return head.create_estimator_spec(
       features=features,
diff --git a/tensorflow/python/estimator/canned/dnn_testing_utils.py b/tensorflow/python/estimator/canned/dnn_testing_utils.py
index 9a7d088778e440b8aa4d9bc4f05a72aa86e7c106..62b13c3200dd782c14dc427b62f9b03086c7174f 100644
--- a/tensorflow/python/estimator/canned/dnn_testing_utils.py
+++ b/tensorflow/python/estimator/canned/dnn_testing_utils.py
@@ -44,16 +44,16 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import partitioned_variables
-from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import test
 from tensorflow.python.summary import summary as summary_lib
 from tensorflow.python.summary.writer import writer_cache
 from tensorflow.python.training import checkpoint_utils
+from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.training import gradient_descent
 from tensorflow.python.training import monitored_session
-from tensorflow.python.training import optimizer
+from tensorflow.python.training import optimizer as optimizer_lib
 from tensorflow.python.training import saver
 from tensorflow.python.training import session_run_hook
 from tensorflow.python.training import training_util
@@ -134,7 +134,8 @@ def mock_head(testcase, hidden_units, logits_dimension, expected_logits):
       hidden_weights_names + hidden_biases_names +
       [LOGITS_WEIGHTS_NAME + '/part_0:0', LOGITS_BIASES_NAME + '/part_0:0'])
 
-  def _create_estimator_spec(features, mode, logits, labels, train_op_fn):
+  def _create_estimator_spec(
+      features, mode, logits, labels, train_op_fn=None, optimizer=None):
     del features, labels  # Not used.
     trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
     testcase.assertItemsEqual(expected_var_names,
@@ -144,8 +145,12 @@ def mock_head(testcase, hidden_units, logits_dimension, expected_logits):
         expected_logits, logits, message='Failed for mode={}. '.format(mode))
     with ops.control_dependencies([assert_logits]):
       if mode == model_fn.ModeKeys.TRAIN:
+        if train_op_fn is not None:
+          train_op = train_op_fn(loss)
+        elif optimizer is not None:
+          train_op = optimizer.minimize(loss, global_step=None)
         return model_fn.EstimatorSpec(
-            mode=mode, loss=loss, train_op=train_op_fn(loss))
+            mode=mode, loss=loss, train_op=train_op)
       elif mode == model_fn.ModeKeys.EVAL:
         return model_fn.EstimatorSpec(mode=mode, loss=array_ops.identity(loss))
       elif mode == model_fn.ModeKeys.PREDICT:
@@ -191,7 +196,7 @@ def mock_optimizer(testcase, hidden_units, expected_loss=None):
     testcase.assertEquals(0, loss.shape.ndims)
     if expected_loss is None:
       if global_step is not None:
-        return state_ops.assign_add(global_step, 1).op
+        return distribute_lib.increment_var(global_step)
       return control_flow_ops.no_op()
     assert_loss = assert_close(
         math_ops.to_float(expected_loss, name='expected'),
@@ -199,12 +204,12 @@ def mock_optimizer(testcase, hidden_units, expected_loss=None):
         name='assert_loss')
     with ops.control_dependencies((assert_loss,)):
       if global_step is not None:
-        return state_ops.assign_add(global_step, 1).op
+        return distribute_lib.increment_var(global_step)
       return control_flow_ops.no_op()
 
   optimizer_mock = test.mock.NonCallableMagicMock(
-      spec=optimizer.Optimizer,
-      wraps=optimizer.Optimizer(use_locking=False, name='my_optimizer'))
+      spec=optimizer_lib.Optimizer,
+      wraps=optimizer_lib.Optimizer(use_locking=False, name='my_optimizer'))
   optimizer_mock.minimize = test.mock.MagicMock(wraps=_minimize)
 
   return optimizer_mock
@@ -1035,6 +1040,8 @@ class BaseDNNClassifierEvaluateTest(object):
         metric_keys.MetricKeys.LOSS: expected_loss,
         metric_keys.MetricKeys.LOSS_MEAN: expected_loss / 2.,
         metric_keys.MetricKeys.ACCURACY: 0.5,
+        metric_keys.MetricKeys.PRECISION: 0.0,
+        metric_keys.MetricKeys.RECALL: 0.0,
         metric_keys.MetricKeys.PREDICTION_MEAN: 0.11105597,
         metric_keys.MetricKeys.LABEL_MEAN: 0.5,
         metric_keys.MetricKeys.ACCURACY_BASELINE: 0.5,
@@ -1042,6 +1049,7 @@ class BaseDNNClassifierEvaluateTest(object):
         # that is what the algorithm returns.
         metric_keys.MetricKeys.AUC: 0.5,
         metric_keys.MetricKeys.AUC_PR: 0.75,
+
         ops.GraphKeys.GLOBAL_STEP: global_step
     }, dnn_classifier.evaluate(input_fn=_input_fn, steps=1))
 
diff --git a/tensorflow/python/estimator/canned/head.py b/tensorflow/python/estimator/canned/head.py
index 8d742a2c6147e86619d4c0aad59b69459384bd4d..bb033d349534e044b2b92d064051ee5fa07f4d62 100644
--- a/tensorflow/python/estimator/canned/head.py
+++ b/tensorflow/python/estimator/canned/head.py
@@ -44,6 +44,7 @@ from tensorflow.python.ops import weights_broadcast_ops
 from tensorflow.python.ops.losses import losses
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.summary import summary
+from tensorflow.python.training import training_util
 
 _DEFAULT_SERVING_KEY = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
 
@@ -85,40 +86,39 @@ class _Head(object):
     ```python
     def _my_dnn_model_fn(features, labels, mode, params, config=None):
       # Optionally your callers can pass head to model_fn as a param.
-      head = tf.contrib.learn.regression_head(...)
-      input = tf.contrib.layers.input_from_feature_columns(features, ...)
-      last_hidden_layer_out = tf.contrib.layers.stack(
-          input, tf.contrib.layers.fully_connected, [1000, 500])
-      logits = tf.contrib.layers.fully_connected(
-          last_hidden_layer_out, head.logits_dimension, activation_fn=None)
-
-      def _train_op_fn(loss):
-        return optimizer.minimize(loss)
+      head = tf.contrib.estimator.regression_head(...)
+      inputs = tf.feature_column.input_layer(features, ...)
+      hidden_layer0 = tf.layers.dense(
+          inputs, units=1000, activation=tf.nn.relu)
+      hidden_layer1 = tf.layers.dense(
+          hidden_layer0, units=500, activation=tf.nn.relu)
+      logits = tf.layers.dense(
+          hidden_layer1, units=head.logits_dimension, activation=None)
 
       return head.create_estimator_spec(
           features=features,
           labels=labels,
           mode=mode,
           logits=logits,
-          train_op_fn=_train_op_fn)
+          optimizer=optimizer)
     ```
 
   There are cases where computing and applying gradients can not be meaningfully
-  captured with train_op_fn we support (for example, with sync optimizer). In
-  such case, you can take the responsibility on your own. Here is a common
-  use case,
+  captured with optimizer or train_op_fn we support (for example, with sync
+  optimizer). In such case, you can take the responsibility on your own. Here is
+  a common use case,
     ```python
     estimator_spec = head.create_estimator_spec(
         features=features,
         labels=labels,
         mode=mode,
         logits=logits,
-        train_op_fn=tf.contrib.learn.no_op_train_fn)
+        train_op_fn=lambda _: tf.no_op())
     if mode == model_fn.ModeKeys.TRAIN:
       optimizer = ...
       sync = tf.train.SyncReplicasOptimizer(opt=optimizer, ...)
-      update_op = tf.contrib.layers.optimize_loss(optimizer=sync,
-                                                  loss=estimator_spec.loss, ...)
+      update_op = sync.minimize(
+          estimator_spec.loss, global_step=tf.get_global_step())
       hooks = [sync.make_session_run_hook(is_chief)]
       ... update train_op and hooks in EstimatorSpec and return
     ```
@@ -172,10 +172,12 @@ class _Head(object):
     """
     raise NotImplementedError('Calling an abstract method.')
 
+  # TODO(b/65403806): By default, collect regularization_losses from
+  # GraphKeys.REGULARIZATION_LOSSES collection.
   @abc.abstractmethod
   def create_estimator_spec(
-      self, features, mode, logits, labels=None, train_op_fn=None,
-      regularization_losses=None):
+      self, features, mode, logits, labels=None, optimizer=None,
+      train_op_fn=None, regularization_losses=None):
     """Returns `EstimatorSpec` that a model_fn can return.
 
     Please note that,
@@ -186,10 +188,14 @@ class _Head(object):
       mode: Estimator's `ModeKeys`.
       logits: logits `Tensor` to be used by the head.
       labels: Labels `Tensor`, or `dict` of same.
+      optimizer: `Optimizer` instance to optimize the loss in TRAIN mode.
+        Namely, sets `train_op = optimizer.minimize(loss, global_step)`, which
+        updates variables and increments `global_step`.
       train_op_fn: Function that takes a scalar loss `Tensor` and returns an op
-        to optimize the model with the loss. This is used in TRAIN mode and
-        must not be None. None is allowed in other modes. If you want to
-        optimize loss yourself you can pass `no_op_train_fn` and then use
+        to optimize the model with the loss in TRAIN mode. Used if `optimizer`
+        is `None`. Exactly one of `train_op_fn` and `optimizer` must be set in
+        TRAIN mode. None is allowed in other modes. If you want to optimize loss
+        yourself you can pass `lambda _: tf.no_op()` and then use
         EstimatorSpec.loss to compute and apply gradients.
       regularization_losses: A list of additional scalar losses to be added to
         the training loss, such as regularization losses.
@@ -694,8 +700,8 @@ class _MultiClassHeadWithSoftmaxCrossEntropyLoss(_Head):
         processed_labels=label_ids)
 
   def create_estimator_spec(
-      self, features, mode, logits, labels=None, train_op_fn=None,
-      regularization_losses=None):
+      self, features, mode, logits, labels=None, optimizer=None,
+      train_op_fn=None, regularization_losses=None):
     """Returns an `EstimatorSpec`.
 
     Args:
@@ -706,8 +712,11 @@ class _MultiClassHeadWithSoftmaxCrossEntropyLoss(_Head):
       labels: Labels integer or string `Tensor` with shape matching `logits`,
         namely `[D0, D1, ... DN, 1]` or `[D0, D1, ... DN]`. `labels` is
         required argument when `mode` equals `TRAIN` or `EVAL`.
+      optimizer: `Optimizer` instance to optimize the loss in TRAIN mode.
+        Namely, sets `train_op = optimizer.minimize(loss, global_step)`, which
+        updates variables and increments `global_step`.
       train_op_fn: Function that takes a scalar loss `Tensor` and returns
-        `train_op`. Required in TRAIN mode.
+        `train_op`. Used if `optimizer` is `None`.
       regularization_losses: A list of additional scalar losses to be added to
         the training loss, such as regularization losses. These losses are
         usually expressed as a batch average, so for best results users need to
@@ -717,7 +726,8 @@ class _MultiClassHeadWithSoftmaxCrossEntropyLoss(_Head):
     Returns:
       `EstimatorSpec`.
     Raises:
-      ValueError: If `train_op_fn` is `None` in TRAIN mode.
+      ValueError: If both `train_op_fn` and `optimizer` are `None` in TRAIN
+        mode, or if both are set.
     """
     with ops.name_scope(self._name, 'head'):
       logits = _check_logits_final_dim(logits, self.logits_dimension)
@@ -780,8 +790,16 @@ class _MultiClassHeadWithSoftmaxCrossEntropyLoss(_Head):
                 regularization_loss=regularization_loss))
 
       # Train.
-      if train_op_fn is None:
-        raise ValueError('train_op_fn cannot be None.')
+      if optimizer is not None:
+        if train_op_fn is not None:
+          raise ValueError('train_op_fn and optimizer cannot both be set.')
+        train_op = optimizer.minimize(
+            regularized_training_loss,
+            global_step=training_util.get_global_step())
+      elif train_op_fn is not None:
+        train_op = train_op_fn(regularized_training_loss)
+      else:
+        raise ValueError('train_op_fn and optimizer cannot both be None.')
       # Only summarize mean_loss for SUM reduction to preserve backwards
       # compatibility. Otherwise skip it to avoid unnecessary computation.
       if self._loss_reduction == losses.Reduction.SUM:
@@ -807,7 +825,7 @@ class _MultiClassHeadWithSoftmaxCrossEntropyLoss(_Head):
         mode=model_fn.ModeKeys.TRAIN,
         predictions=predictions,
         loss=regularized_training_loss,
-        train_op=train_op_fn(regularized_training_loss))
+        train_op=train_op)
 
 
 def _binary_logistic_head_with_sigmoid_cross_entropy_loss(
@@ -869,11 +887,12 @@ def _binary_logistic_head_with_sigmoid_cross_entropy_loss(
   Raises:
     ValueError: If `thresholds` contains a value outside of `(0, 1)`.
     ValueError: If `loss_reduction` is invalid.
+    TypeError: if `label_vocabulary` has invalid type.
   """
   thresholds = tuple(thresholds) if thresholds else tuple()
   if label_vocabulary is not None and not isinstance(label_vocabulary,
                                                      (list, tuple)):
-    raise ValueError(
+    raise TypeError(
         'label_vocabulary should be a list or tuple. Given type: {}'.format(
             type(label_vocabulary)))
 
@@ -940,6 +959,18 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
                   predictions=class_ids,
                   weights=weights,
                   name=keys.ACCURACY),
+          _summary_key(self._name, keys.PRECISION):
+              metrics_lib.precision(
+                  labels=labels,
+                  predictions=class_ids,
+                  weights=weights,
+                  name=keys.PRECISION),
+          _summary_key(self._name, keys.RECALL):
+              metrics_lib.recall(
+                  labels=labels,
+                  predictions=class_ids,
+                  weights=weights,
+                  name=keys.RECALL),
           _summary_key(self._name, keys.PREDICTION_MEAN):
               _predictions_mean(
                   predictions=logistic,
@@ -1027,8 +1058,8 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
         processed_labels=labels)
 
   def create_estimator_spec(
-      self, features, mode, logits, labels=None, train_op_fn=None,
-      regularization_losses=None):
+      self, features, mode, logits, labels=None, optimizer=None,
+      train_op_fn=None, regularization_losses=None):
     """Returns an `EstimatorSpec`.
 
     Args:
@@ -1039,8 +1070,11 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
       labels: Labels integer or string `Tensor` with shape matching `logits`,
         namely `[D0, D1, ... DN, 1]` or `[D0, D1, ... DN]`. `labels` is required
         argument when `mode` equals `TRAIN` or `EVAL`.
+      optimizer: `Optimizer` instance to optimize the loss in TRAIN mode.
+        Namely, sets `train_op = optimizer.minimize(loss, global_step)`, which
+        updates variables and increments `global_step`.
       train_op_fn: Function that takes a scalar loss `Tensor` and returns
-        `train_op`. Required in TRAIN mode.
+        `train_op`. Used if `optimizer` is `None`.
       regularization_losses: A list of additional scalar losses to be added to
         the training loss, such as regularization losses. These losses are
         usually expressed as a batch average, so for best results users need to
@@ -1050,7 +1084,8 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
     Returns:
       `EstimatorSpec`.
     Raises:
-      ValueError: If `train_op_fn` is `None` in TRAIN mode.
+      ValueError: If both `train_op_fn` and `optimizer` are `None` in TRAIN
+        mode, or if both are set.
     """
     # Predict.
     with ops.name_scope(self._name, 'head'):
@@ -1122,8 +1157,16 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
                 regularization_loss=regularization_loss))
 
       # Train.
-      if train_op_fn is None:
-        raise ValueError('train_op_fn can not be None.')
+      if optimizer is not None:
+        if train_op_fn is not None:
+          raise ValueError('train_op_fn and optimizer cannot both be set.')
+        train_op = optimizer.minimize(
+            regularized_training_loss,
+            global_step=training_util.get_global_step())
+      elif train_op_fn is not None:
+        train_op = train_op_fn(regularized_training_loss)
+      else:
+        raise ValueError('train_op_fn and optimizer cannot both be None.')
       # Only summarize mean_loss for SUM reduction to preserve backwards
       # compatibility. Otherwise skip it to avoid unnecessary computation.
       if self._loss_reduction == losses.Reduction.SUM:
@@ -1148,7 +1191,7 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
         mode=model_fn.ModeKeys.TRAIN,
         predictions=predictions,
         loss=regularized_training_loss,
-        train_op=train_op_fn(regularized_training_loss))
+        train_op=train_op)
 
 
 def _regression_head_with_mean_squared_error_loss(
@@ -1277,8 +1320,8 @@ class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
         processed_labels=labels)
 
   def create_estimator_spec(
-      self, features, mode, logits, labels=None, train_op_fn=None,
-      regularization_losses=None):
+      self, features, mode, logits, labels=None, optimizer=None,
+      train_op_fn=None, regularization_losses=None):
     """Returns an `EstimatorSpec`.
 
     Args:
@@ -1290,8 +1333,11 @@ class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
         `[D0, D1, ... DN, logits_dimension]`. When `logits_dimension=1`, shape
         `[D0, D1, ... DN]` is also supported. `labels` is required argument when
         `mode` equals `TRAIN` or `EVAL`.
+      optimizer: `Optimizer` instance to optimize the loss in TRAIN mode.
+        Namely, sets `train_op = optimizer.minimize(loss, global_step)`, which
+        updates variables and increments `global_step`.
       train_op_fn: Function that takes a scalar loss `Tensor` and returns
-        `train_op`. Required in TRAIN mode.
+        `train_op`. Used if `optimizer` is `None`.
       regularization_losses: A list of additional scalar losses to be added to
         the training loss, such as regularization losses. These losses are
         usually expressed as a batch average, so for best results users need to
@@ -1301,7 +1347,8 @@ class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
     Returns:
       `EstimatorSpec`.
     Raises:
-      ValueError: If `train_op_fn` is `None` in TRAIN mode.
+      ValueError: If both `train_op_fn` and `optimizer` are `None` in TRAIN
+        mode, or if both are set.
     """
     # Predict.
     with ops.name_scope(self._name, 'head'):
@@ -1361,8 +1408,16 @@ class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
             eval_metric_ops=eval_metric_ops)
 
       # Train.
-      if train_op_fn is None:
-        raise ValueError('train_op_fn can not be None.')
+      if optimizer is not None:
+        if train_op_fn is not None:
+          raise ValueError('train_op_fn and optimizer cannot both be set.')
+        train_op = optimizer.minimize(
+            regularized_training_loss,
+            global_step=training_util.get_global_step())
+      elif train_op_fn is not None:
+        train_op = train_op_fn(regularized_training_loss)
+      else:
+        raise ValueError('train_op_fn and optimizer cannot both be None.')
       # Only summarize mean_loss for SUM reduction to preserve backwards
       # compatibility. Otherwise skip it to avoid unnecessary computation.
       if self._loss_reduction == losses.Reduction.SUM:
@@ -1387,7 +1442,7 @@ class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
         mode=model_fn.ModeKeys.TRAIN,
         predictions=predictions,
         loss=regularized_training_loss,
-        train_op=train_op_fn(regularized_training_loss))
+        train_op=train_op)
 
 
 def _assert_range(labels, n_classes, message=None):
diff --git a/tensorflow/python/estimator/canned/head_test.py b/tensorflow/python/estimator/canned/head_test.py
index b40758f8fef13bad939b3fc2f296e848ade46f63..fe6ee07529bc0314618a7cc85926dbb39660a352 100644
--- a/tensorflow/python/estimator/canned/head_test.py
+++ b/tensorflow/python/estimator/canned/head_test.py
@@ -842,6 +842,41 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
           metric_keys.MetricKeys.LOSS_MEAN: expected_loss / 2,
       }, summary_str, tol)
 
+  def test_train_with_optimizer(self):
+    n_classes = 3
+    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(n_classes)
+
+    logits = np.array(((10, 0, 0), (0, 10, 0),), dtype=np.float32)
+    labels = np.array(((1,), (1,)), dtype=np.int64)
+    features = {'x': np.array(((42,),), dtype=np.int32)}
+    expected_train_result = 'my_train_op'
+
+    class _Optimizer(object):
+
+      def minimize(self, loss, global_step):
+        del global_step
+        return string_ops.string_join(
+            [constant_op.constant(expected_train_result),
+             string_ops.as_string(loss, precision=2)])
+
+    # loss = sum(cross_entropy(labels, logits)) = sum(10, 0) = 10.
+    expected_loss = 10.
+    spec = head.create_estimator_spec(
+        features=features,
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels,
+        optimizer=_Optimizer())
+
+    tol = 1e-2
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      loss, train_result = sess.run((spec.loss, spec.train_op))
+      self.assertAllClose(expected_loss, loss, rtol=tol, atol=tol)
+      self.assertEqual(
+          six.b('{0:s}{1:.2f}'.format(expected_train_result, expected_loss)),
+          train_result)
+
   def test_train_summaries_with_head_name(self):
     n_classes = 3
     head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
@@ -1559,6 +1594,8 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
         # loss_mean = loss/2 = 41./2 = 20.5
         keys.LOSS_MEAN: 20.5,
         keys.ACCURACY: 1./2,
+        keys.PRECISION: 1.,
+        keys.RECALL: 1./2,
         keys.PREDICTION_MEAN: 1./2,
         keys.LABEL_MEAN: 2./2,
         keys.ACCURACY_BASELINE: 2./2,
@@ -1602,11 +1639,13 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
     expected_metric_keys = [
         '{}/some_binary_head'.format(metric_keys.MetricKeys.LOSS_MEAN),
         '{}/some_binary_head'.format(metric_keys.MetricKeys.ACCURACY),
+        '{}/some_binary_head'.format(metric_keys.MetricKeys.PRECISION),
+        '{}/some_binary_head'.format(metric_keys.MetricKeys.RECALL),
         '{}/some_binary_head'.format(metric_keys.MetricKeys.PREDICTION_MEAN),
         '{}/some_binary_head'.format(metric_keys.MetricKeys.LABEL_MEAN),
         '{}/some_binary_head'.format(metric_keys.MetricKeys.ACCURACY_BASELINE),
         '{}/some_binary_head'.format(metric_keys.MetricKeys.AUC),
-        '{}/some_binary_head'.format(metric_keys.MetricKeys.AUC_PR)
+        '{}/some_binary_head'.format(metric_keys.MetricKeys.AUC_PR),
     ]
     self.assertItemsEqual(expected_metric_keys, spec.eval_metric_ops.keys())
 
@@ -1637,6 +1676,8 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
         keys.LOSS_MEAN: expected_unregularized_loss,
         keys.LOSS_REGULARIZATION: expected_regularization_loss,
         keys.ACCURACY: 1./2,
+        keys.PRECISION: 1.,
+        keys.RECALL: 1./2,
         keys.PREDICTION_MEAN: 1./2,
         keys.LABEL_MEAN: 2./2,
         keys.ACCURACY_BASELINE: 2./2,
@@ -1742,6 +1783,8 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
     expected_metrics = {
         keys.LOSS_MEAN: 1.62652338 / 2.,
         keys.ACCURACY: 1./2,
+        keys.PRECISION: 1.,
+        keys.RECALL: .5,
         keys.PREDICTION_MEAN: 1./2,
         keys.LABEL_MEAN: 2./2,
         keys.ACCURACY_BASELINE: 2./2,
@@ -1934,6 +1977,39 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
           metric_keys.MetricKeys.LOSS_MEAN: 20.5,
       }, summary_str)
 
+  def test_train_with_optimizer(self):
+    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss()
+
+    logits = np.array(((45,), (-41,),), dtype=np.float32)
+    labels = np.array(((1,), (1,),), dtype=np.float64)
+    expected_train_result = b'my_train_op'
+    features = {'x': np.array(((42,),), dtype=np.float32)}
+    # loss = sum(cross_entropy(labels, logits)) = sum(0, 41) = 41
+    expected_loss = 41.
+
+    class _Optimizer(object):
+
+      def minimize(self, loss, global_step):
+        del global_step
+        with ops.control_dependencies((check_ops.assert_equal(
+            math_ops.to_float(expected_loss), math_ops.to_float(loss),
+            name='assert_loss'),)):
+          return constant_op.constant(expected_train_result)
+
+    # Create estimator spec.
+    spec = head.create_estimator_spec(
+        features=features,
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels,
+        optimizer=_Optimizer())
+
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      loss, train_result = sess.run((spec.loss, spec.train_op))
+      self.assertAllClose(expected_loss, loss)
+      self.assertEqual(expected_train_result, train_result)
+
   def test_train_summaries_with_head_name(self):
     head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
         name='some_binary_head')
@@ -2187,6 +2263,8 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
         keys.LOSS_MEAN: 26.9615384615,
         # accuracy = (1*1 + .1*0 + 1.5*0)/(1 + .1 + 1.5) = 1/2.6 = .38461538461
         keys.ACCURACY: .38461538461,
+        keys.PRECISION: 1./2.5,
+        keys.RECALL: 1./1.1,
         # prediction_mean = (1*1 + .1*0 + 1.5*1)/(1 + .1 + 1.5) = 2.5/2.6
         #                 = .96153846153
         keys.PREDICTION_MEAN: .96153846153,
@@ -2486,6 +2564,8 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
     expected_metrics = {
         keys.LOSS_MEAN: expected_loss / np.sum(weights),
         keys.ACCURACY: (1.*0. + 1.5*1. + 2.*1. + 2.5*0.) / np.sum(weights),
+        keys.PRECISION: 2.0/3.0,
+        keys.RECALL: 2.0/4.5,
         keys.PREDICTION_MEAN: (1.*1 + 1.5*0 + 2.*1 + 2.5*0) / np.sum(weights),
         keys.LABEL_MEAN: (1.*0 + 1.5*0 + 2.*1 + 2.5*1) / np.sum(weights),
         keys.ACCURACY_BASELINE: (1.*0 + 1.5*0 + 2.*1 + 2.5*1) / np.sum(weights),
@@ -3064,6 +3144,40 @@ class RegressionHeadWithMeanSquaredErrorLossTest(test.TestCase):
           metric_keys.MetricKeys.LOSS_MEAN: 6.5,
       }, summary_str)
 
+  def test_train_with_optimizer(self):
+    head = head_lib._regression_head_with_mean_squared_error_loss()
+    self.assertEqual(1, head.logits_dimension)
+
+    # Create estimator spec.
+    logits = np.array(((45,), (41,),), dtype=np.float32)
+    labels = np.array(((43.,), (44.,),), dtype=np.float64)
+    expected_train_result = b'my_train_op'
+    features = {'x': np.array(((42.,),), dtype=np.float32)}
+    # loss = (43-45)^2 + (44-41)^2 = 4 + 9 = 13
+    expected_loss = 13
+
+    class _Optimizer(object):
+
+      def minimize(self, loss, global_step):
+        del global_step
+        with ops.control_dependencies((check_ops.assert_equal(
+            math_ops.to_float(expected_loss), math_ops.to_float(loss),
+            name='assert_loss'),)):
+          return constant_op.constant(expected_train_result)
+
+    spec = head.create_estimator_spec(
+        features=features,
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels,
+        optimizer=_Optimizer())
+
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      loss, train_result = sess.run((spec.loss, spec.train_op))
+      self.assertAllClose(expected_loss, loss)
+      self.assertEqual(expected_train_result, train_result)
+
   def test_train_summaries_with_head_name(self):
     head = head_lib._regression_head_with_mean_squared_error_loss(
         name='some_regression_head')
diff --git a/tensorflow/python/estimator/canned/linear.py b/tensorflow/python/estimator/canned/linear.py
index a2f24ef27044680fe93b176b5207593165d0d109..e7ec4179917a88703444f8aa835ed0359ff58a46 100644
--- a/tensorflow/python/estimator/canned/linear.py
+++ b/tensorflow/python/estimator/canned/linear.py
@@ -33,7 +33,6 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops.losses import losses
 from tensorflow.python.summary import summary
 from tensorflow.python.training import ftrl
-from tensorflow.python.training import training_util
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -157,17 +156,11 @@ def _linear_model_fn(features, labels, mode, head, feature_columns, optimizer,
         units=head.logits_dimension, feature_columns=feature_columns)
     logits = logit_fn(features=features)
 
-    def _train_op_fn(loss):
-      """Returns the op to optimize the loss."""
-      return optimizer.minimize(
-          loss,
-          global_step=training_util.get_global_step())
-
     return head.create_estimator_spec(
         features=features,
         mode=mode,
         labels=labels,
-        train_op_fn=_train_op_fn,
+        optimizer=optimizer,
         logits=logits)
 
 
diff --git a/tensorflow/python/estimator/canned/linear_testing_utils.py b/tensorflow/python/estimator/canned/linear_testing_utils.py
index 8e506a763133c3f8c15f49f253f84de5470088c5..0e6436b42143f4b136165d47c41e143dacb4d476 100644
--- a/tensorflow/python/estimator/canned/linear_testing_utils.py
+++ b/tensorflow/python/estimator/canned/linear_testing_utils.py
@@ -47,13 +47,13 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.ops import partitioned_variables
-from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.summary.writer import writer_cache
 from tensorflow.python.training import checkpoint_utils
+from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.training import gradient_descent
 from tensorflow.python.training import input as input_lib
 from tensorflow.python.training import optimizer as optimizer_lib
@@ -682,7 +682,7 @@ class BaseLinearRegressorTrainingTest(object):
       self.assertEquals(0, loss.shape.ndims)
       if expected_loss is None:
         if global_step is not None:
-          return state_ops.assign_add(global_step, 1).op
+          return distribute_lib.increment_var(global_step)
         return control_flow_ops.no_op()
       assert_loss = assert_close(
           math_ops.to_float(expected_loss, name='expected'),
@@ -690,7 +690,7 @@ class BaseLinearRegressorTrainingTest(object):
           name='assert_loss')
       with ops.control_dependencies((assert_loss,)):
         if global_step is not None:
-          return state_ops.assign_add(global_step, 1).op
+          return distribute_lib.increment_var(global_step)
         return control_flow_ops.no_op()
 
     mock_optimizer = test.mock.NonCallableMock(
@@ -905,13 +905,13 @@ class BaseLinearClassifierTrainingTest(object):
       # Verify loss. We can't check the value directly, so we add an assert op.
       self.assertEquals(0, loss.shape.ndims)
       if expected_loss is None:
-        return state_ops.assign_add(global_step, 1).op
+        return distribute_lib.increment_var(global_step)
       assert_loss = assert_close(
           math_ops.to_float(expected_loss, name='expected'),
           loss,
           name='assert_loss')
       with ops.control_dependencies((assert_loss,)):
-        return state_ops.assign_add(global_step, 1).op
+        return distribute_lib.increment_var(global_step)
 
     mock_optimizer = test.mock.NonCallableMock(
         spec=optimizer_lib.Optimizer,
@@ -1337,6 +1337,8 @@ class BaseLinearClassifierEvaluationTest(object):
           ops.GraphKeys.GLOBAL_STEP: 100,
           metric_keys.MetricKeys.LOSS_MEAN: 41.,
           metric_keys.MetricKeys.ACCURACY: 0.,
+          metric_keys.MetricKeys.PRECISION: 0.,
+          metric_keys.MetricKeys.RECALL: 0.,
           metric_keys.MetricKeys.PREDICTION_MEAN: 0.,
           metric_keys.MetricKeys.LABEL_MEAN: 1.,
           metric_keys.MetricKeys.ACCURACY_BASELINE: 1,
@@ -1406,6 +1408,8 @@ class BaseLinearClassifierEvaluationTest(object):
           ops.GraphKeys.GLOBAL_STEP: 100,
           metric_keys.MetricKeys.LOSS_MEAN: expected_loss / 2,
           metric_keys.MetricKeys.ACCURACY: 0.,
+          metric_keys.MetricKeys.PRECISION: 0.,
+          metric_keys.MetricKeys.RECALL: 0.,
           metric_keys.MetricKeys.PREDICTION_MEAN: 0.5,
           metric_keys.MetricKeys.LABEL_MEAN: 0.5,
           metric_keys.MetricKeys.ACCURACY_BASELINE: 0.5,
@@ -1487,6 +1491,8 @@ class BaseLinearClassifierEvaluationTest(object):
           ops.GraphKeys.GLOBAL_STEP: 100,
           metric_keys.MetricKeys.LOSS_MEAN: loss_mean,
           metric_keys.MetricKeys.ACCURACY: 0.,
+          metric_keys.MetricKeys.PRECISION: 0.,
+          metric_keys.MetricKeys.RECALL: 0.,
           metric_keys.MetricKeys.PREDICTION_MEAN: predictions_mean,
           metric_keys.MetricKeys.LABEL_MEAN: label_mean,
           metric_keys.MetricKeys.ACCURACY_BASELINE: (
diff --git a/tensorflow/python/estimator/canned/metric_keys.py b/tensorflow/python/estimator/canned/metric_keys.py
index 44eb680939203fea67e3391326a6f1013f022ad5..f374d3154982e3b7cdc637e9e3606b3a2947cbf3 100644
--- a/tensorflow/python/estimator/canned/metric_keys.py
+++ b/tensorflow/python/estimator/canned/metric_keys.py
@@ -28,6 +28,8 @@ class MetricKeys(object):
   LOSS_REGULARIZATION = 'regularization_loss'
 
   ACCURACY = 'accuracy'
+  PRECISION = 'precision'
+  RECALL = 'recall'
   # This is the best the model could do by always predicting one class.
   # Should be < ACCURACY in a trained model.
   ACCURACY_BASELINE = 'accuracy_baseline'
diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index 5245a050a1ed87af157dbd2008b52122798f9491..4d3eff71ad2167315614c41b70f1127d51b12de3 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -41,8 +41,11 @@ from tensorflow.python.estimator.export.export import get_temp_export_dir
 from tensorflow.python.estimator.export.export import get_timestamped_export_dir
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import metrics as metrics_lib
+from tensorflow.python.ops import resources
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.saved_model import builder as saved_model_builder
@@ -50,6 +53,7 @@ from tensorflow.python.saved_model import tag_constants
 from tensorflow.python.summary import summary
 from tensorflow.python.summary.writer import writer_cache
 from tensorflow.python.training import device_setter
+from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.training import evaluation
 from tensorflow.python.training import monitored_session
 from tensorflow.python.training import saver
@@ -183,6 +187,9 @@ class Estimator(object):
             config)
       self._config = config
 
+    # The distribute field contains an instance of DistributionStrategy.
+    self._distribution = self._config.train_distribute
+
     # Model directory.
     model_dir = compat_internal.path_to_str(model_dir)
     if (model_dir is not None) and (self._config.model_dir is not None):
@@ -682,11 +689,26 @@ class Estimator(object):
   def _get_features_and_labels_from_input_fn(self, input_fn, mode):
     """Extracts the `features` and labels from return values of `input_fn`."""
     result = self._call_input_fn(input_fn, mode)
+    # TODO(anjalisridhar): What about the default DistributionStrategy? Perhaps
+    # using any input is alright in that case. There is also a
+    # has_dataset_or_queue_runner function that we may want to extend and use.
+    if (self._distribution is not None and
+        not isinstance(result, dataset_ops.Dataset) and
+        mode == model_fn_lib.ModeKeys.TRAIN):
+      raise ValueError('input_fn() must return a tf.data.Dataset when using a '
+                       'DistributionStrategy.')
     input_hooks = []
     if isinstance(result, dataset_ops.Dataset):
-      iterator = result.make_initializable_iterator()
-      input_hooks.append(_DatasetInitializerHook(iterator))
-      result = iterator.get_next()
+      if self._distribution is not None and mode == model_fn_lib.ModeKeys.TRAIN:
+        # TODO(josh11b): This is currently using a one-shot iterator, we
+        # will update this to an initializeable iterator once the
+        # necessory support for creating an initializable iterator is
+        # available.
+        result = self._distribution.distribute_dataset(result).get_next()
+      else:
+        iterator = result.make_initializable_iterator()
+        input_hooks.append(_DatasetInitializerHook(iterator))
+        result = iterator.get_next()
     if isinstance(result, (list, tuple)):
       if len(result) != 2:
         raise ValueError(
@@ -815,6 +837,12 @@ class Estimator(object):
     return model_fn_results
 
   def _train_model(self, input_fn, hooks, saving_listeners):
+    if self._distribution:
+      return self._train_model_distributed(input_fn, hooks, saving_listeners)
+    else:
+      return self._train_model_default(input_fn, hooks, saving_listeners)
+
+  def _train_model_default(self, input_fn, hooks, saving_listeners):
     worker_hooks = []
     with ops.Graph().as_default() as g, g.device(self._device_fn):
       random_seed.set_random_seed(self._config.tf_random_seed)
@@ -826,86 +854,210 @@ class Estimator(object):
       worker_hooks.extend(input_hooks)
       estimator_spec = self._call_model_fn(
           features, labels, model_fn_lib.ModeKeys.TRAIN, self.config)
+      return self._train_with_estimator_spec(estimator_spec, worker_hooks,
+                                             hooks, global_step_tensor,
+                                             saving_listeners)
 
-      if self._warm_start_settings:
-        logging.info('Warm-starting with WarmStartSettings: %s' %
-                     (self._warm_start_settings,))
-        # pylint: disable=protected-access
-        warm_starting_util.warm_start(*self._warm_start_settings)
-        # pylint: enable=protected-access
-      # Check if the user created a loss summary, and add one if they didn't.
-      # We assume here that the summary is called 'loss'. If it is not, we will
-      # make another one with the name 'loss' to ensure it shows up in the right
-      # graph in TensorBoard.
-      if not any([x.op.name == 'loss'
-                  for x in ops.get_collection(ops.GraphKeys.SUMMARIES)]):
-        summary.scalar('loss', estimator_spec.loss)
-      ops.add_to_collection(ops.GraphKeys.LOSSES, estimator_spec.loss)
-      worker_hooks.extend(hooks)
-      worker_hooks.extend([
-          training.NanTensorHook(estimator_spec.loss),
-          training.LoggingTensorHook(
-              {
-                  'loss': estimator_spec.loss,
-                  'step': global_step_tensor
-              },
-              every_n_iter=100)
-      ])
-      worker_hooks.extend(estimator_spec.training_hooks)
-
-      if not (estimator_spec.scaffold.saver or
-              ops.get_collection(ops.GraphKeys.SAVERS)):
-        ops.add_to_collection(
-            ops.GraphKeys.SAVERS,
-            training.Saver(
-                sharded=True,
-                max_to_keep=self._config.keep_checkpoint_max,
-                keep_checkpoint_every_n_hours=(
-                    self._config.keep_checkpoint_every_n_hours),
-                defer_build=True,
-                save_relative_paths=True))
-
-      chief_hooks = []
-      all_hooks = worker_hooks + list(estimator_spec.training_chief_hooks)
-      saver_hooks = [
-          h for h in all_hooks if isinstance(h, training.CheckpointSaverHook)]
-      if (self._config.save_checkpoints_secs or
-          self._config.save_checkpoints_steps):
-        if not saver_hooks:
-          chief_hooks = [
-              training.CheckpointSaverHook(
-                  self._model_dir,
-                  save_secs=self._config.save_checkpoints_secs,
-                  save_steps=self._config.save_checkpoints_steps,
-                  scaffold=estimator_spec.scaffold)
-          ]
-          saver_hooks = [chief_hooks[0]]
-      if saving_listeners:
-        if not saver_hooks:
-          raise ValueError(
-              'There should be a CheckpointSaverHook to use saving_listeners. '
-              'Please set one of the RunConfig.save_checkpoints_steps or '
-              'RunConfig.save_checkpoints_secs.')
+  def _train_model_distributed(self, input_fn, hooks, saving_listeners):
+    self._distribution.configure(self._session_config)
+    worker_hooks = []
+    with ops.Graph().as_default() as g:
+      with self._distribution.scope():
+        random_seed.set_random_seed(self._config.tf_random_seed)
+        features, labels, input_hooks = (
+            self._get_features_and_labels_from_input_fn(
+                input_fn, model_fn_lib.ModeKeys.TRAIN))
+        worker_hooks.extend(input_hooks)
+        global_step_tensor = self._create_and_assert_global_step(g)
+        # The default destination for the global_step_tensor fetch call is the
+        # CPU.
+        global_step_read_tensor = self._distribution.fetch(global_step_tensor)
+        # we want to add to the global collection in the main thread not the
+        # tower threads.
+        ops.add_to_collection(training_util.GLOBAL_STEP_READ_KEY,
+                              global_step_read_tensor)
+        grouped_estimator_spec = self._distribution.call_for_each_tower(
+            self._call_model_fn,
+            features,
+            labels,  # although this will be None it seems
+            model_fn_lib.ModeKeys.TRAIN,
+            self.config)
+
+        # TODO(anjalisridhar): Figure out how to resolve the folowing scaffold
+        # parameters: init_feed_dict, init_fn.
+        scaffold_list = self._distribution.unwrap(
+            grouped_estimator_spec.scaffold)
+        init_feed_dict = [
+            s.init_feed_dict
+            for s in scaffold_list
+            if s.init_feed_dict is not None
+        ]
+        if init_feed_dict:
+          init_feed_dict = self._distribution.group(init_feed_dict)
         else:
-          # It is expected to have one CheckpointSaverHook. If multiple, we pick
-          # up the first one to add listener.
-          saver_hooks[0]._listeners.extend(saving_listeners)  # pylint: disable=protected-access
-      with training.MonitoredTrainingSession(
-          master=self._config.master,
-          is_chief=self._config.is_chief,
-          checkpoint_dir=self._model_dir,
-          scaffold=estimator_spec.scaffold,
-          hooks=worker_hooks,
-          chief_only_hooks=(
-              tuple(chief_hooks) + tuple(estimator_spec.training_chief_hooks)),
-          save_checkpoint_secs=0,  # Saving is handled by a hook.
-          save_summaries_steps=self._config.save_summary_steps,
-          config=self._session_config,
-          log_step_count_steps=self._config.log_step_count_steps) as mon_sess:
-        loss = None
-        while not mon_sess.should_stop():
-          _, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
-      return loss
+          init_feed_dict = None
+
+        init_fn = [s.init_fn for s in scaffold_list if s.init_fn is not None]
+        if init_fn:
+          init_fn = self._distribution.group(init_fn)
+        else:
+          init_fn = None
+
+        init_op = [s.init_op for s in scaffold_list if s.init_op is not None]
+        if init_op:
+          init_op = self._distribution.group(init_op)
+        else:
+          init_op = None
+
+        ready_op = self._distribution.call_for_each_tower(
+            create_per_tower_ready_op, grouped_estimator_spec.scaffold)
+        if ready_op is not None:
+          ready_op = self._distribution.group(ready_op)
+        else:
+          ready_op = None
+
+        ready_for_local_init_op = self._distribution.call_for_each_tower(
+            create_per_tower_ready_for_local_init_op,
+            grouped_estimator_spec.scaffold)
+        if ready_for_local_init_op is not None:
+          ready_for_local_init_op = self._distribution.group(
+              ready_for_local_init_op)
+        else:
+          ready_for_local_init_op = None
+
+        local_init_op = [
+            s.local_init_op
+            for s in scaffold_list
+            if s.local_init_op is not None
+        ]
+        if local_init_op:
+          local_init_op = self._distribution.group(local_init_op)
+        else:
+          local_init_op = None
+
+        summary_op = [
+            s.summary_op for s in scaffold_list if s.summary_op is not None
+        ]
+        if summary_op:
+          summary_op = self._distribution.group(summary_op)
+        else:
+          summary_op = None
+
+        scaffold = monitored_session.Scaffold(
+            init_op=init_op,
+            ready_op=ready_op,
+            ready_for_local_init_op=ready_for_local_init_op,
+            local_init_op=local_init_op,
+            summary_op=summary_op,
+            init_feed_dict=init_feed_dict,
+            init_fn=init_fn)
+
+        def get_hooks_from_the_first_device(per_device_hooks):
+          hooks_list = self._distribution.unwrap(per_device_hooks)
+          assert hooks_list
+          return hooks_list[0]
+
+        training_hooks = get_hooks_from_the_first_device(
+            grouped_estimator_spec.training_hooks)
+        training_chief_hooks = get_hooks_from_the_first_device(
+            grouped_estimator_spec.training_chief_hooks)
+
+        estimator_spec = model_fn_lib.EstimatorSpec(
+            mode=grouped_estimator_spec.mode,
+            loss=self._distribution.unwrap(
+                self._distribution.reduce(distribute_lib.get_loss_reduction(),
+                                          grouped_estimator_spec.loss,
+                                          destinations='/device:CPU:0'))[0],
+            train_op=self._distribution.group(grouped_estimator_spec.train_op),
+            training_hooks=training_hooks,
+            training_chief_hooks=training_chief_hooks,
+            scaffold=scaffold)
+        return self._train_with_estimator_spec(estimator_spec, worker_hooks,
+                                               hooks, global_step_read_tensor,
+                                               saving_listeners)
+
+  def _train_with_estimator_spec(self, estimator_spec, worker_hooks, hooks,
+                                 global_step_tensor, saving_listeners):
+    """Train a model with the given Estimator Spec."""
+    if self._warm_start_settings:
+      logging.info('Warm-starting with WarmStartSettings: %s' %
+                   (self._warm_start_settings,))
+      # pylint: disable=protected-access
+      warm_starting_util.warm_start(*self._warm_start_settings)
+      # pylint: enable=protected-access
+    # Check if the user created a loss summary, and add one if they didn't.
+    # We assume here that the summary is called 'loss'. If it is not, we will
+    # make another one with the name 'loss' to ensure it shows up in the right
+    # graph in TensorBoard.
+    if not any([x.op.name == 'loss'
+                for x in ops.get_collection(ops.GraphKeys.SUMMARIES)]):
+      summary.scalar('loss', estimator_spec.loss)
+    ops.add_to_collection(ops.GraphKeys.LOSSES, estimator_spec.loss)
+    worker_hooks.extend(hooks)
+    worker_hooks.extend([
+        training.NanTensorHook(estimator_spec.loss),
+        training.LoggingTensorHook(
+            {
+                'loss': estimator_spec.loss,
+                'step': global_step_tensor
+            },
+            every_n_iter=self._config.log_step_count_steps)
+    ])
+    worker_hooks.extend(estimator_spec.training_hooks)
+
+    if not (estimator_spec.scaffold.saver or
+            ops.get_collection(ops.GraphKeys.SAVERS)):
+      ops.add_to_collection(
+          ops.GraphKeys.SAVERS,
+          training.Saver(
+              sharded=True,
+              max_to_keep=self._config.keep_checkpoint_max,
+              keep_checkpoint_every_n_hours=(
+                  self._config.keep_checkpoint_every_n_hours),
+              defer_build=True,
+              save_relative_paths=True))
+
+    chief_hooks = []
+    all_hooks = worker_hooks + list(estimator_spec.training_chief_hooks)
+    saver_hooks = [
+        h for h in all_hooks if isinstance(h, training.CheckpointSaverHook)]
+    if (self._config.save_checkpoints_secs or
+        self._config.save_checkpoints_steps):
+      if not saver_hooks:
+        chief_hooks = [
+            training.CheckpointSaverHook(
+                self._model_dir,
+                save_secs=self._config.save_checkpoints_secs,
+                save_steps=self._config.save_checkpoints_steps,
+                scaffold=estimator_spec.scaffold)
+        ]
+        saver_hooks = [chief_hooks[0]]
+    if saving_listeners:
+      if not saver_hooks:
+        raise ValueError(
+            'There should be a CheckpointSaverHook to use saving_listeners. '
+            'Please set one of the RunConfig.save_checkpoints_steps or '
+            'RunConfig.save_checkpoints_secs.')
+      else:
+        # It is expected to have one CheckpointSaverHook. If multiple, we pick
+        # up the first one to add listener.
+        saver_hooks[0]._listeners.extend(saving_listeners)  # pylint: disable=protected-access
+    with training.MonitoredTrainingSession(
+        master=self._config.master,
+        is_chief=self._config.is_chief,
+        checkpoint_dir=self._model_dir,
+        scaffold=estimator_spec.scaffold,
+        hooks=worker_hooks,
+        chief_only_hooks=(
+            tuple(chief_hooks) + tuple(estimator_spec.training_chief_hooks)),
+        save_checkpoint_secs=0,  # Saving is handled by a hook.
+        save_summaries_steps=self._config.save_summary_steps,
+        config=self._session_config,
+        log_step_count_steps=self._config.log_step_count_steps) as mon_sess:
+      loss = None
+      while not mon_sess.should_stop():
+        _, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
+    return loss
 
   def _evaluate_model(self,
                       input_fn,
@@ -972,6 +1124,35 @@ class Estimator(object):
     return eval_results
 
 
+def create_per_tower_ready_op(scaffold):
+  """Create a Scaffold.ready_op inside a tower."""
+  if scaffold.ready_op:
+    return scaffold.ready_op
+
+  def default_ready_op():
+    return array_ops.concat([
+        variables.report_uninitialized_variables(),
+        resources.report_uninitialized_resources()
+    ], 0)
+
+  return monitored_session.Scaffold.get_or_default(
+      'ready_op', ops.GraphKeys.READY_OP, default_ready_op)
+
+
+def create_per_tower_ready_for_local_init_op(scaffold):
+  """Create a Scaffold.ready_for_local_init_op inside a tower."""
+  if scaffold.ready_for_local_init_op:
+    return scaffold.ready_for_local_init_op
+
+  def default_ready_for_local_init_op():
+    return variables.report_uninitialized_variables(
+        variables.global_variables())
+
+  return monitored_session.Scaffold.get_or_default(
+      'ready_for_local_init_op', ops.GraphKeys.READY_FOR_LOCAL_INIT_OP,
+      default_ready_for_local_init_op)
+
+
 def _check_checkpoint_available(model_dir):
   latest_path = saver.latest_checkpoint(model_dir)
   if not latest_path:
diff --git a/tensorflow/python/estimator/estimator_lib.py b/tensorflow/python/estimator/estimator_lib.py
index be8930b3cbcd89dbb31dffde0a7a5ecfb64fcd8b..60c59cbc183ccde936384e25da3d8bf44316f712 100644
--- a/tensorflow/python/estimator/estimator_lib.py
+++ b/tensorflow/python/estimator/estimator_lib.py
@@ -21,6 +21,8 @@ from __future__ import print_function
 # pylint: disable=unused-import,line-too-long,wildcard-import
 from tensorflow.python.estimator.canned.baseline import BaselineClassifier
 from tensorflow.python.estimator.canned.baseline import BaselineRegressor
+from tensorflow.python.estimator.canned.boosted_trees import BoostedTreesClassifier
+from tensorflow.python.estimator.canned.boosted_trees import BoostedTreesRegressor
 from tensorflow.python.estimator.canned.dnn import DNNClassifier
 from tensorflow.python.estimator.canned.dnn import DNNRegressor
 from tensorflow.python.estimator.canned.dnn_linear_combined import DNNLinearCombinedClassifier
@@ -52,6 +54,8 @@ _allowed_symbols = [
     # Canned Estimators
     'BaselineClassifier',
     'BaselineRegressor',
+    'BoostedTreesClassifier',
+    'BoostedTreesRegressor',
     'DNNClassifier',
     'DNNRegressor',
     'DNNLinearCombinedClassifier',
diff --git a/tensorflow/python/estimator/run_config.py b/tensorflow/python/estimator/run_config.py
index 62f035bce558f57c5fd39d60b44cf8eb0130ce38..f62c9cece6a4d370532ca3077d679a54f38918f1 100644
--- a/tensorflow/python/estimator/run_config.py
+++ b/tensorflow/python/estimator/run_config.py
@@ -43,7 +43,8 @@ _DEFAULT_REPLACEABLE_LIST = [
     'session_config',
     'keep_checkpoint_max',
     'keep_checkpoint_every_n_hours',
-    'log_step_count_steps'
+    'log_step_count_steps',
+    'train_distribute'
 ]
 
 _SAVE_CKPT_ERR = (
@@ -300,7 +301,8 @@ class RunConfig(object):
                session_config=None,
                keep_checkpoint_max=5,
                keep_checkpoint_every_n_hours=10000,
-               log_step_count_steps=100):
+               log_step_count_steps=100,
+               train_distribute=None):
     """Constructs a RunConfig.
 
     All distributed training related properties `cluster_spec`, `is_chief`,
@@ -423,8 +425,11 @@ class RunConfig(object):
         to be saved. The default value of 10,000 hours effectively disables
         the feature.
       log_step_count_steps: The frequency, in number of global steps, that the
-        global step/sec will be logged during training.
-
+        global step/sec and the loss will be logged during training.
+      train_distribute: an optional instance of
+        `tf.contrib.distribute.DistributionStrategy`. If specified,
+        then Estimator will distribute the user's model during training,
+        according to the policy specified by that strategy.
 
     Raises:
       ValueError: If both `save_checkpoints_steps` and `save_checkpoints_secs`
@@ -460,7 +465,8 @@ class RunConfig(object):
         session_config=session_config,
         keep_checkpoint_max=keep_checkpoint_max,
         keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
-        log_step_count_steps=log_step_count_steps)
+        log_step_count_steps=log_step_count_steps,
+        train_distribute=train_distribute)
 
     self._init_distributed_setting_from_environment_var(tf_config)
 
@@ -671,12 +677,18 @@ class RunConfig(object):
     """Returns the platform defined (in TF_CONFIG) service dict."""
     return self._service
 
+  @property
+  def train_distribute(self):
+    """Returns the optional `tf.contrib.distribute.DistributionStrategy` object.
+    """
+    return self._train_distribute
+
   def replace(self, **kwargs):
     """Returns a new instance of `RunConfig` replacing specified properties.
 
     Only the properties in the following list are allowed to be replaced:
 
-      - `model_dir`.
+      - `model_dir`,
       - `tf_random_seed`,
       - `save_summary_steps`,
       - `save_checkpoints_steps`,
@@ -685,6 +697,7 @@ class RunConfig(object):
       - `keep_checkpoint_max`,
       - `keep_checkpoint_every_n_hours`,
       - `log_step_count_steps`,
+      - `train_distribute`.
 
     In addition, either `save_checkpoints_steps` or `save_checkpoints_secs`
     can be set (should not be both).
diff --git a/tensorflow/python/feature_column/BUILD b/tensorflow/python/feature_column/BUILD
index 238a90b67d9d0039c25a6f3800aad25a2db9e36f..295d4ca094cc8cb85c0f1f7fd47c20b910c270df 100644
--- a/tensorflow/python/feature_column/BUILD
+++ b/tensorflow/python/feature_column/BUILD
@@ -6,18 +6,6 @@ licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "py_test")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 py_library(
     name = "feature_column_py",
     srcs = ["feature_column_lib.py"],
@@ -45,6 +33,7 @@ py_library(
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python:platform",
+        "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:sparse_ops",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:string_ops",
@@ -54,6 +43,7 @@ py_library(
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
+        "//tensorflow/python/keras",
         "@six_archive//:six",
     ],
 )
diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index 381153c66a561e0ef9c494a50e0de3b85a0b2879..3a315e5c2ea0d9607b5aa52715364d6bdf152e1c 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -139,6 +139,8 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras._impl.keras.engine import training
+from tensorflow.python.layers import base
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
@@ -148,6 +150,7 @@ from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import parsing_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.ops import template
@@ -451,13 +454,162 @@ def linear_model(features,
     if cols_to_vars is not None:
       # Add the bias to cols_to_vars as well, converting the Variable or
       # PartitionedVariable to a list of Variable's.
-      if isinstance(bias, variables.Variable):
+      if (isinstance(bias, variables.Variable) or
+          resource_variable_ops.is_resource_variable(bias)):
         cols_to_vars['bias'] = [bias]
       else:  # Must be a PartitionedVariable.
         cols_to_vars['bias'] = list(bias)
     return predictions
 
 
+class _FCLinearWrapper(base.Layer):
+  """Wraps a _FeatureColumn in a layer for use in a linear model.
+
+  See `linear_model` above.
+  """
+
+  def __init__(self,
+               feature_column,
+               units=1,
+               sparse_combiner='sum',
+               weight_collections=None,
+               trainable=True,
+               name=None,
+               **kwargs):
+    super(_FCLinearWrapper, self).__init__(
+        trainable=trainable, name=name, **kwargs)
+    self._feature_column = feature_column
+    self._units = units
+    self._sparse_combiner = sparse_combiner
+    self._weight_collections = weight_collections
+    self._state = {}
+
+  def build(self, _):
+    self._state = self._feature_column._create_state(  # pylint: disable=protected-access
+        self._weight_collections, self.add_variable)
+
+    if isinstance(self._feature_column, _CategoricalColumn):
+      weight = self.add_variable(
+          name='weights',
+          shape=(self._feature_column._num_buckets, self._units),  # pylint: disable=protected-access
+          initializer=init_ops.zeros_initializer(),
+          trainable=self.trainable)
+    else:
+      num_elements = self._feature_column._variable_shape.num_elements()  # pylint: disable=protected-access
+      weight = self.add_variable(
+          name='weights',
+          shape=[num_elements, self._units],
+          initializer=init_ops.zeros_initializer(),
+          trainable=self.trainable)
+    ops.add_to_collections(self._weight_collections, weight)
+    self._weight_var = weight
+    self.built = True
+
+  def call(self, builder):
+    weighted_sum = _create_weighted_sum(
+        column=self._feature_column,
+        builder=builder,
+        units=self._units,
+        sparse_combiner=self._sparse_combiner,
+        weight_collections=self._weight_collections,
+        trainable=self.trainable,
+        weight_var=self._weight_var,
+        state=self._state)
+    return weighted_sum
+
+
+class _BiasLayer(base.Layer):
+  """A layer for the bias term.
+  """
+
+  def __init__(self,
+               units=1,
+               trainable=True,
+               weight_collections=None,
+               name=None,
+               **kwargs):
+    super(_BiasLayer, self).__init__(trainable=trainable, name=name, **kwargs)
+    self._units = units
+    self._weight_collections = weight_collections
+
+  def build(self, _):
+    self._bias_variable = self.add_variable(
+        'bias_weights',
+        shape=[self._units],
+        initializer=init_ops.zeros_initializer(),
+        trainable=self.trainable)
+    ops.add_to_collections(self._weight_collections, self._bias_variable)
+    self.built = True
+
+  def call(self, _):
+    return self._bias_variable
+
+
+class _LinearModel(training.Model):
+  """Creates a linear model using feature columns.
+  """
+
+  def __init__(self,
+               feature_columns,
+               units=1,
+               sparse_combiner='sum',
+               weight_collections=None,
+               trainable=True,
+               name=None,
+               **kwargs):
+    super(_LinearModel, self).__init__(name=name, **kwargs)
+    self._feature_columns = _clean_feature_columns(feature_columns)
+    self._weight_collections = list(weight_collections or [])
+    if ops.GraphKeys.MODEL_VARIABLES not in self._weight_collections:
+      self._weight_collections.append(ops.GraphKeys.MODEL_VARIABLES)
+
+    column_layers = {}
+    for column in sorted(self._feature_columns, key=lambda x: x.name):
+      with variable_scope.variable_scope(
+          None, default_name=column._var_scope_name) as vs:  # pylint: disable=protected-access
+        column_name = vs.name
+      column_layer = _FCLinearWrapper(column, units, sparse_combiner,
+                                      self._weight_collections, trainable,
+                                      column_name, **kwargs)
+      column_layers[column_name] = column_layer
+    self._column_layers = self._add_layers(column_layers)
+    self._bias_layer = _BiasLayer(
+        units=units,
+        trainable=trainable,
+        weight_collections=self._weight_collections,
+        name='bias_layer',
+        **kwargs)
+
+  def call(self, features):
+    for column in self._feature_columns:
+      if not isinstance(column, (_DenseColumn, _CategoricalColumn)):
+        raise ValueError(
+            'Items of feature_columns must be either a '
+            '_DenseColumn or _CategoricalColumn. Given: {}'.format(column))
+    weighted_sums = []
+    ordered_columns = []
+    builder = _LazyBuilder(features)
+    for layer in sorted(self._column_layers.values(), key=lambda x: x.name):
+      ordered_columns.append(layer._feature_column)  # pylint: disable=protected-access
+      weighted_sum = layer(builder)
+      weighted_sums.append(weighted_sum)
+
+    _verify_static_batch_size_equality(weighted_sums, ordered_columns)
+    predictions_no_bias = math_ops.add_n(
+        weighted_sums, name='weighted_sum_no_bias')
+    predictions = nn_ops.bias_add(
+        predictions_no_bias, self._bias_layer(builder), name='weighted_sum')  # pylint: disable=not-callable
+    return predictions
+
+  def _add_layers(self, layers):
+    # "Magic" required for keras.Model classes to track all the variables in
+    # a list of layers.Layer objects.
+    # TODO(ashankar): Figure out API so user code doesn't have to do this.
+    for name, layer in layers.items():
+      setattr(self, 'layer-%s' % name, layer)
+    return layers
+
+
 def _transform_features(features, feature_columns):
   """Returns transformed features based on features columns passed in.
 
@@ -1641,6 +1793,19 @@ class _FeatureColumn(object):
     """
     pass
 
+  def _create_state(self, weight_collections=None, creator=None):
+    """Returns an object that captures the state of the column.
+
+    Args:
+      weight_collections: Collections to add the variable to
+      creator: Variable creator method called, if provided.
+
+    Returns:
+      An object that encapsulates the state of the column. Can return None.
+    """
+    del weight_collections, creator  # Unused
+    return None
+
 
 class _DenseColumn(_FeatureColumn):
   """Represents a column which can be represented as `Tensor`.
@@ -1660,7 +1825,11 @@ class _DenseColumn(_FeatureColumn):
     pass
 
   @abc.abstractmethod
-  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+  def _get_dense_tensor(self,
+                        inputs,
+                        weight_collections=None,
+                        trainable=None,
+                        state=None):
     """Returns a `Tensor`.
 
     The output of this function will be used by model-builder-functions. For
@@ -1678,6 +1847,9 @@ class _DenseColumn(_FeatureColumn):
         will be created) are added.
       trainable: If `True` also add variables to the graph collection
         `GraphKeys.TRAINABLE_VARIABLES` (see @{tf.Variable}).
+      state: An object encapsulating the state of the column. Columns that
+        create state using the _create_state method would have that state
+        passed in to this method.
 
     Returns:
       `Tensor` of shape [batch_size] + `_variable_shape`.
@@ -1685,13 +1857,14 @@ class _DenseColumn(_FeatureColumn):
     pass
 
 
-def _create_weighted_sum(
-    column,
-    builder,
-    units,
-    sparse_combiner,
-    weight_collections,
-    trainable):
+def _create_weighted_sum(column,
+                         builder,
+                         units,
+                         sparse_combiner,
+                         weight_collections,
+                         trainable,
+                         weight_var=None,
+                         state=None):
   """Creates a weighted sum for a dense or sparse column for linear_model."""
   if isinstance(column, _CategoricalColumn):
     return _create_categorical_column_weighted_sum(
@@ -1700,32 +1873,50 @@ def _create_weighted_sum(
         units=units,
         sparse_combiner=sparse_combiner,
         weight_collections=weight_collections,
-        trainable=trainable)
+        trainable=trainable,
+        weight_var=weight_var)
   else:
     return _create_dense_column_weighted_sum(
         column=column,
         builder=builder,
         units=units,
         weight_collections=weight_collections,
-        trainable=trainable)
+        trainable=trainable,
+        weight_var=weight_var,
+        state=state)
 
 
-def _create_dense_column_weighted_sum(
-    column, builder, units, weight_collections, trainable):
+def _create_dense_column_weighted_sum(column,
+                                      builder,
+                                      units,
+                                      weight_collections,
+                                      trainable,
+                                      weight_var=None,
+                                      state=None):
   """Create a weighted sum of a dense column for linear_model."""
-  tensor = column._get_dense_tensor(  # pylint: disable=protected-access
-      builder,
-      weight_collections=weight_collections,
-      trainable=trainable)
+  if state is not None:
+    tensor = column._get_dense_tensor(  # pylint: disable=protected-access
+        builder,
+        weight_collections=weight_collections,
+        trainable=trainable,
+        state=state)
+  else:
+    tensor = column._get_dense_tensor(  # pylint: disable=protected-access
+        builder,
+        weight_collections=weight_collections,
+        trainable=trainable)
   num_elements = column._variable_shape.num_elements()  # pylint: disable=protected-access
   batch_size = array_ops.shape(tensor)[0]
   tensor = array_ops.reshape(tensor, shape=(batch_size, num_elements))
-  weight = variable_scope.get_variable(
-      name='weights',
-      shape=[num_elements, units],
-      initializer=init_ops.zeros_initializer(),
-      trainable=trainable,
-      collections=weight_collections)
+  if weight_var is not None:
+    weight = weight_var
+  else:
+    weight = variable_scope.get_variable(
+        name='weights',
+        shape=[num_elements, units],
+        initializer=init_ops.zeros_initializer(),
+        trainable=trainable,
+        collections=weight_collections)
   return math_ops.matmul(tensor, weight, name='weighted_sum')
 
 
@@ -1775,8 +1966,13 @@ class _CategoricalColumn(_FeatureColumn):
     pass
 
 
-def _create_categorical_column_weighted_sum(
-    column, builder, units, sparse_combiner, weight_collections, trainable):
+def _create_categorical_column_weighted_sum(column,
+                                            builder,
+                                            units,
+                                            sparse_combiner,
+                                            weight_collections,
+                                            trainable,
+                                            weight_var=None):
   """Create a weighted sum of a categorical column for linear_model."""
   sparse_tensors = column._get_sparse_tensors(  # pylint: disable=protected-access
       builder,
@@ -1790,12 +1986,15 @@ def _create_categorical_column_weighted_sum(
     weight_tensor = sparse_ops.sparse_reshape(
         weight_tensor, [array_ops.shape(weight_tensor)[0], -1])
 
-  weight = variable_scope.get_variable(
-      name='weights',
-      shape=(column._num_buckets, units),  # pylint: disable=protected-access
-      initializer=init_ops.zeros_initializer(),
-      trainable=trainable,
-      collections=weight_collections)
+  if weight_var is not None:
+    weight = weight_var
+  else:
+    weight = variable_scope.get_variable(
+        name='weights',
+        shape=(column._num_buckets, units),  # pylint: disable=protected-access
+        initializer=init_ops.zeros_initializer(),
+        trainable=trainable,
+        collections=weight_collections)
   return _safe_embedding_lookup_sparse(
       weight,
       id_tensor,
@@ -1889,12 +2088,12 @@ class _LazyBuilder(object):
       self._feature_tensors[key] = feature_tensor
       return feature_tensor
 
-    if not isinstance(key, (str, _FeatureColumn)):
-      raise TypeError('"key" must be either a "str" or "_FeatureColumn". '
-                      'Provided: {}'.format(key))
+    if isinstance(key, str):
+      raise ValueError('Feature {} is not in features dictionary.'.format(key))
 
     if not isinstance(key, _FeatureColumn):
-      raise ValueError('Feature {} is not in features dictionary.'.format(key))
+      raise TypeError('"key" must be either a "str" or "_FeatureColumn". '
+                      'Provided: {}'.format(key))
 
     column = key
     logging.debug('Transforming feature_column %s.', column)
@@ -2193,8 +2392,33 @@ class _EmbeddingColumn(
       self._shape = tensor_shape.vector(self.dimension)
     return self._shape
 
-  def _get_dense_tensor_internal(
-      self, inputs, weight_collections=None, trainable=None):
+  def _create_state(self, weight_collections=None, creator=None):
+    variables_map = {}
+    embedding_shape = (self.categorical_column._num_buckets, self.dimension)  # pylint: disable=protected-access
+    if creator is not None:
+      embedding_weights = creator(
+          name='embedding_weights',
+          shape=embedding_shape,
+          dtype=dtypes.float32,
+          initializer=self.initializer,
+          trainable=self.trainable)
+      ops.add_to_collections(weight_collections, embedding_weights)
+    else:
+      embedding_weights = variable_scope.get_variable(
+          name='embedding_weights',
+          shape=embedding_shape,
+          dtype=dtypes.float32,
+          initializer=self.initializer,
+          trainable=self.trainable,
+          collections=weight_collections)
+    variables_map['embedding_weights'] = embedding_weights
+    return variables_map
+
+  def _get_dense_tensor_internal(self,
+                                 inputs,
+                                 weight_collections=None,
+                                 trainable=None,
+                                 state=None):
     """Private method that follows the signature of _get_dense_tensor."""
     # Get sparse IDs and weights.
     sparse_tensors = self.categorical_column._get_sparse_tensors(  # pylint: disable=protected-access
@@ -2202,14 +2426,10 @@ class _EmbeddingColumn(
     sparse_ids = sparse_tensors.id_tensor
     sparse_weights = sparse_tensors.weight_tensor
 
-    embedding_shape = (self.categorical_column._num_buckets, self.dimension)  # pylint: disable=protected-access
-    embedding_weights = variable_scope.get_variable(
-        name='embedding_weights',
-        shape=embedding_shape,
-        dtype=dtypes.float32,
-        initializer=self.initializer,
-        trainable=self.trainable and trainable,
-        collections=weight_collections)
+    if state is None:
+      state = self._create_state(weight_collections)
+    embedding_weights = state['embedding_weights']
+
     if self.ckpt_to_load_from is not None:
       to_restore = embedding_weights
       if isinstance(to_restore, variables.PartitionedVariable):
@@ -2227,7 +2447,11 @@ class _EmbeddingColumn(
         name='%s_weights' % self.name,
         max_norm=self.max_norm)
 
-  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+  def _get_dense_tensor(self,
+                        inputs,
+                        weight_collections=None,
+                        trainable=None,
+                        state=None):
     if isinstance(self.categorical_column, _SequenceCategoricalColumn):
       raise ValueError(
           'In embedding_column: {}. '
@@ -2240,8 +2464,10 @@ class _EmbeddingColumn(
               self.name, type(self.categorical_column),
               self.categorical_column))
     return self._get_dense_tensor_internal(
-        inputs=inputs, weight_collections=weight_collections,
-        trainable=trainable)
+        inputs=inputs,
+        weight_collections=weight_collections,
+        trainable=trainable,
+        state=state)
 
   def _get_sequence_dense_tensor(
       self, inputs, weight_collections=None, trainable=None):
@@ -2297,7 +2523,39 @@ class _SharedEmbeddingColumn(
       self._shape = tensor_shape.vector(self.dimension)
     return self._shape
 
-  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+  def _create_state(self, weight_collections=None, creator=None):
+    variables_map = {}
+    shared_embedding_collection = ops.get_collection(
+        self.shared_embedding_collection_name)
+    if not shared_embedding_collection:
+      embedding_shape = (self.categorical_column._num_buckets, self.dimension)  # pylint: disable=protected-access
+      if creator is not None:
+        embedding_weights = creator(
+            name='embedding_weights',
+            shape=embedding_shape,
+            dtype=dtypes.float32,
+            initializer=self.initializer,
+            trainable=self.trainable)
+        ops.add_to_collections(weight_collections, embedding_weights)
+      else:
+        embedding_weights = variable_scope.get_variable(
+            name='embedding_weights',
+            shape=embedding_shape,
+            dtype=dtypes.float32,
+            initializer=self.initializer,
+            trainable=self.trainable,
+            collections=weight_collections)
+      ops.add_to_collection(self.shared_embedding_collection_name,
+                            embedding_weights)
+      variables_map['embedding_weights'] = embedding_weights
+
+    return variables_map
+
+  def _get_dense_tensor(self,
+                        inputs,
+                        weight_collections=None,
+                        trainable=None,
+                        state=None):
     # This method is called from a variable_scope with name _var_scope_name,
     # which is shared among all shared embeddings. Open a name_scope here, so
     # that the ops for different columns have distinct names.
diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py
index 6f366e77229577b1a6a5363f882daa07203f525c..07588af37ee92eb2143d20eafa2874d794360fa4 100644
--- a/tensorflow/python/feature_column/feature_column_test.py
+++ b/tensorflow/python/feature_column/feature_column_test.py
@@ -34,6 +34,7 @@ from tensorflow.python.feature_column.feature_column import _CategoricalColumn
 from tensorflow.python.feature_column.feature_column import _DenseColumn
 from tensorflow.python.feature_column.feature_column import _FeatureColumn
 from tensorflow.python.feature_column.feature_column import _LazyBuilder
+from tensorflow.python.feature_column.feature_column import _LinearModel
 from tensorflow.python.feature_column.feature_column import _transform_features
 from tensorflow.python.feature_column.feature_column import InputLayer
 from tensorflow.python.framework import constant_op
@@ -339,6 +340,20 @@ class NumericColumnTest(test.TestCase):
         sess.run(price_var.assign([[10.]]))
         self.assertAllClose([[10.], [50.]], predictions.eval())
 
+  def test_keras_linear_model(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default():
+      features = {'price': [[1.], [5.]]}
+      predictions = get_keras_linear_model_predictions(features, [price])
+      bias = get_keras_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([[0.]], price_var.eval())
+        self.assertAllClose([[0.], [0.]], predictions.eval())
+        sess.run(price_var.assign([[10.]]))
+        self.assertAllClose([[10.], [50.]], predictions.eval())
+
 
 class BucketizedColumnTest(test.TestCase):
 
@@ -561,6 +576,62 @@ class BucketizedColumnTest(test.TestCase):
         sess.run(bias.assign([1.]))
         self.assertAllClose([[81.], [141.]], predictions.eval())
 
+  def test_keras_linear_model_one_input_value(self):
+    """Tests _LinearModel for input with shape=[1]."""
+    price = fc.numeric_column('price', shape=[1])
+    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    with ops.Graph().as_default():
+      features = {'price': [[-1.], [1.], [5.], [6.]]}
+      predictions = get_keras_linear_model_predictions(features,
+                                                       [bucketized_price])
+      bias = get_keras_linear_model_bias()
+      bucketized_price_var = get_linear_model_column_var(bucketized_price)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias.eval())
+        # One weight variable per bucket, all initialized to zero.
+        self.assertAllClose([[0.], [0.], [0.], [0.], [0.]],
+                            bucketized_price_var.eval())
+        self.assertAllClose([[0.], [0.], [0.], [0.]], predictions.eval())
+        sess.run(
+            bucketized_price_var.assign([[10.], [20.], [30.], [40.], [50.]]))
+        # price -1. is in the 0th bucket, whose weight is 10.
+        # price 1. is in the 1st bucket, whose weight is 20.
+        # price 5. is in the 3rd bucket, whose weight is 40.
+        # price 6. is in the 4th bucket, whose weight is 50.
+        self.assertAllClose([[10.], [20.], [40.], [50.]], predictions.eval())
+        sess.run(bias.assign([1.]))
+        self.assertAllClose([[11.], [21.], [41.], [51.]], predictions.eval())
+
+  def test_keras_linear_model_two_input_values(self):
+    """Tests _LinearModel for input with shape=[2]."""
+    price = fc.numeric_column('price', shape=[2])
+    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    with ops.Graph().as_default():
+      features = {'price': [[-1., 1.], [5., 6.]]}
+      predictions = get_keras_linear_model_predictions(features,
+                                                       [bucketized_price])
+      bias = get_keras_linear_model_bias()
+      bucketized_price_var = get_linear_model_column_var(bucketized_price)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias.eval())
+        # One weight per bucket per input column, all initialized to zero.
+        self.assertAllClose(
+            [[0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.]],
+            bucketized_price_var.eval())
+        self.assertAllClose([[0.], [0.]], predictions.eval())
+        sess.run(
+            bucketized_price_var.assign([[10.], [20.], [30.], [40.], [50.],
+                                         [60.], [70.], [80.], [90.], [100.]]))
+        # 1st example:
+        #   price -1. is in the 0th bucket, whose weight is 10.
+        #   price 1. is in the 6th bucket, whose weight is 70.
+        # 2nd example:
+        #   price 5. is in the 3rd bucket, whose weight is 40.
+        #   price 6. is in the 9th bucket, whose weight is 100.
+        self.assertAllClose([[80.], [140.]], predictions.eval())
+        sess.run(bias.assign([1.]))
+        self.assertAllClose([[81.], [141.]], predictions.eval())
+
 
 class HashedCategoricalColumnTest(test.TestCase):
 
@@ -767,6 +838,28 @@ class HashedCategoricalColumnTest(test.TestCase):
         # 'skywalker' -> 2, 'omar' -> 2: wire_var[2] + wire_var[2] = 3+3 = 6
         self.assertAllClose(((4.,), (6.,)), predictions.eval())
 
+  def test_keras_linear_model(self):
+    wire_column = fc.categorical_column_with_hash_bucket('wire', 4)
+    self.assertEqual(4, wire_column._num_buckets)
+    with ops.Graph().as_default():
+      predictions = get_keras_linear_model_predictions({
+          wire_column.name:
+              sparse_tensor.SparseTensorValue(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=('marlo', 'skywalker', 'omar'),
+                  dense_shape=(2, 2))
+      }, (wire_column,))
+      bias = get_keras_linear_model_bias()
+      wire_var = get_linear_model_column_var(wire_column)
+      with _initialized_session():
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
+        # 'marlo' -> 3: wire_var[3] = 4
+        # 'skywalker' -> 2, 'omar' -> 2: wire_var[2] + wire_var[2] = 3+3 = 6
+        self.assertAllClose(((4.,), (6.,)), predictions.eval())
+
 
 class CrossedColumnTest(test.TestCase):
 
@@ -1060,6 +1153,96 @@ class CrossedColumnTest(test.TestCase):
                 dense_shape=(2, 2)),
         }, (crossed,))
 
+  def test_keras_linear_model(self):
+    """Tests _LinearModel.
+
+    Uses data from test_get_sparse_tesnsors_simple.
+    """
+    a = fc.numeric_column('a', dtype=dtypes.int32, shape=(2,))
+    b = fc.bucketized_column(a, boundaries=(0, 1))
+    crossed = fc.crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5)
+    with ops.Graph().as_default():
+      predictions = get_keras_linear_model_predictions({
+          'a':
+              constant_op.constant(((-1., .5), (.5, 1.))),
+          'c':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=['cA', 'cB', 'cC'],
+                  dense_shape=(2, 2)),
+      }, (crossed,))
+      bias = get_keras_linear_model_bias()
+      crossed_var = get_linear_model_column_var(crossed)
+      with _initialized_session() as sess:
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,), (0.,)),
+                            crossed_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        sess.run(crossed_var.assign(((1.,), (2.,), (3.,), (4.,), (5.,))))
+        # Expected ids after cross = (1, 0, 1, 3, 4, 2)
+        self.assertAllClose(((3.,), (14.,)), predictions.eval())
+        sess.run(bias.assign((.1,)))
+        self.assertAllClose(((3.1,), (14.1,)), predictions.eval())
+
+  def test_keras_linear_model_with_weights(self):
+
+    class _TestColumnWithWeights(_CategoricalColumn):
+      """Produces sparse IDs and sparse weights."""
+
+      @property
+      def name(self):
+        return 'test_column'
+
+      @property
+      def _parse_example_spec(self):
+        return {
+            self.name:
+                parsing_ops.VarLenFeature(dtypes.int32),
+            '{}_weights'.format(self.name):
+                parsing_ops.VarLenFeature(dtypes.float32),
+        }
+
+      @property
+      def _num_buckets(self):
+        return 5
+
+      def _transform_feature(self, inputs):
+        return (inputs.get(self.name),
+                inputs.get('{}_weights'.format(self.name)))
+
+      def _get_sparse_tensors(self,
+                              inputs,
+                              weight_collections=None,
+                              trainable=None):
+        """Populates both id_tensor and weight_tensor."""
+        ids_and_weights = inputs.get(self)
+        return _CategoricalColumn.IdWeightPair(
+            id_tensor=ids_and_weights[0], weight_tensor=ids_and_weights[1])
+
+    t = _TestColumnWithWeights()
+    crossed = fc.crossed_column([t, 'c'], hash_bucket_size=5, hash_key=5)
+    with ops.Graph().as_default():
+      with self.assertRaisesRegexp(
+          ValueError,
+          'crossed_column does not support weight_tensor.*{}'.format(t.name)):
+        get_keras_linear_model_predictions({
+            t.name:
+                sparse_tensor.SparseTensor(
+                    indices=((0, 0), (1, 0), (1, 1)),
+                    values=[0, 1, 2],
+                    dense_shape=(2, 2)),
+            '{}_weights'.format(t.name):
+                sparse_tensor.SparseTensor(
+                    indices=((0, 0), (1, 0), (1, 1)),
+                    values=[1., 10., 2.],
+                    dense_shape=(2, 2)),
+            'c':
+                sparse_tensor.SparseTensor(
+                    indices=((0, 0), (1, 0), (1, 1)),
+                    values=['cA', 'cB', 'cC'],
+                    dense_shape=(2, 2)),
+        }, (crossed,))
+
 
 def get_linear_model_bias():
   with variable_scope.variable_scope('linear_model', reuse=True):
@@ -1071,6 +1254,28 @@ def get_linear_model_column_var(column):
                             'linear_model/' + column.name)[0]
 
 
+def get_keras_linear_model_bias():
+  with variable_scope.variable_scope('linear_model', reuse=True):
+    with variable_scope.variable_scope('bias_layer', reuse=True):
+      return variable_scope.get_variable('bias_weights')
+
+
+def get_keras_linear_model_predictions(features,
+                                       feature_columns,
+                                       units=1,
+                                       sparse_combiner='sum',
+                                       weight_collections=None,
+                                       trainable=True):
+  keras_linear_model = _LinearModel(
+      feature_columns,
+      units,
+      sparse_combiner,
+      weight_collections,
+      trainable,
+      name='linear_model')
+  return keras_linear_model(features)  # pylint: disable=not-callable
+
+
 @test_util.with_c_api
 class LinearModelTest(test.TestCase):
 
@@ -1698,125 +1903,748 @@ class LinearModelTest(test.TestCase):
         sess.run(net, feed_dict={features['price']: np.array(1)})
 
 
-class InputLayerTest(test.TestCase):
-
-  @test_util.run_in_graph_and_eager_modes()
-  def test_retrieving_input(self):
-    features = {'a': [0.]}
-    input_layer = InputLayer(fc.numeric_column('a'))
-    inputs = self.evaluate(input_layer(features))
-    self.assertAllClose([[0.]], inputs)
-
-  def test_reuses_variables(self):
-    with context.eager_mode():
-      sparse_input = sparse_tensor.SparseTensor(
-          indices=((0, 0), (1, 0), (2, 0)),
-          values=(0, 1, 2),
-          dense_shape=(3, 3))
-
-      # Create feature columns (categorical and embedding).
-      categorical_column = fc.categorical_column_with_identity(key='a',
-                                                               num_buckets=3)
-      embedding_dimension = 2
-      def _embedding_column_initializer(shape, dtype, partition_info):
-        del shape  # unused
-        del dtype  # unused
-        del partition_info  # unused
-        embedding_values = (
-            (1, 0),  # id 0
-            (0, 1),  # id 1
-            (1, 1))  # id 2
-        return embedding_values
-      embedding_column = fc.embedding_column(
-          categorical_column,
-          dimension=embedding_dimension,
-          initializer=_embedding_column_initializer)
-
-      input_layer = InputLayer([embedding_column])
-      features = {'a': sparse_input}
-
-      inputs = input_layer(features)
-      variables = input_layer.variables
-
-      # Sanity check: test that the inputs are correct.
-      self.assertAllEqual([[1, 0], [0, 1], [1, 1]], inputs)
-
-      # Check that only one variable was created.
-      self.assertEqual(1, len(variables))
-
-      # Check that invoking input_layer on the same features does not create
-      # additional variables
-      _ = input_layer(features)
-      self.assertEqual(1, len(variables))
-      self.assertEqual(variables[0], input_layer.variables[0])
-
-  def test_feature_column_input_layer_gradient(self):
-    with context.eager_mode():
-      sparse_input = sparse_tensor.SparseTensor(
-          indices=((0, 0), (1, 0), (2, 0)),
-          values=(0, 1, 2),
-          dense_shape=(3, 3))
-
-      # Create feature columns (categorical and embedding).
-      categorical_column = fc.categorical_column_with_identity(key='a',
-                                                               num_buckets=3)
-      embedding_dimension = 2
-
-      def _embedding_column_initializer(shape, dtype, partition_info):
-        del shape  # unused
-        del dtype  # unused
-        del partition_info  # unused
-        embedding_values = (
-            (1, 0),  # id 0
-            (0, 1),  # id 1
-            (1, 1))  # id 2
-        return embedding_values
-
-      embedding_column = fc.embedding_column(
-          categorical_column,
-          dimension=embedding_dimension,
-          initializer=_embedding_column_initializer)
-
-      input_layer = InputLayer([embedding_column])
-      features = {'a': sparse_input}
+@test_util.with_c_api
+class _LinearModelTest(test.TestCase):
 
-      def scale_matrix():
-        matrix = input_layer(features)
-        return 2 * matrix
+  def test_raises_if_empty_feature_columns(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 'feature_columns must not be empty'):
+      get_keras_linear_model_predictions(features={}, feature_columns=[])
 
-      # Sanity check: Verify that scale_matrix returns the correct output.
-      self.assertAllEqual([[2, 0], [0, 2], [2, 2]], scale_matrix())
+  def test_should_be_feature_column(self):
+    with self.assertRaisesRegexp(ValueError, 'must be a _FeatureColumn'):
+      get_keras_linear_model_predictions(
+          features={'a': [[0]]}, feature_columns='NotSupported')
 
-      # Check that the returned gradient is correct.
-      grad_function = backprop.implicit_grad(scale_matrix)
-      grads_and_vars = grad_function()
-      indexed_slice = grads_and_vars[0][0]
-      gradient = grads_and_vars[0][0].values
+  def test_should_be_dense_or_categorical_column(self):
 
-      self.assertAllEqual([0, 1, 2], indexed_slice.indices)
-      self.assertAllEqual([[2, 2], [2, 2], [2, 2]], gradient)
+    class NotSupportedColumn(_FeatureColumn):
 
+      @property
+      def name(self):
+        return 'NotSupportedColumn'
 
-@test_util.with_c_api
-class FunctionalInputLayerTest(test.TestCase):
+      def _transform_feature(self, cache):
+        pass
 
-  def test_raises_if_empty_feature_columns(self):
-    with self.assertRaisesRegexp(ValueError,
-                                 'feature_columns must not be empty'):
-      fc.input_layer(features={}, feature_columns=[])
+      @property
+      def _parse_example_spec(self):
+        pass
 
-  def test_should_be_dense_column(self):
-    with self.assertRaisesRegexp(ValueError, 'must be a _DenseColumn'):
-      fc.input_layer(
-          features={'a': [[0]]},
-          feature_columns=[
-              fc.categorical_column_with_hash_bucket('wire_cast', 4)
-          ])
+    with self.assertRaisesRegexp(
+        ValueError, 'must be either a _DenseColumn or _CategoricalColumn'):
+      get_keras_linear_model_predictions(
+          features={'a': [[0]]}, feature_columns=[NotSupportedColumn()])
 
   def test_does_not_support_dict_columns(self):
     with self.assertRaisesRegexp(
         ValueError, 'Expected feature_columns to be iterable, found dict.'):
-      fc.input_layer(
+      fc.linear_model(
+          features={'a': [[0]]}, feature_columns={'a': fc.numeric_column('a')})
+
+  def test_raises_if_duplicate_name(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Duplicate feature column name found for columns'):
+      get_keras_linear_model_predictions(
+          features={'a': [[0]]},
+          feature_columns=[fc.numeric_column('a'),
+                           fc.numeric_column('a')])
+
+  def test_dense_bias(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default():
+      features = {'price': [[1.], [5.]]}
+      predictions = get_keras_linear_model_predictions(features, [price])
+      bias = get_keras_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias.eval())
+        sess.run(price_var.assign([[10.]]))
+        sess.run(bias.assign([5.]))
+        self.assertAllClose([[15.], [55.]], predictions.eval())
+
+  def test_sparse_bias(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default():
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {'wire_cast': wire_tensor}
+      predictions = get_keras_linear_model_predictions(features, [wire_cast])
+      bias = get_keras_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([[0.], [0.], [0.], [0.]], wire_cast_var.eval())
+        sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
+        sess.run(bias.assign([5.]))
+        self.assertAllClose([[1005.], [10015.]], predictions.eval())
+
+  def test_dense_and_sparse_bias(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default():
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {'wire_cast': wire_tensor, 'price': [[1.], [5.]]}
+      predictions = get_keras_linear_model_predictions(features,
+                                                       [wire_cast, price])
+      bias = get_keras_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
+        sess.run(bias.assign([5.]))
+        sess.run(price_var.assign([[10.]]))
+        self.assertAllClose([[1015.], [10065.]], predictions.eval())
+
+  def test_dense_and_sparse_column(self):
+    """When the column is both dense and sparse, uses sparse tensors."""
+
+    class _DenseAndSparseColumn(_DenseColumn, _CategoricalColumn):
+
+      @property
+      def name(self):
+        return 'dense_and_sparse_column'
+
+      @property
+      def _parse_example_spec(self):
+        return {self.name: parsing_ops.VarLenFeature(self.dtype)}
+
+      def _transform_feature(self, inputs):
+        return inputs.get(self.name)
+
+      @property
+      def _variable_shape(self):
+        raise ValueError('Should not use this method.')
+
+      def _get_dense_tensor(self,
+                            inputs,
+                            weight_collections=None,
+                            trainable=None):
+        raise ValueError('Should not use this method.')
+
+      @property
+      def _num_buckets(self):
+        return 4
+
+      def _get_sparse_tensors(self,
+                              inputs,
+                              weight_collections=None,
+                              trainable=None):
+        sp_tensor = sparse_tensor.SparseTensor(
+            indices=[[0, 0], [1, 0], [1, 1]],
+            values=[2, 0, 3],
+            dense_shape=[2, 2])
+        return _CategoricalColumn.IdWeightPair(sp_tensor, None)
+
+    dense_and_sparse_column = _DenseAndSparseColumn()
+    with ops.Graph().as_default():
+      sp_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {dense_and_sparse_column.name: sp_tensor}
+      predictions = get_keras_linear_model_predictions(
+          features, [dense_and_sparse_column])
+      bias = get_keras_linear_model_bias()
+      dense_and_sparse_column_var = get_linear_model_column_var(
+          dense_and_sparse_column)
+      with _initialized_session() as sess:
+        sess.run(
+            dense_and_sparse_column_var.assign([[10.], [100.], [1000.],
+                                                [10000.]]))
+        sess.run(bias.assign([5.]))
+        self.assertAllClose([[1005.], [10015.]], predictions.eval())
+
+  def test_dense_multi_output(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default():
+      features = {'price': [[1.], [5.]]}
+      predictions = get_keras_linear_model_predictions(
+          features, [price], units=3)
+      bias = get_keras_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        self.assertAllClose(np.zeros((3,)), bias.eval())
+        self.assertAllClose(np.zeros((1, 3)), price_var.eval())
+        sess.run(price_var.assign([[10., 100., 1000.]]))
+        sess.run(bias.assign([5., 6., 7.]))
+        self.assertAllClose([[15., 106., 1007.], [55., 506., 5007.]],
+                            predictions.eval())
+
+  def test_sparse_multi_output(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default():
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {'wire_cast': wire_tensor}
+      predictions = get_keras_linear_model_predictions(
+          features, [wire_cast], units=3)
+      bias = get_keras_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      with _initialized_session() as sess:
+        self.assertAllClose(np.zeros((3,)), bias.eval())
+        self.assertAllClose(np.zeros((4, 3)), wire_cast_var.eval())
+        sess.run(
+            wire_cast_var.assign([[10., 11., 12.], [100., 110., 120.],
+                                  [1000., 1100.,
+                                   1200.], [10000., 11000., 12000.]]))
+        sess.run(bias.assign([5., 6., 7.]))
+        self.assertAllClose([[1005., 1106., 1207.], [10015., 11017., 12019.]],
+                            predictions.eval())
+
+  def test_dense_multi_dimension(self):
+    price = fc.numeric_column('price', shape=2)
+    with ops.Graph().as_default():
+      features = {'price': [[1., 2.], [5., 6.]]}
+      predictions = get_keras_linear_model_predictions(features, [price])
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        self.assertAllClose([[0.], [0.]], price_var.eval())
+        sess.run(price_var.assign([[10.], [100.]]))
+        self.assertAllClose([[210.], [650.]], predictions.eval())
+
+  def test_sparse_multi_rank(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default():
+      wire_tensor = array_ops.sparse_placeholder(dtypes.string)
+      wire_value = sparse_tensor.SparseTensorValue(
+          values=['omar', 'stringer', 'marlo', 'omar'],  # hashed = [2, 0, 3, 2]
+          indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 0, 1]],
+          dense_shape=[2, 2, 2])
+      features = {'wire_cast': wire_tensor}
+      predictions = get_keras_linear_model_predictions(features, [wire_cast])
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      with _initialized_session() as sess:
+        self.assertAllClose(np.zeros((4, 1)), wire_cast_var.eval())
+        self.assertAllClose(
+            np.zeros((2, 1)),
+            predictions.eval(feed_dict={wire_tensor: wire_value}))
+        sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
+        self.assertAllClose(
+            [[1010.], [11000.]],
+            predictions.eval(feed_dict={wire_tensor: wire_value}))
+
+  def test_sparse_combiner(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default():
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {'wire_cast': wire_tensor}
+      predictions = get_keras_linear_model_predictions(
+          features, [wire_cast], sparse_combiner='mean')
+      bias = get_keras_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      with _initialized_session() as sess:
+        sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
+        sess.run(bias.assign([5.]))
+        self.assertAllClose([[1005.], [5010.]], predictions.eval())
+
+  def test_dense_multi_dimension_multi_output(self):
+    price = fc.numeric_column('price', shape=2)
+    with ops.Graph().as_default():
+      features = {'price': [[1., 2.], [5., 6.]]}
+      predictions = get_keras_linear_model_predictions(
+          features, [price], units=3)
+      bias = get_keras_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        self.assertAllClose(np.zeros((3,)), bias.eval())
+        self.assertAllClose(np.zeros((2, 3)), price_var.eval())
+        sess.run(price_var.assign([[1., 2., 3.], [10., 100., 1000.]]))
+        sess.run(bias.assign([2., 3., 4.]))
+        self.assertAllClose([[23., 205., 2007.], [67., 613., 6019.]],
+                            predictions.eval())
+
+  def test_raises_if_shape_mismatch(self):
+    price = fc.numeric_column('price', shape=2)
+    with ops.Graph().as_default():
+      features = {'price': [[1.], [5.]]}
+      if ops._USE_C_API:
+        with self.assertRaisesRegexp(
+            Exception,
+            r'Cannot reshape a tensor with 2 elements to shape \[2,2\]'):
+          predictions = get_keras_linear_model_predictions(features, [price])
+      else:
+        predictions = get_keras_linear_model_predictions(features, [price])
+        with _initialized_session():
+          with self.assertRaisesRegexp(Exception, 'requested shape has 4'):
+            predictions.eval()
+
+  def test_dense_reshaping(self):
+    price = fc.numeric_column('price', shape=[1, 2])
+    with ops.Graph().as_default():
+      features = {'price': [[[1., 2.]], [[5., 6.]]]}
+      predictions = get_keras_linear_model_predictions(features, [price])
+      bias = get_keras_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([[0.], [0.]], price_var.eval())
+        self.assertAllClose([[0.], [0.]], predictions.eval())
+        sess.run(price_var.assign([[10.], [100.]]))
+        self.assertAllClose([[210.], [650.]], predictions.eval())
+
+  def test_dense_multi_column(self):
+    price1 = fc.numeric_column('price1', shape=2)
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]}
+      predictions = get_keras_linear_model_predictions(features,
+                                                       [price1, price2])
+      bias = get_keras_linear_model_bias()
+      price1_var = get_linear_model_column_var(price1)
+      price2_var = get_linear_model_column_var(price2)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([[0.], [0.]], price1_var.eval())
+        self.assertAllClose([[0.]], price2_var.eval())
+        self.assertAllClose([[0.], [0.]], predictions.eval())
+        sess.run(price1_var.assign([[10.], [100.]]))
+        sess.run(price2_var.assign([[1000.]]))
+        sess.run(bias.assign([7.]))
+        self.assertAllClose([[3217.], [4657.]], predictions.eval())
+
+  def test_dense_collection(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default() as g:
+      features = {'price': [[1.], [5.]]}
+      get_keras_linear_model_predictions(
+          features, [price], weight_collections=['my-vars'])
+      my_vars = g.get_collection('my-vars')
+      bias = get_keras_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      self.assertIn(bias, my_vars)
+      self.assertIn(price_var, my_vars)
+
+  def test_sparse_collection(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default() as g:
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
+      features = {'wire_cast': wire_tensor}
+      get_keras_linear_model_predictions(
+          features, [wire_cast], weight_collections=['my-vars'])
+      my_vars = g.get_collection('my-vars')
+      bias = get_keras_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      self.assertIn(bias, my_vars)
+      self.assertIn(wire_cast_var, my_vars)
+
+  def test_dense_trainable_default(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default() as g:
+      features = {'price': [[1.], [5.]]}
+      get_keras_linear_model_predictions(features, [price])
+      bias = get_keras_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      self.assertIn(bias, trainable_vars)
+      self.assertIn(price_var, trainable_vars)
+
+  def test_sparse_trainable_default(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default() as g:
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
+      features = {'wire_cast': wire_tensor}
+      get_keras_linear_model_predictions(features, [wire_cast])
+      trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      bias = get_keras_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      self.assertIn(bias, trainable_vars)
+      self.assertIn(wire_cast_var, trainable_vars)
+
+  def test_dense_trainable_false(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default() as g:
+      features = {'price': [[1.], [5.]]}
+      get_keras_linear_model_predictions(features, [price], trainable=False)
+      trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      self.assertEqual([], trainable_vars)
+
+  def test_sparse_trainable_false(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default() as g:
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
+      features = {'wire_cast': wire_tensor}
+      get_keras_linear_model_predictions(features, [wire_cast], trainable=False)
+      trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      self.assertEqual([], trainable_vars)
+
+  def test_column_order(self):
+    price_a = fc.numeric_column('price_a')
+    price_b = fc.numeric_column('price_b')
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default() as g:
+      features = {
+          'price_a': [[1.]],
+          'price_b': [[3.]],
+          'wire_cast':
+              sparse_tensor.SparseTensor(
+                  values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
+      }
+      get_keras_linear_model_predictions(
+          features, [price_a, wire_cast, price_b],
+          weight_collections=['my-vars'])
+      my_vars = g.get_collection('my-vars')
+      self.assertIn('price_a', my_vars[0].name)
+      self.assertIn('price_b', my_vars[1].name)
+      self.assertIn('wire_cast', my_vars[2].name)
+
+    with ops.Graph().as_default() as g:
+      features = {
+          'price_a': [[1.]],
+          'price_b': [[3.]],
+          'wire_cast':
+              sparse_tensor.SparseTensor(
+                  values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
+      }
+      get_keras_linear_model_predictions(
+          features, [wire_cast, price_b, price_a],
+          weight_collections=['my-vars'])
+      my_vars = g.get_collection('my-vars')
+      self.assertIn('price_a', my_vars[0].name)
+      self.assertIn('price_b', my_vars[1].name)
+      self.assertIn('wire_cast', my_vars[2].name)
+
+  def test_static_batch_size_mismatch(self):
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {
+          'price1': [[1.], [5.], [7.]],  # batchsize = 3
+          'price2': [[3.], [4.]]  # batchsize = 2
+      }
+    with self.assertRaisesRegexp(
+        ValueError,
+        'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+      get_keras_linear_model_predictions(features, [price1, price2])
+
+  def test_subset_of_static_batch_size_mismatch(self):
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    price3 = fc.numeric_column('price3')
+    with ops.Graph().as_default():
+      features = {
+          'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
+          'price2': [[3.], [4.]],  # batchsize = 2
+          'price3': [[3.], [4.], [5.]]  # batchsize = 3
+      }
+      with self.assertRaisesRegexp(
+          ValueError,
+          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+        get_keras_linear_model_predictions(features, [price1, price2, price3])
+
+  def test_runtime_batch_size_mismatch(self):
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {
+          'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
+          'price2': [[3.], [4.]]  # batchsize = 2
+      }
+      predictions = get_keras_linear_model_predictions(features,
+                                                       [price1, price2])
+      with _initialized_session() as sess:
+        with self.assertRaisesRegexp(errors.OpError,
+                                     'must have the same size and shape'):
+          sess.run(
+              predictions, feed_dict={features['price1']: [[1.], [5.], [7.]]})
+
+  def test_runtime_batch_size_matches(self):
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {
+          'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
+          'price2': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
+      }
+      predictions = get_keras_linear_model_predictions(features,
+                                                       [price1, price2])
+      with _initialized_session() as sess:
+        sess.run(
+            predictions,
+            feed_dict={
+                features['price1']: [[1.], [5.]],
+                features['price2']: [[1.], [5.]],
+            })
+
+  def test_with_numpy_input_fn(self):
+    price = fc.numeric_column('price')
+    price_buckets = fc.bucketized_column(
+        price, boundaries=[
+            0.,
+            10.,
+            100.,
+        ])
+    body_style = fc.categorical_column_with_vocabulary_list(
+        'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
+
+    input_fn = numpy_io.numpy_input_fn(
+        x={
+            'price': np.array([-1., 2., 13., 104.]),
+            'body-style': np.array(['sedan', 'hardtop', 'wagon', 'sedan']),
+        },
+        batch_size=2,
+        shuffle=False)
+    features = input_fn()
+    net = get_keras_linear_model_predictions(features,
+                                             [price_buckets, body_style])
+    # self.assertEqual(1 + 3 + 5, net.shape[1])
+    with _initialized_session() as sess:
+      coord = coordinator.Coordinator()
+      threads = queue_runner_impl.start_queue_runners(sess, coord=coord)
+
+      bias = get_keras_linear_model_bias()
+      price_buckets_var = get_linear_model_column_var(price_buckets)
+      body_style_var = get_linear_model_column_var(body_style)
+
+      sess.run(price_buckets_var.assign([[10.], [100.], [1000.], [10000.]]))
+      sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
+      sess.run(bias.assign([5.]))
+
+      self.assertAllClose([[10 - 1000 + 5.], [100 - 10 + 5.]], sess.run(net))
+
+      coord.request_stop()
+      coord.join(threads)
+
+  def test_with_1d_sparse_tensor(self):
+    price = fc.numeric_column('price')
+    price_buckets = fc.bucketized_column(
+        price, boundaries=[
+            0.,
+            10.,
+            100.,
+        ])
+    body_style = fc.categorical_column_with_vocabulary_list(
+        'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
+
+    # Provides 1-dim tensor and dense tensor.
+    features = {
+        'price':
+            constant_op.constant([
+                -1.,
+                12.,
+            ]),
+        'body-style':
+            sparse_tensor.SparseTensor(
+                indices=((0,), (1,)),
+                values=('sedan', 'hardtop'),
+                dense_shape=(2,)),
+    }
+    self.assertEqual(1, features['price'].shape.ndims)
+    self.assertEqual(1, features['body-style'].dense_shape.get_shape()[0])
+
+    net = get_keras_linear_model_predictions(features,
+                                             [price_buckets, body_style])
+    with _initialized_session() as sess:
+      bias = get_keras_linear_model_bias()
+      price_buckets_var = get_linear_model_column_var(price_buckets)
+      body_style_var = get_linear_model_column_var(body_style)
+
+      sess.run(price_buckets_var.assign([[10.], [100.], [1000.], [10000.]]))
+      sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
+      sess.run(bias.assign([5.]))
+
+      self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]], sess.run(net))
+
+  def test_with_1d_unknown_shape_sparse_tensor(self):
+    price = fc.numeric_column('price')
+    price_buckets = fc.bucketized_column(
+        price, boundaries=[
+            0.,
+            10.,
+            100.,
+        ])
+    body_style = fc.categorical_column_with_vocabulary_list(
+        'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
+    country = fc.categorical_column_with_vocabulary_list(
+        'country', vocabulary_list=['US', 'JP', 'CA'])
+
+    # Provides 1-dim tensor and dense tensor.
+    features = {
+        'price': array_ops.placeholder(dtypes.float32),
+        'body-style': array_ops.sparse_placeholder(dtypes.string),
+        'country': array_ops.placeholder(dtypes.string),
+    }
+    self.assertIsNone(features['price'].shape.ndims)
+    self.assertIsNone(features['body-style'].get_shape().ndims)
+
+    price_data = np.array([-1., 12.])
+    body_style_data = sparse_tensor.SparseTensorValue(
+        indices=((0,), (1,)), values=('sedan', 'hardtop'), dense_shape=(2,))
+    country_data = np.array(['US', 'CA'])
+
+    net = get_keras_linear_model_predictions(
+        features, [price_buckets, body_style, country])
+    bias = get_keras_linear_model_bias()
+    price_buckets_var = get_linear_model_column_var(price_buckets)
+    body_style_var = get_linear_model_column_var(body_style)
+    with _initialized_session() as sess:
+      sess.run(price_buckets_var.assign([[10.], [100.], [1000.], [10000.]]))
+      sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
+      sess.run(bias.assign([5.]))
+
+      self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]],
+                          sess.run(
+                              net,
+                              feed_dict={
+                                  features['price']: price_data,
+                                  features['body-style']: body_style_data,
+                                  features['country']: country_data
+                              }))
+
+  def test_with_rank_0_feature(self):
+    price = fc.numeric_column('price')
+    features = {
+        'price': constant_op.constant(0),
+    }
+    self.assertEqual(0, features['price'].shape.ndims)
+
+    # Static rank 0 should fail
+    with self.assertRaisesRegexp(ValueError, 'Feature .* cannot have rank 0'):
+      get_keras_linear_model_predictions(features, [price])
+
+    # Dynamic rank 0 should fail
+    features = {
+        'price': array_ops.placeholder(dtypes.float32),
+    }
+    net = get_keras_linear_model_predictions(features, [price])
+    self.assertEqual(1, net.shape[1])
+    with _initialized_session() as sess:
+      with self.assertRaisesOpError('Feature .* cannot have rank 0'):
+        sess.run(net, feed_dict={features['price']: np.array(1)})
+
+
+class InputLayerTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_retrieving_input(self):
+    features = {'a': [0.]}
+    input_layer = InputLayer(fc.numeric_column('a'))
+    inputs = self.evaluate(input_layer(features))
+    self.assertAllClose([[0.]], inputs)
+
+  def test_reuses_variables(self):
+    with context.eager_mode():
+      sparse_input = sparse_tensor.SparseTensor(
+          indices=((0, 0), (1, 0), (2, 0)),
+          values=(0, 1, 2),
+          dense_shape=(3, 3))
+
+      # Create feature columns (categorical and embedding).
+      categorical_column = fc.categorical_column_with_identity(key='a',
+                                                               num_buckets=3)
+      embedding_dimension = 2
+      def _embedding_column_initializer(shape, dtype, partition_info):
+        del shape  # unused
+        del dtype  # unused
+        del partition_info  # unused
+        embedding_values = (
+            (1, 0),  # id 0
+            (0, 1),  # id 1
+            (1, 1))  # id 2
+        return embedding_values
+      embedding_column = fc.embedding_column(
+          categorical_column,
+          dimension=embedding_dimension,
+          initializer=_embedding_column_initializer)
+
+      input_layer = InputLayer([embedding_column])
+      features = {'a': sparse_input}
+
+      inputs = input_layer(features)
+      variables = input_layer.variables
+
+      # Sanity check: test that the inputs are correct.
+      self.assertAllEqual([[1, 0], [0, 1], [1, 1]], inputs)
+
+      # Check that only one variable was created.
+      self.assertEqual(1, len(variables))
+
+      # Check that invoking input_layer on the same features does not create
+      # additional variables
+      _ = input_layer(features)
+      self.assertEqual(1, len(variables))
+      self.assertEqual(variables[0], input_layer.variables[0])
+
+  def test_feature_column_input_layer_gradient(self):
+    with context.eager_mode():
+      sparse_input = sparse_tensor.SparseTensor(
+          indices=((0, 0), (1, 0), (2, 0)),
+          values=(0, 1, 2),
+          dense_shape=(3, 3))
+
+      # Create feature columns (categorical and embedding).
+      categorical_column = fc.categorical_column_with_identity(key='a',
+                                                               num_buckets=3)
+      embedding_dimension = 2
+
+      def _embedding_column_initializer(shape, dtype, partition_info):
+        del shape  # unused
+        del dtype  # unused
+        del partition_info  # unused
+        embedding_values = (
+            (1, 0),  # id 0
+            (0, 1),  # id 1
+            (1, 1))  # id 2
+        return embedding_values
+
+      embedding_column = fc.embedding_column(
+          categorical_column,
+          dimension=embedding_dimension,
+          initializer=_embedding_column_initializer)
+
+      input_layer = InputLayer([embedding_column])
+      features = {'a': sparse_input}
+
+      def scale_matrix():
+        matrix = input_layer(features)
+        return 2 * matrix
+
+      # Sanity check: Verify that scale_matrix returns the correct output.
+      self.assertAllEqual([[2, 0], [0, 2], [2, 2]], scale_matrix())
+
+      # Check that the returned gradient is correct.
+      grad_function = backprop.implicit_grad(scale_matrix)
+      grads_and_vars = grad_function()
+      indexed_slice = grads_and_vars[0][0]
+      gradient = grads_and_vars[0][0].values
+
+      self.assertAllEqual([0, 1, 2], indexed_slice.indices)
+      self.assertAllEqual([[2, 2], [2, 2], [2, 2]], gradient)
+
+
+@test_util.with_c_api
+class FunctionalInputLayerTest(test.TestCase):
+
+  def test_raises_if_empty_feature_columns(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 'feature_columns must not be empty'):
+      fc.input_layer(features={}, feature_columns=[])
+
+  def test_should_be_dense_column(self):
+    with self.assertRaisesRegexp(ValueError, 'must be a _DenseColumn'):
+      fc.input_layer(
+          features={'a': [[0]]},
+          feature_columns=[
+              fc.categorical_column_with_hash_bucket('wire_cast', 4)
+          ])
+
+  def test_does_not_support_dict_columns(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Expected feature_columns to be iterable, found dict.'):
+      fc.input_layer(
           features={'a': [[0]]}, feature_columns={'a': fc.numeric_column('a')})
 
   def test_bare_column(self):
@@ -2715,6 +3543,32 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
         # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
         self.assertAllClose(((3.,), (5.,)), predictions.eval())
 
+  def test_keras_linear_model(self):
+    wire_column = fc.categorical_column_with_vocabulary_file(
+        key='wire',
+        vocabulary_file=self._wire_vocabulary_file_name,
+        vocabulary_size=self._wire_vocabulary_size,
+        num_oov_buckets=1)
+    self.assertEqual(4, wire_column._num_buckets)
+    with ops.Graph().as_default():
+      predictions = get_keras_linear_model_predictions({
+          wire_column.name:
+              sparse_tensor.SparseTensorValue(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=('marlo', 'skywalker', 'omar'),
+                  dense_shape=(2, 2))
+      }, (wire_column,))
+      bias = get_keras_linear_model_bias()
+      wire_var = get_linear_model_column_var(wire_column)
+      with _initialized_session():
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
+        # 'marlo' -> 2: wire_var[2] = 3
+        # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
+        self.assertAllClose(((3.,), (5.,)), predictions.eval())
+
 
 class VocabularyListCategoricalColumnTest(test.TestCase):
 
@@ -3082,6 +3936,31 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
         # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
         self.assertAllClose(((3.,), (5.,)), predictions.eval())
 
+  def test_keras_linear_model(self):
+    wire_column = fc.categorical_column_with_vocabulary_list(
+        key='aaa',
+        vocabulary_list=('omar', 'stringer', 'marlo'),
+        num_oov_buckets=1)
+    self.assertEqual(4, wire_column._num_buckets)
+    with ops.Graph().as_default():
+      predictions = get_keras_linear_model_predictions({
+          wire_column.name:
+              sparse_tensor.SparseTensorValue(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=('marlo', 'skywalker', 'omar'),
+                  dense_shape=(2, 2))
+      }, (wire_column,))
+      bias = get_keras_linear_model_bias()
+      wire_var = get_linear_model_column_var(wire_column)
+      with _initialized_session():
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
+        # 'marlo' -> 2: wire_var[2] = 3
+        # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
+        self.assertAllClose(((3.,), (5.,)), predictions.eval())
+
 
 class IdentityCategoricalColumnTest(test.TestCase):
 
@@ -3306,6 +4185,28 @@ class IdentityCategoricalColumnTest(test.TestCase):
         # weight_var[2] + weight_var[1] = 3+2 = 5
         self.assertAllClose(((1.,), (5.,)), predictions.eval())
 
+  def test_keras_linear_model(self):
+    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    self.assertEqual(3, column._num_buckets)
+    with ops.Graph().as_default():
+      predictions = get_keras_linear_model_predictions({
+          column.name:
+              sparse_tensor.SparseTensorValue(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(0, 2, 1),
+                  dense_shape=(2, 2))
+      }, (column,))
+      bias = get_keras_linear_model_bias()
+      weight_var = get_linear_model_column_var(column)
+      with _initialized_session():
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        weight_var.assign(((1.,), (2.,), (3.,))).eval()
+        # weight_var[0] = 1
+        # weight_var[2] + weight_var[1] = 3+2 = 5
+        self.assertAllClose(((1.,), (5.,)), predictions.eval())
+
 
 class TransformFeaturesTest(test.TestCase):
 
@@ -3537,6 +4438,25 @@ class IndicatorColumnTest(test.TestCase):
         weight_var.assign([[1.], [2.], [3.], [4.]]).eval()
         self.assertAllClose([[2. + 3.]], predictions.eval())
 
+  def test_keras_linear_model(self):
+    animal = fc.indicator_column(
+        fc.categorical_column_with_identity('animal', num_buckets=4))
+    with ops.Graph().as_default():
+      features = {
+          'animal':
+              sparse_tensor.SparseTensor(
+                  indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2])
+      }
+
+      predictions = get_keras_linear_model_predictions(features, [animal])
+      weight_var = get_linear_model_column_var(animal)
+      with _initialized_session():
+        # All should be zero-initialized.
+        self.assertAllClose([[0.], [0.], [0.], [0.]], weight_var.eval())
+        self.assertAllClose([[0.]], predictions.eval())
+        weight_var.assign([[1.], [2.], [3.], [4.]]).eval()
+        self.assertAllClose([[2. + 3.]], predictions.eval())
+
   def test_input_layer(self):
     animal = fc.indicator_column(
         fc.categorical_column_with_identity('animal', num_buckets=4))
@@ -3725,6 +4645,72 @@ class EmbeddingColumnTest(test.TestCase):
             'aaa': sparse_input
         }))
 
+    # Assert expected embedding variable and lookups.
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(('embedding_weights:0',),
+                          tuple([v.name for v in global_vars]))
+    with _initialized_session():
+      self.assertAllEqual(embedding_values, global_vars[0].eval())
+      self.assertAllEqual(expected_lookups, embedding_lookup.eval())
+
+  def test_get_dense_tensor_with_state(self):
+    # Inputs.
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        # example 2, ids []
+        # example 3, ids [1]
+        indices=((0, 0), (1, 0), (1, 4), (3, 0)),
+        values=(2, 0, 1, 1),
+        dense_shape=(4, 5))
+
+    # Embedding variable.
+    embedding_dimension = 2
+    embedding_values = (
+        (1., 2.),  # id 0
+        (3., 5.),  # id 1
+        (7., 11.)  # id 2
+    )
+
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return embedding_values
+
+    # Expected lookup result, using combiner='mean'.
+    expected_lookups = (
+        # example 0, ids [2], embedding = [7, 11]
+        (7., 11.),
+        # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
+        (2., 3.5),
+        # example 2, ids [], embedding = [0, 0]
+        (0., 0.),
+        # example 3, ids [1], embedding = [3, 5]
+        (3., 5.),
+    )
+
+    # Build columns.
+    categorical_column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column = fc.embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
+        initializer=_initializer)
+
+    # Create embedding_weights variable.
+    weight_collections = [
+        ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.MODEL_VARIABLES
+    ]
+    state = embedding_column._create_state(weight_collections)
+
+    # Provide sparse input and get dense result.
+    embedding_lookup = embedding_column._get_dense_tensor(
+        _LazyBuilder({
+            'aaa': sparse_input
+        }), state=state)
+
     # Assert expected embedding variable and lookups.
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
     self.assertItemsEqual(
@@ -3940,15 +4926,90 @@ class EmbeddingColumnTest(test.TestCase):
             'aaa': sparse_input
         }))
 
-    # Assert expected embedding variable and lookups.
-    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertItemsEqual(
-        ('embedding_weights:0',), tuple([v.name for v in global_vars]))
-    with _initialized_session():
-      self.assertAllEqual(embedding_values, global_vars[0].eval())
-      self.assertAllEqual(expected_lookups, embedding_lookup.eval())
+    # Assert expected embedding variable and lookups.
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(
+        ('embedding_weights:0',), tuple([v.name for v in global_vars]))
+    with _initialized_session():
+      self.assertAllEqual(embedding_values, global_vars[0].eval())
+      self.assertAllEqual(expected_lookups, embedding_lookup.eval())
+
+  def test_linear_model(self):
+    # Inputs.
+    batch_size = 4
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        # example 2, ids []
+        # example 3, ids [1]
+        indices=((0, 0), (1, 0), (1, 4), (3, 0)),
+        values=(2, 0, 1, 1),
+        dense_shape=(batch_size, 5))
+
+    # Embedding variable.
+    embedding_dimension = 2
+    embedding_shape = (vocabulary_size, embedding_dimension)
+    zeros_embedding_values = np.zeros(embedding_shape)
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual(embedding_shape, shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return zeros_embedding_values
+
+    # Build columns.
+    categorical_column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column = fc.embedding_column(
+        categorical_column, dimension=embedding_dimension,
+        initializer=_initializer)
+
+    with ops.Graph().as_default():
+      predictions = fc.linear_model({
+          categorical_column.name: sparse_input
+      }, (embedding_column,))
+      expected_var_names = (
+          'linear_model/bias_weights:0',
+          'linear_model/aaa_embedding/weights:0',
+          'linear_model/aaa_embedding/embedding_weights:0',
+      )
+      self.assertItemsEqual(
+          expected_var_names,
+          [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
+      trainable_vars = {
+          v.name: v for v in ops.get_collection(
+              ops.GraphKeys.TRAINABLE_VARIABLES)
+      }
+      self.assertItemsEqual(expected_var_names, trainable_vars.keys())
+      bias = trainable_vars['linear_model/bias_weights:0']
+      embedding_weights = trainable_vars[
+          'linear_model/aaa_embedding/embedding_weights:0']
+      linear_weights = trainable_vars[
+          'linear_model/aaa_embedding/weights:0']
+      with _initialized_session():
+        # Predictions with all zero weights.
+        self.assertAllClose(np.zeros((1,)), bias.eval())
+        self.assertAllClose(zeros_embedding_values, embedding_weights.eval())
+        self.assertAllClose(
+            np.zeros((embedding_dimension, 1)), linear_weights.eval())
+        self.assertAllClose(np.zeros((batch_size, 1)), predictions.eval())
 
-  def test_linear_model(self):
+        # Predictions with all non-zero weights.
+        embedding_weights.assign((
+            (1., 2.),  # id 0
+            (3., 5.),  # id 1
+            (7., 11.)  # id 2
+        )).eval()
+        linear_weights.assign(((4.,), (6.,))).eval()
+        # example 0, ids [2], embedding[0] = [7, 11]
+        # example 1, ids [0, 1], embedding[1] = mean([1, 2] + [3, 5]) = [2, 3.5]
+        # example 2, ids [], embedding[2] = [0, 0]
+        # example 3, ids [1], embedding[3] = [3, 5]
+        # sum(embeddings * linear_weights)
+        # = [4*7 + 6*11, 4*2 + 6*3.5, 4*0 + 6*0, 4*3 + 6*5] = [94, 29, 0, 42]
+        self.assertAllClose(((94.,), (29.,), (0.,), (42.,)), predictions.eval())
+
+  def test_keras_linear_model(self):
     # Inputs.
     batch_size = 4
     vocabulary_size = 3
@@ -3965,6 +5026,7 @@ class EmbeddingColumnTest(test.TestCase):
     embedding_dimension = 2
     embedding_shape = (vocabulary_size, embedding_dimension)
     zeros_embedding_values = np.zeros(embedding_shape)
+
     def _initializer(shape, dtype, partition_info):
       self.assertAllEqual(embedding_shape, shape)
       self.assertEqual(dtypes.float32, dtype)
@@ -3975,15 +5037,16 @@ class EmbeddingColumnTest(test.TestCase):
     categorical_column = fc.categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
     embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
+        categorical_column,
+        dimension=embedding_dimension,
         initializer=_initializer)
 
     with ops.Graph().as_default():
-      predictions = fc.linear_model({
+      predictions = get_keras_linear_model_predictions({
           categorical_column.name: sparse_input
       }, (embedding_column,))
       expected_var_names = (
-          'linear_model/bias_weights:0',
+          'linear_model/bias_layer/bias_weights:0',
           'linear_model/aaa_embedding/weights:0',
           'linear_model/aaa_embedding/embedding_weights:0',
       )
@@ -3991,15 +5054,14 @@ class EmbeddingColumnTest(test.TestCase):
           expected_var_names,
           [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
       trainable_vars = {
-          v.name: v for v in ops.get_collection(
-              ops.GraphKeys.TRAINABLE_VARIABLES)
+          v.name: v
+          for v in ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
       }
       self.assertItemsEqual(expected_var_names, trainable_vars.keys())
-      bias = trainable_vars['linear_model/bias_weights:0']
+      bias = trainable_vars['linear_model/bias_layer/bias_weights:0']
       embedding_weights = trainable_vars[
           'linear_model/aaa_embedding/embedding_weights:0']
-      linear_weights = trainable_vars[
-          'linear_model/aaa_embedding/weights:0']
+      linear_weights = trainable_vars['linear_model/aaa_embedding/weights:0']
       with _initialized_session():
         # Predictions with all zero weights.
         self.assertAllClose(np.zeros((1,)), bias.eval())
@@ -4443,6 +5505,80 @@ class SharedEmbeddingColumnTest(test.TestCase):
     embedding_lookup_b = embedding_column_b._get_dense_tensor(
         _LazyBuilder(input_features))
 
+    # Assert expected embedding variable and lookups.
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(('embedding_weights:0',),
+                          tuple([v.name for v in global_vars]))
+    embedding_var = global_vars[0]
+    with _initialized_session():
+      self.assertAllEqual(embedding_values, embedding_var.eval())
+      self.assertAllEqual(expected_lookups_a, embedding_lookup_a.eval())
+      self.assertAllEqual(expected_lookups_b, embedding_lookup_b.eval())
+
+  def test_get_dense_tensor_with_state(self):
+    # Inputs.
+    vocabulary_size = 3
+    # -1 values are ignored.
+    input_a = np.array([
+        [2, -1, -1],  # example 0, ids [2]
+        [0, 1, -1]
+    ])  # example 1, ids [0, 1]
+    input_b = np.array([
+        [0, -1, -1],  # example 0, ids [0]
+        [-1, -1, -1]
+    ])  # example 1, ids []
+    input_features = {'aaa': input_a, 'bbb': input_b}
+
+    # Embedding variable.
+    embedding_dimension = 2
+    embedding_values = (
+        (1., 2.),  # id 0
+        (3., 5.),  # id 1
+        (7., 11.)  # id 2
+    )
+
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return embedding_values
+
+    # Expected lookup result, using combiner='mean'.
+    expected_lookups_a = (
+        # example 0:
+        (7., 11.),  # ids [2], embedding = [7, 11]
+        # example 1:
+        (2., 3.5),  # ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
+    )
+    expected_lookups_b = (
+        # example 0:
+        (1., 2.),  # ids [0], embedding = [1, 2]
+        # example 1:
+        (0., 0.),  # ids [], embedding = [0, 0]
+    )
+
+    # Build columns.
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=vocabulary_size)
+    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
+        [categorical_column_a, categorical_column_b],
+        dimension=embedding_dimension,
+        initializer=_initializer)
+
+    # Create state.
+    weight_collections = [
+        ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.MODEL_VARIABLES
+    ]
+    state = embedding_column_a._create_state(weight_collections)
+
+    # Provide sparse input and get dense result.
+    embedding_lookup_a = embedding_column_a._get_dense_tensor(
+        _LazyBuilder(input_features), state=state)
+    embedding_lookup_b = embedding_column_b._get_dense_tensor(
+        _LazyBuilder(input_features), state=state)
+
     # Assert expected embedding variable and lookups.
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
     self.assertItemsEqual(
@@ -4595,6 +5731,97 @@ class SharedEmbeddingColumnTest(test.TestCase):
         # = [3*1 + 5*2, 3*0 +5*0] = [13, 0]
         self.assertAllClose([[94. + 13.], [29.]], predictions.eval())
 
+  def test_keras_linear_model(self):
+    # Inputs.
+    batch_size = 2
+    vocabulary_size = 3
+    # -1 values are ignored.
+    input_a = np.array([
+        [2, -1, -1],  # example 0, ids [2]
+        [0, 1, -1]
+    ])  # example 1, ids [0, 1]
+    input_b = np.array([
+        [0, -1, -1],  # example 0, ids [0]
+        [-1, -1, -1]
+    ])  # example 1, ids []
+
+    # Embedding variable.
+    embedding_dimension = 2
+    embedding_shape = (vocabulary_size, embedding_dimension)
+    zeros_embedding_values = np.zeros(embedding_shape)
+
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual(embedding_shape, shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return zeros_embedding_values
+
+    # Build columns.
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=vocabulary_size)
+    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
+        [categorical_column_a, categorical_column_b],
+        dimension=embedding_dimension,
+        initializer=_initializer)
+
+    with ops.Graph().as_default():
+      predictions = get_keras_linear_model_predictions({
+          categorical_column_a.name: input_a,
+          categorical_column_b.name: input_b,
+      }, (embedding_column_a, embedding_column_b))
+      # Linear weights do not follow the column name. But this is a rare use
+      # case, and fixing it would add too much complexity to the code.
+      expected_var_names = (
+          'linear_model/bias_layer/bias_weights:0',
+          'linear_model/aaa_bbb_shared_embedding/weights:0',
+          'linear_model/aaa_bbb_shared_embedding/embedding_weights:0',
+          'linear_model/aaa_bbb_shared_embedding_1/weights:0',
+      )
+      self.assertItemsEqual(
+          expected_var_names,
+          [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
+      trainable_vars = {
+          v.name: v
+          for v in ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      }
+      self.assertItemsEqual(expected_var_names, trainable_vars.keys())
+      bias = trainable_vars['linear_model/bias_layer/bias_weights:0']
+      embedding_weights = trainable_vars[
+          'linear_model/aaa_bbb_shared_embedding/embedding_weights:0']
+      linear_weights_a = trainable_vars[
+          'linear_model/aaa_bbb_shared_embedding/weights:0']
+      linear_weights_b = trainable_vars[
+          'linear_model/aaa_bbb_shared_embedding_1/weights:0']
+      with _initialized_session():
+        # Predictions with all zero weights.
+        self.assertAllClose(np.zeros((1,)), bias.eval())
+        self.assertAllClose(zeros_embedding_values, embedding_weights.eval())
+        self.assertAllClose(
+            np.zeros((embedding_dimension, 1)), linear_weights_a.eval())
+        self.assertAllClose(
+            np.zeros((embedding_dimension, 1)), linear_weights_b.eval())
+        self.assertAllClose(np.zeros((batch_size, 1)), predictions.eval())
+
+        # Predictions with all non-zero weights.
+        embedding_weights.assign((
+            (1., 2.),  # id 0
+            (3., 5.),  # id 1
+            (7., 11.)  # id 2
+        )).eval()
+        linear_weights_a.assign(((4.,), (6.,))).eval()
+        # example 0, ids [2], embedding[0] = [7, 11]
+        # example 1, ids [0, 1], embedding[1] = mean([1, 2] + [3, 5]) = [2, 3.5]
+        # sum(embeddings * linear_weights)
+        # = [4*7 + 6*11, 4*2 + 6*3.5] = [94, 29]
+        linear_weights_b.assign(((3.,), (5.,))).eval()
+        # example 0, ids [0], embedding[0] = [1, 2]
+        # example 1, ids [], embedding[1] = 0, 0]
+        # sum(embeddings * linear_weights)
+        # = [3*1 + 5*2, 3*0 +5*0] = [13, 0]
+        self.assertAllClose([[94. + 13.], [29.]], predictions.eval())
+
   def _test_input_layer(self, trainable=True):
     # Inputs.
     vocabulary_size = 3
@@ -4880,6 +6107,101 @@ class WeightedCategoricalColumnTest(test.TestCase):
               dense_shape=(2, 2)),
           weight_tensor.eval())
 
+  def test_keras_linear_model(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    with ops.Graph().as_default():
+      predictions = get_keras_linear_model_predictions({
+          'ids':
+              sparse_tensor.SparseTensorValue(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(0, 2, 1),
+                  dense_shape=(2, 2)),
+          'values':
+              sparse_tensor.SparseTensorValue(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(.5, 1., .1),
+                  dense_shape=(2, 2))
+      }, (column,))
+      bias = get_keras_linear_model_bias()
+      weight_var = get_linear_model_column_var(column)
+      with _initialized_session():
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        weight_var.assign(((1.,), (2.,), (3.,))).eval()
+        # weight_var[0] * weights[0, 0] = 1 * .5 = .5
+        # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
+        # = 3*1 + 2*.1 = 3+.2 = 3.2
+        self.assertAllClose(((.5,), (3.2,)), predictions.eval())
+
+  def test_keras_linear_model_mismatched_shape(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    with ops.Graph().as_default():
+      with self.assertRaisesRegexp(ValueError,
+                                   r'Dimensions.*are not compatible'):
+        get_keras_linear_model_predictions({
+            'ids':
+                sparse_tensor.SparseTensorValue(
+                    indices=((0, 0), (1, 0), (1, 1)),
+                    values=(0, 2, 1),
+                    dense_shape=(2, 2)),
+            'values':
+                sparse_tensor.SparseTensorValue(
+                    indices=((0, 0), (0, 1), (1, 0), (1, 1)),
+                    values=(.5, 11., 1., .1),
+                    dense_shape=(2, 2))
+        }, (column,))
+
+  def test_keras_linear_model_mismatched_dense_values(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    with ops.Graph().as_default():
+      predictions = get_keras_linear_model_predictions({
+          'ids':
+              sparse_tensor.SparseTensorValue(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(0, 2, 1),
+                  dense_shape=(2, 2)),
+          'values': ((.5,), (1.,))
+      }, (column,))
+      with _initialized_session():
+        with self.assertRaisesRegexp(errors.OpError, 'Incompatible shapes'):
+          predictions.eval()
+
+  def test_keras_linear_model_mismatched_dense_shape(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    with ops.Graph().as_default():
+      predictions = get_keras_linear_model_predictions({
+          'ids':
+              sparse_tensor.SparseTensorValue(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(0, 2, 1),
+                  dense_shape=(2, 2)),
+          'values': ((.5,), (1.,), (.1,))
+      }, (column,))
+      bias = get_keras_linear_model_bias()
+      weight_var = get_linear_model_column_var(column)
+      with _initialized_session():
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        weight_var.assign(((1.,), (2.,), (3.,))).eval()
+        # weight_var[0] * weights[0, 0] = 1 * .5 = .5
+        # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
+        # = 3*1 + 2*.1 = 3+.2 = 3.2
+        self.assertAllClose(((.5,), (3.2,)), predictions.eval())
+
   def test_linear_model(self):
     column = fc.weighted_categorical_column(
         categorical_column=fc.categorical_column_with_identity(
diff --git a/tensorflow/python/framework/c_api_util.py b/tensorflow/python/framework/c_api_util.py
index 6c522de452b59ea9a200ccf89cfb428a26970db1..4356a534b4c978840b81086564219a96ed984504 100644
--- a/tensorflow/python/framework/c_api_util.py
+++ b/tensorflow/python/framework/c_api_util.py
@@ -33,7 +33,7 @@ class ScopedTFStatus(object):
   def __del__(self):
     # Note: when we're destructing the global context (i.e when the process is
     # terminating) we can have already deleted other modules.
-    if c_api.TF_DeleteStatus is not None:
+    if c_api is not None and c_api.TF_DeleteStatus is not None:
       c_api.TF_DeleteStatus(self.status)
 
 
@@ -46,7 +46,7 @@ class ScopedTFGraph(object):
   def __del__(self):
     # Note: when we're destructing the global context (i.e when the process is
     # terminating) we can have already deleted other modules.
-    if c_api.TF_DeleteGraph is not None:
+    if c_api is not None and c_api.TF_DeleteGraph is not None:
       c_api.TF_DeleteGraph(self.graph)
 
 
@@ -59,7 +59,7 @@ class ScopedTFImportGraphDefOptions(object):
   def __del__(self):
     # Note: when we're destructing the global context (i.e when the process is
     # terminating) we can have already deleted other modules.
-    if c_api.TF_DeleteImportGraphDefOptions is not None:
+    if c_api is not None and c_api.TF_DeleteImportGraphDefOptions is not None:
       c_api.TF_DeleteImportGraphDefOptions(self.options)
 
 
diff --git a/tensorflow/python/framework/errors_impl.py b/tensorflow/python/framework/errors_impl.py
index 2a40316d51c023df9c664d0dd79a0df3b2ac5041..84106c32c673e15832ff747a7fededdfbfb94ed8 100644
--- a/tensorflow/python/framework/errors_impl.py
+++ b/tensorflow/python/framework/errors_impl.py
@@ -473,6 +473,8 @@ _CODE_TO_EXCEPTION_CLASS = {
     DATA_LOSS: DataLossError,
 }
 
+c_api.PyExceptionRegistry_Init(_CODE_TO_EXCEPTION_CLASS)
+
 _EXCEPTION_CLASS_TO_CODE = dict((
     (class_, code) for (code, class_) in _CODE_TO_EXCEPTION_CLASS.items()))
 
@@ -499,6 +501,7 @@ def _make_specific_exception(node_def, op, message, error_code):
 # Named like a function for backwards compatibility with the
 # @tf_contextlib.contextmanager version, which was switched to a class to avoid
 # some object creation overhead.
+# TODO(b/77295559): expand use of TF_Status* SWIG typemap and deprecate this.
 @tf_export("errors.raise_exception_on_not_ok_status")  # pylint: disable=invalid-name
 class raise_exception_on_not_ok_status(object):
   """Context manager to check for C API status."""
diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py
index 14d72d8a3de7e22bee4f9961c2f66044c217f641..c5caf9ebc06e7f63353c42965f02676642504fd0 100644
--- a/tensorflow/python/framework/function.py
+++ b/tensorflow/python/framework/function.py
@@ -30,7 +30,6 @@ from tensorflow.python import pywrap_tensorflow as c_api
 from tensorflow.python.eager import context
 from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import graph_to_function_def
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -275,8 +274,7 @@ class _DefinedFunction(object):
     self._create_definition_if_needed()
     if self._c_func:
       with c_api_util.tf_buffer() as buf:
-        with errors.raise_exception_on_not_ok_status() as status:
-          c_api.TF_FunctionToFunctionDef(self._c_func, buf, status)
+        c_api.TF_FunctionToFunctionDef(self._c_func, buf)
         fdef = function_pb2.FunctionDef()
         proto_data = c_api.TF_GetBuffer(buf)
         fdef.ParseFromString(compat.as_bytes(proto_data))
@@ -399,18 +397,16 @@ class _DefinedFunction(object):
                       if self._out_names else [])
       description = self._func.__doc__ or None
       # pylint: disable=protected-access
-      with errors.raise_exception_on_not_ok_status() as status:
-        self._c_func = c_api.TF_GraphToFunction_wrapper(
-            temp_graph._c_graph,
-            base_func_name,
-            self._func_name is None,  # append_hash_to_fn_name
-            None,  # opers
-            [t._as_tf_output() for t in inputs],
-            [t._as_tf_output() for t in outputs],
-            output_names,
-            None,  # opts
-            description,
-            status)
+      self._c_func = c_api.TF_GraphToFunction_wrapper(
+          temp_graph._c_graph,
+          base_func_name,
+          self._func_name is None,  # append_hash_to_fn_name
+          None,  # opers
+          [t._as_tf_output() for t in inputs],
+          [t._as_tf_output() for t in outputs],
+          output_names,
+          None,  # opts
+          description)
       # pylint: enable=protected-access
       self._set_c_attrs(kwargs_attr)
 
@@ -433,9 +429,8 @@ class _DefinedFunction(object):
       serialized = attr_value.SerializeToString()
       # TODO(skyewm): this creates and deletes a new TF_Status for every attr.
       # It might be worth creating a convenient way to re-use the same status.
-      with errors.raise_exception_on_not_ok_status() as status:
-        c_api.TF_FunctionSetAttrValueProto(self._c_func, compat.as_str(name),
-                                           serialized, status)
+      c_api.TF_FunctionSetAttrValueProto(self._c_func, compat.as_str(name),
+                                         serialized)
 
   def _create_hash_str(self, input_arg, output_arg, node_def):
     """Creates an 8-character string unique to this input.
@@ -830,8 +825,7 @@ def _from_definition(fdef, grad_func=None):
   # pylint: disable=protected-access
   if ops._USE_C_API:
     serialized = fdef.SerializeToString()
-    with errors.raise_exception_on_not_ok_status() as status:
-      result._c_func = c_api.TF_FunctionImportFunctionDef(serialized, status)
+    result._c_func = c_api.TF_FunctionImportFunctionDef(serialized)
     result._extra_inputs = []
   else:
     result._definition = fdef
@@ -934,6 +928,12 @@ def _parse_kwargs_as_attrs(func_name, **kwargs):
           s=("function_%s" % func_name).encode())
     # pylint: enable=protected-access
 
+  kwargs_keys = list(kwargs.keys())
+  for key in kwargs_keys:
+    if key.startswith("experimental_"):
+      attrs[key] = attr_value_pb2.AttrValue(s=compat.as_bytes(kwargs[key]))
+      del kwargs[key]
+
   if kwargs:
     raise ValueError("Unknown keyword arguments: %s" % kwargs.keys())
   return attrs
diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py
index 65ca801cbe922b36e3bc72bc2fbcd88f66aa5290..83d256fab6ed6c2fb46319fdc2d5de079346809e 100644
--- a/tensorflow/python/framework/function_test.py
+++ b/tensorflow/python/framework/function_test.py
@@ -1227,6 +1227,15 @@ class FunctionsFromProtos(test.TestCase):
         ValueError, "FunctionDefLibrary contains cyclic gradient functions!"):
       function._from_library(library)
 
+  def testExperimentalAttrs(self):
+
+    @function.Defun(dtypes.int32, experimental_tag="tag_value")
+    def FunctionWithAttr(i):
+      return array_ops.identity(i)
+    self.assertTrue("experimental_tag" in FunctionWithAttr.definition.attr)
+    self.assertEqual(
+        FunctionWithAttr.definition.attr["experimental_tag"].s, b"tag_value")
+
 
 @test_util.with_c_api
 class FunctionOverloadTest(test.TestCase):
diff --git a/tensorflow/python/framework/importer.py b/tensorflow/python/framework/importer.py
index 783e9259ad1fae4eb05fc7beeb5be83ae3f7fdce..23f529b98856195899b7903242871f608f159cec 100644
--- a/tensorflow/python/framework/importer.py
+++ b/tensorflow/python/framework/importer.py
@@ -301,14 +301,17 @@ def _ProcessNewOps(graph):
   colocation_pairs = {}
 
   for new_op in graph._add_new_tf_operations(compute_devices=False):  # pylint: disable=protected-access
+    original_device = new_op.device
+    new_op._set_device('')  # pylint: disable=protected-access
     colocation_names = _GetColocationNames(new_op)
     if colocation_names:
       colocation_pairs[new_op] = colocation_names
-      # Don't apply this op's device function, since colocation constraints
-      # override device functions. Note that this op's device may still be set
-      # by the loop below.
+      # Don't set a device for this op, since colocation constraints override
+      # device functions and the original device. Note that this op's device may
+      # still be set by the loop below.
+      # TODO(skyewm): why does it override the original device?
     else:
-      with _MaybeDevice(new_op.device):
+      with _MaybeDevice(original_device):
         graph._apply_device_functions(new_op)  # pylint: disable=protected-access
 
   # The following loop populates the device field of ops that are colocated
@@ -482,30 +485,31 @@ def import_graph_def(graph_def,
     with graph._lock:  # pylint: disable=protected-access
       with c_api_util.tf_buffer(graph_def.SerializeToString()) as serialized:
         try:
-          with errors.raise_exception_on_not_ok_status() as status:
-            results = c_api.TF_GraphImportGraphDefWithResults(
-                graph._c_graph, serialized, options, status)  # pylint: disable=protected-access
+          results = c_api.TF_GraphImportGraphDefWithResults(
+              graph._c_graph, serialized, options)  # pylint: disable=protected-access
         except errors.InvalidArgumentError as e:
           # Convert to ValueError for backwards compatibility.
           raise ValueError(str(e))
 
-      _ProcessNewOps(graph)
+      # Create _DefinedFunctions for any imported functions.
+      #
+      # We do this by creating _DefinedFunctions directly from `graph_def`, and
+      # adding them to `graph`. Adding an existing function to a TF_Graph is a
+      # no-op, so this only has the effect of updating the Python state (usually
+      # _DefinedFunction.add_to_graph also adds the function to the TF_Graph).
+      #
+      # TODO(skyewm): fetch the TF_Functions directly from the TF_Graph
+      # TODO(skyewm): avoid sending serialized FunctionDefs back to the TF_Graph
+      # TODO(b/74620627): move this after _ProcessNewOps outside the lock once
+      # _USE_C_SHAPES is removed.
+      if graph_def.library and graph_def.library.function:
+        # pylint: disable=protected-access
+        functions = function._from_library(graph_def.library)
+        for f in functions:
+          f.add_to_graph(graph)
+        # pylint: enable=protected-access
 
-    # Create _DefinedFunctions for any imported functions.
-    #
-    # We do this by creating _DefinedFunctions directly from `graph_def`, and
-    # adding them to `graph`. Adding an existing function to a TF_Graph is a
-    # no-op, so this only has the effect of updating the Python state (usually
-    # _DefinedFunction.add_to_graph also adds the function to the TF_Graph).
-    #
-    # TODO(skyewm): fetch the TF_Functions directly from the TF_Graph
-    # TODO(skyewm): avoid sending serialized FunctionDefs back to the TF_Graph
-    if graph_def.library and graph_def.library.function:
-      # pylint: disable=protected-access
-      functions = function._from_library(graph_def.library)
-      for f in functions:
-        f.add_to_graph(graph)
-      # pylint: enable=protected-access
+      _ProcessNewOps(graph)
 
     # Treat input mappings that don't appear in the graph as an error, because
     # they are likely to be due to a typo.
diff --git a/tensorflow/python/framework/importer_test.py b/tensorflow/python/framework/importer_test.py
index c39191e6d90f5f00806eba021ea41dd6ebe86f98..2c913d1e028e15e293158fe180e263a78c514ee4 100644
--- a/tensorflow/python/framework/importer_test.py
+++ b/tensorflow/python/framework/importer_test.py
@@ -31,6 +31,7 @@ from tensorflow.python.framework import function
 from tensorflow.python.framework import importer
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_ops  # pylint: disable=unused-import
+from tensorflow.python.framework import test_util
 from tensorflow.python.framework import versions
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -38,13 +39,13 @@ from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variables
 import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
 
 
-# TODO(skyewm): reenable when this works with _USE_C_SHAPES=False
-# @test_util.with_c_api
+@test_util.with_c_api
 class ImportGraphDefTest(test.TestCase):
 
   def _MakeGraphDef(self,
@@ -218,6 +219,23 @@ class ImportGraphDefTest(test.TestCase):
       self.assertEqual(outer_inner.name, "outer/inner_1")
       self.assertEqual(outer_inner_c.name, "outer/inner/c_1")
 
+  def testEmptyNameScope(self):
+    with ops.Graph().as_default():
+      # Create name scope but don't create any ops with it
+      with ops.name_scope("foo"):
+        pass
+
+      # Import graph def that uses name scope name
+      op, = importer.import_graph_def(
+          self._MakeGraphDef("node { name: 'foo' op: 'IntOutput' }"),
+          return_elements=["foo"],
+          name="")
+
+      if ops._USE_C_API:
+        self.assertEqual(op.name, "foo")
+      else:
+        self.assertEqual(op.name, "foo_1")
+
   def testInputMap(self):
     with ops.Graph().as_default():
       feed_a_0 = constant_op.constant(0, dtype=dtypes.int32)
@@ -356,6 +374,39 @@ class ImportGraphDefTest(test.TestCase):
       self.assertEqual(d._input_types, [dtypes.int32_ref, dtypes.int32])
       self.assertEqual(d.outputs, [])
 
+  def testResources(self):
+    # Produce GraphDef containing a ops producing and consuming resources.
+    graph = ops.Graph()
+    with graph.as_default():
+      var = resource_variable_ops.ResourceVariable(1.0)
+      var_assign = var.assign(2.0)
+      # Use an op that requires handle shape to be set.
+      var_shape = resource_variable_ops.variable_shape(var.handle)
+      init = variables.global_variables_initializer()
+    graph_def = graph.as_graph_def()
+
+    # Import the GraphDef.
+    with ops.Graph().as_default():
+      # pylint: disable=unused-variable
+      imported_var, imported_assign, imported_shape, imported_init = (
+          importer.import_graph_def(
+              graph_def,
+              return_elements=[var.name, var_assign.name, var_shape.name,
+                               init.name]))
+
+      # Make sure the handle shape is set on the imported variable.
+      new_var_shape = resource_variable_ops.variable_shape(imported_var)
+      # pylint: enable=unused-variable
+
+      # Run the imported graph.
+      # TODO(b/76173421): make this work (currently DCHECKS)
+      # with self.test_session() as sess:
+      #   sess.run(imported_init)
+      #   self.assertEqual(sess.run(imported_var), 1.0)
+      #   self.assertEqual(sess.run(imported_assign), 2.0)
+      #   self.assertEqual(list(sess.run(imported_shape)), [])
+      #   self.assertEqual(list(sess.run(new_var_shape)), [])
+
   def testWhileLoop(self):
     # Produce GraphDef containing while loop.
     graph = ops.Graph()
@@ -680,6 +731,49 @@ class ImportGraphDefTest(test.TestCase):
           "list { s: 'loc:@imported_graph/A' }",
           b.node_def.attr["_class"])
 
+  def testColocationAndDevice(self):
+    # A and B are colocated, device set on A.
+    original_graph_def = self._MakeGraphDef("""
+          node { name: 'A' op: 'None' device: '/device:CPU:0' attr {
+            key: '_class'
+            value { list { s: 'loc:@A' } }
+          } }
+          node { name: 'B' op: 'None'  attr {
+            key: '_class'
+            value { list { s: 'loc:@A' } }
+          } }""")
+
+    with ops.Graph().as_default():
+      a, b = importer.import_graph_def(original_graph_def,
+                                       return_elements=["A", "B"],
+                                       name="")
+      self.assertEqual(a.device, "/device:CPU:0")
+      self.assertEqual(b.device, "/device:CPU:0")
+      self.assertEqual(a.colocation_groups(), [b"loc:@A"])
+      self.assertEqual(b.colocation_groups(), [b"loc:@A"])
+
+    # A and B are colocated, device set on B.
+    original_graph_def = self._MakeGraphDef("""
+          node { name: 'A' op: 'None' attr {
+            key: '_class'
+            value { list { s: 'loc:@A' } }
+          } }
+          node { name: 'B' op: 'None' device: '/device:CPU:0' attr {
+            key: '_class'
+            value { list { s: 'loc:@A' } }
+          } }""")
+
+    with ops.Graph().as_default():
+      a, b = importer.import_graph_def(original_graph_def,
+                                       return_elements=["A", "B"],
+                                       name="")
+      # TODO(skyewm): this behavior seems inconsistent with the above. Why is
+      # B's device ignored?
+      self.assertEqual(a.device, "")
+      self.assertEqual(b.device, "")
+      self.assertEqual(a.colocation_groups(), [b"loc:@A"])
+      self.assertEqual(b.colocation_groups(), [b"loc:@A"])
+
   def testColocationWithDeviceFn(self):
     original_graph_def = self._MakeGraphDef("""
           node { name: 'A' op: 'None' attr {
diff --git a/tensorflow/python/framework/load_library.py b/tensorflow/python/framework/load_library.py
index 1f2aa264c110930b318f30e3a24010a96ebce47e..535c6017f5fd0f8adf9ed091bd4477762e52b0e3 100644
--- a/tensorflow/python/framework/load_library.py
+++ b/tensorflow/python/framework/load_library.py
@@ -26,7 +26,6 @@ import threading  # pylint: disable=unused-import
 from tensorflow.core.framework import op_def_pb2
 from tensorflow.core.lib.core import error_codes_pb2  # pylint: disable=unused-import
 from tensorflow.python import pywrap_tensorflow as py_tf
-from tensorflow.python.framework import errors_impl
 from tensorflow.python.util import compat
 from tensorflow.python.util.tf_export import tf_export
 
@@ -54,8 +53,7 @@ def load_op_library(library_filename):
   Raises:
     RuntimeError: when unable to load the library or get the python wrappers.
   """
-  with errors_impl.raise_exception_on_not_ok_status() as status:
-    lib_handle = py_tf.TF_LoadLibrary(library_filename, status)
+  lib_handle = py_tf.TF_LoadLibrary(library_filename)
 
   op_list_str = py_tf.TF_GetOpList(lib_handle)
   op_list = op_def_pb2.OpList()
@@ -99,5 +97,4 @@ def load_file_system_library(library_filename):
   Raises:
     RuntimeError: when unable to load the library.
   """
-  with errors_impl.raise_exception_on_not_ok_status() as status:
-    lib_handle = py_tf.TF_LoadLibrary(library_filename, status)
+  py_tf.TF_LoadLibrary(library_filename)
diff --git a/tensorflow/python/framework/meta_graph.py b/tensorflow/python/framework/meta_graph.py
index 4bb9941bb778ae9b4022ef9d376cb031223ddb1c..391b17720c6f5925fe6cab02ac2a784257177a27 100644
--- a/tensorflow/python/framework/meta_graph.py
+++ b/tensorflow/python/framework/meta_graph.py
@@ -737,7 +737,9 @@ def import_scoped_meta_graph(meta_graph_or_file,
         import_scope or "", mark_as_used=False)
 
     importer.import_graph_def(
-        input_graph_def, name=(import_scope or ""), input_map=input_map,
+        input_graph_def,
+        name=(import_scope or scope_to_prepend_to_names),
+        input_map=input_map,
         producer_op_list=producer_op_list)
 
     # Restores all the other collections.
diff --git a/tensorflow/python/framework/meta_graph_test.py b/tensorflow/python/framework/meta_graph_test.py
index 06cec504e407301c7cfbceaa04a07674fe87c712..5d5fb037fc217849ea32102bf60796c47d565f3b 100644
--- a/tensorflow/python/framework/meta_graph_test.py
+++ b/tensorflow/python/framework/meta_graph_test.py
@@ -285,8 +285,7 @@ class SimpleMetaGraphTest(test.TestCase):
       self.assertIs(global_vars[0], trainable_vars[0])
 
 
-# TODO(skyewm): reenable when this works with _USE_C_SHAPES=False
-# @test_util.with_c_api
+@test_util.with_c_api
 class ScopedMetaGraphTest(test.TestCase):
 
   def _testScopedExport(self, test_dir, exported_filenames):
@@ -538,6 +537,21 @@ class ScopedMetaGraphTest(test.TestCase):
         self.assertEqual(list(imported_variables.values())[0].name,
                          "foo/bar/myvar:0")
 
+  def testScopedImportUnderNameScopeNoVarScope(self):
+    graph = ops.Graph()
+    with graph.as_default():
+      variables.Variable(initial_value=1.0, trainable=True, name="myvar")
+    meta_graph_def, _ = meta_graph.export_scoped_meta_graph(graph=graph)
+
+    graph = ops.Graph()
+    with graph.as_default():
+      with ops.name_scope("foo"):
+        imported_variables = meta_graph.import_scoped_meta_graph(
+            meta_graph_def)
+        self.assertEqual(len(imported_variables), 1)
+        self.assertEqual(list(imported_variables.values())[0].name,
+                         "foo/myvar:0")
+
   def testImportsUsingSameScopeName(self):
     with ops.Graph().as_default():
       variables.Variable(0, name="v")
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 44df22cf58bfa8da2f9820e041d02668278ad018..2d55f98a1c5f1abbe079a4733131284f71cb4318 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -42,6 +42,7 @@ from tensorflow.python.eager import context
 from tensorflow.python.eager import core
 from tensorflow.python.eager import tape
 from tensorflow.python.framework import c_api_util
+from tensorflow.python.framework import cpp_shape_inference_pb2
 from tensorflow.python.framework import device as pydev
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -62,7 +63,7 @@ from tensorflow.python.util.tf_export import tf_export
 # calls to the C API. Currently disabled by default but can be manually enabled
 # in code or via the environment variable. This will be removed once all
 # functionality is supported and there's no performance penalty with it enabled.
-_USE_C_API = os.getenv("TF_C_API_GRAPH_CONSTRUCTION", "0") is not "0"
+_USE_C_API = os.getenv("TF_C_API_GRAPH_CONSTRUCTION", "1") is not "0"
 _USE_C_SHAPES = os.getenv("TF_C_API_GRAPH_CONSTRUCTION_SHAPES", "0") is not "0"
 
 
@@ -295,6 +296,7 @@ class Tensor(_TensorLike):
 
     # Attributes used for C++ shape inference. Not inspected, only forwarded.
     # If set, will be a HandleData object from cpp_shape_inference.proto.
+    # TODO(b/74620627): remove when _USE_C_SHAPES is removed
     self._handle_data = None
     self._id = uid()
 
@@ -371,15 +373,12 @@ class Tensor(_TensorLike):
     """
     graph = self._op._graph._c_graph # pylint: disable=protected-access
     if graph and _USE_C_SHAPES:
-      with errors.raise_exception_on_not_ok_status() as status:
-        num_dims = c_api.TF_GraphGetTensorNumDims(graph, self._as_tf_output(),
-                                                  status)
+      num_dims = c_api.TF_GraphGetTensorNumDims(graph, self._as_tf_output())
       if num_dims == -1:
         dim_list = None
       else:
-        with errors.raise_exception_on_not_ok_status() as status:
-          dim_list = c_api.TF_GraphGetTensorShape_wrapper(
-              graph, self._as_tf_output(), num_dims, status)
+        dim_list = c_api.TF_GraphGetTensorShape_wrapper(
+            graph, self._as_tf_output(), num_dims)
         dim_list = [None if i == -1 else i for i in dim_list]
       return tensor_shape.TensorShape(dim_list)
     return self._shape_val
@@ -487,13 +486,11 @@ class Tensor(_TensorLike):
         else:
           dim_list.append(dim.value)
     try:
-      with errors.raise_exception_on_not_ok_status() as status:
-        c_api.TF_GraphSetTensorShape_wrapper(
-            self._op._graph._c_graph,  # pylint: disable=protected-access
-            self._as_tf_output(),
-            dim_list,
-            unknown_shape,
-            status)
+      c_api.TF_GraphSetTensorShape_wrapper(
+          self._op._graph._c_graph,  # pylint: disable=protected-access
+          self._as_tf_output(),
+          dim_list,
+          unknown_shape)
     except errors.InvalidArgumentError as e:
       # Convert to ValueError for backwards compatibility.
       raise ValueError(str(e))
@@ -838,41 +835,51 @@ class _EagerTensorBase(Tensor):
   def set_shape(self, shape):
     if not self.shape.is_compatible_with(shape):
       raise ValueError(
-          "EagerTensor's shape %s is not compatible with supplied shape %s" %
+          "Tensor's shape %s is not compatible with supplied shape %s" %
           (self.shape, shape))
 
   # Methods not supported / implemented for Eager Tensors.
   @property
   def op(self):
-    raise AttributeError("op not supported for Eager Tensors.")
+    raise AttributeError(
+        "Tensor.op is meaningless when eager execution is enabled.")
 
   @property
   def graph(self):
-    raise AttributeError("graph not supported for Eager Tensors.")
+    raise AttributeError(
+        "Tensor.graph is meaningless when eager execution is enabled.")
 
   @property
   def name(self):
-    raise AttributeError("name not supported for Eager Tensors.")
+    raise AttributeError(
+        "Tensor.name is meaningless when eager execution is enabled.")
 
   @property
   def value_index(self):
-    raise AttributeError("value_index not supported for Eager Tensors.")
+    raise AttributeError(
+        "Tensor.value_index is meaningless when eager execution is enabled.")
 
   def consumers(self):
-    raise NotImplementedError("consumers not supported for Eager Tensors.")
+    raise NotImplementedError(
+        "Tensor.consumers is meaningless when eager execution is enabled.")
 
   def _add_consumer(self, consumer):
-    raise NotImplementedError("_add_consumer not supported for Eager Tensors.")
+    raise NotImplementedError(
+        "_add_consumer not supported when eager execution is enabled.")
 
   def _as_node_def_input(self):
     raise NotImplementedError(
-        "_as_node_def_input not supported for Eager Tensors.")
+        "_as_node_def_input not supported when eager execution is enabled.")
 
   def _as_tf_output(self):
-    raise NotImplementedError("_as_tf_output not supported for Eager Tensors.")
+    raise NotImplementedError(
+        "_as_tf_output not supported when eager execution is enabled.")
 
   def eval(self, feed_dict=None, session=None):
-    raise NotImplementedError("eval not supported for Eager Tensors.")
+    raise NotImplementedError(
+        "eval is not supported when eager execution is enabled, "
+        "is .numpy() what you're looking for?"
+    )
 
 
 # This call creates an EagerTensor class, as a subclass of _EagerTensorBase, and
@@ -1502,13 +1509,10 @@ def _create_c_op(graph, node_def, inputs, control_inputs):
     serialized = attr_value.SerializeToString()
     # TODO(skyewm): this creates and deletes a new TF_Status for every attr.
     # It might be worth creating a convenient way to re-use the same status.
-    with errors.raise_exception_on_not_ok_status() as status:
-      c_api.TF_SetAttrValueProto(op_desc,
-                                 compat.as_str(name), serialized, status)
+    c_api.TF_SetAttrValueProto(op_desc, compat.as_str(name), serialized)
 
   try:
-    with errors.raise_exception_on_not_ok_status() as status:
-      c_op = c_api.TF_FinishOperation(op_desc, status)
+    c_op = c_api.TF_FinishOperation(op_desc)
   except errors.InvalidArgumentError as e:
     # Convert to ValueError for backwards compatibility.
     raise ValueError(str(e))
@@ -1653,6 +1657,9 @@ class Operation(object):
       self._control_inputs_val = control_input_ops
       self._node_def_val = copy.deepcopy(node_def)
       self._op_def_val = op_def
+    else:
+      # This will be set by self.inputs.
+      self._inputs_val = None
 
     self._id_value = self._graph._next_id()  # pylint: disable=protected-access
     self._original_op = original_op
@@ -1906,7 +1913,8 @@ class Operation(object):
     tensor._add_consumer(self)  # pylint: disable=protected-access
     self._recompute_node_def()
 
-  def _update_input(self, index, tensor):
+  # TODO(skyewm): Remove `update_dtype` when we enable the C API.
+  def _update_input(self, index, tensor, update_dtype=True):
     """Update the input to this operation at the given index.
 
     NOTE: This is for TF internal use only. Please don't use it.
@@ -1914,6 +1922,7 @@ class Operation(object):
     Args:
       index: the index of the input to update.
       tensor: the Tensor to be used as the input at the given index.
+      update_dtype: If `False`, the type for this input is not updated.
 
     Raises:
       TypeError: if tensor is not a Tensor,
@@ -1924,16 +1933,17 @@ class Operation(object):
       raise TypeError("tensor must be a Tensor: %s" % tensor)
     _assert_same_graph(self, tensor)
     if self._c_op:
-      with errors.raise_exception_on_not_ok_status() as status:
-        c_api.UpdateEdge(
-            self._graph._c_graph,  # pylint: disable=protected-access
-            tensor._as_tf_output(),  # pylint: disable=protected-access
-            self._tf_input(index),
-            status)
+      # Reset cached inputs.
+      self._inputs_val = None
+      c_api.UpdateEdge(
+          self._graph._c_graph,  # pylint: disable=protected-access
+          tensor._as_tf_output(),  # pylint: disable=protected-access
+          self._tf_input(index))
     else:
       self._inputs_val[index].consumers().remove(self)
       self._inputs_val[index] = tensor
-      self._input_types_val[index] = tensor.dtype
+      if update_dtype:
+        self._input_types_val[index] = tensor.dtype
       tensor._add_consumer(self)  # pylint: disable=protected-access
       self._recompute_node_def()
 
@@ -2039,15 +2049,18 @@ class Operation(object):
   def inputs(self):
     """The list of `Tensor` objects representing the data inputs of this op."""
     if self._c_op:
-      tf_outputs = c_api.GetOperationInputs(self._c_op)
-      # pylint: disable=protected-access
-      retval = [
-          self.graph._get_tensor_by_tf_output(tf_output)
-          for tf_output in tf_outputs
-      ]
-      # pylint: enable=protected-access
-      return Operation._InputList(retval)
-    return Operation._InputList(self._inputs_val)
+      if self._inputs_val is None:
+        tf_outputs = c_api.GetOperationInputs(self._c_op)
+        # pylint: disable=protected-access
+        retval = [
+            self.graph._get_tensor_by_tf_output(tf_output)
+            for tf_output in tf_outputs
+        ]
+        # pylint: enable=protected-access
+        self._inputs_val = Operation._InputList(retval)
+      return self._inputs_val
+    else:
+      return Operation._InputList(self._inputs_val)
 
   @property
   def _inputs(self):
@@ -2100,6 +2113,30 @@ class Operation(object):
     else:
       return self._control_inputs_val
 
+  @property
+  def _control_outputs(self):
+    """The `Operation` objects which have a control dependency on this op.
+
+    Before any of the ops in self._control_outputs can execute tensorflow will
+    ensure self has finished executing.
+
+    Returns:
+      A list of `Operation` objects.
+
+    """
+    if self._c_op:
+      control_c_ops = c_api.TF_OperationGetControlOutputs_wrapper(self._c_op)
+      # pylint: disable=protected-access
+      return [
+          self.graph._get_operation_by_name_unsafe(
+              c_api.TF_OperationName(c_op)) for c_op in control_c_ops
+      ]
+      # pylint: enable=protected-access
+    else:
+      # TODO(apassos) this should be less inefficient.
+      return [o for o in self._graph.get_operations()
+              if self in o.control_inputs]
+
   @property
   def _control_inputs(self):
     logging.warning("Operation._control_inputs is private, use "
@@ -2146,8 +2183,7 @@ class Operation(object):
     # pylint: enable=line-too-long
     if self._c_op:
       with c_api_util.tf_buffer() as buf:
-        with errors.raise_exception_on_not_ok_status() as status:
-          c_api.TF_OperationToNodeDef(self._c_op, buf, status)
+        c_api.TF_OperationToNodeDef(self._c_op, buf)
         data = c_api.TF_GetBuffer(buf)
       node_def = node_def_pb2.NodeDef()
       node_def.ParseFromString(compat.as_bytes(data))
@@ -2205,11 +2241,9 @@ class Operation(object):
       buf = c_api.TF_NewBufferFromString(
           compat.as_bytes(attr_value.SerializeToString()))
       try:
-        with errors.raise_exception_on_not_ok_status() as status:
-          # pylint: disable=protected-access
-          c_api.SetAttr(self._graph._c_graph, self._c_op, attr_name, buf,
-                        status)
-          # pylint: enable=protected-access
+        # pylint: disable=protected-access
+        c_api.SetAttr(self._graph._c_graph, self._c_op, attr_name, buf)
+        # pylint: enable=protected-access
       finally:
         c_api.TF_DeleteBuffer(buf)
     else:
@@ -2231,8 +2265,7 @@ class Operation(object):
     if self._c_op:
       try:
         with c_api_util.tf_buffer() as buf:
-          with errors.raise_exception_on_not_ok_status() as status:
-            c_api.TF_OperationGetAttrValueProto(self._c_op, name, buf, status)
+          c_api.TF_OperationGetAttrValueProto(self._c_op, name, buf)
           data = c_api.TF_GetBuffer(buf)
       except errors.InvalidArgumentError as e:
         # Convert to ValueError for backwards compatibility.
@@ -2446,11 +2479,10 @@ def _set_shapes_for_outputs_c_api(op):
   # The C API computes the shapes when the TF_Operation is created. Fetch the
   # output shapes from the C object.
   for output in op.outputs:
-    with errors.raise_exception_on_not_ok_status() as status:
-      # pylint: disable=protected-access
-      shape_vector, unknown_shape = c_api.TF_GraphGetTensorShapeHelper(
-          op._graph._c_graph, output._as_tf_output(), status)
-      # pylint: enable=protected-access
+    # pylint: disable=protected-access
+    shape_vector, unknown_shape = c_api.TF_GraphGetTensorShapeHelper(
+        op._graph._c_graph, output._as_tf_output())
+    # pylint: enable=protected-access
     if unknown_shape:
       output.set_shape(tensor_shape.unknown_shape())
     elif not shape_vector:
@@ -2459,6 +2491,14 @@ def _set_shapes_for_outputs_c_api(op):
       shape_vector = [None if d == -1 else d for d in shape_vector]
       output.set_shape(tensor_shape.TensorShape(shape_vector))
 
+    serialized = c_api.ResourceHandleShapeAndType(op._graph._c_graph,
+                                                  output._as_tf_output())
+    if serialized:
+      output._handle_data = (
+          cpp_shape_inference_pb2.CppShapeInferenceResult.HandleData.FromString(
+              compat.as_bytes(serialized)))
+    else:
+      output._handle_data = None
 
 # TODO(skyewm): remove this when _USE_C_API flag is removed.
 def _set_shapes_for_outputs(op):
@@ -2717,8 +2757,6 @@ class Graph(object):
     self._next_id_counter = 0  # GUARDED_BY(self._lock)
     self._nodes_by_name = dict()  # GUARDED_BY(self._lock)
     self._version = 0  # GUARDED_BY(self._lock)
-    # Current name stack: uniquified names
-    self._name_stack = ""
     # Maps a name used in the graph to the next id to use for that name.
     self._names_in_use = {}
     self._stack_state_is_thread_local = False
@@ -2777,6 +2815,9 @@ class Graph(object):
     # being called inside function definitions behave as if they were seeing the
     # actual outside graph).
     self._graph_key = "grap-key-%d/" % (uid(),)
+    # A string with the last reduction method passed to
+    # losses.compute_weighted_loss(), or None.
+    self._last_loss_reduction = None
     self._container = ""
     self._registered_ops = op_def_registry.get_registered_ops()
 
@@ -2962,8 +3003,7 @@ class Graph(object):
     # pylint: enable=line-too-long
     if self._c_graph:
       with c_api_util.tf_buffer() as buf:
-        with errors.raise_exception_on_not_ok_status() as status:
-          c_api.TF_GraphVersions(self._c_graph, buf, status)
+        c_api.TF_GraphVersions(self._c_graph, buf)
         data = c_api.TF_GetBuffer(buf)
       version_def = versions_pb2.VersionDef()
       version_def.ParseFromString(compat.as_bytes(data))
@@ -3066,8 +3106,7 @@ class Graph(object):
     if self._c_graph:
       with self._lock:
         with c_api_util.tf_buffer() as buf:
-          with errors.raise_exception_on_not_ok_status() as status:
-            c_api.TF_GraphToGraphDef(self._c_graph, buf, status)
+          c_api.TF_GraphToGraphDef(self._c_graph, buf)
           data = c_api.TF_GetBuffer(buf)
         graph = graph_pb2.GraphDef()
         graph.ParseFromString(compat.as_bytes(data))
@@ -3176,14 +3215,10 @@ class Graph(object):
       # remove this when all functions are generated using the C API by default
       # as this will be unnecessary.
       if not function._c_func:
-        with errors.raise_exception_on_not_ok_status() as status:
-          serialized = function.definition.SerializeToString()
-          function._c_func = c_api.TF_FunctionImportFunctionDef(
-              serialized, status)
-      with errors.raise_exception_on_not_ok_status() as status:
-        gradient = function._grad_func._c_func if function._grad_func else None
-        c_api.TF_GraphCopyFunction(self._c_graph, function._c_func, gradient,
-                                   status)
+        serialized = function.definition.SerializeToString()
+        function._c_func = c_api.TF_FunctionImportFunctionDef(serialized)
+      gradient = function._grad_func._c_func if function._grad_func else None
+      c_api.TF_GraphCopyFunction(self._c_graph, function._c_func, gradient)
     else:
       # If there is already a function with the same name, raise an error
       # if bodies are different. Else, do nothing. The C API version above
@@ -3293,6 +3328,20 @@ class Graph(object):
           input_types=input_types,
           original_op=self._default_original_op,
           op_def=op_def)
+
+      # TODO(vrv): Instead of eagerly filling in shape property for every op,
+      # only populate the shape when requested.
+      #
+      # TODO(skyewm): unlike in the original Python implementation, the C API
+      # always computes shape information (even for function calls, which the
+      # original Python shape inference code doesn't handle). Deprecate the
+      # compute_shapes argument.
+      #
+      # TODO(b/74620627): move this back to _create_op_helper once _USE_C_SHAPES
+      # is removed
+      if (ret._c_op and _USE_C_SHAPES) or compute_shapes:  # pylint: disable=protected-access
+        set_shapes_for_outputs(ret)
+
       self._create_op_helper(ret, compute_shapes=compute_shapes,
                              compute_device=compute_device)
     return ret
@@ -3319,22 +3368,17 @@ class Graph(object):
     """
     self._check_not_finalized()
     ret = Operation(c_op, self)
-    assert ret.name not in self._names_in_use
-    self._names_in_use[ret.name] = 1
+    # If a name_scope was created with ret.name but no nodes were created in it,
+    # the name will still appear in _names_in_use even though the name hasn't
+    # been used. This is ok, just leave _names_in_use as-is in this case.
+    # TODO(skyewm): make the C API guarantee no name conflicts.
+    if ret.name not in self._names_in_use:
+      self._names_in_use[ret.name] = 1
     self._create_op_helper(ret, compute_device=compute_device)
     return ret
 
   def _create_op_helper(self, op, compute_shapes=True, compute_device=True):
     """Common logic for creating an op in this graph."""
-    # TODO(vrv): Instead of eagerly filling in shape property for every op, only
-    # populate the shape when requested.
-    #
-    # TODO(skyewm): unlike in the original Python implementation, the C API
-    # always computes shape information (even for function calls, which the
-    # original Python shape inference code doesn't handle). Deprecate the
-    # compute_shapes argument.
-    if (op._c_op and _USE_C_SHAPES) or compute_shapes:  # pylint: disable=protected-access
-      set_shapes_for_outputs(op)
     # TODO(b/XXXX): move to Operation.__init__ once _USE_C_API flag is removed.
     self._add_op(op)
 
@@ -3439,6 +3483,12 @@ class Graph(object):
     ]
 
     for op in new_ops:
+      # Operations created by the C API always retrieve shapes from the C API so
+      # we preserve the shapes of ops created in import_graph_def (from the
+      # "_output_shapes" attr of the imported NodeDef).
+      # TODO(b/74620627): move this back to _create_op_helper once _USE_C_SHAPES
+      # is removed.
+      _set_shapes_for_outputs_c_api(op)
       new_control_inputs = self._control_dependencies_for_inputs(op.inputs)
       # pylint: disable=protected-access
       op._add_control_inputs(new_control_inputs)
@@ -3689,11 +3739,9 @@ class Graph(object):
     """Returns the `OpDef` proto for `type`. `type` is a string."""
     if self._c_graph:
       with c_api_util.tf_buffer() as buf:
-        with errors.raise_exception_on_not_ok_status() as status:
-          # pylint: disable=protected-access
-          c_api.TF_GraphGetOpDef(self._c_graph,
-                                 compat.as_bytes(type), buf, status)
-          # pylint: enable=protected-access
+        # pylint: disable=protected-access
+        c_api.TF_GraphGetOpDef(self._c_graph, compat.as_bytes(type), buf)
+        # pylint: enable=protected-access
         data = c_api.TF_GetBuffer(buf)
       op_def = op_def_pb2.OpDef()
       op_def.ParseFromString(compat.as_bytes(data))
@@ -3886,6 +3934,17 @@ class Graph(object):
     finally:
       self._default_original_op = old_original_op
 
+  @property
+  def _name_stack(self):
+    # This may be called from a thread where name_stack doesn't yet exist.
+    if not hasattr(self._thread_local, "_name_stack"):
+      self._thread_local._name_stack = ""
+    return self._thread_local._name_stack
+
+  @_name_stack.setter
+  def _name_stack(self, name_stack):
+    self._thread_local._name_stack = name_stack
+
   # pylint: disable=g-doc-return-or-yield,line-too-long
   @tf_contextlib.contextmanager
   def name_scope(self, name):
@@ -4458,6 +4517,22 @@ class Graph(object):
         return tf.matmul(tensor, tensor)
     ```
 
+    Also note that though execution of ops created under this scope will trigger
+    execution of the dependencies, the ops created under this scope might still
+    be pruned from a normal tensorflow graph. For example, in the following
+    snippet of code the dependencies are never executed:
+
+    ```python
+      loss = model.loss()
+      with tf.control_dependencies(dependencies):
+        loss = loss + tf.constant(1)  # note: dependencies ignored in the
+                                      # backward pass
+      return tf.gradients(loss, model.variables)
+    ```
+
+    This is because evaluating the gradient graph does not require evaluating
+    the constant(1) op created in the forward pass.
+
     Args:
       control_inputs: A list of `Operation` or `Tensor` objects which
         must be executed or computed before running the operations
@@ -5095,11 +5170,12 @@ class _DefaultGraphStack(_DefaultStack):  # pylint: disable=protected-access
   @tf_contextlib.contextmanager
   def get_controller(self, default):
     try:
-      context.context_stack.push(default.building_function, default.as_default)
+      context.context().context_switches.push(default.building_function,
+                                              default.as_default)
       with super(_DefaultGraphStack, self).get_controller(default) as g:
         yield g
     finally:
-      context.context_stack.pop()
+      context.context().context_switches.pop()
 
 
 _default_graph_stack = _DefaultGraphStack()
@@ -5125,13 +5201,13 @@ def init_scope():
         graph function. Here, a context is defined as either a graph or an eager
         context. Every context switch, i.e., every installation of a graph as
         the default graph and every switch into eager mode, is logged in a
-        thread-local stack called the `context_stack`; the log entry for a
+        thread-local stack called `context_switches`; the log entry for a
         context switch is popped from the stack when the context is exited.
-        Entering an `init_scope` is equivalent to crawling up the
-        `context_stack`, finding the first context that is not building a graph
-        function, and entering it. A caveat is that if graph mode is enabled
-        but the default graph stack is empty, then entering an `init_scope`
-        will simply install a fresh graph as the default one.
+        Entering an `init_scope` is equivalent to crawling up
+        `context_switches`, finding the first context that is not building a
+        graph function, and entering it. A caveat is that if graph mode is
+        enabled but the default graph stack is empty, then entering an
+        `init_scope` will simply install a fresh graph as the default one.
 
     (3) The gradient tape is paused while the scope is active.
   """
@@ -5161,7 +5237,7 @@ def init_scope():
       outer_context = default_graph.as_default
     else:
       # Find a context that is not building a function.
-      for stack_entry in reversed(context.context_stack.stack):
+      for stack_entry in reversed(context.context().context_switches.stack):
         if not stack_entry.is_building_function:
           outer_context = stack_entry.enter_context_fn
           break
@@ -5184,7 +5260,8 @@ def init_scope():
 
 
 @tf_export("enable_eager_execution")
-def enable_eager_execution(config=None, device_policy=None):
+def enable_eager_execution(config=None, device_policy=None,
+                           execution_mode=None):
   """Enables eager execution for the lifetime of this program.
 
   Eager execution provides an imperative interface to TensorFlow. With eager
@@ -5210,13 +5287,15 @@ def enable_eager_execution(config=None, device_policy=None):
 
   Args:
     config: (Optional.) A @{tf.ConfigProto} to use to configure the environment
-     in which operations are executed. Note that @{tf.ConfigProto} is also
-     used to configure graph execution (via @{tf.Session}) and many options
-     within `tf.ConfigProto` are not implemented (or are irrelevant) when
+      in which operations are executed. Note that @{tf.ConfigProto} is also
+      used to configure graph execution (via @{tf.Session}) and many options
+      within `tf.ConfigProto` are not implemented (or are irrelevant) when
      eager execution is enabled.
     device_policy: (Optional.) Policy controlling how operations requiring
      inputs on a specific device (e.g., a GPU 0) handle inputs on a different
-     device  (e.g. GPU 1 or CPU).
+     device  (e.g. GPU 1 or CPU). When set to None, an appropriate value will be
+     picked automatically. The value picked may change between TensorFlow
+     releases.
      Valid values:
 
       - tf.contrib.eager.DEVICE_PLACEMENT_EXPLICIT: raises an error if the
@@ -5232,6 +5311,15 @@ def enable_eager_execution(config=None, device_policy=None):
 
       - tf.contrib.eager.DEVICE_PLACEMENT_SILENT_FOR_INT32: silently copies
         int32 tensors, raising errors on the other ones.
+    execution_mode: (Optional.) Policy controlling how operations dispatched are
+      actually executed. When set to None, an appropriate value will be picked
+      automatically. The value picked may change between TensorFlow releases.
+      Valid values:
+
+        - tf.contrib.eager.SYNC: executes each operation synchronously.
+
+        - tf.contrib.eager.ASYNC: executes each operation asynchronously. These
+          operations may return "non-ready" handles.
 
   Raises:
     ValueError: If eager execution is enabled after creating/executing a
@@ -5248,6 +5336,10 @@ def enable_eager_execution(config=None, device_policy=None):
     raise ValueError(
         "device_policy must be one of None, tf.contrib.eager.DEVICE_PLACEMENT_*"
     )
+  if execution_mode not in (None, context.SYNC, context.ASYNC):
+    raise ValueError(
+        "execution_mode must be one of None, tf.contrib.eager.SYNC, "
+        "tf.contrib.eager.ASYNC")
   # pylint: disable=protected-access
   if context._default_mode == context.GRAPH_MODE:
     graph_mode_has_been_used = (
@@ -5258,28 +5350,31 @@ def enable_eager_execution(config=None, device_policy=None):
           "tf.enable_eager_execution must be called at program startup.")
   context._default_mode = context.EAGER_MODE
   if context._context is None:
-    context._context = context.Context(config=config,
-                                       device_policy=device_policy)
-    if context.context_stack.stack:
-      raise AssertionError("Invariant violated: The context stack must "
-                           "be empty when eager execution is enabled.")
-    # Log that eager execution has been enabled by pushing an entry onto the
-    # context stack; this entry won't ever be popped, as it's impossible to
-    # disable eager execution
-    context.context_stack.push(False, context.eager_mode)
-  elif ((config is not None and config is not context._context._config)
-        or (device_policy is not None
-            and device_policy is not context._context._device_policy)):
+    context._context = context.Context(
+        config=config,
+        device_policy=device_policy,
+        execution_mode=execution_mode)
+  elif ((config is not None and config is not context._context._config) or
+        (device_policy is not None and
+         device_policy is not context._context._device_policy) or
+        (execution_mode is not None and
+         execution_mode is not context._context._execution_mode)):
     raise ValueError("Trying to change the options of an active eager"
                      " execution. Context config: %s, specified config:"
-                     " %s. Context device policy: %s; specified device"
-                     " policy: %s." % (config, context._context._config,
-                                       device_policy,
-                                       context._context._device_policy))
+                     " %s. Context device policy: %s, specified device"
+                     " policy: %s. Context execution mode: %s, "
+                     " specified execution mode %s." %
+                     (context._context._config, config,
+                      context._context._device_policy, device_policy,
+                      context._context._execution_mode, execution_mode))
   else:
     raise ValueError(
         "tf.enable_eager_execution must be called at program startup.")
 
+  # Monkey patch to get rid of an unnecessary conditional since the context is
+  # now initialized.
+  context.context = context.context_safe
+
 
 def eager_run(main=None, argv=None):
   """Runs the program with an optional main function and argv list.
@@ -5362,7 +5457,7 @@ def get_name_scope():
   Returns:
     A string representing the current name scope.
   """
-  if context.in_eager_mode():
+  if context.executing_eagerly():
     return context.context().scope_name.rstrip("/")
   return get_default_graph().get_name_scope()
 
@@ -5814,6 +5909,9 @@ def strip_name_scope(name, export_scope):
     is None.
   """
   if export_scope:
+    if export_scope[-1] == "/":
+      export_scope = export_scope[:-1]
+
     try:
       # Strips export_scope/, export_scope///,
       # ^export_scope/, loc:@export_scope/.
@@ -5839,6 +5937,9 @@ def prepend_name_scope(name, import_scope):
     is None.
   """
   if import_scope:
+    if import_scope[-1] == "/":
+      import_scope = import_scope[:-1]
+
     try:
       str_to_replace = r"([\^]|loc:@|^)(.*)"
       return re.sub(str_to_replace, r"\1" + import_scope + r"/\2",
@@ -5921,8 +6022,9 @@ def get_from_proto_function(collection_name):
 def _assert_collection_is_ok(collection_name):
   if context.executing_eagerly():
     if collection_name in GraphKeys._VARIABLE_COLLECTIONS:  # pylint: disable=protected-access
-      raise ValueError("When Eager Execution is enabled, variable "
-                       "collections are not supported.")
+      raise ValueError(
+          "variable collections are not supported when eager execution is enabled."
+      )
 
 
 def _operation_conversion_error(op, dtype=None, name=None, as_ref=False):
diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py
index 6daab80408c79f95323c7976bd101c0765d7f566..58bead91ed8dedaa2fa90e9edfcf377e943ef79f 100644
--- a/tensorflow/python/framework/ops_test.py
+++ b/tensorflow/python/framework/ops_test.py
@@ -473,6 +473,7 @@ class OperationTest(test_util.TensorFlowTestCase):
     self.assertEqual(z.control_inputs, [x, x])
     z._add_control_inputs([x, y, y])  # pylint: disable=protected-access
     self.assertEqual(z.control_inputs, [x, x, x, y, y])
+    self.assertEqual(x._control_outputs, [z])
 
   def testAddControlInputC(self):
     # The C API dedups redundant control edges, pure Python does not
@@ -487,6 +488,7 @@ class OperationTest(test_util.TensorFlowTestCase):
     self.assertEqual(z.control_inputs, [x])
     z._add_control_inputs([x, y, y])  # pylint: disable=protected-access
     self.assertEqual(z.control_inputs, [x, y])
+    self.assertEqual(x._control_outputs, [z])
 
   def testRemoveAllControlInputs(self):
     a = constant_op.constant(1)
@@ -1556,6 +1558,35 @@ class MultithreadedGraphStateTest(test_util.TensorFlowTestCase):
              input: "^ColocateWithMe_2" }
     """, gd)
 
+  def testNameStack(self):
+
+    class NameSettingThread(self.TestThread):
+
+      def run(self):
+        with g.name_scope("foo"):
+          op1 = g.create_op("FloatOutput", [], [dtypes.float32])
+          self.has_mutated_graph.set()
+          self.should_continue.wait()
+          self.should_continue.clear()
+          op2 = g.create_op("FloatOutput", [], [dtypes.float32])
+          self.result = (op1, op2)
+
+    g = ops.Graph()
+    threads = [NameSettingThread(g, i) for i in range(3)]
+    for t in threads:
+      t.start()
+      t.has_mutated_graph.wait()
+      t.has_mutated_graph.clear()
+
+    for t in threads:
+      t.should_continue.set()
+      t.join()
+
+    suffixes = ["", "_1", "_2"]
+    for t, s in zip(threads, suffixes):
+      self.assertEquals("foo" + s + "/FloatOutput", t.result[0].name)
+      self.assertEquals("foo" + s + "/FloatOutput_1", t.result[1].name)
+
 
 @test_util.with_c_api
 class ObjectWithName(object):
@@ -2919,6 +2950,9 @@ class EnableEagerExecutionTest(test_util.TensorFlowTestCase):
     with self.assertRaisesRegexp(ValueError, "device_policy must be one of"):
       c = config_pb2.ConfigProto()
       ops.enable_eager_execution(c, c)
+    with self.assertRaisesRegexp(ValueError, "execution_mode must be one of"):
+      c = config_pb2.ConfigProto()
+      ops.enable_eager_execution(c, execution_mode=c)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/framework/python_op_gen.cc b/tensorflow/python/framework/python_op_gen.cc
index 00a63650b7c8ee6ec56e0d7f92525a87e0100696..e5e3b821998718e7b87a95439a442df98ad7c997 100644
--- a/tensorflow/python/framework/python_op_gen.cc
+++ b/tensorflow/python/framework/python_op_gen.cc
@@ -78,7 +78,7 @@ bool IsPythonReserved(const string& s) {
 bool IsOpWithUnderscorePrefix(const string& s) {
   static const std::set<string>* const kUnderscoreOps = new std::set<string>(
       {// Lowercase built-in functions and types in Python, from:
-       // [x for x in dir(__builtins__) if x[0].islower()]
+       // [x for x in dir(__builtins__) if x[0].islower()] except "round".
        // These need to be excluded so they don't conflict with actual built-in
        // functions since we use '*' imports.
        "abs", "all", "any", "apply", "bin", "bool", "buffer", "bytearray",
@@ -90,18 +90,15 @@ bool IsOpWithUnderscorePrefix(const string& s) {
        "iter", "len", "license", "list", "locals", "long", "map", "max",
        "memoryview", "min", "next", "object", "oct", "open", "ord", "pow",
        "print", "property", "quit", "range", "raw_input", "reduce", "reload",
-       "repr", "reversed", "round", "set", "setattr", "slice", "sorted",
-       "staticmethod", "str", "sum", "super", "tuple", "type", "unichr",
-       "unicode", "vars", "xrange", "zip",
+       "repr", "reversed", "set", "setattr", "slice", "sorted", "staticmethod",
+       "str", "sum", "super", "tuple", "type", "unichr", "unicode", "vars",
+       "xrange", "zip",
        // These have the same name as ops defined in Python and might be used
        // incorrectly depending on order of '*' imports.
        // TODO(annarev): reduce usage of '*' imports and remove these from the
        // list.
        "fused_batch_norm", "histogram_fixed_width", "stack",
-       "batch_norm_with_global_normalization",
-       // TODO(annarev): replace these ops in the next change.
-       "broadcast_gradient_args", "enter", "histogram_summary", "ref_enter",
-       "ref_identity", "scalar_summary"});
+       "batch_norm_with_global_normalization"});
   return kUnderscoreOps->count(s) > 0;
 }
 
@@ -451,7 +448,7 @@ string AttrValueToPython(const string& type, const AttrValue& value,
     return TensorToPython(value.tensor());
   } else if (type == "func") {
     return StringToPython(value.func().name());
-  } else if (StringPiece(type).starts_with("list(")) {
+  } else if (str_util::StartsWith(type, "list(")) {
     return strings::StrCat("[", AttrListToPython(value, dtype_module), "]");
   } else {
     return "?";
diff --git a/tensorflow/python/framework/python_op_gen_main.cc b/tensorflow/python/framework/python_op_gen_main.cc
index bc5ca195da50499c6fbab822a9a093be3f0277e0..ca6ed42beec4a3d5ff70d5a605ab006265d1cce9 100644
--- a/tensorflow/python/framework/python_op_gen_main.cc
+++ b/tensorflow/python/framework/python_op_gen_main.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/lib/io/inputbuffer.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/scanner.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/platform/logging.h"
@@ -95,7 +96,8 @@ string InferSourceFileName(const char* argv_zero) {
   // operators defined in <op type>_ops.cc
   const char* kExecPrefix = "gen_";
   const char* kExecSuffix = "_py_wrappers_cc";
-  if (command_str.Consume(kExecPrefix) && command_str.ends_with(kExecSuffix)) {
+  if (str_util::ConsumePrefix(&command_str, kExecPrefix) &&
+      str_util::EndsWith(command_str, kExecSuffix)) {
     command_str.remove_suffix(strlen(kExecSuffix));
     return strings::StrCat(command_str, ".cc");
   } else {
diff --git a/tensorflow/python/framework/smart_cond.py b/tensorflow/python/framework/smart_cond.py
index c7ff23e4ff809ed7bc57259fa3ec9feb921b5a71..48a834392b47b4cdcc82381153852584052a5aad 100644
--- a/tensorflow/python/framework/smart_cond.py
+++ b/tensorflow/python/framework/smart_cond.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python import pywrap_tensorflow as c_api
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import control_flow_ops
@@ -83,9 +82,8 @@ def smart_constant_value(pred):
     # wanted to limit the change hidden behind _USE_C_API).
     # pylint: disable=protected-access
     if pred_value is None and ops._USE_C_API:
-      with errors.raise_exception_on_not_ok_status() as status:
-        pred_value = c_api.TF_TryEvaluateConstant_wrapper(
-            pred.graph._c_graph, pred._as_tf_output(), status)
+      pred_value = c_api.TF_TryEvaluateConstant_wrapper(pred.graph._c_graph,
+                                                        pred._as_tf_output())
     # pylint: enable=protected-access
 
   else:
diff --git a/tensorflow/python/framework/tensor_spec.py b/tensorflow/python/framework/tensor_spec.py
index 546c48adbae7a97b404b2507994dfff77dd69ec6..6676cfcaa334e02208d9ec346de7d266c4700f24 100644
--- a/tensorflow/python/framework/tensor_spec.py
+++ b/tensorflow/python/framework/tensor_spec.py
@@ -110,6 +110,9 @@ class TensorSpec(object):
   def __ne__(self, other):
     return not self == other
 
+  def __reduce__(self):
+    return TensorSpec, (self._shape, self._dtype, self._name)
+
 
 class BoundedTensorSpec(TensorSpec):
   """A `TensorSpec` that specifies minimum and maximum values.
@@ -210,4 +213,7 @@ class BoundedTensorSpec(TensorSpec):
     return (tensor_spec_eq and np.allclose(self.minimum, other.minimum) and
             np.allclose(self.maximum, other.maximum))
 
+  def __reduce__(self):
+    return BoundedTensorSpec, (self._shape, self._dtype, self._minimum,
+                               self._maximum, self._name)
 
diff --git a/tensorflow/python/framework/tensor_spec_test.py b/tensorflow/python/framework/tensor_spec_test.py
index b33d769d86d3ea03f6a59082a132b0a94d5921ea..2e9e43e12279fe833d640d4163c5474c398e70cd 100644
--- a/tensorflow/python/framework/tensor_spec_test.py
+++ b/tensorflow/python/framework/tensor_spec_test.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import pickle
+
 import numpy as np
 
 from tensorflow.python.framework import constant_op
@@ -143,6 +145,10 @@ class TensorSpecTest(test_util.TensorFlowTestCase):
     unbounded_spec = tensor_spec.TensorSpec((1, 2), dtypes.int32)
     self.assertFalse(unbounded_spec.is_bounded())
 
+  def testSerialization(self):
+    desc = tensor_spec.TensorSpec([1, 5], dtypes.float32, "test")
+    self.assertEqual(pickle.loads(pickle.dumps(desc)), desc)
+
 
 class BoundedTensorSpecTest(test_util.TensorFlowTestCase):
 
@@ -243,6 +249,10 @@ class BoundedTensorSpecTest(test_util.TensorFlowTestCase):
     self.assertEqual(spec.dtype.max, bounded_spec.maximum)
     self.assertEqual(spec.name, bounded_spec.name)
 
+  def testSerialization(self):
+    desc = tensor_spec.BoundedTensorSpec([1, 5], dtypes.float32, -1, 1, "test")
+    self.assertEqual(pickle.loads(pickle.dumps(desc)), desc)
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/framework/test_file_system.cc b/tensorflow/python/framework/test_file_system.cc
index 094ea6f658ab800736eebce2db7ee80da151a033..6e9915adbb619c5c4891742ddda700da47ed590f 100644
--- a/tensorflow/python/framework/test_file_system.cc
+++ b/tensorflow/python/framework/test_file_system.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/null_file_system.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index e9e86e452b19fd23b4eddfb4f6a985458b503eff..bf00fa6439b82234e951598131d2d7ab579fb6c4 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -53,6 +53,7 @@ from tensorflow.python.eager import tape  # pylint: disable=unused-import
 from tensorflow.python.framework import device as pydev
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import importer
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
@@ -201,6 +202,7 @@ def _strip_checkpoint_v2_randomized(graph_def):
 def IsGoogleCudaEnabled():
   return pywrap_tensorflow.IsGoogleCudaEnabled()
 
+
 def CudaSupportsHalfMatMulAndConv():
   return pywrap_tensorflow.CudaSupportsHalfMatMulAndConv()
 
@@ -335,6 +337,8 @@ def _use_c_api_wrapper(fn, use_c_api, *args, **kwargs):
     # Make sure default graph reflects prev_value in case next test doesn't call
     # reset_default_graph().
     ops.reset_default_graph()
+
+
 # pylint: disable=protected-access
 
 
@@ -451,7 +455,8 @@ def with_c_api(cls):
   # If the C API is already enabled, don't do anything. Some tests break if the
   # same test is run twice, so this allows us to turn on the C API by default
   # without breaking these tests.
-  if ops._USE_C_API: return cls
+  if ops._USE_C_API:
+    return cls
 
   for name, value in cls.__dict__.copy().items():
     if callable(value) and name.startswith("test"):
@@ -469,6 +474,7 @@ def assert_no_new_pyobjects_executing_eagerly(f):
   Useful for checking that there are no missing Py_DECREFs in the C exercised by
   a bit of Python.
   """
+
   def decorator(self, **kwargs):
     """Warms up, gets an object count, runs the test, checks for new objects."""
     with context.eager_mode():
@@ -481,10 +487,18 @@ def assert_no_new_pyobjects_executing_eagerly(f):
       gc.collect()
       # There should be no new Python objects hanging around.
       new_count = len(gc.get_objects())
-      self.assertEqual(previous_count, new_count)
+      # In some cases (specifacally on MacOS), new_count is somehow
+      # smaller than previous_count.
+      # Using plain assert because not all classes using this decorator
+      # have assertLessEqual
+      assert new_count <= previous_count, (
+          "new_count(%d) is not less than or equal to previous_count(%d)" % (
+              new_count, previous_count))
       gc.enable()
+
   return decorator
 
+
 def assert_no_new_tensors(f):
   """Decorator for asserting that no new Tensors persist after a test.
 
@@ -508,17 +522,15 @@ def assert_no_new_tensors(f):
 
     def _is_tensorflow_object(obj):
       try:
-        return isinstance(obj, (
-            ops.Tensor,
-            variables.Variable,
-            tensor_shape.Dimension,
-            tensor_shape.TensorShape))
+        return isinstance(obj,
+                          (ops.Tensor, variables.Variable,
+                           tensor_shape.Dimension, tensor_shape.TensorShape))
       except ReferenceError:
         # If the object no longer exists, we don't care about it.
         return False
 
-    tensors_before = set(id(obj) for obj in gc.get_objects()
-                         if _is_tensorflow_object(obj))
+    tensors_before = set(
+        id(obj) for obj in gc.get_objects() if _is_tensorflow_object(obj))
     outside_graph_key = ops.get_default_graph()._graph_key
     with ops.Graph().as_default():
       # Run the test in a new graph so that collections get cleared when it's
@@ -572,18 +584,18 @@ def assert_no_garbage_created(f):
           "likely due to a reference cycle. New objects in cycle(s):")
       for i, obj in enumerate(gc.garbage[previous_garbage:]):
         try:
-          logging.error(
-              "Object %d of %d" % (i, len(gc.garbage) - previous_garbage))
+          logging.error("Object %d of %d", i,
+                        len(gc.garbage) - previous_garbage)
+
           def _safe_object_str(obj):
             return "<%s %d>" % (obj.__class__.__name__, id(obj))
-          logging.error("  Object type: %s" % (_safe_object_str(obj),))
-          logging.error("  Referrer types: %s" % (
-              ', '.join([_safe_object_str(ref)
-                         for ref in gc.get_referrers(obj)]),))
-          logging.error("  Referent types: %s" % (
-              ', '.join([_safe_object_str(ref)
-                         for ref in gc.get_referents(obj)]),))
-          logging.error("  Object attribute names: %s" % (dir(obj),))
+
+          logging.error("  Object type: %s", _safe_object_str(obj))
+          logging.error("  Referrer types: %s", ", ".join(
+              [_safe_object_str(ref) for ref in gc.get_referrers(obj)]))
+          logging.error("  Referent types: %s", ", ".join(
+              [_safe_object_str(ref) for ref in gc.get_referents(obj)]))
+          logging.error("  Object attribute names: %s", dir(obj))
           logging.error("  Object __str__:")
           logging.error(obj)
           logging.error("  Object __repr__:")
@@ -705,15 +717,23 @@ def is_gpu_available(cuda_only=False, min_cuda_compute_capability=None):
       return 0, 0
     return int(match.group(1)), int(match.group(2))
 
-  for local_device in device_lib.list_local_devices():
-    if local_device.device_type == "GPU":
-      if (min_cuda_compute_capability is None or
-          compute_capability_from_device_desc(local_device.physical_device_desc)
-          >= min_cuda_compute_capability):
+  try:
+    for local_device in device_lib.list_local_devices():
+      if local_device.device_type == "GPU":
+        if (min_cuda_compute_capability is None or
+            compute_capability_from_device_desc(
+                local_device.physical_device_desc) >=
+            min_cuda_compute_capability):
+          return True
+      if local_device.device_type == "SYCL" and not cuda_only:
         return True
-    if local_device.device_type == "SYCL" and not cuda_only:
-      return True
-  return False
+    return False
+  except errors_impl.NotFoundError as e:
+    if not all([x in str(e) for x in ["CUDA", "not find"]]):
+      raise e
+    else:
+      logging.error(str(e))
+      return False
 
 
 @contextlib.contextmanager
@@ -902,9 +922,9 @@ class TensorFlowTestCase(googletest.TestCase):
 
     Use the `use_gpu` and `force_gpu` options to control where ops are run. If
     `force_gpu` is True, all ops are pinned to `/device:GPU:0`. Otherwise, if
-    `use_gpu`
-    is True, TensorFlow tries to run as many ops on the GPU as possible. If both
-    `force_gpu and `use_gpu` are False, all ops are pinned to the CPU.
+    `use_gpu` is True, TensorFlow tries to run as many ops on the GPU as
+    possible. If both `force_gpu and `use_gpu` are False, all ops are pinned to
+    the CPU.
 
     Example:
     ```python
@@ -954,8 +974,6 @@ class TensorFlowTestCase(googletest.TestCase):
       config.graph_options.optimizer_options.opt_level = -1
       config.graph_options.rewrite_options.constant_folding = (
           rewriter_config_pb2.RewriterConfig.OFF)
-      config.graph_options.rewrite_options.arithmetic_optimization = (
-          rewriter_config_pb2.RewriterConfig.OFF)
       return config
 
     if graph is None:
@@ -1256,9 +1274,9 @@ class TensorFlowTestCase(googletest.TestCase):
             msg="Mismatched value: a%s is different from b%s." % (path_str,
                                                                   path_str))
       except TypeError as e:
-        msg = "Error: a%s has %s, but b%s has %s" % (
-            path_str, type(a), path_str, type(b))
-        e.args = ((e.args[0] + ' : ' + msg,) + e.args[1:])
+        msg = "Error: a%s has %s, but b%s has %s" % (path_str, type(a),
+                                                     path_str, type(b))
+        e.args = ((e.args[0] + " : " + msg,) + e.args[1:])
         raise
 
   def assertAllClose(self, a, b, rtol=1e-6, atol=1e-6, msg=None):
@@ -1438,8 +1456,7 @@ class TensorFlowTestCase(googletest.TestCase):
     """
     device1 = pydev.canonical_name(device1)
     device2 = pydev.canonical_name(device2)
-    self.assertEqual(device1, device2,
-                     "Devices %s and %s are not equal. %s" % 
+    self.assertEqual(device1, device2, "Devices %s and %s are not equal. %s" %
                      (device1, device2, msg))
 
   # Fix Python 3 compatibility issues
diff --git a/tensorflow/python/framework/versions.py b/tensorflow/python/framework/versions.py
index 06955b885852a641bc814f88c99838effe03bfd4..d08b4bf48a3d3cc89fc73f4c97df0574deee871e 100644
--- a/tensorflow/python/framework/versions.py
+++ b/tensorflow/python/framework/versions.py
@@ -29,7 +29,7 @@ __cxx11_abi_flag__ = pywrap_tensorflow.__cxx11_abi_flag__
 __monolithic_build__ = pywrap_tensorflow.__monolithic_build__
 
 VERSION = __version__
-tf_export("VERSION").export_constant(__name__, "VERSION")
+tf_export("VERSION", "__version__").export_constant(__name__, "VERSION")
 GIT_VERSION = __git_version__
 tf_export("GIT_VERSION").export_constant(__name__, "GIT_VERSION")
 COMPILER_VERSION = __compiler_version__
diff --git a/tensorflow/python/grappler/cluster_test.py b/tensorflow/python/grappler/cluster_test.py
index a3c4c2bbeba7c4ee5d00268c0e475e11a31fa7eb..26c6f22d34b27c8b866c0b23a36fdef5164348a4 100644
--- a/tensorflow/python/grappler/cluster_test.py
+++ b/tensorflow/python/grappler/cluster_test.py
@@ -87,9 +87,10 @@ class ClusterTest(test.TestCase):
 
   def testVirtualCluster(self):
     with ops.Graph().as_default() as g:
-      a = random_ops.random_uniform(shape=())
-      b = random_ops.random_uniform(shape=())
-      c = a + b
+      with ops.device('/device:GPU:0'):
+        a = random_ops.random_uniform(shape=[1024, 1024])
+        b = random_ops.random_uniform(shape=[1024, 1024])
+        c = a + b
       train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
       train_op.append(c)
       mg = meta_graph.create_meta_graph_def(graph=g)
@@ -102,10 +103,13 @@ class ClusterTest(test.TestCase):
               'architecture': '7'
           })
       named_device = device_properties_pb2.NamedDevice(
-          properties=device_properties, name='/GPU:0')
-      grappler_cluster = cluster.Cluster(devices=[named_device])
+          properties=device_properties, name='/device:GPU:0')
+      grappler_cluster = cluster.Cluster(
+          disable_detailed_stats=False,
+          disable_timeline=False,
+          devices=[named_device])
       op_perfs, run_time, _ = grappler_cluster.MeasureCosts(grappler_item)
-      self.assertGreater(run_time, 0)
+      self.assertEqual(run_time, 0.000545)
       self.assertEqual(len(op_perfs), 15)
 
       estimated_perf = grappler_cluster.EstimatePerformance(named_device)
diff --git a/tensorflow/python/grappler/constant_folding_test.py b/tensorflow/python/grappler/constant_folding_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab1d0ed25b9130fabcffbb8da2265c046206da46
--- /dev/null
+++ b/tensorflow/python/grappler/constant_folding_test.py
@@ -0,0 +1,69 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Grappler Constant Folding."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class ConstantFoldingTest(test.TestCase):
+
+  # See b/76008022.
+  def testScanInsideWhile(self):
+
+    def loop_cond(idx_step, *unused_args):
+      return idx_step < 1
+
+    def loop_body(idx_step, y):
+      x = array_ops.zeros([10, 20, 30], dtype=dtypes.float32)
+      x = functional_ops.scan(
+          math_ops.add,
+          x,
+          initializer=array_ops.zeros([20, 30], dtype=dtypes.float32),
+          back_prop=False,
+          parallel_iterations=1)
+
+      with ops.device('/cpu:0'):
+        y = array_ops.identity(x)
+
+        return idx_step + 1, y
+
+    if test.is_gpu_available(cuda_only=True):
+      init_y = array_ops.zeros([10, 20, 30], dtype=dtypes.float32)
+      _, y = control_flow_ops.while_loop(
+          loop_cond,
+          loop_body,
+          loop_vars=[0, init_y],
+          back_prop=False,
+          parallel_iterations=1)
+      with session.Session() as sess:
+        y_v = sess.run(y)
+        self.assertAllEqual(np.zeros([10, 20, 30]), y_v)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/grappler/item.i b/tensorflow/python/grappler/item.i
index 9a84c60b04029a64ed35a01f045a6eec5e492504..593d38206d127978f1982a0f2cc22e17daee1a3d 100644
--- a/tensorflow/python/grappler/item.i
+++ b/tensorflow/python/grappler/item.i
@@ -83,7 +83,6 @@ static GItem TF_NewItem(
   tensorflow::grappler::ItemConfig cfg;
   cfg.ignore_user_placement = ignore_user_placement;
   cfg.ignore_colocation = ignore_colocation;
-  cfg.inline_functions = true;
   std::unique_ptr<tensorflow::grappler::GrapplerItem> item =
       tensorflow::grappler::GrapplerItemFromMetaGraphDef("item", meta_graph, cfg);
   if (!item) {
diff --git a/tensorflow/python/grappler/item.py b/tensorflow/python/grappler/item.py
index 4a083849bd39f606877069419396d8c42ef077eb..1748efdd130268f2668cd8cb1b5c2da18bafd549 100644
--- a/tensorflow/python/grappler/item.py
+++ b/tensorflow/python/grappler/item.py
@@ -51,9 +51,7 @@ class Item(object):
     self._BuildTFItem()
 
   def IdentifyImportantOps(self, sort_topologically=False):
-    with errors.raise_exception_on_not_ok_status() as status:
-      return tf_item.TF_IdentifyImportantOps(self.tf_item, sort_topologically,
-                                             status)
+    return tf_item.TF_IdentifyImportantOps(self.tf_item, sort_topologically)
 
   def GetOpProperties(self):
     ret_from_swig = tf_item.TF_GetOpProperties(self.tf_item)
diff --git a/tensorflow/python/grappler/item_test.py b/tensorflow/python/grappler/item_test.py
index 7c3efd6249cbdaa2675632f7fc8e25fb88658a24..c40de9da0abca3bb99a82a1456261f45b1c45c99 100644
--- a/tensorflow/python/grappler/item_test.py
+++ b/tensorflow/python/grappler/item_test.py
@@ -111,7 +111,7 @@ class ItemTest(test.TestCase):
     with ops.Graph().as_default() as g:
       c = constant_op.constant([10])
       v = variables.Variable([3], dtype=dtypes.int32)
-      i = gen_array_ops._ref_identity(v)
+      i = gen_array_ops.ref_identity(v)
       a = state_ops.assign(i, c)
       train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
       train_op.append(a)
diff --git a/tensorflow/python/grappler/model_analyzer.cc b/tensorflow/python/grappler/model_analyzer.cc
index d23eb811ac2b0a6a8802979b4d966b5617c8a8d9..5a76cdd8fb29361cd800dea60cb9ebc0e39f6487 100644
--- a/tensorflow/python/grappler/model_analyzer.cc
+++ b/tensorflow/python/grappler/model_analyzer.cc
@@ -26,9 +26,10 @@ namespace grappler {
 
 ModelAnalyzer::ModelAnalyzer(const GrapplerItem& item) : item_(item) {}
 
-Status ModelAnalyzer::GenerateReport(bool debug, std::ostream& os) {
+Status ModelAnalyzer::GenerateReport(bool debug, bool assume_valid_feeds,
+                                     std::ostream& os) {
   GraphProperties properties(item_);
-  TF_RETURN_IF_ERROR(properties.InferStatically(false));
+  TF_RETURN_IF_ERROR(properties.InferStatically(assume_valid_feeds));
 
   for (const auto& node : item_.MainOpsFanin()) {
     PrintNodeInfo(node, properties, debug, os);
diff --git a/tensorflow/python/grappler/model_analyzer.h b/tensorflow/python/grappler/model_analyzer.h
index 5bc551927d88db723e21b29903d6f5b941048139..97ffafabe1f785e3b2c3044143b8fb8006b59225 100644
--- a/tensorflow/python/grappler/model_analyzer.h
+++ b/tensorflow/python/grappler/model_analyzer.h
@@ -31,7 +31,7 @@ class GraphProperties;
 class ModelAnalyzer {
  public:
   explicit ModelAnalyzer(const GrapplerItem& item);
-  Status GenerateReport(bool debug, std::ostream& os);
+  Status GenerateReport(bool debug, bool assume_valid_feeds, std::ostream& os);
 
  private:
   void PrintNodeInfo(const NodeDef* node, const GraphProperties& properties,
diff --git a/tensorflow/python/grappler/model_analyzer.i b/tensorflow/python/grappler/model_analyzer.i
index 7c3a692d0efc501341ff1dff3cf24b8a4830ec84..4955780764be802b9e4be3598bf114b227757194 100644
--- a/tensorflow/python/grappler/model_analyzer.i
+++ b/tensorflow/python/grappler/model_analyzer.i
@@ -40,7 +40,8 @@ limitations under the License.
 %}
 
 %{
-string GenerateModelReport(const tensorflow::MetaGraphDef& metagraph, bool debug) {
+string GenerateModelReport(const tensorflow::MetaGraphDef& metagraph,
+                           bool assume_valid_feeds, bool debug) {
   tensorflow::grappler::ItemConfig cfg;
   cfg.apply_optimizations = false;
   std::unique_ptr<tensorflow::grappler::GrapplerItem> item =
@@ -53,10 +54,11 @@ string GenerateModelReport(const tensorflow::MetaGraphDef& metagraph, bool debug
   tensorflow::grappler::ModelAnalyzer analyzer(*item);
 
   std::stringstream os;
-  analyzer.GenerateReport(debug, os);
+  analyzer.GenerateReport(debug, assume_valid_feeds, os);
   return os.str();
 }
 
 %}
 
-string GenerateModelReport(const tensorflow::MetaGraphDef& metagraph, bool debug);
+string GenerateModelReport(const tensorflow::MetaGraphDef& metagraph,
+                           bool assume_valid_feeds, bool debug);
diff --git a/tensorflow/python/grappler/model_analyzer.py b/tensorflow/python/grappler/model_analyzer.py
index 535889e1c4034952562a05e4d044fcafeddbc0ca..98cdc5785011dcebbaaf43704772b3de00c9d6ca 100644
--- a/tensorflow/python/grappler/model_analyzer.py
+++ b/tensorflow/python/grappler/model_analyzer.py
@@ -22,11 +22,12 @@ from tensorflow.python import pywrap_tensorflow as tf_wrap
 from tensorflow.python.framework import errors
 
 
-def GenerateModelReport(metagraph, debug=False):
+def GenerateModelReport(metagraph, assume_valid_feeds=True, debug=False):
   """Report what's known statically about each node in the provided metagraph.
 
   Args:
     metagraph: A TensorFlow MetaGraphDef.
+    assume_valid_feeds: If True, assume that the shape of the fed nodes is valid
     debug: Add some information useful for debugging.
 
   Returns:
@@ -34,6 +35,6 @@ def GenerateModelReport(metagraph, debug=False):
   """
   with errors.raise_exception_on_not_ok_status():
     ret_from_swig = tf_wrap.GenerateModelReport(metagraph.SerializeToString(),
-                                                debug)
+                                                assume_valid_feeds, debug)
 
   return ret_from_swig
diff --git a/tensorflow/python/grappler/tf_optimizer.i b/tensorflow/python/grappler/tf_optimizer.i
index de9326ccfc1653c2afd0833dcdca2cc4bfdabed5..39ca71e99af06c19fb7fe5bf185c29106729f5e9 100644
--- a/tensorflow/python/grappler/tf_optimizer.i
+++ b/tensorflow/python/grappler/tf_optimizer.i
@@ -98,7 +98,6 @@ PyObject* TF_OptimizeGraph(
       const tensorflow::MetaGraphDef& metagraph,
       bool verbose, const string& graph_id, TF_Status* out_status) {
     tensorflow::grappler::ItemConfig item_config;
-    item_config.inline_functions = false;
     item_config.apply_optimizations = false;
     item_config.ignore_user_placement = false;
     std::unique_ptr<tensorflow::grappler::GrapplerItem> grappler_item =
diff --git a/tensorflow/python/grappler/tf_optimizer_test.py b/tensorflow/python/grappler/tf_optimizer_test.py
index 3ee4d7807ea5677a742514eb56267b94c6b92bba..1c0f072dd32d38f048cfa48d38b45264951d095e 100644
--- a/tensorflow/python/grappler/tf_optimizer_test.py
+++ b/tensorflow/python/grappler/tf_optimizer_test.py
@@ -17,12 +17,16 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-
 from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.grappler import item as gitem
 from tensorflow.python.grappler import tf_optimizer
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
@@ -74,6 +78,47 @@ class PyWrapOptimizeGraphTest(test.TestCase):
     self.assertEqual(a2.op.name, optimized_graph.node[3].name)
     self.assertEqual('Variable/Assign', optimized_graph.node[4].name)
 
+  def testLoops(self):
+    g = ops.Graph()
+    with g.as_default():
+
+      def _Cond(_, counter):
+        return counter < end
+
+      def _Body(buf, counter):
+        buf = array_ops.concat([buf, [counter]], 0)
+        counter += 1
+        return [buf, counter]
+
+      start = array_ops.placeholder(shape=[], dtype=dtypes.int32)
+      end = array_ops.placeholder(shape=[], dtype=dtypes.int32)
+      init_buf = array_ops.zeros(shape=[0], dtype=dtypes.int32)
+      loop_vars = [init_buf, start]
+      shape_inv = [
+          tensor_shape.TensorShape([None]),
+          tensor_shape.TensorShape([])
+      ]
+      buf, _ = control_flow_ops.while_loop(_Cond, _Body, loop_vars, shape_inv)
+
+      f = -array_ops.ones_like(buf, optimize=False)
+      buf_shape = array_ops.shape(buf)
+      f_shape = array_ops.shape(f)
+      ops.add_to_collection('train_op', buf_shape)
+      ops.add_to_collection('train_op', f_shape)
+
+    # Optimize the graph.
+    mg = meta_graph.create_meta_graph_def(graph=g)
+    rewriter_config = rewriter_config_pb2.RewriterConfig()
+    optimized_graph = tf_optimizer.OptimizeGraph(rewriter_config, mg)
+    mg.graph_def.CopyFrom(optimized_graph)
+
+    # Check that the nodes referenced in various collections have been preserved
+    item = gitem.Item(mg)
+    props = item.GetOpProperties()
+    buf_prop = props[buf.op.name]
+    f_prop = props[f.op.name]
+    self.assertEqual(buf_prop, f_prop)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
index f47be178864a12249bbe6440fc02f2381903776d..57f5097639564eabadf848584fd68fb674511bda 100755
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -8,6 +8,7 @@ exports_files(["LICENSE"])
 package(default_visibility = ["//visibility:public"])
 
 load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 
 config_setting(
     name = "empty_condition",
@@ -401,11 +402,10 @@ py_test(
 
 py_test(
     name = "convolutional_recurrent_test",
-    size = "medium",
+    size = "large",
     srcs = ["_impl/keras/layers/convolutional_recurrent_test.py"],
     shard_count = 2,
     srcs_version = "PY2AND3",
-    tags = ["noasan"],  # times out b/63678675
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
@@ -612,6 +612,7 @@ py_test(
         "no_windows",
         "noasan",  # times out
         "notsan",
+        "optonly",  # times out
     ],
     deps = [
         ":keras",
@@ -659,16 +660,17 @@ py_test(
     ],
 )
 
-py_test(
+cuda_py_test(
     name = "multi_gpu_utils_test",
-    size = "medium",
     srcs = ["_impl/keras/utils/multi_gpu_utils_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["multi_gpu"],
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
+    tags = [
+        "guitar",
+        "multi_gpu",
     ],
 )
 
@@ -777,6 +779,9 @@ py_test(
     size = "small",
     srcs = ["_impl/keras/engine/topology_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        "no-internal-py3",
+    ],
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
@@ -866,15 +871,3 @@ py_library(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/keras/_impl/keras/activations.py b/tensorflow/python/keras/_impl/keras/activations.py
index 236e17653e1b762e1e6962f453b714d1bf7bcbf7..b518898ad8fe2fe7b859ec80714d610242d621dc 100644
--- a/tensorflow/python/keras/_impl/keras/activations.py
+++ b/tensorflow/python/keras/_impl/keras/activations.py
@@ -23,6 +23,8 @@ import six
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.layers.base import Layer
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import tf_export
 
@@ -43,10 +45,10 @@ def softmax(x, axis=-1):
   """
   ndim = K.ndim(x)
   if ndim == 2:
-    return K.softmax(x)
+    return nn.softmax(x)
   elif ndim > 2:
-    e = K.exp(x - K.max(x, axis=axis, keepdims=True))
-    s = K.sum(e, axis=axis, keepdims=True)
+    e = math_ops.exp(x - math_ops.reduce_max(x, axis=axis, keepdims=True))
+    s = math_ops.reduce_sum(e, axis=axis, keepdims=True)
     return e / s
   else:
     raise ValueError('Cannot apply softmax to a tensor that is 1D')
@@ -79,12 +81,12 @@ def selu(x):
 
 @tf_export('keras.activations.softplus')
 def softplus(x):
-  return K.softplus(x)
+  return nn.softplus(x)
 
 
 @tf_export('keras.activations.softsign')
 def softsign(x):
-  return K.softsign(x)
+  return nn.softsign(x)
 
 
 @tf_export('keras.activations.relu')
@@ -94,12 +96,12 @@ def relu(x, alpha=0., max_value=None):
 
 @tf_export('keras.activations.tanh')
 def tanh(x):
-  return K.tanh(x)
+  return nn.tanh(x)
 
 
 @tf_export('keras.activations.sigmoid')
 def sigmoid(x):
-  return K.sigmoid(x)
+  return nn.sigmoid(x)
 
 
 @tf_export('keras.activations.hard_sigmoid')
diff --git a/tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py b/tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py
index c26a28ed4087e30968585ec8ac0b64b51513bcae..d928a7afdc639485d443be382420cac09ba9abd6 100644
--- a/tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py
+++ b/tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py
@@ -22,8 +22,10 @@ import json
 
 import numpy as np
 
+from tensorflow.python.framework import constant_op
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import tf_export
 
@@ -151,11 +153,11 @@ def _preprocess_symbolic_input(x, data_format, mode):
     std = None
 
   if _IMAGENET_MEAN is None:
-    _IMAGENET_MEAN = K.constant(-np.array(mean))
+    _IMAGENET_MEAN = constant_op.constant(-np.array(mean), dtype=K.floatx())
 
   # Zero-center by mean pixel
   if K.dtype(x) != K.dtype(_IMAGENET_MEAN):
-    x = K.bias_add(x, K.cast(_IMAGENET_MEAN, K.dtype(x)), data_format)
+    x = K.bias_add(x, math_ops.cast(_IMAGENET_MEAN, K.dtype(x)), data_format)
   else:
     x = K.bias_add(x, _IMAGENET_MEAN, data_format)
   if std is not None:
diff --git a/tensorflow/python/keras/_impl/keras/backend.py b/tensorflow/python/keras/_impl/keras/backend.py
index 04866fbe0f48dabd57fd0336258420765b24a793..3aac6a9065cfa6189db1a3d3b33648dc980161b6 100644
--- a/tensorflow/python/keras/_impl/keras/backend.py
+++ b/tensorflow/python/keras/_impl/keras/backend.py
@@ -34,6 +34,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes as dtypes_module
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.layers import base as tf_base_layers
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
@@ -2795,6 +2796,8 @@ class Function(object):
     else:
       feed_dict = {}
 
+    session = get_session()
+    data_tensors_to_feed = []
     for tensor, value in zip(self.inputs, inputs):
       if value is None:
         continue
@@ -2803,9 +2806,20 @@ class Function(object):
         indices = np.concatenate((np.expand_dims(sparse_coo.row, 1),
                                   np.expand_dims(sparse_coo.col, 1)), 1)
         value = (indices, sparse_coo.data, sparse_coo.shape)
-      feed_dict[tensor] = value
+      elif tensor_util.is_tensor(value):
+        data_tensors_to_feed.append((tensor, value))
+      else:
+        feed_dict[tensor] = value
+
+    if data_tensors_to_feed:
+      # This is a *temporary* workaround (i.e. hack) to feed a symbolic tensor
+      # to `feed_dict`. It is very inefficient. It will be removed as soon
+      # as it becomes possible to pass symbolic tensors to `feed_dict`.
+      data_tensor_values = session.run([x[1] for x in data_tensors_to_feed])
+      for i, v in enumerate(data_tensor_values):
+        feed_dict[data_tensors_to_feed[i][0]] = v
+
     fetches = self.outputs + [self.updates_op] + self.fetches
-    session = get_session()
     updated = session.run(
         fetches=fetches, feed_dict=feed_dict, **self.session_kwargs)
     return updated[:len(self.outputs)]
@@ -3373,7 +3387,7 @@ def categorical_crossentropy(target, output, from_logits=False):
         target * math_ops.log(output),
         axis=len(output.get_shape()) - 1)
   else:
-    return nn.softmax_cross_entropy_with_logits(labels=target, logits=output)
+    return nn.softmax_cross_entropy_with_logits_v2(labels=target, logits=output)
 
 
 @tf_export('keras.backend.sparse_categorical_crossentropy')
diff --git a/tensorflow/python/keras/_impl/keras/constraints.py b/tensorflow/python/keras/_impl/keras/constraints.py
index 271fbbb63d3dfd50507837e190860d48315a14f2..abe95d8e0ca68b2e62f9574fba9ae912a9179fff 100644
--- a/tensorflow/python/keras/_impl/keras/constraints.py
+++ b/tensorflow/python/keras/_impl/keras/constraints.py
@@ -24,6 +24,7 @@ import six
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_keras_object
+from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -65,7 +66,8 @@ class MaxNorm(Constraint):
     self.axis = axis
 
   def __call__(self, w):
-    norms = K.sqrt(K.sum(K.square(w), axis=self.axis, keepdims=True))
+    norms = K.sqrt(
+        math_ops.reduce_sum(math_ops.square(w), axis=self.axis, keepdims=True))
     desired = K.clip(norms, 0, self.max_value)
     return w * (desired / (K.epsilon() + norms))
 
@@ -79,7 +81,7 @@ class NonNeg(Constraint):
   """
 
   def __call__(self, w):
-    return w * K.cast(K.greater_equal(w, 0.), K.floatx())
+    return w * math_ops.cast(math_ops.greater_equal(w, 0.), K.floatx())
 
 
 @tf_export('keras.constraints.UnitNorm', 'keras.constraints.unit_norm')
@@ -105,7 +107,9 @@ class UnitNorm(Constraint):
 
   def __call__(self, w):
     return w / (
-        K.epsilon() + K.sqrt(K.sum(K.square(w), axis=self.axis, keepdims=True)))
+        K.epsilon() + K.sqrt(
+            math_ops.reduce_sum(
+                math_ops.square(w), axis=self.axis, keepdims=True)))
 
   def get_config(self):
     return {'axis': self.axis}
@@ -148,7 +152,8 @@ class MinMaxNorm(Constraint):
     self.axis = axis
 
   def __call__(self, w):
-    norms = K.sqrt(K.sum(K.square(w), axis=self.axis, keepdims=True))
+    norms = K.sqrt(
+        math_ops.reduce_sum(math_ops.square(w), axis=self.axis, keepdims=True))
     desired = (
         self.rate * K.clip(norms, self.min_value, self.max_value) +
         (1 - self.rate) * norms)
diff --git a/tensorflow/python/keras/_impl/keras/engine/base_layer.py b/tensorflow/python/keras/_impl/keras/engine/base_layer.py
index 5615241ae3077102ef40f9c0619161964a62a335..755607aafb9343f0c9f10c5f9394bedc2f8afd76 100644
--- a/tensorflow/python/keras/_impl/keras/engine/base_layer.py
+++ b/tensorflow/python/keras/_impl/keras/engine/base_layer.py
@@ -19,6 +19,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import inspect  # Necessary supplement to tf_inspect to deal with variadic args.
+
 from six.moves import zip  # pylint: disable=redefined-builtin
 
 from tensorflow.python.eager import context
@@ -30,6 +32,8 @@ from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.utils import generic_utils
 from tensorflow.python.layers import base as tf_base_layers
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_inspect
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -143,6 +147,7 @@ class Layer(tf_base_layers.Layer):
     super(Layer, self).__init__(
         name=name, dtype=dtype, trainable=trainable,
         activity_regularizer=kwargs.get('activity_regularizer'))
+    self._uses_inputs_arg = True
 
     # Add properties that are Keras-only for now.
     self.supports_masking = False
@@ -213,7 +218,71 @@ class Layer(tf_base_layers.Layer):
     """
     return inputs
 
-  def __call__(self, inputs, **kwargs):
+  def _inputs_from_call_args(self, call_args, call_kwargs):
+    """Get Layer inputs from __call__ *args and **kwargs.
+
+    Args:
+      call_args: The positional arguments passed to __call__.
+      call_kwargs: The keyword argument dict passed to __call__.
+
+    Returns:
+      A tuple of (inputs, non_input_kwargs). These may be the same objects as
+      were passed in (call_args and call_kwargs).
+    """
+    if getattr(self, '_uses_inputs_arg', True):
+      assert len(call_args) == 1  # TypeError raised earlier in __call__.
+      return call_args[0], call_kwargs
+    else:
+      call_arg_spec = tf_inspect.getargspec(self.call)
+      # There is no explicit "inputs" argument expected or provided to
+      # call(). Arguments which have default values are considered non-inputs,
+      # and arguments without are considered inputs.
+      if call_arg_spec.defaults:
+        if call_arg_spec.varargs is not None:
+          raise TypeError(
+              'Layer.call() may not accept both *args and arguments with '
+              'default values (unable to determine which are inputs to the '
+              'Layer).')
+        keyword_arg_names = set(
+            call_arg_spec.args[-len(call_arg_spec.defaults):])
+      else:
+        keyword_arg_names = set()
+        # Training is never an input argument name, to allow signatures like
+        # call(x, training).
+      keyword_arg_names.add('training')
+      _, unwrapped_call = tf_decorator.unwrap(self.call)
+      bound_args = inspect.getcallargs(
+          unwrapped_call, *call_args, **call_kwargs)
+      if call_arg_spec.keywords is not None:
+        var_kwargs = bound_args.pop(call_arg_spec.keywords)
+        bound_args.update(var_kwargs)
+        keyword_arg_names = keyword_arg_names.union(var_kwargs.keys())
+      all_args = call_arg_spec.args
+      if all_args and bound_args[all_args[0]] is self:
+        # Ignore the 'self' argument of methods
+        bound_args.pop(call_arg_spec.args[0])
+        all_args = all_args[1:]
+      non_input_arg_values = {}
+      input_arg_values = []
+      remaining_args_are_keyword = False
+      for argument_name in all_args:
+        if argument_name in keyword_arg_names:
+          remaining_args_are_keyword = True
+        else:
+          if remaining_args_are_keyword:
+            raise TypeError(
+                'Found a positional argument to call() after a non-input '
+                'argument. All arguments after "training" must be keyword '
+                'arguments, and are not tracked as inputs to the Layer.')
+        if remaining_args_are_keyword:
+          non_input_arg_values[argument_name] = bound_args[argument_name]
+        else:
+          input_arg_values.append(bound_args[argument_name])
+      if call_arg_spec.varargs is not None:
+        input_arg_values.extend(bound_args[call_arg_spec.varargs])
+      return input_arg_values, non_input_arg_values
+
+  def __call__(self, inputs, *args, **kwargs):
     """Wrapper around self.call(), for handling internal references.
 
     If a Keras tensor is passed:
@@ -226,6 +295,10 @@ class Layer(tf_base_layers.Layer):
 
     Arguments:
         inputs: Can be a tensor or list/tuple of tensors.
+        *args: Additional positional arguments to be passed to `call()`. Only
+          allowed in subclassed Models with custom call() signatures. In other
+          cases, `Layer` inputs must be passed using the `inputs` argument and
+          non-inputs must be keyword arguments.
         **kwargs: Additional keyword arguments to be passed to `call()`.
 
     Returns:
@@ -234,12 +307,25 @@ class Layer(tf_base_layers.Layer):
     Raises:
         ValueError: in case the layer is missing shape information
             for its `build` call.
+        TypeError: If positional arguments are passed and this `Layer` is not a
+            subclassed `Model`.
     """
     # Actually call the layer (optionally building it).
-    output = super(Layer, self).__call__(inputs, **kwargs)
+    output = super(Layer, self).__call__(inputs, *args, **kwargs)
+
+    if args and getattr(self, '_uses_inputs_arg', True):
+      raise TypeError(
+          'This Layer takes an `inputs` argument to call(), and only the '
+          '`inputs` argument may be specified as a positional argument. Pass '
+          'everything else as a keyword argument (those arguments will not be '
+          'tracked as inputs to the Layer).')
+
     if context.executing_eagerly():
       return output
 
+    inputs, kwargs = self._inputs_from_call_args(
+        call_args=(inputs,) + args, call_kwargs=kwargs)
+
     if hasattr(self, '_symbolic_set_inputs') and not self.inputs:
       # Subclassed network: explicitly set metadata normally set by a call to
       # self._set_inputs().
diff --git a/tensorflow/python/keras/_impl/keras/engine/network.py b/tensorflow/python/keras/_impl/keras/engine/network.py
index bf8239043874794c6617937cfc9c619d743502a9..9f1c7de1157a3659ccb27e4850e99e09016d0067 100644
--- a/tensorflow/python/keras/_impl/keras/engine/network.py
+++ b/tensorflow/python/keras/_impl/keras/engine/network.py
@@ -92,7 +92,9 @@ class Network(base_layer.Layer):
     self._expects_training_arg = False
 
     self.supports_masking = False
-    self.optimizer = None
+    if not hasattr(self, 'optimizer'):
+      # Don't reset optimizer if already set.
+      self.optimizer = None
 
     # Private attributes to implement compatibility with Layer.
     self._updates = []  # Used in symbolic mode only.
@@ -115,6 +117,7 @@ class Network(base_layer.Layer):
     self._inbound_nodes = []
 
   def _init_graph_network(self, inputs, outputs, name=None):
+    self._uses_inputs_arg = True
     # Normalize and set self.inputs, self.outputs.
     if isinstance(inputs, (list, tuple)):
       self.inputs = list(inputs)  # Tensor or list of tensors.
@@ -188,17 +191,6 @@ class Network(base_layer.Layer):
     self.built = True
     self._is_graph_network = True
 
-    # # List of initial layers (1 to 1 mapping with self.inputs,
-    # # hence the same layer might appear twice)
-    # self._input_layers = []
-    # self._input_layers_node_indices = []
-    # self._input_layers_tensor_indices = []
-    # # list of layers (1 to 1 mapping with self.inputs,
-    # # hence the same layer might appear twice)
-    # self._output_layers = []
-    # self._output_layers_node_indices = []
-    # self._output_layers_tensor_indices = []
-
     self._input_layers = []
     self._output_layers = []
     self._input_coordinates = []
@@ -283,11 +275,15 @@ class Network(base_layer.Layer):
   def _init_subclassed_network(self, name=None):
     self._base_init(name=name)
     self._is_graph_network = False
-    if 'training' in tf_inspect.getargspec(self.call).args:
+    call_args = tf_inspect.getargspec(self.call).args
+    if 'training' in call_args:
       self._expects_training_arg = True
     else:
       self._expects_training_arg = False
-
+    if 'inputs' in call_args:
+      self._uses_inputs_arg = True
+    else:
+      self._uses_inputs_arg = False
     self.outputs = None
     self.inputs = None
     self.built = False
@@ -1223,9 +1219,6 @@ class Network(base_layer.Layer):
     Returns:
         A JSON string.
     """
-    if not self._is_graph_network:
-      raise NotImplementedError
-
     def get_json_type(obj):
       # If obj is any numpy type
       if type(obj).__module__ == np.__name__:
@@ -1260,9 +1253,6 @@ class Network(base_layer.Layer):
     Raises:
         ImportError: if yaml module is not found.
     """
-    if not self._is_graph_network:
-      raise NotImplementedError
-
     if yaml is None:
       raise ImportError('Requires yaml module installed.')
     return yaml.dump(self._updated_config(), **kwargs)
diff --git a/tensorflow/python/keras/_impl/keras/engine/saving_test.py b/tensorflow/python/keras/_impl/keras/engine/saving_test.py
index 4a18cc2e119d7cfb3f15da593d4944abd445905b..dde090120456f968267e1c572f22eda1bd6ed7c4 100644
--- a/tensorflow/python/keras/_impl/keras/engine/saving_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/saving_test.py
@@ -340,7 +340,7 @@ class TestWholeModelSaving(test.TestCase):
       model.add(keras.layers.Dense(2, input_shape=(3,)))
       model.add(keras.layers.Dense(3))
       model.compile(loss='mse', optimizer='sgd', metrics=['acc'])
-      model.model._make_train_function()
+      model._make_train_function()
 
       fd, fname = tempfile.mkstemp('.h5')
       keras.models.save_model(model, fname)
diff --git a/tensorflow/python/keras/_impl/keras/engine/sequential.py b/tensorflow/python/keras/_impl/keras/engine/sequential.py
index db5e7754bc22ba360dbf635f1bd80334f58e8509..2ef99d5ab3f432058fdf685b99b01aa0b5eeffdc 100644
--- a/tensorflow/python/keras/_impl/keras/engine/sequential.py
+++ b/tensorflow/python/keras/_impl/keras/engine/sequential.py
@@ -20,26 +20,18 @@ from __future__ import division
 from __future__ import print_function
 
 import copy
-import os
 
-from tensorflow.python.framework import ops
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import layers as layer_module
 from tensorflow.python.keras._impl.keras.engine import base_layer
 from tensorflow.python.keras._impl.keras.engine import network
-from tensorflow.python.keras._impl.keras.engine import saving
 from tensorflow.python.keras._impl.keras.engine.input_layer import Input
 from tensorflow.python.keras._impl.keras.engine.input_layer import InputLayer
 from tensorflow.python.keras._impl.keras.engine.training import Model
-from tensorflow.python.keras._impl.keras.utils.io_utils import ask_to_proceed_with_overwrite
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import checkpointable
 from tensorflow.python.util.tf_export import tf_export
 
-try:
-  import h5py  # pylint: disable=g-import-not-at-top
-except ImportError:
-  h5py = None
-
 
 @tf_export('keras.models.Sequential', 'keras.Sequential')
 class Sequential(Model):
@@ -48,77 +40,75 @@ class Sequential(Model):
   Arguments:
       layers: list of layers to add to the model.
 
-  # Note
-      The first layer passed to a Sequential model
-      should have a defined input shape. What that
-      means is that it should have received an `input_shape`
-      or `batch_input_shape` argument,
-      or for some type of layers (recurrent, Dense...)
-      an `input_dim` argument.
-
   Example:
 
-      ```python
-          model = Sequential()
-          # first layer must have a defined input shape
-          model.add(Dense(32, input_dim=500))
-          # afterwards, Keras does automatic shape inference
-          model.add(Dense(32))
-
-          # also possible (equivalent to the above):
-          model = Sequential()
-          model.add(Dense(32, input_shape=(500,)))
-          model.add(Dense(32))
-
-          # also possible (equivalent to the above):
-          model = Sequential()
-          # here the batch dimension is None,
-          # which means any batch size will be accepted by the model.
-          model.add(Dense(32, batch_input_shape=(None, 500)))
-          model.add(Dense(32))
-      ```
+  ```python
+  # Optionally, the first layer can receive an `input_shape` argument:
+  model = Sequential()
+  model.add(Dense(32, input_shape=(500,)))
+  # Afterwards, we do automatic shape inference:
+  model.add(Dense(32))
+
+  # This is identical to the following:
+  model = Sequential()
+  model.add(Dense(32, input_dim=500))
+
+  # And to the following:
+  model = Sequential()
+  model.add(Dense(32, batch_input_shape=(None, 500)))
+
+  # Note that you can also omit the `input_shape` argument:
+  # In that case the model gets built the first time you call `fit` (or other
+  # training and evaluation methods).
+  model = Sequential()
+  model.add(Dense(32))
+  model.add(Dense(32))
+  model.compile(optimizer=optimizer, loss=loss)
+  # This builds the model for the first time:
+  model.fit(x, y, batch_size=32, epochs=10)
+
+  # Note that when using this delayed-build pattern (no input shape specified),
+  # the model doesn't have any weights until the first call
+  # to a training/evaluation method (since it isn't yet built):
+  model = Sequential()
+  model.add(Dense(32))
+  model.add(Dense(32))
+  model.weights  # returns []
+
+  # Whereas if you specify the input shape, the model gets built continuously
+  # as you are adding layers:
+  model = Sequential()
+  model.add(Dense(32, input_shape=(500,)))
+  model.add(Dense(32))
+  model.weights  # returns list of length 4
+
+  When using the delayed-build pattern (no input shape specified), you can
+  choose to manually build your model by calling `build(batch_input_shape)`:
+  model = Sequential()
+  model.add(Dense(32))
+  model.add(Dense(32))
+  model.build((None, 500))
+  model.weights  # returns list of length 4
+  ```
   """
 
   def __init__(self, layers=None, name=None):
-    self._is_graph_network = True
-    self._is_compiled = False
-    self._layers = []  # Stack of layers.
-    self.model = None  # Internal Model instance.
-    self.inputs = []  # List of input tensors
-    self.outputs = []  # List of length 1: the output tensor (unique).
-    self._trainable = True
-    self._initial_weights = None
-    self._input_layers = []
-
-    # Model attributes.
-    self._inbound_nodes = []
-    self._outbound_nodes = []
-    self.built = False
-
-    # Set model name.
-    if not name:
-      prefix = 'sequential_'
-      name = prefix + str(K.get_uid(prefix))
-    self._name = name
-
-    # Used by Layer base class.
-    self._dtype = None
-    self._activity_regularizer = None
-
-    # The following properties are not actually used by Keras;
-    # they exist for compatibility with TF's variable scoping mechanism.
-    self._updates = []
-    self._losses = []
-    self._scope = None
-    self._reuse = None
-    self._base_name = name
-    self._graph = ops.get_default_graph()
+    super(Sequential, self).__init__(name=name)
 
     # Add to the model any layers passed to the constructor.
     if layers:
       for layer in layers:
         self.add(layer)
 
+  @property
+  def layers(self):
+    # Historically, `sequential.layers` only returns layers that were added
+    # via `add`, and omits the auto-generated `InputLayer` that comes at the
+    # bottom of the stack.
+    if self._layers and isinstance(self._layers[0], InputLayer):
+      return self._layers[1:]
+    return self._layers
+
   def add(self, layer):
     """Adds a layer instance on top of the layer stack.
 
@@ -137,11 +127,14 @@ class Sequential(Model):
       raise TypeError('The added layer must be '
                       'an instance of class Layer. '
                       'Found: ' + str(layer))
-    if not self.outputs:
+    self.built = False
+    if not self._layers:
+      set_inputs = False
       # First layer in model: check that it is an input layer.
       if not isinstance(layer, InputLayer):
-        # Create an input layer.
-        # First, we need to infer its expected input shape and dtype.
+        # Create an input tensor and call `layer` on the input tensor.
+        # First, we need to infer the expected input shape and dtype.
+        first_layer = layer
         if isinstance(layer, (Model, Sequential)):
           # We were passed a model as first layer.
           # This requires a specific way to figure out the
@@ -156,42 +149,39 @@ class Sequential(Model):
             first_layer = first_layer.layers[0]
           batch_shape = first_layer._batch_input_shape
           dtype = first_layer.dtype
+
+        if hasattr(first_layer, '_batch_input_shape'):
+          batch_shape = first_layer._batch_input_shape
+          dtype = first_layer.dtype
+          # Instantiate the input layer.
+          x = Input(
+              batch_shape=batch_shape,
+              dtype=dtype,
+              name=layer.name + '_input')
+          # This will build the current layer
+          # and create the node connecting the current layer
+          # to the input layer we just created.
+          layer(x)
+          set_inputs = True
         else:
-          # We were passed a regular layer, and it should
-          # know about its input shape. Otherwise, that's an error.
-          if not hasattr(layer, '_batch_input_shape'):
-            raise ValueError('The first layer in a '
-                             'Sequential model must '
-                             'get an `input_shape` argument.')
-          batch_shape = layer._batch_input_shape
-          dtype = layer.dtype
-        # Instantiate the input layer.
-        x = Input(
-            batch_shape=batch_shape, dtype=dtype, name=layer.name + '_input')
-        # This will build the current layer
-        # and create the node connecting the current layer
-        # to the input layer we just created.
-        layer(x)
-
-      if len(layer._inbound_nodes[-1].output_tensors) != 1:
-        raise ValueError('All layers in a Sequential model '
-                         'should have a single output tensor. '
-                         'For multi-output layers, '
-                         'use the functional API.')
-
-      self.outputs = [layer._inbound_nodes[-1].output_tensors[0]]
-      self.inputs = network.get_source_inputs(self.outputs[0])
-
-      # We create an input node, which we will keep updated
-      # as we add more layers
-      base_layer.Node(
-          outbound_layer=self,
-          inbound_layers=[],
-          node_indices=[],
-          tensor_indices=[],
-          input_tensors=self.inputs,
-          output_tensors=self.outputs)
-    else:
+          # The layer doesn't know about its expected shape. We will have to
+          # build the model lazily on `fit`/etc.
+          batch_shape = None
+      else:
+        # Corner case where the user passes an InputLayer layer via `add`.
+        assert len(layer._inbound_nodes[-1].output_tensors) == 1
+        set_inputs = True
+
+      if set_inputs:
+        if len(layer._inbound_nodes[-1].output_tensors) != 1:
+          raise ValueError('All layers in a Sequential model '
+                           'should have a single output tensor. '
+                           'For multi-output layers, '
+                           'use the functional API.')
+
+        self.outputs = [layer._inbound_nodes[-1].output_tensors[0]]
+        self.inputs = network.get_source_inputs(self.outputs[0])
+    elif self.outputs:
       output_tensor = layer(self.outputs[0])
       if isinstance(output_tensor, list):
         raise TypeError('All layers in a Sequential model '
@@ -199,12 +189,40 @@ class Sequential(Model):
                         'For multi-output layers, '
                         'use the functional API.')
       self.outputs = [output_tensor]
-      # update self._inbound_nodes
-      self._inbound_nodes[0].output_tensors = self.outputs
-      self._inbound_nodes[0].output_shapes = [K.int_shape(self.outputs[0])]
+    if self.inputs:
+      self.build()
+    else:
+      self._layers.append(layer)
+    # In implementing Checkpointable, Sequential does not track its Layers
+    # normally, since they may be added and removed (in pop()). Instead, it
+    # names everything on demand (gathering dependencies in
+    # _checkpoint_dependencies, and looking them up in
+    # _lookup_dependency). _handle_deferred_dependencies just checks whether an
+    # existing checkpoint load targets this Layer, it does not create a
+    # dependency on the Layer.
+    self._handle_deferred_dependencies(
+        name='layer-%d' % (len(self._layers) - 1), checkpointable=layer)
 
-    self._layers.append(layer)
-    self.built = False
+  @property
+  def _checkpoint_dependencies(self):
+    """For implementing Checkpointable. Layers which should be saved."""
+    return super(Sequential, self)._checkpoint_dependencies + [
+        checkpointable.CheckpointableReference(
+            name='layer-%d' % layer_index, ref=layer)
+        for layer_index, layer in enumerate(self._layers)]
+
+  def _lookup_dependency(self, name):
+    """For implementing Checkpointable. Looks up a Layer."""
+    super_lookup = super(Sequential, self)._lookup_dependency(name=name)
+    if super_lookup is not None:
+      return super_lookup
+    if name.startswith('layer-'):
+      try:
+        return self._layers[int(name[6:])]
+      except IndexError:
+        return None
+    else:
+      return None
 
   def pop(self):
     """Removes the last layer in the model.
@@ -215,478 +233,30 @@ class Sequential(Model):
     if not self.layers:
       raise TypeError('There are no layers in the model.')
 
-    self.layers.pop()
+    self._layers.pop()
+    self.built = False
     if not self.layers:
-      self.outputs = []
-      self._inbound_nodes = []
-      self._outbound_nodes = []
-    else:
+      self.outputs = None
+      self.inputs = None
+    elif self.outputs:
       self.layers[-1]._outbound_nodes = []
       self.outputs = [self.layers[-1].output]
-      # update self._inbound_nodes
-      self._inbound_nodes[0].output_tensors = self.outputs
-      self._inbound_nodes[0].output_shapes = [K.int_shape(self.outputs[0])]
-    self.built = False
-
-  def get_layer(self, name=None, index=None):
-    """Retrieve a layer that is part of the model.
-
-    Returns a layer based on either its name (unique)
-    or its index in the graph. Indices are based on
-    order of horizontal graph traversal (bottom-up).
-
-    Arguments:
-        name: string, name of layer.
-        index: integer, index of layer.
-
-    Returns:
-        A layer instance.
-    """
-    if not self.built:
       self.build()
-    return self.model.get_layer(name, index)
-
-  def call(self, inputs, **kwargs):
-    if not self.built:
-      self.build()
-    return self.model.call(inputs, **kwargs)
 
   def build(self, input_shape=None):
-    if not self.inputs or not self.outputs:
-      raise TypeError('Sequential model cannot be built: model is empty.'
-                      ' Add some layers first.')
-    # actually create the model
-    self.model = Model(self.inputs, self.outputs[0], name=self.name + '_model')
-    self.model.trainable = self.trainable
-
-    # mirror model attributes
-    self.supports_masking = self.model.supports_masking
-    self._output_mask_cache = self.model._output_mask_cache
-    self._output_tensor_cache = self.model._output_tensor_cache
-    self._output_shape_cache = self.model._output_shape_cache
-    self._input_layers = self.model._input_layers
-    self._output_layers = self.model._output_layers
-    self._input_coordinates = self.model._input_coordinates
-    self._output_coordinates = self.model._output_coordinates
-    self._nodes_by_depth = self.model._nodes_by_depth
-    self._network_nodes = self.model._network_nodes
-    self.output_names = self.model.output_names
-    self.input_names = self.model.input_names
-    self._feed_input_names = self.model._feed_input_names
-    self._feed_inputs = self.model._feed_inputs
-
-    # Make sure child model callbacks
-    # will call the parent Sequential model.
-    self.model.callback_model = self
-
-    self.built = True
-
-  @property
-  def uses_learning_phase(self):
-    if not self.built:
-      self.build()
-    return self.model.uses_learning_phase
-
-  def _gather_list_attr(self, attr):
-    all_attrs = []
-    for layer in self.layers:
-      all_attrs += getattr(layer, attr, [])
-    return all_attrs
-
-  def _make_train_function(self):
-    self.model._make_train_function()
-
-  def _make_test_function(self):
-    self.model._make_test_function()
-
-  def _make_predict_function(self):
-    self.model._make_predict_function()
-
-  @property
-  def trainable(self):
-    return self._trainable
-
-  @trainable.setter
-  def trainable(self, value):
-    if self.model:
-      self.model.trainable = value
-    self._trainable = value
-
-  @property
-  def trainable_weights(self):
-    if not self.trainable:
-      return []
-    return self._gather_list_attr('trainable_weights')
-
-  @property
-  def non_trainable_weights(self):
-    weights = self._gather_list_attr('non_trainable_weights')
-    if not self.trainable:
-      trainable_weights = self._gather_list_attr('trainable_weights')
-      return trainable_weights + weights
-    return weights
-
-  @property
-  def regularizers(self):
-    if not self.built:
-      self.build()
-    return self.model.regularizers
-
-  def get_weights(self):
-    """Retrieves the weights of the model.
-
-    Returns:
-        A flat list of Numpy arrays
-        (one array per model weight).
-    """
-    if not self.built:
-      self.build()
-    return self.model.get_weights()
-
-  def set_weights(self, weights):
-    """Sets the weights of the model.
-
-    Arguments:
-        weights: Should be a list
-            of Numpy arrays with shapes and types matching
-            the output of `model.get_weights()`.
-    """
-    if not self.built:
-      self.build()
-    self.model.set_weights(weights)
-
-  def load_weights(self, filepath, by_name=False):
-    if h5py is None:
-      raise ImportError('`load_weights` requires h5py.')
-    f = h5py.File(filepath, mode='r')
-    if 'layer_names' not in f.attrs and 'model_weights' in f:
-      f = f['model_weights']
-    layers = self.layers
-    if by_name:
-      saving.load_weights_from_hdf5_group_by_name(f, layers)
-    else:
-      saving.load_weights_from_hdf5_group(f, layers)
-    if hasattr(f, 'close'):
-      f.close()
-
-  def save_weights(self, filepath, overwrite=True):
-    if h5py is None:
-      raise ImportError('`save_weights` requires h5py.')
-    # If file exists and should not be overwritten:
-    if not overwrite and os.path.isfile(filepath):
-      proceed = ask_to_proceed_with_overwrite(filepath)
-      if not proceed:
-        return
-    layers = self.layers
-    f = h5py.File(filepath, 'w')
-    saving.save_weights_to_hdf5_group(f, layers)
-    f.flush()
-    f.close()
-
-  def compile(self,
-              optimizer,
-              loss,
-              metrics=None,
-              sample_weight_mode=None,
-              weighted_metrics=None,
-              target_tensors=None,
-              **kwargs):
-    """Configures the model for training.
-
-    Arguments:
-        optimizer: String (name of optimizer) or optimizer object.
-            See [optimizers](/optimizers).
-        loss: String (name of objective function) or objective function.
-            See [losses](/losses).
-            If the model has multiple outputs, you can use a different loss
-            on each output by passing a dictionary or a list of losses.
-            The loss value that will be minimized by the model
-            will then be the sum of all individual losses.
-        metrics: List of metrics to be evaluated by the model
-            during training and testing.
-            Typically you will use `metrics=['accuracy']`.
-            To specify different metrics for different outputs of a
-            multi-output model, you could also pass a dictionary,
-            such as `metrics={'output_a': 'accuracy'}`.
-        sample_weight_mode: If you need to do timestep-wise
-            sample weighting (2D weights), set this to `"temporal"`.
-            `None` defaults to sample-wise weights (1D).
-            If the model has multiple outputs, you can use a different
-            `sample_weight_mode` on each output by passing a
-            dictionary or a list of modes.
-        weighted_metrics: list of metrics to be evaluated and weighted
-             by `sample_weight` or `class_weight` during training and testing.
-        target_tensors: By default, Keras will create a placeholder for the
-            model's target, which will be fed with the target data during
-            training. If instead you would like to use your own
-            target tensor (in turn, Keras will not expect external
-            Numpy data for these targets at training time), you
-            can specify them via the `target_tensors` argument.
-            It should be a single tensor
-            (for a single-output `Sequential` model).
-        **kwargs: These arguments are passed into `tf.Session.run`.
-
-    Example:
-        ```python
-            model = Sequential()
-            model.add(Dense(32, input_shape=(500,)))
-            model.add(Dense(10, activation='softmax'))
-            model.compile(optimizer='rmsprop',
-                          loss='categorical_crossentropy',
-                          metrics=['accuracy'])
-        ```
-    """
-    # create the underlying model
-    self.build()
-    # call compile method of Model class
-    self.model.compile(
-        optimizer,
-        loss,
-        metrics=metrics,
-        sample_weight_mode=sample_weight_mode,
-        weighted_metrics=weighted_metrics,
-        target_tensors=target_tensors,
-        **kwargs)
-    self.optimizer = self.model.optimizer
-    self.loss = self.model.loss
-    self.metrics = self.model.metrics
-    self.loss_weights = self.model.loss_weights
-    self.sample_weight_mode = self.model.sample_weight_mode
-    self.weighted_metrics = self.model.weighted_metrics
-    self.targets = self.model.targets
-    self.metrics_tensors = self.model.metrics_tensors
-    self.metrics_names = self.model.metrics_names
-    self.sample_weights = self.model.sample_weights
-    self.total_loss = self.model.total_loss
-
-  def fit(self,
-          x=None,
-          y=None,
-          batch_size=None,
-          epochs=1,
-          verbose=1,
-          callbacks=None,
-          validation_split=0.,
-          validation_data=None,
-          shuffle=True,
-          class_weight=None,
-          sample_weight=None,
-          initial_epoch=0,
-          steps_per_epoch=None,
-          validation_steps=None,
-          **kwargs):
-    """Trains the model for a fixed number of epochs.
-
-    Arguments:
-        x: Numpy array of training data.
-            If the input layer in the model is named, you can also pass a
-            dictionary mapping the input name to a Numpy array.
-            `x` can be `None` (default) if feeding from
-            TensorFlow data tensors.
-        y: Numpy array of target (label) data.
-            If the output layer in the model is named, you can also pass a
-            dictionary mapping the output name to a Numpy array.
-            `y` can be `None` (default) if feeding from
-            TensorFlow data tensors.
-        batch_size: Integer or `None`.
-            Number of samples per gradient update.
-            If unspecified, it will default to 32.
-        epochs: Integer. Number of epochs to train the model.
-            An epoch is an iteration over the entire `x` and `y`
-            data provided.
-            Note that in conjunction with `initial_epoch`,
-            `epochs` is to be understood as "final epoch".
-            The model is not trained for a number of iterations
-            given by `epochs`, but merely until the epoch
-            of index `epochs` is reached.
-        verbose: 0, 1, or 2. Verbosity mode.
-            0 = silent, 1 = progress bar, 2 = one line per epoch.
-        callbacks: List of `keras.callbacks.Callback` instances.
-            List of callbacks to apply during training.
-            See [callbacks](/callbacks).
-        validation_split: Float between 0 and 1:
-            Fraction of the training data to be used as validation data.
-            The model will set apart this fraction of the training data,
-            will not train on it, and will evaluate
-            the loss and any model metrics
-            on this data at the end of each epoch.
-            The validation data is selected from the last samples
-            in the `x` and `y` data provided, before shuffling.
-        validation_data: tuple `(x_val, y_val)` or tuple
-            `(x_val, y_val, val_sample_weights)` on which to evaluate
-            the loss and any model metrics at the end of each epoch.
-            The model will not be trained on this data.
-            This will override `validation_split`.
-        shuffle: Boolean (whether to shuffle the training data
-            before each epoch) or str (for 'batch').
-            'batch' is a special option for dealing with the
-            limitations of HDF5 data; it shuffles in batch-sized chunks.
-            Has no effect when `steps_per_epoch` is not `None`.
-        class_weight: Optional dictionary mapping class indices (integers)
-            to a weight (float) value, used for weighting the loss function
-            (during training only).
-            This can be useful to tell the model to
-            "pay more attention" to samples from
-            an under-represented class.
-        sample_weight: Optional Numpy array of weights for
-            the training samples, used for weighting the loss function
-            (during training only). You can either pass a flat (1D)
-            Numpy array with the same length as the input samples
-            (1:1 mapping between weights and samples),
-            or in the case of temporal data,
-            you can pass a 2D array with shape
-            `(samples, sequence_length)`,
-            to apply a different weight to every timestep of every sample.
-            In this case you should make sure to specify
-            `sample_weight_mode="temporal"` in `compile()`.
-        initial_epoch: Epoch at which to start training
-            (useful for resuming a previous training run).
-        steps_per_epoch: Total number of steps (batches of samples)
-            before declaring one epoch finished and starting the
-            next epoch. When training with input tensors such as
-            TensorFlow data tensors, the default `None` is equal to
-            the number of unique samples in your dataset divided by
-            the batch size, or 1 if that cannot be determined.
-        validation_steps: Only relevant if `steps_per_epoch`
-            is specified. Total number of steps (batches of samples)
-            to validate before stopping.
-        **kwargs: Used for backwards compatibility support.
-
-    Returns:
-        A `History` object. Its `History.history` attribute is
-        a record of training loss values and metrics values
-        at successive epochs, as well as validation loss values
-        and validation metrics values (if applicable).
-
-    Raises:
-        RuntimeError: If the model was never compiled.
-        ValueError: In case of mismatch between the provided input data
-            and what the model expects.
-    """
-    if not self.built:
-      raise RuntimeError('The model needs to be compiled before being used.')
-    return self.model.fit(
-        x,
-        y,
-        batch_size=batch_size,
-        epochs=epochs,
-        verbose=verbose,
-        callbacks=callbacks,
-        validation_split=validation_split,
-        validation_data=validation_data,
-        shuffle=shuffle,
-        class_weight=class_weight,
-        sample_weight=sample_weight,
-        initial_epoch=initial_epoch,
-        steps_per_epoch=steps_per_epoch,
-        validation_steps=validation_steps)
-
-  def evaluate(self, x, y, batch_size=32, verbose=1, sample_weight=None):
-    """Computes the loss on some input data, batch by batch.
-
-    Arguments:
-        x: input data, as a Numpy array or list of Numpy arrays
-            (if the model has multiple inputs).
-        y: labels, as a Numpy array.
-        batch_size: integer. Number of samples per gradient update.
-        verbose: verbosity mode, 0 or 1.
-        sample_weight: sample weights, as a Numpy array.
-
-    Returns:
-        Scalar test loss (if the model has no metrics)
-        or list of scalars (if the model computes other metrics).
-        The attribute `model.metrics_names` will give you
-        the display labels for the scalar outputs.
-
-    Raises:
-        RuntimeError: if the model was never compiled.
-    """
-    if not self.built:
-      raise RuntimeError('The model needs to be compiled before being used.')
-    return self.model.evaluate(
-        x,
-        y,
-        batch_size=batch_size,
-        verbose=verbose,
-        sample_weight=sample_weight)
-
-  def predict(self, x, batch_size=32, verbose=0):
-    """Generates output predictions for the input samples.
-
-    The input samples are processed batch by batch.
-
-    Arguments:
-        x: the input data, as a Numpy array.
-        batch_size: integer.
-        verbose: verbosity mode, 0 or 1.
-
-    Returns:
-        A Numpy array of predictions.
-    """
-    if not self.built:
-      self.build()
-    return self.model.predict(x, batch_size=batch_size, verbose=verbose)
-
-  def predict_on_batch(self, x):
-    """Returns predictions for a single batch of samples.
-
-    Arguments:
-        x: input data, as a Numpy array or list of Numpy arrays
-            (if the model has multiple inputs).
-
-    Returns:
-        A Numpy array of predictions.
-    """
-    if not self.built:
-      self.build()
-    return self.model.predict_on_batch(x)
-
-  def train_on_batch(self, x, y, class_weight=None, sample_weight=None):
-    """Single gradient update over one batch of samples.
-
-    Arguments:
-        x: input data, as a Numpy array or list of Numpy arrays
-            (if the model has multiple inputs).
-        y: labels, as a Numpy array.
-        class_weight: dictionary mapping classes to a weight value,
-            used for scaling the loss function (during training only).
-        sample_weight: sample weights, as a Numpy array.
-
-    Returns:
-        Scalar training loss (if the model has no metrics)
-        or list of scalars (if the model computes other metrics).
-        The attribute `model.metrics_names` will give you
-        the display labels for the scalar outputs.
-
-    Raises:
-        RuntimeError: if the model was never compiled.
-    """
-    if not self.built:
-      raise RuntimeError('The model needs to be compiled before being used.')
-    return self.model.train_on_batch(
-        x, y, sample_weight=sample_weight, class_weight=class_weight)
-
-  def test_on_batch(self, x, y, sample_weight=None):
-    """Evaluates the model over a single batch of samples.
-
-    Arguments:
-        x: input data, as a Numpy array or list of Numpy arrays
-            (if the model has multiple inputs).
-        y: labels, as a Numpy array.
-        sample_weight: sample weights, as a Numpy array.
-
-    Returns:
-        Scalar test loss (if the model has no metrics)
-        or list of scalars (if the model computes other metrics).
-        The attribute `model.metrics_names` will give you
-        the display labels for the scalar outputs.
-
-    Raises:
-        RuntimeError: if the model was never compiled.
-    """
-    if not self.built:
-      raise RuntimeError('The model needs to be compiled before being used.')
-    return self.model.test_on_batch(x, y, sample_weight=sample_weight)
+    if input_shape and not self.inputs:
+      batch_shape = tuple(input_shape)
+      dtype = K.floatx()
+      x = Input(
+          batch_shape=batch_shape, dtype=dtype, name=self.name + '_input')
+      self.inputs = [x]
+      for layer in self._layers:
+        x = layer(x)
+      self.outputs = [x]
+
+    if self.inputs:
+      self._init_graph_network(self.inputs, self.outputs, name=self.name)
+      self.built = True
 
   def predict_proba(self, x, batch_size=32, verbose=0):
     """Generates class probability predictions for the input samples.
@@ -730,255 +300,6 @@ class Sequential(Model):
     else:
       return (proba > 0.5).astype('int32')
 
-  def fit_generator(self,
-                    generator,
-                    steps_per_epoch=None,
-                    epochs=1,
-                    verbose=1,
-                    callbacks=None,
-                    validation_data=None,
-                    validation_steps=None,
-                    class_weight=None,
-                    max_queue_size=10,
-                    workers=1,
-                    use_multiprocessing=False,
-                    shuffle=True,
-                    initial_epoch=0,
-                    **kwargs):
-    """Fits the model on data generated batch-by-batch by a Python generator.
-
-    The generator is run in parallel to the model, for efficiency.
-    For instance, this allows you to do real-time data augmentation
-    on images on CPU in parallel to training your model on GPU.
-
-    Arguments:
-        generator: A generator.
-            The output of the generator must be either
-            - a tuple (inputs, targets)
-            - a tuple (inputs, targets, sample_weights).
-            All arrays should contain the same number of samples.
-            The generator is expected to loop over its data
-            indefinitely. An epoch finishes when `steps_per_epoch`
-            batches have been seen by the model.
-        steps_per_epoch: Total number of steps (batches of samples)
-            to yield from `generator` before declaring one epoch
-            finished and starting the next epoch. It should typically
-            be equal to the number of samples of your dataset
-            divided by the batch size.
-            Optional for `Sequence`: if unspecified, will use
-            the `len(generator)` as a number of steps.
-        epochs: Integer, total number of iterations on the data.
-            Note that in conjunction with initial_epoch, the parameter
-            epochs is to be understood as "final epoch". The model is
-            not trained for n steps given by epochs, but until the
-            epoch epochs is reached.
-        verbose: Verbosity mode, 0, 1, or 2.
-        callbacks: List of callbacks to be called during training.
-        validation_data: This can be either
-            - A generator for the validation data
-            - A tuple (inputs, targets)
-            - A tuple (inputs, targets, sample_weights).
-        validation_steps: Only relevant if `validation_data`
-            is a generator.
-            Number of steps to yield from validation generator
-            at the end of every epoch. It should typically
-            be equal to the number of samples of your
-            validation dataset divided by the batch size.
-            Optional for `Sequence`: if unspecified, will use
-            the `len(validation_data)` as a number of steps.
-        class_weight: Dictionary mapping class indices to a weight
-            for the class.
-        max_queue_size: Maximum size for the generator queue
-        workers: Maximum number of processes to spin up
-        use_multiprocessing: If True, use process based threading.
-            Note that because
-            this implementation relies on multiprocessing,
-            you should not pass
-            non picklable arguments to the generator
-            as they can't be passed
-            easily to children processes.
-       shuffle: Whether to shuffle the order of the batches at
-              the beginning of each epoch. Only used with instances
-              of `Sequence` (keras.utils.Sequence).
-        initial_epoch: Epoch at which to start training
-            (useful for resuming a previous training run)
-        **kwargs: support for legacy arguments.
-
-    Returns:
-        A `History` object.
-
-    Raises:
-        RuntimeError: if the model was never compiled.
-        ValueError: In case the generator yields
-            data in an invalid format.
-
-    Example:
-
-    ```python
-        def generate_arrays_from_file(path):
-            while 1:
-                f = open(path)
-                for line in f:
-                    # create Numpy arrays of input data
-                    # and labels, from each line in the file
-                    x, y = process_line(line)
-                    yield (x, y)
-                    f.close()
-
-        model.fit_generator(generate_arrays_from_file('/my_file.txt'),
-                            steps_per_epoch=1000, epochs=10)
-    ```
-    """
-    # Legacy support
-    if 'max_q_size' in kwargs:
-      max_queue_size = kwargs.pop('max_q_size')
-      logging.warning('The argument `max_q_size` has been renamed '
-                      '`max_queue_size`. Update your method calls accordingly.')
-    if 'pickle_safe' in kwargs:
-      use_multiprocessing = kwargs.pop('pickle_safe')
-      logging.warning('The argument `pickle_safe` has been renamed '
-                      '`use_multiprocessing`. '
-                      'Update your method calls accordingly.')
-    if kwargs:
-      raise ValueError('Unrecognized keyword arguments: ' + str(kwargs))
-
-    if not self.built:
-      raise RuntimeError('The model needs to be compiled before being used.')
-    return self.model.fit_generator(
-        generator,
-        steps_per_epoch,
-        epochs,
-        verbose=verbose,
-        callbacks=callbacks,
-        validation_data=validation_data,
-        validation_steps=validation_steps,
-        class_weight=class_weight,
-        max_queue_size=max_queue_size,
-        workers=workers,
-        use_multiprocessing=use_multiprocessing,
-        shuffle=shuffle,
-        initial_epoch=initial_epoch)
-
-  def evaluate_generator(self,
-                         generator,
-                         steps=None,
-                         max_queue_size=10,
-                         workers=1,
-                         use_multiprocessing=False,
-                         **kwargs):
-    """Evaluates the model on a data generator.
-
-    The generator should return the same kind of data
-    as accepted by `test_on_batch`.
-
-    Arguments:
-        generator: Generator yielding tuples (inputs, targets)
-            or (inputs, targets, sample_weights)
-        steps: Total number of steps (batches of samples)
-            to yield from `generator` before stopping.
-            Optional for `Sequence`: if unspecified, will use
-            the `len(generator)` as a number of steps.
-        max_queue_size: maximum size for the generator queue
-        workers: maximum number of processes to spin up
-        use_multiprocessing: if True, use process based threading.
-            Note that because this implementation
-            relies on multiprocessing, you should not pass
-            non picklable arguments to the generator
-            as they can't be passed easily to children processes.
-        **kwargs: support for legacy arguments.
-
-    Returns:
-        Scalar test loss (if the model has no metrics)
-        or list of scalars (if the model computes other metrics).
-        The attribute `model.metrics_names` will give you
-        the display labels for the scalar outputs.
-
-    Raises:
-        RuntimeError: if the model was never compiled.
-        ValueError: In case the generator yields
-            data in an invalid format.
-    """
-    # Legacy support
-    if 'max_q_size' in kwargs:
-      max_queue_size = kwargs.pop('max_q_size')
-      logging.warning('The argument `max_q_size` has been renamed '
-                      '`max_queue_size`. Update your method calls accordingly.')
-    if 'pickle_safe' in kwargs:
-      use_multiprocessing = kwargs.pop('pickle_safe')
-      logging.warning('The argument `pickle_safe` has been renamed '
-                      '`use_multiprocessing`. '
-                      'Update your method calls accordingly.')
-    if kwargs:
-      raise ValueError('Unrecognized keyword arguments: ' + str(kwargs))
-
-    if not self.built:
-      raise RuntimeError('The model needs to be compiled before being used.')
-    return self.model.evaluate_generator(
-        generator,
-        steps,
-        max_queue_size=max_queue_size,
-        workers=workers,
-        use_multiprocessing=use_multiprocessing)
-
-  def predict_generator(self,
-                        generator,
-                        steps=None,
-                        max_queue_size=10,
-                        workers=1,
-                        use_multiprocessing=False,
-                        verbose=0,
-                        **kwargs):
-    """Generates predictions for the input samples from a data generator.
-
-    The generator should return the same kind of data as accepted by
-    `predict_on_batch`.
-
-    Arguments:
-        generator: generator yielding batches of input samples.
-        steps: Total number of steps (batches of samples)
-            to yield from `generator` before stopping.
-            Optional for `Sequence`: if unspecified, will use
-            the `len(generator)` as a number of steps.
-        max_queue_size: maximum size for the generator queue
-        workers: maximum number of processes to spin up
-        use_multiprocessing: if True, use process based threading.
-            Note that because this implementation
-            relies on multiprocessing, you should not pass
-            non picklable arguments to the generator
-            as they can't be passed easily to children processes.
-        verbose: verbosity mode, 0 or 1.
-        **kwargs: support for legacy arguments.
-
-    Returns:
-        A Numpy array of predictions.
-
-    Raises:
-        ValueError: In case the generator yields
-            data in an invalid format.
-    """
-    # Legacy support
-    if 'max_q_size' in kwargs:
-      max_queue_size = kwargs.pop('max_q_size')
-      logging.warning('The argument `max_q_size` has been renamed '
-                      '`max_queue_size`. Update your method calls accordingly.')
-    if 'pickle_safe' in kwargs:
-      use_multiprocessing = kwargs.pop('pickle_safe')
-      logging.warning('The argument `pickle_safe` has been renamed '
-                      '`use_multiprocessing`. '
-                      'Update your method calls accordingly.')
-    if kwargs:
-      raise ValueError('Unrecognized keyword arguments: ' + str(kwargs))
-
-    if not self.built:
-      self.build()
-    return self.model.predict_generator(
-        generator,
-        steps,
-        max_queue_size=max_queue_size,
-        workers=workers,
-        use_multiprocessing=use_multiprocessing,
-        verbose=verbose)
-
   def get_config(self):
     config = []
     for layer in self.layers:
diff --git a/tensorflow/python/keras/_impl/keras/engine/sequential_test.py b/tensorflow/python/keras/_impl/keras/engine/sequential_test.py
index 166634bd8219b831ce212ba983a4ab695b00c3b7..c9a47581df03e0fc1ad38552ba8634862435cd80 100644
--- a/tensorflow/python/keras/_impl/keras/engine/sequential_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/sequential_test.py
@@ -20,90 +20,110 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.platform import test
+from tensorflow.python.training import rmsprop
 
 
 class TestSequential(test.TestCase):
   """Most Sequential model API tests are covered in `training_test.py`.
   """
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_basic_methods(self):
     model = keras.models.Sequential()
     model.add(keras.layers.Dense(1, input_dim=2))
     model.add(keras.layers.Dropout(0.3, name='dp'))
     model.add(keras.layers.Dense(2, kernel_regularizer='l2',
                                  kernel_constraint='max_norm'))
-    model.build()
-    self.assertEqual(model.state_updates, model.model.state_updates)
+    self.assertEqual(len(model.layers), 3)
+    self.assertEqual(len(model.weights), 2 * 2)
     self.assertEqual(model.get_layer(name='dp').name, 'dp')
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_sequential_pop(self):
     num_hidden = 5
     input_dim = 3
     batch_size = 5
     num_classes = 2
-    with self.test_session():
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(num_hidden, input_dim=input_dim))
-      model.add(keras.layers.Dense(num_classes))
-      model.compile(loss='mse', optimizer='sgd')
-      x = np.random.random((batch_size, input_dim))
-      y = np.random.random((batch_size, num_classes))
-      model.fit(x, y, epochs=1)
-      model.pop()
-      self.assertEqual(len(model.layers), 1)
-      self.assertEqual(model.output_shape, (None, num_hidden))
-      model.compile(loss='mse', optimizer='sgd')
-      y = np.random.random((batch_size, num_hidden))
-      model.fit(x, y, epochs=1)
 
-      # Test popping single-layer model
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(num_hidden, input_dim=input_dim))
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(num_hidden, input_dim=input_dim))
+    model.add(keras.layers.Dense(num_classes))
+    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3))
+    x = np.random.random((batch_size, input_dim))
+    y = np.random.random((batch_size, num_classes))
+    model.fit(x, y, epochs=1)
+    model.pop()
+    self.assertEqual(len(model.layers), 1)
+    self.assertEqual(model.output_shape, (None, num_hidden))
+    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3))
+    y = np.random.random((batch_size, num_hidden))
+    model.fit(x, y, epochs=1)
+
+    # Test popping single-layer model
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(num_hidden, input_dim=input_dim))
+    model.pop()
+    self.assertEqual(model.layers, [])
+    self.assertEqual(model.outputs, None)
+
+    # Invalid use case
+    model = keras.models.Sequential()
+    with self.assertRaises(TypeError):
       model.pop()
-      self.assertEqual(len(model.layers), 0)
-      self.assertEqual(len(model.outputs), 0)
 
-      # Invalid use case
-      model = keras.models.Sequential()
-      with self.assertRaises(TypeError):
-        model.pop()
+  @tf_test_util.run_in_graph_and_eager_modes()
+  def test_sequential_deferred_build(self):
+    num_hidden = 5
+    input_dim = 3
+    batch_size = 5
+    num_classes = 2
 
+    model = keras.models.Sequential()
+    # We don't specify the input shape.
+    model.add(keras.layers.Dense(num_hidden))
+    model.add(keras.layers.Dense(num_classes))
+    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3))
+    self.assertEqual(len(model.layers), 2)
+    self.assertEqual(len(model.weights), 0)
+    self.assertFalse(model.built)
+
+    x = np.random.random((batch_size, input_dim))
+    y = np.random.random((batch_size, num_classes))
+    model.fit(x, y, epochs=1)
+    self.assertTrue(model.built)
+    self.assertEqual(model.inputs[0].get_shape().as_list(), [None, input_dim])
+    self.assertEqual(model.outputs[0].get_shape().as_list(),
+                     [None, num_classes])
+    self.assertEqual(len(model.weights), 2 * 2)
+
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_invalid_use_cases(self):
-    with self.test_session():
-      # Added objects must be layer instances
-      with self.assertRaises(TypeError):
-        model = keras.models.Sequential()
-        model.add(None)
-
-      # Added layers must have an inputs shape
-      with self.assertRaises(ValueError):
-        model = keras.models.Sequential()
-        model.add(keras.layers.Dense(1))
-
-      # Added layers cannot have multiple outputs
-      class MyLayer(keras.layers.Layer):
-
-        def call(self, inputs):
-          return [3 * inputs, 2 * inputs]
-
-        def compute_output_shape(self, input_shape):
-          return [input_shape, input_shape]
-
-      with self.assertRaises(ValueError):
-        model = keras.models.Sequential()
-        model.add(MyLayer(input_shape=(3,)))
-      with self.assertRaises(TypeError):
-        model = keras.models.Sequential()
-        model.add(keras.layers.Dense(1, input_dim=1))
-        model.add(MyLayer())
-
-      # Building empty model
+    # Added objects must be layer instances
+    with self.assertRaises(TypeError):
       model = keras.models.Sequential()
-      with self.assertRaises(TypeError):
-        model.build()
+      model.add(None)
+
+    # Added layers cannot have multiple outputs
+    class MyLayer(keras.layers.Layer):
+
+      def call(self, inputs):
+        return [3 * inputs, 2 * inputs]
+
+      def compute_output_shape(self, input_shape):
+        return [input_shape, input_shape]
 
+    with self.assertRaises(ValueError):
+      model = keras.models.Sequential()
+      model.add(MyLayer(input_shape=(3,)))
+    with self.assertRaises(TypeError):
+      model = keras.models.Sequential()
+      model.add(keras.layers.Dense(1, input_dim=1))
+      model.add(MyLayer())
+
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_nested_sequential_trainability(self):
     input_dim = 20
     num_units = 10
@@ -116,6 +136,8 @@ class TestSequential(test.TestCase):
     model.add(inner_model)
     model.add(keras.layers.Dense(num_classes))
 
+    self.assertEqual(len(model.layers), 2)
+
     self.assertEqual(len(model.trainable_weights), 4)
     inner_model.trainable = False
     self.assertEqual(len(model.trainable_weights), 2)
@@ -135,7 +157,6 @@ class TestSequential(test.TestCase):
 
       model.compile('sgd', 'mse')
       assert not model.updates
-      assert not model.model.updates
 
       x1 = model.predict(val_a)
       model.train_on_batch(val_a, val_out)
@@ -145,8 +166,11 @@ class TestSequential(test.TestCase):
       model.trainable = True
       model.compile('sgd', 'mse')
       assert model.updates
-      assert model.model.updates
 
       model.train_on_batch(val_a, val_out)
       x2 = model.predict(val_a)
       assert np.abs(np.sum(x1 - x2)) > 1e-5
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/_impl/keras/engine/topology_test.py b/tensorflow/python/keras/_impl/keras/engine/topology_test.py
index b50277c8fff917d77694903c989fd02ea98b1711..9ab4b6fdcf55cc6186b96dd4e747f3600a4f78f8 100644
--- a/tensorflow/python/keras/_impl/keras/engine/topology_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/topology_test.py
@@ -783,7 +783,7 @@ class TopologyConstructionTest(test.TestCase):
   def test_activity_regularization_with_model_composition(self):
 
     def reg(x):
-      return keras.backend.sum(x)
+      return math_ops.reduce_sum(x)
 
     net_a_input = keras.Input((2,))
     net_a = net_a_input
diff --git a/tensorflow/python/keras/_impl/keras/engine/training.py b/tensorflow/python/keras/_impl/keras/engine/training.py
index 4acb41553eda2e07962f6ac510f08988a5adb90c..71de657da81b92a2fc6b1eef9041147be6ff307e 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training.py
@@ -874,7 +874,19 @@ class Model(Network):
         whether to build the model's graph in inference mode (False), training
         mode (True), or using the Keras learning phase (None).
     """
-    if context.executing_eagerly():
+    if not getattr(self, '_uses_inputs_arg', True):
+      raise NotImplementedError(
+          'Subclassed Models without "inputs" in their call() signatures do '
+          'not yet support shape inference. File a feature request if this '
+          'limitation bothers you.')
+    if self.__class__.__name__ == 'Sequential':
+      # Note: we can't test whether the model is `Sequential` via `isinstance`
+      # since `Sequential` depends on `Model`.
+      if isinstance(inputs, list):
+        assert len(inputs) == 1
+        inputs = inputs[0]
+      self.build(input_shape=(None,) + inputs.shape[1:])
+    elif context.executing_eagerly():
       self._eager_set_inputs(inputs)
     else:
       self._symbolic_set_inputs(inputs, training=training)
@@ -1169,6 +1181,9 @@ class Model(Network):
           batch_size=batch_size)
 
     elif validation_split and 0. < validation_split < 1.:
+      if training_utils.has_symbolic_tensors(x):
+        raise ValueError('If your data is in the form of symbolic tensors, '
+                         'you cannot use `validation_split`.')
       if hasattr(x[0], 'shape'):
         split_at = int(x[0].shape[0] * (1. - validation_split))
       else:
@@ -1581,9 +1596,9 @@ class Model(Network):
         ValueError: In case the generator yields
             data in an invalid format.
     """
-    if not self._is_graph_network:
+    if not self.built and not self._is_graph_network:
       raise NotImplementedError(
-          '`fit_generator` is not yet enabled for Model subclasses')
+          '`fit_generator` is not yet enabled for unbuilt Model subclasses')
 
     return training_generator.fit_generator(
         self,
@@ -1647,9 +1662,10 @@ class Model(Network):
         ValueError: In case the generator yields
             data in an invalid format.
     """
-    if not self._is_graph_network:
+    if not self.built and not self._is_graph_network:
       raise NotImplementedError(
-          '`evaluate_generator` is not yet enabled for Model subclasses')
+          '`evaluate_generator` is not yet enabled for '
+          'unbuilt Model subclasses')
 
     return training_generator.evaluate_generator(
         self,
@@ -1700,9 +1716,9 @@ class Model(Network):
         ValueError: In case the generator yields
             data in an invalid format.
     """
-    if not self._is_graph_network:
+    if not self.built and not self._is_graph_network:
       raise NotImplementedError(
-          '`predict_generator` is not yet enabled for Model subclasses')
+          '`predict_generator` is not yet enabled for unbuilt Model subclasses')
 
     return training_generator.predict_generator(
         self,
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_eager.py b/tensorflow/python/keras/_impl/keras/engine/training_eager.py
index 67858a578c5c95b3099e1e6713f3287748fc861f..4cdb5f108a05bb88ed328ca20351914160906e86 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_eager.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_eager.py
@@ -31,9 +31,8 @@ from tensorflow.python.keras._impl.keras import callbacks as cbks
 from tensorflow.python.keras._impl.keras import losses
 from tensorflow.python.keras._impl.keras import metrics as metrics_module
 from tensorflow.python.keras._impl.keras.engine import training_utils
-from tensorflow.python.keras._impl.keras.utils.generic_utils import make_batches
-from tensorflow.python.keras._impl.keras.utils.generic_utils import Progbar
-from tensorflow.python.keras._impl.keras.utils.generic_utils import slice_arrays
+from tensorflow.python.keras._impl.keras.utils import generic_utils
+from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import tf_logging as logging
 
 
@@ -173,6 +172,41 @@ def _model_loss(model, inputs, targets, sample_weights=None, training=False):
   return outs, total_loss, loss_metrics
 
 
+def slice_arrays(arrays, indices, contiguous=True):
+  """Slices batches out of provided arrays (workaround for eager tensors).
+
+  Unfortunately eager tensors don't have the same slicing behavior as
+  Numpy arrays (they folow  the same slicing behavior as symbolic TF tensors),
+  hence we cannot use `generic_utils.slice_arrays` directly
+  and we have to implement this workaround based on `concat`. This has a
+  performance cost.
+
+  Arguments:
+    arrays: Single array or list of arrays.
+    indices: List of indices in the array that should be included in the output
+      batch.
+    contiguous: Boolean flag indicating whether the indices are contiguous.
+
+  Returns:
+    Slice of data (either single array or list of arrays).
+  """
+  if any(tensor_util.is_tensor(x) for x in arrays):
+    converted_to_list = False
+    if not isinstance(arrays, list):
+      converted_to_list = True
+      arrays = [arrays]
+    if not contiguous:
+      entries = [[x[i:i + 1] for i in indices] for x in arrays]
+      slices = [array_ops.concat(x, axis=0) for x in entries]
+    else:
+      slices = [x[indices[0]:indices[-1] + 1] for x in arrays]
+    if converted_to_list:
+      slices = slices[0]
+    return slices
+  else:
+    return generic_utils.slice_arrays(arrays, indices)
+
+
 def _process_single_batch(model,
                           inputs,
                           targets,
@@ -270,9 +304,8 @@ def test_on_batch(model, inputs, targets, sample_weights=None):
       model, inputs, targets, sample_weights=sample_weights, training=False)
   if not isinstance(outs, list):
     outs = [outs]
-  metric_names, metrics_results = _eager_metrics_fn(
+  _, metrics_results = _eager_metrics_fn(
       model, outs, targets)
-  model.metrics_names.append(metric_names)
   if not isinstance(loss, list):
     loss = [loss]
   return loss + loss_metrics + metrics_results
@@ -328,6 +361,12 @@ def fit_loop(
   Raises:
     ValueError: In case of invalid argument values.
   """
+  if not batch_size:
+    raise ValueError('With eager execution, `batch_size` should be specified.')
+  if steps_per_epoch or validation_steps:
+    raise ValueError('With eager execution, `steps_per_epoch` and '
+                     '`validation_steps` are not valid arguments '
+                     '(set `batch_size` instead).')
   # Required for Eager mode
   with backend.learning_phase_scope(1):
     do_validation = False
@@ -410,15 +449,18 @@ def fit_loop(
       elif shuffle:
         np.random.shuffle(index_array)
 
-      batches = make_batches(num_train_samples, batch_size)
+      batches = generic_utils.make_batches(num_train_samples, batch_size)
 
       for batch_index, (batch_start, batch_end) in enumerate(batches):
         batch_ids = index_array[batch_start:batch_end]
         try:
-          inputs_batch = slice_arrays(inputs, batch_ids)
-          targets_batch = slice_arrays(targets, batch_ids)
+          inputs_batch = slice_arrays(inputs, batch_ids,
+                                      contiguous=not shuffle)
+          targets_batch = slice_arrays(targets, batch_ids,
+                                       contiguous=not shuffle)
           if sample_weights:
-            sample_weights_batch = slice_arrays(sample_weights, batch_ids)
+            sample_weights_batch = slice_arrays(sample_weights, batch_ids,
+                                                contiguous=not shuffle)
           else:
             sample_weights_batch = None
         except TypeError:
@@ -539,8 +581,8 @@ def test_loop(model, inputs, targets,
         feed_data, batch_size=batch_size, steps=steps, steps_name='steps')
     outs = []
     if verbose == 1:
-      progbar = Progbar(target=num_samples)
-    batches = make_batches(num_samples, batch_size)
+      progbar = generic_utils.Progbar(target=num_samples)
+    batches = generic_utils.make_batches(num_samples, batch_size)
     index_array = np.arange(num_samples)
     for batch_index, (batch_start, batch_end) in enumerate(batches):
       batch_ids = index_array[batch_start:batch_end]
@@ -620,12 +662,12 @@ def predict_loop(model, inputs,
         inputs, batch_size, steps, 'steps')
     if verbose == 1:
       if steps is not None:
-        progbar = Progbar(target=steps)
+        progbar = generic_utils.Progbar(target=steps)
       else:
-        progbar = Progbar(target=num_samples)
+        progbar = generic_utils.Progbar(target=num_samples)
 
     outs = []
-    batches = make_batches(num_samples, batch_size)
+    batches = generic_utils.make_batches(num_samples, batch_size)
     index_array = np.arange(num_samples)
     for batch_index, (batch_start, batch_end) in enumerate(batches):
       batch_ids = index_array[batch_start:batch_end]
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py b/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py
index 8848b393d5e602e564cb357c32a937eaabd68203..6cdb6b0753fce1bebec0060524e76d32929d0228 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py
@@ -18,7 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
 import numpy as np
 
 from tensorflow.python.framework import ops
@@ -308,6 +307,100 @@ class TrainingTest(test.TestCase):
       model.compile(loss=None,
                     optimizer='rms')
 
+  def test_model_methods_with_eager_tensors_multi_io(self):
+    a = keras.layers.Input(shape=(3,), name='input_a')
+    b = keras.layers.Input(shape=(3,), name='input_b')
+
+    dense = keras.layers.Dense(4, name='dense')
+    c = dense(a)
+    d = dense(b)
+    e = keras.layers.Dropout(0.5, name='dropout')(c)
+
+    model = keras.models.Model([a, b], [d, e])
+
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    loss_weights = [1., 0.5]
+    metrics = ['mae']
+    model.compile(
+        optimizer,
+        loss,
+        metrics=metrics,
+        loss_weights=loss_weights,
+        sample_weight_mode=None)
+
+    input_a = keras.backend.zeros(shape=(10, 3))
+    input_b = keras.backend.zeros(shape=(10, 3))
+    target_d = keras.backend.zeros(shape=(10, 4))
+    target_e = keras.backend.zeros(shape=(10, 4))
+
+    model.fit(
+        [input_a, input_b], [target_d, target_e],
+        epochs=1,
+        batch_size=5,
+        verbose=0)
+    # Test: no shuffle.
+    model.fit(
+        [input_a, input_b], [target_d, target_e],
+        epochs=1,
+        batch_size=5,
+        verbose=0,
+        shuffle=False)
+    # Test: validation data.
+    model.fit([input_a, input_b], [target_d, target_e],
+              epochs=1, batch_size=2, verbose=0,
+              validation_data=([input_a, input_b], [target_d, target_e]))
+    model.train_on_batch([input_a, input_b], [target_d, target_e])
+    model.predict([input_a, input_b], batch_size=5)
+    model.evaluate([input_a, input_b], [target_d, target_e],
+                   batch_size=2, verbose=0)
+    model.test_on_batch([input_a, input_b], [target_d, target_e])
+
+    # Test: mix np and tensors.
+    input_b = np.zeros(shape=(10, 3)).astype('float32')
+    target_e = np.zeros(shape=(10, 4)).astype('float32')
+    model.fit(
+        [input_a, input_b], [target_d, target_e],
+        epochs=1,
+        batch_size=5,
+        verbose=0)
+    model.fit([input_a, input_b], [target_d, target_e],
+              epochs=1, batch_size=2, verbose=0,
+              validation_data=([input_a, input_b], [target_d, target_e]))
+    model.fit(
+        [input_a, input_b], [target_d, target_e],
+        epochs=1,
+        batch_size=5,
+        verbose=0,
+        shuffle=False)
+    model.train_on_batch([input_a, input_b], [target_d, target_e])
+    model.predict([input_a, input_b], batch_size=5)
+    model.evaluate([input_a, input_b], [target_d, target_e],
+                   batch_size=2, verbose=0)
+    model.test_on_batch([input_a, input_b], [target_d, target_e])
+
+  def test_model_methods_with_eager_tensors_single_io(self):
+    x = keras.layers.Input(shape=(3,), name='input')
+    y = keras.layers.Dense(4, name='dense')(x)
+    model = keras.Model(x, y)
+
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    metrics = ['mae']
+    model.compile(optimizer, loss, metrics=metrics)
+
+    inputs = keras.backend.zeros(shape=(10, 3))
+    targets = keras.backend.zeros(shape=(10, 4))
+
+    model.fit(inputs, targets, epochs=1, batch_size=2, verbose=0)
+    model.fit(inputs, targets, epochs=1, batch_size=3, verbose=0, shuffle=False)
+    model.fit(inputs, targets, epochs=1, batch_size=4, verbose=0,
+              validation_data=(inputs, targets))
+    model.evaluate(inputs, targets, batch_size=2, verbose=0)
+    model.predict(inputs, batch_size=2)
+    model.train_on_batch(inputs, targets)
+    model.test_on_batch(inputs, targets)
+
 
 class LossWeightingTest(test.TestCase):
 
@@ -533,14 +626,5 @@ class LossWeightingTest(test.TestCase):
 
 
 if __name__ == '__main__':
-  # Bazel sets these environment variables to very long paths.
-  # Tempfile uses them to create long paths, and in turn multiprocessing
-  # library tries to create sockets named after paths. Delete whatever bazel
-  # writes to these to avoid tests failing due to socket addresses being too
-  # long.
-  for var in ('TMPDIR', 'TMP', 'TEMP'):
-    if var in os.environ:
-      del os.environ[var]
-
   ops.enable_eager_execution()
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_test.py b/tensorflow/python/keras/_impl/keras/engine/training_test.py
index fd91dbba52ff7d152335514085ef3b057ae5eec4..08fd26dd18d5bc1b171d780be133f02f51b9c248 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_test.py
@@ -1117,6 +1117,121 @@ class TestTrainingUtils(test.TestCase):
 
 class TestTrainingWithDataTensors(test.TestCase):
 
+  def test_training_and_eval_methods_on_symbolic_tensors_single_io(self):
+    with self.test_session():
+      x = keras.layers.Input(shape=(3,), name='input')
+      y = keras.layers.Dense(4, name='dense')(x)
+      model = keras.Model(x, y)
+
+      optimizer = 'rmsprop'
+      loss = 'mse'
+      metrics = ['mae']
+      model.compile(optimizer, loss, metrics=metrics)
+
+      inputs = keras.backend.zeros(shape=(10, 3))
+      targets = keras.backend.zeros(shape=(10, 4))
+
+      model.fit(inputs, targets, epochs=1, steps_per_epoch=2, verbose=0)
+      model.evaluate(inputs, targets, steps=2, verbose=0)
+      model.predict(inputs, steps=2)
+      model.train_on_batch(inputs, targets)
+      model.test_on_batch(inputs, targets)
+      model.fit(inputs, targets,
+                epochs=1, steps_per_epoch=2, verbose=0,
+                validation_data=(inputs, targets), validation_steps=2)
+
+  def test_training_and_eval_methods_on_symbolic_tensors_multi_io(self):
+    with self.test_session():
+      a = keras.layers.Input(shape=(3,), name='input_a')
+      b = keras.layers.Input(shape=(3,), name='input_b')
+
+      dense = keras.layers.Dense(4, name='dense')
+      c = dense(a)
+      d = dense(b)
+      e = keras.layers.Dropout(0.5, name='dropout')(c)
+
+      model = keras.models.Model([a, b], [d, e])
+
+      optimizer = 'rmsprop'
+      loss = 'mse'
+      loss_weights = [1., 0.5]
+      metrics = ['mae']
+      model.compile(optimizer, loss, metrics=metrics, loss_weights=loss_weights)
+
+      input_a_tf = keras.backend.zeros(shape=(10, 3))
+      input_b_tf = keras.backend.zeros(shape=(10, 3))
+
+      output_d_tf = keras.backend.zeros(shape=(10, 4))
+      output_e_tf = keras.backend.zeros(shape=(10, 4))
+
+      model.fit(
+          [input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
+          epochs=1,
+          steps_per_epoch=2,
+          verbose=0)
+      with self.assertRaisesRegexp(ValueError,
+                                   'should specify the `steps_per_epoch`'):
+        model.fit(
+            [input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
+            epochs=1,
+            batch_size=5,
+            verbose=0)
+      model.train_on_batch([input_a_tf, input_b_tf], [output_d_tf, output_e_tf])
+
+      # Test with dictionary inputs
+      model.fit(
+          {'input_a': input_a_tf,
+           'input_b': input_b_tf},
+          {'dense': output_d_tf,
+           'dropout': output_e_tf},
+          epochs=1,
+          steps_per_epoch=2,
+          verbose=0)
+      model.fit(
+          {'input_a': input_a_tf,
+           'input_b': input_b_tf},
+          {'dense': output_d_tf,
+           'dropout': output_e_tf},
+          validation_data=({'input_a': input_a_tf,
+                            'input_b': input_b_tf},
+                           {'dense': output_d_tf,
+                            'dropout': output_e_tf}),
+          epochs=1,
+          steps_per_epoch=2,
+          validation_steps=2,
+          verbose=0)
+      model.train_on_batch(
+          {'input_a': input_a_tf,
+           'input_b': input_b_tf},
+          {'dense': output_d_tf,
+           'dropout': output_e_tf})
+
+      # Test with validation data
+      model.fit(
+          [input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
+          validation_data=([input_a_tf, input_b_tf],
+                           [output_d_tf, output_e_tf]),
+          epochs=1,
+          steps_per_epoch=2,
+          validation_steps=2,
+          verbose=0)
+      # Test with validation split
+      with self.assertRaisesRegexp(ValueError,
+                                   'you cannot use `validation_split`'):
+        model.fit(
+            [input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
+            epochs=2,
+            steps_per_epoch=2,
+            verbose=0,
+            validation_split=0.2,
+            validation_steps=2)
+
+      # Test evaluation / prediction methods
+      model.evaluate([input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
+                     steps=2, verbose=0)
+      model.predict([input_a_tf, input_b_tf], steps=2)
+      model.test_on_batch([input_a_tf, input_b_tf], [output_d_tf, output_e_tf])
+
   def test_model_with_input_feed_tensor(self):
     """We test building a model with a TF variable as input.
 
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_utils.py b/tensorflow/python/keras/_impl/keras/engine/training_utils.py
index 105638ce1087e8668b49b6653a847667e8f9157d..a3fc8ef2a0359c527a2757c1888d61822e35d7a9 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_utils.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_utils.py
@@ -22,9 +22,11 @@ import copy
 
 import numpy as np
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import losses
+from tensorflow.python.ops import math_ops
 
 
 def check_num_samples(ins,
@@ -64,15 +66,29 @@ def check_num_samples(ins,
     if batch_size is not None:
       raise ValueError(
           'If ' + steps_name + ' is set, the `batch_size` must be None.')
-  elif ins and hasattr(ins[0], 'shape'):
-    num_samples = ins[0].shape[0]
-  else:
+  if has_symbolic_tensors(ins) and steps is None:
+    raise ValueError('If your data is in the form of symbolic tensors, '
+                     'you should specify the `' + steps_name + '` argument '
+                     '(instead of the `batch_size` argument).')
+  if ins and hasattr(ins[0], 'shape'):
+    num_samples = int(ins[0].shape[0])
+  elif steps is None:
     raise ValueError(
         'Either the input data should have '
         'a defined shape, or ' + steps_name + ' should be specified.')
   return num_samples
 
 
+def standardize_single_array(x):
+  if x is None:
+    return None
+  elif tensor_util.is_tensor(x):
+    return x
+  elif x.ndim == 1:
+    x = np.expand_dims(x, 1)
+  return x
+
+
 def standardize_input_data(data,
                            names,
                            shapes=None,
@@ -130,9 +146,7 @@ def standardize_input_data(data,
   else:
     data = data.values if data.__class__.__name__ == 'DataFrame' else data
     data = [data]
-  data = [
-      np.expand_dims(x, 1) if x is not None and x.ndim == 1 else x for x in data
-  ]
+  data = [standardize_single_array(x) for x in data]
 
   if len(data) != len(names):
     if data and hasattr(data[0], 'shape'):
@@ -158,7 +172,7 @@ def standardize_input_data(data,
   # Check shapes compatibility.
   if shapes:
     for i in range(len(names)):
-      if shapes[i] is not None:
+      if shapes[i] is not None and not tensor_util.is_tensor(data[i]):
         data_shape = data[i].shape
         shape = shapes[i]
         if data[i].ndim != len(shape):
@@ -245,12 +259,13 @@ def check_array_lengths(inputs, targets, weights=None):
   """
 
   def set_of_lengths(x):
-    # return a set with the variation between
+    # Returns a set with the variation between
     # different shapes, with None => 0
     if x is None:
       return {}
     else:
-      return set([y.shape[0] for y in x if y is not None])
+      return set([y.shape[0] for y in x
+                  if y is not None and not tensor_util.is_tensor(y)])
 
   set_x = set_of_lengths(inputs)
   set_y = set_of_lengths(targets)
@@ -422,7 +437,7 @@ def weighted_masked_objective(fn):
     score_array = fn(y_true, y_pred)
     if mask is not None:
       # Cast the mask to floatX to avoid float64 upcasting in theano
-      mask = K.cast(mask, K.floatx())
+      mask = math_ops.cast(mask, K.floatx())
       # mask should have the same shape as score_array
       score_array *= mask
       #  the loss per batch should be proportional
@@ -436,7 +451,8 @@ def weighted_masked_objective(fn):
       weight_ndim = K.ndim(weights)
       score_array = K.mean(score_array, axis=list(range(weight_ndim, ndim)))
       score_array *= weights
-      score_array /= K.mean(K.cast(K.not_equal(weights, 0), K.floatx()))
+      score_array /= K.mean(
+          math_ops.cast(math_ops.not_equal(weights, 0), K.floatx()))
     return K.mean(score_array)
 
   return weighted
@@ -532,3 +548,8 @@ def standardize_weights(y,
     return weights
   else:
     return None
+
+
+def has_symbolic_tensors(ls):
+  return (any(tensor_util.is_tensor(v) for v in ls)
+          and not context.executing_eagerly())
diff --git a/tensorflow/python/keras/_impl/keras/estimator.py b/tensorflow/python/keras/_impl/keras/estimator.py
index 081f25e91452582afabee1e43f6e15f7e5d0013d..5d370ebbb5f31d102c381e46bb8f696e151f492b 100644
--- a/tensorflow/python/keras/_impl/keras/estimator.py
+++ b/tensorflow/python/keras/_impl/keras/estimator.py
@@ -296,9 +296,6 @@ def _clone_and_build_model(mode,
         sample_weight_mode=keras_model.sample_weight_mode,
         weighted_metrics=keras_model.weighted_metrics,
         target_tensors=target_tensors)
-
-  if isinstance(model, models.Sequential):
-    model = model.model
   return model
 
 
@@ -396,8 +393,6 @@ def _save_first_checkpoint(keras_model, estimator, custom_objects,
       training_util.create_global_step()
       model = _clone_and_build_model(model_fn_lib.ModeKeys.TRAIN, keras_model,
                                      custom_objects)
-      if isinstance(model, models.Sequential):
-        model = model.model
       # save to checkpoint
       with session.Session(config=estimator._session_config) as sess:
         model.set_weights(keras_weights)
@@ -471,8 +466,8 @@ def model_to_estimator(keras_model=None,
       keras_model_fn, model_dir=model_dir, config=config)
 
   # Pass the config into keras backend's default session.
-  with session.Session(config=estimator._session_config) as sess:
-    K.set_session(sess)
+  sess = session.Session(config=estimator._session_config)
+  K.set_session(sess)
 
   keras_weights = keras_model.get_weights()
   if keras_model._is_graph_network:
diff --git a/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py b/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py
index c40ee109aaea7dacea72e095b1d8cea3ed2e9bf8..11ca89d625bebb607b2bddbe65b8251f52aa6e4c 100644
--- a/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py
+++ b/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py
@@ -26,6 +26,7 @@ from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
 from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion
+from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -146,7 +147,7 @@ class PReLU(Layer):
     if K.backend() == 'theano':
       neg = (
           K.pattern_broadcast(self.alpha, self.param_broadcast) *
-          (inputs - K.abs(inputs)) * 0.5)
+          (inputs - math_ops.abs(inputs)) * 0.5)
     else:
       neg = -self.alpha * K.relu(-inputs)
     return pos + neg
@@ -232,7 +233,8 @@ class ThresholdedReLU(Layer):
     self.theta = K.cast_to_floatx(theta)
 
   def call(self, inputs, mask=None):
-    return inputs * K.cast(K.greater(inputs, self.theta), K.floatx())
+    return inputs * math_ops.cast(
+        math_ops.greater(inputs, self.theta), K.floatx())
 
   def get_config(self):
     config = {'theta': float(self.theta)}
diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py b/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py
index d95a0942452afa82e277c358be5c3b2ba061ac98..b78962d66a3e8ee4ce8228393e7b21f919c524c1 100644
--- a/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py
+++ b/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py
@@ -29,6 +29,8 @@ from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion
 from tensorflow.python.keras._impl.keras.layers.recurrent import Recurrent
 from tensorflow.python.keras._impl.keras.utils import conv_utils
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -438,9 +440,9 @@ class ConvLSTM2D(ConvRecurrent2D):
 
   def get_initial_state(self, inputs):
     # (samples, timesteps, rows, cols, filters)
-    initial_state = K.zeros_like(inputs)
+    initial_state = array_ops.zeros_like(inputs)
     # (samples, rows, cols, filters)
-    initial_state = K.sum(initial_state, axis=1)
+    initial_state = math_ops.reduce_sum(initial_state, axis=1)
     shape = list(self.kernel_shape)
     shape[-1] = self.filters
     initial_state = self.input_conv(
@@ -483,8 +485,8 @@ class ConvLSTM2D(ConvRecurrent2D):
   def get_constants(self, inputs, training=None):
     constants = []
     if self.implementation == 0 and 0 < self.dropout < 1:
-      ones = K.zeros_like(inputs)
-      ones = K.sum(ones, axis=1)
+      ones = array_ops.zeros_like(inputs)
+      ones = math_ops.reduce_sum(ones, axis=1)
       ones += 1
 
       def dropped_inputs():
@@ -501,8 +503,8 @@ class ConvLSTM2D(ConvRecurrent2D):
     if 0 < self.recurrent_dropout < 1:
       shape = list(self.kernel_shape)
       shape[-1] = self.filters
-      ones = K.zeros_like(inputs)
-      ones = K.sum(ones, axis=1)
+      ones = array_ops.zeros_like(inputs)
+      ones = math_ops.reduce_sum(ones, axis=1)
       ones = self.input_conv(ones, K.zeros(shape), padding=self.padding)
       ones += 1.
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/core.py b/tensorflow/python/keras/_impl/keras/layers/core.py
index 73e4f15f7e259211c892fdc663e14dcb14aec58d..c74fc1e4c0a764b4cc0d09129be4e5287a9bdd05 100644
--- a/tensorflow/python/keras/_impl/keras/layers/core.py
+++ b/tensorflow/python/keras/_impl/keras/layers/core.py
@@ -37,6 +37,8 @@ from tensorflow.python.keras._impl.keras.utils.generic_utils import func_dump
 from tensorflow.python.keras._impl.keras.utils.generic_utils import func_load
 from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg
 from tensorflow.python.layers import core as tf_core_layers
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -75,12 +77,12 @@ class Masking(Layer):
     self.mask_value = mask_value
 
   def compute_mask(self, inputs, mask=None):
-    return K.any(K.not_equal(inputs, self.mask_value), axis=-1)
+    return K.any(math_ops.not_equal(inputs, self.mask_value), axis=-1)
 
   def call(self, inputs):
     boolean_mask = K.any(
-        K.not_equal(inputs, self.mask_value), axis=-1, keepdims=True)
-    return inputs * K.cast(boolean_mask, inputs.dtype)
+        math_ops.not_equal(inputs, self.mask_value), axis=-1, keepdims=True)
+    return inputs * math_ops.cast(boolean_mask, inputs.dtype)
 
   def compute_output_shape(self, input_shape):
     return input_shape
@@ -170,7 +172,7 @@ class SpatialDropout1D(Dropout):
     self.input_spec = InputSpec(ndim=3)
 
   def _get_noise_shape(self, inputs):
-    input_shape = K.shape(inputs)
+    input_shape = array_ops.shape(inputs)
     noise_shape = (input_shape[0], 1, input_shape[2])
     return noise_shape
 
@@ -222,7 +224,7 @@ class SpatialDropout2D(Dropout):
     self.input_spec = InputSpec(ndim=4)
 
   def _get_noise_shape(self, inputs):
-    input_shape = K.shape(inputs)
+    input_shape = array_ops.shape(inputs)
     if self.data_format == 'channels_first':
       return (input_shape[0], input_shape[1], 1, 1)
     elif self.data_format == 'channels_last':
@@ -275,7 +277,7 @@ class SpatialDropout3D(Dropout):
     self.input_spec = InputSpec(ndim=5)
 
   def _get_noise_shape(self, inputs):
-    input_shape = K.shape(inputs)
+    input_shape = array_ops.shape(inputs)
     if self.data_format == 'channels_first':
       return (input_shape[0], input_shape[1], 1, 1, 1)
     elif self.data_format == 'channels_last':
@@ -414,7 +416,8 @@ class Reshape(Layer):
     return tensor_shape.TensorShape(output_shape)
 
   def call(self, inputs):
-    return K.reshape(inputs, (K.shape(inputs)[0],) + self.target_shape)
+    return array_ops.reshape(inputs,
+                             (array_ops.shape(inputs)[0],) + self.target_shape)
 
   def get_config(self):
     config = {'target_shape': self.target_shape}
@@ -467,7 +470,7 @@ class Permute(Layer):
     return tensor_shape.TensorShape(output_shape)
 
   def call(self, inputs):
-    return K.permute_dimensions(inputs, (0,) + self.dims)
+    return array_ops.transpose(inputs, perm=(0,) + self.dims)
 
   def get_config(self):
     config = {'dims': self.dims}
diff --git a/tensorflow/python/keras/_impl/keras/layers/core_test.py b/tensorflow/python/keras/_impl/keras/layers/core_test.py
index 2ca816adbdcecaf371776d99f3da60d0d8790832..551d1b1c3a0a80ed81ad03afc4c9510a231e33ef 100644
--- a/tensorflow/python/keras/_impl/keras/layers/core_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/core_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.keras._impl.keras import testing_utils
+from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
@@ -159,7 +160,7 @@ class CoreLayersTest(test.TestCase):
 
     # test with lambda
     ld = keras.layers.Lambda(
-        lambda x: keras.backend.concatenate([keras.backend.square(x), x]))
+        lambda x: keras.backend.concatenate([math_ops.square(x), x]))
     config = ld.get_config()
     ld = keras.layers.Lambda.from_config(config)
 
@@ -235,4 +236,3 @@ class CoreLayersTest(test.TestCase):
 
 if __name__ == '__main__':
   test.main()
-
diff --git a/tensorflow/python/keras/_impl/keras/layers/embeddings.py b/tensorflow/python/keras/_impl/keras/layers/embeddings.py
index 006ecd3135be25d43133daed1603734ecd1be955..540e2d945c986aebbd7028e4a1f2e4566747320f 100644
--- a/tensorflow/python/keras/_impl/keras/layers/embeddings.py
+++ b/tensorflow/python/keras/_impl/keras/layers/embeddings.py
@@ -24,6 +24,8 @@ from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import Layer
 from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -126,7 +128,7 @@ class Embedding(Layer):
     if not self.mask_zero:
       return None
     else:
-      return K.not_equal(inputs, 0)
+      return math_ops.not_equal(inputs, 0)
 
   @shape_type_conversion
   def compute_output_shape(self, input_shape):
@@ -152,8 +154,8 @@ class Embedding(Layer):
 
   def call(self, inputs):
     if K.dtype(inputs) != 'int32':
-      inputs = K.cast(inputs, 'int32')
-    out = K.gather(self.embeddings, inputs)
+      inputs = math_ops.cast(inputs, 'int32')
+    out = array_ops.gather(self.embeddings, inputs)
     return out
 
   def get_config(self):
diff --git a/tensorflow/python/keras/_impl/keras/layers/merge.py b/tensorflow/python/keras/_impl/keras/layers/merge.py
index c660cbd449b11a139f64cfa8b3a35310a597491c..7c87e6c0671138efacbf1bca02fdf6779e21537f 100644
--- a/tensorflow/python/keras/_impl/keras/layers/merge.py
+++ b/tensorflow/python/keras/_impl/keras/layers/merge.py
@@ -23,6 +23,9 @@ from __future__ import print_function
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.engine.base_layer import Layer
 from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -127,7 +130,7 @@ class _Merge(Layer):
         for x in inputs:
           x_ndim = K.ndim(x)
           for _ in range(max_ndim - x_ndim):
-            x = K.expand_dims(x, 1)
+            x = array_ops.expand_dims(x, axis=1)
           reshaped_inputs.append(x)
         return self._merge_function(reshaped_inputs)
       else:
@@ -137,19 +140,22 @@ class _Merge(Layer):
         for x in inputs:
           x_ndim = K.ndim(x)
           if x_ndim is None:
-            x_shape = K.shape(x)
+            x_shape = array_ops.shape(x)
             batch_size = x_shape[0]
-            new_shape = K.concatenate([x_shape[1:], K.expand_dims(batch_size)])
-            x_transposed = K.reshape(x,
-                                     K.stack([batch_size,
-                                              K.prod(x_shape[1:])]))
-            x_transposed = K.permute_dimensions(x_transposed, (1, 0))
-            x_transposed = K.reshape(x_transposed, new_shape)
+            new_shape = K.concatenate(
+                [x_shape[1:],
+                 array_ops.expand_dims(batch_size, axis=-1)])
+            x_transposed = array_ops.reshape(
+                x,
+                array_ops.stack(
+                    [batch_size, math_ops.reduce_prod(x_shape[1:])], axis=0))
+            x_transposed = array_ops.transpose(x_transposed, perm=(1, 0))
+            x_transposed = array_ops.reshape(x_transposed, new_shape)
             reshaped_inputs.append(x_transposed)
             transposed = True
           elif x_ndim > 1:
             dims = list(range(1, x_ndim)) + [0]
-            reshaped_inputs.append(K.permute_dimensions(x, dims))
+            reshaped_inputs.append(array_ops.transpose(x, perm=dims))
             transposed = True
           else:
             # We don't transpose inputs if they are 1D vectors or scalars.
@@ -159,17 +165,18 @@ class _Merge(Layer):
         if transposed:
           # If inputs have been transposed, we have to transpose the output too.
           if y_ndim is None:
-            y_shape = K.shape(y)
-            y_ndim = K.shape(y_shape)[0]
+            y_shape = array_ops.shape(y)
+            y_ndim = array_ops.shape(y_shape)[0]
             batch_size = y_shape[y_ndim - 1]
-            new_shape = K.concatenate(
-                [K.expand_dims(batch_size), y_shape[:y_ndim - 1]])
-            y = K.reshape(y, (-1, batch_size))
-            y = K.permute_dimensions(y, (1, 0))
-            y = K.reshape(y, new_shape)
+            new_shape = K.concatenate([
+                array_ops.expand_dims(batch_size, axis=-1), y_shape[:y_ndim - 1]
+            ])
+            y = array_ops.reshape(y, (-1, batch_size))
+            y = array_ops.transpose(y, perm=(1, 0))
+            y = array_ops.reshape(y, new_shape)
           elif y_ndim > 1:
             dims = [y_ndim - 1] + list(range(y_ndim - 1))
-            y = K.permute_dimensions(y, dims)
+            y = array_ops.transpose(y, perm=dims)
         return y
     else:
       return self._merge_function(inputs)
@@ -207,7 +214,7 @@ class _Merge(Layer):
                        'should have the same length.')
     if all([m is None for m in mask]):
       return None
-    masks = [K.expand_dims(m, 0) for m in mask if m is not None]
+    masks = [array_ops.expand_dims(m, axis=0) for m in mask if m is not None]
     return K.all(K.concatenate(masks, axis=0), axis=0, keepdims=False)
 
 
@@ -325,7 +332,7 @@ class Maximum(_Merge):
   def _merge_function(self, inputs):
     output = inputs[0]
     for i in range(1, len(inputs)):
-      output = K.maximum(output, inputs[i])
+      output = math_ops.maximum(output, inputs[i])
     return output
 
 
@@ -340,7 +347,7 @@ class Minimum(_Merge):
   def _merge_function(self, inputs):
     output = inputs[0]
     for i in range(1, len(inputs)):
-      output = K.minimum(output, inputs[i])
+      output = math_ops.minimum(output, inputs[i])
     return output
 
 
@@ -418,10 +425,10 @@ class Concatenate(_Merge):
     for input_i, mask_i in zip(inputs, mask):
       if mask_i is None:
         # Input is unmasked. Append all 1s to masks,
-        masks.append(K.ones_like(input_i, dtype='bool'))
+        masks.append(array_ops.ones_like(input_i, dtype='bool'))
       elif K.ndim(mask_i) < K.ndim(input_i):
         # Mask is smaller than the input, expand it
-        masks.append(K.expand_dims(mask_i))
+        masks.append(array_ops.expand_dims(mask_i, axis=-1))
       else:
         masks.append(mask_i)
     concatenated = K.concatenate(masks, axis=self.axis)
@@ -511,8 +518,8 @@ class Dot(_Merge):
         else:
           axes.append(self.axes[i])
     if self.normalize:
-      x1 = K.l2_normalize(x1, axis=axes[0])
-      x2 = K.l2_normalize(x2, axis=axes[1])
+      x1 = nn.l2_normalize(x1, axis=axes[0])
+      x2 = nn.l2_normalize(x2, axis=axes[1])
     output = K.batch_dot(x1, x2, axes)
     return output
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/noise.py b/tensorflow/python/keras/_impl/keras/layers/noise.py
index e309d160e5a9be97ff5f5356dad9dfaf85430233..72dc7a1ff8b7887ed97ae44bddf8ae3cd32c408d 100644
--- a/tensorflow/python/keras/_impl/keras/layers/noise.py
+++ b/tensorflow/python/keras/_impl/keras/layers/noise.py
@@ -23,6 +23,8 @@ import numpy as np
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.engine import Layer
 from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -58,7 +60,7 @@ class GaussianNoise(Layer):
 
     def noised():
       return inputs + K.random_normal(
-          shape=K.shape(inputs), mean=0., stddev=self.stddev)
+          shape=array_ops.shape(inputs), mean=0., stddev=self.stddev)
 
     return K.in_train_phase(noised, inputs, training=training)
 
@@ -104,7 +106,7 @@ class GaussianDropout(Layer):
       def noised():
         stddev = np.sqrt(self.rate / (1.0 - self.rate))
         return inputs * K.random_normal(
-            shape=K.shape(inputs), mean=1.0, stddev=stddev)
+            shape=array_ops.shape(inputs), mean=1.0, stddev=stddev)
 
       return K.in_train_phase(noised, inputs, training=training)
     return inputs
@@ -153,7 +155,7 @@ class AlphaDropout(Layer):
     self.supports_masking = True
 
   def _get_noise_shape(self, inputs):
-    return self.noise_shape if self.noise_shape else K.shape(inputs)
+    return self.noise_shape if self.noise_shape else array_ops.shape(inputs)
 
   def call(self, inputs, training=None):
     if 0. < self.rate < 1.:
@@ -164,9 +166,9 @@ class AlphaDropout(Layer):
         scale = 1.0507009873554804934193349852946
         alpha_p = -alpha * scale
 
-        kept_idx = K.greater_equal(
+        kept_idx = math_ops.greater_equal(
             K.random_uniform(noise_shape, seed=seed), rate)
-        kept_idx = K.cast(kept_idx, K.floatx())
+        kept_idx = math_ops.cast(kept_idx, K.floatx())
 
         # Get affine transformation params
         a = ((1 - rate) * (1 + rate * alpha_p**2))**-0.5
diff --git a/tensorflow/python/keras/_impl/keras/layers/recurrent.py b/tensorflow/python/keras/_impl/keras/layers/recurrent.py
index 791f9b311300ed05591083d551c040eb25ac8e22..7f9f77c2963711c8677a784eed7c4142747a05ac 100644
--- a/tensorflow/python/keras/_impl/keras/layers/recurrent.py
+++ b/tensorflow/python/keras/_impl/keras/layers/recurrent.py
@@ -33,6 +33,9 @@ from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
 from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion
 from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import tf_export
 
@@ -503,9 +506,12 @@ class RNN(Layer):
 
   def get_initial_state(self, inputs):
     # build an all-zero tensor of shape (samples, output_dim)
-    initial_state = K.zeros_like(inputs)  # (samples, timesteps, input_dim)
-    initial_state = K.sum(initial_state, axis=(1, 2))  # (samples,)
-    initial_state = K.expand_dims(initial_state)  # (samples, 1)
+    initial_state = array_ops.zeros_like(inputs)
+    # shape of initial_state = (samples, timesteps, input_dim)
+    initial_state = math_ops.reduce_sum(initial_state, axis=(1, 2))
+    # shape of initial_state = (samples,)
+    initial_state = array_ops.expand_dims(initial_state, axis=-1)
+    # shape of initial_state = (samples, 1)
     if hasattr(self.cell.state_size, '__len__'):
       return [K.tile(initial_state, [1, dim]) for dim in self.cell.state_size]
     else:
@@ -631,7 +637,7 @@ class RNN(Layer):
     if self.stateful:
       updates = []
       for i in range(len(states)):
-        updates.append(K.update(self.states[i], states[i]))
+        updates.append(state_ops.assign(self.states[i], states[i]))
       self.add_update(updates, inputs)
 
     if self.return_sequences:
@@ -907,8 +913,7 @@ class SimpleRNNCell(Layer):
     prev_output = states[0]
     if 0 < self.dropout < 1 and self._dropout_mask is None:
       self._dropout_mask = _generate_dropout_mask(
-          _generate_dropout_ones(inputs,
-                                 K.shape(inputs)[-1]),
+          _generate_dropout_ones(inputs, array_ops.shape(inputs)[-1]),
           self.dropout,
           training=training)
     if (0 < self.recurrent_dropout < 1 and
@@ -1309,8 +1314,7 @@ class GRUCell(Layer):
 
     if 0 < self.dropout < 1 and self._dropout_mask is None:
       self._dropout_mask = _generate_dropout_mask(
-          _generate_dropout_ones(inputs,
-                                 K.shape(inputs)[-1]),
+          _generate_dropout_ones(inputs, array_ops.shape(inputs)[-1]),
           self.dropout,
           training=training,
           count=3)
@@ -1793,8 +1797,7 @@ class LSTMCell(Layer):
   def call(self, inputs, states, training=None):
     if 0 < self.dropout < 1 and self._dropout_mask is None:
       self._dropout_mask = _generate_dropout_mask(
-          _generate_dropout_ones(inputs,
-                                 K.shape(inputs)[-1]),
+          _generate_dropout_ones(inputs, array_ops.shape(inputs)[-1]),
           self.dropout,
           training=training,
           count=4)
@@ -2176,7 +2179,7 @@ class LSTM(RNN):
 
 
 def _generate_dropout_ones(inputs, dims):
-  return K.ones((K.shape(inputs)[0], dims))
+  return K.ones((array_ops.shape(inputs)[0], dims))
 
 
 def _generate_dropout_mask(ones, rate, training=None, count=1):
@@ -2351,9 +2354,12 @@ class Recurrent(Layer):
 
   def get_initial_state(self, inputs):
     # build an all-zero tensor of shape (samples, output_dim)
-    initial_state = K.zeros_like(inputs)  # (samples, timesteps, input_dim)
-    initial_state = K.sum(initial_state, axis=(1, 2))  # (samples,)
-    initial_state = K.expand_dims(initial_state)  # (samples, 1)
+    initial_state = array_ops.zeros_like(inputs)
+    # shape of initial_state = (samples, timesteps, input_dim)
+    initial_state = math_ops.reduce_sum(initial_state, axis=(1, 2))
+    # shape of initial_state = (samples,)
+    initial_state = array_ops.expand_dims(initial_state, axis=-1)
+    # shape of initial_state = (samples, 1)
     initial_state = K.tile(initial_state, [1,
                                            self.units])  # (samples, output_dim)
     initial_state = [initial_state for _ in range(len(self.states))]
@@ -2456,7 +2462,7 @@ class Recurrent(Layer):
     if self.stateful:
       updates = []
       for i in range(len(states)):
-        updates.append(K.update(self.states[i], states[i]))
+        updates.append(state_ops.assign(self.states[i], states[i]))
       self.add_update(updates, inputs)
 
     # Properly set learning phase
diff --git a/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py b/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py
index de022153f6f07240a0dff70e5faeed5b6d4a8c5f..fb743b617fab87503199eb08241c17acf2f3800d 100644
--- a/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py
@@ -24,6 +24,9 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.keras._impl import keras
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import test
 
 
@@ -395,8 +398,8 @@ class RNNTest(test.TestCase):
 
     # Test `get_losses_for` and `losses`
     x = keras.Input((None, 1))
-    loss_1 = keras.backend.sum(x)
-    loss_2 = keras.backend.sum(cells[0].kernel)
+    loss_1 = math_ops.reduce_sum(x)
+    loss_2 = math_ops.reduce_sum(cells[0].kernel)
     cells[0].add_loss(loss_1, inputs=x)
     cells[0].add_loss(loss_2)
     self.assertEqual(len(layer.losses), 2)
@@ -410,10 +413,10 @@ class RNNTest(test.TestCase):
     layer.build((None, None, 1))
 
     x = keras.Input((None, 1))
-    update_1 = keras.backend.update_add(
-        cells[0].kernel, x[0, 0, 0] * cells[0].kernel)
-    update_2 = keras.backend.update_add(
-        cells[0].kernel, keras.backend.ones_like(cells[0].kernel))
+    update_1 = state_ops.assign_add(cells[0].kernel,
+                                    x[0, 0, 0] * cells[0].kernel)
+    update_2 = state_ops.assign_add(cells[0].kernel,
+                                    array_ops.ones_like(cells[0].kernel))
     cells[0].add_update(update_1, inputs=x)
     cells[0].add_update(update_2)
     self.assertEqual(len(layer.updates), 2)
diff --git a/tensorflow/python/keras/_impl/keras/layers/wrappers.py b/tensorflow/python/keras/_impl/keras/layers/wrappers.py
index 76ddd9299dd669da35d89a6fe8fc521ce4c26337..c510e464ae54b2dbe87533f02971a4af6c9c7c45 100644
--- a/tensorflow/python/keras/_impl/keras/layers/wrappers.py
+++ b/tensorflow/python/keras/_impl/keras/layers/wrappers.py
@@ -28,6 +28,7 @@ from tensorflow.python.keras._impl.keras.engine import Layer
 from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion
 from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg
 from tensorflow.python.layers import utils as tf_layers_util
+from tensorflow.python.ops import array_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -209,11 +210,11 @@ class TimeDistributed(Wrapper):
       # We can go with reshape-based implementation for performance.
       input_length = input_shape[1]
       if not input_length:
-        input_length = K.shape(inputs)[1]
+        input_length = array_ops.shape(inputs)[1]
       # Shape: (num_samples * timesteps, ...). And track the
       # transformation in self._input_map.
       input_uid = tf_layers_util.object_list_uid(inputs)
-      inputs = K.reshape(inputs, (-1,) + input_shape[2:])
+      inputs = array_ops.reshape(inputs, (-1,) + input_shape[2:])
       self._input_map[input_uid] = inputs
       # (num_samples * timesteps, ...)
       y = self.layer.call(inputs, **kwargs)
@@ -221,7 +222,7 @@ class TimeDistributed(Wrapper):
         uses_learning_phase = y._uses_learning_phase
       # Shape: (num_samples, timesteps, ...)
       output_shape = self.compute_output_shape(input_shape).as_list()
-      y = K.reshape(y, (-1, input_length) + tuple(output_shape[2:]))
+      y = array_ops.reshape(y, (-1, input_length) + tuple(output_shape[2:]))
 
     # Apply activity regularizer if any:
     if (hasattr(self.layer, 'activity_regularizer') and
diff --git a/tensorflow/python/keras/_impl/keras/losses.py b/tensorflow/python/keras/_impl/keras/losses.py
index 1576ed7b999f65992f46b357c8ebeda8935c68d0..1d634d38013164659f7360fce45704c19083f475 100644
--- a/tensorflow/python/keras/_impl/keras/losses.py
+++ b/tensorflow/python/keras/_impl/keras/losses.py
@@ -24,51 +24,55 @@ import six
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_keras_object
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
 from tensorflow.python.util.tf_export import tf_export
 
 
 @tf_export('keras.metrics.mean_squared_error',
            'keras.losses.mean_squared_error')
 def mean_squared_error(y_true, y_pred):
-  return K.mean(K.square(y_pred - y_true), axis=-1)
+  return K.mean(math_ops.square(y_pred - y_true), axis=-1)
 
 
 @tf_export('keras.metrics.mean_absolute_error',
            'keras.losses.mean_absolute_error')
 def mean_absolute_error(y_true, y_pred):
-  return K.mean(K.abs(y_pred - y_true), axis=-1)
+  return K.mean(math_ops.abs(y_pred - y_true), axis=-1)
 
 
 @tf_export('keras.metrics.mean_absolute_percentage_error',
            'keras.losses.mean_absolute_percentage_error')
 def mean_absolute_percentage_error(y_true, y_pred):
-  diff = K.abs((y_true - y_pred) / K.clip(K.abs(y_true), K.epsilon(), None))
+  diff = math_ops.abs(
+      (y_true - y_pred) / K.clip(math_ops.abs(y_true), K.epsilon(), None))
   return 100. * K.mean(diff, axis=-1)
 
 
 @tf_export('keras.metrics.mean_squared_logarithmic_error',
            'keras.losses.mean_squared_logarithmic_error')
 def mean_squared_logarithmic_error(y_true, y_pred):
-  first_log = K.log(K.clip(y_pred, K.epsilon(), None) + 1.)
-  second_log = K.log(K.clip(y_true, K.epsilon(), None) + 1.)
-  return K.mean(K.square(first_log - second_log), axis=-1)
+  first_log = math_ops.log(K.clip(y_pred, K.epsilon(), None) + 1.)
+  second_log = math_ops.log(K.clip(y_true, K.epsilon(), None) + 1.)
+  return K.mean(math_ops.square(first_log - second_log), axis=-1)
 
 
 @tf_export('keras.metrics.squared_hinge', 'keras.losses.squared_hinge')
 def squared_hinge(y_true, y_pred):
-  return K.mean(K.square(K.maximum(1. - y_true * y_pred, 0.)), axis=-1)
+  return K.mean(
+      math_ops.square(math_ops.maximum(1. - y_true * y_pred, 0.)), axis=-1)
 
 
 @tf_export('keras.metrics.hinge', 'keras.losses.hinge')
 def hinge(y_true, y_pred):
-  return K.mean(K.maximum(1. - y_true * y_pred, 0.), axis=-1)
+  return K.mean(math_ops.maximum(1. - y_true * y_pred, 0.), axis=-1)
 
 
 @tf_export('keras.losses.categorical_hinge')
 def categorical_hinge(y_true, y_pred):
-  pos = K.sum(y_true * y_pred, axis=-1)
-  neg = K.max((1. - y_true) * y_pred, axis=-1)
-  return K.maximum(0., neg - pos + 1.)
+  pos = math_ops.reduce_sum(y_true * y_pred, axis=-1)
+  neg = math_ops.reduce_max((1. - y_true) * y_pred, axis=-1)
+  return math_ops.maximum(0., neg - pos + 1.)
 
 
 @tf_export('keras.losses.logcosh')
@@ -89,7 +93,7 @@ def logcosh(y_true, y_pred):
   """
 
   def _logcosh(x):
-    return x + K.softplus(-2. * x) - K.log(2.)
+    return x + nn.softplus(-2. * x) - math_ops.log(2.)
 
   return K.mean(_logcosh(y_pred - y_true), axis=-1)
 
@@ -117,19 +121,19 @@ def binary_crossentropy(y_true, y_pred):
 def kullback_leibler_divergence(y_true, y_pred):
   y_true = K.clip(y_true, K.epsilon(), 1)
   y_pred = K.clip(y_pred, K.epsilon(), 1)
-  return K.sum(y_true * K.log(y_true / y_pred), axis=-1)
+  return math_ops.reduce_sum(y_true * math_ops.log(y_true / y_pred), axis=-1)
 
 
 @tf_export('keras.metrics.poisson', 'keras.losses.poisson')
 def poisson(y_true, y_pred):
-  return K.mean(y_pred - y_true * K.log(y_pred + K.epsilon()), axis=-1)
+  return K.mean(y_pred - y_true * math_ops.log(y_pred + K.epsilon()), axis=-1)
 
 
 @tf_export('keras.metrics.cosine_proximity', 'keras.losses.cosine_proximity')
 def cosine_proximity(y_true, y_pred):
-  y_true = K.l2_normalize(y_true, axis=-1)
-  y_pred = K.l2_normalize(y_pred, axis=-1)
-  return -K.sum(y_true * y_pred, axis=-1)
+  y_true = nn.l2_normalize(y_true, axis=-1)
+  y_pred = nn.l2_normalize(y_pred, axis=-1)
+  return -math_ops.reduce_sum(y_true * y_pred, axis=-1)
 
 
 # Aliases.
diff --git a/tensorflow/python/keras/_impl/keras/metrics.py b/tensorflow/python/keras/_impl/keras/metrics.py
index 82778a3dc4fbdc13bb6682d01e28ff68882b6dd9..747c3e65157ded6b0d227c6d6667b9092d0eed44 100644
--- a/tensorflow/python/keras/_impl/keras/metrics.py
+++ b/tensorflow/python/keras/_impl/keras/metrics.py
@@ -37,37 +37,45 @@ from tensorflow.python.keras._impl.keras.losses import sparse_categorical_crosse
 from tensorflow.python.keras._impl.keras.losses import squared_hinge
 from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_keras_object
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
 from tensorflow.python.util.tf_export import tf_export
 
 
 @tf_export('keras.metrics.binary_accuracy')
 def binary_accuracy(y_true, y_pred):
-  return K.mean(K.equal(y_true, K.round(y_pred)), axis=-1)
+  return K.mean(math_ops.equal(y_true, math_ops.round(y_pred)), axis=-1)
 
 
 @tf_export('keras.metrics.categorical_accuracy')
 def categorical_accuracy(y_true, y_pred):
-  return K.cast(
-      K.equal(K.argmax(y_true, axis=-1), K.argmax(y_pred, axis=-1)), K.floatx())
+  return math_ops.cast(
+      math_ops.equal(
+          math_ops.argmax(y_true, axis=-1), math_ops.argmax(y_pred, axis=-1)),
+      K.floatx())
 
 
 def sparse_categorical_accuracy(y_true, y_pred):
-  return K.cast(
-      K.equal(
-          K.max(y_true, axis=-1), K.cast(K.argmax(y_pred, axis=-1),
-                                         K.floatx())), K.floatx())
+  return math_ops.cast(
+      math_ops.equal(
+          math_ops.reduce_max(y_true, axis=-1),
+          math_ops.cast(math_ops.argmax(y_pred, axis=-1), K.floatx())),
+      K.floatx())
 
 
 @tf_export('keras.metrics.top_k_categorical_accuracy')
 def top_k_categorical_accuracy(y_true, y_pred, k=5):
-  return K.mean(K.in_top_k(y_pred, K.argmax(y_true, axis=-1), k), axis=-1)
+  return K.mean(
+      nn.in_top_k(y_pred, math_ops.argmax(y_true, axis=-1), k), axis=-1)
 
 
 @tf_export('keras.metrics.sparse_top_k_categorical_accuracy')
 def sparse_top_k_categorical_accuracy(y_true, y_pred, k=5):
   return K.mean(
-      K.in_top_k(y_pred, K.cast(K.max(y_true, axis=-1), 'int32'), k), axis=-1)
-
+      nn.in_top_k(y_pred,
+                  math_ops.cast(math_ops.reduce_max(y_true, axis=-1), 'int32'),
+                  k),
+      axis=-1)
 
 # Aliases
 
diff --git a/tensorflow/python/keras/_impl/keras/metrics_test.py b/tensorflow/python/keras/_impl/keras/metrics_test.py
index 44289ea02abf5ae5f8befbe515552aea3d4b231e..9deaab0c056e4b71205422e56cc79202a8e73593 100644
--- a/tensorflow/python/keras/_impl/keras/metrics_test.py
+++ b/tensorflow/python/keras/_impl/keras/metrics_test.py
@@ -21,6 +21,8 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.keras._impl import keras
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import test
 
 
@@ -104,16 +106,15 @@ class KerasMetricsTest(test.TestCase):
             The total number of true positives seen this epoch at the
                 completion of the batch.
         """
-        y_true = keras.backend.cast(y_true, 'int32')
-        y_pred = keras.backend.cast(keras.backend.round(y_pred), 'int32')
-        correct_preds = keras.backend.cast(
-            keras.backend.equal(y_pred, y_true), 'int32')
-        true_pos = keras.backend.cast(
-            keras.backend.sum(correct_preds * y_true), 'int32')
+        y_true = math_ops.cast(y_true, 'int32')
+        y_pred = math_ops.cast(math_ops.round(y_pred), 'int32')
+        correct_preds = math_ops.cast(math_ops.equal(y_pred, y_true), 'int32')
+        true_pos = math_ops.cast(
+            math_ops.reduce_sum(correct_preds * y_true), 'int32')
         current_true_pos = self.true_positives * 1
-        self.add_update(keras.backend.update_add(self.true_positives,
-                                                 true_pos),
-                        inputs=[y_true, y_pred])
+        self.add_update(
+            state_ops.assign_add(self.true_positives, true_pos),
+            inputs=[y_true, y_pred])
         return current_true_pos + true_pos
 
     metric_fn = BinaryTruePositives()
diff --git a/tensorflow/python/keras/_impl/keras/model_subclassing_test.py b/tensorflow/python/keras/_impl/keras/model_subclassing_test.py
index 58b144365be6cd8ea5b2ea82e275eacdee6b6c84..444590033042d915b12645fb0239833b666a02f7 100644
--- a/tensorflow/python/keras/_impl/keras/model_subclassing_test.py
+++ b/tensorflow/python/keras/_impl/keras/model_subclassing_test.py
@@ -22,7 +22,9 @@ import os
 import tempfile
 
 import numpy as np
+import six
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras._impl import keras
@@ -36,6 +38,7 @@ except ImportError:
   h5py = None
 
 
+# pylint: disable=not-callable
 class SimpleTestModel(keras.Model):
 
   def __init__(self, use_bn=False, use_dp=False, num_classes=10):
@@ -104,7 +107,7 @@ class NestedTestModel1(keras.Model):
   def call(self, inputs):
     x = self.dense1(inputs)
     x = self.bn(x)
-    x = self.test_net(x)  # pylint: disable=not-callable
+    x = self.test_net(x)
     return self.dense2(x)
 
 
@@ -161,7 +164,7 @@ def get_nested_model_3(input_dim, num_classes):
       return tensor_shape.TensorShape((input_shape[0], 5))
 
   test_model = Inner()
-  x = test_model(x)  # pylint: disable=not-callable
+  x = test_model(x)
   outputs = keras.layers.Dense(num_classes)(x)
   return keras.Model(inputs, outputs, name='nested_model_3')
 
@@ -574,5 +577,128 @@ class ModelSubclassingTest(test.TestCase):
     self.assertGreater(loss, 0.1)
 
 
+class CustomCallModel(keras.Model):
+
+  def __init__(self):
+    super(CustomCallModel, self).__init__()
+    self.dense1 = keras.layers.Dense(1, activation='relu')
+    self.dense2 = keras.layers.Dense(1, activation='softmax')
+
+  def call(self, first, second, fiddle_with_output='no', training=True):
+    combined = self.dense1(first) + self.dense2(second)
+    if fiddle_with_output == 'yes':
+      return 10. * combined
+    else:
+      return combined
+
+
+class CustomCallSignatureTests(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_no_inputs_in_signature(self):
+    model = CustomCallModel()
+    first = array_ops.ones([2, 3])
+    second = array_ops.ones([2, 5])
+    output = model(first, second)
+    self.evaluate([v.initializer for v in model.variables])
+    expected_output = self.evaluate(model.dense1(first) + model.dense2(second))
+    self.assertAllClose(expected_output, self.evaluate(output))
+    output = model(first, second, fiddle_with_output='yes')
+    self.assertAllClose(10. * expected_output, self.evaluate(output))
+    output = model(first, second=second, training=False)
+    self.assertAllClose(expected_output, self.evaluate(output))
+    if not context.executing_eagerly():
+      six.assertCountEqual(self, [first, second], model.inputs)
+    with self.assertRaises(TypeError):
+      # tf.layers.Layer expects an "inputs" argument, so all-keywords doesn't
+      # work at the moment.
+      model(first=first, second=second, fiddle_with_output='yes')
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_inputs_in_signature(self):
+
+    class HasInputsAndOtherPositional(keras.Model):
+
+      def call(self, inputs, some_other_arg, training=False):
+        return inputs
+
+    model = HasInputsAndOtherPositional()
+    with self.assertRaisesRegexp(
+        TypeError, 'everything else as a keyword argument'):
+      model(array_ops.ones([]), array_ops.ones([]))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_kwargs_in_signature(self):
+
+    class HasKwargs(keras.Model):
+
+      def call(self, x, y=3, **key_words):
+        return x
+
+    model = HasKwargs()
+    arg = array_ops.ones([])
+    model(arg, a=3)
+    if not context.executing_eagerly():
+      six.assertCountEqual(self, [arg], model.inputs)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_args_in_signature(self):
+
+    class HasArgs(keras.Model):
+
+      def call(self, x, *args, **kwargs):
+        return [x] + list(args)
+
+    model = HasArgs()
+    arg1 = array_ops.ones([])
+    arg2 = array_ops.ones([])
+    arg3 = array_ops.ones([])
+    model(arg1, arg2, arg3, a=3)
+    if not context.executing_eagerly():
+      six.assertCountEqual(self, [arg1, arg2, arg3], model.inputs)
+
+  def test_args_and_keywords_in_signature(self):
+
+    class HasArgs(keras.Model):
+
+      def call(self, x, training=True, *args, **kwargs):
+        return x
+
+    with context.graph_mode():
+      model = HasArgs()
+      arg1 = array_ops.ones([])
+      arg2 = array_ops.ones([])
+      arg3 = array_ops.ones([])
+      with self.assertRaisesRegexp(TypeError, 'args and arguments with'):
+        model(arg1, arg2, arg3, a=3)
+
+  def test_training_no_default(self):
+
+    class TrainingNoDefault(keras.Model):
+
+      def call(self, x, training):
+        return x
+
+    with context.graph_mode():
+      model = TrainingNoDefault()
+      arg = array_ops.ones([])
+      model(arg, True)
+      six.assertCountEqual(self, [arg], model.inputs)
+
+  def test_training_no_default_with_positional(self):
+
+    class TrainingNoDefaultWithPositional(keras.Model):
+
+      def call(self, x, training, positional):
+        return x
+
+    with context.graph_mode():
+      model = TrainingNoDefaultWithPositional()
+      arg1 = array_ops.ones([])
+      arg2 = array_ops.ones([])
+      arg3 = array_ops.ones([])
+      with self.assertRaisesRegexp(TypeError, 'after a non-input'):
+        model(arg1, arg2, arg3)
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/optimizers.py b/tensorflow/python/keras/_impl/keras/optimizers.py
index b715d722b98b9db3bdf0985da0954356a2facdfe..9f383deb725ac69bf2f17f3627010c4e1f567ef0 100644
--- a/tensorflow/python/keras/_impl/keras/optimizers.py
+++ b/tensorflow/python/keras/_impl/keras/optimizers.py
@@ -31,7 +31,10 @@ from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_
 from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_keras_object
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.training import optimizer as tf_optimizer_module
+from tensorflow.python.training import training_util
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -116,7 +119,8 @@ class Optimizer(object):
                        'Common ops without gradient: '
                        'K.argmax, K.round, K.eval.')
     if hasattr(self, 'clipnorm') and self.clipnorm > 0:
-      norm = K.sqrt(sum([K.sum(K.square(g)) for g in grads]))
+      norm = K.sqrt(
+          sum([math_ops.reduce_sum(math_ops.square(g)) for g in grads]))
       grads = [clip_norm(g, self.clipnorm, norm) for g in grads]
     if hasattr(self, 'clipvalue') and self.clipvalue > 0:
       grads = [K.clip(g, -self.clipvalue, self.clipvalue) for g in grads]
@@ -202,20 +206,20 @@ class SGD(Optimizer):
 
   def get_updates(self, loss, params):
     grads = self.get_gradients(loss, params)
-    self.updates = [K.update_add(self.iterations, 1)]
+    self.updates = [state_ops.assign_add(self.iterations, 1)]
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr = lr * (1. /  # pylint: disable=g-no-augmented-assignment
-                 (1. + self.decay * K.cast(self.iterations,
-                                           K.dtype(self.decay))))
+      lr = lr * (  # pylint: disable=g-no-augmented-assignment
+          1. / (1. + self.decay * math_ops.cast(self.iterations,
+                                                K.dtype(self.decay))))
     # momentum
     shapes = [K.int_shape(p) for p in params]
     moments = [K.zeros(shape) for shape in shapes]
     self.weights = [self.iterations] + moments
     for p, g, m in zip(params, grads, moments):
       v = self.momentum * m - lr * g  # velocity
-      self.updates.append(K.update(m, v))
+      self.updates.append(state_ops.assign(m, v))
 
       if self.nesterov:
         new_p = p + self.momentum * v - lr * g
@@ -226,7 +230,7 @@ class SGD(Optimizer):
       if getattr(p, 'constraint', None) is not None:
         new_p = p.constraint(new_p)
 
-      self.updates.append(K.update(p, new_p))
+      self.updates.append(state_ops.assign(p, new_p))
     return self.updates
 
   def get_config(self):
@@ -275,25 +279,25 @@ class RMSprop(Optimizer):
     grads = self.get_gradients(loss, params)
     accumulators = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
     self.weights = accumulators
-    self.updates = [K.update_add(self.iterations, 1)]
+    self.updates = [state_ops.assign_add(self.iterations, 1)]
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr = lr * (1. /  # pylint: disable=g-no-augmented-assignment
-                 (1. + self.decay * K.cast(self.iterations,
-                                           K.dtype(self.decay))))
+      lr = lr * (  # pylint: disable=g-no-augmented-assignment
+          1. / (1. + self.decay * math_ops.cast(self.iterations,
+                                                K.dtype(self.decay))))
 
     for p, g, a in zip(params, grads, accumulators):
       # update accumulator
-      new_a = self.rho * a + (1. - self.rho) * K.square(g)
-      self.updates.append(K.update(a, new_a))
+      new_a = self.rho * a + (1. - self.rho) * math_ops.square(g)
+      self.updates.append(state_ops.assign(a, new_a))
       new_p = p - lr * g / (K.sqrt(new_a) + self.epsilon)
 
       # Apply constraints.
       if getattr(p, 'constraint', None) is not None:
         new_p = p.constraint(new_p)
 
-      self.updates.append(K.update(p, new_p))
+      self.updates.append(state_ops.assign(p, new_p))
     return self.updates
 
   def get_config(self):
@@ -337,24 +341,24 @@ class Adagrad(Optimizer):
     shapes = [K.int_shape(p) for p in params]
     accumulators = [K.zeros(shape) for shape in shapes]
     self.weights = accumulators
-    self.updates = [K.update_add(self.iterations, 1)]
+    self.updates = [state_ops.assign_add(self.iterations, 1)]
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr = lr * (1. /  # pylint: disable=g-no-augmented-assignment
-                 (1. + self.decay * K.cast(self.iterations,
-                                           K.dtype(self.decay))))
+      lr = lr * (  # pylint: disable=g-no-augmented-assignment
+          1. / (1. + self.decay * math_ops.cast(self.iterations,
+                                                K.dtype(self.decay))))
 
     for p, g, a in zip(params, grads, accumulators):
-      new_a = a + K.square(g)  # update accumulator
-      self.updates.append(K.update(a, new_a))
+      new_a = a + math_ops.square(g)  # update accumulator
+      self.updates.append(state_ops.assign(a, new_a))
       new_p = p - lr * g / (K.sqrt(new_a) + self.epsilon)
 
       # Apply constraints.
       if getattr(p, 'constraint', None) is not None:
         new_p = p.constraint(new_p)
 
-      self.updates.append(K.update(p, new_p))
+      self.updates.append(state_ops.assign(p, new_p))
     return self.updates
 
   def get_config(self):
@@ -401,18 +405,18 @@ class Adadelta(Optimizer):
     accumulators = [K.zeros(shape) for shape in shapes]
     delta_accumulators = [K.zeros(shape) for shape in shapes]
     self.weights = accumulators + delta_accumulators
-    self.updates = [K.update_add(self.iterations, 1)]
+    self.updates = [state_ops.assign_add(self.iterations, 1)]
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr = lr * (1. /  # pylint: disable=g-no-augmented-assignment
-                 (1. + self.decay * K.cast(self.iterations,
-                                           K.dtype(self.decay))))
+      lr = lr * (  # pylint: disable=g-no-augmented-assignment
+          1. / (1. + self.decay * math_ops.cast(self.iterations,
+                                                K.dtype(self.decay))))
 
     for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators):
       # update accumulator
-      new_a = self.rho * a + (1. - self.rho) * K.square(g)
-      self.updates.append(K.update(a, new_a))
+      new_a = self.rho * a + (1. - self.rho) * math_ops.square(g)
+      self.updates.append(state_ops.assign(a, new_a))
 
       # use the new accumulator and the *old* delta_accumulator
       update = g * K.sqrt(d_a + self.epsilon) / K.sqrt(new_a + self.epsilon)
@@ -422,11 +426,11 @@ class Adadelta(Optimizer):
       if getattr(p, 'constraint', None) is not None:
         new_p = p.constraint(new_p)
 
-      self.updates.append(K.update(p, new_p))
+      self.updates.append(state_ops.assign(p, new_p))
 
       # update delta_accumulator
-      new_d_a = self.rho * d_a + (1 - self.rho) * K.square(update)
-      self.updates.append(K.update(d_a, new_d_a))
+      new_d_a = self.rho * d_a + (1 - self.rho) * math_ops.square(update)
+      self.updates.append(state_ops.assign(d_a, new_d_a))
     return self.updates
 
   def get_config(self):
@@ -481,17 +485,18 @@ class Adam(Optimizer):
 
   def get_updates(self, loss, params):
     grads = self.get_gradients(loss, params)
-    self.updates = [K.update_add(self.iterations, 1)]
+    self.updates = [state_ops.assign_add(self.iterations, 1)]
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr = lr * (1. /  # pylint: disable=g-no-augmented-assignment
-                 (1. + self.decay * K.cast(self.iterations,
-                                           K.dtype(self.decay))))
+      lr = lr * (  # pylint: disable=g-no-augmented-assignment
+          1. / (1. + self.decay * math_ops.cast(self.iterations,
+                                                K.dtype(self.decay))))
 
-    t = K.cast(self.iterations, K.floatx()) + 1
+    t = math_ops.cast(self.iterations, K.floatx()) + 1
     lr_t = lr * (
-        K.sqrt(1. - K.pow(self.beta_2, t)) / (1. - K.pow(self.beta_1, t)))
+        K.sqrt(1. - math_ops.pow(self.beta_2, t)) /
+        (1. - math_ops.pow(self.beta_1, t)))
 
     ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
     vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
@@ -503,23 +508,23 @@ class Adam(Optimizer):
 
     for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
       m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
-      v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
+      v_t = (self.beta_2 * v) + (1. - self.beta_2) * math_ops.square(g)
       if self.amsgrad:
-        vhat_t = K.maximum(vhat, v_t)
+        vhat_t = math_ops.maximum(vhat, v_t)
         p_t = p - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
-        self.updates.append(K.update(vhat, vhat_t))
+        self.updates.append(state_ops.assign(vhat, vhat_t))
       else:
         p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)
 
-      self.updates.append(K.update(m, m_t))
-      self.updates.append(K.update(v, v_t))
+      self.updates.append(state_ops.assign(m, m_t))
+      self.updates.append(state_ops.assign(v, v_t))
       new_p = p_t
 
       # Apply constraints.
       if getattr(p, 'constraint', None) is not None:
         new_p = p.constraint(new_p)
 
-      self.updates.append(K.update(p, new_p))
+      self.updates.append(state_ops.assign(p, new_p))
     return self.updates
 
   def get_config(self):
@@ -571,16 +576,16 @@ class Adamax(Optimizer):
 
   def get_updates(self, loss, params):
     grads = self.get_gradients(loss, params)
-    self.updates = [K.update_add(self.iterations, 1)]
+    self.updates = [state_ops.assign_add(self.iterations, 1)]
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr = lr * (1. /  # pylint: disable=g-no-augmented-assignment
-                 (1. + self.decay * K.cast(self.iterations,
-                                           K.dtype(self.decay))))
+      lr = lr * (  # pylint: disable=g-no-augmented-assignment
+          1. / (1. + self.decay * math_ops.cast(self.iterations,
+                                                K.dtype(self.decay))))
 
-    t = K.cast(self.iterations, K.floatx()) + 1
-    lr_t = lr / (1. - K.pow(self.beta_1, t))
+    t = math_ops.cast(self.iterations, K.floatx()) + 1
+    lr_t = lr / (1. - math_ops.pow(self.beta_1, t))
 
     shapes = [K.int_shape(p) for p in params]
     # zero init of 1st moment
@@ -592,18 +597,18 @@ class Adamax(Optimizer):
     for p, g, m, u in zip(params, grads, ms, us):
 
       m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
-      u_t = K.maximum(self.beta_2 * u, K.abs(g))
+      u_t = math_ops.maximum(self.beta_2 * u, math_ops.abs(g))
       p_t = p - lr_t * m_t / (u_t + self.epsilon)
 
-      self.updates.append(K.update(m, m_t))
-      self.updates.append(K.update(u, u_t))
+      self.updates.append(state_ops.assign(m, m_t))
+      self.updates.append(state_ops.assign(u, u_t))
       new_p = p_t
 
       # Apply constraints.
       if getattr(p, 'constraint', None) is not None:
         new_p = p.constraint(new_p)
 
-      self.updates.append(K.update(p, new_p))
+      self.updates.append(state_ops.assign(p, new_p))
     return self.updates
 
   def get_config(self):
@@ -657,16 +662,17 @@ class Nadam(Optimizer):
 
   def get_updates(self, loss, params):
     grads = self.get_gradients(loss, params)
-    self.updates = [K.update_add(self.iterations, 1)]
+    self.updates = [state_ops.assign_add(self.iterations, 1)]
 
-    t = K.cast(self.iterations, K.floatx()) + 1
+    t = math_ops.cast(self.iterations, K.floatx()) + 1
 
     # Due to the recommendations in [2], i.e. warming momentum schedule
     momentum_cache_t = self.beta_1 * (
-        1. - 0.5 * (K.pow(K.cast_to_floatx(0.96), t * self.schedule_decay)))
+        1. - 0.5 *
+        (math_ops.pow(K.cast_to_floatx(0.96), t * self.schedule_decay)))
     momentum_cache_t_1 = self.beta_1 * (
         1. - 0.5 *
-        (K.pow(K.cast_to_floatx(0.96), (t + 1) * self.schedule_decay)))
+        (math_ops.pow(K.cast_to_floatx(0.96), (t + 1) * self.schedule_decay)))
     m_schedule_new = self.m_schedule * momentum_cache_t
     m_schedule_next = self.m_schedule * momentum_cache_t * momentum_cache_t_1
     self.updates.append((self.m_schedule, m_schedule_new))
@@ -682,13 +688,13 @@ class Nadam(Optimizer):
       g_prime = g / (1. - m_schedule_new)
       m_t = self.beta_1 * m + (1. - self.beta_1) * g
       m_t_prime = m_t / (1. - m_schedule_next)
-      v_t = self.beta_2 * v + (1. - self.beta_2) * K.square(g)
-      v_t_prime = v_t / (1. - K.pow(self.beta_2, t))
+      v_t = self.beta_2 * v + (1. - self.beta_2) * math_ops.square(g)
+      v_t_prime = v_t / (1. - math_ops.pow(self.beta_2, t))
       m_t_bar = (
           1. - momentum_cache_t) * g_prime + momentum_cache_t_1 * m_t_prime
 
-      self.updates.append(K.update(m, m_t))
-      self.updates.append(K.update(v, v_t))
+      self.updates.append(state_ops.assign(m, m_t))
+      self.updates.append(state_ops.assign(v, v_t))
 
       p_t = p - self.lr * m_t_bar / (K.sqrt(v_t_prime) + self.epsilon)
       new_p = p_t
@@ -697,7 +703,7 @@ class Nadam(Optimizer):
       if getattr(p, 'constraint', None) is not None:
         new_p = p.constraint(new_p)
 
-      self.updates.append(K.update(p, new_p))
+      self.updates.append(state_ops.assign(p, new_p))
     return self.updates
 
   def get_config(self):
@@ -728,12 +734,27 @@ class TFOptimizer(Optimizer):
     return self.optimizer.compute_gradients(loss, params)
 
   def get_updates(self, loss, params):
-    self.updates = [K.update_add(self.iterations, 1)]
-    if not params:
-      return self.updates
-    grads = self.optimizer.compute_gradients(loss, params)
-    opt_update = self.optimizer.apply_gradients(
-        grads, global_step=self.iterations)
+    if distribute_lib.has_distribution_strategy():
+      self.updates = []
+
+      if not params:
+        # After the model vars have been created, the second call to get_updates
+        # is called with params as an empty list. This ensures that we call
+        # compute_gradients with params=None.
+        grads = self.optimizer.compute_gradients(loss)
+      else:
+        grads = self.optimizer.compute_gradients(loss, params)
+      global_step = training_util.get_global_step()
+      opt_update = self.optimizer.apply_gradients(grads, global_step)
+    else:
+      self.updates = [state_ops.assign_add(self.iterations, 1)]
+      if not params:
+        return self.updates
+
+      grads = self.optimizer.compute_gradients(loss, params)
+      opt_update = self.optimizer.apply_gradients(
+          grads, global_step=self.iterations)
+
     self.updates.append(opt_update)
     return self.updates
 
diff --git a/tensorflow/python/keras/_impl/keras/preprocessing/image.py b/tensorflow/python/keras/_impl/keras/preprocessing/image.py
index d12f10863921ee7d635930f34e8bc701c89864e8..6299445c34b99f20d7ae5090fc979d0ac2611109 100644
--- a/tensorflow/python/keras/_impl/keras/preprocessing/image.py
+++ b/tensorflow/python/keras/_impl/keras/preprocessing/image.py
@@ -43,6 +43,7 @@ except ImportError:
 
 
 try:
+  from PIL import ImageEnhance
   from PIL import Image as pil_image
 except ImportError:
   pil_image = None
@@ -227,6 +228,32 @@ def random_channel_shift(x, intensity, channel_axis=0):
   return x
 
 
+@tf_export('keras.preprocessing.image.random_brightness')
+def random_brightness(x, brightness_range):
+  """Performs a random adjustment of brightness of a Numpy image tensor.
+
+  Arguments:
+      x: Input tensor. Must be 3D.
+      brightness_range: Tuple of floats; range to pick a brightness value from.
+
+  Returns:
+      Brightness adjusted Numpy image tensor.
+
+  Raises:
+      ValueError: if `brightness_range` isn't a tuple.
+  """
+  if len(brightness_range) != 2:
+    raise ValueError('`brightness_range should be tuple or list of two floats. '
+                     'Received arg: ', brightness_range)
+
+  x = array_to_img(x)
+  x = ImageEnhance.Brightness(x)
+  u = np.random.uniform(brightness_range[0], brightness_range[1])
+  x = x.enhance(u)
+  x = img_to_array(x)
+  return x
+
+
 def transform_matrix_offset_center(matrix, x, y):
   o_x = float(x) / 2 + 0.5
   o_y = float(y) / 2 + 0.5
@@ -265,7 +292,7 @@ def apply_transform(x,
           x_channel,
           final_affine_matrix,
           final_offset,
-          order=0,
+          order=1,
           mode=fill_mode,
           cval=cval) for x_channel in x
   ]
@@ -436,6 +463,7 @@ class ImageDataGenerator(object):
       rotation_range: degrees (0 to 180).
       width_shift_range: fraction of total width, if < 1, or pixels if >= 1.
       height_shift_range: fraction of total height, if < 1, or pixels if >= 1.
+      brightness_range: the range of brightness to apply
       shear_range: shear intensity (shear angle in degrees).
       zoom_range: amount of zoom. if scalar z, zoom will be randomly picked
           in the range [1-z, 1+z]. A sequence of two can be passed instead
@@ -469,6 +497,8 @@ class ImageDataGenerator(object):
           It defaults to the `image_data_format` value found in your
           Keras config file at `~/.keras/keras.json`.
           If you never set it, then it will be "channels_last".
+      validation_split: fraction of images reserved for validation (strictly
+        between 0 and 1).
   """
 
   def __init__(self,
@@ -481,6 +511,7 @@ class ImageDataGenerator(object):
                rotation_range=0.,
                width_shift_range=0.,
                height_shift_range=0.,
+               brightness_range=None,
                shear_range=0.,
                zoom_range=0.,
                channel_shift_range=0.,
@@ -490,7 +521,8 @@ class ImageDataGenerator(object):
                vertical_flip=False,
                rescale=None,
                preprocessing_function=None,
-               data_format=None):
+               data_format=None,
+               validation_split=0.0):
     if data_format is None:
       data_format = K.image_data_format()
     self.featurewise_center = featurewise_center
@@ -502,6 +534,7 @@ class ImageDataGenerator(object):
     self.rotation_range = rotation_range
     self.width_shift_range = width_shift_range
     self.height_shift_range = height_shift_range
+    self.brightness_range = brightness_range
     self.shear_range = shear_range
     self.zoom_range = zoom_range
     self.channel_shift_range = channel_shift_range
@@ -526,6 +559,10 @@ class ImageDataGenerator(object):
       self.channel_axis = 3
       self.row_axis = 1
       self.col_axis = 2
+    if validation_split and not 0 < validation_split < 1:
+      raise ValueError('`validation_split` must be strictly between 0 and 1. '
+                       'Received arg: ', validation_split)
+    self.validation_split = validation_split
 
     self.mean = None
     self.std = None
@@ -574,7 +611,8 @@ class ImageDataGenerator(object):
            seed=None,
            save_to_dir=None,
            save_prefix='',
-           save_format='png'):
+           save_format='png',
+           subset=None):
     return NumpyArrayIterator(
         x,
         y,
@@ -585,7 +623,8 @@ class ImageDataGenerator(object):
         data_format=self.data_format,
         save_to_dir=save_to_dir,
         save_prefix=save_prefix,
-        save_format=save_format)
+        save_format=save_format,
+        subset=subset)
 
   def flow_from_directory(self,
                           directory,
@@ -600,6 +639,7 @@ class ImageDataGenerator(object):
                           save_prefix='',
                           save_format='png',
                           follow_links=False,
+                          subset=None,
                           interpolation='nearest'):
     return DirectoryIterator(
         directory,
@@ -616,6 +656,7 @@ class ImageDataGenerator(object):
         save_prefix=save_prefix,
         save_format=save_format,
         follow_links=follow_links,
+        subset=subset,
         interpolation=interpolation)
 
   def standardize(self, x):
@@ -628,7 +669,7 @@ class ImageDataGenerator(object):
         The inputs, normalized.
     """
     if self.preprocessing_function:
-      x = self.preprocessing_function(x)
+      x = self.image_data_generator.preprocessing_function(x)
     if self.rescale:
       x *= self.rescale
     if self.samplewise_center:
@@ -762,6 +803,9 @@ class ImageDataGenerator(object):
       if np.random.random() < 0.5:
         x = flip_axis(x, img_row_axis)
 
+    if self.brightness_range is not None:
+      x = random_brightness(x, self.brightness_range)
+
     return x
 
   def fit(self, x, augment=False, rounds=1, seed=None):
@@ -828,12 +872,10 @@ class ImageDataGenerator(object):
         raise ImportError('Scipy is required for zca_whitening.')
 
       flat_x = np.reshape(x, (x.shape[0], x.shape[1] * x.shape[2] * x.shape[3]))
-      num_examples = flat_x.shape[0]
-      _, s, vt = linalg.svd(flat_x / np.sqrt(num_examples))
-      s_expand = np.hstack(
-          (s, np.zeros(vt.shape[0] - num_examples, dtype=flat_x.dtype)))
-      self.principal_components = (
-          vt.T / np.sqrt(s_expand**2 + self.zca_epsilon)).dot(vt)
+      sigma = np.dot(flat_x.T, flat_x) / flat_x.shape[0]
+      u, s, _ = linalg.svd(sigma)
+      s_inv = 1. / np.sqrt(s[np.newaxis] + self.zca_epsilon)
+      self.principal_components = (u * s_inv).dot(u.T)
 
 
 @tf_export('keras.preprocessing.image.Iterator')
@@ -947,6 +989,8 @@ class NumpyArrayIterator(Iterator):
           images (if `save_to_dir` is set).
       save_format: Format to use for saving sample images
           (if `save_to_dir` is set).
+      subset: Subset of data (`"training"` or `"validation"`) if
+          validation_split is set in ImageDataGenerator.
   """
 
   def __init__(self,
@@ -959,17 +1003,29 @@ class NumpyArrayIterator(Iterator):
                data_format=None,
                save_to_dir=None,
                save_prefix='',
-               save_format='png'):
+               save_format='png',
+               subset=None):
     if y is not None and len(x) != len(y):
-      raise ValueError('X (images tensor) and y (labels) '
+      raise ValueError('`x` (images tensor) and `y` (labels) '
                        'should have the same length. '
-                       'Found: X.shape = %s, y.shape = %s' %
+                       'Found: x.shape = %s, y.shape = %s' %
                        (np.asarray(x).shape, np.asarray(y).shape))
-
+    if subset is not None:
+      if subset not in {'training', 'validation'}:
+        raise ValueError('Invalid subset name:', subset,
+                         '; expected "training" or "validation".')
+      split_idx = int(len(x) * image_data_generator.validation_split)
+      if subset == 'validation':
+        x = x[:split_idx]
+        if y is not None:
+          y = y[:split_idx]
+      else:
+        x = x[split_idx:]
+        if y is not None:
+          y = y[split_idx:]
     if data_format is None:
       data_format = K.image_data_format()
     self.x = np.asarray(x, dtype=K.floatx())
-
     if self.x.ndim != 4:
       raise ValueError('Input data in `NumpyArrayIterator` '
                        'should have rank 4. You passed an array '
@@ -1032,8 +1088,7 @@ class NumpyArrayIterator(Iterator):
     return self._get_batches_of_transformed_samples(index_array)
 
 
-def _count_valid_files_in_directory(directory, white_list_formats,
-                                    follow_links):
+def _iter_valid_files(directory, white_list_formats, follow_links):
   """Count files with extension in `white_list_formats` contained in directory.
 
   Arguments:
@@ -1043,29 +1098,54 @@ def _count_valid_files_in_directory(directory, white_list_formats,
           the files to be counted.
       follow_links: boolean.
 
-  Returns:
-      the count of files with extension in `white_list_formats` contained in
-      the directory.
+  Yields:
+      tuple of (root, filename) with extension in `white_list_formats`.
   """
 
   def _recursive_list(subpath):
     return sorted(
-        os.walk(subpath, followlinks=follow_links), key=lambda tpl: tpl[0])
+        os.walk(subpath, followlinks=follow_links), key=lambda x: x[0])
 
-  samples = 0
-  for _, _, files in _recursive_list(directory):
-    for fname in files:
-      is_valid = False
+  for root, _, files in _recursive_list(directory):
+    for fname in sorted(files):
       for extension in white_list_formats:
+        if fname.lower().endswith('.tiff'):
+          logging.warning(
+              'Using \'.tiff\' files with multiple bands will cause '
+              'distortion. Please verify your output.')
         if fname.lower().endswith('.' + extension):
-          is_valid = True
-          break
-      if is_valid:
-        samples += 1
-  return samples
+          yield root, fname
 
 
-def _list_valid_filenames_in_directory(directory, white_list_formats,
+def _count_valid_files_in_directory(directory, white_list_formats, split,
+                                    follow_links):
+  """Count files with extension in `white_list_formats` contained in directory.
+
+  Arguments:
+      directory: absolute path to the directory
+          containing files to be counted
+      white_list_formats: set of strings containing allowed extensions for
+          the files to be counted.
+      split: tuple of floats (e.g. `(0.2, 0.6)`) to only take into
+          account a certain fraction of files in each directory.
+          E.g.: `segment=(0.6, 1.0)` would only account for last 40 percent
+          of images in each directory.
+      follow_links: boolean.
+
+  Returns:
+      the count of files with extension in `white_list_formats` contained in
+      the directory.
+  """
+  num_files = len(
+      list(_iter_valid_files(directory, white_list_formats, follow_links)))
+  if split:
+    start, stop = int(split[0] * num_files), int(split[1] * num_files)
+  else:
+    start, stop = 0, num_files
+  return stop - start
+
+
+def _list_valid_filenames_in_directory(directory, white_list_formats, split,
                                        class_indices, follow_links):
   """List paths of files in `subdir` with extensions in `white_list_formats`.
 
@@ -1075,6 +1155,10 @@ def _list_valid_filenames_in_directory(directory, white_list_formats,
             `class_indices`.
       white_list_formats: set of strings containing allowed extensions for
           the files to be counted.
+      split: tuple of floats (e.g. `(0.2, 0.6)`) to only take into
+          account a certain fraction of files in each directory.
+          E.g.: `segment=(0.6, 1.0)` would only account for last 40 percent
+          of images in each directory.
       class_indices: dictionary mapping a class name to its index.
       follow_links: boolean.
 
@@ -1084,27 +1168,26 @@ def _list_valid_filenames_in_directory(directory, white_list_formats,
           `directory`'s parent (e.g., if `directory` is "dataset/class1",
           the filenames will be ["class1/file1.jpg", "class1/file2.jpg", ...]).
   """
-
-  def _recursive_list(subpath):
-    return sorted(
-        os.walk(subpath, followlinks=follow_links), key=lambda tpl: tpl[0])
+  dirname = os.path.basename(directory)
+  if split:
+    num_files = len(
+        list(_iter_valid_files(directory, white_list_formats, follow_links)))
+    start, stop = int(split[0] * num_files), int(split[1] * num_files)
+    valid_files = list(
+        _iter_valid_files(directory, white_list_formats,
+                          follow_links))[start:stop]
+  else:
+    valid_files = _iter_valid_files(directory, white_list_formats, follow_links)
 
   classes = []
   filenames = []
-  subdir = os.path.basename(directory)
-  basedir = os.path.dirname(directory)
-  for root, _, files in _recursive_list(directory):
-    for fname in sorted(files):
-      is_valid = False
-      for extension in white_list_formats:
-        if fname.lower().endswith('.' + extension):
-          is_valid = True
-          break
-      if is_valid:
-        classes.append(class_indices[subdir])
-        # add filename relative to directory
-        absolute_path = os.path.join(root, fname)
-        filenames.append(os.path.relpath(absolute_path, basedir))
+  for root, fname in valid_files:
+    classes.append(class_indices[dirname])
+    absolute_path = os.path.join(root, fname)
+    relative_path = os.path.join(dirname,
+                                 os.path.relpath(absolute_path, directory))
+    filenames.append(relative_path)
+
   return classes, filenames
 
 
@@ -1144,6 +1227,8 @@ class DirectoryIterator(Iterator):
           images (if `save_to_dir` is set).
       save_format: Format to use for saving sample images
           (if `save_to_dir` is set).
+      subset: Subset of data (`"training"` or `"validation"`) if
+          validation_split is set in ImageDataGenerator.
       interpolation: Interpolation method used to resample the image if the
           target size is different from that of the loaded image.
           Supported methods are "nearest", "bilinear", and "bicubic".
@@ -1167,6 +1252,7 @@ class DirectoryIterator(Iterator):
                save_prefix='',
                save_format='png',
                follow_links=False,
+               subset=None,
                interpolation='nearest'):
     if data_format is None:
       data_format = K.image_data_format()
@@ -1200,7 +1286,20 @@ class DirectoryIterator(Iterator):
     self.save_format = save_format
     self.interpolation = interpolation
 
-    white_list_formats = {'png', 'jpg', 'jpeg', 'bmp', 'ppm'}
+    if subset is not None:
+      validation_split = self.image_data_generator.validation_split
+      if subset == 'validation':
+        split = (0, validation_split)
+      elif subset == 'training':
+        split = (validation_split, 1)
+      else:
+        raise ValueError('Invalid subset name: ', subset,
+                         '; expected "training" or "validation"')
+    else:
+      split = None
+    self.subset = subset
+
+    white_list_formats = {'png', 'jpg', 'jpeg', 'bmp', 'ppm', 'tif', 'tiff'}
 
     # first, count the number of samples and classes
     self.samples = 0
@@ -1217,7 +1316,8 @@ class DirectoryIterator(Iterator):
     function_partial = partial(
         _count_valid_files_in_directory,
         white_list_formats=white_list_formats,
-        follow_links=follow_links)
+        follow_links=follow_links,
+        split=split)
     self.samples = sum(
         pool.map(function_partial,
                  (os.path.join(directory, subdir) for subdir in classes)))
@@ -1233,14 +1333,15 @@ class DirectoryIterator(Iterator):
     i = 0
     for dirpath in (os.path.join(directory, subdir) for subdir in classes):
       results.append(
-          pool.apply_async(
-              _list_valid_filenames_in_directory,
-              (dirpath, white_list_formats, self.class_indices, follow_links)))
+          pool.apply_async(_list_valid_filenames_in_directory,
+                           (dirpath, white_list_formats, split,
+                            self.class_indices, follow_links)))
     for res in results:
       classes, filenames = res.get()
       self.classes[i:i + len(classes)] = classes
       self.filenames += filenames
       i += len(classes)
+
     pool.close()
     pool.join()
     super(DirectoryIterator, self).__init__(self.samples, batch_size, shuffle,
diff --git a/tensorflow/python/keras/_impl/keras/preprocessing/image_test.py b/tensorflow/python/keras/_impl/keras/preprocessing/image_test.py
index c0790b5a5140193b18907d9375530f4f06e137da..001fee91f9ed609c0b3cd88d4079e75c0e585b02 100644
--- a/tensorflow/python/keras/_impl/keras/preprocessing/image_test.py
+++ b/tensorflow/python/keras/_impl/keras/preprocessing/image_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import os
 import shutil
+import tempfile
 
 import numpy as np
 
@@ -74,6 +75,7 @@ class TestImage(test.TestCase):
           shear_range=0.5,
           zoom_range=0.2,
           channel_shift_range=0.,
+          brightness_range=(1, 5),
           fill_mode='nearest',
           cval=0.5,
           horizontal_flip=True,
@@ -92,6 +94,47 @@ class TestImage(test.TestCase):
         self.assertEqual(x.shape[1:], images.shape[1:])
         break
 
+  def test_image_data_generator_with_validation_split(self):
+    if PIL is None:
+      return  # Skip test if PIL is not available.
+
+    for test_images in _generate_test_images():
+      img_list = []
+      for im in test_images:
+        img_list.append(keras.preprocessing.image.img_to_array(im)[None, ...])
+
+      images = np.vstack(img_list)
+      generator = keras.preprocessing.image.ImageDataGenerator(
+          validation_split=0.5)
+      seq = generator.flow(
+          images,
+          np.arange(images.shape[0]),
+          shuffle=False,
+          batch_size=3,
+          subset='validation')
+      _, y = seq[0]
+      self.assertEqual(list(y), [0, 1, 2])
+      seq = generator.flow(
+          images,
+          np.arange(images.shape[0]),
+          shuffle=False,
+          batch_size=3,
+          subset='training')
+      _, y2 = seq[0]
+      self.assertEqual(list(y2), [4, 5, 6])
+
+      with self.assertRaises(ValueError):
+        generator.flow(
+            images,
+            np.arange(images.shape[0]),
+            shuffle=False,
+            batch_size=3,
+            subset='foo')
+
+  def test_image_data_generator_with_split_value_error(self):
+    with self.assertRaises(ValueError):
+      keras.preprocessing.image.ImageDataGenerator(validation_split=5)
+
   def test_image_data_generator_invalid_data(self):
     generator = keras.preprocessing.image.ImageDataGenerator(
         featurewise_center=True,
@@ -202,9 +245,80 @@ class TestImage(test.TestCase):
     # check number of classes and images
     self.assertEqual(len(dir_iterator.class_indices), num_classes)
     self.assertEqual(len(dir_iterator.classes), count)
-    self.assertEqual(sorted(dir_iterator.filenames), sorted(filenames))
+    self.assertEqual(set(dir_iterator.filenames), set(filenames))
     _ = dir_iterator.next()
 
+  def directory_iterator_with_validation_split_test_helper(
+      self, validation_split):
+    if PIL is None:
+      return  # Skip test if PIL is not available.
+
+    num_classes = 2
+    tmp_folder = tempfile.mkdtemp(prefix='test_images')
+
+    # create folders and subfolders
+    paths = []
+    for cl in range(num_classes):
+      class_directory = 'class-{}'.format(cl)
+      classpaths = [
+          class_directory,
+          os.path.join(class_directory, 'subfolder-1'),
+          os.path.join(class_directory, 'subfolder-2'),
+          os.path.join(class_directory, 'subfolder-1', 'sub-subfolder')
+      ]
+      for path in classpaths:
+        os.mkdir(os.path.join(tmp_folder, path))
+      paths.append(classpaths)
+
+    # save the images in the paths
+    count = 0
+    filenames = []
+    for test_images in _generate_test_images():
+      for im in test_images:
+        # rotate image class
+        im_class = count % num_classes
+        # rotate subfolders
+        classpaths = paths[im_class]
+        filename = os.path.join(classpaths[count % len(classpaths)],
+                                'image-{}.jpg'.format(count))
+        filenames.append(filename)
+        im.save(os.path.join(tmp_folder, filename))
+        count += 1
+
+    # create iterator
+    generator = keras.preprocessing.image.ImageDataGenerator(
+        validation_split=validation_split)
+
+    with self.assertRaises(ValueError):
+      generator.flow_from_directory(tmp_folder, subset='foo')
+
+    num_validation = int(count * validation_split)
+    num_training = count - num_validation
+    train_iterator = generator.flow_from_directory(
+        tmp_folder, subset='training')
+    self.assertEqual(train_iterator.samples, num_training)
+
+    valid_iterator = generator.flow_from_directory(
+        tmp_folder, subset='validation')
+    self.assertEqual(valid_iterator.samples, num_validation)
+
+    # check number of classes and images
+    self.assertEqual(len(train_iterator.class_indices), num_classes)
+    self.assertEqual(len(train_iterator.classes), num_training)
+    self.assertEqual(
+        len(set(train_iterator.filenames) & set(filenames)), num_training)
+
+    shutil.rmtree(tmp_folder)
+
+  def test_directory_iterator_with_validation_split_25_percent(self):
+    self.directory_iterator_with_validation_split_test_helper(0.25)
+
+  def test_directory_iterator_with_validation_split_40_percent(self):
+    self.directory_iterator_with_validation_split_test_helper(0.40)
+
+  def test_directory_iterator_with_validation_split_50_percent(self):
+    self.directory_iterator_with_validation_split_test_helper(0.50)
+
   def test_img_utils(self):
     if PIL is None:
       return  # Skip test if PIL is not available.
@@ -241,6 +355,41 @@ class TestImage(test.TestCase):
     x = keras.preprocessing.image.img_to_array(img, data_format='channels_last')
     self.assertEqual(x.shape, (height, width, 1))
 
+  def test_batch_standardize(self):
+    if PIL is None:
+      return  # Skip test if PIL is not available.
+
+    # ImageDataGenerator.standardize should work on batches
+    for test_images in _generate_test_images():
+      img_list = []
+      for im in test_images:
+        img_list.append(keras.preprocessing.image.img_to_array(im)[None, ...])
+
+      images = np.vstack(img_list)
+      generator = keras.preprocessing.image.ImageDataGenerator(
+          featurewise_center=True,
+          samplewise_center=True,
+          featurewise_std_normalization=True,
+          samplewise_std_normalization=True,
+          zca_whitening=True,
+          rotation_range=90.,
+          width_shift_range=0.1,
+          height_shift_range=0.1,
+          shear_range=0.5,
+          zoom_range=0.2,
+          channel_shift_range=0.,
+          brightness_range=(1, 5),
+          fill_mode='nearest',
+          cval=0.5,
+          horizontal_flip=True,
+          vertical_flip=True)
+      generator.fit(images, augment=True)
+
+      transformed = np.copy(images)
+      for i, im in enumerate(transformed):
+        transformed[i] = generator.random_transform(im)
+      transformed = generator.standardize(transformed)
+
   def test_img_transforms(self):
     x = np.random.random((3, 200, 200))
     _ = keras.preprocessing.image.random_rotation(x, 20)
diff --git a/tensorflow/python/keras/_impl/keras/preprocessing/sequence.py b/tensorflow/python/keras/_impl/keras/preprocessing/sequence.py
index a423d96d3d8578df347b7ee36fb53dfd335e0d65..e68c171d9c7e33d7e932f5d5b7f15859faa2348b 100644
--- a/tensorflow/python/keras/_impl/keras/preprocessing/sequence.py
+++ b/tensorflow/python/keras/_impl/keras/preprocessing/sequence.py
@@ -22,6 +22,8 @@ import random
 
 import numpy as np
 from six.moves import range  # pylint: disable=redefined-builtin
+
+from tensorflow.python.keras._impl.keras.utils.data_utils import Sequence
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -32,29 +34,40 @@ def pad_sequences(sequences,
                   padding='pre',
                   truncating='pre',
                   value=0.):
-  """Pads each sequence to the same length (length of the longest sequence).
+  """Pads sequences to the same length.
+
+  This function transforms a list of
+  `num_samples` sequences (lists of integers)
+  into a 2D Numpy array of shape `(num_samples, num_timesteps)`.
+  `num_timesteps` is either the `maxlen` argument if provided,
+  or the length of the longest sequence otherwise.
+
+  Sequences that are shorter than `num_timesteps`
+  are padded with `value` at the end.
 
-  If maxlen is provided, any sequence longer
-  than maxlen is truncated to maxlen.
-  Truncation happens off either the beginning (default) or
-  the end of the sequence.
+  Sequences longer than `num_timesteps` are truncated
+  so that they fit the desired length.
+  The position where padding or truncation happens is determined by
+  the arguments `padding` and `truncating`, respectively.
 
-  Supports post-padding and pre-padding (default).
+  Pre-padding is the default.
 
   Arguments:
-      sequences: list of lists where each element is a sequence
-      maxlen: int, maximum length
-      dtype: type to cast the resulting sequence.
-      padding: 'pre' or 'post', pad either before or after each sequence.
-      truncating: 'pre' or 'post', remove values from sequences larger than
-          maxlen either in the beginning or in the end of the sequence
-      value: float, value to pad the sequences to the desired value.
+      sequences: List of lists, where each element is a sequence.
+      maxlen: Int, maximum length of all sequences.
+      dtype: Type of the output sequences.
+      padding: String, 'pre' or 'post':
+          pad either before or after each sequence.
+      truncating: String, 'pre' or 'post':
+          remove values from sequences larger than
+          `maxlen`, either at the beginning or at the end of the sequences.
+      value: Float, padding value.
 
   Returns:
-      x: numpy array with dimensions (number_of_sequences, maxlen)
+      x: Numpy array with shape `(len(sequences), maxlen)`
 
   Raises:
-      ValueError: in case of invalid values for `truncating` or `padding`,
+      ValueError: In case of invalid values for `truncating` or `padding`,
           or in case of invalid shape for a `sequences` entry.
   """
   if not hasattr(sequences, '__len__'):
@@ -92,10 +105,9 @@ def pad_sequences(sequences,
     # check `trunc` has expected shape
     trunc = np.asarray(trunc, dtype=dtype)
     if trunc.shape[1:] != sample_shape:
-      raise ValueError(
-          'Shape of sample %s of sequence at position %s is different from '
-          'expected shape %s'
-          % (trunc.shape[1:], idx, sample_shape))
+      raise ValueError('Shape of sample %s of sequence at position %s '
+                       'is different from expected shape %s' %
+                       (trunc.shape[1:], idx, sample_shape))
 
     if padding == 'post':
       x[idx, :len(trunc)] = trunc
@@ -110,22 +122,26 @@ def pad_sequences(sequences,
 def make_sampling_table(size, sampling_factor=1e-5):
   """Generates a word rank-based probabilistic sampling table.
 
-  This generates an array where the ith element
-  is the probability that a word of rank i would be sampled,
-  according to the sampling distribution used in word2vec.
+  Used for generating the `sampling_table` argument for `skipgrams`.
+  `sampling_table[i]` is the probability of sampling
+  the word i-th most common word in a dataset
+  (more common words should be sampled less frequently, for balance).
 
-  The word2vec formula is:
-      p(word) = min(1, sqrt(word.frequency/sampling_factor) /
-      (word.frequency/sampling_factor))
+  The sampling probabilities are generated according
+  to the sampling distribution used in word2vec:
+
+  `p(word) = min(1, sqrt(word_frequency / sampling_factor) / (word_frequency /
+  sampling_factor))`
 
   We assume that the word frequencies follow Zipf's law (s=1) to derive
   a numerical approximation of frequency(rank):
-     frequency(rank) ~ 1/(rank * (log(rank) + gamma) + 1/2 - 1/(12*rank))
-      where gamma is the Euler-Mascheroni constant.
+
+  `frequency(rank) ~ 1/(rank * (log(rank) + gamma) + 1/2 - 1/(12*rank))`
+  where `gamma` is the Euler-Mascheroni constant.
 
   Arguments:
-      size: int, number of possible words to sample.
-      sampling_factor: the sampling factor in the word2vec formula.
+      size: Int, number of possible words to sample.
+      sampling_factor: The sampling factor in the word2vec formula.
 
   Returns:
       A 1D Numpy array of length `size` where the ith entry
@@ -151,30 +167,37 @@ def skipgrams(sequence,
               seed=None):
   """Generates skipgram word pairs.
 
-  Takes a sequence (list of indexes of words),
-  returns couples of [word_index, other_word index] and labels (1s or 0s),
-  where label = 1 if 'other_word' belongs to the context of 'word',
-  and label=0 if 'other_word' is randomly sampled
+  This function transforms a sequence of word indexes (list of integers)
+  into tuples of words of the form:
+
+  - (word, word in the same window), with label 1 (positive samples).
+  - (word, random word from the vocabulary), with label 0 (negative samples).
+
+  Read more about Skipgram in this gnomic paper by Mikolov et al.:
+  [Efficient Estimation of Word Representations in
+  Vector Space](http://arxiv.org/pdf/1301.3781v3.pdf)
 
   Arguments:
-      sequence: a word sequence (sentence), encoded as a list
+      sequence: A word sequence (sentence), encoded as a list
           of word indices (integers). If using a `sampling_table`,
           word indices are expected to match the rank
           of the words in a reference dataset (e.g. 10 would encode
           the 10-th most frequently occurring token).
           Note that index 0 is expected to be a non-word and will be skipped.
-      vocabulary_size: int. maximum possible word index + 1
-      window_size: int. actually half-window.
-          The window of a word wi will be [i-window_size, i+window_size+1]
-      negative_samples: float >= 0. 0 for no negative (=random) samples.
-          1 for same number as positive samples. etc.
-      shuffle: whether to shuffle the word couples before returning them.
+      vocabulary_size: Int, maximum possible word index + 1
+      window_size: Int, size of sampling windows (technically half-window).
+          The window of a word `w_i` will be
+          `[i - window_size, i + window_size+1]`.
+      negative_samples: Float >= 0. 0 for no negative (i.e. random) samples.
+          1 for same number as positive samples.
+      shuffle: Whether to shuffle the word couples before returning them.
       categorical: bool. if False, labels will be
-          integers (eg. [0, 1, 1 .. ]),
-          if True labels will be categorical eg. [[1,0],[0,1],[0,1] .. ]
+          integers (eg. `[0, 1, 1 .. ]`),
+          if `True`, labels will be categorical, e.g.
+          `[[1,0],[0,1],[0,1] .. ]`.
       sampling_table: 1D array of size `vocabulary_size` where the entry i
           encodes the probability to sample a word of rank i.
-      seed: random seed.
+      seed: Random seed.
 
   Returns:
       couples, labels: where `couples` are int pairs and
@@ -234,9 +257,9 @@ def _remove_long_seq(maxlen, seq, label):
   """Removes sequences that exceed the maximum length.
 
   Arguments:
-      maxlen: int, maximum length
-      seq: list of lists where each sublist is a sequence
-      label: list where each element is an integer
+      maxlen: Int, maximum length of the output sequences.
+      seq: List of lists, where each sublist is a sequence.
+      label: List where each element is an integer.
 
   Returns:
       new_seq, new_label: shortened lists for `seq` and `label`.
@@ -247,3 +270,120 @@ def _remove_long_seq(maxlen, seq, label):
       new_seq.append(x)
       new_label.append(y)
   return new_seq, new_label
+
+
+@tf_export('keras.preprocessing.sequence.TimeseriesGenerator')
+class TimeseriesGenerator(Sequence):
+  """Utility class for generating batches of temporal data.
+
+  This class takes in a sequence of data-points gathered at
+  equal intervals, along with time series parameters such as
+  stride, length of history, etc., to produce batches for
+  training/validation.
+
+  Arguments:
+      data: Indexable generator (such as list or Numpy array)
+          containing consecutive data points (timesteps).
+          The data should be at 2D, and axis 0 is expected
+          to be the time dimension.
+      targets: Targets corresponding to timesteps in `data`.
+          It should have same length as `data`.
+      length: Length of the output sequences (in number of timesteps).
+      sampling_rate: Period between successive individual timesteps
+          within sequences. For rate `r`, timesteps
+          `data[i]`, `data[i-r]`, ... `data[i - length]`
+          are used for create a sample sequence.
+      stride: Period between successive output sequences.
+          For stride `s`, consecutive output samples would
+          be centered around `data[i]`, `data[i+s]`, `data[i+2*s]`, etc.
+      start_index, end_index: Data points earlier than `start_index`
+          or later than `end_index` will not be used in the output sequences.
+          This is useful to reserve part of the data for test or validation.
+      shuffle: Whether to shuffle output samples,
+          or instead draw them in chronological order.
+      reverse: Boolean: if `true`, timesteps in each output sample will be
+          in reverse chronological order.
+      batch_size: Number of timeseries samples in each batch
+          (except maybe the last one).
+
+  Returns:
+      A [Sequence](/utils/#sequence) instance.
+
+  Examples:
+
+  ```python
+  from keras.preprocessing.sequence import TimeseriesGenerator
+  import numpy as np
+
+  data = np.array([[i] for i in range(50)])
+  targets = np.array([[i] for i in range(50)])
+
+  data_gen = TimeseriesGenerator(data, targets,
+                                 length=10, sampling_rate=2,
+                                 batch_size=2)
+  assert len(data_gen) == 20
+
+  batch_0 = data_gen[0]
+  x, y = batch_0
+  assert np.array_equal(x,
+                        np.array([[[0], [2], [4], [6], [8]],
+                                  [[1], [3], [5], [7], [9]]]))
+  assert np.array_equal(y,
+                        np.array([[10], [11]]))
+  ```
+  """
+
+  def __init__(self,
+               data,
+               targets,
+               length,
+               sampling_rate=1,
+               stride=1,
+               start_index=0,
+               end_index=None,
+               shuffle=False,
+               reverse=False,
+               batch_size=128):
+    self.data = data
+    self.targets = targets
+    self.length = length
+    self.sampling_rate = sampling_rate
+    self.stride = stride
+    self.start_index = start_index + length
+    if end_index is None:
+      end_index = len(data) - 1
+    self.end_index = end_index
+    self.shuffle = shuffle
+    self.reverse = reverse
+    self.batch_size = batch_size
+
+  def __len__(self):
+    length = int(
+        np.ceil((self.end_index - self.start_index) /
+                (self.batch_size * self.stride)))
+    return length if length >= 0 else 0
+
+  def _empty_batch(self, num_rows):
+    samples_shape = [num_rows, self.length // self.sampling_rate]
+    samples_shape.extend(self.data.shape[1:])
+    targets_shape = [num_rows]
+    targets_shape.extend(self.targets.shape[1:])
+    return np.empty(samples_shape), np.empty(targets_shape)
+
+  def __getitem__(self, index):
+    if self.shuffle:
+      rows = np.random.randint(
+          self.start_index, self.end_index, size=self.batch_size)
+    else:
+      i = self.start_index + self.batch_size * self.stride * index
+      rows = np.arange(i, min(i + self.batch_size * self.stride,
+                              self.end_index), self.stride)
+
+    samples, targets = self._empty_batch(len(rows))
+    for j in range(len(rows)):
+      indices = range(rows[j] - self.length, rows[j], self.sampling_rate)
+      samples[j] = self.data[indices]
+      targets[j] = self.targets[rows[j]]
+    if self.reverse:
+      return samples[:, ::-1, ...], targets
+    return samples, targets
diff --git a/tensorflow/python/keras/_impl/keras/preprocessing/sequence_test.py b/tensorflow/python/keras/_impl/keras/preprocessing/sequence_test.py
index 4529e6e94fc42661fb0474c1a827863ddb654776..b9bfdd000484665e8771f4bccef59738e5c26120 100644
--- a/tensorflow/python/keras/_impl/keras/preprocessing/sequence_test.py
+++ b/tensorflow/python/keras/_impl/keras/preprocessing/sequence_test.py
@@ -84,15 +84,91 @@ class TestSequence(test.TestCase):
     couples, labels = keras.preprocessing.sequence.skipgrams(
         np.arange(3), vocabulary_size=3)
     for couple in couples:
-      assert couple[0] in [0, 1, 2] and couple[1] in [0, 1, 2]
+      self.assertIn(couple[0], [0, 1, 2])
+      self.assertIn(couple[1], [0, 1, 2])
 
     # test window size and categorical labels
     couples, labels = keras.preprocessing.sequence.skipgrams(
         np.arange(5), vocabulary_size=5, window_size=1, categorical=True)
     for couple in couples:
-      assert couple[0] - couple[1] <= 3
+      self.assertLessEqual(couple[0] - couple[1], 3)
     for l in labels:
-      assert len(l) == 2
+      self.assertEqual(len(l), 2)
+
+  def test_TimeseriesGenerator(self):
+    data = np.array([[i] for i in range(50)])
+    targets = np.array([[i] for i in range(50)])
+
+    data_gen = keras.preprocessing.sequence.TimeseriesGenerator(
+        data, targets, length=10, sampling_rate=2, batch_size=2)
+    self.assertEqual(len(data_gen), 20)
+    self.assertAllClose(data_gen[0][0],
+                        np.array([[[0], [2], [4], [6], [8]], [[1], [3], [5],
+                                                              [7], [9]]]))
+    self.assertAllClose(data_gen[0][1], np.array([[10], [11]]))
+    self.assertAllClose(data_gen[1][0],
+                        np.array([[[2], [4], [6], [8], [10]], [[3], [5], [7],
+                                                               [9], [11]]]))
+    self.assertAllClose(data_gen[1][1], np.array([[12], [13]]))
+
+    data_gen = keras.preprocessing.sequence.TimeseriesGenerator(
+        data, targets, length=10, sampling_rate=2, reverse=True, batch_size=2)
+    self.assertEqual(len(data_gen), 20)
+    self.assertAllClose(data_gen[0][0],
+                        np.array([[[8], [6], [4], [2], [0]], [[9], [7], [5],
+                                                              [3], [1]]]))
+    self.assertAllClose(data_gen[0][1], np.array([[10], [11]]))
+
+    data_gen = keras.preprocessing.sequence.TimeseriesGenerator(
+        data, targets, length=10, sampling_rate=2, shuffle=True, batch_size=1)
+    batch = data_gen[0]
+    r = batch[1][0][0]
+    self.assertAllClose(batch[0],
+                        np.array([[[r - 10], [r - 8], [r - 6], [r - 4],
+                                   [r - 2]]]))
+    self.assertAllClose(batch[1], np.array([
+        [r],
+    ]))
+
+    data_gen = keras.preprocessing.sequence.TimeseriesGenerator(
+        data, targets, length=10, sampling_rate=2, stride=2, batch_size=2)
+    self.assertEqual(len(data_gen), 10)
+    self.assertAllClose(data_gen[1][0],
+                        np.array([[[4], [6], [8], [10], [12]], [[6], [8], [10],
+                                                                [12], [14]]]))
+    self.assertAllClose(data_gen[1][1], np.array([[14], [16]]))
+
+    data_gen = keras.preprocessing.sequence.TimeseriesGenerator(
+        data,
+        targets,
+        length=10,
+        sampling_rate=2,
+        start_index=10,
+        end_index=30,
+        batch_size=2)
+    self.assertEqual(len(data_gen), 5)
+    self.assertAllClose(data_gen[0][0],
+                        np.array([[[10], [12], [14], [16], [18]],
+                                  [[11], [13], [15], [17], [19]]]))
+    self.assertAllClose(data_gen[0][1], np.array([[20], [21]]))
+
+    data = np.array([np.random.random_sample((1, 2, 3, 4)) for i in range(50)])
+    targets = np.array([np.random.random_sample((3, 2, 1)) for i in range(50)])
+    data_gen = keras.preprocessing.sequence.TimeseriesGenerator(
+        data,
+        targets,
+        length=10,
+        sampling_rate=2,
+        start_index=10,
+        end_index=30,
+        batch_size=2)
+
+    self.assertEqual(len(data_gen), 5)
+    self.assertAllClose(data_gen[0][0],
+                        np.array(
+                            [np.array(data[10:19:2]),
+                             np.array(data[11:20:2])]))
+    self.assertAllClose(data_gen[0][1], np.array([targets[20], targets[21]]))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/_impl/keras/preprocessing/text.py b/tensorflow/python/keras/_impl/keras/preprocessing/text.py
index 1e3828ccf1e3bf9c443691e1c1da5697bedb4653..f652f318f3d6dae20b1113a50cd02930abb851af 100644
--- a/tensorflow/python/keras/_impl/keras/preprocessing/text.py
+++ b/tensorflow/python/keras/_impl/keras/preprocessing/text.py
@@ -91,6 +91,7 @@ def one_hot(text,
       text, n, hash_function=hash, filters=filters, lower=lower, split=split)
 
 
+@tf_export('keras.preprocessing.text.hashing_trick')
 def hashing_trick(text,
                   n,
                   hash_function=None,
@@ -187,21 +188,27 @@ class Tokenizer(object):
     self.document_count = 0
     self.char_level = char_level
     self.oov_token = oov_token
+    self.index_docs = {}
 
   def fit_on_texts(self, texts):
     """Updates internal vocabulary based on a list of texts.
 
+    In the case where texts contains lists, we assume each entry of the lists
+    to be a token.
+
     Required before using `texts_to_sequences` or `texts_to_matrix`.
 
     Arguments:
         texts: can be a list of strings,
-            or a generator of strings (for memory-efficiency)
+            a generator of strings (for memory-efficiency),
+            or a list of list of strings.
     """
-    self.document_count = 0
     for text in texts:
       self.document_count += 1
-      seq = text if self.char_level else text_to_word_sequence(
-          text, self.filters, self.lower, self.split)
+      if self.char_level or isinstance(text, list):
+        seq = text
+      else:
+        seq = text_to_word_sequence(text, self.filters, self.lower, self.split)
       for w in seq:
         if w in self.word_counts:
           self.word_counts[w] += 1
@@ -226,7 +233,6 @@ class Tokenizer(object):
       if i is None:
         self.word_index[self.oov_token] = len(self.word_index) + 1
 
-    self.index_docs = {}
     for w, c in list(self.word_docs.items()):
       self.index_docs[self.word_index[w]] = c
 
@@ -240,8 +246,7 @@ class Tokenizer(object):
         sequences: A list of sequence.
             A "sequence" is a list of integer word indices.
     """
-    self.document_count = len(sequences)
-    self.index_docs = {}
+    self.document_count += len(sequences)
     for seq in sequences:
       seq = set(seq)
       for i in seq:
@@ -268,7 +273,11 @@ class Tokenizer(object):
     return res
 
   def texts_to_sequences_generator(self, texts):
-    """Transforms each text in texts in a sequence of integers.
+    """Transforms each text in `texts` in a sequence of integers.
+
+    Each item in texts can also be a list, in which case we assume each item of
+    that list
+    to be a token.
 
     Only top "num_words" most frequent words will be taken into account.
     Only words known by the tokenizer will be taken into account.
@@ -281,8 +290,10 @@ class Tokenizer(object):
     """
     num_words = self.num_words
     for text in texts:
-      seq = text if self.char_level else text_to_word_sequence(
-          text, self.filters, self.lower, self.split)
+      if self.char_level or isinstance(text, list):
+        seq = text
+      else:
+        seq = text_to_word_sequence(text, self.filters, self.lower, self.split)
       vect = []
       for w in seq:
         i = self.word_index.get(w)
diff --git a/tensorflow/python/keras/_impl/keras/preprocessing/text_test.py b/tensorflow/python/keras/_impl/keras/preprocessing/text_test.py
index a934e331c4a14d9bd170258b6b6183e6a15bb561..c6a267e57e4e2dc04156483d1cf85a42a78eb395 100644
--- a/tensorflow/python/keras/_impl/keras/preprocessing/text_test.py
+++ b/tensorflow/python/keras/_impl/keras/preprocessing/text_test.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -80,17 +81,52 @@ class TestText(test.TestCase):
     x_train = ['This text has only known words']
     x_test = ['This text has some unknown words']  # 2 OOVs: some, unknown
 
-    # Defalut, without OOV flag
+    # Default, without OOV flag
     tokenizer = keras.preprocessing.text.Tokenizer()
     tokenizer.fit_on_texts(x_train)
     x_test_seq = tokenizer.texts_to_sequences(x_test)
-    assert len(x_test_seq[0]) == 4  # discards 2 OOVs
+    self.assertEqual(len(x_test_seq[0]), 4)  # discards 2 OOVs
 
     # With OOV feature
     tokenizer = keras.preprocessing.text.Tokenizer(oov_token='<unk>')
     tokenizer.fit_on_texts(x_train)
     x_test_seq = tokenizer.texts_to_sequences(x_test)
-    assert len(x_test_seq[0]) == 6  # OOVs marked in place
+    self.assertEqual(len(x_test_seq[0]), 6)  # OOVs marked in place
+
+  def test_sequential_fit(self):
+    texts = [
+        'The cat sat on the mat.', 'The dog sat on the log.',
+        'Dogs and cats living together.'
+    ]
+    word_sequences = [['The', 'cat', 'is', 'sitting'],
+                      ['The', 'dog', 'is', 'standing']]
+    tokenizer = keras.preprocessing.text.Tokenizer()
+    tokenizer.fit_on_texts(texts)
+    tokenizer.fit_on_texts(word_sequences)
+
+    self.assertEqual(tokenizer.document_count, 5)
+
+    tokenizer.texts_to_matrix(texts)
+    tokenizer.texts_to_matrix(word_sequences)
+
+  def test_text_to_word_sequence(self):
+    text = 'hello! ? world!'
+    seq = keras.preprocessing.text.text_to_word_sequence(text)
+    self.assertEqual(seq, ['hello', 'world'])
+
+  def test_text_to_word_sequence_unicode(self):
+    text = u'ali! veli? kırk dokuz elli'
+    seq = keras.preprocessing.text.text_to_word_sequence(text)
+    self.assertEqual(seq, [u'ali', u'veli', u'kırk', u'dokuz', u'elli'])
+
+  def test_tokenizer_unicode(self):
+    texts = [
+        u'ali veli kırk dokuz elli', u'ali veli kırk dokuz elli veli kırk dokuz'
+    ]
+    tokenizer = keras.preprocessing.text.Tokenizer(num_words=5)
+    tokenizer.fit_on_texts(texts)
+
+    self.assertEqual(len(tokenizer.word_counts), 5)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/_impl/keras/regularizers.py b/tensorflow/python/keras/_impl/keras/regularizers.py
index 2c30844647acdb78d1ca31d052ec7e5ecc6dcc2a..74c37d370ea630ca3c3e5e0945828f63928572e1 100644
--- a/tensorflow/python/keras/_impl/keras/regularizers.py
+++ b/tensorflow/python/keras/_impl/keras/regularizers.py
@@ -23,6 +23,7 @@ import six
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_keras_object
+from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -55,9 +56,9 @@ class L1L2(Regularizer):
   def __call__(self, x):
     regularization = 0.
     if self.l1:
-      regularization += K.sum(self.l1 * K.abs(x))
+      regularization += math_ops.reduce_sum(self.l1 * math_ops.abs(x))
     if self.l2:
-      regularization += K.sum(self.l2 * K.square(x))
+      regularization += math_ops.reduce_sum(self.l2 * math_ops.square(x))
     return regularization
 
   def get_config(self):
diff --git a/tensorflow/python/keras/_impl/keras/utils/layer_utils.py b/tensorflow/python/keras/_impl/keras/utils/layer_utils.py
index 4c8009dfd80e1aec457fa03687f2840c7fe4607b..902972ecbb8fd69a9252b7e19e32bee5e33e4f97 100644
--- a/tensorflow/python/keras/_impl/keras/utils/layer_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/layer_utils.py
@@ -35,7 +35,7 @@ def count_params(weights):
   Returns:
       The total number of scalars composing the weights
   """
-  return int(np.sum([K.count_params(p) for p in set(weights)]))
+  return int(np.sum([np.prod(p.get_shape().as_list()) for p in set(weights)]))
 
 
 def print_summary(model, line_length=None, positions=None, print_fn=None):
@@ -193,8 +193,7 @@ def print_summary(model, line_length=None, positions=None, print_fn=None):
   else:
     trainable_count = count_params(model.trainable_weights)
 
-  non_trainable_count = int(
-      np.sum([K.count_params(p) for p in set(model.non_trainable_weights)]))
+  non_trainable_count = count_params(model.non_trainable_weights)
 
   print_fn('Total params: {:,}'.format(trainable_count + non_trainable_count))
   print_fn('Trainable params: {:,}'.format(trainable_count))
diff --git a/tensorflow/python/keras/_impl/keras/utils/vis_utils.py b/tensorflow/python/keras/_impl/keras/utils/vis_utils.py
index 45c1b92075c50956fee004409e98898411e83d27..4761cece82c727e4962d0374f8efb80dfaeac3c6 100644
--- a/tensorflow/python/keras/_impl/keras/utils/vis_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/vis_utils.py
@@ -120,7 +120,7 @@ def model_to_dot(model, show_shapes=False, show_layer_names=True, rankdir='TB'):
     layer_id = str(id(layer))
     for i, node in enumerate(layer._inbound_nodes):
       node_key = layer.name + '_ib-' + str(i)
-      if node_key in model._container_nodes:
+      if node_key in model._network_nodes:  # pylint: disable=protected-access
         for inbound_layer in node.inbound_layers:
           inbound_layer_id = str(id(inbound_layer))
           layer_id = str(id(layer))
diff --git a/tensorflow/python/keras/preprocessing/image/__init__.py b/tensorflow/python/keras/preprocessing/image/__init__.py
index b96e7675527041d3952b049f5f431d3df36eea4c..6aba5fc8252e1acf604a89a4e66c2a7db080aa73 100644
--- a/tensorflow/python/keras/preprocessing/image/__init__.py
+++ b/tensorflow/python/keras/preprocessing/image/__init__.py
@@ -27,6 +27,7 @@ from tensorflow.python.keras._impl.keras.preprocessing.image import img_to_array
 from tensorflow.python.keras._impl.keras.preprocessing.image import Iterator
 from tensorflow.python.keras._impl.keras.preprocessing.image import load_img
 from tensorflow.python.keras._impl.keras.preprocessing.image import NumpyArrayIterator
+from tensorflow.python.keras._impl.keras.preprocessing.image import random_brightness
 from tensorflow.python.keras._impl.keras.preprocessing.image import random_channel_shift
 from tensorflow.python.keras._impl.keras.preprocessing.image import random_rotation
 from tensorflow.python.keras._impl.keras.preprocessing.image import random_shear
diff --git a/tensorflow/python/keras/preprocessing/sequence/__init__.py b/tensorflow/python/keras/preprocessing/sequence/__init__.py
index 112f6af5e588bcb2e85fdbecea86f402742d44e7..b7a7149cc40654c878e3c0db1fc78d8912abf498 100644
--- a/tensorflow/python/keras/preprocessing/sequence/__init__.py
+++ b/tensorflow/python/keras/preprocessing/sequence/__init__.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 from tensorflow.python.keras._impl.keras.preprocessing.sequence import make_sampling_table
 from tensorflow.python.keras._impl.keras.preprocessing.sequence import pad_sequences
 from tensorflow.python.keras._impl.keras.preprocessing.sequence import skipgrams
+from tensorflow.python.keras._impl.keras.preprocessing.sequence import TimeseriesGenerator
 
 del absolute_import
 del division
diff --git a/tensorflow/python/keras/preprocessing/text/__init__.py b/tensorflow/python/keras/preprocessing/text/__init__.py
index 5bf1a2fb21dc27f7aa10cd08b1496e3991c61d2f..000ad68a0c01e9067f8852836ba5d502deb3fcd4 100644
--- a/tensorflow/python/keras/preprocessing/text/__init__.py
+++ b/tensorflow/python/keras/preprocessing/text/__init__.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.keras._impl.keras.preprocessing.text import hashing_trick
 from tensorflow.python.keras._impl.keras.preprocessing.text import one_hot
 from tensorflow.python.keras._impl.keras.preprocessing.text import text_to_word_sequence
 from tensorflow.python.keras._impl.keras.preprocessing.text import Tokenizer
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index f27ca5c2051be81eb94488ba388c11bcfebafc4b..6c34ea181654c29da74164f0e220b2b9ee8d939e 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -392,6 +392,7 @@ tf_py_test(
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:nn_ops_gen",
     ],
+    shard_count = 5,
 )
 
 tf_py_test(
@@ -407,6 +408,7 @@ tf_py_test(
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:nn_ops_gen",
     ],
+    shard_count = 5,
 )
 
 tf_py_test(
@@ -1086,6 +1088,7 @@ cuda_py_test(
     tags = [
         "no_windows",
         "noasan",
+        "noguitar",
         "notap",
     ],
 )
@@ -1564,7 +1567,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "init_ops_test",
-    size = "small",
+    size = "medium",
     srcs = ["init_ops_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -1906,7 +1909,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "softmax_op_test",
-    size = "small",
+    size = "medium",
     srcs = ["softmax_op_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -2516,7 +2519,10 @@ cuda_py_test(
         "//tensorflow/python:sparse_ops",
     ],
     shard_count = 5,
-    tags = ["noasan"],
+    tags = [
+        "noasan",
+        "optonly",  # b/77589990
+    ],
 )
 
 cuda_py_test(
@@ -2717,7 +2723,9 @@ cuda_py_test(
         "//tensorflow/python:linalg_ops",
         "//tensorflow/python:math_ops",
     ],
+    data = ["//tensorflow/python/kernel_tests/testdata:self_adjoint_eig_op_test_files"],
     shard_count = 20,
+    tags = ["no_windows"],
 )
 
 cuda_py_test(
@@ -2937,15 +2945,3 @@ tf_py_test(
         "//tensorflow/python/eager:tape",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py
index d0ba8020c1eaee74ded5ad67cae39b51d44097bd..5a20eebbc559cf6a3cad97adb8aa056cb88719cb 100644
--- a/tensorflow/python/kernel_tests/array_ops_test.py
+++ b/tensorflow/python/kernel_tests/array_ops_test.py
@@ -315,21 +315,39 @@ class ReverseV2Test(test_util.TensorFlowTestCase):
             self.assertAllEqual(x_tf_4, np.asarray(x_np)[:, ::-1])
             self.assertAllEqual(x_tf_5, np.asarray(x_np)[::-1, ::-1])
 
+  # This test covers the axis validation in the shape function
+  # (no eval())
+  def testInvalidAxis(self):
+    x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)
+    with self.assertRaisesRegexp(ValueError,
+                                 "is out of valid range"):
+      array_ops.reverse_v2(x_np, [-30])
+    with self.assertRaisesRegexp(ValueError,
+                                 "is out of valid range"):
+      array_ops.reverse_v2(x_np, [2])
+    with self.assertRaisesRegexp(ValueError,
+                                 "axis 0 specified more than once"):
+      array_ops.reverse_v2(x_np, [0, -2])
+
   # This is the version of reverse that uses axis indices rather than
   # bool tensors
   # TODO(b/32254538): Change this test to use array_ops.reverse
+  #
+  # Note: this test passes placeholder as constant axis is validated
+  # in shape function (see testInvalidAxis)
   def testInvalid(self):
     x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)
+    axis = array_ops.placeholder(dtypes.int32)
     with self.test_session():
       with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                    "is out of valid range"):
-        array_ops.reverse_v2(x_np, [-30]).eval()
+        array_ops.reverse_v2(x_np, axis).eval(feed_dict={axis: [-30]})
       with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                    "is out of valid range"):
-        array_ops.reverse_v2(x_np, [2]).eval()
+        array_ops.reverse_v2(x_np, axis).eval(feed_dict={axis: [2]})
       with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                    "axis 0 specified more than once"):
-        array_ops.reverse_v2(x_np, [0, -2]).eval()
+        array_ops.reverse_v2(x_np, axis).eval(feed_dict={axis: [0, -2]})
 
   def testReverse1DimAuto(self):
     for dtype in [
@@ -762,6 +780,14 @@ class StridedSliceGradTest(test_util.TensorFlowTestCase):
       grad = GradSliceChecker(self, sess, var, np.array(8))
       _ = grad[tuple()]
 
+  def testInt64Indices(self):
+    with self.test_session(use_gpu=True) as sess:
+      a = math_ops.range(3)
+      index = constant_op.constant(1, dtype=dtypes.int64)
+      b = 2 * a[index]
+      grad, = gradients_impl.gradients(b, a)
+      self.assertAllEqual(sess.run(grad), [0, 2, 0])
+
 
 class StridedSliceGradTypeTest(test_util.TensorFlowTestCase):
   """Test varied index types and host located memory."""
@@ -890,7 +916,7 @@ class StridedSliceAssignChecker(object):
         var = resource_variable_ops.ResourceVariable(self.x)
       else:
         var = variables.Variable(self.x)
-      sess.run(variables.initialize_variables([var]))
+      sess.run(variables.variables_initializer([var]))
       val = sess.run(var[index].assign(value))
       # val_copy is used to check that tf.assign works equivalently to the
       # assign method above.
@@ -981,30 +1007,38 @@ class SliceAssignTest(test_util.TensorFlowTestCase):
 
 class ShapeSizeRankTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def testDenseShape(self):
-    with self.test_session():
-      t_value = [[0, 42], [24, 0]]
-      self.assertAllEqual((2, 2), array_ops.shape(t_value).eval())
-      self.assertEqual(4, array_ops.size(t_value).eval())
-      self.assertEqual(2, array_ops.rank(t_value).eval())
+    t_value = [[0, 42], [24, 0]]
+    self.assertAllEqual((2, 2), self.evaluate(array_ops.shape(t_value)))
+    self.assertEqual(4, self.evaluate(array_ops.size(t_value)))
+    self.assertEqual(2, self.evaluate(array_ops.rank(t_value)))
 
-      t = constant_op.constant(t_value)
-      self.assertAllEqual((2, 2), array_ops.shape(t).eval())
-      self.assertEqual(4, array_ops.size(t).eval())
-      self.assertEqual(2, array_ops.rank(t).eval())
+    t = constant_op.constant(t_value)
+    self.assertAllEqual((2, 2), self.evaluate(array_ops.shape(t)))
+    self.assertEqual(4, self.evaluate(array_ops.size(t)))
+    self.assertEqual(2, self.evaluate(array_ops.rank(t)))
 
+  @test_util.run_in_graph_and_eager_modes()
   def testSparseShape(self):
-    with self.test_session():
-      sp_value = sparse_tensor.SparseTensorValue(
-          indices=((0, 1), (1, 0)), values=(42, 24), dense_shape=(2, 2))
-      self.assertAllEqual((2, 2), array_ops.shape(sp_value).eval())
-      self.assertEqual(4, array_ops.size(sp_value).eval())
-      self.assertEqual(2, array_ops.rank(sp_value).eval())
-
-      sp = sparse_tensor.SparseTensor.from_value(sp_value)
-      self.assertAllEqual((2, 2), array_ops.shape(sp).eval())
-      self.assertEqual(4, array_ops.size(sp).eval())
-      self.assertEqual(2, array_ops.rank(sp).eval())
+    sp_value = sparse_tensor.SparseTensorValue(
+        indices=((0, 1), (1, 0)), values=(42, 24), dense_shape=(2, 2))
+    self.assertAllEqual((2, 2), self.evaluate(array_ops.shape(sp_value)))
+    self.assertEqual(4, self.evaluate(array_ops.size(sp_value)))
+    self.assertEqual(2, self.evaluate(array_ops.rank(sp_value)))
+
+    sp = sparse_tensor.SparseTensor.from_value(sp_value)
+    self.assertAllEqual((2, 2), self.evaluate(array_ops.shape(sp)))
+    self.assertEqual(4, self.evaluate(array_ops.size(sp)))
+    self.assertEqual(2, self.evaluate(array_ops.rank(sp)))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testSizeDtype(self):
+    tensor = [1]
+    self.assertEqual(dtypes.int32, self.evaluate(array_ops.size(tensor)).dtype)
+    self.assertEqual(
+        dtypes.int64,
+        self.evaluate(array_ops.size(tensor, out_type=dtypes.int64)).dtype)
 
 
 @test_util.with_c_api
diff --git a/tensorflow/python/kernel_tests/basic_gpu_test.py b/tensorflow/python/kernel_tests/basic_gpu_test.py
index 405651e8ae97fbc5eefd4aba0a95a99ff8fd8c26..987a6ffcd4b18eb5857ff9e82206de7f6ebe8a27 100644
--- a/tensorflow/python/kernel_tests/basic_gpu_test.py
+++ b/tensorflow/python/kernel_tests/basic_gpu_test.py
@@ -33,7 +33,7 @@ from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variables
-from tensorflow.python.ops.gen_array_ops import _broadcast_gradient_args
+from tensorflow.python.ops.gen_array_ops import broadcast_gradient_args
 from tensorflow.python.platform import test
 
 
@@ -157,7 +157,7 @@ class BroadcastSimpleTest(test.TestCase):
 
   def _GetGradientArgs(self, xs, ys):
     with self.test_session(use_gpu=True) as sess:
-      return sess.run(_broadcast_gradient_args(xs, ys))
+      return sess.run(broadcast_gradient_args(xs, ys))
 
   def testBroadcast(self):
     r0, r1 = self._GetGradientArgs([2, 3, 5], [1])
diff --git a/tensorflow/python/kernel_tests/bcast_ops_test.py b/tensorflow/python/kernel_tests/bcast_ops_test.py
index cb46fcb0076c1ca437089f5b9d87100667e2a404..3305e55c05bd03d31c46fd333db09dbab9a5d09c 100644
--- a/tensorflow/python/kernel_tests/bcast_ops_test.py
+++ b/tensorflow/python/kernel_tests/bcast_ops_test.py
@@ -20,8 +20,8 @@ from __future__ import print_function
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.ops.gen_array_ops import _broadcast_gradient_args
 from tensorflow.python.ops.gen_array_ops import broadcast_args
+from tensorflow.python.ops.gen_array_ops import broadcast_gradient_args
 from tensorflow.python.platform import test
 
 
@@ -33,7 +33,7 @@ class BcastOpsTest(test.TestCase):
 
   def _GetGradientArgs(self, xs, ys):
     with self.test_session() as sess:
-      return sess.run(_broadcast_gradient_args(xs, ys))
+      return sess.run(broadcast_gradient_args(xs, ys))
 
   def testBasic(self):
     r = self._GetBroadcastShape([2, 3, 5], [1])
diff --git a/tensorflow/python/kernel_tests/boosted_trees/BUILD b/tensorflow/python/kernel_tests/boosted_trees/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..30e6289420b36a75589ef25150480e48f8245ec2
--- /dev/null
+++ b/tensorflow/python/kernel_tests/boosted_trees/BUILD
@@ -0,0 +1,76 @@
+# Description:
+#   Kernel tests for Boosted Trees.
+
+package(
+    default_visibility = ["//tensorflow:internal"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+tf_py_test(
+    name = "resource_ops_test",
+    size = "small",
+    srcs = ["resource_ops_test.py"],
+    additional_deps = [
+        "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_py",
+        "//tensorflow/python:boosted_trees_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:resources",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variables",
+    ],
+)
+
+tf_py_test(
+    name = "prediction_ops_test",
+    size = "small",
+    srcs = ["prediction_ops_test.py"],
+    additional_deps = [
+        "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:boosted_trees_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:resources",
+    ],
+)
+
+tf_py_test(
+    name = "stats_ops_test",
+    size = "small",
+    srcs = ["stats_ops_test.py"],
+    additional_deps = [
+        "//tensorflow/python:boosted_trees_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+    ],
+)
+
+tf_py_test(
+    name = "training_ops_test",
+    size = "small",
+    srcs = ["training_ops_test.py"],
+    additional_deps = [
+        "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:boosted_trees_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:resources",
+    ],
+)
diff --git a/tensorflow/python/kernel_tests/boosted_trees/__init__.py b/tensorflow/python/kernel_tests/boosted_trees/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d132f15e51dbc6cd8e706e36b889352c20792cdf
--- /dev/null
+++ b/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py
@@ -0,0 +1,926 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests boosted_trees prediction kernels."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from google.protobuf import text_format
+from tensorflow.core.kernels.boosted_trees import boosted_trees_pb2
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import boosted_trees_ops
+from tensorflow.python.ops import resources
+from tensorflow.python.platform import googletest
+
+
+class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
+  """Tests prediction ops for training."""
+
+  def testCachedPredictionOnEmptyEnsemble(self):
+    """Tests that prediction on a dummy ensemble does not fail."""
+    with self.test_session() as session:
+      # Create a dummy ensemble.
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto='')
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # No previous cached values.
+      cached_tree_ids = [0, 0]
+      cached_node_ids = [0, 0]
+
+      # We have two features: 0 and 1. Values don't matter here on a dummy
+      # ensemble.
+      feature_0_values = [67, 5]
+      feature_1_values = [9, 17]
+
+      # Grow tree ensemble.
+      predict_op = boosted_trees_ops.training_predict(
+          tree_ensemble_handle,
+          max_depth=2,
+          cached_tree_ids=cached_tree_ids,
+          cached_node_ids=cached_node_ids,
+          bucketized_features=[feature_0_values, feature_1_values],
+          logits_dimension=1)
+
+      logits_updates, new_tree_ids, new_node_ids = session.run(predict_op)
+
+      # Nothing changed.
+      self.assertAllClose(cached_tree_ids, new_tree_ids)
+      self.assertAllClose(cached_node_ids, new_node_ids)
+      self.assertAllClose([[0], [0]], logits_updates)
+
+  def testNoCachedPredictionButTreeExists(self):
+    """Tests that predictions are updated once trees are added."""
+    with self.test_session() as session:
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge("""
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 15
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 1.14
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 8.79
+            }
+          }
+        }
+        tree_weights: 0.1
+        tree_metadata {
+          is_finalized: true
+          num_layers_grown: 1
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 2
+        }
+      """, tree_ensemble_config)
+
+      # Create existing ensemble with one root split
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Two examples, none were cached before.
+      cached_tree_ids = [0, 0]
+      cached_node_ids = [0, 0]
+
+      feature_0_values = [67, 5]
+
+      # Grow tree ensemble.
+      predict_op = boosted_trees_ops.training_predict(
+          tree_ensemble_handle,
+          max_depth=2,
+          cached_tree_ids=cached_tree_ids,
+          cached_node_ids=cached_node_ids,
+          bucketized_features=[feature_0_values],
+          logits_dimension=1)
+
+      logits_updates, new_tree_ids, new_node_ids = session.run(predict_op)
+
+      # We are in the first tree.
+      self.assertAllClose([0, 0], new_tree_ids)
+      self.assertAllClose([2, 1], new_node_ids)
+      self.assertAllClose([[0.1 * 8.79], [0.1 * 1.14]], logits_updates)
+
+  def testCachedPredictionIsCurrent(self):
+    """Tests that prediction based on previous node in the tree works."""
+    with self.test_session() as session:
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge("""
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 15
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+              original_leaf {
+                scalar: -2
+              }
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 1.14
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 8.79
+            }
+          }
+        }
+        tree_weights: 0.1
+        tree_metadata {
+          is_finalized: true
+          num_layers_grown: 2
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 2
+        }
+      """, tree_ensemble_config)
+
+      # Create existing ensemble with one root split
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Two examples, one was cached in node 1 first, another in node 0.
+      cached_tree_ids = [0, 0]
+      cached_node_ids = [1, 2]
+
+      # We have two features: 0 and 1. Values don't matter because trees didn't
+      # change.
+      feature_0_values = [67, 5]
+      feature_1_values = [9, 17]
+
+      # Grow tree ensemble.
+      predict_op = boosted_trees_ops.training_predict(
+          tree_ensemble_handle,
+          max_depth=4,
+          cached_tree_ids=cached_tree_ids,
+          cached_node_ids=cached_node_ids,
+          bucketized_features=[feature_0_values, feature_1_values],
+          logits_dimension=1)
+
+      logits_updates, new_tree_ids, new_node_ids = session.run(predict_op)
+
+      # Nothing changed.
+      self.assertAllClose(cached_tree_ids, new_tree_ids)
+      self.assertAllClose(cached_node_ids, new_node_ids)
+      self.assertAllClose([[0], [0]], logits_updates)
+
+  def testCachedPredictionFromTheSameTree(self):
+    """Tests that prediction based on previous node in the tree works."""
+    with self.test_session() as session:
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge("""
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 15
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+              original_leaf {
+                scalar: -2
+              }
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 7
+              left_id: 3
+              right_id: 4
+            }
+            metadata {
+              gain: 1.4
+              original_leaf {
+                scalar: 7.14
+              }
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 7
+              left_id: 5
+              right_id: 6
+            }
+            metadata {
+              gain: 2.7
+              original_leaf {
+                scalar: -4.375
+              }
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 1.14
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 8.79
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -5.875
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -2.075
+            }
+          }
+        }
+        tree_weights: 0.1
+        tree_metadata {
+          is_finalized: true
+          num_layers_grown: 2
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 2
+        }
+      """, tree_ensemble_config)
+
+      # Create existing ensemble with one root split
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Two examples, one was cached in node 1 first, another in node 0.
+      cached_tree_ids = [0, 0]
+      cached_node_ids = [1, 0]
+
+      # We have two features: 0 and 1.
+      feature_0_values = [67, 5]
+      feature_1_values = [9, 17]
+
+      # Grow tree ensemble.
+      predict_op = boosted_trees_ops.training_predict(
+          tree_ensemble_handle,
+          max_depth=4,
+          cached_tree_ids=cached_tree_ids,
+          cached_node_ids=cached_node_ids,
+          bucketized_features=[feature_0_values, feature_1_values],
+          logits_dimension=1)
+
+      logits_updates, new_tree_ids, new_node_ids = session.run(predict_op)
+
+      # We are still in the same tree.
+      self.assertAllClose([0, 0], new_tree_ids)
+      # When using the full tree, the first example will end up in node 4,
+      # the second in node 5.
+      self.assertAllClose([4, 5], new_node_ids)
+      # Full predictions for each instance would be 8.79 and -5.875,
+      # so an update from the previous cached values lr*(7.14 and -2) would be
+      # 1.65 and -3.875, and then multiply them by 0.1 (lr)
+      self.assertAllClose([[0.1 * 1.65], [0.1 * -3.875]], logits_updates)
+
+  def testCachedPredictionFromPreviousTree(self):
+    """Tests the predictions work when we have cache from previous trees."""
+    with self.test_session() as session:
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge("""
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 28
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 1.14
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 8.79
+            }
+          }
+        }
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 26
+              left_id: 1
+              right_id: 2
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 50
+              left_id: 3
+              right_id: 4
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 7
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 5
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 6
+            }
+          }
+        }
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 34
+              left_id: 1
+              right_id: 2
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -7.0
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 5.0
+            }
+          }
+        }
+        tree_metadata {
+          is_finalized: true
+        }
+        tree_metadata {
+          is_finalized: true
+        }
+        tree_metadata {
+          is_finalized: false
+        }
+        tree_weights: 0.1
+        tree_weights: 0.1
+        tree_weights: 0.1
+      """, tree_ensemble_config)
+
+      # Create existing ensemble with one root split
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Two examples, one was cached in node 1 first, another in node 2.
+      cached_tree_ids = [0, 0]
+      cached_node_ids = [1, 0]
+
+      # We have two features: 0 and 1.
+      feature_0_values = [36, 32]
+      feature_1_values = [11, 27]
+
+      # Grow tree ensemble.
+      predict_op = boosted_trees_ops.training_predict(
+          tree_ensemble_handle,
+          max_depth=2,
+          cached_tree_ids=cached_tree_ids,
+          cached_node_ids=cached_node_ids,
+          bucketized_features=[feature_0_values, feature_1_values],
+          logits_dimension=1)
+
+      logits_updates, new_tree_ids, new_node_ids = session.run(predict_op)
+      # Example 1 will get to node 3 in tree 1 and node 2 of tree 2
+      # Example 2 will get to node 2 in tree 1 and node 1 of tree 2
+
+      # We are in the last tree.
+      self.assertAllClose([2, 2], new_tree_ids)
+      # When using the full tree, the first example will end up in node 4,
+      # the second in node 5.
+      self.assertAllClose([2, 1], new_node_ids)
+      # Example 1: tree 0: 8.79, tree 1: 5.0, tree 2: 5.0 = >
+      #            change = 0.1*(5.0+5.0)
+      # Example 2: tree 0: 1.14, tree 1: 7.0, tree 2: -7 = >
+      #            change= 0.1(1.14+7.0-7.0)
+      self.assertAllClose([[1], [0.114]], logits_updates)
+
+  def testCachedPredictionFromTheSameTreeWithPostPrunedNodes(self):
+    """Tests that prediction based on previous node in the tree works."""
+    with self.test_session() as session:
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge("""
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id:0
+              threshold: 33
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: -0.2
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.01
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 5
+              left_id: 3
+              right_id: 4
+            }
+            metadata {
+              gain: 0.5
+              original_leaf {
+                scalar: 0.0143
+               }
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.0553
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.0783
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 3
+          is_finalized: true
+          post_pruned_nodes_meta {
+            new_node_id: 0
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 2
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: -0.07
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: -0.083
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 3
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 4
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: -0.22
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: -0.57
+          }
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 3
+        }
+      """, tree_ensemble_config)
+
+      # Create existing ensemble.
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      cached_tree_ids = [0, 0, 0, 0, 0, 0]
+      # Leaves 3,4, 7 and 8 got deleted during post-pruning, leaves 5 and 6
+      # changed the ids to 3 and 4 respectively.
+      cached_node_ids = [3, 4, 5, 6, 7, 8]
+
+      # We have two features: 0 and 1.
+      feature_0_values = [12, 17, 35, 36, 23, 11]
+      feature_1_values = [12, 12, 17, 18, 123, 24]
+
+      # Grow tree ensemble.
+      predict_op = boosted_trees_ops.training_predict(
+          tree_ensemble_handle,
+          max_depth=3,
+          cached_tree_ids=cached_tree_ids,
+          cached_node_ids=cached_node_ids,
+          bucketized_features=[feature_0_values, feature_1_values],
+          logits_dimension=1)
+
+      logits_updates, new_tree_ids, new_node_ids = session.run(predict_op)
+
+      # We are still in the same tree.
+      self.assertAllClose([0, 0, 0, 0, 0, 0], new_tree_ids)
+      # Examples from leaves 3,4,7,8 should be in leaf 1, examples from leaf 5
+      # and 6 in leaf 3 and 4.
+      self.assertAllClose([1, 1, 3, 4, 1, 1], new_node_ids)
+
+      cached_values = [[0.08], [0.093], [0.0553], [0.0783], [0.15 + 0.08],
+                       [0.5 + 0.08]]
+      self.assertAllClose([[0.01], [0.01], [0.0553], [0.0783], [0.01], [0.01]],
+                          logits_updates + cached_values)
+
+  def testCachedPredictionFromThePreviousTreeWithPostPrunedNodes(self):
+    """Tests that prediction based on previous node in the tree works."""
+    with self.test_session() as session:
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge("""
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id:0
+              threshold: 33
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: -0.2
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.01
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 5
+              left_id: 3
+              right_id: 4
+            }
+            metadata {
+              gain: 0.5
+              original_leaf {
+                scalar: 0.0143
+               }
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.0553
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.0783
+            }
+          }
+        }
+        trees {
+          nodes {
+            leaf {
+              scalar: 0.55
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 3
+          is_finalized: true
+          post_pruned_nodes_meta {
+            new_node_id: 0
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 2
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: -0.07
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: -0.083
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 3
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 4
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: -0.22
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: -0.57
+          }
+        }
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 2
+          num_layers_attempted: 4
+        }
+      """, tree_ensemble_config)
+
+      # Create existing ensemble.
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      cached_tree_ids = [0, 0, 0, 0, 0, 0]
+      # Leaves 3,4, 7 and 8 got deleted during post-pruning, leaves 5 and 6
+      # changed the ids to 3 and 4 respectively.
+      cached_node_ids = [3, 4, 5, 6, 7, 8]
+
+      # We have two features: 0 and 1.
+      feature_0_values = [12, 17, 35, 36, 23, 11]
+      feature_1_values = [12, 12, 17, 18, 123, 24]
+
+      # Grow tree ensemble.
+      predict_op = boosted_trees_ops.training_predict(
+          tree_ensemble_handle,
+          max_depth=3,
+          cached_tree_ids=cached_tree_ids,
+          cached_node_ids=cached_node_ids,
+          bucketized_features=[feature_0_values, feature_1_values],
+          logits_dimension=1)
+
+      logits_updates, new_tree_ids, new_node_ids = session.run(predict_op)
+
+      # We are in the last tree.
+      self.assertAllClose([1, 1, 1, 1, 1, 1], new_tree_ids)
+      # Examples from leaves 3,4,7,8 should be in leaf 1, examples from leaf 5
+      # and 6 in leaf 3 and 4 in tree 0. For tree 1, all of the examples are in
+      # the root node.
+      self.assertAllClose([0, 0, 0, 0, 0, 0], new_node_ids)
+
+      cached_values = [[0.08], [0.093], [0.0553], [0.0783], [0.15 + 0.08],
+                       [0.5 + 0.08]]
+      root = 0.55
+      self.assertAllClose([[root + 0.01], [root + 0.01], [root + 0.0553],
+                           [root + 0.0783], [root + 0.01], [root + 0.01]],
+                          logits_updates + cached_values)
+
+  def testCachedPredictionTheWholeTreeWasPruned(self):
+    """Tests that prediction based on previous node in the tree works."""
+    with self.test_session() as session:
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge("""
+        trees {
+          nodes {
+            leaf {
+              scalar: 0.00
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: true
+          post_pruned_nodes_meta {
+            new_node_id: 0
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 0
+            logit_change: -6.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 0
+            logit_change: 5.0
+          }
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+        }
+      """, tree_ensemble_config)
+
+      # Create existing ensemble.
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      cached_tree_ids = [
+          0,
+          0,
+      ]
+      # The predictions were cached in 1 and 2, both were pruned to the root.
+      cached_node_ids = [1, 2]
+
+      # We have two features: 0 and 1.These are not going to be used anywhere.
+      feature_0_values = [12, 17]
+      feature_1_values = [12, 12]
+
+      # Grow tree ensemble.
+      predict_op = boosted_trees_ops.training_predict(
+          tree_ensemble_handle,
+          max_depth=1,
+          cached_tree_ids=cached_tree_ids,
+          cached_node_ids=cached_node_ids,
+          bucketized_features=[feature_0_values, feature_1_values],
+          logits_dimension=1)
+
+      logits_updates, new_tree_ids, new_node_ids = session.run(predict_op)
+
+      # We are in the last tree.
+      self.assertAllClose([0, 0], new_tree_ids)
+      self.assertAllClose([0, 0], new_node_ids)
+
+      self.assertAllClose([[-6.0], [5.0]], logits_updates)
+
+
+class PredictionOpsTest(test_util.TensorFlowTestCase):
+  """Tests prediction ops for inference."""
+
+  def testPredictionMultipleTree(self):
+    """Tests the predictions work when we have multiple trees."""
+    with self.test_session() as session:
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge("""
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 28
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 1.14
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 8.79
+            }
+          }
+        }
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 26
+              left_id: 1
+              right_id: 2
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 50
+              left_id: 3
+              right_id: 4
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 7.0
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 5.0
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 6.0
+            }
+          }
+        }
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 34
+              left_id: 1
+              right_id: 2
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -7.0
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 5.0
+            }
+          }
+        }
+        tree_weights: 0.1
+        tree_weights: 0.2
+        tree_weights: 1.0
+      """, tree_ensemble_config)
+
+      # Create existing ensemble with one root split
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      feature_0_values = [36, 32]
+      feature_1_values = [11, 27]
+
+      # Example 1: tree 0: 1.14, tree 1: 5.0, tree 2: 5.0 = >
+      #            logit = 0.1*5.0+0.2*5.0+1*5
+      # Example 2: tree 0: 1.14, tree 1: 7.0, tree 2: -7 = >
+      #            logit= 0.1*1.14+0.2*7.0-1*7.0
+      expected_logits = [[6.114], [-5.486]]
+
+      # Do with parallelization, e.g. EVAL
+      predict_op = boosted_trees_ops.predict(
+          tree_ensemble_handle,
+          bucketized_features=[feature_0_values, feature_1_values],
+          logits_dimension=1,
+          max_depth=2)
+
+      logits = session.run(predict_op)
+      self.assertAllClose(expected_logits, logits)
+
+      # Do without parallelization, e.g. INFER - the result is the same
+      predict_op = boosted_trees_ops.predict(
+          tree_ensemble_handle,
+          bucketized_features=[feature_0_values, feature_1_values],
+          logits_dimension=1,
+          max_depth=2)
+
+      logits = session.run(predict_op)
+      self.assertAllClose(expected_logits, logits)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/kernel_tests/boosted_trees/resource_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/resource_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a223241e893d6838faec9a48cb4ca9cb3c24a211
--- /dev/null
+++ b/tensorflow/python/kernel_tests/boosted_trees/resource_ops_test.py
@@ -0,0 +1,228 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for boosted_trees resource kernels."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from google.protobuf import text_format
+
+from tensorflow.core.kernels.boosted_trees import boosted_trees_pb2
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import boosted_trees_ops
+from tensorflow.python.ops import resources
+from tensorflow.python.platform import googletest
+
+
+class ResourceOpsTest(test_util.TensorFlowTestCase):
+  """Tests resource_ops."""
+
+  def testCreate(self):
+    with self.test_session():
+      ensemble = boosted_trees_ops.TreeEnsemble('ensemble')
+      resources.initialize_resources(resources.shared_resources()).run()
+      stamp_token = ensemble.get_stamp_token()
+      self.assertEqual(0, stamp_token.eval())
+      (_, num_trees, num_finalized_trees,
+       num_attempted_layers) = ensemble.get_states()
+      self.assertEqual(0, num_trees.eval())
+      self.assertEqual(0, num_finalized_trees.eval())
+      self.assertEqual(0, num_attempted_layers.eval())
+
+  def testCreateWithProto(self):
+    with self.test_session():
+      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge("""
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 4
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            bucketized_split {
+              threshold: 21
+              left_id: 3
+              right_id: 4
+            }
+            metadata {
+              gain: 1.4
+              original_leaf {
+                scalar: 7.14
+              }
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 7
+              left_id: 5
+              right_id: 6
+            }
+            metadata {
+              gain: 2.7
+              original_leaf {
+                scalar: -4.375
+              }
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 6.54
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 7.305
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -4.525
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -4.145
+            }
+          }
+        }
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 75
+              threshold: 21
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: -1.4
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.6
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.165
+            }
+          }
+        }
+        tree_weights: 0.15
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 2
+          is_finalized: true
+        }
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 2
+          num_layers_attempted: 6
+        }
+      """, ensemble_proto)
+      ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble',
+          stamp_token=7,
+          serialized_proto=ensemble_proto.SerializeToString())
+      resources.initialize_resources(resources.shared_resources()).run()
+      (stamp_token, num_trees, num_finalized_trees,
+       num_attempted_layers) = ensemble.get_states()
+      self.assertEqual(7, stamp_token.eval())
+      self.assertEqual(2, num_trees.eval())
+      self.assertEqual(1, num_finalized_trees.eval())
+      self.assertEqual(6, num_attempted_layers.eval())
+
+  def testSerializeDeserialize(self):
+    with self.test_session():
+      # Initialize.
+      ensemble = boosted_trees_ops.TreeEnsemble('ensemble', stamp_token=5)
+      resources.initialize_resources(resources.shared_resources()).run()
+      (stamp_token, num_trees, num_finalized_trees,
+       num_attempted_layers) = ensemble.get_states()
+      self.assertEqual(5, stamp_token.eval())
+      self.assertEqual(0, num_trees.eval())
+      self.assertEqual(0, num_finalized_trees.eval())
+      self.assertEqual(0, num_attempted_layers.eval())
+
+      # Deserialize.
+      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge("""
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 75
+              threshold: 21
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: -1.4
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.6
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.165
+            }
+          }
+        }
+        tree_weights: 0.5
+        tree_metadata {
+          num_layers_grown: 4  # it's fake intentionally.
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 5
+        }
+      """, ensemble_proto)
+      with ops.control_dependencies([
+          ensemble.deserialize(
+              stamp_token=3,
+              serialized_proto=ensemble_proto.SerializeToString())
+      ]):
+        (stamp_token, num_trees, num_finalized_trees,
+         num_attempted_layers) = ensemble.get_states()
+      self.assertEqual(3, stamp_token.eval())
+      self.assertEqual(1, num_trees.eval())
+      # This reads from metadata, not really counting the layers.
+      self.assertEqual(5, num_attempted_layers.eval())
+      self.assertEqual(0, num_finalized_trees.eval())
+
+      # Serialize.
+      new_ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      new_stamp_token, new_serialized = ensemble.serialize()
+      self.assertEqual(3, new_stamp_token.eval())
+      new_ensemble_proto.ParseFromString(new_serialized.eval())
+      self.assertProtoEquals(ensemble_proto, new_ensemble_proto)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a54cc43517f4513b88b94ceb9b401b84b5ca053f
--- /dev/null
+++ b/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py
@@ -0,0 +1,289 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for boosted_trees stats kernels."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import boosted_trees_ops
+from tensorflow.python.platform import googletest
+
+
+class StatsOpsTest(test_util.TensorFlowTestCase):
+  """Tests stats_ops."""
+
+  def testCalculateBestGainsWithoutRegularization(self):
+    """Testing Gain calculation without any regularization."""
+    with self.test_session() as sess:
+      max_splits = 7
+      node_id_range = [1, 2]  # node 1 through 2 will be processed.
+      stats_summary_list = [
+          [
+              [[0., 0.], [.08, .09], [0., 0.], [0., 0.]],  # node 0; ignored
+              [[0., 0.], [.15, .36], [.06, .07], [.1, .2]],  # node 1
+              [[0., 0.], [-.33, .58], [0., 0.], [.3, .4]],  # node 2
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 3; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 4; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 5; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 6; ignored
+          ],  # feature 0
+          [
+              [[0., 0.], [0., 0.], [.08, .09], [0., 0.]],  # node 0; ignored
+              [[0., 0.], [.3, .5], [-.05, .06], [.06, .07]],  # node 1
+              [[.1, .1], [.2, .3], [-.4, .5], [.07, .08]],  # node 2
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 3; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 4; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 5; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 6; ignored
+          ],  # feature 1
+      ]  # num_features * shape=[max_splits, num_buckets, 2]
+
+      (node_ids_list, gains_list, thresholds_list, left_node_contribs_list,
+       right_node_contribs_list
+      ) = boosted_trees_ops.calculate_best_gains_per_feature(
+          node_id_range,
+          stats_summary_list,
+          l1=0.0,
+          l2=0.0,
+          tree_complexity=0.0,
+          max_splits=max_splits)
+
+      self.assertAllEqual([[1, 2], [1, 2]], sess.run(node_ids_list))
+      self.assertAllClose([[0.004775, 0.41184], [0.02823, 0.41184]],
+                          sess.run(gains_list))
+      self.assertAllEqual([[1, 1], [1, 1]], sess.run(thresholds_list))
+      # The left node contrib will be later added to the previous node value to
+      # make the left node value, and the same for right node contrib.
+      self.assertAllClose([[[-.416667], [.568966]], [[-.6], [-.75]]],
+                          sess.run(left_node_contribs_list))
+      self.assertAllClose([[[-.592593], [-.75]], [[-.076923], [.568966]]],
+                          sess.run(right_node_contribs_list))
+
+  def testCalculateBestGainsWithL2(self):
+    """Testing Gain calculation with L2."""
+    with self.test_session() as sess:
+      max_splits = 7
+      node_id_range = [1, 2]  # node 1 through 2 will be processed.
+      stats_summary_list = [
+          [
+              [[0., 0.], [.08, .09], [0., 0.], [0., 0.]],  # node 0; ignored
+              [[0., 0.], [.15, .36], [.06, .07], [.1, .2]],  # node 1
+              [[0., 0.], [-.33, .58], [0., 0.], [.3, .4]],  # node 2
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 3; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 4; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 5; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 6; ignored
+          ],  # feature 0
+          [
+              [[0., 0.], [0., 0.], [.08, .09], [0., 0.]],  # node 0; ignored
+              [[0., 0.], [.3, .5], [-.05, .06], [.06, .07]],  # node 1
+              [[.1, .1], [.2, .3], [-.4, .5], [.07, .08]],  # node 2
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 3; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 4; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 5; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 6; ignored
+          ],  # feature 1
+      ]  # num_features * shape=[max_splits, num_buckets, 2]
+
+      (node_ids_list, gains_list, thresholds_list, left_node_contribs_list,
+       right_node_contribs_list
+      ) = boosted_trees_ops.calculate_best_gains_per_feature(
+          node_id_range,
+          stats_summary_list,
+          l1=0.0,
+          l2=0.1,
+          tree_complexity=0.0,
+          max_splits=max_splits)
+
+      self.assertAllEqual([[1, 2], [1, 2]], sess.run(node_ids_list))
+      self.assertAllClose([[0., 0.33931375], [0.01879096, 0.33931375]],
+                          sess.run(gains_list))
+      self.assertAllEqual([[0, 1], [1, 1]], sess.run(thresholds_list))
+      # The left node contrib will be later added to the previous node value to
+      # make the left node value, and the same for right node contrib.
+      self.assertAllClose([[[0.], [.485294]], [[-.5], [-.6]]],
+                          sess.run(left_node_contribs_list))
+      self.assertAllClose([[[-.424658], [-.6]], [[-.043478], [.485294]]],
+                          sess.run(right_node_contribs_list))
+
+  def testCalculateBestGainsWithL1(self):
+    """Testing Gain calculation with L1."""
+    with self.test_session() as sess:
+      max_splits = 7
+      node_id_range = [1, 2]  # node 1 through 2 will be processed.
+      stats_summary_list = [
+          [
+              [[0., 0.], [.08, .09], [0., 0.], [0., 0.]],  # node 0; ignored
+              [[0., 0.], [.15, .36], [.06, .07], [.1, .2]],  # node 1
+              [[0., 0.], [-.33, .58], [0., 0.], [.3, .4]],  # node 2
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 3; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 4; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 5; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 6; ignored
+          ],  # feature 0
+          [
+              [[0., 0.], [0., 0.], [.08, .09], [0., 0.]],  # node 0; ignored
+              [[0., 0.], [.3, .5], [-.05, .06], [.06, .07]],  # node 1
+              [[.1, .1], [.2, .3], [-.4, .5], [.07, .08]],  # node 2
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 3; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 4; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 5; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 6; ignored
+          ],  # feature 1
+      ]  # num_features * shape=[max_splits, num_buckets, 2]
+
+      l1 = 0.1
+      (node_ids_list, gains_list, thresholds_list, left_node_contribs_list,
+       right_node_contribs_list
+      ) = boosted_trees_ops.calculate_best_gains_per_feature(
+          node_id_range,
+          stats_summary_list,
+          l1=l1,
+          l2=0.0,
+          tree_complexity=0.0,
+          max_splits=max_splits)
+
+      self.assertAllEqual([[0, 1], [1, 1]], sess.run(thresholds_list))
+
+      self.assertAllEqual([[1, 2], [1, 2]], sess.run(node_ids_list))
+      self.assertAllClose([[[0.0], [0.3965517]], [[-0.4], [-0.5]]],
+                          sess.run(left_node_contribs_list))
+
+      self.assertAllClose([[[-0.3333333], [-0.5]], [[0.0], [0.396552]]],
+                          sess.run(right_node_contribs_list))
+
+      # Gain should also include an adjustment of the gradient by l1.
+      self.assertAllClose([[0.0, 0.191207], [0.01, 0.191207]],
+                          sess.run(gains_list))
+
+  def testCalculateBestGainsWithTreeComplexity(self):
+    """Testing Gain calculation with L2."""
+    with self.test_session() as sess:
+      max_splits = 7
+      node_id_range = [1, 2]  # node 1 through 2 will be processed.
+      stats_summary_list = [
+          [
+              [[0., 0.], [.08, .09], [0., 0.], [0., 0.]],  # node 0; ignored
+              [[0., 0.], [.15, .36], [.06, .07], [.1, .2]],  # node 1
+              [[0., 0.], [-.33, .58], [0., 0.], [.3, .4]],  # node 2
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 3; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 4; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 5; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 6; ignored
+          ],  # feature 0
+          [
+              [[0., 0.], [0., 0.], [.08, .09], [0., 0.]],  # node 0; ignored
+              [[0., 0.], [.3, .5], [-.05, .06], [.06, .07]],  # node 1
+              [[.1, .1], [.2, .3], [-.4, .5], [.07, .08]],  # node 2
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 3; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 4; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 5; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 6; ignored
+          ],  # feature 1
+      ]  # num_features * shape=[max_splits, num_buckets, 2]
+
+      l2 = 0.1
+      tree_complexity = 3.
+      (node_ids_list, gains_list, thresholds_list, left_node_contribs_list,
+       right_node_contribs_list
+      ) = boosted_trees_ops.calculate_best_gains_per_feature(
+          node_id_range,
+          stats_summary_list,
+          l1=0.0,
+          l2=l2,
+          tree_complexity=tree_complexity,
+          max_splits=max_splits)
+
+      self.assertAllEqual([[1, 2], [1, 2]], sess.run(node_ids_list))
+
+      self.assertAllClose([[-3., -2.66068625], [-2.98120904, -2.66068625]],
+                          sess.run(gains_list))
+
+      self.assertAllEqual([[0, 1], [1, 1]], sess.run(thresholds_list))
+      # The left node contrib will be later added to the previous node value to
+      # make the left node value, and the same for right node contrib.
+      self.assertAllClose([[[0.], [.485294]], [[-.5], [-.6]]],
+                          sess.run(left_node_contribs_list))
+      self.assertAllClose([[[-.424658], [-.6]], [[-.043478], [.485294]]],
+                          sess.run(right_node_contribs_list))
+
+  def testMakeStatsSummarySimple(self):
+    """Simple test for MakeStatsSummary."""
+    with self.test_session():
+      self.assertAllClose([[[[1., 5.], [2., 6.]], [[3., 7.], [4., 8.]]]],
+                          boosted_trees_ops.make_stats_summary(
+                              node_ids=[0, 0, 1, 1],
+                              gradients=[[1.], [2.], [3.], [4.]],
+                              hessians=[[5.], [6.], [7.], [8.]],
+                              bucketized_features_list=[[0, 1, 0, 1]],
+                              max_splits=2,
+                              num_buckets=2).eval())
+
+  def testMakeStatsSummaryAccumulate(self):
+    """Tests that Summary actually accumulates."""
+    with self.test_session():
+      max_splits = 3
+      num_buckets = 4
+      node_ids = [1, 1, 2, 2, 1, 1, 2, 0]
+      gradients = [[.1], [.2], [.3], [-.4], [-.05], [.06], [.07], [.08]]
+      hessians = [[.2], [.3], [.4], [.5], [.06], [.07], [.08], [.09]]
+
+      # Tests a single feature.
+      bucketized_features = [[3, 1, 2, 0, 1, 2, 0, 1]]
+      result = boosted_trees_ops.make_stats_summary(
+          node_ids, gradients, hessians, bucketized_features, max_splits,
+          num_buckets)  # shape=[max_splits, num_buckets, num_features, 2]
+      self.assertAllClose(
+          [[
+              [[0., 0.], [.08, .09], [0., 0.], [0., 0.]],  # node 0
+              [[0., 0.], [.15, .36], [.06, .07], [.1, .2]],  # node 1
+              [[-.33, .58], [0., 0.], [.3, .4], [0., 0.]],  # node 2
+          ]],
+          result.eval())
+
+  def testMakeStatsSummaryMultipleFeatures(self):
+    """Tests that MakeStatsSummary works for multiple features."""
+    with self.test_session():
+      max_splits = 3
+      num_buckets = 4
+      node_ids = [1, 1, 2, 2, 1, 1, 2, 0]
+      gradients = [[.1], [.2], [.3], [-.4], [-.05], [.06], [.07], [.08]]
+      hessians = [[.2], [.3], [.4], [.5], [.06], [.07], [.08], [.09]]
+
+      # Tests multiple features.
+      # The output from another feature will stored be in 3rd dimension.
+      bucketized_features = [[3, 1, 2, 0, 1, 2, 0, 1], [0, 0, 0, 2, 2, 3, 3, 2]]
+      result = boosted_trees_ops.make_stats_summary(
+          node_ids, gradients, hessians, bucketized_features, max_splits,
+          num_buckets)  # shape=[max_splits, num_buckets, num_features, 2]
+      self.assertAllClose(
+          [
+              [
+                  [[0., 0.], [.08, .09], [0., 0.], [0., 0.]],  # node 0
+                  [[0., 0.], [.15, .36], [.06, .07], [.1, .2]],  # node 1
+                  [[-.33, .58], [0., 0.], [.3, .4], [0., 0.]],  # node 2
+              ],  # feature 0
+              [
+                  [[0., 0.], [0., 0.], [.08, .09], [0., 0.]],  # node 0
+                  [[.3, .5], [0., 0.], [-.05, .06], [.06, .07]],  # node 1
+                  [[.3, .4], [0., 0.], [-.4, .5], [.07, .08]],  # node 2
+              ],  # feature 1
+          ],
+          result.eval())
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/kernel_tests/boosted_trees/training_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/training_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..4226ff75c2327d09c0d89b29950605b610672603
--- /dev/null
+++ b/tensorflow/python/kernel_tests/boosted_trees/training_ops_test.py
@@ -0,0 +1,1465 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for boosted_trees training kernels."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from google.protobuf import text_format
+from tensorflow.core.kernels.boosted_trees import boosted_trees_pb2
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import boosted_trees_ops
+from tensorflow.python.ops import resources
+from tensorflow.python.platform import googletest
+
+
+class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
+  """Tests for growing tree ensemble from split candidates."""
+
+  def testGrowWithEmptyEnsemble(self):
+    """Test growing an empty ensemble."""
+    with self.test_session() as session:
+      # Create empty ensemble.
+      tree_ensemble = boosted_trees_ops.TreeEnsemble('ensemble')
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      feature_ids = [0, 2, 6]
+
+      # Prepare feature inputs.
+      # Note that features 1 & 3 have the same gain but different splits.
+      feature1_nodes = np.array([0], dtype=np.int32)
+      feature1_gains = np.array([7.62], dtype=np.float32)
+      feature1_thresholds = np.array([52], dtype=np.int32)
+      feature1_left_node_contribs = np.array([[-4.375]], dtype=np.float32)
+      feature1_right_node_contribs = np.array([[7.143]], dtype=np.float32)
+
+      feature2_nodes = np.array([0], dtype=np.int32)
+      feature2_gains = np.array([0.63], dtype=np.float32)
+      feature2_thresholds = np.array([23], dtype=np.int32)
+      feature2_left_node_contribs = np.array([[-0.6]], dtype=np.float32)
+      feature2_right_node_contribs = np.array([[0.24]], dtype=np.float32)
+
+      # Feature split with the highest gain.
+      feature3_nodes = np.array([0], dtype=np.int32)
+      feature3_gains = np.array([7.65], dtype=np.float32)
+      feature3_thresholds = np.array([7], dtype=np.int32)
+      feature3_left_node_contribs = np.array([[-4.89]], dtype=np.float32)
+      feature3_right_node_contribs = np.array([[5.3]], dtype=np.float32)
+
+      # Grow tree ensemble.
+      grow_op = boosted_trees_ops.update_ensemble(
+          tree_ensemble_handle,
+          learning_rate=0.1,
+          pruning_mode=boosted_trees_ops.PruningMode.NO_PRUNING,
+          # Tree will be finalized now, since we will reach depth 1.
+          max_depth=1,
+          feature_ids=feature_ids,
+          node_ids=[feature1_nodes, feature2_nodes, feature3_nodes],
+          gains=[feature1_gains, feature2_gains, feature3_gains],
+          thresholds=[
+              feature1_thresholds, feature2_thresholds, feature3_thresholds
+          ],
+          left_node_contribs=[
+              feature1_left_node_contribs, feature2_left_node_contribs,
+              feature3_left_node_contribs
+          ],
+          right_node_contribs=[
+              feature1_right_node_contribs, feature2_right_node_contribs,
+              feature3_right_node_contribs
+          ])
+      session.run(grow_op)
+
+      new_stamp, serialized = session.run(tree_ensemble.serialize())
+
+      tree_ensemble = boosted_trees_pb2.TreeEnsemble()
+      tree_ensemble.ParseFromString(serialized)
+
+      # Note that since the tree is finalized, we added a new dummy tree.
+      expected_result = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 6
+              threshold: 7
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.65
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.489
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.53
+            }
+          }
+        }
+        trees {
+          nodes {
+            leaf {
+              scalar: 0.0
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: true
+        }
+        tree_metadata {
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+        }
+      """
+      self.assertEqual(new_stamp, 1)
+      self.assertProtoEquals(expected_result, tree_ensemble)
+
+  def testGrowExistingEnsembleTreeNotFinalized(self):
+    """Test growing an existing ensemble with the last tree not finalized."""
+    with self.test_session() as session:
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge("""
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 4
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.714
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.4375
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+        }
+      """, tree_ensemble_config)
+
+      # Create existing ensemble with one root split
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare feature inputs.
+      # feature 1 only has a candidate for node 1, feature 2 has candidates
+      # for both nodes and feature 3 only has a candidate for node 2.
+
+      feature_ids = [0, 1, 0]
+
+      feature1_nodes = np.array([1], dtype=np.int32)
+      feature1_gains = np.array([1.4], dtype=np.float32)
+      feature1_thresholds = np.array([21], dtype=np.int32)
+      feature1_left_node_contribs = np.array([[-6.0]], dtype=np.float32)
+      feature1_right_node_contribs = np.array([[1.65]], dtype=np.float32)
+
+      feature2_nodes = np.array([1, 2], dtype=np.int32)
+      feature2_gains = np.array([0.63, 2.7], dtype=np.float32)
+      feature2_thresholds = np.array([23, 7], dtype=np.int32)
+      feature2_left_node_contribs = np.array([[-0.6], [-1.5]], dtype=np.float32)
+      feature2_right_node_contribs = np.array([[0.24], [2.3]], dtype=np.float32)
+
+      feature3_nodes = np.array([2], dtype=np.int32)
+      feature3_gains = np.array([1.7], dtype=np.float32)
+      feature3_thresholds = np.array([3], dtype=np.int32)
+      feature3_left_node_contribs = np.array([[-0.75]], dtype=np.float32)
+      feature3_right_node_contribs = np.array([[1.93]], dtype=np.float32)
+
+      # Grow tree ensemble.
+      grow_op = boosted_trees_ops.update_ensemble(
+          tree_ensemble_handle,
+          learning_rate=0.1,
+          pruning_mode=boosted_trees_ops.PruningMode.NO_PRUNING,
+          # tree is going to be finalized now, since we reach depth 2.
+          max_depth=2,
+          feature_ids=feature_ids,
+          node_ids=[feature1_nodes, feature2_nodes, feature3_nodes],
+          gains=[feature1_gains, feature2_gains, feature3_gains],
+          thresholds=[
+              feature1_thresholds, feature2_thresholds, feature3_thresholds
+          ],
+          left_node_contribs=[
+              feature1_left_node_contribs, feature2_left_node_contribs,
+              feature3_left_node_contribs
+          ],
+          right_node_contribs=[
+              feature1_right_node_contribs, feature2_right_node_contribs,
+              feature3_right_node_contribs
+          ])
+      session.run(grow_op)
+
+      # Expect the split for node 1 to be chosen from feature 1 and
+      # the split for node 2 to be chosen from feature 2.
+      # The grown tree should be finalized as max tree depth is 2 and we have
+      # grown 2 layers.
+      new_stamp, serialized = session.run(tree_ensemble.serialize())
+      tree_ensemble = boosted_trees_pb2.TreeEnsemble()
+      tree_ensemble.ParseFromString(serialized)
+
+      expected_result = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 4
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            bucketized_split {
+              threshold: 21
+              left_id: 3
+              right_id: 4
+            }
+            metadata {
+              gain: 1.4
+              original_leaf {
+                scalar: 0.714
+              }
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 7
+              left_id: 5
+              right_id: 6
+            }
+            metadata {
+              gain: 2.7
+              original_leaf {
+                scalar: -0.4375
+              }
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.114
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.879
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.5875
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.2075
+            }
+          }
+        }
+        trees {
+          nodes {
+            leaf {
+              scalar: 0.0
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_weights: 1.0
+        tree_metadata {
+          is_finalized: true
+          num_layers_grown: 2
+        }
+        tree_metadata {
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 2
+        }
+      """
+      self.assertEqual(new_stamp, 1)
+      self.assertProtoEquals(expected_result, tree_ensemble)
+
+  def testGrowExistingEnsembleTreeFinalized(self):
+    """Test growing an existing ensemble with the last tree finalized."""
+    with self.test_session() as session:
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge("""
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 4
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 7.14
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -4.375
+            }
+          }
+        }
+        trees {
+          nodes {
+            leaf {
+              scalar: 0.0
+            }
+          }
+        }
+        tree_weights: 0.15
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: true
+        }
+        tree_metadata {
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+        }
+      """, tree_ensemble_config)
+
+      # Create existing ensemble with one root split
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare feature inputs.
+
+      feature_ids = [75]
+
+      feature1_nodes = np.array([0], dtype=np.int32)
+      feature1_gains = np.array([-1.4], dtype=np.float32)
+      feature1_thresholds = np.array([21], dtype=np.int32)
+      feature1_left_node_contribs = np.array([[-6.0]], dtype=np.float32)
+      feature1_right_node_contribs = np.array([[1.65]], dtype=np.float32)
+
+      # Grow tree ensemble.
+      grow_op = boosted_trees_ops.update_ensemble(
+          tree_ensemble_handle,
+          pruning_mode=boosted_trees_ops.PruningMode.NO_PRUNING,
+          learning_rate=0.1,
+          max_depth=2,
+          feature_ids=feature_ids,
+          node_ids=[feature1_nodes],
+          gains=[feature1_gains],
+          thresholds=[feature1_thresholds],
+          left_node_contribs=[feature1_left_node_contribs],
+          right_node_contribs=[feature1_right_node_contribs])
+      session.run(grow_op)
+
+      # Expect a new tree added, with a split on feature 75
+      new_stamp, serialized = session.run(tree_ensemble.serialize())
+      tree_ensemble = boosted_trees_pb2.TreeEnsemble()
+      tree_ensemble.ParseFromString(serialized)
+
+      expected_result = """
+       trees {
+          nodes {
+            bucketized_split {
+              feature_id: 4
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 7.14
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -4.375
+            }
+          }
+        }
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 75
+              threshold: 21
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: -1.4
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.6
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.165
+            }
+          }
+        }
+        tree_weights: 0.15
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: true
+        }
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 2
+          num_layers_attempted: 2
+        }
+      """
+      self.assertEqual(new_stamp, 1)
+      self.assertProtoEquals(expected_result, tree_ensemble)
+
+  def testPrePruning(self):
+    """Test growing an existing ensemble with pre-pruning."""
+    with self.test_session() as session:
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge("""
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 4
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 7.14
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -4.375
+            }
+          }
+        }
+        tree_weights: 0.1
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+        }
+      """, tree_ensemble_config)
+
+      # Create existing ensemble with one root split
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare feature inputs.
+      # For node 1, the best split is on feature 2 (gain -0.63), but the gain
+      # is negative so node 1 will not be split.
+      # For node 2, the best split is on feature 3, gain is positive.
+
+      feature_ids = [0, 1, 0]
+
+      feature1_nodes = np.array([1], dtype=np.int32)
+      feature1_gains = np.array([-1.4], dtype=np.float32)
+      feature1_thresholds = np.array([21], dtype=np.int32)
+      feature1_left_node_contribs = np.array([[-6.0]], dtype=np.float32)
+      feature1_right_node_contribs = np.array([[1.65]], dtype=np.float32)
+
+      feature2_nodes = np.array([1, 2], dtype=np.int32)
+      feature2_gains = np.array([-0.63, 2.7], dtype=np.float32)
+      feature2_thresholds = np.array([23, 7], dtype=np.int32)
+      feature2_left_node_contribs = np.array([[-0.6], [-1.5]], dtype=np.float32)
+      feature2_right_node_contribs = np.array([[0.24], [2.3]], dtype=np.float32)
+
+      feature3_nodes = np.array([2], dtype=np.int32)
+      feature3_gains = np.array([2.8], dtype=np.float32)
+      feature3_thresholds = np.array([3], dtype=np.int32)
+      feature3_left_node_contribs = np.array([[-0.75]], dtype=np.float32)
+      feature3_right_node_contribs = np.array([[1.93]], dtype=np.float32)
+
+      # Grow tree ensemble.
+      grow_op = boosted_trees_ops.update_ensemble(
+          tree_ensemble_handle,
+          learning_rate=0.1,
+          pruning_mode=boosted_trees_ops.PruningMode.PRE_PRUNING,
+          max_depth=3,
+          feature_ids=feature_ids,
+          node_ids=[feature1_nodes, feature2_nodes, feature3_nodes],
+          gains=[feature1_gains, feature2_gains, feature3_gains],
+          thresholds=[
+              feature1_thresholds, feature2_thresholds, feature3_thresholds
+          ],
+          left_node_contribs=[
+              feature1_left_node_contribs, feature2_left_node_contribs,
+              feature3_left_node_contribs
+          ],
+          right_node_contribs=[
+              feature1_right_node_contribs, feature2_right_node_contribs,
+              feature3_right_node_contribs
+          ])
+      session.run(grow_op)
+
+      # Expect the split for node 1 to be chosen from feature 1 and
+      # the split for node 2 to be chosen from feature 2.
+      # The grown tree should not be finalized as max tree depth is 3 and
+      # it's only grown 2 layers.
+      new_stamp, serialized = session.run(tree_ensemble.serialize())
+      tree_ensemble = boosted_trees_pb2.TreeEnsemble()
+      tree_ensemble.ParseFromString(serialized)
+
+      expected_result = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 4
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 7.14
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 3
+              left_id: 3
+              right_id: 4
+            }
+            metadata {
+              gain: 2.8
+              original_leaf {
+                scalar: -4.375
+              }
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -4.45
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -4.182
+            }
+          }
+        }
+        tree_weights: 0.1
+        tree_metadata {
+          is_finalized: false
+          num_layers_grown: 2
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 2
+        }
+      """
+      self.assertEqual(new_stamp, 1)
+      self.assertProtoEquals(expected_result, tree_ensemble)
+
+  def testMetadataWhenCantSplitDueToEmptySplits(self):
+    """Test that the metadata is updated even though we can't split."""
+    with self.test_session() as session:
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge("""
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 4
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.714
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.4375
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+        }
+      """, tree_ensemble_config)
+
+      # Create existing ensemble with one root split
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare feature inputs.
+      # feature 1 only has a candidate for node 1, feature 2 has candidates
+      # for both nodes and feature 3 only has a candidate for node 2.
+
+      # Grow tree ensemble.
+      grow_op = boosted_trees_ops.update_ensemble(
+          tree_ensemble_handle,
+          learning_rate=0.1,
+          pruning_mode=boosted_trees_ops.PruningMode.NO_PRUNING,
+          max_depth=2,
+          # No splits are available.
+          feature_ids=[],
+          node_ids=[],
+          gains=[],
+          thresholds=[],
+          left_node_contribs=[],
+          right_node_contribs=[])
+      session.run(grow_op)
+
+      # Expect no new splits created, but attempted (global) stats updated. Meta
+      # data for this tree should not be updated (we didn't succeed building a
+      # layer.
+      new_stamp, serialized = session.run(tree_ensemble.serialize())
+      tree_ensemble = boosted_trees_pb2.TreeEnsemble()
+      tree_ensemble.ParseFromString(serialized)
+
+      expected_result = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 4
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.714
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.4375
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 2
+        }
+      """
+      self.assertEqual(new_stamp, 1)
+      self.assertProtoEquals(expected_result, tree_ensemble)
+
+  def testMetadataWhenCantSplitDuePrePruning(self):
+    """Test metadata is updated correctly when no split due to prepruning."""
+    with self.test_session() as session:
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge("""
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 4
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 7.14
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -4.375
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+        }
+      """, tree_ensemble_config)
+
+      # Create existing ensemble with one root split
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare feature inputs.
+      feature_ids = [0, 1, 0]
+
+      # All the gains are negative.
+      feature1_nodes = np.array([1], dtype=np.int32)
+      feature1_gains = np.array([-1.4], dtype=np.float32)
+      feature1_thresholds = np.array([21], dtype=np.int32)
+      feature1_left_node_contribs = np.array([[-6.0]], dtype=np.float32)
+      feature1_right_node_contribs = np.array([[1.65]], dtype=np.float32)
+
+      feature2_nodes = np.array([1, 2], dtype=np.int32)
+      feature2_gains = np.array([-0.63, -2.7], dtype=np.float32)
+      feature2_thresholds = np.array([23, 7], dtype=np.int32)
+      feature2_left_node_contribs = np.array([[-0.6], [-1.5]], dtype=np.float32)
+      feature2_right_node_contribs = np.array([[0.24], [2.3]], dtype=np.float32)
+
+      feature3_nodes = np.array([2], dtype=np.int32)
+      feature3_gains = np.array([-2.8], dtype=np.float32)
+      feature3_thresholds = np.array([3], dtype=np.int32)
+      feature3_left_node_contribs = np.array([[-0.75]], dtype=np.float32)
+      feature3_right_node_contribs = np.array([[1.93]], dtype=np.float32)
+
+      # Grow tree ensemble.
+      grow_op = boosted_trees_ops.update_ensemble(
+          tree_ensemble_handle,
+          learning_rate=0.1,
+          pruning_mode=boosted_trees_ops.PruningMode.PRE_PRUNING,
+          max_depth=3,
+          feature_ids=feature_ids,
+          node_ids=[feature1_nodes, feature2_nodes, feature3_nodes],
+          gains=[feature1_gains, feature2_gains, feature3_gains],
+          thresholds=[
+              feature1_thresholds, feature2_thresholds, feature3_thresholds
+          ],
+          left_node_contribs=[
+              feature1_left_node_contribs, feature2_left_node_contribs,
+              feature3_left_node_contribs
+          ],
+          right_node_contribs=[
+              feature1_right_node_contribs, feature2_right_node_contribs,
+              feature3_right_node_contribs
+          ])
+      session.run(grow_op)
+
+      # Expect that no new split was created because all the gains were negative
+      # Global metadata should be updated, tree metadata should not be updated.
+      new_stamp, serialized = session.run(tree_ensemble.serialize())
+      tree_ensemble = boosted_trees_pb2.TreeEnsemble()
+      tree_ensemble.ParseFromString(serialized)
+
+      expected_result = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 4
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 7.14
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -4.375
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 2
+        }
+      """
+      self.assertEqual(new_stamp, 1)
+      self.assertProtoEquals(expected_result, tree_ensemble)
+
+  def testPostPruningOfSomeNodes(self):
+    """Test growing an ensemble with post-pruning."""
+    with self.test_session() as session:
+      # Create empty ensemble.
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare inputs.
+      # Second feature has larger (but still negative gain).
+      feature_ids = [0, 1]
+
+      feature1_nodes = np.array([0], dtype=np.int32)
+      feature1_gains = np.array([-1.3], dtype=np.float32)
+      feature1_thresholds = np.array([7], dtype=np.int32)
+      feature1_left_node_contribs = np.array([[0.013]], dtype=np.float32)
+      feature1_right_node_contribs = np.array([[0.0143]], dtype=np.float32)
+
+      feature2_nodes = np.array([0], dtype=np.int32)
+      feature2_gains = np.array([-0.2], dtype=np.float32)
+      feature2_thresholds = np.array([33], dtype=np.int32)
+      feature2_left_node_contribs = np.array([[0.01]], dtype=np.float32)
+      feature2_right_node_contribs = np.array([[0.0143]], dtype=np.float32)
+
+      # Grow tree ensemble.
+      grow_op = boosted_trees_ops.update_ensemble(
+          tree_ensemble_handle,
+          learning_rate=1.0,
+          pruning_mode=boosted_trees_ops.PruningMode.POST_PRUNING,
+          max_depth=3,
+          feature_ids=feature_ids,
+          node_ids=[feature1_nodes, feature2_nodes],
+          gains=[feature1_gains, feature2_gains],
+          thresholds=[feature1_thresholds, feature2_thresholds],
+          left_node_contribs=[
+              feature1_left_node_contribs, feature2_left_node_contribs
+          ],
+          right_node_contribs=[
+              feature1_right_node_contribs, feature2_right_node_contribs
+          ])
+
+      session.run(grow_op)
+
+      # Expect the split from second features to be chosen despite the negative
+      # gain.
+      # No pruning happened just yet.
+      new_stamp, serialized = session.run(tree_ensemble.serialize())
+      res_ensemble = boosted_trees_pb2.TreeEnsemble()
+      res_ensemble.ParseFromString(serialized)
+
+      expected_result = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 33
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: -0.2
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.01
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.0143
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+        }
+      """
+      self.assertEqual(new_stamp, 1)
+      self.assertProtoEquals(expected_result, res_ensemble)
+
+      # Prepare the second layer.
+      # Note that node 1 gain is negative and node 2 gain is positive.
+      feature_ids = [3]
+      feature1_nodes = np.array([1, 2], dtype=np.int32)
+      feature1_gains = np.array([-0.2, 0.5], dtype=np.float32)
+      feature1_thresholds = np.array([7, 5], dtype=np.int32)
+      feature1_left_node_contribs = np.array(
+          [[0.07], [0.041]], dtype=np.float32)
+      feature1_right_node_contribs = np.array(
+          [[0.083], [0.064]], dtype=np.float32)
+
+      # Grow tree ensemble.
+      grow_op = boosted_trees_ops.update_ensemble(
+          tree_ensemble_handle,
+          learning_rate=1.0,
+          pruning_mode=boosted_trees_ops.PruningMode.POST_PRUNING,
+          max_depth=3,
+          feature_ids=feature_ids,
+          node_ids=[feature1_nodes],
+          gains=[feature1_gains],
+          thresholds=[feature1_thresholds],
+          left_node_contribs=[feature1_left_node_contribs],
+          right_node_contribs=[feature1_right_node_contribs])
+
+      session.run(grow_op)
+
+      # After adding this layer, the tree will not be finalized
+      new_stamp, serialized = session.run(tree_ensemble.serialize())
+      res_ensemble = boosted_trees_pb2.TreeEnsemble()
+      res_ensemble.ParseFromString(serialized)
+      expected_result = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id:1
+              threshold: 33
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: -0.2
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 3
+              threshold: 7
+              left_id: 3
+              right_id: 4
+            }
+            metadata {
+              gain: -0.2
+              original_leaf {
+                scalar: 0.01
+               }
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 3
+              threshold: 5
+              left_id: 5
+              right_id: 6
+            }
+            metadata {
+              gain: 0.5
+              original_leaf {
+                scalar: 0.0143
+               }
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.08
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.093
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.0553
+            }
+          }
+          nodes {
+            leaf {
+                scalar: 0.0783
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 2
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 2
+        }
+       """
+      self.assertEqual(new_stamp, 2)
+
+      self.assertProtoEquals(expected_result, res_ensemble)
+      # Now split the leaf 3, again with negative gain. After this layer, the
+      # tree will be finalized, and post-pruning happens. The leafs 3,4,7,8 will
+      # be pruned out.
+
+      # Prepare the third layer.
+      feature_ids = [92]
+      feature1_nodes = np.array([3], dtype=np.int32)
+      feature1_gains = np.array([-0.45], dtype=np.float32)
+      feature1_thresholds = np.array([11], dtype=np.int32)
+      feature1_left_node_contribs = np.array([[0.15]], dtype=np.float32)
+      feature1_right_node_contribs = np.array([[0.5]], dtype=np.float32)
+
+      # Grow tree ensemble.
+      grow_op = boosted_trees_ops.update_ensemble(
+          tree_ensemble_handle,
+          learning_rate=1.0,
+          pruning_mode=boosted_trees_ops.PruningMode.POST_PRUNING,
+          max_depth=3,
+          feature_ids=feature_ids,
+          node_ids=[feature1_nodes],
+          gains=[feature1_gains],
+          thresholds=[feature1_thresholds],
+          left_node_contribs=[feature1_left_node_contribs],
+          right_node_contribs=[feature1_right_node_contribs])
+
+      session.run(grow_op)
+      # After adding this layer, the tree will be finalized
+      new_stamp, serialized = session.run(tree_ensemble.serialize())
+      res_ensemble = boosted_trees_pb2.TreeEnsemble()
+      res_ensemble.ParseFromString(serialized)
+      # Node that nodes 3, 4, 7 and 8 got deleted, so metadata stores has ids
+      # mapped to their parent node 1, with the respective change in logits.
+      expected_result = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id:1
+              threshold: 33
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: -0.2
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.01
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 3
+              threshold: 5
+              left_id: 3
+              right_id: 4
+            }
+            metadata {
+              gain: 0.5
+              original_leaf {
+                scalar: 0.0143
+               }
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.0553
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.0783
+            }
+          }
+        }
+        trees {
+          nodes {
+            leaf {
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 3
+          is_finalized: true
+          post_pruned_nodes_meta {
+            new_node_id: 0
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 2
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: -0.07
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: -0.083
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 3
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 4
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: -0.22
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: -0.57
+          }
+        }
+        tree_metadata {
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 3
+        }
+       """
+      self.assertEqual(new_stamp, 3)
+      self.assertProtoEquals(expected_result, res_ensemble)
+
+  def testPostPruningOfAllNodes(self):
+    """Test growing an ensemble with post-pruning, with all nodes are pruned."""
+    with self.test_session() as session:
+      # Create empty ensemble.
+      # Create empty ensemble.
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare inputs. All have negative gains.
+      feature_ids = [0, 1]
+
+      feature1_nodes = np.array([0], dtype=np.int32)
+      feature1_gains = np.array([-1.3], dtype=np.float32)
+      feature1_thresholds = np.array([7], dtype=np.int32)
+      feature1_left_node_contribs = np.array([[0.013]], dtype=np.float32)
+      feature1_right_node_contribs = np.array([[0.0143]], dtype=np.float32)
+
+      feature2_nodes = np.array([0], dtype=np.int32)
+      feature2_gains = np.array([-0.62], dtype=np.float32)
+      feature2_thresholds = np.array([33], dtype=np.int32)
+      feature2_left_node_contribs = np.array([[0.01]], dtype=np.float32)
+      feature2_right_node_contribs = np.array([[0.0143]], dtype=np.float32)
+
+      # Grow tree ensemble.
+      grow_op = boosted_trees_ops.update_ensemble(
+          tree_ensemble_handle,
+          learning_rate=1.0,
+          pruning_mode=boosted_trees_ops.PruningMode.POST_PRUNING,
+          max_depth=2,
+          feature_ids=feature_ids,
+          node_ids=[feature1_nodes, feature2_nodes],
+          gains=[feature1_gains, feature2_gains],
+          thresholds=[feature1_thresholds, feature2_thresholds],
+          left_node_contribs=[
+              feature1_left_node_contribs, feature2_left_node_contribs
+          ],
+          right_node_contribs=[
+              feature1_right_node_contribs, feature2_right_node_contribs
+          ])
+
+      session.run(grow_op)
+
+      # Expect the split from feature 2 to be chosen despite the negative gain.
+      # The grown tree should not be finalized as max tree depth is 2 so no
+      # pruning occurs.
+      new_stamp, serialized = session.run(tree_ensemble.serialize())
+      res_ensemble = boosted_trees_pb2.TreeEnsemble()
+      res_ensemble.ParseFromString(serialized)
+
+      expected_result = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 33
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: -0.62
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.01
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.0143
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+        }
+      """
+      self.assertEqual(new_stamp, 1)
+      self.assertProtoEquals(expected_result, res_ensemble)
+
+      # Prepare inputs.
+      # All have negative gain.
+      feature_ids = [3]
+      feature1_nodes = np.array([1, 2], dtype=np.int32)
+      feature1_gains = np.array([-0.2, -0.5], dtype=np.float32)
+      feature1_thresholds = np.array([77, 79], dtype=np.int32)
+      feature1_left_node_contribs = np.array([[0.023], [0.3]], dtype=np.float32)
+      feature1_right_node_contribs = np.array(
+          [[0.012343], [24]], dtype=np.float32)
+
+      grow_op = boosted_trees_ops.update_ensemble(
+          tree_ensemble_handle,
+          learning_rate=1.0,
+          pruning_mode=boosted_trees_ops.PruningMode.POST_PRUNING,
+          max_depth=2,
+          feature_ids=feature_ids,
+          node_ids=[feature1_nodes],
+          gains=[feature1_gains],
+          thresholds=[feature1_thresholds],
+          left_node_contribs=[feature1_left_node_contribs],
+          right_node_contribs=[feature1_right_node_contribs])
+
+      session.run(grow_op)
+
+      # Expect the split from feature 1 to be chosen despite the negative gain.
+      # The grown tree should be finalized. Since all nodes have negative gain,
+      # the whole tree is pruned.
+      new_stamp, serialized = session.run(tree_ensemble.serialize())
+      res_ensemble = boosted_trees_pb2.TreeEnsemble()
+      res_ensemble.ParseFromString(serialized)
+
+      # Expect the ensemble to be empty as post-pruning will prune
+      # the entire finalized tree.
+      self.assertEqual(new_stamp, 2)
+      self.assertProtoEquals("""
+      trees {
+        nodes {
+          leaf {
+          }
+        }
+      }
+      trees {
+        nodes {
+          leaf {
+          }
+        }
+      }
+      tree_weights: 1.0
+      tree_weights: 1.0
+      tree_metadata{
+        num_layers_grown: 2
+        is_finalized: true
+        post_pruned_nodes_meta {
+          new_node_id: 0
+          logit_change: 0.0
+        }
+        post_pruned_nodes_meta {
+          new_node_id: 0
+          logit_change: -0.01
+        }
+        post_pruned_nodes_meta {
+          new_node_id: 0
+          logit_change: -0.0143
+        }
+        post_pruned_nodes_meta {
+          new_node_id: 0
+          logit_change: -0.033
+        }
+        post_pruned_nodes_meta {
+          new_node_id: 0
+          logit_change: -0.022343
+        }
+        post_pruned_nodes_meta {
+          new_node_id: 0
+          logit_change: -0.3143
+        }
+        post_pruned_nodes_meta {
+          new_node_id: 0
+          logit_change: -24.0143
+        }
+      }
+      tree_metadata {
+      }
+      growing_metadata {
+        num_trees_attempted: 1
+        num_layers_attempted: 2
+      }
+      """, res_ensemble)
+
+  def testPostPruningChangesNothing(self):
+    """Test growing an ensemble with post-pruning with all gains >0."""
+    with self.test_session() as session:
+      # Create empty ensemble.
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare inputs.
+      # Second feature has larger (but still negative gain).
+      feature_ids = [3, 4]
+
+      feature1_nodes = np.array([0], dtype=np.int32)
+      feature1_gains = np.array([7.62], dtype=np.float32)
+      feature1_thresholds = np.array([52], dtype=np.int32)
+      feature1_left_node_contribs = np.array([[-4.375]], dtype=np.float32)
+      feature1_right_node_contribs = np.array([[7.143]], dtype=np.float32)
+
+      feature2_nodes = np.array([0], dtype=np.int32)
+      feature2_gains = np.array([0.63], dtype=np.float32)
+      feature2_thresholds = np.array([23], dtype=np.int32)
+      feature2_left_node_contribs = np.array([[-0.6]], dtype=np.float32)
+      feature2_right_node_contribs = np.array([[0.24]], dtype=np.float32)
+
+      # Grow tree ensemble.
+      grow_op = boosted_trees_ops.update_ensemble(
+          tree_ensemble_handle,
+          learning_rate=1.0,
+          pruning_mode=boosted_trees_ops.PruningMode.POST_PRUNING,
+          max_depth=1,
+          feature_ids=feature_ids,
+          node_ids=[feature1_nodes, feature2_nodes],
+          gains=[feature1_gains, feature2_gains],
+          thresholds=[feature1_thresholds, feature2_thresholds],
+          left_node_contribs=[
+              feature1_left_node_contribs, feature2_left_node_contribs
+          ],
+          right_node_contribs=[
+              feature1_right_node_contribs, feature2_right_node_contribs
+          ])
+
+      session.run(grow_op)
+
+      # Expect the split from the first feature to be chosen.
+      # Pruning got triggered but changed nothing.
+      new_stamp, serialized = session.run(tree_ensemble.serialize())
+      res_ensemble = boosted_trees_pb2.TreeEnsemble()
+      res_ensemble.ParseFromString(serialized)
+
+      expected_result = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 3
+              threshold: 52
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -4.375
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 7.143
+            }
+          }
+        }
+        trees {
+          nodes {
+            leaf {
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: true
+        }
+        tree_metadata {
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+        }
+      """
+      self.assertEqual(new_stamp, 1)
+      self.assertProtoEquals(expected_result, res_ensemble)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/kernel_tests/check_ops_test.py b/tensorflow/python/kernel_tests/check_ops_test.py
index 26d3df9e635ea49c6f25e3bb1df25ea72df27304..5a83ec8d302b4c26aef7abfa7465eb9fd0cca019 100644
--- a/tensorflow/python/kernel_tests/check_ops_test.py
+++ b/tensorflow/python/kernel_tests/check_ops_test.py
@@ -212,6 +212,12 @@ First 2 elements of y:
         out = array_ops.identity(small)
       self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
+  def test_raises_when_not_equal_and_broadcastable_shapes(self):
+    cond = constant_op.constant([True, False], name="small")
+    with self.assertRaisesRegexp(errors.InvalidArgumentError, "fail"):
+      check_ops.assert_equal(cond, False, message="fail")
+
   @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_both_empty(self):
     larry = constant_op.constant([])
diff --git a/tensorflow/python/kernel_tests/concat_op_test.py b/tensorflow/python/kernel_tests/concat_op_test.py
index 81c6a4aa6e6edc4b49338a0bfd354302b1dfac0b..c22934ce47543ab11b6a5b9acde2e2ec3aec9da7 100644
--- a/tensorflow/python/kernel_tests/concat_op_test.py
+++ b/tensorflow/python/kernel_tests/concat_op_test.py
@@ -606,6 +606,17 @@ class ConcatOpTest(test.TestCase):
           inp_tensors_placeholders, -2, output_shape=[2, 3],
           gather_indexes=[2, 0], feed_dict=feed_dict)
 
+  def testConcatAxisType(self):
+    for dtype in [dtypes.int32, dtypes.int64]:
+      with self.test_session(use_gpu=True):
+        t1 = [[1, 2, 3], [4, 5, 6]]
+        t2 = [[7, 8, 9], [10, 11, 12]]
+
+        c = gen_array_ops.concat_v2([t1, t2],
+                                    constant_op.constant(1, dtype=dtype))
+        self.assertEqual([2, 6], c.get_shape().as_list())
+        output = c.eval()
+        self.assertAllEqual([[1, 2, 3, 7, 8, 9], [4, 5, 6, 10, 11, 12]], output)
 
 class ConcatOffsetTest(test.TestCase):
 
diff --git a/tensorflow/python/kernel_tests/constant_op_test.py b/tensorflow/python/kernel_tests/constant_op_test.py
index ffbdb0e61a8403fe7ccd36bad40887543023ad40..18796f709566f022258806ce46cc706e8fe34354 100644
--- a/tensorflow/python/kernel_tests/constant_op_test.py
+++ b/tensorflow/python/kernel_tests/constant_op_test.py
@@ -881,7 +881,7 @@ versions {
 class PlaceholderWithDefaultTest(test.TestCase):
 
   def testFullShape(self):
-    with self.test_session():
+    with self.test_session(force_gpu=test_util.is_gpu_available()):
       p = array_ops.placeholder_with_default([[2, 2], [2, 2]], shape=[2, 2])
       a = array_ops.identity(p)
       self.assertAllEqual([[2, 2], [2, 2]], a.eval())
@@ -892,7 +892,7 @@ class PlaceholderWithDefaultTest(test.TestCase):
         a.eval(feed_dict={p: [[6, 6, 6], [6, 6, 6]]})
 
   def testPartialShape(self):
-    with self.test_session():
+    with self.test_session(force_gpu=test_util.is_gpu_available()):
       p = array_ops.placeholder_with_default([1, 2, 3], shape=[None])
       a = array_ops.identity(p)
       self.assertAllEqual([1, 2, 3], a.eval())
@@ -902,7 +902,7 @@ class PlaceholderWithDefaultTest(test.TestCase):
         a.eval(feed_dict={p: [[2, 2], [2, 2]]})
 
   def testNoShape(self):
-    with self.test_session():
+    with self.test_session(force_gpu=test_util.is_gpu_available()):
       p = array_ops.placeholder_with_default([17], shape=None)
       a = array_ops.identity(p)
       self.assertAllEqual([17], a.eval())
@@ -911,11 +911,12 @@ class PlaceholderWithDefaultTest(test.TestCase):
           [[3, 3], [3, 3]], a.eval(feed_dict={p: [[3, 3], [3, 3]]}))
 
   def testGradient(self):
-    with self.test_session():
+    with self.test_session(force_gpu=test_util.is_gpu_available()):
       x = array_ops.placeholder(dtypes_lib.float32, [5, 7])
       y = array_ops.placeholder_with_default(x, None)
       err = gradient_checker.compute_gradient_error(x, [5, 7], y, [5, 7])
       self.assertLess(err, 1e-3)
 
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
index b429fa5c423effce0f0ccb0ad34875dab2808777..75f8644f694c4cebb7dbdac4599244dda427bc05 100644
--- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
@@ -144,7 +144,7 @@ class ControlFlowTest(test.TestCase):
 
       enter_v = control_flow_ops._Enter(v, "foo_1", is_constant=True)
       nine = constant_op.constant(9)
-      enter_nine = gen_control_flow_ops._enter(nine, "foo_1")
+      enter_nine = gen_control_flow_ops.enter(nine, "foo_1")
       op = state_ops.assign(enter_v, enter_nine)
       v2 = control_flow_ops.with_dependencies([op], enter_v)
       v3 = control_flow_ops.exit(v2)
@@ -164,9 +164,9 @@ class ControlFlowTest(test.TestCase):
   def testEnterMulExit(self):
     with self.test_session():
       data = constant_op.constant([1, 2, 3, 4, 5, 6], name="data")
-      enter_data = gen_control_flow_ops._enter(data, "foo_1", False)
+      enter_data = gen_control_flow_ops.enter(data, "foo_1", False)
       five = constant_op.constant(5)
-      enter_five = gen_control_flow_ops._enter(five, "foo_1", False)
+      enter_five = gen_control_flow_ops.enter(five, "foo_1", False)
       mul_op = math_ops.multiply(enter_data, enter_five)
       exit_op = control_flow_ops.exit(mul_op)
 
@@ -178,12 +178,12 @@ class ControlFlowTest(test.TestCase):
       v = variables.Variable([0.0, 0.0], dtype=dtypes.float32)
 
       # If is_constant=True, the shape information should be propagated.
-      enter_v_constant = gen_control_flow_ops._enter(
+      enter_v_constant = gen_control_flow_ops.enter(
           v, "frame1", is_constant=True)
       self.assertEqual(enter_v_constant.shape, [2])
 
       # Otherwise, the shape should be unknown.
-      enter_v_non_constant = gen_control_flow_ops._enter(
+      enter_v_non_constant = gen_control_flow_ops.enter(
           v, "frame2", is_constant=False)
       self.assertEqual(enter_v_non_constant.shape, None)
 
@@ -257,8 +257,8 @@ class ControlFlowTest(test.TestCase):
       false = ops.convert_to_tensor(False)
       n = constant_op.constant(10)
 
-      enter_false = gen_control_flow_ops._enter(false, "foo_1", False)
-      enter_n = gen_control_flow_ops._enter(n, "foo_1", False)
+      enter_false = gen_control_flow_ops.enter(false, "foo_1", False)
+      enter_n = gen_control_flow_ops.enter(n, "foo_1", False)
 
       merge_n = control_flow_ops.merge([enter_n, enter_n], name="merge_n")[0]
       switch_n = control_flow_ops.switch(merge_n, enter_false)
@@ -275,9 +275,9 @@ class ControlFlowTest(test.TestCase):
       one = constant_op.constant(1)
       n = constant_op.constant(10)
 
-      enter_i = gen_control_flow_ops._enter(zero, "foo", False)
-      enter_one = gen_control_flow_ops._enter(one, "foo", True)
-      enter_n = gen_control_flow_ops._enter(n, "foo", True)
+      enter_i = gen_control_flow_ops.enter(zero, "foo", False)
+      enter_one = gen_control_flow_ops.enter(one, "foo", True)
+      enter_n = gen_control_flow_ops.enter(n, "foo", True)
 
       with ops.device(test.gpu_device_name()):
         merge_i = control_flow_ops.merge([enter_i, enter_i])[0]
@@ -301,9 +301,9 @@ class ControlFlowTest(test.TestCase):
       one = constant_op.constant(1)
       n = constant_op.constant(10)
 
-      enter_i = gen_control_flow_ops._enter(zero, "foo", False)
-      enter_one = gen_control_flow_ops._enter(one, "foo", True)
-      enter_n = gen_control_flow_ops._enter(n, "foo", True)
+      enter_i = gen_control_flow_ops.enter(zero, "foo", False)
+      enter_one = gen_control_flow_ops.enter(one, "foo", True)
+      enter_n = gen_control_flow_ops.enter(n, "foo", True)
 
       merge_i = control_flow_ops.merge([enter_i, enter_i])[0]
 
@@ -324,8 +324,8 @@ class ControlFlowTest(test.TestCase):
   def testDifferentFrame(self):
     with self.test_session():
       data = array_ops.placeholder(dtypes.float32, shape=[])
-      enter_1 = gen_control_flow_ops._enter(data, "foo_1", False)
-      enter_2 = gen_control_flow_ops._enter(data, "foo_2", False)
+      enter_1 = gen_control_flow_ops.enter(data, "foo_1", False)
+      enter_2 = gen_control_flow_ops.enter(data, "foo_2", False)
       res = math_ops.add(enter_1, enter_2)
       with self.assertRaisesOpError("has inputs from different frames"):
         res.eval(feed_dict={data: 1.0})
@@ -591,10 +591,10 @@ class ControlFlowTest(test.TestCase):
       # Both v_f and v_t are uninitialized references. However, an actual use
       # of the reference in the 'true' branch in the 'tf.identity' op will
       # not 'fire' when v is uninitialized, so this is a valid construction.
-      # This test tests that _ref_identity allows uninitialized ref as input
+      # This test tests that ref_identity allows uninitialized ref as input
       # so that this construction is allowed.
-      v_f_op = gen_array_ops._ref_identity(v_f)
-      v_t_op = gen_array_ops._ref_identity(v_t)
+      v_f_op = gen_array_ops.ref_identity(v_f)
+      v_t_op = gen_array_ops.ref_identity(v_t)
       with ops.control_dependencies([v_f_op]):
         assign_v = state_ops.assign(v, [1.0])
       with ops.control_dependencies([v_t_op]):
@@ -633,7 +633,8 @@ class ControlFlowTest(test.TestCase):
       sess.run(r)
 
   def testCondGrad_1(self):
-    with self.test_session():
+    graph = ops.Graph()
+    with graph.as_default():
       x = constant_op.constant(10.0, name="x")
       pred = math_ops.less(1, 2)
       fn1 = lambda: array_ops.identity(x)
@@ -641,8 +642,14 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(pred, fn1, fn2)
 
       grad = gradients_impl.gradients(r, [x])[0]
-      result = grad.eval()
-    self.assertAllEqual(1.0, result)
+      with self.test_session():
+        self.assertAllEqual(1.0, grad.eval())
+    # The gradients computation creates a tensor with zeros by broadcasting a
+    # zeros constant to the required shape. Verify that the zero constant
+    # feeding into the fill is dominated by a Switch.
+    zero = graph.get_operation_by_name("gradients/zeros/Const")
+    self.assertEqual(len(zero.control_inputs), 1)
+    self.assertEqual(zero.control_inputs[0].type, "Switch")
 
   def testCondGrad_2(self):
     with self.test_session():
@@ -744,7 +751,7 @@ class ControlFlowTest(test.TestCase):
 
       def b(i, x):
         self.assertEqual(x.dtype, dtypes.int32_ref)
-        return (i + 1, gen_array_ops._ref_identity(x))
+        return (i + 1, gen_array_ops.ref_identity(x))
 
       r = control_flow_ops.while_loop(c, b, [i, x], parallel_iterations=5)
 
@@ -2205,12 +2212,9 @@ class ControlFlowTest(test.TestCase):
 
       self.assertEqual(x.dtype, dtypes.int32_ref)
 
-      # pylint: disable=protected-access
       def body(i, x):
         self.assertEqual(x.dtype, dtypes.int32_ref)
-        return [i + 1, gen_array_ops._ref_identity(x)]
-
-      # pylint: enable=protected-access
+        return [i + 1, gen_array_ops.ref_identity(x)]
 
       r = control_flow_ops.while_loop(c, body, [i, x], parallel_iterations=5)
 
diff --git a/tensorflow/python/kernel_tests/control_flow_util_test.py b/tensorflow/python/kernel_tests/control_flow_util_test.py
index 23185eaeece0d56fd83ecdf9e02c778712420465..39e96f74b0461da0cf499e303b30a4a41aae4899 100644
--- a/tensorflow/python/kernel_tests/control_flow_util_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_util_test.py
@@ -41,17 +41,17 @@ class ControlFlowUtilTest(test.TestCase):
     self.assertFalse(control_flow_util.IsSwitch(test_ops.int_output().op))
 
   def testIsLoopEnter(self):
-    enter = gen_control_flow_ops._enter(1, frame_name="name").op
+    enter = gen_control_flow_ops.enter(1, frame_name="name").op
     self.assertTrue(control_flow_util.IsLoopEnter(enter))
     self.assertFalse(control_flow_util.IsLoopConstantEnter(enter))
 
-    ref_enter = gen_control_flow_ops._ref_enter(test_ops.ref_output(),
-                                                frame_name="name").op
+    ref_enter = gen_control_flow_ops.ref_enter(test_ops.ref_output(),
+                                               frame_name="name").op
     self.assertTrue(control_flow_util.IsLoopEnter(ref_enter))
     self.assertFalse(control_flow_util.IsLoopConstantEnter(ref_enter))
 
-    const_enter = gen_control_flow_ops._enter(1, frame_name="name",
-                                              is_constant=True).op
+    const_enter = gen_control_flow_ops.enter(1, frame_name="name",
+                                             is_constant=True).op
     self.assertTrue(control_flow_util.IsLoopEnter(const_enter))
     self.assertTrue(control_flow_util.IsLoopConstantEnter(const_enter))
 
diff --git a/tensorflow/python/kernel_tests/conv_ops_3d_test.py b/tensorflow/python/kernel_tests/conv_ops_3d_test.py
index ec8ac74163d093c57e6e4ffbab6977ce732cc3ef..f4616fd661f989c1c3e4939a3d062b0260f8572e 100644
--- a/tensorflow/python/kernel_tests/conv_ops_3d_test.py
+++ b/tensorflow/python/kernel_tests/conv_ops_3d_test.py
@@ -25,6 +25,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import nn_ops
@@ -344,6 +345,8 @@ class Conv3DTest(test.TestCase):
         if data_format == "NCDHW":
           conv = test_util.NCHWToNHWC(conv)
 
+        self.assertEqual(conv.shape, tensor_shape.TensorShape(output_shape))
+
         if test_input:
           jacob_t, jacob_n = gradient_checker.compute_gradient(
               orig_input_tensor, input_shape, conv, output_shape)
diff --git a/tensorflow/python/kernel_tests/conv_ops_test.py b/tensorflow/python/kernel_tests/conv_ops_test.py
index 25525cc1285d8dbc305fdcf838143b6e1c420316..a291bef0ad6f16184ff29f665457a53b77447d54 100644
--- a/tensorflow/python/kernel_tests/conv_ops_test.py
+++ b/tensorflow/python/kernel_tests/conv_ops_test.py
@@ -159,11 +159,11 @@ class Conv2DTest(test.TestCase):
 
   def _DtypesToTest(self, use_gpu):
     if use_gpu and not test_util.CudaSupportsHalfMatMulAndConv():
-      return [dtypes.float32]
+      return [dtypes.float32, dtypes.float64]
     else:
       # It is important that float32 comes before float16 here,
       # as we will be using its gradients as reference for fp16 gradients.
-      return [dtypes.float32, dtypes.float16]
+      return [dtypes.float32, dtypes.float16, dtypes.float64]
 
   def _SetupValuesForDevice(self, tensor_in_sizes, filter_in_sizes, dilations,
                             strides, padding, data_format, dtype, use_gpu):
diff --git a/tensorflow/python/kernel_tests/cwise_ops_test.py b/tensorflow/python/kernel_tests/cwise_ops_test.py
index 8db0bb6f0dc495e7be2cd717787acf87156f42af..34e77512434ea26d2693f0bb1600ff81dd15e84f 100644
--- a/tensorflow/python/kernel_tests/cwise_ops_test.py
+++ b/tensorflow/python/kernel_tests/cwise_ops_test.py
@@ -2165,5 +2165,47 @@ class AccumulateTest(test.TestCase):
         math_ops.accumulate_n([a], tensor_dtype=np.int32)
 
 
+class PolyvalTest(test.TestCase):
+
+  def _runtest(self, dtype, degree):
+    x = np.random.rand(2, 2).astype(dtype)
+    coeffs = [np.random.rand(2, 2).astype(dtype) for _ in range(degree + 1)]
+    np_val = np.polyval(coeffs, x)
+    with self.test_session():
+      tf_val = math_ops.polyval(coeffs, x)
+      self.assertAllClose(np_val, tf_val.eval())
+
+  def testSimple(self):
+    for dtype in [
+        np.int32, np.float32, np.float64, np.complex64, np.complex128
+    ]:
+      for degree in range(5):
+        self._runtest(dtype, degree)
+
+  def testBroadcast(self):
+    dtype = np.float32
+    degree = 3
+    shapes = [(1,), (2, 1), (1, 2), (2, 2)]
+    for x_shape in shapes:
+      for coeff_shape in shapes:
+        x = np.random.rand(*x_shape).astype(dtype)
+        coeffs = [
+            np.random.rand(*coeff_shape).astype(dtype)
+            for _ in range(degree + 1)
+        ]
+        np_val = np.polyval(coeffs, x)
+        with self.test_session():
+          tf_val = math_ops.polyval(coeffs, x)
+          self.assertAllClose(np_val, tf_val.eval())
+
+  def testEmpty(self):
+    x = np.random.rand(2, 2).astype(np.float32)
+    coeffs = []
+    np_val = np.polyval(coeffs, x)
+    with self.test_session():
+      tf_val = math_ops.polyval(coeffs, x)
+      self.assertAllClose(np_val, tf_val.eval())
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/distributions/BUILD b/tensorflow/python/kernel_tests/distributions/BUILD
index e220d0569281c6dbe4107fdfb8013e99592f153c..f3cc9636f91f7d3573d8a66d6b1b4936e49a9141 100644
--- a/tensorflow/python/kernel_tests/distributions/BUILD
+++ b/tensorflow/python/kernel_tests/distributions/BUILD
@@ -280,15 +280,3 @@ cuda_py_test(
         "//tensorflow/python:platform_test",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/kernel_tests/distributions/uniform_test.py b/tensorflow/python/kernel_tests/distributions/uniform_test.py
index df99a0ed257da20179909eb44eacf7d44528dad2..a8def95b147b6dd4825675769187733b8493b374 100644
--- a/tensorflow/python/kernel_tests/distributions/uniform_test.py
+++ b/tensorflow/python/kernel_tests/distributions/uniform_test.py
@@ -281,6 +281,22 @@ class UniformTest(test.TestCase):
       expected_pdf = [1.0, 0.1]
       self.assertAllClose(expected_pdf, pdf.eval())
 
+  def testUniformFloat64(self):
+    uniform = uniform_lib.Uniform(
+        low=np.float64(0.), high=np.float64(1.))
+
+    self.assertAllClose(
+        [1., 1.],
+        self.evaluate(uniform.prob(np.array([0.5, 0.6], dtype=np.float64))))
+
+    self.assertAllClose(
+        [0.5, 0.6],
+        self.evaluate(uniform.cdf(np.array([0.5, 0.6], dtype=np.float64))))
+
+    self.assertAllClose(0.5, self.evaluate(uniform.mean()))
+    self.assertAllClose(1 / 12., self.evaluate(uniform.variance()))
+    self.assertAllClose(0., self.evaluate(uniform.entropy()))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/functional_ops_test.py b/tensorflow/python/kernel_tests/functional_ops_test.py
index f5717a5a21a0be82382c5da556ed6f5540591abf..34fb655035d6cadab583c1f66dbeae3f7a0b65b0 100644
--- a/tensorflow/python/kernel_tests/functional_ops_test.py
+++ b/tensorflow/python/kernel_tests/functional_ops_test.py
@@ -24,6 +24,7 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
@@ -39,6 +40,7 @@ import tensorflow.python.ops.tensor_array_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
 
 
+# pylint: disable=invalid-name
 def simple_scoped_fn(a, x):
   """Simple function: (a, x) -> 2(x+a), but with "2" as a variable in scope."""
   with variable_scope.variable_scope("body"):
@@ -158,6 +160,13 @@ class FunctionalOpsTest(test.TestCase):
                 values=constant_op.constant([0, 1, 2]),
                 dense_shape=[2, 2]))
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testMapOverScalarErrors(self):
+    with self.assertRaisesRegexp(ValueError, "not scalars"):
+      functional_ops.map_fn(lambda x: x, [1, 2])
+    with self.assertRaisesRegexp(ValueError, "not a scalar"):
+      functional_ops.map_fn(lambda x: x, 1)
+
   def testMap_Scoped(self):
     with self.test_session() as sess:
 
@@ -229,7 +238,7 @@ class FunctionalOpsTest(test.TestCase):
     with self.test_session():
       nums = np.array([1, 2, 3, 4, 5, 6])
       with self.assertRaisesRegexp(
-          TypeError, r"two structures don't have the same sequence type."):
+          TypeError, r"two structures don't have the same nested structure"):
         # lambda emits tuple, but dtype is a list
         functional_ops.map_fn(
             lambda x: ((x + 3) * 2, -(x + 3) * 2),
@@ -316,7 +325,7 @@ class FunctionalOpsTest(test.TestCase):
       initializer = np.array(1.0)
       # Multiply a * 1 each time
       with self.assertRaisesRegexp(
-          ValueError, "two structures don't have the same number of elements"):
+          ValueError, "two structures don't have the same nested structure"):
         functional_ops.scan(lambda a, x: (a, -a), elems, initializer)
 
   def testScan_Scoped(self):
@@ -607,6 +616,276 @@ class FunctionalOpsTest(test.TestCase):
       mul = sess.run(remote_op)
       self.assertEqual(mul, 9)
 
+  def testIf(self):
+
+    @function.Defun(dtypes.float32)
+    def Twice(x):
+      return x * 2
+
+    @function.Defun(dtypes.float32)
+    def Thrice(x):
+      return x * 3 + 1
+
+    with self.test_session(use_gpu=False) as sess:
+
+      def Run(x):
+        return sess.run(
+            functional_ops.If(math_ops.greater(x, 0), [x], Twice, Thrice))[0]
+
+      self.assertAllEqual(Run(9.), 18.)
+      self.assertAllEqual(Run(-8.), -23.)
+      self.assertAllEqual(Run(0.), 1.)
+
+  def testWhile(self):
+
+    @function.Defun(*[dtypes.float32] * 2)
+    def Cond(n, unused_x):
+      return n > 0
+
+    @function.Defun(*[dtypes.float32] * 2)
+    def Body(n, x):
+      return n - 1, x + n
+
+    # TODO(b/65752372): Set `use_gpu=False` because
+    # `functional_ops.While()` does not reliably work on GPU (apparently
+    # because the result of evaluating the condition may be in device
+    # memory, but it is read on the host).
+    with self.test_session(use_gpu=False) as sess:
+
+      def Run(n):
+        return sess.run(functional_ops.While([n, 0.], Cond, Body))[1]
+
+      self.assertAllEqual(Run(20.), 210.)
+      self.assertAllEqual(Run(100.), 5050.)
+
+  def testWhileError(self):
+
+    @function.Defun(*[dtypes.float32] * 2)
+    def Cond(n, unused_x):
+      return n > 0
+
+    @function.Defun(*[dtypes.float32] * 2)
+    def CondReturnsTooManyArgs(n, x):
+      return n > 0, x
+
+    @function.Defun(*[dtypes.float32] * 2)
+    def Body(n, x):
+      return n - 1, x + n
+
+    @function.Defun(*[dtypes.float32] * 2)
+    def BodyReturnsTooManyArgs(n, x):
+      return n - 1, x + n, x
+
+    # TODO(b/65752372): Set `use_gpu=False` because
+    # `functional_ops.While()` does not reliably work on GPU (apparently
+    # because the result of evaluating the condition may be in device
+    # memory, but it is read on the host).
+    with self.test_session(use_gpu=False):
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   "Expected a single scalar.*got 2 tensors."):
+        functional_ops.While([5., 0.], CondReturnsTooManyArgs, Body)[0].eval()
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          "While loop body returned 3 arguments. Expected: 2"):
+        functional_ops.While([5., 0.], Cond, BodyReturnsTooManyArgs)[0].eval()
+
+  def testWhileInMultipleSubgraphs(self):
+
+    @function.Defun(* [dtypes.float32] * 2)
+    def Cond(n, x):  # pylint: disable=unused-argument
+      return n > 0
+
+    @function.Defun(* [dtypes.float32] * 2)
+    def Body(n, x):
+      return n - 1, x + n
+
+    # TODO(b/65752372): Set `use_gpu=False` because
+    # `functional_ops.While()` does not reliably work on GPU (apparently
+    # because the result of evaluating the condition may be in device
+    # memory, but it is read on the host).
+    with self.test_session(use_gpu=False) as sess:
+      n = array_ops.placeholder(dtypes.float32)
+      _, result = functional_ops.While([n, 0.], Cond, Body)
+      c = constant_op.constant(37.)
+
+      self.assertAllEqual(210., sess.run(result, feed_dict={n: 20.}))
+      self.assertAllEqual(5050., sess.run(result, feed_dict={n: 100.}))
+      # Test that the result is the same when we run a different subgraph.
+      self.assertAllEqual(5050., sess.run([result, c], feed_dict={n: 100.})[0])
+
+  def _tfSum(self, rewrite_with_while):
+    # On GPU, don't rewrite using a while loop.
+    use_gpu = not rewrite_with_while
+    with self.test_session(use_gpu=use_gpu) as sess:
+
+      @function.Defun(dtypes.int32, dtypes.float32)
+      def Body(n, x):
+        return x + math_ops.to_float(n)
+
+      xs = [
+          # 1 + 2  + ... + 20
+          functional_ops.For(
+              1, 21, 1, [0.], Body, rewrite_with_while=rewrite_with_while)[0],
+          # 100 + 99 + ... + 1
+          functional_ops.For(
+              100, 0, -1, [0.], Body, rewrite_with_while=rewrite_with_while)[0],
+      ]
+      xvals = sess.run(xs)
+    self.assertAllEqual(210, xvals[0])
+    self.assertAllEqual(5050, xvals[1])
+
+  def testFor(self):
+    self._tfSum(False)
+
+  def testForWithWhile(self):
+    self._tfSum(True)
+
+  def testForWithWhileNaming(self):
+    g = ops.Graph()
+    with g.as_default():
+
+      @function.Defun(dtypes.int32, dtypes.float32, func_name="TestBody")
+      def TestBody(n, x):
+        return x + math_ops.to_float(n)
+
+      _ = functional_ops.For(
+          1, 21, 1, [0.], TestBody, rewrite_with_while=True)[0]
+
+    names = []
+    for func in g.as_graph_def().library.function:
+      names.append(func.signature.name)
+    self.assertTrue("TestBody" in names)
+    self.assertTrue("TestBody_Cond" in names)
+    self.assertTrue("TestBody_Body" in names)
+
+  def testForCapturedInputs(self):
+    v = variables.Variable(1.0)
+
+    @function.Defun(dtypes.int32)
+    def TestNullary(n):
+      v + math_ops.to_float(n)  # pylint: disable=expression-not-assigned
+
+    @function.Defun(dtypes.int32, dtypes.float32)
+    def TestUnary(n, x):
+      return x + math_ops.to_float(n) + v
+
+    @function.Defun(dtypes.int32, dtypes.float32, dtypes.float32)
+    def TestBinary(n, x, x2):
+      return x + math_ops.to_float(n) + v, x2 + v
+
+    for rewrite_with_while in (True, False):
+      # TODO(b/65752372): Set `use_gpu=False` because
+      # `functional_ops.While()` does not reliably work on GPU (apparently
+      # because the result of evaluating the condition may be in device
+      # memory, but it is read on the host).
+      use_gpu = not rewrite_with_while
+      with self.test_session(use_gpu=use_gpu) as sess:
+        result_nullary = functional_ops.For(
+            1, 10, 1, [], TestNullary,
+            rewrite_with_while=rewrite_with_while)
+        result_unary = functional_ops.For(
+            1, 10, 1, [0.], TestUnary,
+            rewrite_with_while=rewrite_with_while)
+        result_binary = functional_ops.For(
+            1, 10, 1, [0., 0.], TestBinary,
+            rewrite_with_while=rewrite_with_while)
+        sess.run(variables.global_variables_initializer())
+        assert not result_nullary
+        # The nullary variant doesn't return anything so we can't easily run it.
+        # As a total hack, fetch the operation by name and run it.
+        sess.run(ops.get_default_graph().get_operation_by_name(
+            "While" if rewrite_with_while else "For"))
+        assert len(result_unary) == 1
+        self.assertEqual([54.0], sess.run(result_unary))
+        assert len(result_binary) == 2
+        self.assertEqual([54.0, 9.0], sess.run(result_binary))
+
+  def _tfMLP(self, xval, wsval, bsval, rewrite_with_while):
+    # On GPU, don't rewrite using a while loop.
+    use_gpu = not rewrite_with_while
+    with self.test_session(use_gpu=use_gpu):
+
+      @function.Defun(dtypes.int32, *[dtypes.float64] * 3)
+      def MLP(i, a, ws, bs):
+        a = math_ops.tanh(math_ops.matmul(a, ws[i, :]) + bs[i, :])
+        return a, ws, bs
+
+      ret = functional_ops.For(
+          0,
+          wsval.shape[0],
+          1, [xval, wsval, bsval],
+          MLP,
+          rewrite_with_while=rewrite_with_while)[0]
+
+      return ret.eval()
+
+  def _npMLP(self, xval, wsval, bsval):
+    for i in range(wsval.shape[0]):
+      xval = np.tanh(np.dot(xval, wsval[i, :]) + bsval[i, :])
+    return xval
+
+  def _testForMLP(self, rewrite_with_while):
+    # We construct a 5-layer Multi-Layer Perceptron network here.
+    # Each layer have the same number of hidden unites (3), and the
+    # activation function is tanh().  We feed the input (xval) with
+    # batch size 2.
+    xval = np.random.normal(size=(2, 3))
+    wsval = np.random.normal(size=(5, 3, 3))
+    bsval = np.random.normal(size=(5, 3))
+    np_ans = self._npMLP(xval, wsval, bsval)
+    tf_for_ans = self._tfMLP(xval, wsval, bsval, rewrite_with_while)
+    self.assertAllClose(np_ans, tf_for_ans)
+
+  def testForMLP(self):
+    self._testForMLP(False)
+
+  def testForMLPWhile(self):
+    self._testForMLP(True)
+
+  def testForError(self):
+
+    @function.Defun(dtypes.int32, dtypes.float32)
+    def Foo(i, v):
+      return math_ops.to_float(i) + v
+
+    @function.Defun(dtypes.int32, dtypes.float32)
+    def ReturnsTooManyArgs(unused_i, v):
+      return v, v
+
+    with self.test_session(use_gpu=True):
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   "must be a scalar"):
+        functional_ops.For([0], 10, 1, [0.0], Foo)[0].eval()
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   "Invalid start/limit/delta"):
+        functional_ops.For(0, 10, -1, [0.0], Foo)[0].eval()
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          "For loop body returned 2 arguments. Expected: 1"):
+        functional_ops.For(0, 10, 1, [0.0], ReturnsTooManyArgs)[0].eval()
+
+  def testGradient(self):
+
+    @function.Defun(dtypes.float32)
+    def Poly(x):
+      # y = 2x^3+3x^2+4x+8
+      return 2 * x * x * x + 3 * x * x + 4 * x + 8
+
+    @function.Defun(dtypes.float32)
+    def Grad(x):
+      # dy/dx = dy/dy * dy/dx = 1.0 * (6x^2+6x+4)
+      return functional_ops.Gradient([x, 1.0], Poly)[0]
+
+    with self.test_session(use_gpu=False) as sess:
+      a = constant_op.constant(0.)
+      avals = [Poly(a), Grad(a)]
+      b = constant_op.constant(1.)
+      bvals = [Poly(b), Grad(b)]
+      self.assertAllEqual(sess.run(avals), [8., 4.])
+      self.assertAllEqual(sess.run(bvals), [17., 16.])
+
 
 if __name__ == "__main__":
   test.main()
+
+# pylint: enable=invalid-name
diff --git a/tensorflow/python/kernel_tests/identity_op_py_test.py b/tensorflow/python/kernel_tests/identity_op_py_test.py
index 2cfe420bd49ec44815d1386bd873b234d8710e9d..49fb76d5b41de18ed3ba2187e85cb288e7344c38 100644
--- a/tensorflow/python/kernel_tests/identity_op_py_test.py
+++ b/tensorflow/python/kernel_tests/identity_op_py_test.py
@@ -65,7 +65,7 @@ class IdentityOpTest(test.TestCase):
           constant_op.constant(
               [[1, 2, 3], [6, 5, 4]], dtype=dtypes.int32))
       self.assertEquals(shape, tensor.get_shape())
-      self.assertEquals(shape, gen_array_ops._ref_identity(tensor).get_shape())
+      self.assertEquals(shape, gen_array_ops.ref_identity(tensor).get_shape())
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/init_ops_test.py b/tensorflow/python/kernel_tests/init_ops_test.py
index c1755985ee85c62005c8d3d5fb916859193aa5f3..1e5c118cbc3573af0a2ce95239f499a5e52a0c86 100644
--- a/tensorflow/python/kernel_tests/init_ops_test.py
+++ b/tensorflow/python/kernel_tests/init_ops_test.py
@@ -618,7 +618,7 @@ class ConvolutionDeltaOrthogonalInitializerTest(test.TestCase):
     for dtype in [dtypes.float32]:
       for kernel_size in [[3], [8], [3, 5], [2, 4], [3, 3, 3], [2, 2, 2]]:
         tol = 1e-2
-        # Check orthogonality by computing the 2-norms of the inputs and ouputs.
+        # Check orthogonality by computing the 2-norms of the inputs and outputs.
         if len(kernel_size) == 1:
           shape = [4, 32, 64]
           convolution = convolutional.conv1d
@@ -649,6 +649,30 @@ class ConvolutionDeltaOrthogonalInitializerTest(test.TestCase):
               sess.run(outputs_2norm)/(np.sqrt(np.prod(shape))*np.sqrt(3.14)),
               rtol=tol, atol=tol)
 
+  def testNonuniformity(self):
+    value = 0
+    abs_value = 0
+    shape = [3, 3, 10, 10]
+    count = 70
+    tol = 1e-5
+    with self.test_session(use_gpu=True):  # as sess:
+      for i in range(count):
+        x = variable_scope.get_variable("{}".format(i), shape=shape,
+                                        initializer=
+                                        init_ops.convolutional_delta_orthogonal)
+        x.initializer.run()
+        y = x.eval()[1, 1, :, :]
+        determinant = np.linalg.det(y)
+        value += determinant
+        abs_value += np.abs(determinant)
+
+      # Check there is some variation in the signs of the determinants
+      self.assertLess(value, count - tol)
+      self.assertLess(-count + tol, value)
+      # Check all determinants have absolute value 1
+      # Compute the sum of the absolute values of 'count' determinants
+      self.assertAllClose(abs_value, count, rtol=tol, atol=tol)
+
 
 class IdentityInitializerTest(test.TestCase):
 
diff --git a/tensorflow/python/kernel_tests/large_concat_op_test.py b/tensorflow/python/kernel_tests/large_concat_op_test.py
index 66afb6ec014991ca32efd5b0895ff695d3d1015f..184d1dde2aa5e8d786cb85141f8dfb90c0bdad63 100644
--- a/tensorflow/python/kernel_tests/large_concat_op_test.py
+++ b/tensorflow/python/kernel_tests/large_concat_op_test.py
@@ -19,10 +19,12 @@ from __future__ import print_function
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
+@test_util.with_c_api
 class LargeConcatOpTest(test.TestCase):
   """Tests that belong in concat_op_test.py, but run over large tensors."""
 
diff --git a/tensorflow/python/kernel_tests/linalg/BUILD b/tensorflow/python/kernel_tests/linalg/BUILD
index fd1b5bab6f5aa072c8821eb053bd8d39391be4d4..9555e510997a6aa07797dffa1a6e4810b0b4e5d2 100644
--- a/tensorflow/python/kernel_tests/linalg/BUILD
+++ b/tensorflow/python/kernel_tests/linalg/BUILD
@@ -140,15 +140,3 @@ cuda_py_test(
     ],
     shard_count = 5,
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_util_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_util_test.py
index e1edffc3d9afec618f8dfcf74bae1b0f1bde2772..7b291e29de41d2fe37257bb42222ac23fc8e1d3f 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_util_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_util_test.py
@@ -23,6 +23,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.linalg import linear_operator_util
 from tensorflow.python.platform import test
@@ -94,8 +95,8 @@ class AssertNoEntriesWithModulusZeroTest(test.TestCase):
 class BroadcastMatrixBatchDimsTest(test.TestCase):
 
   def test_zero_batch_matrices_returned_as_empty_list(self):
-    self.assertAllEqual(
-        [], linear_operator_util.broadcast_matrix_batch_dims([]))
+    self.assertAllEqual([],
+                        linear_operator_util.broadcast_matrix_batch_dims([]))
 
   def test_one_batch_matrix_returned_after_tensor_conversion(self):
     arr = rng.rand(2, 3, 4)
@@ -194,6 +195,44 @@ class BroadcastMatrixBatchDimsTest(test.TestCase):
       linear_operator_util.broadcast_matrix_batch_dims([y, x])
 
 
+class CholeskySolveWithBroadcastTest(test.TestCase):
+
+  def test_static_dims_broadcast(self):
+    # batch_shape = [2]
+    chol = rng.rand(3, 3)
+    rhs = rng.rand(2, 3, 7)
+    chol_broadcast = chol + np.zeros((2, 1, 1))
+
+    with self.test_session():
+      result = linear_operator_util.cholesky_solve_with_broadcast(chol, rhs)
+      self.assertAllEqual((2, 3, 7), result.get_shape())
+      expected = linalg_ops.cholesky_solve(chol_broadcast, rhs)
+      self.assertAllEqual(expected.eval(), result.eval())
+
+  def test_dynamic_dims_broadcast_64bit(self):
+    # batch_shape = [2, 2]
+    chol = rng.rand(2, 3, 3)
+    rhs = rng.rand(2, 1, 3, 7)
+    chol_broadcast = chol + np.zeros((2, 2, 1, 1))
+    rhs_broadcast = rhs + np.zeros((2, 2, 1, 1))
+
+    chol_ph = array_ops.placeholder(dtypes.float64)
+    rhs_ph = array_ops.placeholder(dtypes.float64)
+
+    with self.test_session() as sess:
+      result, expected = sess.run(
+          [
+              linear_operator_util.cholesky_solve_with_broadcast(
+                  chol_ph, rhs_ph),
+              linalg_ops.cholesky_solve(chol_broadcast, rhs_broadcast)
+          ],
+          feed_dict={
+              chol_ph: chol,
+              rhs_ph: rhs,
+          })
+      self.assertAllEqual(expected, result)
+
+
 class MatmulWithBroadcastTest(test.TestCase):
 
   def test_static_dims_broadcast(self):
@@ -209,7 +248,7 @@ class MatmulWithBroadcastTest(test.TestCase):
       expected = math_ops.matmul(x, y_broadcast)
       self.assertAllEqual(expected.eval(), result.eval())
 
-  def test_dynamic_dims_broadcast_32bit(self):
+  def test_dynamic_dims_broadcast_64bit(self):
     # batch_shape = [2]
     # for each batch member, we have a 1x3 matrix times a 3x7 matrix ==> 1x7
     x = rng.rand(2, 1, 3)
@@ -221,9 +260,90 @@ class MatmulWithBroadcastTest(test.TestCase):
 
     with self.test_session() as sess:
       result, expected = sess.run(
-          [linear_operator_util.matmul_with_broadcast(x_ph, y_ph),
-           math_ops.matmul(x, y_broadcast)],
-          feed_dict={x_ph: x, y_ph: y})
+          [
+              linear_operator_util.matmul_with_broadcast(x_ph, y_ph),
+              math_ops.matmul(x, y_broadcast)
+          ],
+          feed_dict={
+              x_ph: x,
+              y_ph: y
+          })
+      self.assertAllEqual(expected, result)
+
+
+class MatrixSolveWithBroadcastTest(test.TestCase):
+
+  def test_static_dims_broadcast(self):
+    # batch_shape = [2]
+    matrix = rng.rand(3, 3)
+    rhs = rng.rand(2, 3, 7)
+    matrix_broadcast = matrix + np.zeros((2, 1, 1))
+
+    with self.test_session():
+      result = linear_operator_util.matrix_solve_with_broadcast(matrix, rhs)
+      self.assertAllEqual((2, 3, 7), result.get_shape())
+      expected = linalg_ops.matrix_solve(matrix_broadcast, rhs)
+      self.assertAllEqual(expected.eval(), result.eval())
+
+  def test_dynamic_dims_broadcast_64bit(self):
+    # batch_shape = [2, 2]
+    matrix = rng.rand(2, 3, 3)
+    rhs = rng.rand(2, 1, 3, 7)
+    matrix_broadcast = matrix + np.zeros((2, 2, 1, 1))
+    rhs_broadcast = rhs + np.zeros((2, 2, 1, 1))
+
+    matrix_ph = array_ops.placeholder(dtypes.float64)
+    rhs_ph = array_ops.placeholder(dtypes.float64)
+
+    with self.test_session() as sess:
+      result, expected = sess.run(
+          [
+              linear_operator_util.matrix_solve_with_broadcast(
+                  matrix_ph, rhs_ph),
+              linalg_ops.matrix_solve(matrix_broadcast, rhs_broadcast)
+          ],
+          feed_dict={
+              matrix_ph: matrix,
+              rhs_ph: rhs,
+          })
+      self.assertAllEqual(expected, result)
+
+
+class MatrixTriangularSolveWithBroadcastTest(test.TestCase):
+
+  def test_static_dims_broadcast(self):
+    # batch_shape = [2]
+    matrix = rng.rand(2, 3, 3)
+    rhs = rng.rand(3, 7)
+    rhs_broadcast = rhs + np.zeros((2, 1, 1))
+
+    with self.test_session():
+      result = linear_operator_util.matrix_triangular_solve_with_broadcast(
+          matrix, rhs)
+      self.assertAllEqual((2, 3, 7), result.get_shape())
+      expected = linalg_ops.matrix_triangular_solve(matrix, rhs_broadcast)
+      self.assertAllEqual(expected.eval(), result.eval())
+
+  def test_dynamic_dims_broadcast_64bit(self):
+    # batch_shape = [2]
+    matrix = rng.rand(2, 3, 3)
+    rhs = rng.rand(3, 7)
+    rhs_broadcast = rhs + np.zeros((2, 1, 1))
+
+    matrix_ph = array_ops.placeholder(dtypes.float64)
+    rhs_ph = array_ops.placeholder(dtypes.float64)
+
+    with self.test_session() as sess:
+      result, expected = sess.run(
+          [
+              linear_operator_util.matrix_triangular_solve_with_broadcast(
+                  matrix_ph, rhs_ph),
+              linalg_ops.matrix_triangular_solve(matrix, rhs_broadcast)
+          ],
+          feed_dict={
+              matrix_ph: matrix,
+              rhs_ph: rhs,
+          })
       self.assertAllEqual(expected, result)
 
 
@@ -244,7 +364,7 @@ class AssertCompatibleMatrixDimensionsTest(test.TestCase):
       operator = DomainDimensionStubOperator(3)
       # Should not raise
       linear_operator_util.assert_compatible_matrix_dimensions(
-          operator, x).run()
+          operator, x).run()  # pyformat: disable
 
   def test_incompatible_dimensions_raise(self):
     with self.test_session():
@@ -252,7 +372,7 @@ class AssertCompatibleMatrixDimensionsTest(test.TestCase):
       operator = DomainDimensionStubOperator(3)
       with self.assertRaisesOpError("Incompatible matrix dimensions"):
         linear_operator_util.assert_compatible_matrix_dimensions(
-            operator, x).run()
+            operator, x).run()  # pyformat: disable
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/list_ops_test.py b/tensorflow/python/kernel_tests/list_ops_test.py
index 8865e165fd6bb15b53d10f336e526f8e830500b1..dbbed39c727f01ed1fae271375575c690958c7d8 100644
--- a/tensorflow/python/kernel_tests/list_ops_test.py
+++ b/tensorflow/python/kernel_tests/list_ops_test.py
@@ -152,6 +152,28 @@ class ListOpsTest(test_util.TensorFlowTestCase):
       s1 = list_ops.tensor_list_stack(t1, element_dtype=dtypes.int32).eval()
       self.assertAllEqual(s1, [0, 1, 2, 3])
 
+  def testGraphStackSwitchDtype(self):
+    with context.graph_mode(), self.test_session():
+      list_ = list_ops.empty_tensor_list(
+          element_shape=constant_op.constant([], dtype=dtypes.int32),
+          element_dtype=dtypes.int32)
+      m = constant_op.constant([1, 2, 3], dtype=dtypes.float32)
+
+      def body(list_, m):
+        list_ = control_flow_ops.cond(
+            math_ops.equal(list_ops.tensor_list_length(list_), 0),
+            lambda: list_ops.empty_tensor_list(m.shape, m.dtype), lambda: list_)
+        list_ = list_ops.tensor_list_push_back(list_, m)
+        return list_, m
+
+      for _ in range(2):
+        list_, m = body(list_, m)
+
+      s1 = list_ops.tensor_list_stack(
+          list_, element_dtype=dtypes.float32).eval()
+      np_s1 = np.array([[1, 2, 3], [1, 2, 3]], dtype=np.float32)
+      self.assertAllEqual(s1, np_s1)
+
   def testGraphStackInLoopSwitchDtype(self):
     with context.graph_mode(), self.test_session():
       t1 = list_ops.empty_tensor_list(
diff --git a/tensorflow/python/kernel_tests/metrics_test.py b/tensorflow/python/kernel_tests/metrics_test.py
index ad802f7e1f72f6cbc3dda1ca98e46e6da4e5110a..55653489aff0a745c5731db4d31864aede97e954 100644
--- a/tensorflow/python/kernel_tests/metrics_test.py
+++ b/tensorflow/python/kernel_tests/metrics_test.py
@@ -1124,40 +1124,91 @@ class AUCTest(test.TestCase):
 
       self.assertAlmostEqual(0.7, auc.eval(), 5)
 
-  def testAUCPRSpecialCase(self):
+  # Regarding the AUC-PR tests: note that the preferred method when
+  # calculating AUC-PR is summation_method='careful_interpolation'.
+  def testCorrectAUCPRSpecialCase(self):
     with self.test_session() as sess:
       predictions = constant_op.constant(
           [0.1, 0.4, 0.35, 0.8], shape=(1, 4), dtype=dtypes_lib.float32)
       labels = constant_op.constant([0, 0, 1, 1], shape=(1, 4))
-      auc, update_op = metrics.auc(labels, predictions, curve='PR')
+      auc, update_op = metrics.auc(labels, predictions, curve='PR',
+                                   summation_method='careful_interpolation')
+
+      sess.run(variables.local_variables_initializer())
+      # expected ~= 0.79726744594
+      expected = 1 - math.log(1.5) / 2
+      self.assertAlmostEqual(expected, sess.run(update_op), delta=1e-3)
+      self.assertAlmostEqual(expected, auc.eval(), delta=1e-3)
+
+  def testCorrectAnotherAUCPRSpecialCase(self):
+    with self.test_session() as sess:
+      predictions = constant_op.constant(
+          [0.1, 0.4, 0.35, 0.8, 0.1, 0.135, 0.81],
+          shape=(1, 7),
+          dtype=dtypes_lib.float32)
+      labels = constant_op.constant([0, 0, 1, 0, 1, 0, 1], shape=(1, 7))
+      auc, update_op = metrics.auc(labels, predictions, curve='PR',
+                                   summation_method='careful_interpolation')
+
+      sess.run(variables.local_variables_initializer())
+      # expected ~= 0.61350593198
+      expected = (2.5 - 2 * math.log(4./3) - 0.25 * math.log(7./5)) / 3
+      self.assertAlmostEqual(expected, sess.run(update_op), delta=1e-3)
+      self.assertAlmostEqual(expected, auc.eval(), delta=1e-3)
+
+  def testThirdCorrectAUCPRSpecialCase(self):
+    with self.test_session() as sess:
+      predictions = constant_op.constant(
+          [0.0, 0.1, 0.2, 0.33, 0.3, 0.4, 0.5],
+          shape=(1, 7),
+          dtype=dtypes_lib.float32)
+      labels = constant_op.constant([0, 0, 0, 0, 1, 1, 1], shape=(1, 7))
+      auc, update_op = metrics.auc(labels, predictions, curve='PR',
+                                   summation_method='careful_interpolation')
+
+      sess.run(variables.local_variables_initializer())
+      # expected ~= 0.90410597584
+      expected = 1 - math.log(4./3) / 3
+      self.assertAlmostEqual(expected, sess.run(update_op), delta=1e-3)
+      self.assertAlmostEqual(expected, auc.eval(), delta=1e-3)
+
+  def testIncorrectAUCPRSpecialCase(self):
+    with self.test_session() as sess:
+      predictions = constant_op.constant(
+          [0.1, 0.4, 0.35, 0.8], shape=(1, 4), dtype=dtypes_lib.float32)
+      labels = constant_op.constant([0, 0, 1, 1], shape=(1, 4))
+      auc, update_op = metrics.auc(labels, predictions, curve='PR',
+                                   summation_method='trapezoidal')
 
       sess.run(variables.local_variables_initializer())
       self.assertAlmostEqual(0.79166, sess.run(update_op), delta=1e-3)
 
       self.assertAlmostEqual(0.79166, auc.eval(), delta=1e-3)
 
-  def testAnotherAUCPRSpecialCase(self):
+  def testAnotherIncorrectAUCPRSpecialCase(self):
     with self.test_session() as sess:
       predictions = constant_op.constant(
           [0.1, 0.4, 0.35, 0.8, 0.1, 0.135, 0.81],
           shape=(1, 7),
           dtype=dtypes_lib.float32)
       labels = constant_op.constant([0, 0, 1, 0, 1, 0, 1], shape=(1, 7))
-      auc, update_op = metrics.auc(labels, predictions, curve='PR')
+      auc, update_op = metrics.auc(labels, predictions, curve='PR',
+                                   summation_method='trapezoidal')
 
       sess.run(variables.local_variables_initializer())
       self.assertAlmostEqual(0.610317, sess.run(update_op), delta=1e-3)
 
       self.assertAlmostEqual(0.610317, auc.eval(), delta=1e-3)
 
-  def testThirdAUCPRSpecialCase(self):
+  def testThirdIncorrectAUCPRSpecialCase(self):
     with self.test_session() as sess:
       predictions = constant_op.constant(
           [0.0, 0.1, 0.2, 0.33, 0.3, 0.4, 0.5],
           shape=(1, 7),
           dtype=dtypes_lib.float32)
       labels = constant_op.constant([0, 0, 0, 0, 1, 1, 1], shape=(1, 7))
-      auc, update_op = metrics.auc(labels, predictions, curve='PR')
+      auc, update_op = metrics.auc(labels, predictions, curve='PR',
+                                   summation_method='trapezoidal')
 
       sess.run(variables.local_variables_initializer())
       self.assertAlmostEqual(0.90277, sess.run(update_op), delta=1e-3)
diff --git a/tensorflow/python/kernel_tests/pad_op_test.py b/tensorflow/python/kernel_tests/pad_op_test.py
index 14632ec29a5e8224de5bf85673904aac4eb482a7..361853448ce2c8477af6920257c58c1eba0fa952 100644
--- a/tensorflow/python/kernel_tests/pad_op_test.py
+++ b/tensorflow/python/kernel_tests/pad_op_test.py
@@ -338,27 +338,29 @@ class PadOpTest(test.TestCase):
 
   def testCollapseAdjacentNonPaddedDimensions(self):
     # pyformat: disable
-    for paddings_value in [[[0, 0], [0, 0], [0, 0], [0, 1]],
-                           [[0, 0], [2, 3], [0, 0], [0, 0]],
-                           [[0, 0], [0, 0], [0, 0], [0, 0]]]:
-      # pyformat: enable
-      inp = constant_op.constant(1.0, shape=[8, 28, 28, 3])
-      paddings = constant_op.constant(paddings_value, dtype=dtypes.int32)
-      padded = array_ops.pad(inp, paddings)
-      middle = array_ops.slice(padded, [row[0] for row in paddings_value],
-                               [dim.value for dim in inp.shape.dims])
-      left = array_ops.slice(padded, [0, 0, 0, 0],
-                             [row[0] for row in paddings_value])
-      right = array_ops.slice(
-          padded,
-          [paddings_value[i][0] + inp.shape.dims[i].value for i in range(4)],
-          [-1, -1, -1, -1])
-      with self.test_session(use_gpu=True):
-        self.assertAllEqual(inp.eval(), middle.eval())
-        self.assertAllEqual(
-            np.zeros([row[0] for row in paddings_value]), left.eval())
-        self.assertAllEqual(
-            np.zeros([row[1] for row in paddings_value]), right.eval())
+    paddings_values = [[[0, 0], [0, 0], [0, 0], [0, 1]],
+                       [[0, 0], [2, 3], [0, 0], [0, 0]],
+                       [[0, 0], [0, 0], [0, 0], [0, 0]]]
+    # pyformat: enable
+    for paddings_value in paddings_values:
+      for dtype in [dtypes.float32, dtypes.int32]:
+        inp = constant_op.constant(1, shape=[8, 28, 28, 3], dtype=dtype)
+        paddings = constant_op.constant(paddings_value, dtype=dtypes.int32)
+        padded = array_ops.pad(inp, paddings)
+        middle = array_ops.slice(padded, [row[0] for row in paddings_value],
+                                 [dim.value for dim in inp.shape.dims])
+        left = array_ops.slice(padded, [0, 0, 0, 0],
+                               [row[0] for row in paddings_value])
+        right = array_ops.slice(
+            padded,
+            [paddings_value[i][0] + inp.shape.dims[i].value for i in range(4)],
+            [-1, -1, -1, -1])
+        with self.test_session(use_gpu=True):
+          self.assertAllEqual(inp.eval(), middle.eval())
+          self.assertAllEqual(
+              np.zeros([row[0] for row in paddings_value]), left.eval())
+          self.assertAllEqual(
+              np.zeros([row[1] for row in paddings_value]), right.eval())
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py
index 2f3bea5825f8889c5880c819ebf6b17aaa613f08..ed44a1a4d16a94d3aa75a50bf059e33326757c4d 100644
--- a/tensorflow/python/kernel_tests/pooling_ops_test.py
+++ b/tensorflow/python/kernel_tests/pooling_ops_test.py
@@ -123,8 +123,9 @@ class PoolingTest(test.TestCase):
       if input_sizes[-1] % 4 != 0:
         tf_logging.info("Skipping test for depth %d", input_sizes[-1])
         return
-    tf_logging.info("Running %s test. %r %r %d %r %r %r", data_format, v2,
-                    input_sizes, total_size, pool_func, ksize, strides)
+    tf_logging.info("Running %s test. %r %r %d %r %r %r %s", data_format, v2,
+                    input_sizes, total_size, pool_func, ksize, strides,
+                    data_type)
     # Initializes the input tensor with array containing incrementing
     # numbers from 1, wrapping round to -127 after 127 to support int8.
     x = [((f + 128) % 255) - 127 for f in range(total_size)]
@@ -193,6 +194,8 @@ class PoolingTest(test.TestCase):
 
     self._VerifyOneType(pool_func, input_sizes, ksize, strides, padding,
                         data_format, dtypes.float32, expected, use_gpu, v2)
+    self._VerifyOneType(pool_func, input_sizes, ksize, strides, padding,
+                        data_format, dtypes.float64, expected, use_gpu, v2)
 
     if not use_gpu or test_util.CudaSupportsHalfMatMulAndConv():
       self._VerifyOneType(pool_func, input_sizes, ksize, strides, padding,
diff --git a/tensorflow/python/kernel_tests/py_func_test.py b/tensorflow/python/kernel_tests/py_func_test.py
index 36142801d682d4e99bc1beceee7aee76206db4cf..5b508b7c0e72180194fa1a4c95bc4282d4694605 100644
--- a/tensorflow/python/kernel_tests/py_func_test.py
+++ b/tensorflow/python/kernel_tests/py_func_test.py
@@ -35,6 +35,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import script_ops
 from tensorflow.python.platform import test
 
@@ -480,6 +481,18 @@ class PyFuncTest(test.TestCase):
 
       self._testExceptionHandling(WeirdError, errors.UnknownError, eager=True)
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testEagerReturningVariableRaisesError(self):
+    def return_variable():
+      variable = resource_variable_ops.ResourceVariable(0.0)
+      return variable
+
+    with self.assertRaisesRegexp(errors.UnknownError,
+                                 "Attempting to return a variable"):
+      output = script_ops.eager_py_func(
+          return_variable, inp=[], Tout=dtypes.float32)
+      self.evaluate(output)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/random/BUILD b/tensorflow/python/kernel_tests/random/BUILD
index 88a4ddf7f29ec772282e7a8e2b59f144f1a968c2..acd7566eec8e3fffd74db33234b03a0c87427a3e 100644
--- a/tensorflow/python/kernel_tests/random/BUILD
+++ b/tensorflow/python/kernel_tests/random/BUILD
@@ -121,15 +121,3 @@ cuda_py_test(
         "//tensorflow/python:random_ops",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/kernel_tests/resource_variable_ops_test.py b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
index 2dc993f8117a41de8f15663ce763ec1d5b7ecdb4..edc63264a3549e91f2d6278a935be29eda5c99be 100644
--- a/tensorflow/python/kernel_tests/resource_variable_ops_test.py
+++ b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
@@ -87,6 +87,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     with context.eager_mode():
       handle = resource_variable_ops.var_handle_op(
           dtype=dtypes.int32, shape=[1], name="foo")
+      resource_variable_ops.assign_variable_op(handle, 1)
       with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                    "Trying to read variable with wrong dtype. "
                                    "Expected float got int32."):
@@ -103,6 +104,12 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
       v = resource_variable_ops.ResourceVariable(False, name="bool_test")
       self.assertAllEqual(bool(v), False)
 
+  def testFetchHandle(self):
+    with self.test_session():
+      handle = resource_variable_ops.var_handle_op(
+          dtype=dtypes.int32, shape=[1], name="foo")
+      self.assertGreater(len(handle.eval()), 0)
+
   def testAssignVariableDtypeMismatchEager(self):
     with context.eager_mode():
       handle = resource_variable_ops.var_handle_op(
@@ -179,6 +186,204 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
       read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
       self.assertEqual(self.evaluate(read), [[3]])
 
+  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  def testScatterSub(self):
+    with ops.device("cpu:0"):
+      handle = resource_variable_ops.var_handle_op(
+          dtype=dtypes.int32, shape=[1, 1])
+      self.evaluate(
+          resource_variable_ops.assign_variable_op(handle,
+                                                   constant_op.constant(
+                                                       [[1]],
+                                                       dtype=dtypes.int32)))
+      self.evaluate(
+          resource_variable_ops.resource_scatter_sub(handle, [0],
+                                                     constant_op.constant(
+                                                         [[2]],
+                                                         dtype=dtypes.int32)))
+      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+      self.assertEqual(self.evaluate(read), [[-1]])
+
+  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  def testScatterMul(self):
+    with ops.device("cpu:0"):
+      handle = resource_variable_ops.var_handle_op(
+          dtype=dtypes.int32, shape=[1, 1])
+      self.evaluate(
+          resource_variable_ops.assign_variable_op(handle,
+                                                   constant_op.constant(
+                                                       [[1]],
+                                                       dtype=dtypes.int32)))
+      self.evaluate(
+          resource_variable_ops.resource_scatter_mul(handle, [0],
+                                                     constant_op.constant(
+                                                         [[5]],
+                                                         dtype=dtypes.int32)))
+      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+      self.assertEqual(self.evaluate(read), [[5]])
+
+  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  def testScatterDiv(self):
+    with ops.device("cpu:0"):
+      handle = resource_variable_ops.var_handle_op(
+          dtype=dtypes.int32, shape=[1, 1])
+      self.evaluate(
+          resource_variable_ops.assign_variable_op(handle,
+                                                   constant_op.constant(
+                                                       [[6]],
+                                                       dtype=dtypes.int32)))
+      self.evaluate(
+          resource_variable_ops.resource_scatter_div(handle, [0],
+                                                     constant_op.constant(
+                                                         [[3]],
+                                                         dtype=dtypes.int32)))
+      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+      self.assertEqual(self.evaluate(read), [[2]])
+
+  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  def testScatterMin(self):
+    with ops.device("cpu:0"):
+      handle = resource_variable_ops.var_handle_op(
+          dtype=dtypes.int32, shape=[1, 1])
+      self.evaluate(
+          resource_variable_ops.assign_variable_op(handle,
+                                                   constant_op.constant(
+                                                       [[6]],
+                                                       dtype=dtypes.int32)))
+      self.evaluate(
+          resource_variable_ops.resource_scatter_min(handle, [0],
+                                                     constant_op.constant(
+                                                         [[3]],
+                                                         dtype=dtypes.int32)))
+      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+      self.assertEqual(self.evaluate(read), [[3]])
+
+  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  def testScatterMax(self):
+    with ops.device("cpu:0"):
+      handle = resource_variable_ops.var_handle_op(
+          dtype=dtypes.int32, shape=[1, 1])
+      self.evaluate(
+          resource_variable_ops.assign_variable_op(handle,
+                                                   constant_op.constant(
+                                                       [[6]],
+                                                       dtype=dtypes.int32)))
+      self.evaluate(
+          resource_variable_ops.resource_scatter_max(handle, [0],
+                                                     constant_op.constant(
+                                                         [[3]],
+                                                         dtype=dtypes.int32)))
+      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+      self.assertEqual(self.evaluate(read), [[6]])
+
+  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  def testScatterAddScalar(self):
+    with ops.device("cpu:0"):
+      handle = resource_variable_ops.var_handle_op(
+          dtype=dtypes.int32, shape=[1, 1])
+      self.evaluate(
+          resource_variable_ops.assign_variable_op(handle,
+                                                   constant_op.constant(
+                                                       [[1]],
+                                                       dtype=dtypes.int32)))
+      self.evaluate(
+          resource_variable_ops.resource_scatter_add(handle, [0],
+                                                     constant_op.constant(
+                                                         2,
+                                                         dtype=dtypes.int32)))
+      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+      self.assertEqual(self.evaluate(read), [[3]])
+
+  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  def testScatterSubScalar(self):
+    with ops.device("cpu:0"):
+      handle = resource_variable_ops.var_handle_op(
+          dtype=dtypes.int32, shape=[1, 1])
+      self.evaluate(
+          resource_variable_ops.assign_variable_op(handle,
+                                                   constant_op.constant(
+                                                       [[1]],
+                                                       dtype=dtypes.int32)))
+      self.evaluate(
+          resource_variable_ops.resource_scatter_sub(handle, [0],
+                                                     constant_op.constant(
+                                                         2,
+                                                         dtype=dtypes.int32)))
+      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+      self.assertEqual(self.evaluate(read), [[-1]])
+
+  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  def testScatterMulScalar(self):
+    with ops.device("cpu:0"):
+      handle = resource_variable_ops.var_handle_op(
+          dtype=dtypes.int32, shape=[1, 1])
+      self.evaluate(
+          resource_variable_ops.assign_variable_op(handle,
+                                                   constant_op.constant(
+                                                       [[1]],
+                                                       dtype=dtypes.int32)))
+      self.evaluate(
+          resource_variable_ops.resource_scatter_mul(handle, [0],
+                                                     constant_op.constant(
+                                                         5,
+                                                         dtype=dtypes.int32)))
+      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+      self.assertEqual(self.evaluate(read), [[5]])
+
+  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  def testScatterDivScalar(self):
+    with ops.device("cpu:0"):
+      handle = resource_variable_ops.var_handle_op(
+          dtype=dtypes.int32, shape=[1, 1])
+      self.evaluate(
+          resource_variable_ops.assign_variable_op(handle,
+                                                   constant_op.constant(
+                                                       [[6]],
+                                                       dtype=dtypes.int32)))
+      self.evaluate(
+          resource_variable_ops.resource_scatter_div(handle, [0],
+                                                     constant_op.constant(
+                                                         3,
+                                                         dtype=dtypes.int32)))
+      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+      self.assertEqual(self.evaluate(read), [[2]])
+
+  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  def testScatterMinScalar(self):
+    with ops.device("cpu:0"):
+      handle = resource_variable_ops.var_handle_op(
+          dtype=dtypes.int32, shape=[1, 1])
+      self.evaluate(
+          resource_variable_ops.assign_variable_op(handle,
+                                                   constant_op.constant(
+                                                       [[6]],
+                                                       dtype=dtypes.int32)))
+      self.evaluate(
+          resource_variable_ops.resource_scatter_min(handle, [0],
+                                                     constant_op.constant(
+                                                         3,
+                                                         dtype=dtypes.int32)))
+      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+      self.assertEqual(self.evaluate(read), [[3]])
+
+  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  def testScatterMaxScalar(self):
+    with ops.device("cpu:0"):
+      handle = resource_variable_ops.var_handle_op(
+          dtype=dtypes.int32, shape=[1, 1])
+      self.evaluate(
+          resource_variable_ops.assign_variable_op(handle,
+                                                   constant_op.constant(
+                                                       [[6]],
+                                                       dtype=dtypes.int32)))
+      self.evaluate(
+          resource_variable_ops.resource_scatter_max(handle, [0],
+                                                     constant_op.constant(
+                                                         3,
+                                                         dtype=dtypes.int32)))
+      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+      self.assertEqual(self.evaluate(read), [[6]])
+
   def testScatterUpdateString(self):
     handle = resource_variable_ops.var_handle_op(
         dtype=dtypes.string, shape=[1, 1])
@@ -190,6 +395,23 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     self.assertEqual(compat.as_bytes(self.evaluate(read)[0][0]),
                      compat.as_bytes("b"))
 
+  def testScatterUpdateStringScalar(self):
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.string, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(handle,
+                                                 constant_op.constant(
+                                                     [["a"]],
+                                                     dtype=dtypes.string)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_update(handle, [0],
+                                                      constant_op.constant(
+                                                          "b",
+                                                          dtype=dtypes.string)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.string)
+    self.assertEqual(
+        compat.as_bytes(self.evaluate(read)[0][0]), compat.as_bytes("b"))
+
   # TODO(alive): get this to work in Eager mode.
   def testGPU(self):
     with self.test_session(use_gpu=True):
@@ -580,6 +802,12 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
       state_ops.scatter_update(v, [1], [3.0])
       self.assertAllEqual([1.0, 3.0], v.numpy())
 
+  def testScatterAddStateOps(self):
+    with context.eager_mode():
+      v = resource_variable_ops.ResourceVariable([1.0, 2.0], name="add")
+      state_ops.scatter_add(v, [1], [3])
+      self.assertAllEqual([1.0, 5.0], v.numpy())
+
   def testScatterUpdateCast(self):
     with context.eager_mode():
       v = resource_variable_ops.ResourceVariable([1.0, 2.0], name="update")
diff --git a/tensorflow/python/kernel_tests/scatter_ops_test.py b/tensorflow/python/kernel_tests/scatter_ops_test.py
index 7cdf11d88468cabaf32387b0a4bdda760b4af31e..c70a4ffce7be71effe3ea10faa9754ab2b3842ce 100644
--- a/tensorflow/python/kernel_tests/scatter_ops_test.py
+++ b/tensorflow/python/kernel_tests/scatter_ops_test.py
@@ -38,38 +38,100 @@ def _NumpyAdd(ref, indices, updates):
     ref[indx] += updates[i]
 
 
+def _NumpyAddScalar(ref, indices, update):
+  for _, indx in np.ndenumerate(indices):
+    ref[indx] += update
+
+
 def _NumpySub(ref, indices, updates):
   for i, indx in np.ndenumerate(indices):
     ref[indx] -= updates[i]
 
 
+def _NumpySubScalar(ref, indices, update):
+  for _, indx in np.ndenumerate(indices):
+    ref[indx] -= update
+
+
 def _NumpyMul(ref, indices, updates):
   for i, indx in np.ndenumerate(indices):
     ref[indx] *= updates[i]
 
 
+def _NumpyMulScalar(ref, indices, update):
+  for _, indx in np.ndenumerate(indices):
+    ref[indx] *= update
+
+
 def _NumpyDiv(ref, indices, updates):
   for i, indx in np.ndenumerate(indices):
     ref[indx] /= updates[i]
 
 
+def _NumpyDivScalar(ref, indices, update):
+  for _, indx in np.ndenumerate(indices):
+    ref[indx] /= update
+
+
+def _NumpyMin(ref, indices, updates):
+  for i, indx in np.ndenumerate(indices):
+    ref[indx] = np.minimum(ref[indx], updates[i])
+
+
+def _NumpyMinScalar(ref, indices, update):
+  for _, indx in np.ndenumerate(indices):
+    ref[indx] = np.minimum(ref[indx], update)
+
+
+def _NumpyMax(ref, indices, updates):
+  for i, indx in np.ndenumerate(indices):
+    ref[indx] = np.maximum(ref[indx], updates[i])
+
+
+def _NumpyMaxScalar(ref, indices, update):
+  for _, indx in np.ndenumerate(indices):
+    ref[indx] = np.maximum(ref[indx], update)
+
+
 def _NumpyUpdate(ref, indices, updates):
   for i, indx in np.ndenumerate(indices):
     ref[indx] = updates[i]
 
 
+def _NumpyUpdateScalar(ref, indices, update):
+  for _, indx in np.ndenumerate(indices):
+    ref[indx] = update
+
+
 _TF_OPS_TO_NUMPY = {
     state_ops.scatter_update: _NumpyUpdate,
     state_ops.scatter_add: _NumpyAdd,
     state_ops.scatter_sub: _NumpySub,
     state_ops.scatter_mul: _NumpyMul,
     state_ops.scatter_div: _NumpyDiv,
+    state_ops.scatter_min: _NumpyMin,
+    state_ops.scatter_max: _NumpyMax,
+}
+
+_TF_OPS_TO_NUMPY_SCALAR = {
+    state_ops.scatter_update: _NumpyUpdateScalar,
+    state_ops.scatter_add: _NumpyAddScalar,
+    state_ops.scatter_sub: _NumpySubScalar,
+    state_ops.scatter_mul: _NumpyMulScalar,
+    state_ops.scatter_div: _NumpyDivScalar,
+    state_ops.scatter_min: _NumpyMinScalar,
+    state_ops.scatter_max: _NumpyMaxScalar,
 }
 
 
 class ScatterTest(test.TestCase):
 
-  def _VariableRankTest(self, tf_scatter, vtype, itype, repeat_indices=False):
+  def _VariableRankTest(self,
+                        tf_scatter,
+                        vtype,
+                        itype,
+                        repeat_indices=False,
+                        updates_are_scalar=False):
     np.random.seed(8)
     with self.test_session(use_gpu=True):
       for indices_shape in (), (2,), (3, 7), (3, 4, 7):
@@ -89,8 +151,11 @@ class ScatterTest(test.TestCase):
                                   indices[np.random.randint(size // 2)])
             np.random.shuffle(indices)
           indices = indices.reshape(indices_shape)
-          updates = _AsType(
-              np.random.randn(*(indices_shape + extra_shape)), vtype)
+          if updates_are_scalar:
+            updates = _AsType(np.random.randn(), vtype)
+          else:
+            updates = _AsType(
+                np.random.randn(*(indices_shape + extra_shape)), vtype)
 
           # Clips small values to avoid division by zero.
           def clip_small_values(x):
@@ -101,7 +166,10 @@ class ScatterTest(test.TestCase):
 
           # Scatter via numpy
           new = old.copy()
-          np_scatter = _TF_OPS_TO_NUMPY[tf_scatter]
+          if updates_are_scalar:
+            np_scatter = _TF_OPS_TO_NUMPY_SCALAR[tf_scatter]
+          else:
+            np_scatter = _TF_OPS_TO_NUMPY[tf_scatter]
           np_scatter(new, indices, updates)
           # Scatter via tensorflow
           ref = variables.Variable(old)
@@ -109,25 +177,35 @@ class ScatterTest(test.TestCase):
           tf_scatter(ref, indices, updates).eval()
           self.assertAllClose(ref.eval(), new)
 
-  def _VariableRankTests(self, tf_scatter, repeat_indices=False):
+  def _VariableRankTests(self,
+                         tf_scatter,
+                         repeat_indices=False,
+                         updates_are_scalar=False):
     for vtype in (np.float32, np.float64):
       for itype in (np.int32, np.int64):
-        self._VariableRankTest(tf_scatter, vtype, itype, repeat_indices)
+        self._VariableRankTest(tf_scatter, vtype, itype, repeat_indices,
+                               updates_are_scalar)
 
   def testVariableRankUpdate(self):
-    self._VariableRankTests(state_ops.scatter_update)
+    self._VariableRankTests(state_ops.scatter_update, False)
 
   def testVariableRankAdd(self):
-    self._VariableRankTests(state_ops.scatter_add)
+    self._VariableRankTests(state_ops.scatter_add, False)
 
   def testVariableRankSub(self):
-    self._VariableRankTests(state_ops.scatter_sub)
+    self._VariableRankTests(state_ops.scatter_sub, False)
 
   def testVariableRankMul(self):
-    self._VariableRankTests(state_ops.scatter_mul)
+    self._VariableRankTests(state_ops.scatter_mul, False)
 
   def testVariableRankDiv(self):
-    self._VariableRankTests(state_ops.scatter_div)
+    self._VariableRankTests(state_ops.scatter_div, False)
+
+  def testVariableRankMin(self):
+    self._VariableRankTests(state_ops.scatter_min, False)
+
+  def testVariableRankMax(self):
+    self._VariableRankTests(state_ops.scatter_max, False)
 
   def testRepeatIndicesAdd(self):
     self._VariableRankTests(state_ops.scatter_add, True)
@@ -141,6 +219,51 @@ class ScatterTest(test.TestCase):
   def testRepeatIndicesDiv(self):
     self._VariableRankTests(state_ops.scatter_div, True)
 
+  def testRepeatIndicesMin(self):
+    self._VariableRankTests(state_ops.scatter_min, True)
+
+  def testRepeatIndicesMax(self):
+    self._VariableRankTests(state_ops.scatter_max, True)
+
+  def testVariableRankUpdateScalar(self):
+    self._VariableRankTests(state_ops.scatter_update, False, True)
+
+  def testVariableRankAddScalar(self):
+    self._VariableRankTests(state_ops.scatter_add, False, True)
+
+  def testVariableRankSubScalar(self):
+    self._VariableRankTests(state_ops.scatter_sub, False, True)
+
+  def testVariableRankMulScalar(self):
+    self._VariableRankTests(state_ops.scatter_mul, False, True)
+
+  def testVariableRankDivScalar(self):
+    self._VariableRankTests(state_ops.scatter_div, False, True)
+
+  def testVariableRankMinScalar(self):
+    self._VariableRankTests(state_ops.scatter_min, False, True)
+
+  def testVariableRankMaxScalar(self):
+    self._VariableRankTests(state_ops.scatter_max, False, True)
+
+  def testRepeatIndicesAddScalar(self):
+    self._VariableRankTests(state_ops.scatter_add, True, True)
+
+  def testRepeatIndicesSubScalar(self):
+    self._VariableRankTests(state_ops.scatter_sub, True, True)
+
+  def testRepeatIndicesMulScalar(self):
+    self._VariableRankTests(state_ops.scatter_mul, True, True)
+
+  def testRepeatIndicesDivScalar(self):
+    self._VariableRankTests(state_ops.scatter_div, True, True)
+
+  def testRepeatIndicesMinScalar(self):
+    self._VariableRankTests(state_ops.scatter_min, True, True)
+
+  def testRepeatIndicesMaxScalar(self):
+    self._VariableRankTests(state_ops.scatter_max, True, True)
+
   def testBooleanScatterUpdate(self):
     if not test.is_gpu_available():
       with self.test_session(use_gpu=False) as session:
diff --git a/tensorflow/python/kernel_tests/self_adjoint_eig_op_test.py b/tensorflow/python/kernel_tests/self_adjoint_eig_op_test.py
index 4de5f4e4dbd38043557c54ede90fa47e43a1e26d..d2647088c5c2afda032482fb5cfd983cedb49a8f 100644
--- a/tensorflow/python/kernel_tests/self_adjoint_eig_op_test.py
+++ b/tensorflow/python/kernel_tests/self_adjoint_eig_op_test.py
@@ -71,6 +71,23 @@ class SelfAdjointEigTest(test.TestCase):
       self.assertAllEqual(val[4], val[5])
       self.assertAllEqual(val[1], val[3])
 
+  def testMatrixThatFailsWhenFlushingDenormsToZero(self):
+    # Test a 32x32 matrix which is known to fail if denorm floats are flushed to
+    # zero.
+    matrix = np.genfromtxt(
+        test.test_src_dir_path(
+            "python/kernel_tests/testdata/"
+            "self_adjoint_eig_fail_if_denorms_flushed.txt")).astype(np.float32)
+    self.assertEqual(matrix.shape, (32, 32))
+    matrix_tensor = constant_op.constant(matrix)
+    with self.test_session(use_gpu=True) as sess:
+      (e, v) = sess.run(linalg_ops.self_adjoint_eig(matrix_tensor))
+      self.assertEqual(e.size, 32)
+      self.assertAllClose(
+          np.matmul(v, v.transpose()), np.eye(32, dtype=np.float32), atol=2e-3)
+      self.assertAllClose(matrix,
+                          np.matmul(np.matmul(v, np.diag(e)), v.transpose()))
+
 
 def SortEigenDecomposition(e, v):
   if v.ndim < 2:
diff --git a/tensorflow/python/kernel_tests/softmax_op_test.py b/tensorflow/python/kernel_tests/softmax_op_test.py
index 2b8e99e18e6881143ee77c4f1ec5096635e5c1b2..981f96b74d3058aa79a1ea10e1254e572d0e8b85 100644
--- a/tensorflow/python/kernel_tests/softmax_op_test.py
+++ b/tensorflow/python/kernel_tests/softmax_op_test.py
@@ -18,14 +18,17 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import unittest
 import numpy as np
 
+
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging as logging
 
 
 @test_util.with_c_api
@@ -41,9 +44,10 @@ class SoftmaxTest(test.TestCase):
             features, axis=dim), one_only_on_dim))
     softmax = e / np.reshape(np.sum(e, axis=dim), one_only_on_dim)
     if log:
-      return np.log(softmax)
+      res = np.log(softmax)
     else:
-      return softmax
+      res = softmax
+    return res
 
   def _testSoftmax(self, np_features, dim=-1, log=False, use_gpu=False):
     # A previous version of the code checked the op name rather than the op type
@@ -53,9 +57,9 @@ class SoftmaxTest(test.TestCase):
     np_softmax = self._npSoftmax(np_features, dim=dim, log=log)
     with self.test_session(use_gpu=use_gpu):
       if log:
-        tf_softmax = nn_ops.log_softmax(np_features, dim=dim, name=name)
+        tf_softmax = nn_ops.log_softmax(np_features, axis=dim, name=name)
       else:
-        tf_softmax = nn_ops.softmax(np_features, dim=dim, name=name)
+        tf_softmax = nn_ops.softmax(np_features, axis=dim, name=name)
       out = tf_softmax.eval()
     self.assertAllCloseAccordingToType(np_softmax, out)
     self.assertShapeEqual(np_softmax, tf_softmax)
@@ -117,10 +121,32 @@ class SoftmaxTest(test.TestCase):
     self._testAll(
         np.array([[1., 1., 1., 1.], [1., 2., 3., 4.]]).astype(np.float32))
 
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def testFloatGPU(self):
+    if test.is_gpu_available(cuda_only=True):
+      rows = [2**x + np.random.randint(0, 1024) for x in range(1, 10)]
+      cols = [2**x + np.random.randint(0, 1024) for x in range(1, 10)]
+      for row, col in zip(rows, cols):
+        logging.info("Testing softmax float dtype in shape [%d, %d]", row, col)
+        data = np.random.rand(row, col)
+        self._testAll(data.astype(np.float32))
+
   def testHalf(self):
     self._testAll(
         np.array([[1., 1., 1., 1.], [1., 2., 3., 4.]]).astype(np.float16))
 
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def testHalfGPU(self):
+    if test.is_gpu_available(cuda_only=True):
+      rows = [2**x + np.random.randint(0, 1024) for x in range(1, 8)]
+      cols = [2**x + np.random.randint(0, 1024) for x in range(1, 8)]
+      for row, col in zip(rows, cols):
+        logging.info("Testing softmax half dtype in shape [%d, %d]", row, col)
+        data = np.random.rand(row, col)
+        self._testAll(data.astype(np.float16))
+
   def testDouble(self):
     self._testSoftmax(
         np.array([[1., 1., 1., 1.], [1., 2., 3., 4.]]).astype(np.float64))
@@ -169,7 +195,7 @@ class SoftmaxTest(test.TestCase):
       self.assertEqual(0, array_ops.size(x).eval())
       # reshape would raise if logits is empty
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        nn_ops.softmax(x, dim=0).eval()
+        nn_ops.softmax(x, axis=0).eval()
 
   def testDimTooLarge(self):
     with self.test_session():
@@ -177,7 +203,7 @@ class SoftmaxTest(test.TestCase):
       # inference error.
       dim = array_ops.placeholder_with_default(100, shape=[])
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        nn_ops.softmax([1., 2., 3., 4.], dim=dim).eval()
+        nn_ops.softmax([1., 2., 3., 4.], axis=dim).eval()
 
   def testLargeDims(self):
     # Make sure that we properly handle large inputs. See
diff --git a/tensorflow/python/kernel_tests/testdata/BUILD b/tensorflow/python/kernel_tests/testdata/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..45264c773ac0089bbfed44bd115e73e848a8cc62
--- /dev/null
+++ b/tensorflow/python/kernel_tests/testdata/BUILD
@@ -0,0 +1,24 @@
+# Data files for kernel tests.
+
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+filegroup(
+    name = "self_adjoint_eig_op_test_files",
+    srcs = ["self_adjoint_eig_fail_if_denorms_flushed.txt"],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/python/kernel_tests/testdata/self_adjoint_eig_fail_if_denorms_flushed.txt b/tensorflow/python/kernel_tests/testdata/self_adjoint_eig_fail_if_denorms_flushed.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d56a690a7928fafe39debc478db3e90ab953430b
--- /dev/null
+++ b/tensorflow/python/kernel_tests/testdata/self_adjoint_eig_fail_if_denorms_flushed.txt
@@ -0,0 +1,32 @@
+2.60986303e-17 -9.66826148e-21 -1.68610775e-24 -9.16104778e-17 -1.1039539e-18 -1.66460338e-25 -2.12362492e-23 1.90946688e-21 -3.34190535e-22 1.2000634e-18 -7.31782583e-20 2.57851762e-20 -2.55509e-20 -9.54284927e-20 -1.04248315e-17 -5.32450516e-22 -1.81712853e-17 6.0044594e-18 3.96602716e-11 2.89077487e-25 -2.47461475e-25 1.77941757e-24 -7.30388687e-21 -3.84350041e-16 -3.88532388e-21 -4.29928618e-21 4.13551131e-16 -2.63408791e-25 -2.84830375e-21 -1.6450072e-16 -2.8585296e-21 -3.65413296e-21
+-9.66826148e-21 5.03939189e-22 9.17361108e-26 5.17304053e-20 1.99338895e-20 1.25259775e-28 -8.70441942e-26 9.91474109e-25 -5.80960164e-24 -1.19022314e-21 3.90467165e-22 -1.38179098e-22 1.79253406e-22 2.23977705e-22 1.1864143e-19 7.16291934e-24 4.10159639e-20 -2.16798529e-20 -4.95460504e-14 -2.6881406e-27 5.32861213e-27 -4.54567085e-28 1.99794328e-23 1.26854541e-17 -1.92916739e-23 8.60632417e-24 -1.04721097e-18 -7.00607669e-28 6.86771954e-23 8.65173173e-19 1.24469175e-22 6.03883081e-24
+-1.68610775e-24 9.17361108e-26 1.34889529e-26 2.65059e-22 2.39713735e-23 -2.00915344e-30 -1.135692e-27 -6.46049964e-26 -1.03607712e-26 -1.57623654e-23 -1.63805162e-24 -5.95741642e-25 3.24984759e-25 6.49561204e-24 2.28504969e-21 2.8319611e-25 3.96494845e-22 -2.1988623e-22 6.26027228e-16 1.2418479e-30 2.1016041e-30 6.22813846e-30 -1.0708067e-25 6.90778045e-21 1.86361622e-25 7.08789674e-26 -9.23628499e-21 1.65335067e-30 -1.12173032e-26 8.2257321e-22 -4.72686764e-27 -2.58501275e-26
+-9.16104778e-17 5.17304053e-20 2.65059e-22 2.69965968e-14 7.06005733e-17 1.69851446e-22 -2.75994304e-21 -6.61589523e-20 3.8682048e-20 -1.69253147e-17 -2.68580354e-18 -7.74994098e-19 -9.75466696e-19 2.13537585e-18 2.13185342e-16 6.89417478e-21 1.35805044e-16 -3.48309239e-16 1.0448622e-09 -2.17287918e-23 7.41749185e-24 -7.36683057e-23 -1.31083094e-20 1.574e-14 5.72646592e-19 -9.85673749e-21 -1.0654985e-14 2.70679318e-23 4.0943479e-20 -3.42938568e-15 8.57373804e-20 -2.18094505e-20
+-1.1039539e-18 1.99338895e-20 2.39713735e-23 7.06005733e-17 1.83801666e-17 1.09735975e-24 -5.73058223e-24 7.2227645e-22 -8.94843118e-22 -2.30558605e-19 -7.84892038e-20 -1.88692532e-20 -1.02217713e-20 2.95458834e-20 2.42873413e-17 8.89161401e-22 1.21669872e-17 -6.85317731e-18 -7.345906e-12 -3.1158751e-25 1.36359449e-24 -1.57981417e-24 3.89633371e-21 9.94580899e-16 1.45732115e-20 6.92065325e-22 -1.86114433e-16 6.00601346e-26 3.26844e-21 4.38573742e-17 1.06803444e-20 4.60203933e-22
+-1.66460338e-25 1.25259775e-28 -2.00915344e-30 1.69851446e-22 1.09735975e-24 5.75549306e-30 4.74050864e-29 -5.99239043e-28 -1.5784658e-27 -1.74631273e-25 -1.22702975e-25 -1.03371979e-26 -1.96967552e-26 -1.56446725e-26 -3.06462576e-25 -6.33857393e-28 -6.08829397e-24 -7.07478859e-24 -4.82614847e-18 -2.7324345e-31 1.23830207e-31 -7.96172e-31 -1.9034503e-27 -3.82709848e-22 -2.69257733e-26 -3.84934809e-27 -1.48572725e-22 4.14585761e-31 2.5611404e-28 -2.77402858e-24 3.10373361e-28 -5.09669241e-28
+-2.12362492e-23 -8.70441942e-26 -1.135692e-27 -2.75994304e-21 -5.73058223e-24 4.74050864e-29 6.28162e-26 -3.30076462e-25 -3.30065418e-25 -1.1370873e-23 -8.97722764e-24 -1.03190629e-24 -9.52908672e-25 -3.27285413e-24 1.36216664e-22 -8.0549564e-26 -1.94826821e-22 -3.64999226e-22 -2.92500975e-15 -3.00986528e-29 2.39712646e-29 -1.02470704e-28 -4.99034099e-25 -1.32277916e-19 -5.05595e-24 -3.04012473e-25 -1.44724215e-20 5.04614184e-30 -4.12370105e-26 4.20735765e-21 -1.02818953e-25 3.41267575e-26
+1.90946688e-21 9.91474109e-25 -6.46049964e-26 -6.61589523e-20 7.2227645e-22 -5.99239043e-28 -3.30076462e-25 1.8948059e-22 1.83367373e-23 1.06616038e-21 -2.81616502e-22 1.18347412e-22 8.3458038e-23 9.67703245e-24 -1.37445558e-20 2.11412652e-24 2.64820742e-21 8.02510339e-20 4.39926334e-13 9.58727772e-27 2.9838033e-28 1.29183353e-26 1.78626483e-22 3.03531056e-19 9.62612316e-23 1.33722715e-23 2.92905627e-18 -9.42286262e-28 3.23170971e-24 4.10885529e-19 -8.38673724e-25 -8.63732285e-25
+-3.34190535e-22 -5.80960164e-24 -1.03607712e-26 3.8682048e-20 -8.94843118e-22 -1.5784658e-27 -3.30065418e-25 1.83367373e-23 9.30693173e-23 1.48929558e-21 1.83278606e-21 1.08468362e-22 2.61703785e-22 4.42441537e-23 1.23906316e-20 2.55235433e-24 8.36323349e-20 1.2152038e-19 9.83332204e-14 5.14523933e-27 -3.28220159e-28 8.22099066e-27 3.34939233e-23 4.3309476e-19 5.82711129e-22 1.14299394e-22 3.25240717e-18 5.84184241e-28 -1.76991199e-24 5.5568966e-20 -2.80294941e-24 4.59071175e-24
+1.2000634e-18 -1.19022314e-21 -1.57623654e-23 -1.69253147e-17 -2.30558605e-19 -1.74631273e-25 -1.1370873e-23 1.06616038e-21 1.48929558e-21 2.05547703e-18 2.01471341e-20 2.65473229e-20 1.36331708e-20 -2.19777252e-20 -3.09825792e-18 -1.93365673e-22 -2.25608735e-18 7.98997246e-18 1.45582661e-11 6.29004356e-25 -1.14866332e-25 -5.51419319e-26 2.97082139e-21 -2.39052259e-16 1.48920411e-20 1.28589326e-21 4.27717466e-16 -4.44694851e-26 -1.80270052e-22 3.29932795e-18 -5.11645591e-22 5.53091711e-23
+-7.31782583e-20 3.90467165e-22 -1.63805162e-24 -2.68580354e-18 -7.84892038e-20 -1.22702975e-25 -8.97722764e-24 -2.81616502e-22 1.83278606e-21 2.01471341e-20 4.38037939e-19 -4.46678177e-21 3.48516266e-20 7.32592348e-21 1.11928135e-18 8.58541052e-23 8.80645183e-18 4.80109643e-21 -1.7163557e-11 1.92262335e-26 -2.78003951e-26 5.48322572e-25 8.95330117e-23 -1.11570766e-17 3.13666242e-20 4.47195205e-21 -1.09014604e-17 7.69340111e-26 1.64649306e-22 1.71054085e-17 1.33471053e-23 6.40747815e-22
+2.57851762e-20 -1.38179098e-22 -5.95741642e-25 -7.74994098e-19 -1.88692532e-20 -1.03371979e-26 -1.03190629e-24 1.18347412e-22 1.08468362e-22 2.65473229e-20 -4.46678177e-21 5.22731861e-21 1.06412616e-21 -8.0508039e-22 -1.68829721e-19 -2.7699538e-23 -2.15173717e-19 7.46895651e-19 1.71858101e-12 5.41956e-26 -6.15013064e-27 1.54884457e-26 2.54028029e-22 -1.50009535e-18 1.11920465e-21 1.05890428e-22 3.6487132e-17 -2.06798384e-27 -5.5143889e-23 -1.71529414e-18 -7.38099094e-23 -6.5250472e-24
+-2.55509e-20 1.79253406e-22 3.24984759e-25 -9.75466696e-19 -1.02217713e-20 -1.96967552e-26 -9.52908672e-25 8.3458038e-23 2.61703785e-22 1.36331708e-20 3.48516266e-20 1.06412616e-21 4.08927657e-20 -2.76503659e-21 -6.81059804e-20 5.13487959e-23 1.80612902e-18 5.32462054e-19 -3.89327199e-12 3.60012729e-26 -2.5575456e-26 3.14316426e-25 4.56614351e-22 -1.24545392e-17 9.14707146e-21 7.97421952e-22 2.84371096e-17 2.98359736e-26 1.33439467e-23 1.00242743e-17 -4.94476664e-23 3.28816461e-22
+-9.54284927e-20 2.23977705e-22 6.49561204e-24 2.13537585e-18 2.95458834e-20 -1.56446725e-26 -3.27285413e-24 9.67703245e-24 4.42441537e-23 -2.19777252e-20 7.32592348e-21 -8.0508039e-22 -2.76503659e-21 5.02409342e-20 1.57549297e-18 2.63027228e-22 6.11241908e-19 -2.71906856e-19 1.41003203e-12 2.66730019e-26 2.25679315e-26 1.00596535e-25 3.02875382e-22 3.85539387e-17 6.79708607e-22 1.60452617e-22 -2.08440846e-17 -5.40071056e-28 4.56236979e-23 -1.00868521e-17 1.22265047e-22 -1.81997389e-23
+-1.04248315e-17 1.1864143e-19 2.28504969e-21 2.13185342e-16 2.42873413e-17 -3.06462576e-25 1.36216664e-22 -1.37445558e-20 1.23906316e-20 -3.09825792e-18 1.11928135e-18 -1.68829721e-19 -6.81059804e-20 1.57549297e-18 2.5311263e-15 9.97996576e-20 2.26115975e-16 -3.86907114e-17 3.68487445e-12 8.23669787e-24 1.00324064e-23 3.38722042e-24 8.64234911e-21 2.46521189e-15 1.72823337e-19 9.24995431e-20 -3.16903295e-15 5.94130048e-25 1.73965082e-20 1.17371651e-15 2.26718703e-20 4.16709318e-21
+-5.32450516e-22 7.16291934e-24 2.8319611e-25 6.89417478e-21 8.89161401e-22 -6.33857393e-28 -8.0549564e-26 2.11412652e-24 2.55235433e-24 -1.93365673e-22 8.58541052e-23 -2.7699538e-23 5.13487959e-23 2.63027228e-22 9.97996576e-20 2.88326168e-23 1.35358898e-20 5.43364968e-21 4.24011412e-14 1.88486064e-27 8.93106076e-29 4.5748278e-27 2.48573168e-24 5.81165621e-19 1.96505062e-23 5.84813631e-24 -2.46866108e-20 1.912471e-29 2.0243857e-24 -2.88983463e-20 1.35761502e-24 1.40424791e-27
+-1.81712853e-17 4.10159639e-20 3.96494845e-22 1.35805044e-16 1.21669872e-17 -6.08829397e-24 -1.94826821e-22 2.64820742e-21 8.36323349e-20 -2.25608735e-18 8.80645183e-18 -2.15173717e-19 1.80612902e-18 6.11241908e-19 2.26115975e-16 1.35358898e-20 3.66013906e-15 1.35652384e-17 -1.97764849e-09 4.16586597e-24 1.28936031e-24 6.96597122e-23 2.43147439e-21 -1.25627342e-15 1.52711738e-18 2.61025243e-19 -2.00782109e-15 9.75835691e-24 4.0203e-21 1.40790259e-15 -7.8869e-21 8.51983e-20
+6.0044594e-18 -2.16798529e-20 -2.1988623e-22 -3.48309239e-16 -6.85317731e-18 -7.07478859e-24 -3.64999226e-22 8.02510339e-20 1.2152038e-19 7.98997246e-18 4.80109643e-21 7.46895651e-19 5.32462054e-19 -2.71906856e-19 -3.86907114e-17 5.43364968e-21 1.35652384e-17 1.19795414e-15 1.18472676e-09 2.74214961e-23 -7.6305178e-26 1.25969175e-23 1.68466447e-19 1.33873166e-15 1.0739288e-18 1.02533716e-19 2.73480291e-14 -1.87024011e-24 -9.73944425e-21 2.74769918e-16 -1.48632788e-20 1.69142815e-21
+3.96602716e-11 -4.95460504e-14 6.26027228e-16 1.0448622e-09 -7.345906e-12 -4.82614847e-18 -2.92500975e-15 4.39926334e-13 9.83332204e-14 1.45582661e-11 -1.7163557e-11 1.71858101e-12 -3.89327199e-12 1.41003203e-12 3.68487445e-12 4.24011412e-14 -1.97764849e-09 1.18472676e-09 0.0257282555 5.64106473e-17 5.83845666e-18 -1.72409096e-16 1.02886027e-12 1.42563525e-08 -1.57067415e-12 -4.61972799e-13 3.30651737e-08 -5.20615037e-17 -1.71347193e-14 2.87764201e-10 5.03749196e-14 -1.97989316e-13
+2.89077487e-25 -2.6881406e-27 1.2418479e-30 -2.17287918e-23 -3.1158751e-25 -2.7324345e-31 -3.00986528e-29 9.58727772e-27 5.14523933e-27 6.29004356e-25 1.92262335e-26 5.41956e-26 3.60012729e-26 2.66730019e-26 8.23669787e-24 1.88486064e-27 4.16586597e-24 2.74214961e-23 5.64106473e-17 1.2555855e-29 -1.30304595e-31 8.42884087e-31 1.75222077e-26 -2.89058862e-23 3.0225144e-26 6.67962117e-27 8.54181718e-22 -1.2385176e-32 -5.78078369e-28 3.34704626e-23 -2.00599605e-27 2.05674681e-28
+-2.47461475e-25 5.32861213e-27 2.1016041e-30 7.41749185e-24 1.36359449e-24 1.23830207e-31 2.39712646e-29 2.9838033e-28 -3.28220159e-28 -1.14866332e-25 -2.78003951e-26 -6.15013064e-27 -2.5575456e-26 2.25679315e-26 1.00324064e-23 8.93106076e-29 1.28936031e-24 -7.6305178e-26 5.83845666e-18 -1.30304595e-31 2.26490979e-30 -4.25637053e-31 1.40697e-27 5.91197152e-22 -2.08475892e-26 -5.64982671e-28 -3.97199197e-23 -5.06794406e-32 1.11993943e-27 -2.94280711e-23 2.65858181e-27 -2.23093754e-28
+1.77941757e-24 -4.54567085e-28 6.22813846e-30 -7.36683057e-23 -1.57981417e-24 -7.96172e-31 -1.02470704e-28 1.29183353e-26 8.22099066e-27 -5.51419319e-26 5.48322572e-25 1.54884457e-26 3.14316426e-25 1.00596535e-25 3.38722042e-24 4.5748278e-27 6.96597122e-23 1.25969175e-23 -1.72409096e-16 8.42884087e-31 -4.25637053e-31 1.40764294e-28 1.38735442e-26 -1.93810515e-22 1.93660175e-25 1.97417449e-26 1.62145272e-22 2.52533191e-31 -3.42833345e-28 6.34130774e-22 -2.01859e-27 6.1781768e-27
+-7.30388687e-21 1.99794328e-23 -1.0708067e-25 -1.31083094e-20 3.89633371e-21 -1.9034503e-27 -4.99034099e-25 1.78626483e-22 3.34939233e-23 2.97082139e-21 8.95330117e-23 2.54028029e-22 4.56614351e-22 3.02875382e-22 8.64234911e-21 2.48573168e-24 2.43147439e-21 1.68466447e-19 1.02886027e-12 1.75222077e-26 1.40697e-27 1.38735442e-26 1.18400807e-21 1.40670976e-18 2.40320429e-22 3.69528133e-23 4.81603371e-18 -1.49322683e-27 -2.70670724e-25 1.59463723e-19 6.40406749e-24 1.17170599e-23
+-3.84350041e-16 1.26854541e-17 6.90778045e-21 1.574e-14 9.94580899e-16 -3.82709848e-22 -1.32277916e-19 3.03531056e-19 4.3309476e-19 -2.39052259e-16 -1.11570766e-17 -1.50009535e-18 -1.24545392e-17 3.85539387e-17 2.46521189e-15 5.81165621e-19 -1.25627342e-15 1.33873166e-15 1.42563525e-08 -2.89058862e-23 5.91197152e-22 -1.93810515e-22 1.40670976e-18 4.40677789e-12 7.86017934e-19 7.73466606e-19 1.96690791e-15 -1.65941347e-22 2.63659933e-18 -3.0624544e-14 5.87194631e-18 -3.46291098e-19
+-3.88532388e-21 -1.92916739e-23 1.86361622e-25 5.72646592e-19 1.45732115e-20 -2.69257733e-26 -5.05595e-24 9.62612316e-23 5.82711129e-22 1.48920411e-20 3.13666242e-20 1.11920465e-21 9.14707146e-21 6.79708607e-22 1.72823337e-19 1.96505062e-23 1.52711738e-18 1.0739288e-18 -1.57067415e-12 3.0225144e-26 -2.08475892e-26 1.93660175e-25 2.40320429e-22 7.86017934e-19 1.80741048e-20 9.85491491e-22 5.08456938e-17 1.08072265e-26 -1.75036654e-23 4.36436952e-18 -1.77728563e-23 1.01268548e-22
+-4.29928618e-21 8.60632417e-24 7.08789674e-26 -9.85673749e-21 6.92065325e-22 -3.84934809e-27 -3.04012473e-25 1.33722715e-23 1.14299394e-22 1.28589326e-21 4.47195205e-21 1.05890428e-22 7.97421952e-22 1.60452617e-22 9.24995431e-20 5.84813631e-24 2.61025243e-19 1.02533716e-19 -4.61972799e-13 6.67962117e-27 -5.64982671e-28 1.97417449e-26 3.69528133e-23 7.73466606e-19 9.85491491e-22 3.68332283e-22 1.76753773e-18 2.6167718e-27 3.55918682e-25 1.95786374e-19 -2.60077304e-24 1.84790635e-23
+4.13551131e-16 -1.04721097e-18 -9.23628499e-21 -1.0654985e-14 -1.86114433e-16 -1.48572725e-22 -1.44724215e-20 2.92905627e-18 3.25240717e-18 4.27717466e-16 -1.09014604e-17 3.6487132e-17 2.84371096e-17 -2.08440846e-17 -3.16903295e-15 -2.46866108e-20 -2.00782109e-15 2.73480291e-14 3.30651737e-08 8.54181718e-22 -3.97199197e-23 1.62145272e-22 4.81603371e-18 1.96690791e-15 5.08456938e-17 1.76753773e-18 1.57092991e-12 -4.31425852e-23 -3.78241e-19 -1.15899865e-14 -7.61890782e-19 -1.15344546e-19
+-2.63408791e-25 -7.00607669e-28 1.65335067e-30 2.70679318e-23 6.00601346e-26 4.14585761e-31 5.04614184e-30 -9.42286262e-28 5.84184241e-28 -4.44694851e-26 7.69340111e-26 -2.06798384e-27 2.98359736e-26 -5.40071056e-28 5.94130048e-25 1.912471e-29 9.75835691e-24 -1.87024011e-24 -5.20615037e-17 -1.2385176e-32 -5.06794406e-32 2.52533191e-31 -1.49322683e-27 -1.65941347e-22 1.08072265e-26 2.6167718e-27 -4.31425852e-23 1.5576233e-30 -6.14697676e-29 -5.39097603e-24 -8.01112167e-29 1.81063126e-27
+-2.84830375e-21 6.86771954e-23 -1.12173032e-26 4.0943479e-20 3.26844e-21 2.5611404e-28 -4.12370105e-26 3.23170971e-24 -1.76991199e-24 -1.80270052e-22 1.64649306e-22 -5.5143889e-23 1.33439467e-23 4.56236979e-23 1.73965082e-20 2.0243857e-24 4.0203e-21 -9.73944425e-21 -1.71347193e-14 -5.78078369e-28 1.11993943e-27 -3.42833345e-28 -2.70670724e-25 2.63659933e-18 -1.75036654e-23 3.55918682e-25 -3.78241e-19 -6.14697676e-29 2.71732416e-23 2.4136621e-19 2.38938648e-23 1.21468477e-24
+-1.6450072e-16 8.65173173e-19 8.2257321e-22 -3.42938568e-15 4.38573742e-17 -2.77402858e-24 4.20735765e-21 4.10885529e-19 5.5568966e-20 3.29932795e-18 1.71054085e-17 -1.71529414e-18 1.00242743e-17 -1.00868521e-17 1.17371651e-15 -2.88983463e-20 1.40790259e-15 2.74769918e-16 2.87764201e-10 3.34704626e-23 -2.94280711e-23 6.34130774e-22 1.59463723e-19 -3.0624544e-14 4.36436952e-18 1.95786374e-19 -1.15899865e-14 -5.39097603e-24 2.4136621e-19 2.10373291e-13 4.84257897e-20 2.71571227e-19
+-2.8585296e-21 1.24469175e-22 -4.72686764e-27 8.57373804e-20 1.06803444e-20 3.10373361e-28 -1.02818953e-25 -8.38673724e-25 -2.80294941e-24 -5.11645591e-22 1.33471053e-23 -7.38099094e-23 -4.94476664e-23 1.22265047e-22 2.26718703e-20 1.35761502e-24 -7.8869e-21 -1.48632788e-20 5.03749196e-14 -2.00599605e-27 2.65858181e-27 -2.01859e-27 6.40406749e-24 5.87194631e-18 -1.77728563e-23 -2.60077304e-24 -7.61890782e-19 -8.01112167e-29 2.38938648e-23 4.84257897e-20 7.77486414e-23 -7.38542574e-25
+-3.65413296e-21 6.03883081e-24 -2.58501275e-26 -2.18094505e-20 4.60203933e-22 -5.09669241e-28 3.41267575e-26 -8.63732285e-25 4.59071175e-24 5.53091711e-23 6.40747815e-22 -6.5250472e-24 3.28816461e-22 -1.81997389e-23 4.16709318e-21 1.40424791e-27 8.51983e-20 1.69142815e-21 -1.97989316e-13 2.05674681e-28 -2.23093754e-28 6.1781768e-27 1.17170599e-23 -3.46291098e-19 1.01268548e-22 1.84790635e-23 -1.15344546e-19 1.81063126e-27 1.21468477e-24 2.71571227e-19 -7.38542574e-25 3.49516247e-23
diff --git a/tensorflow/python/kernel_tests/variable_scope_test.py b/tensorflow/python/kernel_tests/variable_scope_test.py
index 531d0cdf9002d966909fe29ffcd538c3510c60ab..86ab9fbb70b5efcf06cc064617df14deb18c1f98 100644
--- a/tensorflow/python/kernel_tests/variable_scope_test.py
+++ b/tensorflow/python/kernel_tests/variable_scope_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import gc
+import threading
 
 import numpy
 
@@ -1349,5 +1350,91 @@ class PartitionInfoTest(test.TestCase):
     self.assertEqual(0, partition_info.single_slice_dim([2, 3]))
 
 
+class VariableScopeMultithreadedTest(test.TestCase):
+
+  def testTwoThreadsDisjointScopeEntry(self):
+
+    def thread_fn(i, graph):
+      with graph.as_default():
+        with variable_scope.variable_scope("foo"):
+          if i == 0:
+            v = variable_scope.get_variable("v", [])
+            self.assertEquals("foo/v:0", v.name)
+          else:
+            # Any thread after the first one should fail to create variable
+            # with the same name.
+            with self.assertRaises(ValueError):
+              variable_scope.get_variable("v", [])
+
+    graph = ops.get_default_graph()
+    threads = [
+        threading.Thread(target=thread_fn, args=(i, graph,)) for i in range(2)]
+
+    threads[0].start()
+    # Allow thread 0 to finish before starting thread 1.
+    threads[0].join()
+    threads[1].start()
+    threads[1].join()
+
+  def testTwoThreadsNestedScopeEntry(self):
+
+    def thread_fn(i, graph, run_event, pause_event):
+      with graph.as_default():
+        with variable_scope.variable_scope("foo"):
+          if i == 0:
+            v = variable_scope.get_variable("v", [])
+            self.assertEquals("foo/v:0", v.name)
+          else:
+            # Any thread after the first one should fail to create variable
+            # with the same name.
+            with self.assertRaises(ValueError):
+              variable_scope.get_variable("v", [])
+          pause_event.set()
+          run_event.wait()
+
+    graph = ops.get_default_graph()
+    run_events = [threading.Event() for _ in range(2)]
+    pause_events = [threading.Event() for _ in range(2)]
+    threads = [
+        threading.Thread(
+            target=thread_fn, args=(i, graph, run_events[i], pause_events[i]))
+        for i in range(2)
+    ]
+
+    # Start first thread.
+    threads[0].start()
+    pause_events[0].wait()
+    # Start next thread once the first thread has paused.
+    threads[1].start()
+    pause_events[1].wait()
+    # Resume both threads.
+    run_events[0].set()
+    run_events[1].set()
+    threads[0].join()
+    threads[1].join()
+
+  def testReenterMainScope(self):
+
+    def thread_fn(graph, main_thread_scope):
+      with graph.as_default():
+        # Variable created with main scope will have prefix "main".
+        with variable_scope.variable_scope(main_thread_scope):
+          with variable_scope.variable_scope("foo"):
+            v = variable_scope.get_variable("v", [])
+            self.assertEquals("main/foo/v:0", v.name)
+
+        # Variable created outside main scope will not have prefix "main".
+        with variable_scope.variable_scope("bar"):
+          v = variable_scope.get_variable("v", [])
+          self.assertEquals("bar/v:0", v.name)
+
+    graph = ops.get_default_graph()
+    with variable_scope.variable_scope("main") as main_thread_scope:
+      thread = threading.Thread(
+          target=thread_fn, args=(graph, main_thread_scope))
+      thread.start()
+      thread.join()
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/xent_op_test.py b/tensorflow/python/kernel_tests/xent_op_test.py
index e3e120a4eb01885ac5ac5e41f82ad3e480a83a77..60c726d54ceeb65ddf52af9b6aad685501214c24 100644
--- a/tensorflow/python/kernel_tests/xent_op_test.py
+++ b/tensorflow/python/kernel_tests/xent_op_test.py
@@ -18,10 +18,16 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import itertools
+import sys
+
 import numpy as np
 
+from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
@@ -88,7 +94,7 @@ class XentTest(test.TestCase):
                                                     4.]]]).astype(dtype)
       np_labels = np.array([[[0., 0., 0., 1.]], [[0., .5, .5,
                                                   0.]]]).astype(dtype)
-      self.assertRaisesRegexp(ValueError, "must be rank 2",
+      self.assertRaisesRegexp(ValueError, "rank 2, but is rank 3",
                               gen_nn_ops.softmax_cross_entropy_with_logits,
                               np_features, np_labels)
 
@@ -128,6 +134,24 @@ class XentTest(test.TestCase):
     self.assertAllClose(
         np.array([1.3862, 1.9401]), np_loss, rtol=1.e-3, atol=1.e-3)
 
+  def testShapeBroadcast(self):
+    np_f = np.array([[1., 2., 3., 4.],
+                     [1., 2., 3., 4.]]).astype(np.float32)
+    np_l = np.array([[0., 0., 0., 1.],
+                     [0., .5, .5, 0.]]).astype(np.float32)
+    np_loss, np_backprop = self._npXent(np_f, np_l)
+    tf_f = constant_op.constant(
+        np.array([[1., 2., 3., 4.]]).astype(np.float32))
+    tf_l = constant_op.constant(
+        np.array([[0., 0., 0., 1.], [0., .5, .5, 0.]]).astype(np.float32))
+    for use_gpu in [False, True]:
+      with self.test_session(use_gpu=use_gpu) as sess:
+        loss, backprop = gen_nn_ops.softmax_cross_entropy_with_logits(
+            tf_f, tf_l)
+        tf_loss, tf_backprop = sess.run([loss, backprop])
+      self.assertAllCloseAccordingToType(np_loss, tf_loss)
+      self.assertAllCloseAccordingToType(np_backprop, tf_backprop)
+
   def testShapeMismatch(self):
     with self.test_session():
       with self.assertRaises(ValueError):
@@ -260,5 +284,60 @@ class XentTest(test.TestCase):
     self.assertAllEqual(np_loss, tf_loss)
 
 
+class XentBenchmark(test.Benchmark):
+
+  def benchmarkZeroDimension(self):
+    for (m, n, p, use_gpu) in itertools.product(
+        [128],
+        [10, 100, 1000, 10000, 100000],
+        [0.001, 0.01, 0.5, 0.99, 1.0],
+        [False]):
+      k = int(p * n)
+      if k == 0:
+        continue
+      name = "zero_dimension_m_%d_n_%d_k_%g_use_gpu_%s" % (m, n, k, use_gpu)
+      device = "/%s:0" % ("gpu" if use_gpu else "cpu")
+      with ops.Graph().as_default():
+        with ops.device(device):
+          labels = array_ops.zeros([0, 2, 4], dtype=dtypes.float32)
+          logits = array_ops.zeros([0, 2, 4], dtype=dtypes.float32)
+          op = nn_ops.softmax_cross_entropy_with_logits(
+              labels=labels, logits=logits)
+        with session.Session() as sess:
+          r = self.run_op_benchmark(sess, op, min_iters=100, name=name)
+          gb_processed_input = m * n / 1.0e9
+          throughput = gb_processed_input / r["wall_time"]
+          print("Benchmark: %s \t wall_time: %0.03g s \t "
+                "Throughput: %0.03g GB/s" % (name, r["wall_time"], throughput))
+          sys.stdout.flush()
+
+  def benchmarkSingleClass(self):
+    for (m, n, p, use_gpu) in itertools.product(
+        [128],
+        [10, 100, 1000, 10000, 100000],
+        [0.001, 0.01, 0.5, 0.99, 1.0],
+        [False]):
+      k = int(p * n)
+      if k == 0:
+        continue
+      name = "single_class_m_%d_n_%d_k_%g_use_gpu_%s" % (m, n, k, use_gpu)
+      device = "/%s:0" % ("gpu" if use_gpu else "cpu")
+      with ops.Graph().as_default():
+        with ops.device(device):
+          labels = constant_op.constant([[1.], [-1.], [0.]],
+                                        dtype=dtypes.float32)
+          logits = constant_op.constant([[-1.], [0.], [1.]],
+                                        dtype=dtypes.float32)
+          op = nn_ops.softmax_cross_entropy_with_logits(
+              labels=labels, logits=logits)
+        with session.Session() as sess:
+          r = self.run_op_benchmark(sess, op, min_iters=100, name=name)
+          gb_processed_input = m * n / 1.0e9
+          throughput = gb_processed_input / r["wall_time"]
+          print("Benchmark: %s \t wall_time: %0.03g s \t "
+                "Throughput: %0.03g GB/s" % (name, r["wall_time"], throughput))
+          sys.stdout.flush()
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py
index e4395bea92961d348ed3841a31cacb91aaa282ec..ec741d3265b4216bd962280b0b927d6ad8a51fe4 100644
--- a/tensorflow/python/layers/base.py
+++ b/tensorflow/python/layers/base.py
@@ -625,6 +625,8 @@ class Layer(checkpointable.CheckpointableBase):
     input_list = nest.flatten(inputs)
 
     build_graph = not context.executing_eagerly()
+    # TODO(fchollet, allenl): Make deferred mode work with subclassed Models
+    # which don't use an "inputs" argument.
     in_deferred_mode = isinstance(input_list[0], _DeferredTensor)
     # Ensure the Layer, if being reused, is working with inputs from
     # the same graph as where it was created.
@@ -692,7 +694,8 @@ class Layer(checkpointable.CheckpointableBase):
               self._dtype = input_list[0].dtype.base_dtype.name
             except AttributeError:
               pass
-          input_shapes = nest.map_structure(lambda x: x.get_shape(), inputs)
+          if all(hasattr(x, 'get_shape') for x in input_list):
+            input_shapes = nest.map_structure(lambda x: x.get_shape(), inputs)
           self.build(input_shapes)
         try:
           # Note: not all sub-classes of Layer call Layer.__init__ (especially
@@ -892,7 +895,6 @@ class Layer(checkpointable.CheckpointableBase):
         mode.
         ValueError: If the index provided does not match any node.
     """
-    assert not context.executing_eagerly()
     if not self._inbound_nodes:
       raise RuntimeError('The layer has never been called '
                          'and thus has no defined ' + attr_name + '.')
@@ -922,9 +924,6 @@ class Layer(checkpointable.CheckpointableBase):
     Raises:
       RuntimeError: If called in Eager mode.
     """
-    if context.executing_eagerly():
-      raise RuntimeError(
-          'Layer.get_input_shape_at not supported in Eager mode.')
     return self._get_node_attribute_at_index(node_index, 'input_shapes',
                                              'input shape')
 
@@ -985,8 +984,6 @@ class Layer(checkpointable.CheckpointableBase):
     Raises:
       RuntimeError: If called in Eager mode.
     """
-    if context.executing_eagerly():
-      raise RuntimeError('Layer.get_output_at not supported in Eager mode.')
     return self._get_node_attribute_at_index(node_index, 'output_tensors',
                                              'output')
 
@@ -1008,8 +1005,6 @@ class Layer(checkpointable.CheckpointableBase):
       RuntimeError: If called in Eager mode.
       AttributeError: If no inbound nodes are found.
     """
-    if context.executing_eagerly():
-      raise RuntimeError('Layer.input not supported in Eager mode.')
     if not self._inbound_nodes:
       raise AttributeError('Layer ' + self.name +
                            ' is not connected, no input to return.')
@@ -1030,8 +1025,6 @@ class Layer(checkpointable.CheckpointableBase):
         layers.
       RuntimeError: if called in Eager mode.
     """
-    if context.executing_eagerly():
-      raise RuntimeError('Layer.output not supported in Eager mode.')
     if not self._inbound_nodes:
       raise AttributeError('Layer ' + self.name + ' has no inbound nodes.')
     return self._get_node_attribute_at_index(0, 'output_tensors', 'output')
@@ -1052,8 +1045,6 @@ class Layer(checkpointable.CheckpointableBase):
         AttributeError: if the layer has no defined input_shape.
         RuntimeError: if called in Eager mode.
     """
-    if context.executing_eagerly():
-      raise RuntimeError('Layer.input_shape not supported in Eager mode.')
     if not self._inbound_nodes:
       raise AttributeError('The layer has never been called '
                            'and thus has no defined input shape.')
@@ -1113,8 +1104,6 @@ class Layer(checkpointable.CheckpointableBase):
         AttributeError: if the layer has no defined output shape.
         RuntimeError: if called in Eager mode.
     """
-    if context.executing_eagerly():
-      raise RuntimeError('Layer.output_shape not supported in Eager mode.')
     if not self._inbound_nodes:
       raise AttributeError('The layer has never been called '
                            'and thus has no defined output shape.')
diff --git a/tensorflow/python/layers/convolutional.py b/tensorflow/python/layers/convolutional.py
index 74e7c63fb364d9c4475af5efe7d5db95cccf8166..2d99b1688f1b2736c0660ba2ac914018b21bf9ed 100644
--- a/tensorflow/python/layers/convolutional.py
+++ b/tensorflow/python/layers/convolutional.py
@@ -180,6 +180,8 @@ class _Conv(base.Layer):
           # bias_add when computing gradients. To use bias_add, we collapse Z
           # and Y into a single dimension to obtain a 4D input tensor.
           outputs_shape = outputs.shape.as_list()
+          if outputs_shape[0] is None:
+            outputs_shape[0] = -1
           outputs_4d = array_ops.reshape(outputs,
                                          [outputs_shape[0], outputs_shape[1],
                                           outputs_shape[2] * outputs_shape[3],
diff --git a/tensorflow/python/layers/convolutional_test.py b/tensorflow/python/layers/convolutional_test.py
index 160e732b6798697d05815e13a7b1c399070f0783..cdb42f5bd18292cad9d8536e88ea1c58c1d7d777 100644
--- a/tensorflow/python/layers/convolutional_test.py
+++ b/tensorflow/python/layers/convolutional_test.py
@@ -325,6 +325,12 @@ class ConvTest(test.TestCase):
     self.assertEqual(conv3d.kernel_constraint, k_constraint)
     self.assertEqual(conv3d.bias_constraint, b_constraint)
 
+  def testConv3DChannelsFirst(self):
+    # Test case for GitHub issue 15655
+    images = array_ops.placeholder(
+        dtype=dtypes.float32, shape=[None, 1, 32, 32, 32])
+    conv_layers.conv3d(images, 32, 9, data_format='channels_first')
+
 
 @test_util.with_c_api
 class SeparableConv1DTest(test.TestCase):
diff --git a/tensorflow/python/layers/normalization.py b/tensorflow/python/layers/normalization.py
index 3aafdbb14b56bbba3c4c9928e09a91d51f18db73..83b201e6420d48cfab38048d6638a9f9185d7d6c 100644
--- a/tensorflow/python/layers/normalization.py
+++ b/tensorflow/python/layers/normalization.py
@@ -32,12 +32,12 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.layers import base
 from tensorflow.python.layers import utils
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
-from tensorflow.python.ops import gen_resource_variable_ops
 from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import state_ops
+from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.training import moving_averages
 from tensorflow.python.util.tf_export import tf_export
 
@@ -178,6 +178,11 @@ class BatchNormalization(base.Layer):
       self.renorm_clipping = renorm_clipping
       self.renorm_momentum = renorm_momentum
 
+  def _add_tower_local_variable(self, *args, **kwargs):
+    tower_context = distribute_lib.get_tower_context()
+    with tower_context.tower_local_var_scope('mean'):
+      return self.add_variable(*args, **kwargs)
+
   def build(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape)
     if not input_shape.ndims:
@@ -305,21 +310,20 @@ class BatchNormalization(base.Layer):
         self._scope.set_partitioner(None)
       else:
         partitioner = None
-      self.moving_mean = self.add_variable(
+      self.moving_mean = self._add_tower_local_variable(
           name='moving_mean',
           shape=param_shape,
           dtype=param_dtype,
           initializer=self.moving_mean_initializer,
           trainable=False)
 
-      self.moving_variance = self.add_variable(
+      self.moving_variance = self._add_tower_local_variable(
           name='moving_variance',
           shape=param_shape,
           dtype=param_dtype,
           initializer=self.moving_variance_initializer,
           trainable=False)
 
-      self._one_minus_decay = 1.0 - self.momentum
       if self.renorm:
         # Create variables to maintain the moving mean and standard deviation.
         # These are used in training and thus are different from the moving
@@ -329,7 +333,7 @@ class BatchNormalization(base.Layer):
         # stack to be cleared. The nested ones use a `lambda` to set the desired
         # device and ignore any devices that may be set by the custom getter.
         def _renorm_variable(name, shape):
-          var = self.add_variable(
+          var = self._add_tower_local_variable(
               name=name,
               shape=shape,
               dtype=param_dtype,
@@ -337,43 +341,32 @@ class BatchNormalization(base.Layer):
               trainable=False)
           return var
 
-        with ops.device(None):
-          device = (
-              self.moving_mean.device if context.executing_eagerly() else
-              (lambda _: self.moving_mean.device))
-          with ops.device(device):
-            self.renorm_mean = _renorm_variable('renorm_mean', param_shape)
-            self.renorm_mean_weight = _renorm_variable('renorm_mean_weight', ())
-          # We initialize renorm_stddev to 0, and maintain the (0-initialized)
-          # renorm_stddev_weight. This allows us to (1) mix the average
-          # stddev with the minibatch stddev early in training, and (2) compute
-          # the unbiased average stddev by dividing renorm_stddev by the weight.
-          device = (
-              self.moving_variance.device if context.executing_eagerly() else
-              (lambda _: self.moving_variance.device))
-          with ops.device(device):
-            self.renorm_stddev = _renorm_variable('renorm_stddev', param_shape)
-            self.renorm_stddev_weight = _renorm_variable(
-                'renorm_stddev_weight', ())
+        with distribute_lib.get_distribution_strategy().colocate_vars_with(
+            self.moving_mean):
+          self.renorm_mean = _renorm_variable('renorm_mean', param_shape)
+          self.renorm_mean_weight = _renorm_variable('renorm_mean_weight', ())
+        # We initialize renorm_stddev to 0, and maintain the (0-initialized)
+        # renorm_stddev_weight. This allows us to (1) mix the average
+        # stddev with the minibatch stddev early in training, and (2) compute
+        # the unbiased average stddev by dividing renorm_stddev by the weight.
+        with distribute_lib.get_distribution_strategy().colocate_vars_with(
+            self.moving_variance):
+          self.renorm_stddev = _renorm_variable('renorm_stddev', param_shape)
+          self.renorm_stddev_weight = _renorm_variable('renorm_stddev_weight',
+                                                       ())
     finally:
       if partitioner:
         self._scope.set_partitioner(partitioner)
     self.built = True
 
-  def _assign_moving_average(self, variable, value, one_minus_decay):
+  def _assign_moving_average(self, variable, value, momentum):
     with ops.name_scope(None, 'AssignMovingAvg',
-                        [variable, value, one_minus_decay]) as scope:
-      with ops.colocate_with(variable):
-        update_delta = math_ops.multiply(
-            math_ops.subtract(variable.read_value(), value),
-            one_minus_decay)
-        if isinstance(variable, resource_variable_ops.ResourceVariable):
-          # state_ops.assign_sub does an extra read_variable_op after the
-          # assign. We avoid that here.
-          return gen_resource_variable_ops.assign_sub_variable_op(
-              variable.handle, update_delta, name=scope)
-        else:
-          return state_ops.assign_sub(variable, update_delta, name=scope)
+                        [variable, value, momentum]) as scope:
+      decay = ops.convert_to_tensor(1.0 - momentum, name='decay')
+      if decay.dtype != variable.dtype.base_dtype:
+        decay = math_ops.cast(decay, variable.dtype.base_dtype)
+      update_delta = (variable - value) * decay
+      return state_ops.assign_sub(variable, update_delta, name=scope)
 
   def _fused_batch_norm(self, inputs, training):
     """Returns the output of fused batch norm."""
@@ -412,22 +405,16 @@ class BatchNormalization(base.Layer):
 
     training_value = utils.constant_value(training)
     if training_value is None:
-      one_minus_decay = utils.smart_cond(training,
-                                         lambda: self._one_minus_decay,
-                                         lambda: 0.)
+      momentum = utils.smart_cond(training, lambda: self.momentum, lambda: 1.0)
     else:
-      one_minus_decay = ops.convert_to_tensor(self._one_minus_decay)
+      momentum = ops.convert_to_tensor(self.momentum)
     if training_value or training_value is None:
       mean_update = self._assign_moving_average(self.moving_mean, mean,
-                                                one_minus_decay)
+                                                momentum)
       variance_update = self._assign_moving_average(self.moving_variance,
-                                                    variance, one_minus_decay)
-      if not context.executing_eagerly():
-        # Note that in Eager mode, the updates are already executed when running
-        # assign_moving_averages. So we do not need to put them into
-        # collections.
-        self.add_update(mean_update, inputs=inputs)
-        self.add_update(variance_update, inputs=inputs)
+                                                    variance, momentum)
+      self.add_update(mean_update, inputs=inputs)
+      self.add_update(variance_update, inputs=inputs)
 
     return output
 
@@ -464,6 +451,7 @@ class BatchNormalization(base.Layer):
       """Updates a moving average and weight, returns the unbiased value."""
       value = array_ops.identity(value)
       def _do_update():
+        """Updates the var and weight, returns their updated ratio."""
         # Update the variables without zero debiasing. The debiasing will be
         # accomplished by dividing the exponential moving average by the weight.
         # For example, after a single update, the moving average would be
@@ -472,25 +460,25 @@ class BatchNormalization(base.Layer):
         # Make sure the weight is not updated until before r and d computation.
         with ops.control_dependencies([value]):
           weight_value = array_ops.constant(1., dtype=weight.dtype)
-        new_var = moving_averages.assign_moving_average(
-            var, value, self.renorm_momentum, zero_debias=False)
-        new_weight = moving_averages.assign_moving_average(
-            weight, weight_value, self.renorm_momentum, zero_debias=False)
+        new_var = self._assign_moving_average(var, value, self.renorm_momentum)
+        new_weight = self._assign_moving_average(weight, weight_value,
+                                                 self.renorm_momentum)
+        # TODO(yuefengz): the updates to var and weighted can not be batched
+        # together if we fetch their updated values here. Consider calculating
+        # new values and delaying the updates.
         return new_var / new_weight
+
       def _fake_update():
         return array_ops.identity(var)
       return utils.smart_cond(training, _do_update, _fake_update)
 
-    with ops.colocate_with(self.moving_mean):
-      new_mean = _update_renorm_variable(self.renorm_mean,
-                                         self.renorm_mean_weight,
-                                         mean)
-    with ops.colocate_with(self.moving_variance):
-      new_stddev = _update_renorm_variable(self.renorm_stddev,
-                                           self.renorm_stddev_weight,
-                                           stddev)
-      # Make sqrt(moving_variance + epsilon) = new_stddev.
-      new_variance = math_ops.square(new_stddev) - self.epsilon
+    # TODO(yuefengz): colocate the operations
+    new_mean = _update_renorm_variable(self.renorm_mean,
+                                       self.renorm_mean_weight, mean)
+    new_stddev = _update_renorm_variable(self.renorm_stddev,
+                                         self.renorm_stddev_weight, stddev)
+    # Make sqrt(moving_variance + epsilon) = new_stddev.
+    new_variance = math_ops.square(new_stddev) - self.epsilon
 
     return (r, d, new_mean, new_variance)
 
@@ -601,8 +589,7 @@ class BatchNormalization(base.Layer):
         if in_eager_mode and not self.trainable:
           return
 
-        return moving_averages.assign_moving_average(
-            var, value, self.momentum, zero_debias=False)
+        return self._assign_moving_average(var, value, self.momentum)
 
       mean_update = utils.smart_cond(
           training,
diff --git a/tensorflow/python/lib/core/ndarray_tensor.cc b/tensorflow/python/lib/core/ndarray_tensor.cc
index 994af69386b278f6b88c051f898cd6a9dc607f3f..a07e305ffbe8b4c4736c3231f6d1d7872d91e04e 100644
--- a/tensorflow/python/lib/core/ndarray_tensor.cc
+++ b/tensorflow/python/lib/core/ndarray_tensor.cc
@@ -267,7 +267,9 @@ gtl::InlinedVector<npy_intp, 4> GetPyArrayDimensionsForTensor(
   const int ndims = TF_NumDims(tensor);
   gtl::InlinedVector<npy_intp, 4> dims(ndims);
   if (TF_TensorType(tensor) == TF_RESOURCE) {
-    dims[0] = TF_TensorByteSize(tensor);
+    CHECK_EQ(ndims, 0)
+        << "Fetching of non-scalar resource tensors is not supported.";
+    dims.push_back(TF_TensorByteSize(tensor));
     *nelems = dims[0];
   } else {
     *nelems = 1;
diff --git a/tensorflow/python/lib/core/py_exception_registry.cc b/tensorflow/python/lib/core/py_exception_registry.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6637de632b48e4dfc8219543161464b10dcdbe12
--- /dev/null
+++ b/tensorflow/python/lib/core/py_exception_registry.cc
@@ -0,0 +1,50 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/python/lib/core/py_exception_registry.h"
+
+#include <Python.h>
+
+namespace tensorflow {
+
+PyExceptionRegistry* PyExceptionRegistry::singleton_ = nullptr;
+
+void PyExceptionRegistry::Init(PyObject* code_to_exc_type_map) {
+  DCHECK(singleton_ == nullptr) << "PyExceptionRegistry::Init() already called";
+  singleton_ = new PyExceptionRegistry;
+
+  DCHECK(PyDict_Check(code_to_exc_type_map));
+  PyObject* key;
+  PyObject* value;
+  Py_ssize_t pos = 0;
+  while (PyDict_Next(code_to_exc_type_map, &pos, &key, &value)) {
+    TF_Code code = static_cast<TF_Code>(PyLong_AsLong(key));
+    singleton_->exc_types_[code] = value;
+    // The exception classes should also have the lifetime of the process, but
+    // incref just in case.
+    Py_INCREF(value);
+  }
+}
+
+PyObject* PyExceptionRegistry::Lookup(TF_Code code) {
+  DCHECK(singleton_ != nullptr) << "Must call PyExceptionRegistry::Init() "
+                                   "before PyExceptionRegistry::Lookup()";
+  DCHECK_NE(code, TF_OK);
+  DCHECK(singleton_->exc_types_.find(code) != singleton_->exc_types_.end())
+      << "Unknown error code passed to PyExceptionRegistry::Lookup: " << code;
+  return singleton_->exc_types_[code];
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/python/lib/core/py_exception_registry.h b/tensorflow/python/lib/core/py_exception_registry.h
new file mode 100644
index 0000000000000000000000000000000000000000..2b0f23b548c16130dee9a8ec086ae0283f1506e1
--- /dev/null
+++ b/tensorflow/python/lib/core/py_exception_registry.h
@@ -0,0 +1,73 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_PYTHON_LIB_CORE_PY_EXCEPTION_REGISTRY_H_
+#define TENSORFLOW_PYTHON_LIB_CORE_PY_EXCEPTION_REGISTRY_H_
+
+#include <map>
+
+#include "tensorflow/c/c_api.h"
+#include "tensorflow/core/platform/logging.h"
+
+#ifndef PyObject_HEAD
+struct _object;
+typedef _object PyObject;
+#endif
+
+namespace tensorflow {
+
+// Global registry mapping C API error codes to the corresponding custom Python
+// exception type. This is used to expose the exception types to C extension
+// code (i.e. so we can raise custom exceptions via SWIG).
+//
+// Init() must be called exactly once at the beginning of the process before
+// Lookup() can be used.
+//
+// Example usage:
+//   TF_Status* status = TF_NewStatus();
+//   TF_Foo(..., status);
+//
+//   if (TF_GetCode(status) != TF_OK) {
+//     PyObject* exc_type = PyExceptionRegistry::Lookup(TF_GetCode(status));
+//     // Arguments to OpError base class. Set `node_def` and `op` to None.
+//     PyObject* args =
+//       Py_BuildValue("sss", nullptr, nullptr, TF_Message(status));
+//     PyErr_SetObject(exc_type, args);
+//     Py_DECREF(args);
+//     TF_DeleteStatus(status);
+//     return NULL;
+//   }
+class PyExceptionRegistry {
+ public:
+  // Initializes the process-wide registry. Should be called exactly once near
+  // the beginning of the process. The arguments are the various Python
+  // exception types (e.g. `cancelled_exc` corresponds to
+  // errors.CancelledError).
+  static void Init(PyObject* code_to_exc_type_map);
+
+  // Returns the Python exception type corresponding to `code`. Init() must be
+  // called before using this function. `code` should not be TF_OK.
+  static PyObject* Lookup(TF_Code code);
+
+ private:
+  static PyExceptionRegistry* singleton_;
+  PyExceptionRegistry() = default;
+
+  // Maps error codes to the corresponding Python exception type.
+  std::map<TF_Code, PyObject*> exc_types_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_PYTHON_LIB_CORE_PY_EXCEPTION_REGISTRY_H_
diff --git a/tensorflow/python/lib/core/py_exception_registry.i b/tensorflow/python/lib/core/py_exception_registry.i
new file mode 100644
index 0000000000000000000000000000000000000000..e872b74985e03e203c8aeb8fdec8a3e67f03e1f9
--- /dev/null
+++ b/tensorflow/python/lib/core/py_exception_registry.i
@@ -0,0 +1,28 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+%include "tensorflow/python/platform/base.i"
+
+%{
+#include "tensorflow/python/lib/core/py_exception_registry.h"
+%}
+
+%ignoreall
+
+%unignore tensorflow::PyExceptionRegistry;
+%unignore tensorflow::PyExceptionRegistry::Init;
+
+%include "tensorflow/python/lib/core/py_exception_registry.h"
+%unignoreall
diff --git a/tensorflow/python/lib/core/py_func.cc b/tensorflow/python/lib/core/py_func.cc
index 02eafd42b35231195a6405c8c3cc11871ed55772..22317a348c9d5472486ad118d865341ffb6ad829 100644
--- a/tensorflow/python/lib/core/py_func.cc
+++ b/tensorflow/python/lib/core/py_func.cc
@@ -166,7 +166,7 @@ bool IsSingleNone(PyObject* obj) {
 // Retrieves a Tensor from `eager_tensor` and stores it in `output_tensor`.
 tensorflow::Status ExtractTensorFromEagerTensor(const PyObject* eager_tensor,
                                                 const Tensor** output_tensor) {
-  return EagerTensor_Handle(eager_tensor)->Tensor(output_tensor);
+  return EagerTensor_Handle(eager_tensor)->handle->Tensor(output_tensor);
 }
 
 // Calls the registered py function through the trampoline.
diff --git a/tensorflow/python/lib/core/py_seq_tensor.cc b/tensorflow/python/lib/core/py_seq_tensor.cc
index 8247d354db62532c10c5acc9875cc08289cd31bf..32ea737a99067877e7f527e44d261a0b7c2eb07e 100644
--- a/tensorflow/python/lib/core/py_seq_tensor.cc
+++ b/tensorflow/python/lib/core/py_seq_tensor.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/python/lib/core/numpy.h"
 #include "tensorflow/python/lib/core/py_util.h"
@@ -77,9 +78,9 @@ string PyRepr(PyObject* obj) {
 bool IsPyDimension(PyObject* obj) {
   const char* tp_name = obj->ob_type->tp_name;
   if (strcmp(tp_name, "Dimension") != 0) return false;
-  bool ret =
-      StringPiece(PyRepr(PyType(obj)))
-          .ends_with("tensorflow.python.framework.tensor_shape.Dimension'>");
+  bool ret = str_util::EndsWith(
+      PyRepr(PyType(obj)),
+      "tensorflow.python.framework.tensor_shape.Dimension'>");
   return ret;
 }
 
diff --git a/tensorflow/python/lib/io/tf_record.py b/tensorflow/python/lib/io/tf_record.py
index 6fcf9c91d831e3a89552b522040e8e8647114a2f..bf2d6f68b55d78f9570d3854804e3d1316176c99 100644
--- a/tensorflow/python/lib/io/tf_record.py
+++ b/tensorflow/python/lib/io/tf_record.py
@@ -78,8 +78,7 @@ def tf_record_iterator(path, options=None):
   try:
     while True:
       try:
-        with errors.raise_exception_on_not_ok_status() as status:
-          reader.GetNext(status)
+        reader.GetNext()
       except errors.OutOfRangeError:
         break
       yield reader.record()
diff --git a/tensorflow/python/ops/array_grad.py b/tensorflow/python/ops/array_grad.py
index 3c6a5c9e562ff9765c2ef47555871c94cd6feb1e..57d2657838760a6f0041dac850913035540dc939 100644
--- a/tensorflow/python/ops/array_grad.py
+++ b/tensorflow/python/ops/array_grad.py
@@ -255,10 +255,15 @@ def _SliceGrad(op, grad):
 @ops.RegisterGradient("StridedSlice")
 def _StridedSliceGrad(op, grad):
   """Gradient for StridedSlice op."""
-  x = array_ops.shape(op.inputs[0])
   begin = op.inputs[1]
   end = op.inputs[2]
   strides = op.inputs[3]
+  # StridedSliceGrad requires `x`, `begin`, `end` and `strides` to be of the
+  # same dtype so we build a shape of the same type as other args.
+  # Note that the choice of `begin` for specifying `out_type` is arbitrary.
+  # We could choose any of {begin|end|strides}.dtype since they are required to
+  # be the same.
+  x = array_ops.shape(op.inputs[0], out_type=begin.dtype)
 
   return array_ops.strided_slice_grad(
       x,
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index ec7c14f7d8697e61d2acb25a82c0ac9b2fcf28f4..68d446602efa0673c2b1d840a87af2807978e24a 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -387,7 +387,10 @@ def size_internal(input, name=None, optimize=True, out_type=dtypes.int32):
   """
   if context.executing_eagerly() and not isinstance(
       input, (sparse_tensor.SparseTensor, sparse_tensor.SparseTensorValue)):
-    return np.prod(ops.convert_to_tensor(input)._shape_tuple())  # pylint: disable=protected-access
+    input = ops.convert_to_tensor(input)
+    np_out_type = out_type.as_numpy_dtype
+    num_elements = np.prod(input._shape_tuple(), dtype=np_out_type)  # pylint: disable=protected-acces:
+    return ops.convert_to_tensor(num_elements, dtype=out_type)
   with ops.name_scope(name, "Size", [input]) as name:
     if isinstance(input, (sparse_tensor.SparseTensor,
                           sparse_tensor.SparseTensorValue)):
@@ -957,6 +960,11 @@ def _autopacking_helper(list_or_tuple, dtype, name):
   Returns:
     A `tf.Tensor` with value equivalent to `list_or_tuple`.
   """
+  if context.executing_eagerly():
+    # NOTE: Fast path when all the items are tensors, this doesn't do any type
+    # checking.
+    if all(ops.is_dense_tensor_like(elem) for elem in list_or_tuple):
+      return gen_array_ops.pack(list_or_tuple, name=name)
   must_pack = False
   converted_elems = []
   with ops.name_scope(name) as scope:
@@ -2691,12 +2699,17 @@ reverse_sequence.__doc__ = deprecation.rewrite_argument_docstring(
 
 @tf_export("gather")
 def gather(params, indices, validate_indices=None, name=None, axis=0):
-  # TODO(rjryan): Remove "Gather" creation in favor of GatherV2 once the forward
-  # compatibility 3 week period has passed.
-  if axis == 0:
-    return gen_array_ops.gather(
-        params, indices, validate_indices=validate_indices, name=name)
-  else:
+  del validate_indices
+  if axis != 0:
+    # Note that we do a sparse_read here to avoid snapshotting the entire
+    # resource variable and doing a gather, which can be inefficient and lead to
+    # subtle race conditions. TODO(apassos) implement axis != 0 on sparse_read
+    return gen_array_ops.gather_v2(params, indices, axis, name=name)
+  try:
+    # TODO(apassos) find a less bad way of detecting resource variables without
+    # introducing a circular dependency.
+    return params.sparse_read(indices, name=name)
+  except AttributeError:
     return gen_array_ops.gather_v2(params, indices, axis, name=name)
 
 
diff --git a/tensorflow/python/ops/boosted_trees_ops.py b/tensorflow/python/ops/boosted_trees_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..174d00987f9f76b4b07be73e5c29435bed7dfa06
--- /dev/null
+++ b/tensorflow/python/ops/boosted_trees_ops.py
@@ -0,0 +1,160 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ops for boosted_trees."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_boosted_trees_ops
+from tensorflow.python.ops import resources
+
+# Re-exporting ops used by other modules.
+# pylint: disable=unused-import
+from tensorflow.python.ops.gen_boosted_trees_ops import boosted_trees_calculate_best_gains_per_feature as calculate_best_gains_per_feature
+from tensorflow.python.ops.gen_boosted_trees_ops import boosted_trees_make_stats_summary as make_stats_summary
+from tensorflow.python.ops.gen_boosted_trees_ops import boosted_trees_predict as predict
+from tensorflow.python.ops.gen_boosted_trees_ops import boosted_trees_training_predict as training_predict
+from tensorflow.python.ops.gen_boosted_trees_ops import boosted_trees_update_ensemble as update_ensemble
+# pylint: enable=unused-import
+
+from tensorflow.python.training import saver
+
+
+class PruningMode(object):
+  NO_PRUNING, PRE_PRUNING, POST_PRUNING = range(0, 3)
+
+
+class _TreeEnsembleSavable(saver.BaseSaverBuilder.SaveableObject):
+  """SaveableObject implementation for TreeEnsemble."""
+
+  def __init__(self, resource_handle, create_op, name):
+    """Creates a _TreeEnsembleSavable object.
+
+    Args:
+      resource_handle: handle to the decision tree ensemble variable.
+      create_op: the op to initialize the variable.
+      name: the name to save the tree ensemble variable under.
+    """
+    stamp_token, serialized = (
+        gen_boosted_trees_ops.boosted_trees_serialize_ensemble(resource_handle))
+    # slice_spec is useful for saving a slice from a variable.
+    # It's not meaningful the tree ensemble variable. So we just pass an empty
+    # value.
+    slice_spec = ''
+    specs = [
+        saver.BaseSaverBuilder.SaveSpec(stamp_token, slice_spec,
+                                        name + '_stamp'),
+        saver.BaseSaverBuilder.SaveSpec(serialized, slice_spec,
+                                        name + '_serialized'),
+    ]
+    super(_TreeEnsembleSavable, self).__init__(resource_handle, specs, name)
+    self._resource_handle = resource_handle
+    self._create_op = create_op
+
+  def restore(self, restored_tensors, unused_restored_shapes):
+    """Restores the associated tree ensemble from 'restored_tensors'.
+
+    Args:
+      restored_tensors: the tensors that were loaded from a checkpoint.
+      unused_restored_shapes: the shapes this object should conform to after
+        restore. Not meaningful for trees.
+
+    Returns:
+      The operation that restores the state of the tree ensemble variable.
+    """
+    with ops.control_dependencies([self._create_op]):
+      return gen_boosted_trees_ops.boosted_trees_deserialize_ensemble(
+          self._resource_handle,
+          stamp_token=restored_tensors[0],
+          tree_ensemble_serialized=restored_tensors[1])
+
+
+class TreeEnsemble(object):
+  """Creates TreeEnsemble resource."""
+
+  def __init__(self, name, stamp_token=0, is_local=False, serialized_proto=''):
+    with ops.name_scope(name, 'TreeEnsemble') as name:
+      self._resource_handle = (
+          gen_boosted_trees_ops.boosted_trees_ensemble_resource_handle_op(
+              container='', shared_name=name, name=name))
+      create_op = gen_boosted_trees_ops.boosted_trees_create_ensemble(
+          self.resource_handle,
+          stamp_token,
+          tree_ensemble_serialized=serialized_proto)
+      is_initialized_op = (
+          gen_boosted_trees_ops.is_boosted_trees_ensemble_initialized(
+              self._resource_handle))
+      # Adds the variable to the savable list.
+      if not is_local:
+        saveable = _TreeEnsembleSavable(self.resource_handle, create_op,
+                                        self.resource_handle.name)
+        ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
+      resources.register_resource(
+          self.resource_handle,
+          create_op,
+          is_initialized_op,
+          is_shared=not is_local)
+
+  @property
+  def resource_handle(self):
+    return self._resource_handle
+
+  def get_stamp_token(self):
+    """Returns the current stamp token of the resource."""
+    stamp_token, _, _, _ = (
+        gen_boosted_trees_ops.boosted_trees_get_ensemble_states(
+            self.resource_handle))
+    return stamp_token
+
+  def get_states(self):
+    """Returns states of the tree ensemble.
+
+    Returns:
+      stamp_token, num_trees, num_finalized_trees, num_attempted_layers.
+    """
+    stamp_token, num_trees, num_finalized_trees, num_attempted_layers = (
+        gen_boosted_trees_ops.boosted_trees_get_ensemble_states(
+            self.resource_handle))
+    # Use identity to give names.
+    return (array_ops.identity(stamp_token, name='stamp_token'),
+            array_ops.identity(num_trees, name='num_trees'),
+            array_ops.identity(num_finalized_trees, name='num_finalized_trees'),
+            array_ops.identity(
+                num_attempted_layers, name='num_attempted_layers'))
+
+  def serialize(self):
+    """Serializes the ensemble into proto and returns the serialized proto.
+
+    Returns:
+      stamp_token: int64 scalar Tensor to denote the stamp of the resource.
+      serialized_proto: string scalar Tensor of the serialized proto.
+    """
+    return gen_boosted_trees_ops.boosted_trees_serialize_ensemble(
+        self.resource_handle)
+
+  def deserialize(self, stamp_token, serialized_proto):
+    """Deserialize the input proto and resets the ensemble from it.
+
+    Args:
+      stamp_token: int64 scalar Tensor to denote the stamp of the resource.
+      serialized_proto: string scalar Tensor of the serialized proto.
+
+    Returns:
+      Operation (for dependencies).
+    """
+    return gen_boosted_trees_ops.boosted_trees_deserialize_ensemble(
+        self.resource_handle, stamp_token, serialized_proto)
diff --git a/tensorflow/python/ops/check_ops.py b/tensorflow/python/ops/check_ops.py
index d6d75e4ef9874e2ed4706d48b7bdeed62baef414..9cea3e91f7760034d2ab7649709e62dbf1987701 100644
--- a/tensorflow/python/ops/check_ops.py
+++ b/tensorflow/python/ops/check_ops.py
@@ -363,27 +363,30 @@ def assert_equal(x, y, data=None, summarize=None, message=None, name=None):
                          (x_sum, x_np[:x_sum],
                           y_sum, y_np[:y_sum]))
 
-        # Get the values that actually differed and their indices.
-        mask = math_ops.logical_not(eq)
-        indices = array_ops.where(mask)
-        indices_np = indices.numpy()
-        x_vals = array_ops.boolean_mask(x, mask)
-        y_vals = array_ops.boolean_mask(y, mask)
-        summarize = min(summarize, indices_np.shape[0])
+        index_and_values_str = ''
+        if x.shape == y.shape:
+          # If the shapes of x and y are the same,
+          # Get the values that actually differed and their indices.
+          # If shapes are different this information is more confusing
+          # than useful.
+          mask = math_ops.logical_not(eq)
+          indices = array_ops.where(mask)
+          indices_np = indices.numpy()
+          x_vals = array_ops.boolean_mask(x, mask)
+          y_vals = array_ops.boolean_mask(y, mask)
+          summarize = min(summarize, indices_np.shape[0])
+          index_and_values_str = (
+              'Indices of first %s different values:\n%s\n'
+              'Corresponding x values:\n%s\n'
+              'Corresponding y values:\n%s\n' %
+              (summarize, indices_np[:summarize],
+               x_vals.numpy().reshape((-1,))[:summarize],
+               y_vals.numpy().reshape((-1,))[:summarize]))
 
         raise errors.InvalidArgumentError(
             node_def=None, op=None,
-            message=('%s\nCondition x == y did not hold.\n'
-                     'Indices of first %s different values:\n%s\n'
-                     'Corresponding x values:\n%s\n'
-                     'Corresponding y values:\n%s\n'
-                     '%s'
-                     %
-                     (message or '',
-                      summarize, indices_np[:summarize],
-                      x_vals.numpy().reshape((-1,))[:summarize],
-                      y_vals.numpy().reshape((-1,))[:summarize],
-                      summary_msg)))
+            message=('%s\nCondition x == y did not hold.\n%s%s' %
+                     (message or '', index_and_values_str, summary_msg)))
       return
 
     if data is None:
diff --git a/tensorflow/python/ops/control_flow_grad.py b/tensorflow/python/ops/control_flow_grad.py
index 21354b5ae8ff1724bbb2539aff370b3df6da2598..45955554cab130597e106660ff1fb4cdf7e9aeb1 100644
--- a/tensorflow/python/ops/control_flow_grad.py
+++ b/tensorflow/python/ops/control_flow_grad.py
@@ -142,6 +142,7 @@ def _ExitGrad(op, grad):
   """Gradients for an exit op are calculated using an Enter op."""
   graph = ops.get_default_graph()
   # pylint: disable=protected-access
+  op_ctxt = op._get_control_flow_context()
   grad_ctxt = graph._get_control_flow_context()
   # pylint: enable=protected-access
   if not grad_ctxt.back_prop:
@@ -150,10 +151,8 @@ def _ExitGrad(op, grad):
     # no gradient computation.
     return None
 
-  # pylint: disable=protected-access
-  if op._get_control_flow_context().grad_state:
+  if op_ctxt.grad_state:
     raise TypeError("Second-order gradient for while loops not supported.")
-  # pylint: enable=protected-access
 
   if isinstance(grad, ops.Tensor):
     grad_ctxt.AddName(grad.name)
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index ff4f452bed673372cf5e63597ecb2378efea9f36..710287012eadea08d6c5be51a8e1be6cce6a5f65 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -196,7 +196,7 @@ def _Identity(data, name=None):
   data = ops.internal_convert_to_tensor_or_indexed_slices(data, as_ref=True)
   if isinstance(data, ops.Tensor):
     if data.dtype._is_ref_dtype:  # pylint: disable=protected-access
-      return gen_array_ops._ref_identity(data, name=name)
+      return gen_array_ops.ref_identity(data, name=name)
     else:
       return array_ops.identity(data, name=name)
   else:
@@ -264,10 +264,10 @@ def _Enter(data,
   data = ops.internal_convert_to_tensor_or_indexed_slices(data, as_ref=True)
   if isinstance(data, ops.Tensor):
     if data.dtype._is_ref_dtype and use_ref:  # pylint: disable=protected-access
-      result = gen_control_flow_ops._ref_enter(
+      result = gen_control_flow_ops.ref_enter(
           data, frame_name, is_constant, parallel_iterations, name=name)
     else:
-      result = gen_control_flow_ops._enter(
+      result = gen_control_flow_ops.enter(
           data, frame_name, is_constant, parallel_iterations, name=name)
     if use_input_shape:
       result.set_shape(data.get_shape())
@@ -282,7 +282,7 @@ def _Enter(data,
         parallel_iterations=parallel_iterations,
         use_input_shape=use_input_shape,
         name=name)
-    indices = gen_control_flow_ops._enter(
+    indices = gen_control_flow_ops.enter(
         data.indices,
         frame_name,
         is_constant,
@@ -293,7 +293,7 @@ def _Enter(data,
     if isinstance(data, ops.IndexedSlices):
       dense_shape = data.dense_shape
       if dense_shape is not None:
-        dense_shape = gen_control_flow_ops._enter(
+        dense_shape = gen_control_flow_ops.enter(
             dense_shape,
             frame_name,
             is_constant,
@@ -303,7 +303,7 @@ def _Enter(data,
           dense_shape.set_shape(data.dense_shape.get_shape())
       return ops.IndexedSlices(values, indices, dense_shape)
     else:
-      dense_shape = gen_control_flow_ops._enter(
+      dense_shape = gen_control_flow_ops.enter(
           data.dense_shape,
           frame_name,
           is_constant,
@@ -833,6 +833,9 @@ class GradLoopState(object):
     if outer_grad_state:
       outer_forward_ctxt = outer_grad_state.forward_context
     else:
+      if not hasattr(forward_ctxt, 'outer_context'):
+        raise ValueError("Failed to call gradients on a while loop without"
+                         "properly serializing graph via MetaGraphDef")
       outer_forward_ctxt = forward_ctxt.outer_context
 
     # Add the forward loop counter.
@@ -1467,7 +1470,10 @@ def ZerosLikeOutsideLoop(op, index):
       branch = op_ctxt.branch
       switch_val = switch(op.inputs[0], pred)[1 - branch]
       zeros_shape = array_ops.shape_internal(switch_val, optimize=False)
-      return array_ops.zeros(zeros_shape, dtype=val.dtype)
+      # Ensure ops created within array_ops.zeros are dominated by switch in
+      # cond context.
+      with ops.control_dependencies([switch_val]):
+        return array_ops.zeros(zeros_shape, dtype=val.dtype)
     else:
       return array_ops.zeros_like(val, optimize=False)
 
diff --git a/tensorflow/python/ops/ctc_ops.py b/tensorflow/python/ops/ctc_ops.py
index 4b57e2de790af13499bc73cfcfa98e999eab1603..908e7939027933327bcdeb21d598bc0b5ca5ff0f 100644
--- a/tensorflow/python/ops/ctc_ops.py
+++ b/tensorflow/python/ops/ctc_ops.py
@@ -218,7 +218,7 @@ def ctc_greedy_decoder(inputs, sequence_length, merge_repeated=True):
         The rows store: `[batch, time]`.
       `decoded.values`: Values vector, size `(total_decoded_outputs)`.
         The vector stores the decoded classes.
-      `decoded.shape`: Shape vector, size `(2)`.
+      `decoded.dense_shape`: Shape vector, size `(2)`.
         The shape values are: `[batch_size, max_decoded_length]`
     neg_sum_logits: A `float` matrix `(batch_size x 1)` containing, for the
         sequence found, the negative of the sum of the greatest logit at each
@@ -265,7 +265,7 @@ def ctc_beam_search_decoder(inputs, sequence_length, beam_width=100,
         The rows store: [batch, time].
       `decoded[j].values`: Values vector, size `(total_decoded_outputs[j])`.
         The vector stores the decoded classes for beam j.
-      `decoded[j].shape`: Shape vector, size `(2)`.
+      `decoded[j].dense_shape`: Shape vector, size `(2)`.
         The shape values are: `[batch_size, max_decoded_length[j]]`.
     log_probability: A `float` matrix `(batch_size x top_paths)` containing
         sequence log-probabilities.
diff --git a/tensorflow/python/ops/custom_gradient.py b/tensorflow/python/ops/custom_gradient.py
index 9eacac1b3704c43cbeb5ecd0cbe827cac3a7cc8b..dfa07abfc6474833143ce65ac5df65049e01cab8 100644
--- a/tensorflow/python/ops/custom_gradient.py
+++ b/tensorflow/python/ops/custom_gradient.py
@@ -95,7 +95,7 @@ def custom_gradient(f):
     if not context.executing_eagerly():
       if kwargs:
         raise ValueError(
-            "The custom_gradient decorator currently suports keywords "
+            "The custom_gradient decorator currently supports keywords "
             "arguments only when eager execution is enabled.")
       name = "CustomGradient-%s" % ops.uid()
       args = [ops.convert_to_tensor(x) for x in args]
diff --git a/tensorflow/python/ops/data_flow_ops.py b/tensorflow/python/ops/data_flow_ops.py
index d2cc87555f6321432261b32f08431c23ce707eff..cb725199a8501d2a894f18a9b57c23de4e349374 100644
--- a/tensorflow/python/ops/data_flow_ops.py
+++ b/tensorflow/python/ops/data_flow_ops.py
@@ -1769,7 +1769,9 @@ class StagingArea(BaseStagingArea):
     its capacity.
 
     Args:
-      values: Tensor (or a tuple of Tensors) to place into the staging area.
+      values: A single tensor, a list or tuple of tensors, or a dictionary with
+        tensor values. The number of elements must match the length of the
+        list provided to the dtypes argument when creating the StagingArea.
       name: A name for the operation (optional).
 
     Returns:
@@ -1780,11 +1782,12 @@ class StagingArea(BaseStagingArea):
     """
     with ops.name_scope(name, "%s_put" % self._name,
                         self._scope_vals(values)) as scope:
+      
+      if not isinstance(values, (list, tuple, dict)):
+        values = [values]
 
       # Hard-code indices for this staging area
-      indices = (
-          list(six.moves.range(len(values)))
-          if isinstance(values, (list, tuple)) else None)
+      indices = list(six.moves.range(len(values)))
       vals, _ = self._check_put_dtypes(values, indices)
 
       with ops.colocate_with(self._coloc_op):
diff --git a/tensorflow/python/ops/distributions/BUILD b/tensorflow/python/ops/distributions/BUILD
index 50b956a267320b40cb39fbff8b5965a6420146d7..9d9ede7ad75f4eafa91ad051458afbcb6dc8f7b5 100644
--- a/tensorflow/python/ops/distributions/BUILD
+++ b/tensorflow/python/ops/distributions/BUILD
@@ -26,15 +26,3 @@ py_library(
         "@six_archive//:six",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/ops/distributions/distribution.py b/tensorflow/python/ops/distributions/distribution.py
index 4071e50e815b01d30f3e24ba4677cc37b325f24d..7c43bf54fc783815127f03cc287ab0fc4349beb5 100644
--- a/tensorflow/python/ops/distributions/distribution.py
+++ b/tensorflow/python/ops/distributions/distribution.py
@@ -338,6 +338,27 @@ class Distribution(_BaseDistribution):
   cum_prob_invalid = u.cdf([4.0, 5.0, 6.0])
   ```
 
+  #### Shapes
+
+  There are three important concepts associated with TensorFlow Distributions
+  shapes:
+  - Event shape describes the shape of a single draw from the distribution;
+    it may be dependent across dimensions. For scalar distributions, the event
+    shape is `[]`. For a 5-dimensional MultivariateNormal, the event shape is
+    `[5]`.
+  - Batch shape describes independent, not identically distributed draws, aka a
+    "collection" or "bunch" of distributions.
+  - Sample shape describes independent, identically distributed draws of batches
+    from the distribution family.
+
+  The event shape and the batch shape are properties of a Distribution object,
+  whereas the sample shape is associated with a specific call to `sample` or
+  `log_prob`.
+
+  For detailed usage examples of TensorFlow Distributions shapes, see
+  [this tutorial](
+  https://github.com/tensorflow/probability/blob/master/tensorflow_probability/examples/jupyter_notebooks/Understanding%20TensorFlow%20Distributions%20Shapes.ipynb)
+
   #### Parameter values leading to undefined statistics or distributions.
 
   Some distributions do not have well-defined statistics for all initialization
@@ -593,7 +614,7 @@ class Distribution(_BaseDistribution):
     Returns:
       batch_shape: `TensorShape`, possibly unknown.
     """
-    return self._batch_shape()
+    return tensor_shape.as_shape(self._batch_shape())
 
   def _event_shape_tensor(self):
     raise NotImplementedError("event_shape_tensor is not implemented")
@@ -626,7 +647,7 @@ class Distribution(_BaseDistribution):
     Returns:
       event_shape: `TensorShape`, possibly unknown.
     """
-    return self._event_shape()
+    return tensor_shape.as_shape(self._event_shape())
 
   def is_scalar_event(self, name="is_scalar_event"):
     """Indicates that `event_shape == []`.
@@ -1105,6 +1126,34 @@ class Distribution(_BaseDistribution):
     with self._name_scope(name):
       return self._kl_divergence(other)
 
+  def __str__(self):
+    return ("tf.distributions.{type_name}("
+            "\"{self_name}\""
+            "{maybe_batch_shape}"
+            "{maybe_event_shape}"
+            ", dtype={dtype})".format(
+                type_name=type(self).__name__,
+                self_name=self.name,
+                maybe_batch_shape=(", batch_shape={}".format(self.batch_shape)
+                                   if self.batch_shape.ndims is not None
+                                   else ""),
+                maybe_event_shape=(", event_shape={}".format(self.event_shape)
+                                   if self.event_shape.ndims is not None
+                                   else ""),
+                dtype=self.dtype.name))
+
+  def __repr__(self):
+    return ("<tf.distributions.{type_name} "
+            "'{self_name}'"
+            " batch_shape={batch_shape}"
+            " event_shape={event_shape}"
+            " dtype={dtype}>".format(
+                type_name=type(self).__name__,
+                self_name=self.name,
+                batch_shape=self.batch_shape,
+                event_shape=self.event_shape,
+                dtype=self.dtype.name))
+
   @contextlib.contextmanager
   def _name_scope(self, name=None, values=None):
     """Helper function to standardize op scope."""
diff --git a/tensorflow/python/ops/distributions/uniform.py b/tensorflow/python/ops/distributions/uniform.py
index ec623b55eb0067e16599c18c9c504635da863907..0891bffdd554828586c5b23919f955f685632694 100644
--- a/tensorflow/python/ops/distributions/uniform.py
+++ b/tensorflow/python/ops/distributions/uniform.py
@@ -166,7 +166,8 @@ class Uniform(distribution.Distribution):
     return self.low + self.range() * samples
 
   def _prob(self, x):
-    broadcasted_x = x * array_ops.ones(self.batch_shape_tensor())
+    broadcasted_x = x * array_ops.ones(
+        self.batch_shape_tensor(), dtype=x.dtype)
     return array_ops.where(
         math_ops.is_nan(broadcasted_x),
         broadcasted_x,
diff --git a/tensorflow/python/ops/embedding_ops.py b/tensorflow/python/ops/embedding_ops.py
index 20e4a28b9c0f5427e331a69cba52503492a8420a..f0120f2957db12caf6a513fde9aa8c756aff8bad 100644
--- a/tensorflow/python/ops/embedding_ops.py
+++ b/tensorflow/python/ops/embedding_ops.py
@@ -35,34 +35,14 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import tf_export
 
 
-def _gather(params, ids, name=None):
-  """Helper function for _embedding_lookup_and_transform.
-
-  This function gathers embeddings from a single tensor. The gather deals with
-  resource variables specially.
-
-  Args:
-    params: A `Tensor` of embeddings.
-    ids: A `Tensor` indexing the embeddings to be retrieved from `params`.
-    name: A name for the operation (optional).
-
-  Returns:
-    A `Tensor` with the same type as `params`.
-  """
-  if isinstance(params, resource_variable_ops.ResourceVariable):
-    return params.sparse_read(ids, name=name)
-  else:
-    return array_ops.gather(params, ids, name=name)
-
-
 def _clip(params, ids, max_norm):
   """Helper function for _embedding_lookup_and_transform.
 
   This function optionally clips embeddings to an l2-norm of max_norm.
 
   Args:
-    params: A `Tensor` of embeddings retrieved by `_gather`.
-    ids: The `ids` argument that was passed to `_gather`.
+    params: A `Tensor` of embeddings retrieved by `gather`.
+    ids: The `ids` argument that was passed to `gather`.
     max_norm: If provided, the embeddings are l2-normalized to the value of
       max_norm.
 
@@ -148,7 +128,8 @@ def _embedding_lookup_and_transform(params,
     ids = ops.convert_to_tensor(ids, name="ids")
     if np == 1 and (not transform_fn or ids.get_shape().ndims == 1):
       with ops.colocate_with(params[0]):
-        result = _clip(_gather(params[0], ids, name=name), ids, max_norm)
+        result = _clip(array_ops.gather(params[0], ids, name=name),
+                       ids, max_norm)
         if transform_fn:
           result = transform_fn(result)
         return result
@@ -212,7 +193,7 @@ def _embedding_lookup_and_transform(params,
       for p in xrange(np):
         pids = gather_ids[p]
         with ops.colocate_with(params[p]):
-          result = _gather(params[p], pids)
+          result = array_ops.gather(params[p], pids)
           if transform_fn:
             # If transform_fn is provided, the clip_by_norm precedes
             # the transform and hence must be co-located. See below
diff --git a/tensorflow/python/ops/functional_ops.py b/tensorflow/python/ops/functional_ops.py
index a840b1eddfc6922dc310490e8166efd73480c437..161f6f36596279ee0dc7d04d454d670167ba798b 100644
--- a/tensorflow/python/ops/functional_ops.py
+++ b/tensorflow/python/ops/functional_ops.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -27,22 +27,24 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.core.framework import attr_value_pb2
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_functional_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variable_scope as vs
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.python.ops.gen_functional_ops import *
-# pylint: enable=wildcard-import
 # pylint: disable=unused-import
-from tensorflow.python.ops.gen_functional_ops import symbolic_gradient
+from tensorflow.python.ops.gen_functional_ops import remote_call
 # pylint: enable=unused-import
+from tensorflow.python.ops.gen_functional_ops import symbolic_gradient
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
@@ -365,7 +367,15 @@ def map_fn(fn, elems, dtype=None, parallel_iterations=10, back_prop=True,
     dtype_flat = output_flatten(dtype)
 
     # Convert elems to tensor array. n may be known statically.
-    n = elems_flat[0].shape[0].value or array_ops.shape(elems_flat[0])[0]
+    static_shape = elems_flat[0].shape
+    if static_shape.ndims is not None and static_shape.ndims < 1:
+      if len(elems_flat) == 1:
+        raise ValueError("elems must be a 1+ dimensional Tensor, not a scalar")
+      else:
+        raise ValueError(
+            "elements in elems must be 1+ dimensional Tensors, not scalars"
+        )
+    n = static_shape[0].value or array_ops.shape(elems_flat[0])[0]
 
     # TensorArrays are always flat
     elems_ta = [
@@ -634,3 +644,249 @@ def scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
       varscope.set_caching_device(None)
 
     return output_pack(results_flat)
+
+
+# pylint: disable=invalid-name
+def If(cond, inputs, then_branch, else_branch, name=None):
+  r"""output = Cond(inputs) ? then_branch(inputs) : else_branch(inputs).
+
+  Args:
+    cond: A `Tensor`. A scalar. If the scalar is not a boolean, the scalar is
+      converted to a boolean according to the following rule: if the
+      scalar is a numerical value, non-zero means True and zero means
+      False; if the scalar is a string, non-empty means True and empty
+      means False.
+    inputs: A list of input tensors.
+    then_branch: A function takes 'inputs' and returns a list of tensors,
+        whose types are the same as what else_branch returns.
+    else_branch: A function takes 'inputs' and returns a list of tensors.
+        whose types are the same as what then_branch returns.
+    name: A name for the operation (optional).
+
+  Returns:
+    A list of tensors returned by either then_branch(inputs)
+    or else_branch(inputs).
+  """
+  # pylint: disable=protected-access
+  return gen_functional_ops._if(
+      cond,
+      inputs, [_.type for _ in then_branch.definition.signature.output_arg],
+      then_branch,
+      else_branch,
+      name=name)
+
+
+def Gradient(inputs, f, name=None):
+  r"""Computes the gradient function for function f via backpropagation.
+
+  Args:
+    inputs: A list of tensors of size N + M.
+    f: The function we want to compute the gradient for.
+
+      The function 'f' must be a numerical function which takes N inputs and
+      produces M outputs. Its gradient function 'g', which is  a function
+      taking N + M inputs and produces N outputs.
+
+      I.e. if we have
+         (y1, y2, ..., yM) = f(x1, x2, ..., xN),
+      then, g is
+         (dL/dx1, dL/dx2, ..., dL/dxN) = g(x1, x2, ..., xN,
+                                           dL/dy1, dL/dy2, ..., dL/dyM),
+
+      where L is a scalar-value function of (x1, x2, ..., xN) (e.g., the
+      loss function). dL/dxi is the partial derivative of L with respect
+      to xi.
+
+    name: A name for the operation (optional).
+
+  Returns:
+    A list of tensors of size N.
+  """
+  # TODO(zhifengc): Pretty-print the above spec in latex.
+  # TODO(zhfiengc): Needs some math expert to say the comment above better.
+  tlist = [_.type for _ in f.definition.signature.input_arg]
+  return symbolic_gradient(input=inputs, Tout=tlist, f=f, name=name)
+
+
+# pylint: disable=invalid-name,protected-access
+def While(input_, cond, body, name=None, hostmem=None):
+  r"""output = input; While (Cond(output)) { output = Body(output) }.
+
+  Args:
+    input_: A list of `Tensor` objects.
+      A list of input tensors whose types are T.
+    cond: . A function takes 'input' and returns a tensor.  If the tensor is
+      a scalar of non-boolean, the scalar is converted to a boolean
+      according to the following rule: if the scalar is a numerical
+      value, non-zero means True and zero means False; if the scalar is
+      a string, non-empty means True and empty means False. If the
+      tensor is not a scalar, non-emptiness means True and False
+      otherwise.
+    body: . A funcion takes a list of tensors and returns another
+      list tensors. Both lists have the same types as specified
+      by T.
+    name: A name for the operation (optional).
+    hostmem: A list of integer. If i is in the list, input[i] is a
+      host memory tensor.
+
+  Returns:
+    A list of `Tensor` objects. Has the same type as `input`.
+    A list of output tensors whose types are T.
+  """
+  ret = gen_functional_ops._while(input_, cond, body, name=name)
+  if hostmem:
+    input_attr = attr_value_pb2.AttrValue()
+    input_attr.list.i.extend(hostmem)
+    ret[0].op._set_attr("_input_hostmem", input_attr)  # pylint: disable=protected-access
+
+    output_attr = attr_value_pb2.AttrValue()
+    output_attr.list.i.extend(hostmem)
+    ret[0].op._set_attr("_output_hostmem", output_attr)  # pylint: disable=protected-access
+  return ret
+
+
+# b/36459430
+#
+# Ideally, we do not need this rewrite For loop into a While loop.
+# However, today, if a While runs on GPU and the condition returns a
+# boolean, the While kernel crashes. Even if we fix the crash, the
+# bool needs to be copied between GPU and CPU. So, a for loop is much
+# preferred when running on GPU.
+#
+# On the other hand, For op has no directly XLA kernel. So, when we run
+# a for loop, we need to rewrite it using a While op.
+#
+# It should be possible and probably better to write a XLA C++ kernel
+# implementing the logic in _ForUsingWhile.
+def _ForUsingWhile(start,
+                   limit,
+                   delta,
+                   inputs,
+                   forbody,
+                   name=None,
+                   hostmem=None):
+  """Helper to implement a For loop using a While."""
+  # To support negative delta (e.g., range(100, 0, -3)), we iterate
+  # over the range(n) and use iter * delta + start as the real
+  # iteration index. (e.g., for i in range(34): iter = i * (-3) +
+  # 100).
+  d = math_ops.abs(delta)
+  # XLA on TPUs doesn't support integer division
+  n = math_ops.cast(
+      math_ops.cast((math_ops.abs(limit - start) + d - 1), dtypes.float32) /
+      math_ops.cast(d, dtypes.float32), dtypes.int32)
+
+  # Carried loop variables ("extra_args") are implicitly added to the input list
+  # of the WhileBody function. WhileCond does not call forbody, and so does not
+  # depend on any of forbody's extra_args. Since WhileCond and WhileBody
+  # must have identical inputs, we have to augment the cond signature to take
+  # the same types as the carried loop variables.
+  body_sig = [dtypes.int32] * 4 + list(forbody.declared_input_types)[1:]
+  cond_sig = body_sig + [t.dtype for t in forbody.captured_inputs]
+
+  cond_name = "%s_Cond" % forbody.name
+
+  @function.Defun(*cond_sig, func_name=cond_name)
+  def WhileCond(i, n, *args):
+    del args
+    return i < n
+
+  body_name = "%s_Body" % forbody.name
+
+  @function.Defun(*body_sig, func_name=body_name)
+  def WhileBody(i, n, start, delta, *args):
+    """A While wrapper for forbody that handles loop-carried captured inputs."""
+    for_result = forbody(start + i * delta, *args)
+    # Nullary functions return an Operation. Normal functions can't do this
+    # because their return values are converted to Tensors.
+    if isinstance(for_result, ops.Operation):
+      for_result = ()
+    # Unary functions return a single Tensor value.
+    elif isinstance(for_result, ops.Tensor):
+      for_result = (for_result,)
+    extra_args = tuple(function.get_extra_args())
+    return (i + 1, n, start, delta) + tuple(for_result) + extra_args
+
+  if hostmem is not None:
+    hostmem = [(4 + _) for _ in hostmem]
+
+  results = While(
+      input_=[0, n, start, delta] + inputs + WhileBody.captured_inputs,
+      cond=WhileCond,
+      body=WhileBody,
+      name=name,
+      hostmem=hostmem)
+  # Slice off the loop-carried captured inputs.
+  return list(results[4:len(results) - len(WhileBody.captured_inputs)])
+
+
+def For(start,
+        limit,
+        delta,
+        inputs,
+        body,
+        name=None,
+        hostmem=None,
+        rewrite_with_while=None):
+  r"""out = input; for i in range(start, limit, delta) out = body(i, out).
+
+  Args:
+    start: A `Tensor` of type `int32`.
+    limit: A `Tensor` of type `int32`.
+    delta: A `Tensor` of type `int32`.
+    inputs: A list of `Tensor` objects.
+      A list of input tensors whose types are T.
+    body: A function takes a list of tensors and returns another
+      list of tensors. Both lists have the same types as (int32, T...).
+    name: A name for the operation (optional).
+    hostmem: A list of integer. If i is in the list, inputs[i] is a
+      host memory tensor. In other words, (i+1)-th argument of the body
+      function is expecting a host memory.
+    rewrite_with_while: If True, using While op to implement the For.
+
+  Returns:
+    A list of `Tensor` objects. Has the same type as `input`.
+    A list of output tensors whose types are T.
+  """
+  if rewrite_with_while:
+    return _ForUsingWhile(start, limit, delta, inputs, body, name, hostmem)
+  if body.captured_inputs:
+    wrapper_name = "%s_BodyWrapper" % body.name
+
+    @function.Defun(*body.declared_input_types, func_name=wrapper_name)
+    def BodyWrapper(*args):
+      """A wrapper for body that handles loop-carried captured inputs."""
+      body_result = body(*args)
+      extra_args = tuple(function.get_extra_args())
+      # Nullary functions return an Operation. Normal functions can't do this
+      # because their return values are converted to Tensors.
+      if isinstance(body_result, ops.Operation):
+        return extra_args
+      # Unary functions return a single Tensor value.
+      elif not isinstance(body_result, tuple):
+        return (body_result,) + extra_args
+      # N-ary functions return a tuple of Tensors.
+      else:
+        return body_result + extra_args
+
+    inputs += BodyWrapper.captured_inputs
+    ret = gen_functional_ops._for(
+        start, limit, delta, inputs, BodyWrapper, name=name)
+    # Slice off the loop-carried captured inputs.
+    ret = ret[:-len(BodyWrapper.captured_inputs)]
+  else:
+    ret = gen_functional_ops._for(start, limit, delta, inputs, body, name=name)
+  if hostmem:
+    num_for_params = 3  # start/limit/delta
+
+    input_attr = attr_value_pb2.AttrValue()
+    input_attr.list.i.extend([num_for_params + i for i in hostmem])
+    ret[0].op._set_attr("_input_hostmem", input_attr)  # pylint: disable=protected-access
+
+    output_attr = attr_value_pb2.AttrValue()
+    output_attr.list.i.extend(hostmem)
+    ret[0].op._set_attr("_output_hostmem", output_attr)  # pylint: disable=protected-access
+  return ret
+
+
+# pylint: enable=invalid-name,protected-access
diff --git a/tensorflow/python/ops/gradients.py b/tensorflow/python/ops/gradients.py
index 63d9a232224cc70c9ce6f04787a8dcc921b3e2f3..2668e8f60cd2864fd59ffa3fb539380d34a34004 100644
--- a/tensorflow/python/ops/gradients.py
+++ b/tensorflow/python/ops/gradients.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=unused-import
+from tensorflow.python.eager.backprop import GradientTape
 from tensorflow.python.ops.custom_gradient import custom_gradient
 from tensorflow.python.ops.gradients_impl import AggregationMethod
 from tensorflow.python.ops.gradients_impl import gradients
@@ -29,6 +30,7 @@ from tensorflow.python.util.all_util import remove_undocumented
 _allowed_symbols = [
     # TODO(drpng): find a good place to reference this.
     "AggregationMethod",
+    "GradientTape",
     "custom_gradient",
     "gradients",  # tf.gradients.gradients.
     "hessians",  # tf.gradients.hessians
diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py
index 40ab22951b1aa04a61e09aac155b6449ae358d7b..9dfe5ffbf42bcfc9657739b6fc6ad1f3c4823a7d 100644
--- a/tensorflow/python/ops/init_ops.py
+++ b/tensorflow/python/ops/init_ops.py
@@ -532,8 +532,7 @@ class Orthogonal(Initializer):
     q, r = linalg_ops.qr(a, full_matrices=False)
     # Make Q uniform
     d = array_ops.diag_part(r)
-    ph = d / math_ops.abs(d)
-    q *= ph
+    q *= math_ops.sign(d)
     if num_rows < num_cols:
       q = array_ops.matrix_transpose(q)
     return self.gain * array_ops.reshape(q, shape)
@@ -579,7 +578,11 @@ class ConvolutionDeltaOrthogonal(Initializer):
     a = random_ops.random_normal([shape[-1], shape[-1]],
                                  dtype=dtype, seed=self.seed)
     # Compute the qr factorization
-    q, _ = linalg_ops.qr(a, full_matrices=False)
+    q, r = linalg_ops.qr(a, full_matrices=False)
+    # Make Q uniform
+    d = array_ops.diag_part(r)
+    # ph = d / math_ops.abs(d)
+    q *= math_ops.sign(d)
     q = q[:shape[-2], :]
     q *= math_ops.sqrt(math_ops.cast(self.gain, dtype=dtype))
     if len(shape) == 3:
diff --git a/tensorflow/python/ops/linalg/BUILD b/tensorflow/python/ops/linalg/BUILD
index ce8c1580fe5ee614558bfd52afde0d9c5088abe6..07659ef44c443ad15876781d6c6254ae3bc38660 100644
--- a/tensorflow/python/ops/linalg/BUILD
+++ b/tensorflow/python/ops/linalg/BUILD
@@ -34,15 +34,3 @@ py_library(
         "//tensorflow/python:special_math_ops",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/ops/linalg/linear_operator.py b/tensorflow/python/ops/linalg/linear_operator.py
index 957a7959181efe3bbc319e62582053329b763dc3..193c787baa2ac68feec7e5d8bb03b251fc78d781 100644
--- a/tensorflow/python/ops/linalg/linear_operator.py
+++ b/tensorflow/python/ops/linalg/linear_operator.py
@@ -166,8 +166,7 @@ class LinearOperator(object):
         meaning the quadratic form `x^H A x` has positive real part for all
         nonzero `x`.  Note that we do not require the operator to be
         self-adjoint to be positive-definite.  See:
-        https://en.wikipedia.org/wiki/Positive-definite_matrix\
-            #Extension_for_non_symmetric_matrices
+        https://en.wikipedia.org/wiki/Positive-definite_matrix#Extension_for_non-symmetric_matrices
       is_square:  Expect that this operator acts like square [batch] matrices.
       name: A name for this `LinearOperator`.
 
@@ -204,16 +203,6 @@ class LinearOperator(object):
     self._is_positive_definite = is_positive_definite
     self._name = name or type(self).__name__
 
-    # We will cache some tensors to avoid repeatedly adding shape
-    # manipulation ops to the graph.
-    # Naming convention:
-    #   self._cached_X_tensor is the cached version of self._X_tensor.
-    self._cached_shape_tensor = None
-    self._cached_batch_shape_tensor = None
-    self._cached_domain_dimension_tensor = None
-    self._cached_range_dimension_tensor = None
-    self._cached_tensor_rank_tensor = None
-
   @contextlib.contextmanager
   def _name_scope(self, name=None, values=None):
     """Helper function to standardize op scope."""
@@ -299,15 +288,11 @@ class LinearOperator(object):
       `int32` `Tensor`
     """
     with self._name_scope(name):
-      # Be clean by avoiding adding shape Ops to the graph too many times.
-      if self._cached_shape_tensor is None:
-        # Prefer to use statically defined shape if available.
-        if self.shape.is_fully_defined():
-          self._cached_shape_tensor = linear_operator_util.shape_tensor(
-              self.shape.as_list())
-        else:
-          self._cached_shape_tensor = self._shape_tensor()
-      return self._cached_shape_tensor
+      # Prefer to use statically defined shape if available.
+      if self.shape.is_fully_defined():
+        return linear_operator_util.shape_tensor(self.shape.as_list())
+      else:
+        return self._shape_tensor()
 
   @property
   def batch_shape(self):
@@ -338,14 +323,12 @@ class LinearOperator(object):
     """
     # Derived classes get this "for free" once .shape() is implemented.
     with self._name_scope(name):
-      if self._cached_batch_shape_tensor is None:
-        # Prefer to use statically defined shape if available.
-        if self.batch_shape.is_fully_defined():
-          self._cached_batch_shape_tensor = linear_operator_util.shape_tensor(
-              self.batch_shape.as_list(), name="batch_shape")
-        else:
-          self._cached_batch_shape_tensor = self.shape_tensor()[:-2]
-      return self._cached_batch_shape_tensor
+      # Prefer to use statically defined shape if available.
+      if self.batch_shape.is_fully_defined():
+        return linear_operator_util.shape_tensor(
+            self.batch_shape.as_list(), name="batch_shape")
+      else:
+        return self.shape_tensor()[:-2]
 
   @property
   def tensor_rank(self, name="tensor_rank"):
@@ -378,14 +361,11 @@ class LinearOperator(object):
     """
     # Derived classes get this "for free" once .shape() is implemented.
     with self._name_scope(name):
-      if self._cached_tensor_rank_tensor is None:
-        # Prefer to use statically defined shape if available.
-        if self.tensor_rank is not None:
-          self._cached_tensor_rank_tensor = ops.convert_to_tensor(
-              self.tensor_rank)
-        else:
-          self._cached_tensor_rank_tensor = array_ops.size(self.shape_tensor())
-      return self._cached_tensor_rank_tensor
+      # Prefer to use statically defined shape if available.
+      if self.tensor_rank is not None:
+        return ops.convert_to_tensor(self.tensor_rank)
+      else:
+        return array_ops.size(self.shape_tensor())
 
   @property
   def domain_dimension(self):
@@ -416,14 +396,11 @@ class LinearOperator(object):
     """
     # Derived classes get this "for free" once .shape() is implemented.
     with self._name_scope(name):
-      if self._cached_domain_dimension_tensor is None:
-        # Prefer to use statically defined shape if available.
-        if self.domain_dimension.value is not None:
-          self._cached_domain_dimension_tensor = ops.convert_to_tensor(
-              self.domain_dimension.value)
-        else:
-          self._cached_domain_dimension_tensor = self.shape_tensor()[-1]
-      return self._cached_domain_dimension_tensor
+      # Prefer to use statically defined shape if available.
+      if self.domain_dimension.value is not None:
+        return ops.convert_to_tensor(self.domain_dimension.value)
+      else:
+        return self.shape_tensor()[-1]
 
   @property
   def range_dimension(self):
@@ -454,14 +431,11 @@ class LinearOperator(object):
     """
     # Derived classes get this "for free" once .shape() is implemented.
     with self._name_scope(name):
-      if self._cached_range_dimension_tensor is None:
-        # Prefer to use statically defined shape if available.
-        if self.range_dimension.value is not None:
-          self._cached_range_dimension_tensor = ops.convert_to_tensor(
-              self.range_dimension.value)
-        else:
-          self._cached_range_dimension_tensor = self.shape_tensor()[-2]
-      return self._cached_range_dimension_tensor
+      # Prefer to use statically defined shape if available.
+      if self.range_dimension.value is not None:
+        return ops.convert_to_tensor(self.range_dimension.value)
+      else:
+        return self.shape_tensor()[-2]
 
   def _assert_non_singular(self):
     """Private default implementation of _assert_non_singular."""
@@ -471,8 +445,7 @@ class LinearOperator(object):
     if self._can_use_cholesky():
       return self.assert_positive_definite()
     else:
-      singular_values = linalg_ops.svd(
-          self._get_cached_dense_matrix(), compute_uv=False)
+      singular_values = linalg_ops.svd(self.to_dense(), compute_uv=False)
       # TODO(langmore) Add .eig and .cond as methods.
       cond = (math_ops.reduce_max(singular_values, axis=-1) /
               math_ops.reduce_min(singular_values, axis=-1))
@@ -524,7 +497,7 @@ class LinearOperator(object):
     # and sufficient.
     if self.is_self_adjoint:
       return check_ops.assert_positive(
-          array_ops.matrix_diag_part(self._get_cached_chol()),
+          array_ops.matrix_diag_part(linalg_ops.cholesky(self.to_dense())),
           message="Matrix was not positive definite.")
     # We have no generic check for positive definite.
     raise NotImplementedError("assert_positive_definite is not implemented.")
@@ -547,7 +520,7 @@ class LinearOperator(object):
       return self._assert_positive_definite()
 
   def _assert_self_adjoint(self):
-    dense = self._get_cached_dense_matrix()
+    dense = self.to_dense()
     logging.warn(
         "Using (possibly slow) default implementation of assert_self_adjoint."
         "  Requires conversion to a dense matrix.")
@@ -692,7 +665,7 @@ class LinearOperator(object):
         "Using (possibly slow) default implementation of determinant."
         "  Requires conversion to a dense matrix and O(N^3) operations.")
     if self._can_use_cholesky():
-      diag = array_ops.matrix_diag_part(self._get_cached_chol())
+      diag = array_ops.matrix_diag_part(linalg_ops.cholesky(self.to_dense()))
       return 2 * math_ops.reduce_sum(math_ops.log(diag), reduction_indices=[-1])
     _, log_abs_det = linalg.slogdet(self._matrix)
     return log_abs_det
@@ -726,9 +699,9 @@ class LinearOperator(object):
         "  Requires conversion to a dense matrix and O(N^3) operations.")
     rhs = linalg.adjoint(rhs) if adjoint_arg else rhs
     if self._can_use_cholesky():
-      return linalg_ops.cholesky_solve(self._get_cached_chol(), rhs)
-    return linalg_ops.matrix_solve(
-        self._get_cached_dense_matrix(), rhs, adjoint=adjoint)
+      return linalg_ops.cholesky_solve(
+          linalg_ops.cholesky(self.to_dense()), rhs)
+    return linalg_ops.matrix_solve(self.to_dense(), rhs, adjoint=adjoint)
 
   def solve(self, rhs, adjoint=False, adjoint_arg=False, name="solve"):
     """Solve (exact or approx) `R` (batch) systems of equations: `A X = rhs`.
@@ -866,7 +839,7 @@ class LinearOperator(object):
 
   def _diag_part(self):
     """Generic and often inefficient implementation.  Override often."""
-    return array_ops.matrix_diag_part(self._get_cached_dense_matrix())
+    return array_ops.matrix_diag_part(self.to_dense())
 
   def diag_part(self, name="diag_part"):
     """Efficiently get the [batch] diagonal part of this operator.
@@ -915,7 +888,7 @@ class LinearOperator(object):
 
   def _add_to_tensor(self, x):
     # Override if a more efficient implementation is available.
-    return self._get_cached_dense_matrix() + x
+    return self.to_dense() + x
 
   def add_to_tensor(self, x, name="add_to_tensor"):
     """Add matrix represented by this operator to `x`.  Equivalent to `A + x`.
@@ -936,13 +909,3 @@ class LinearOperator(object):
     # TODO(langmore) Add complex types when tf.cholesky can use them.
     return (not self.dtype.is_complex and self.is_self_adjoint and
             self.is_positive_definite)
-
-  def _get_cached_dense_matrix(self):
-    if not hasattr(self, "_cached_dense_matrix"):
-      self._cached_dense_matrix = self.to_dense()
-    return self._cached_dense_matrix
-
-  def _get_cached_chol(self):
-    if not hasattr(self, "_cached_chol"):
-      self._cached_chol = linalg_ops.cholesky(self._get_cached_dense_matrix())
-    return self._cached_chol
diff --git a/tensorflow/python/ops/linalg/linear_operator_composition.py b/tensorflow/python/ops/linalg/linear_operator_composition.py
index ecd30e4d7e4dd7cfd4b109ad6e60aacb172700f6..0292bc51dcf9809941087dd4aa1ea4c760c064d1 100644
--- a/tensorflow/python/ops/linalg/linear_operator_composition.py
+++ b/tensorflow/python/ops/linalg/linear_operator_composition.py
@@ -134,8 +134,7 @@ class LinearOperatorComposition(linear_operator.LinearOperator):
         meaning the quadratic form `x^H A x` has positive real part for all
         nonzero `x`.  Note that we do not require the operator to be
         self-adjoint to be positive-definite.  See:
-        https://en.wikipedia.org/wiki/Positive-definite_matrix\
-            #Extension_for_non_symmetric_matrices
+        https://en.wikipedia.org/wiki/Positive-definite_matrix#Extension_for_non-symmetric_matrices
       is_square:  Expect that this operator acts like square [batch] matrices.
       name: A name for this `LinearOperator`.  Default is the individual
         operators names joined with `_o_`.
diff --git a/tensorflow/python/ops/linalg/linear_operator_diag.py b/tensorflow/python/ops/linalg/linear_operator_diag.py
index e180e830263c44fb5ae290d307f1ef80106c31d5..5beaea65a5171ad7e92042a2afa81c0507e51d0e 100644
--- a/tensorflow/python/ops/linalg/linear_operator_diag.py
+++ b/tensorflow/python/ops/linalg/linear_operator_diag.py
@@ -132,8 +132,7 @@ class LinearOperatorDiag(linear_operator.LinearOperator):
         meaning the quadratic form `x^H A x` has positive real part for all
         nonzero `x`.  Note that we do not require the operator to be
         self-adjoint to be positive-definite.  See:
-        https://en.wikipedia.org/wiki/Positive-definite_matrix\
-            #Extension_for_non_symmetric_matrices
+        https://en.wikipedia.org/wiki/Positive-definite_matrix#Extension_for_non-symmetric_matrices
       is_square:  Expect that this operator acts like square [batch] matrices.
       name: A name for this `LinearOperator`.
 
diff --git a/tensorflow/python/ops/linalg/linear_operator_full_matrix.py b/tensorflow/python/ops/linalg/linear_operator_full_matrix.py
index f979fb37d6c69a2683af08a1f6722b98da0b6650..5ba3b090ae9decaba239b31226db84c2d7b254bd 100644
--- a/tensorflow/python/ops/linalg/linear_operator_full_matrix.py
+++ b/tensorflow/python/ops/linalg/linear_operator_full_matrix.py
@@ -125,8 +125,7 @@ class LinearOperatorFullMatrix(linear_operator.LinearOperator):
         meaning the quadratic form `x^H A x` has positive real part for all
         nonzero `x`.  Note that we do not require the operator to be
         self-adjoint to be positive-definite.  See:
-        https://en.wikipedia.org/wiki/Positive-definite_matrix\
-            #Extension_for_non_symmetric_matrices
+        https://en.wikipedia.org/wiki/Positive-definite_matrix#Extension_for_non-symmetric_matrices
       is_square:  Expect that this operator acts like square [batch] matrices.
       name: A name for this `LinearOperator`.
 
diff --git a/tensorflow/python/ops/linalg/linear_operator_identity.py b/tensorflow/python/ops/linalg/linear_operator_identity.py
index 50f3d407e85e4cca22ad6326931b5a2a736819a8..45929eb4e2e91218784a9fabba23b57851ae3cc8 100644
--- a/tensorflow/python/ops/linalg/linear_operator_identity.py
+++ b/tensorflow/python/ops/linalg/linear_operator_identity.py
@@ -236,8 +236,7 @@ class LinearOperatorIdentity(BaseLinearOperatorIdentity):
         meaning the quadratic form `x^H A x` has positive real part for all
         nonzero `x`.  Note that we do not require the operator to be
         self-adjoint to be positive-definite.  See:
-        https://en.wikipedia.org/wiki/Positive-definite_matrix\
-            #Extension_for_non_symmetric_matrices
+        https://en.wikipedia.org/wiki/Positive-definite_matrix#Extension_for_non-symmetric_matrices
       is_square:  Expect that this operator acts like square [batch] matrices.
       assert_proper_shapes:  Python `bool`.  If `False`, only perform static
         checks that initialization and method arguments have proper shape.
@@ -576,8 +575,7 @@ class LinearOperatorScaledIdentity(BaseLinearOperatorIdentity):
         meaning the quadratic form `x^H A x` has positive real part for all
         nonzero `x`.  Note that we do not require the operator to be
         self-adjoint to be positive-definite.  See:
-        https://en.wikipedia.org/wiki/Positive-definite_matrix\
-            #Extension_for_non_symmetric_matrices
+        https://en.wikipedia.org/wiki/Positive-definite_matrix#Extension_for_non-symmetric_matrices
       is_square:  Expect that this operator acts like square [batch] matrices.
       assert_proper_shapes:  Python `bool`.  If `False`, only perform static
         checks that initialization and method arguments have proper shape.
diff --git a/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py b/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py
index a5130188b681813e1ccd4818dabdffeeb663e20a..c4d386ccb4efc7dede8310243e517fe2f6b45bd9 100644
--- a/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py
+++ b/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py
@@ -133,8 +133,7 @@ class LinearOperatorLowerTriangular(linear_operator.LinearOperator):
         meaning the quadratic form `x^H A x` has positive real part for all
         nonzero `x`.  Note that we do not require the operator to be
         self-adjoint to be positive-definite.  See:
-        https://en.wikipedia.org/wiki/Positive-definite_matrix\
-            #Extension_for_non_symmetric_matrices
+        https://en.wikipedia.org/wiki/Positive-definite_matrix#Extension_for_non-symmetric_matrices
       is_square:  Expect that this operator acts like square [batch] matrices.
       name: A name for this `LinearOperator`.
 
diff --git a/tensorflow/python/ops/linalg/linear_operator_util.py b/tensorflow/python/ops/linalg/linear_operator_util.py
index 427bd1e890305618264a6a588be4e4ffade33c01..9dd40765c20222c6998260547b7e8fa341e65437 100644
--- a/tensorflow/python/ops/linalg/linear_operator_util.py
+++ b/tensorflow/python/ops/linalg/linear_operator_util.py
@@ -23,6 +23,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 
 
@@ -102,6 +103,22 @@ def assert_is_batch_matrix(tensor):
         "%s" % tensor)
 
 
+def shape_tensor(shape, name=None):
+  """Convert Tensor using default type, unless empty list or tuple."""
+  # Works just like random_ops._ShapeTensor.
+  if isinstance(shape, (tuple, list)) and not shape:
+    dtype = dtypes.int32
+  else:
+    dtype = None
+  return ops.convert_to_tensor(shape, dtype=dtype, name=name)
+
+
+################################################################################
+# Broadcasting versions of common linear algebra functions.
+# TODO(b/77519145) Do this more efficiently in some special cases.
+################################################################################
+
+
 def broadcast_matrix_batch_dims(batch_matrices, name=None):
   """Broadcast leading dimensions of zero or more [batch] matrices.
 
@@ -170,7 +187,8 @@ def broadcast_matrix_batch_dims(batch_matrices, name=None):
     bcast_batch_shape = batch_matrices[0].get_shape()[:-2]
     for mat in batch_matrices[1:]:
       bcast_batch_shape = array_ops.broadcast_static_shape(
-          bcast_batch_shape, mat.get_shape()[:-2])
+          bcast_batch_shape,
+          mat.get_shape()[:-2])
     if bcast_batch_shape.is_fully_defined():
       # The [1, 1] at the end will broadcast with anything.
       bcast_shape = bcast_batch_shape.concatenate([1, 1])
@@ -183,7 +201,8 @@ def broadcast_matrix_batch_dims(batch_matrices, name=None):
     bcast_batch_shape = array_ops.shape(batch_matrices[0])[:-2]
     for mat in batch_matrices[1:]:
       bcast_batch_shape = array_ops.broadcast_dynamic_shape(
-          bcast_batch_shape, array_ops.shape(mat)[:-2])
+          bcast_batch_shape,
+          array_ops.shape(mat)[:-2])
     bcast_shape = array_ops.concat([bcast_batch_shape, [1, 1]], axis=0)
     for i, mat in enumerate(batch_matrices):
       batch_matrices[i] = _broadcast_to_shape(mat, bcast_shape)
@@ -195,6 +214,13 @@ def _broadcast_to_shape(x, shape):
   return x + array_ops.zeros(shape=shape, dtype=x.dtype)
 
 
+def cholesky_solve_with_broadcast(chol, rhs, name=None):
+  """Solve systems of linear equations."""
+  with ops.name_scope(name, "CholeskySolveWithBroadcast", [chol, rhs]):
+    chol, rhs = broadcast_matrix_batch_dims([chol, rhs])
+    return linalg_ops.cholesky_solve(chol, rhs)
+
+
 def matmul_with_broadcast(a,
                           b,
                           transpose_a=False,
@@ -206,6 +232,11 @@ def matmul_with_broadcast(a,
                           name=None):
   """Multiplies matrix `a` by matrix `b`, producing `a @ b`.
 
+  Works identically to `tf.matmul`, but broadcasts batch dims
+  of `a` and `b` (by replicating) if they are determined statically to be
+  different, or if static shapes are not fully defined.  Thus, this may result
+  in an inefficient replication of data.
+
   The inputs must be matrices (or tensors of rank > 2, representing batches of
   matrices).
 
@@ -276,7 +307,7 @@ def matmul_with_broadcast(a,
     ValueError: If transpose_a and adjoint_a, or transpose_b and adjoint_b
       are both set to True.
   """
-  with ops.name_scope(name, "MatMulWithBroadcast", [a, b]) as name:
+  with ops.name_scope(name, "MatMulWithBroadcast", [a, b]):
     a, b = broadcast_matrix_batch_dims([a, b])
     return math_ops.matmul(
         a,
@@ -289,11 +320,43 @@ def matmul_with_broadcast(a,
         b_is_sparse=b_is_sparse)
 
 
-def shape_tensor(shape, name=None):
-  """Convert Tensor using default type, unless empty list or tuple."""
-  # Works just like random_ops._ShapeTensor.
-  if isinstance(shape, (tuple, list)) and not shape:
-    dtype = dtypes.int32
-  else:
-    dtype = None
-  return ops.convert_to_tensor(shape, dtype=dtype, name=name)
+def matrix_solve_with_broadcast(matrix, rhs, adjoint=False, name=None):
+  """Solve systems of linear equations."""
+  with ops.name_scope(name, "MatrixSolveWithBroadcast", [matrix, rhs]):
+    matrix, rhs = broadcast_matrix_batch_dims([matrix, rhs])
+    return linalg_ops.matrix_solve(matrix, rhs, adjoint=adjoint)
+
+
+def matrix_triangular_solve_with_broadcast(matrix,
+                                           rhs,
+                                           lower=True,
+                                           adjoint=False,
+                                           name=None):
+  """Solves triangular systems of linear equations with by backsubstitution.
+
+  Works identically to `tf.matrix_triangular_solve`, but broadcasts batch dims
+  of `matrix` and `rhs` (by replicating) if they are determined statically to be
+  different, or if static shapes are not fully defined.  Thus, this may result
+  in an inefficient replication of data.
+
+  Args:
+    matrix: A Tensor. Must be one of the following types:
+      `float64`, `float32`, `complex64`, `complex128`. Shape is `[..., M, M]`.
+    rhs: A `Tensor`. Must have the same `dtype` as `matrix`.
+      Shape is `[..., M, K]`.
+    lower: An optional `bool`. Defaults to `True`. Indicates whether the
+      innermost matrices in `matrix` are lower or upper triangular.
+    adjoint: An optional `bool`. Defaults to `False`. Indicates whether to solve
+      with matrix or its (block-wise) adjoint.
+    name: A name for the operation (optional).
+
+  Returns:
+    `Tensor` with same `dtype` as `matrix` and shape `[..., M, K]`.
+  """
+  with ops.name_scope(name, "MatrixTriangularSolve", [matrix, rhs]):
+    matrix, rhs = broadcast_matrix_batch_dims([matrix, rhs])
+    return linalg_ops.matrix_triangular_solve(
+        matrix,
+        rhs,
+        lower=lower,
+        adjoint=adjoint)
diff --git a/tensorflow/python/ops/linalg_ops.py b/tensorflow/python/ops/linalg_ops.py
index b306042aff660b82db6681cae4b95aee1debc314..50706e57819abc18241cba519f913d7e0a282215 100644
--- a/tensorflow/python/ops/linalg_ops.py
+++ b/tensorflow/python/ops/linalg_ops.py
@@ -342,7 +342,7 @@ def self_adjoint_eig(tensor, name=None):
     name: string, optional name of the operation.
 
   Returns:
-    e: Eigenvalues. Shape is `[..., N]`.
+    e: Eigenvalues. Shape is `[..., N]`. Sorted in non-decreasing order.
     v: Eigenvectors. Shape is `[..., N, N]`. The columns of the inner most
       matrices contain eigenvectors of the corresponding matrices in `tensor`
   """
@@ -430,7 +430,7 @@ def svd(tensor, full_matrices=False, compute_uv=True, name=None):
   u, s, v_adj = np.linalg.svd(a, full_matrices=False)
   np_a_approx = np.dot(u, np.dot(np.diag(s), v_adj))
   # tf_a_approx and np_a_approx should be numerically close.
-  ````
+  ```
   @end_compatibility
   """
   s, u, v = gen_linalg_ops.svd(
diff --git a/tensorflow/python/ops/logging_ops.py b/tensorflow/python/ops/logging_ops.py
index a7ea7dc6e100e809caebed5f03027c4d694cfdd0..222b8ebc9da6b076f012f8febbd50cc3c4c86c08 100644
--- a/tensorflow/python/ops/logging_ops.py
+++ b/tensorflow/python/ops/logging_ops.py
@@ -109,7 +109,7 @@ def histogram_summary(tag, values, collections=None, name=None):
     buffer.
   """
   with ops.name_scope(name, "HistogramSummary", [tag, values]) as scope:
-    val = gen_logging_ops._histogram_summary(
+    val = gen_logging_ops.histogram_summary(
         tag=tag, values=values, name=scope)
     _Collect(val, collections, [ops.GraphKeys.SUMMARIES])
   return val
@@ -346,7 +346,7 @@ def scalar_summary(tags, values, collections=None, name=None):
     buffer.
   """
   with ops.name_scope(name, "ScalarSummary", [tags, values]) as scope:
-    val = gen_logging_ops._scalar_summary(tags=tags, values=values, name=scope)
+    val = gen_logging_ops.scalar_summary(tags=tags, values=values, name=scope)
     _Collect(val, collections, [ops.GraphKeys.SUMMARIES])
   return val
 
diff --git a/tensorflow/python/ops/losses/BUILD b/tensorflow/python/ops/losses/BUILD
index 07741e0c3c3ea8a9bb7d790b901e743907794dc0..4aea0265a72dcd2b2358f063fb0a51a5877076e7 100644
--- a/tensorflow/python/ops/losses/BUILD
+++ b/tensorflow/python/ops/losses/BUILD
@@ -43,15 +43,3 @@ py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py
index 0840760810c86a6393ea6b4ab0b9410233275f11..34ca1adc3e13dc67560fb21d70c16cd42dc40552 100644
--- a/tensorflow/python/ops/losses/losses_impl.py
+++ b/tensorflow/python/ops/losses/losses_impl.py
@@ -194,6 +194,11 @@ def compute_weighted_loss(
   """
   Reduction.validate(reduction)
   with ops.name_scope(scope, "weighted_loss", (losses, weights)):
+    # Save the `reduction` argument for loss normalization when distributing
+    # to multiple towers.
+    # TODO(josh11b): Associate it with the returned op for more precision.
+    ops.get_default_graph()._last_loss_reduction = reduction  # pylint: disable=protected-access
+
     with ops.control_dependencies((
         weights_broadcast_ops.assert_broadcastable(weights, losses),)):
       losses = ops.convert_to_tensor(losses)
diff --git a/tensorflow/python/ops/math_grad.py b/tensorflow/python/ops/math_grad.py
index d220fe3cce186d731803d3d1c9587d9fe9ea3a28..02e07dc7b1f5fe6a671da967f6d07cef123d3d1e 100644
--- a/tensorflow/python/ops/math_grad.py
+++ b/tensorflow/python/ops/math_grad.py
@@ -41,6 +41,12 @@ def _ArgMaxGrad(op, grad):
   return [None, None]
 
 
+@ops.RegisterGradient("ArgMin")
+def _ArgMinGrad(op, grad):
+  del op, grad
+  return [None, None]
+
+
 @ops.RegisterGradient("Sum")
 def _SumGrad(op, grad):
   """Gradient for Sum."""
@@ -620,9 +626,7 @@ def _IgammaGrad(op, grad):
   x = op.inputs[1]
   sa = array_ops.shape(a)
   sx = array_ops.shape(x)
-  # pylint: disable=protected-access
-  unused_ra, rx = gen_array_ops._broadcast_gradient_args(sa, sx)
-  # pylint: enable=protected-access
+  unused_ra, rx = gen_array_ops.broadcast_gradient_args(sa, sx)
 
   # Perform operations in log space before summing, because Gamma(a)
   # and Gamma'(a) can grow large.
@@ -649,9 +653,7 @@ def _BetaincGrad(op, grad):
   # versa; so its sufficient to check against shape(a).
   sa = array_ops.shape(a)
   sx = array_ops.shape(x)
-  # pylint: disable=protected-access
-  _, rx = gen_array_ops._broadcast_gradient_args(sa, sx)
-  # pylint: enable=protected-access
+  _, rx = gen_array_ops.broadcast_gradient_args(sa, sx)
 
   # Perform operations in log space before summing, because terms
   # can grow large.
@@ -677,9 +679,7 @@ def _ZetaGrad(op, grad):
   # Broadcast gradients
   sx = array_ops.shape(x)
   sq = array_ops.shape(q)
-  # pylint: disable=protected-access
-  unused_rx, rq = gen_array_ops._broadcast_gradient_args(sx, sq)
-  # pylint: enable=protected-access
+  unused_rx, rq = gen_array_ops.broadcast_gradient_args(sx, sq)
   # Evaluate gradient
   with ops.control_dependencies([grad]):
     x = math_ops.conj(x)
@@ -699,9 +699,7 @@ def _PolygammaGrad(op, grad):
   # Broadcast gradients
   sn = array_ops.shape(n)
   sx = array_ops.shape(x)
-  # pylint: disable=protected-access
-  unused_rn, rx = gen_array_ops._broadcast_gradient_args(sn, sx)
-  # pylint: enable=protected-access
+  unused_rn, rx = gen_array_ops.broadcast_gradient_args(sn, sx)
   # Evaluate gradient
   with ops.control_dependencies([grad]):
     n = math_ops.conj(n)
@@ -841,9 +839,7 @@ def _AddGrad(op, grad):
     return grad, grad
   sx = array_ops.shape(x)
   sy = array_ops.shape(y)
-  # pylint: disable=protected-access
-  rx, ry = gen_array_ops._broadcast_gradient_args(sx, sy)
-  # pylint: enable=protected-access
+  rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
   return (array_ops.reshape(math_ops.reduce_sum(grad, rx), sx),
           array_ops.reshape(math_ops.reduce_sum(grad, ry), sy))
 
@@ -858,9 +854,7 @@ def _SubGrad(op, grad):
     return grad, -grad
   sx = array_ops.shape(x)
   sy = array_ops.shape(y)
-  # pylint: disable=protected-access
-  rx, ry = gen_array_ops._broadcast_gradient_args(sx, sy)
-  # pylint: enable=protected-access
+  rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
   return (array_ops.reshape(math_ops.reduce_sum(grad, rx), sx),
           array_ops.reshape(-math_ops.reduce_sum(grad, ry), sy))
 
@@ -870,7 +864,6 @@ def _MulGrad(op, grad):
   """The gradient of scalar multiplication."""
   x = op.inputs[0]
   y = op.inputs[1]
-  # pylint: disable=protected-access
   if (isinstance(grad, ops.Tensor) and
       _ShapesFullySpecifiedAndEqual(x, y, grad) and
       grad.dtype in (dtypes.int32, dtypes.float32)):
@@ -878,14 +871,13 @@ def _MulGrad(op, grad):
   assert x.dtype.base_dtype == y.dtype.base_dtype, (x.dtype, " vs. ", y.dtype)
   sx = array_ops.shape(x)
   sy = array_ops.shape(y)
-  rx, ry = gen_array_ops._broadcast_gradient_args(sx, sy)
+  rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
   x = math_ops.conj(x)
   y = math_ops.conj(y)
   return (array_ops.reshape(
       math_ops.reduce_sum(gen_math_ops.mul(grad, y), rx), sx),
           array_ops.reshape(
               math_ops.reduce_sum(gen_math_ops.mul(x, grad), ry), sy))
-  # pylint: enable=protected-access
 
 
 @ops.RegisterGradient("Div")
@@ -895,9 +887,7 @@ def _DivGrad(op, grad):
   y = op.inputs[1]
   sx = array_ops.shape(x)
   sy = array_ops.shape(y)
-  # pylint: disable=protected-access
-  rx, ry = gen_array_ops._broadcast_gradient_args(sx, sy)
-  # pylint: enable=protected-access
+  rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
   x = math_ops.conj(x)
   y = math_ops.conj(y)
   return (array_ops.reshape(math_ops.reduce_sum(math_ops.div(grad, y), rx), sx),
@@ -920,9 +910,7 @@ def _FloorModGrad(op, grad):
 
   sx = array_ops.shape(x)
   sy = array_ops.shape(y)
-  # pylint: disable=protected-access
-  rx, ry = gen_array_ops._broadcast_gradient_args(sx, sy)
-  # pylint: enable=protected-access
+  rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
   floor_xy = math_ops.floor_div(x, y)
   gx = array_ops.reshape(math_ops.reduce_sum(grad, rx), sx)
   gy = array_ops.reshape(
@@ -942,9 +930,7 @@ def _RealDivGrad(op, grad):
   y = op.inputs[1]
   sx = array_ops.shape(x)
   sy = array_ops.shape(y)
-  # pylint: disable=protected-access
-  rx, ry = gen_array_ops._broadcast_gradient_args(sx, sy)
-  # pylint: enable=protected-access
+  rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
   x = math_ops.conj(x)
   y = math_ops.conj(y)
   return (array_ops.reshape(
@@ -962,7 +948,7 @@ def _PowGrad(op, grad):
   z = op.outputs[0]
   sx = array_ops.shape(x)
   sy = array_ops.shape(y)
-  rx, ry = gen_array_ops._broadcast_gradient_args(sx, sy)
+  rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
   x = math_ops.conj(x)
   y = math_ops.conj(y)
   z = math_ops.conj(z)
@@ -990,7 +976,7 @@ def _MaximumMinimumGrad(op, grad, selector_op):
   gradshape = array_ops.shape(grad)
   zeros = array_ops.zeros(gradshape, gdtype)
   xmask = selector_op(x, y)
-  rx, ry = gen_array_ops._broadcast_gradient_args(sx, sy)
+  rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
   xgrad = array_ops.where(xmask, grad, zeros)
   ygrad = array_ops.where(xmask, zeros, grad)
   gx = array_ops.reshape(math_ops.reduce_sum(xgrad, rx), sx)
@@ -1017,9 +1003,7 @@ def _SquaredDifferenceGrad(op, grad):
   y = op.inputs[1]
   sx = array_ops.shape(x)
   sy = array_ops.shape(y)
-  # pylint: disable=protected-access
-  rx, ry = gen_array_ops._broadcast_gradient_args(sx, sy)
-  # pylint: enable=protected-access
+  rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
   with ops.control_dependencies([grad]):
     # The parens ensure that if grad is IndexedSlices, it'll get multiplied by
     # Tensor (not a number like 2.0) which causes it to convert to Tensor.
@@ -1183,7 +1167,7 @@ def _ComplexGrad(op, grad):
   y = op.inputs[1]
   sx = array_ops.shape(x)
   sy = array_ops.shape(y)
-  rx, ry = gen_array_ops._broadcast_gradient_args(sx, sy)
+  rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
   return (array_ops.reshape(math_ops.reduce_sum(math_ops.real(grad), rx), sx),
           array_ops.reshape(math_ops.reduce_sum(math_ops.imag(grad), ry), sy))
 
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index e18d0e95015556acc66b0b50f0c406c0e2cf41aa..b460ce5b95218083d9265d25931a55f2b59f2113 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -167,12 +167,14 @@ from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import gen_sparse_ops
 from tensorflow.python.ops import gen_spectral_ops
+from tensorflow.python.platform import tf_logging as logging
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_math_ops import *
 # pylint: enable=wildcard-import
 from tensorflow.python.util import compat
 from tensorflow.python.util import deprecation
+from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
 # Aliases for some automatically-generated names.
@@ -180,7 +182,8 @@ linspace = gen_math_ops.lin_space
 
 arg_max = deprecation.deprecated(None, "Use `argmax` instead")(arg_max)  # pylint: disable=used-before-assignment
 arg_min = deprecation.deprecated(None, "Use `argmin` instead")(arg_min)  # pylint: disable=used-before-assignment
-
+tf_export("arg_max")(arg_max)
+tf_export("arg_min")(arg_min)
 
 # This is set by resource_variable_ops.py. It is included in this way since
 # there is a circular dependency between math_ops and resource_variable_ops
@@ -773,16 +776,18 @@ def cast(x, dtype, name=None):
   with ops.name_scope(name, "Cast", [x]) as name:
     if isinstance(x, sparse_tensor.SparseTensor):
       values_cast = cast(x.values, base_type, name=name)
-      return sparse_tensor.SparseTensor(x.indices, values_cast, x.dense_shape)
+      x = sparse_tensor.SparseTensor(x.indices, values_cast, x.dense_shape)
     else:
       # TODO(josh11b): If x is not already a Tensor, we could return
       # ops.convert_to_tensor(x, dtype=dtype, ...)  here, but that
       # allows some conversions that cast() can't do, e.g. casting numbers to
       # strings.
       x = ops.convert_to_tensor(x, name="x")
-      if x.dtype.base_dtype == base_type:
-        return x
-      return gen_math_ops.cast(x, base_type, name=name)
+      if x.dtype.base_dtype != base_type:
+        x = gen_math_ops.cast(x, base_type, name=name)
+    if x.dtype.is_complex and base_type.is_floating:
+      logging.warn("Casting complex to real discards imaginary part.")
+    return x
 
 
 @tf_export("saturate_cast")
@@ -1196,7 +1201,7 @@ tf_export("floor_div")(floor_div)
 truncatemod = gen_math_ops.truncate_mod
 tf_export("truncatemod")(truncatemod)
 floormod = gen_math_ops.floor_mod
-tf_export("floormod")(floormod)
+tf_export("floormod", "mod")(floormod)
 
 
 def _mul_dispatch(x, y, name=None):
@@ -1338,8 +1343,7 @@ def _ReductionDims(x, axis, reduction_indices):
   else:
     # Fast path: avoid creating Rank and Range ops if ndims is known.
     if isinstance(x, ops.Tensor) and x._rank() is not None:  # pylint: disable=protected-access
-      return constant_op.constant(
-          np.arange(x._rank()), dtype=dtypes.int32)  # pylint: disable=protected-access
+      return constant_op.constant(np.arange(x._rank()), dtype=dtypes.int32)  # pylint: disable=protected-access
     if (isinstance(x, sparse_tensor.SparseTensor) and
         x.dense_shape.get_shape().is_fully_defined()):
       rank = x.dense_shape.get_shape()[0].value  # sparse.dense_shape is 1-D.
@@ -1517,7 +1521,7 @@ def reduce_mean(input_tensor,
     input_tensor: The tensor to reduce. Should have numeric type.
     axis: The dimensions to reduce. If `None` (the default),
       reduces all dimensions. Must be in the range
-      `[-rank(input_tensor), rank(input_tensor)]`.
+      `[-rank(input_tensor), rank(input_tensor))`.
     keepdims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
     reduction_indices: The old (deprecated) name for axis.
@@ -2093,8 +2097,9 @@ def matmul(a,
       sparse_matmul_types = [dtypes.bfloat16, dtypes.float32]
       use_sparse_matmul = (
           a.dtype in sparse_matmul_types and b.dtype in sparse_matmul_types)
-    if a.dtype == dtypes.bfloat16 or b.dtype == dtypes.bfloat16:
-      # matmul currently doesn't handle bfloat16 inputs.
+    if (a.dtype == dtypes.bfloat16 or b.dtype == dtypes.bfloat16 and
+        a.dtype != b.dtype):
+      # matmul currently doesn't handle mixed-precision inputs.
       use_sparse_matmul = True
     if use_sparse_matmul:
       ret = sparse_matmul(
@@ -2267,10 +2272,11 @@ def accumulate_n(inputs, shape=None, tensor_dtype=None, name=None):
     ValueError: If `inputs` don't all have same shape and dtype or the shape
     cannot be inferred.
   """
+
   def _input_error():
-    return ValueError(
-        "inputs must be a list of at least one Tensor with the "
-        "same dtype and shape")
+    return ValueError("inputs must be a list of at least one Tensor with the "
+                      "same dtype and shape")
+
   if not inputs or not isinstance(inputs, (list, tuple)):
     raise _input_error()
   inputs = ops.convert_n_to_tensor_or_indexed_slices(inputs)
@@ -2288,8 +2294,8 @@ def accumulate_n(inputs, shape=None, tensor_dtype=None, name=None):
 
   # tensor_dtype is for safety only; operator's output type computed in C++
   if tensor_dtype is not None and tensor_dtype != inputs[0].dtype:
-    raise TypeError("tensor_dtype is {}, but input is of type {}"
-                    .format(tensor_dtype, inputs[0].dtype))
+    raise TypeError("tensor_dtype is {}, but input is of type {}".format(
+        tensor_dtype, inputs[0].dtype))
 
   if len(inputs) == 1 and name is None:
     return inputs[0]
@@ -2755,14 +2761,14 @@ def sparse_segment_sum(data, indices, segment_ids, name=None,
         name=name)
   else:
     return gen_math_ops.sparse_segment_sum(
-        data=data,
-        indices=indices,
-        segment_ids=segment_ids,
-        name=name)
+        data=data, indices=indices, segment_ids=segment_ids, name=name)
 
 
 @tf_export("sparse_segment_mean")
-def sparse_segment_mean(data, indices, segment_ids, name=None,
+def sparse_segment_mean(data,
+                        indices,
+                        segment_ids,
+                        name=None,
                         num_segments=None):
   r"""Computes the mean along sparse segments of a tensor.
 
@@ -2799,14 +2805,14 @@ def sparse_segment_mean(data, indices, segment_ids, name=None,
         name=name)
   else:
     return gen_math_ops.sparse_segment_mean(
-        data=data,
-        indices=indices,
-        segment_ids=segment_ids,
-        name=name)
+        data=data, indices=indices, segment_ids=segment_ids, name=name)
 
 
 @tf_export("sparse_segment_sqrt_n")
-def sparse_segment_sqrt_n(data, indices, segment_ids, name=None,
+def sparse_segment_sqrt_n(data,
+                          indices,
+                          segment_ids,
+                          name=None,
                           num_segments=None):
   r"""Computes the sum along sparse segments of a tensor divided by the sqrt(N).
 
@@ -2836,10 +2842,7 @@ def sparse_segment_sqrt_n(data, indices, segment_ids, name=None,
         name=name)
   else:
     return gen_math_ops.sparse_segment_sqrt_n(
-        data=data,
-        indices=indices,
-        segment_ids=segment_ids,
-        name=name)
+        data=data, indices=indices, segment_ids=segment_ids, name=name)
 
 
 @tf_export("tensordot", "linalg.tensordot")
@@ -3010,6 +3013,47 @@ def tensordot(a, b, axes, name=None):
       return product
 
 
+@tf_export("math.polyval")
+def polyval(coeffs, x, name=None):
+  r"""Computes the elementwise value of a polynomial.
+
+  If `x` is a tensor and `coeffs` is a list n + 1 tensors, this function returns
+  the value of the n-th order polynomial
+
+     p(x) = coeffs[n-1] + coeffs[n-2] * x + ...  + coeffs[0] * x**(n-1)
+
+  evaluated using Horner's method, i.e.
+
+     p(x) = coeffs[n-1] + x * (coeffs[n-2] + ... + x * (coeffs[1] +
+            x * coeffs[0]))
+
+  Args:
+    coeffs: A list of `Tensor` representing the coefficients of the polynomial.
+    x: A `Tensor` representing the variable of the polynomial.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `tensor` of the shape as the expression p(x) with usual broadcasting rules
+    for element-wise addition and multiplication applied.
+
+  @compatibility(numpy)
+  Equivalent to numpy.polyval.
+  @end_compatibility
+  """
+
+  with ops.name_scope(name, "polyval", nest.flatten(coeffs) + [x]) as name:
+    x = ops.convert_to_tensor(x, name="x")
+    if len(coeffs) < 1:
+      return array_ops.zeros_like(x, name=name)
+    coeffs = [
+        ops.convert_to_tensor(coeff, name=("coeff_%d" % index))
+        for index, coeff in enumerate(coeffs)
+    ]
+    p = coeffs[0]
+    for c in coeffs[1:]:
+      p = c + p * x
+    return p
+
 # FFT ops were moved to tf.spectral. tf.fft symbols were part of the TensorFlow
 # 1.0 API so we leave these here for backwards compatibility.
 fft = gen_spectral_ops.fft
diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py
index 9f85188b3513563a7444f7a0e908f11af985498b..05bcee8801259e4bc6c20c3f61cf20025ba5ea33 100644
--- a/tensorflow/python/ops/math_ops_test.py
+++ b/tensorflow/python/ops/math_ops_test.py
@@ -155,9 +155,7 @@ class RoundTest(test_util.TensorFlowTestCase):
 
   @test_util.run_in_graph_and_eager_modes()
   def testRounding(self):
-    x = [0.49, 0.7, -0.3, -0.8]
-    # TODO(nolivia): Remove this when RoundOp is forwards compatible
-    # x = np.arange(-5.0, 5.0, .25)
+    x = np.arange(-5.0, 5.0, .25)
     for dtype in [np.float32, np.double, np.int32]:
       x_np = np.array(x, dtype=dtype)
       with test_util.device(use_gpu=True):
diff --git a/tensorflow/python/ops/metrics_impl.py b/tensorflow/python/ops/metrics_impl.py
index 9ec49545796cfa7a603b31c23bfd0d495639898d..47eea6ef6b58abd4819544e29783048964104922 100644
--- a/tensorflow/python/ops/metrics_impl.py
+++ b/tensorflow/python/ops/metrics_impl.py
@@ -33,6 +33,7 @@ from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import weights_broadcast_ops
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.tf_export import tf_export
 
@@ -626,10 +627,16 @@ def auc(labels,
     curve: Specifies the name of the curve to be computed, 'ROC' [default] or
       'PR' for the Precision-Recall-curve.
     name: An optional variable_scope name.
-    summation_method: Specifies the Riemann summation method used, 'trapezoidal'
-      [default] that applies the trapezoidal rule, 'minoring' that applies
-      left summation for increasing intervals and right summation for decreasing
-      intervals or 'majoring' that applies the opposite.
+    summation_method: Specifies the Riemann summation method used
+      (https://en.wikipedia.org/wiki/Riemann_sum): 'trapezoidal' [default] that
+      applies the trapezoidal rule; 'careful_interpolation', a variant of it
+      differing only by a more correct interpolation scheme for PR-AUC -
+      interpolating (true/false) positives but not the ratio that is precision;
+      'minoring' that applies left summation for increasing intervals and right
+      summation for decreasing intervals; 'majoring' that does the opposite.
+      Note that 'careful_interpolation' is strictly preferred to 'trapezoidal'
+      (to be deprecated soon) as it applies the same method for ROC, and a
+      better one (see Davis & Goadrich 2006 for details) for the PR curve.
 
   Returns:
     auc: A scalar `Tensor` representing the current area-under-curve.
@@ -664,8 +671,62 @@ def auc(labels,
     # Add epsilons to avoid dividing by 0.
     epsilon = 1.0e-6
 
+    def interpolate_pr_auc(tp, fp, fn):
+      """Interpolation formula inspired by section 4 of Davis & Goadrich 2006.
+
+      Note here we derive & use a closed formula not present in the paper
+      - as follows:
+      Modeling all of TP (true positive weight),
+      FP (false positive weight) and their sum P = TP + FP (positive weight)
+      as varying linearly within each interval [A, B] between successive
+      thresholds, we get
+        Precision = (TP_A + slope * (P - P_A)) / P
+      with slope = dTP / dP = (TP_B - TP_A) / (P_B - P_A).
+      The area within the interval is thus (slope / total_pos_weight) times
+        int_A^B{Precision.dP} = int_A^B{(TP_A + slope * (P - P_A)) * dP / P}
+        int_A^B{Precision.dP} = int_A^B{slope * dP + intercept * dP / P}
+      where intercept = TP_A - slope * P_A = TP_B - slope * P_B, resulting in
+        int_A^B{Precision.dP} = TP_B - TP_A + intercept * log(P_B / P_A)
+      Bringing back the factor (slope / total_pos_weight) we'd put aside, we get
+         slope * [dTP + intercept *  log(P_B / P_A)] / total_pos_weight
+      where dTP == TP_B - TP_A.
+      Note that when P_A == 0 the above calculation simplifies into
+        int_A^B{Precision.dTP} = int_A^B{slope * dTP} = slope * (TP_B - TP_A)
+      which is really equivalent to imputing constant precision throughout the
+      first bucket having >0 true positives.
+
+      Args:
+        tp: true positive counts
+        fp: false positive counts
+        fn: false negative counts
+      Returns:
+        pr_auc: an approximation of the area under the P-R curve.
+      """
+      dtp = tp[:num_thresholds - 1] - tp[1:]
+      p = tp + fp
+      prec_slope = _safe_div(dtp, p[:num_thresholds - 1] - p[1:], 'prec_slope')
+      intercept = tp[1:] - math_ops.multiply(prec_slope, p[1:])
+      safe_p_ratio = array_ops.where(
+          math_ops.logical_and(p[:num_thresholds - 1] > 0, p[1:] > 0),
+          _safe_div(p[:num_thresholds - 1], p[1:], 'recall_relative_ratio'),
+          array_ops.ones_like(p[1:]))
+      return math_ops.reduce_sum(
+          _safe_div(
+              prec_slope * (dtp + intercept * math_ops.log(safe_p_ratio)),
+              tp[1:] + fn[1:],
+              name='pr_auc_increment'),
+          name='interpolate_pr_auc')
+
     def compute_auc(tp, fn, tn, fp, name):
       """Computes the roc-auc or pr-auc based on confusion counts."""
+      if curve == 'PR':
+        if summation_method == 'trapezoidal':
+          logging.warning(
+              'Trapezoidal rule is known to produce incorrect PR-AUCs; '
+              'please switch to "careful_interpolation" instead.')
+        elif summation_method == 'careful_interpolation':
+          # This one is a bit tricky and is handled separately.
+          return interpolate_pr_auc(tp, fp, fn)
       rec = math_ops.div(tp + epsilon, tp + fn + epsilon)
       if curve == 'ROC':
         fp_rate = math_ops.div(fp, fp + tn + epsilon)
@@ -675,7 +736,9 @@ def auc(labels,
         prec = math_ops.div(tp + epsilon, tp + fp + epsilon)
         x = rec
         y = prec
-      if summation_method == 'trapezoidal':
+      if summation_method in ('trapezoidal', 'careful_interpolation'):
+        # Note that the case ('PR', 'careful_interpolation') has been handled
+        # above.
         return math_ops.reduce_sum(
             math_ops.multiply(x[:num_thresholds - 1] - x[1:],
                               (y[:num_thresholds - 1] + y[1:]) / 2.),
@@ -923,8 +986,8 @@ def mean_per_class_accuracy(labels,
         weights = array_ops.reshape(weights, [-1])
       weights = math_ops.to_float(weights)
 
-      is_correct = is_correct * weights
-      ones = ones * weights
+      is_correct *= weights
+      ones *= weights
 
     update_total_op = state_ops.scatter_add(total, labels, ones)
     update_count_op = state_ops.scatter_add(count, labels, is_correct)
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index 9b2aaa4c1c913ca01a6eff14798438da9ba2be05..07ca32953f796466964d4555e45052fcf3c53ce0 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -150,14 +150,12 @@ class _NonAtrousConvolution(object):
                                                               conv_dims))
     if conv_dims == 1:
       # conv1d uses the 2-d data format names
-      if data_format is None or data_format == "NWC":
-        data_format_2d = "NHWC"
-      elif data_format == "NCW":
-        data_format_2d = "NCHW"
-      else:
+      if data_format is None:
+        data_format = "NWC"
+      elif data_format not in {"NCW", "NWC", "NCHW", "NHWC"}:
         raise ValueError("data_format must be \"NWC\" or \"NCW\".")
       self.strides = strides[0]
-      self.data_format = data_format_2d
+      self.data_format = data_format
       self.conv_op = self._conv1d
     elif conv_dims == 2:
       if data_format is None or data_format == "NHWC":
@@ -1810,7 +1808,7 @@ def softmax_cross_entropy_with_logits_v2(
   or `float64`).
 
   Backpropagation will happen into both `logits` and `labels`.  To disallow
-  backpropagation into `labels`, pass label tensors through a `stop_gradients`
+  backpropagation into `labels`, pass label tensors through @{tf.stop_gradient}
   before feeding it to this function.
 
   **Note that to avoid confusion, it is required to pass only named arguments to
@@ -1838,8 +1836,10 @@ def softmax_cross_entropy_with_logits_v2(
                       [logits, labels]) as name:
     logits = ops.convert_to_tensor(logits, name="logits")
     labels = ops.convert_to_tensor(labels, name="labels")
+    convert_to_float32 = (
+        logits.dtype == dtypes.float16 or logits.dtype == dtypes.bfloat16)
     precise_logits = math_ops.cast(
-        logits, dtypes.float32) if (logits.dtype == dtypes.float16) else logits
+        logits, dtypes.float32) if convert_to_float32 else logits
     # labels and logits must be of the same type
     labels = math_ops.cast(labels, precise_logits.dtype)
     input_rank = array_ops.rank(precise_logits)
@@ -1885,8 +1885,8 @@ def softmax_cross_entropy_with_logits_v2(
       del shape[dim]
       cost.set_shape(shape)
 
-    if logits.dtype == dtypes.float16:
-      return math_ops.cast(cost, dtypes.float16)
+    if convert_to_float32:
+      return math_ops.cast(cost, logits.dtype)
     else:
       return cost
 
@@ -1895,7 +1895,7 @@ _XENT_DEPRECATION = """
 Future major versions of TensorFlow will allow gradients to flow
 into the labels input on backprop by default.
 
-See tf.nn.softmax_cross_entropy_with_logits_v2.
+See @{tf.nn.softmax_cross_entropy_with_logits_v2}.
 """
 
 
diff --git a/tensorflow/python/ops/nn_test.py b/tensorflow/python/ops/nn_test.py
index af9dae2aa64f0994f403ac81dcba800699d3c960..46a5f4fae6b15766c21011ebeae5437262192df7 100644
--- a/tensorflow/python/ops/nn_test.py
+++ b/tensorflow/python/ops/nn_test.py
@@ -852,6 +852,57 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
       self.assertAllClose(exp_sampled_softmax_loss,
                           got_sampled_softmax_loss.eval(), 1e-4)
 
+  def testSampledSoftmaxLossBf16(self):
+    # A simple test to verify the numerics for bfloat16.
+    def _SoftmaxCrossEntropyWithLogits(logits, targets):
+      # logits, targets: float arrays of the same shape.
+      assert logits.shape == targets.shape
+      stable_exp_logits = np.exp(
+          logits - np.amax(logits, axis=1, keepdims=True))
+      pred = stable_exp_logits / np.sum(stable_exp_logits, 1, keepdims=True)
+      return -np.sum(targets * np.log(pred + 1.0e-20), axis=1)
+
+    np.random.seed(0)
+    num_classes = 5
+    batch_size = 3
+    labels = [0, 1, 2]
+    sampled = [1, 0, 2, 3]
+    (weights, biases, hidden_acts, _, exp_logits,
+     exp_labels) = self._GenerateTestData(
+         num_classes=num_classes,
+         dim=10,
+         batch_size=batch_size,
+         num_true=1,
+         labels=labels,
+         sampled=sampled,
+         subtract_log_q=True)
+    exp_sampled_softmax_loss = _SoftmaxCrossEntropyWithLogits(
+        exp_logits, exp_labels)
+
+    with self.test_session():
+      true_exp_bf16 = np.full(
+          [batch_size, 1], fill_value=0.5, dtype=dtypes.bfloat16.as_numpy_dtype)
+      sampled_exp_bf16 = np.full(
+          [len(sampled)], fill_value=0.5, dtype=dtypes.bfloat16.as_numpy_dtype)
+      sampled_vals_bf16 = (sampled, true_exp_bf16, sampled_exp_bf16)
+
+      got_sampled_softmax_loss = math_ops.cast(
+          nn_impl.sampled_softmax_loss(
+              weights=constant_op.constant(weights, dtype=dtypes.bfloat16),
+              biases=constant_op.constant(biases, dtype=dtypes.bfloat16),
+              labels=constant_op.constant(
+                  labels, shape=(batch_size, 1), dtype=dtypes.bfloat16),
+              inputs=constant_op.constant(hidden_acts, dtype=dtypes.bfloat16),
+              num_sampled=4,
+              num_classes=num_classes,
+              num_true=1,
+              sampled_values=sampled_vals_bf16,
+              remove_accidental_hits=False,
+              partition_strategy="div"), dtypes.float32)
+
+      self.assertAllClose(exp_sampled_softmax_loss,
+                          got_sampled_softmax_loss.eval(), 1e-1)
+
 
 class CReluTest(test_lib.TestCase):
 
@@ -1030,6 +1081,42 @@ class DataFormatDimMapTest(test_lib.TestCase):
     self._test([1, -3, -2], [2, 2, 3])
     self._test([[1, -3], [1, -1]], [[2, 2], [2, 1]])
 
+  def testNHWCtoNCHW(self):
+    x_val = [1, -3, -2]
+    y_val_expected = [2, 2, 3]
+    x = constant_op.constant(x_val)
+    y = nn_ops.data_format_dim_map(x, src_format="NHWC", dst_format="NCHW")
+    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
+      y_val = sess.run(y)
+      self.assertAllEqual(y_val, y_val_expected)
+
+  def testNHWCtoHWNC(self):
+    x_val = [-4, -3, -2, -1, 0, 1, 2, 3]
+    y_val_expected = [2, 0, 1, 3, 2, 0, 1, 3]
+    x = constant_op.constant(x_val)
+    y = nn_ops.data_format_dim_map(x, src_format="NHWC", dst_format="HWNC")
+    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
+      y_val = sess.run(y)
+      self.assertAllEqual(y_val, y_val_expected)
+
+  def testNHWCtoWHCN(self):
+    x_val = [-4, -3, -2, -1, 0, 1, 2, 3]
+    y_val_expected = [3, 1, 0, 2, 3, 1, 0, 2]
+    x = constant_op.constant(x_val)
+    y = nn_ops.data_format_dim_map(x, src_format="NHWC", dst_format="WHCN")
+    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
+      y_val = sess.run(y)
+      self.assertAllEqual(y_val, y_val_expected)
+
+  def testArbitraryASCII(self):
+    x_val = [-4, -3, -2, -1, 0, 1, 2, 3]
+    y_val_expected = [3, 2, 1, 0, 3, 2, 1, 0]
+    x = constant_op.constant(x_val)
+    y = nn_ops.data_format_dim_map(x, src_format="qwer", dst_format="rewq")
+    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
+      y_val = sess.run(y)
+      self.assertAllEqual(y_val, y_val_expected)
+
 
 class DataFormatVectorPermuteTest(test_lib.TestCase):
 
diff --git a/tensorflow/python/ops/random_ops.py b/tensorflow/python/ops/random_ops.py
index db8159579a21d9b98b06b6172f8d0df7e8ff95ca..6a2dd3f1cd55eea1d3b652a31cd2784c411c2ce0 100644
--- a/tensorflow/python/ops/random_ops.py
+++ b/tensorflow/python/ops/random_ops.py
@@ -209,7 +209,7 @@ def random_uniform(shape,
     maxval: A 0-D Tensor or Python value of type `dtype`. The upper bound on
       the range of random values to generate.  Defaults to 1 if `dtype` is
       floating point.
-    dtype: The type of the output: 'float16`, `float32`, `float64`, `int32`,
+    dtype: The type of the output: `float16`, `float32`, `float64`, `int32`,
       or `int64`.
     seed: A Python integer. Used to create a random seed for the distribution.
       See @{tf.set_random_seed}
diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index 54191ee765c642a9fe8f040eb3c22d1abdff293b..07e25e540c808c6c991e1603f779921beba62dfd 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -46,10 +46,6 @@ def _eager_safe_variable_handle(shape, dtype, shared_name, name, graph_mode):
   container = ops.get_default_graph()._container  # pylint: disable=protected-access
   if container is None:
     container = ""
-  if not graph_mode:
-    # When in eager mode use a uid for the shared_name, to prevent accidental
-    # sharing.
-    shared_name = str(ops.uid())
   handle = gen_resource_variable_ops.var_handle_op(shape=shape, dtype=dtype,
                                                    shared_name=shared_name,
                                                    name=name,
@@ -153,7 +149,7 @@ def shape_safe_assign_variable_handle(handle, shape, value, name=None):
 class ResourceVariable(variables.Variable):
   """Variable based on resource handles.
 
-  See the @{$python/state_ops$`Variables`} documentation for more details.
+  See the @{$variables$Variables How To} for a high level overview.
 
   A `ResourceVariable` allows you to maintain state across subsequent calls to
   session.run.
@@ -175,7 +171,9 @@ class ResourceVariable(variables.Variable):
   to see all modifications to the value of the variable which happen in any
   operation on which the read_value depends on (either directly, indirectly, or
   via a control dependency) and guaranteed to not see any modification to the
-  value of the variable on which the read_value operation does not depend on.
+  value of the variable from operations that depend on the read_value operation.
+  Updates from operations that have no dependency relationship to the read_value
+  operation might or might not be visible to read_value.
 
   For example, if there is more than one assignment to a ResourceVariable in
   a single session.run call there is a well-defined value for each operation
@@ -183,24 +181,20 @@ class ResourceVariable(variables.Variable):
   by edges in the graph. Consider the following example, in which two writes
   can cause tf.Variable and tf.ResourceVariable to behave differently:
 
-   ```python
-    a = tf.ResourceVariable(1.0)
-    a.initializer.run()
-
-    assign = a.assign(2.0)
-    with tf.control_dependencies([assign]):
-      b = a.read_value()
-    with tf.control_dependencies([b]):
-      other_assign = a.assign(3.0)
-    with tf.control_dependencies([other_assign]):
-      # Will print 2.0 because the value was read before other_assign ran. If
-      # `a` was a tf.Variable instead, 2.0 or 3.0 could be printed.
-      tf.Print(b, [b]).eval()
+  ```python
+  a = tf.ResourceVariable(1.0)
+  a.initializer.run()
+
+  assign = a.assign(2.0)
+  with tf.control_dependencies([assign]):
+    b = a.read_value()
+  with tf.control_dependencies([b]):
+    other_assign = a.assign(3.0)
+  with tf.control_dependencies([other_assign]):
+    # Will print 2.0 because the value was read before other_assign ran. If
+    # `a` was a tf.Variable instead, 2.0 or 3.0 could be printed.
+    tf.Print(b, [b]).eval()
   ```
-
-  To enforce these consistency properties tf.ResourceVariable might make more
-  copies than an equivalent tf.Variable under the hood, so tf.Variable is still
-  not deprecated.
   """
 
   def __init__(self,
@@ -368,6 +362,12 @@ class ResourceVariable(variables.Variable):
                           if init_from_fn else [initial_value]) as name:
         # pylint: disable=protected-access
         handle_name = ops._name_from_scope_name(name)
+        if self._in_graph_mode:
+          shared_name = handle_name
+        else:
+          # When in eager mode use a uid for the shared_name, to prevent
+          # accidental sharing.
+          shared_name = "%s_%d" % (handle_name, ops.uid())
         if init_from_fn:
           # Use attr_scope and device(None) to simulate the behavior of
           # colocate_with when the variable we want to colocate with doesn't
@@ -383,7 +383,7 @@ class ResourceVariable(variables.Variable):
               self._handle = _eager_safe_variable_handle(
                   shape=initial_value.get_shape(),
                   dtype=initial_value.dtype.base_dtype,
-                  shared_name=handle_name,
+                  shared_name=shared_name,
                   name=name,
                   graph_mode=self._in_graph_mode)
               self._shape = initial_value.get_shape()
@@ -395,7 +395,7 @@ class ResourceVariable(variables.Variable):
             self._handle = _eager_safe_variable_handle(
                 shape=initial_value.get_shape(),
                 dtype=initial_value.dtype.base_dtype,
-                shared_name=handle_name,
+                shared_name=shared_name,
                 name=name,
                 graph_mode=False)
             self._shape = initial_value.get_shape()
@@ -418,11 +418,12 @@ class ResourceVariable(variables.Variable):
           self._handle = _eager_safe_variable_handle(
               shape=initial_value.get_shape(),
               dtype=initial_value.dtype.base_dtype,
-              shared_name=handle_name,
+              shared_name=shared_name,
               name=name,
               graph_mode=self._in_graph_mode)
           self._shape = initial_value.get_shape()
 
+        self._unique_id = shared_name
         self._initial_value = initial_value if self._in_graph_mode else None
         self._handle_name = handle_name + ":0"
         self._dtype = initial_value.dtype.base_dtype
@@ -503,6 +504,7 @@ class ResourceVariable(variables.Variable):
     self._shape = tensor_shape.TensorShape(
         self._handle.op.get_attr("shape"))
     self._handle_name = self._handle.name
+    self._unique_id = self._handle_name
     self._initializer_op = g.as_graph_element(
         ops.prepend_name_scope(
             variable_def.initializer_name, import_scope=import_scope))
@@ -851,7 +853,8 @@ class ResourceVariable(variables.Variable):
       tape.watch_variable(self)
     return _UnreadVariable(
         self._handle, self.dtype, self._shape, self._in_graph_mode,
-        self._handle_deleter if not self._in_graph_mode else None, op)
+        self._handle_deleter if not self._in_graph_mode else None, op,
+        self._unique_id)
 
   def assign(self, value, use_locking=None, name=None, read_value=True):
     """Assigns a new value to this variable.
@@ -966,7 +969,7 @@ class _UnreadVariable(ResourceVariable):
   """
 
   def __init__(self, handle, dtype,  # pylint: disable=super-init-not-called
-               shape, in_graph_mode, deleter, parent_op):
+               shape, in_graph_mode, deleter, parent_op, unique_id):
     # We do not call super init on purpose.
     self._trainable = False
     self._save_slice_info = None
@@ -979,6 +982,7 @@ class _UnreadVariable(ResourceVariable):
       self._handle_name = ""
     else:
       self._handle_name = self._handle.name
+    self._unique_id = unique_id
     self._dtype = dtype
     self._constraint = None
     self._cached_value = None
@@ -1085,6 +1089,11 @@ ops.register_proto_function(
     proto_type=variable_pb2.VariableDef,
     to_proto=_to_proto_fn,
     from_proto=_from_proto_fn)
+ops.register_proto_function(
+    ops.GraphKeys.GLOBAL_STEP,
+    proto_type=variable_pb2.VariableDef,
+    to_proto=_to_proto_fn,
+    from_proto=_from_proto_fn)
 
 
 def is_resource_variable(var):
diff --git a/tensorflow/python/ops/rnn.py b/tensorflow/python/ops/rnn.py
index 42af7f8b274c555e6375ab8e937a8cc06ffbaa8e..1dd464d51d9d1b17bf9e2741668117bf014d9453 100644
--- a/tensorflow/python/ops/rnn.py
+++ b/tensorflow/python/ops/rnn.py
@@ -49,24 +49,21 @@ _concat = rnn_cell_impl._concat
 
 
 def _transpose_batch_time(x):
-  """Transpose the batch and time dimensions of a Tensor.
+  """Transposes the batch and time dimensions of a Tensor.
 
-  Retains as much of the static shape information as possible.
+  If the input tensor has rank < 2 it returns the original tensor. Retains as
+  much of the static shape information as possible.
 
   Args:
-    x: A tensor of rank 2 or higher.
+    x: A Tensor.
 
   Returns:
     x transposed along the first two dimensions.
-
-  Raises:
-    ValueError: if `x` is rank 1 or lower.
   """
   x_static_shape = x.get_shape()
   if x_static_shape.ndims is not None and x_static_shape.ndims < 2:
-    raise ValueError(
-        "Expected input tensor %s to have rank at least 2, but saw shape: %s" %
-        (x, x_static_shape))
+    return x
+
   x_rank = array_ops.rank(x)
   x_t = array_ops.transpose(
       x, array_ops.concat(
diff --git a/tensorflow/python/ops/script_ops.py b/tensorflow/python/ops/script_ops.py
index fb59bbba5eb7a668b6b959c8519f2b52f0ce26fb..96fb0247157851b8bd931142b048b3df4da65503 100644
--- a/tensorflow/python/ops/script_ops.py
+++ b/tensorflow/python/ops/script_ops.py
@@ -36,6 +36,7 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import gen_script_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
@@ -54,6 +55,16 @@ class EagerFunc(object):
     self._func = func
     self._out_dtypes = Tout
 
+  def _convert(self, value, dtype):
+    if isinstance(value, resource_variable_ops.ResourceVariable):
+      raise RuntimeError(
+          "Attempting to return a variable from an eagerly executed py_func. "
+          "Only numeric data structures like Tensors or NumPy arrays should "
+          "be returned; to return the value of a variable, make sure to obtain "
+          "the Tensor backing it by calling `.read_value()` on the variable in "
+          "question: %s" % value)
+    return ops.convert_to_tensor(value, dtype=dtype)
+
   def __call__(self, on_gpu, args):
     """Passes `args` to `self._func`, which is executed eagerly."""
     with context.eager_mode():
@@ -61,14 +72,13 @@ class EagerFunc(object):
       maybe_copy_to_gpu = lambda x: x if not on_gpu else x.gpu()
       if isinstance(ret, (tuple, list)):
         return [
-            maybe_copy_to_gpu(ops.convert_to_tensor(x, dtype=dtype))
+            maybe_copy_to_gpu(self._convert(x, dtype=dtype))
             for (x, dtype) in zip(ret, self._out_dtypes)
         ]
       elif ret is None:
         return ret
       else:
-        return maybe_copy_to_gpu(
-            ops.convert_to_tensor(ret, dtype=self._out_dtypes[0]))
+        return maybe_copy_to_gpu(self._convert(ret, dtype=self._out_dtypes[0]))
 
 
 class FuncRegistry(object):
@@ -324,7 +334,11 @@ def py_func(func, inp, Tout, stateful=True, name=None):
     result = func(*[x.numpy() for x in inp])
     result = nest.flatten(result)
 
-    return [x if x is None else ops.convert_to_tensor(x) for x in result]
+    result = [x if x is None else ops.convert_to_tensor(x) for x in result]
+    if len(result) == 1:
+      # Mimic the automatic unwrapping in graph-mode py_func
+      result, = result
+    return result
 
   return _internal_py_func(
       func=func, inp=inp, Tout=Tout, stateful=stateful, eager=False, name=name)
diff --git a/tensorflow/python/ops/sets_impl.py b/tensorflow/python/ops/sets_impl.py
index b0eecd8a1e812857de8f47e1370e4fc5f1004bc0..21e08d03d213c173d12dfc6676fe7f009811e93f 100644
--- a/tensorflow/python/ops/sets_impl.py
+++ b/tensorflow/python/ops/sets_impl.py
@@ -247,7 +247,7 @@ def set_difference(a, b, aminusb=True, validate_indices=True):
     #
     # collections.OrderedDict([
     #     ((0, 0, 0), 2),
-    #     ((0, 0, 1), 3),
+    #     ((0, 1, 0), 3),
     # ])
   ```
 
diff --git a/tensorflow/python/ops/standard_ops.py b/tensorflow/python/ops/standard_ops.py
index 60a98aca7f822802363de0efd0d2974e45dc810e..e90ff0746a8e86b4b462b71028fd677632c9075d 100644
--- a/tensorflow/python/ops/standard_ops.py
+++ b/tensorflow/python/ops/standard_ops.py
@@ -80,6 +80,8 @@ from tensorflow.python.ops.state_ops import scatter_add
 from tensorflow.python.ops.state_ops import scatter_div
 from tensorflow.python.ops.state_ops import scatter_mul
 from tensorflow.python.ops.state_ops import scatter_sub
+from tensorflow.python.ops.state_ops import scatter_min
+from tensorflow.python.ops.state_ops import scatter_max
 from tensorflow.python.ops.state_ops import scatter_update
 from tensorflow.python.ops.state_ops import scatter_nd_add
 from tensorflow.python.ops.state_ops import scatter_nd_sub
@@ -218,6 +220,7 @@ _allowed_symbols_gradients = [
     # Documented in training.py:
     # Not importing training.py to avoid complex graph dependencies.
     "AggregationMethod",
+    "GradientTape",
     "custom_gradient",
     "gradients",  # tf.gradients = gradients.gradients
     "hessians",
diff --git a/tensorflow/python/ops/state_ops.py b/tensorflow/python/ops/state_ops.py
index c3ad5831b4dfa2160a198429c60c7a6ac00f6357..f6a11ca625b46cd088c3764039a10bc72619d1f8 100644
--- a/tensorflow/python/ops/state_ops.py
+++ b/tensorflow/python/ops/state_ops.py
@@ -63,6 +63,8 @@
 @@scatter_nd_update
 @@scatter_sub
 @@scatter_update
+@@scatter_min
+@@scatter_max
 @@sparse_mask
 @@tables_initializer
 @@trainable_variables
@@ -421,3 +423,55 @@ def scatter_nd_update(ref, indices, updates, use_locking=True, name=None):
       ref.handle, indices, ops.convert_to_tensor(updates, dtype=ref.dtype),
       use_locking, name)]):
     return ref.read_value()
+
+
+@tf_export("scatter_add")
+def scatter_add(ref, indices, updates, use_locking=False, name=None):
+  # pylint: disable=line-too-long
+  r"""Adds sparse updates to the variable referenced by `resource`.
+
+  This operation computes
+
+  ```python
+      # Scalar indices
+      ref[indices, ...] += updates[...]
+
+      # Vector indices (for each i)
+      ref[indices[i], ...] += updates[i, ...]
+
+      # High rank indices (for each i, ..., j)
+      ref[indices[i, ..., j], ...] += updates[i, ..., j, ...]
+  ```
+
+  This operation outputs `ref` after the update is done.
+  This makes it easier to chain operations that need to use the updated value.
+  Duplicate entries are handled correctly: if multiple `indices` reference
+  the same location, their contributions add.
+
+  Requires `updates.shape = indices.shape + ref.shape[1:]`.
+
+  <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+  </div>
+
+  Args:
+    ref: A `Variable`.
+    indices: A `Tensor`. Must be one of the following types: `int32`, `int64`.
+      A tensor of indices into the first dimension of `ref`.
+    updates: A `Tensor`. Must have the same type as `ref`.
+      A tensor of updated values to store in `ref`.
+    use_locking: An optional `bool`. Defaults to `True`.
+      If True, the assignment will be protected by a lock;
+      otherwise the behavior is undefined, but may exhibit less contention.
+    name: A name for the operation (optional).
+
+  Returns:
+    Same as `ref`.  Returned as a convenience for operations that want
+    to use the updated values after the update is done.
+  """
+  if ref.dtype._is_ref_dtype:
+    return gen_state_ops.scatter_add(ref, indices, updates,
+                                     use_locking=use_locking, name=name)
+  return ref._lazy_read(gen_resource_variable_ops.resource_scatter_add(  # pylint: disable=protected-access
+      ref.handle, indices, ops.convert_to_tensor(updates, ref.dtype),
+      name=name))
diff --git a/tensorflow/python/ops/summary_op_util.py b/tensorflow/python/ops/summary_op_util.py
index 37b80d5e20bf06c041a669c14ac6d88201af2180..a793f634bda06ad43991fb978f865a2c5fe25437 100644
--- a/tensorflow/python/ops/summary_op_util.py
+++ b/tensorflow/python/ops/summary_op_util.py
@@ -23,6 +23,7 @@ import re
 
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import tf_logging
+from tensorflow.python.training import distribute
 
 
 def collect(val, collections, default_collections):
@@ -42,6 +43,16 @@ def collect(val, collections, default_collections):
 _INVALID_TAG_CHARACTERS = re.compile(r'[^-/\w\.]')
 
 
+def skip_summary():
+  # If using multiple towers in distributed strategy, skip summaries on all
+  # towers except the first one (tower_id=0).
+  # TODO(priyag): Add a new optional argument that will provide multiple
+  # alternatives to override default behavior. (e.g. run on last tower,
+  # compute sum or mean across towers).
+  tower_context = distribute.get_tower_context()
+  return tower_context and tower_context.tower_id > 0
+
+
 def clean_tag(name):
   """Cleans a tag. Removes illegal characters for instance.
 
diff --git a/tensorflow/python/ops/summary_ops.py b/tensorflow/python/ops/summary_ops.py
index 037bc9845a3f734f65b73b0c4b4ca19fb653731d..ec4d4a6e9242107fd7f4bebe1416198457e32cee 100644
--- a/tensorflow/python/ops/summary_ops.py
+++ b/tensorflow/python/ops/summary_ops.py
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.core.framework import summary_pb2
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import gen_logging_ops
 from tensorflow.python.ops import summary_op_util
@@ -71,6 +72,8 @@ def tensor_summary(name,
 
   serialized_summary_metadata = summary_metadata.SerializeToString()
 
+  if summary_op_util.skip_summary():
+    return constant_op.constant("")
   with summary_op_util.summary_scope(
       name, family, values=[tensor]) as (tag, scope):
     val = gen_logging_ops.tensor_summary_v2(
diff --git a/tensorflow/python/ops/template.py b/tensorflow/python/ops/template.py
index 0a391d896a388f7b40b574f6e1367e9d22fb1d0a..0294ecee548d1e7f507a5e4195e4ee320a0b9918 100644
--- a/tensorflow/python/ops/template.py
+++ b/tensorflow/python/ops/template.py
@@ -583,7 +583,7 @@ class _EagerTemplateVariableStore(object):
       if self._variable_scope_name is None:
         raise RuntimeError("A variable scope must be set before an "
                            "_EagerTemplateVariableStore object exits.")
-      self._eager_variable_store._store.close_variable_subscopes(  # pylint: disable=protected-access
+      variable_scope.get_variable_scope_store().close_variable_subscopes(
           self._variable_scope_name)
 
   def _variables_in_scope(self, variable_list):
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index c1af8ff8d3108d60b9007552ef1aadf98fea32e6..e33085ba626a7645be64941dd4da8e6943292e7e 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -24,6 +24,7 @@ import copy
 import enum  # pylint: disable=g-bad-import-order
 import functools
 import sys
+import threading
 import traceback
 
 import six
@@ -211,23 +212,8 @@ class _VariableStore(object):
     """Create a variable store."""
     self._vars = {}  # A dictionary of the stored TensorFlow variables.
     self._partitioned_vars = {}  # A dict of the stored PartitionedVariables.
-    self.variable_scopes_count = {}  # Count re-used variable scopes.
     self._store_eager_variables = False
 
-  def open_variable_scope(self, scope_name):
-    if scope_name in self.variable_scopes_count:
-      self.variable_scopes_count[scope_name] += 1
-    else:
-      self.variable_scopes_count[scope_name] = 1
-
-  def close_variable_subscopes(self, scope_name):
-    for k in self.variable_scopes_count:
-      if not scope_name or k.startswith(scope_name + "/"):
-        self.variable_scopes_count[k] = 0
-
-  def variable_scope_count(self, scope_name):
-    return self.variable_scopes_count.get(scope_name, 0)
-
   def get_variable(self, name, shape=None, dtype=dtypes.float32,
                    initializer=None, regularizer=None, reuse=None,
                    trainable=True, collections=None, caching_device=None,
@@ -1160,18 +1146,49 @@ class VariableScope(object):
 
 
 _VARSTORE_KEY = ("__variable_store",)
-_VARSCOPE_KEY = ("__varscope",)
+_VARSCOPESTORE_KEY = ("__varscope",)
+
+
+class _VariableScopeStore(threading.local):
+  """A thread local store for the current variable scope and scope counts."""
+
+  def __init__(self):
+    super(_VariableScopeStore, self).__init__()
+    self.current_scope = VariableScope(False)
+    self.variable_scopes_count = {}
+
+  def open_variable_scope(self, scope_name):
+    if scope_name in self.variable_scopes_count:
+      self.variable_scopes_count[scope_name] += 1
+    else:
+      self.variable_scopes_count[scope_name] = 1
+
+  def close_variable_subscopes(self, scope_name):
+    for k in list(self.variable_scopes_count.keys()):
+      if not scope_name or k.startswith(scope_name + "/"):
+        self.variable_scopes_count[k] = 0
+
+  def variable_scope_count(self, scope_name):
+    return self.variable_scopes_count.get(scope_name, 0)
+
+
+def get_variable_scope_store():
+  """Returns the variable scope store for current thread."""
+  scope_store = ops.get_collection(_VARSCOPESTORE_KEY)
+
+  if not scope_store:
+    scope_store = _VariableScopeStore()
+    ops.add_to_collection(_VARSCOPESTORE_KEY, scope_store)
+  else:
+    scope_store = scope_store[0]
+
+  return scope_store
 
 
 @tf_export("get_variable_scope")
 def get_variable_scope():
   """Returns the current variable scope."""
-  scope = ops.get_collection(_VARSCOPE_KEY)
-  if scope:  # This collection has at most 1 element, the default scope at [0].
-    return scope[0]
-  scope = VariableScope(False)
-  ops.add_to_collection(_VARSCOPE_KEY, scope)
-  return scope
+  return get_variable_scope_store().current_scope
 
 
 def _get_default_variable_store():
@@ -1575,10 +1592,8 @@ class _pure_variable_scope(object):  # pylint: disable=invalid-name
     self._dtype = dtype
     self._use_resource = use_resource
     self._constraint = constraint
-    get_variable_scope()  # Ensure that a default exists, then get a pointer.
-    # Get the reference to the collection as we want to modify it in place.
-    self._default_varscope = ops.get_collection_ref(_VARSCOPE_KEY)
     self._var_store = _get_default_variable_store()
+    self._var_scope_store = get_variable_scope_store()
     if isinstance(self._name_or_scope, VariableScope):
       self._new_name = self._name_or_scope.name
       name_scope = self._name_or_scope._name_scope  # pylint: disable=protected-access
@@ -1626,10 +1641,11 @@ class _pure_variable_scope(object):  # pylint: disable=invalid-name
         a reuse scope, or if reuse is not `None` or `True`.
       TypeError: when the types of some arguments are not appropriate.
     """
-    self._old = self._default_varscope[0]
+    self._old = self._var_scope_store.current_scope
     if isinstance(self._name_or_scope, VariableScope):
-      self._var_store.open_variable_scope(self._new_name)
-      self._old_subscopes = copy.copy(self._var_store.variable_scopes_count)
+      self._var_scope_store.open_variable_scope(self._new_name)
+      self._old_subscopes = copy.copy(
+          self._var_scope_store.variable_scopes_count)
       variable_scope_object = self._cached_variable_scope_object
     else:
       # Handler for the case when we just prolong current variable scope.
@@ -1672,17 +1688,17 @@ class _pure_variable_scope(object):  # pylint: disable=invalid-name
         variable_scope_object.set_dtype(self._dtype)
       if self._use_resource is not None:
         variable_scope_object.set_use_resource(self._use_resource)
-      self._var_store.open_variable_scope(self._new_name)
-    self._default_varscope[0] = variable_scope_object
+      self._var_scope_store.open_variable_scope(self._new_name)
+    self._var_scope_store.current_scope = variable_scope_object
     return variable_scope_object
 
   def __exit__(self, type_arg, value_arg, traceback_arg):
     # If jumping out from a non-prolonged scope, restore counts.
     if isinstance(self._name_or_scope, VariableScope):
-      self._var_store.variable_scopes_count = self._old_subscopes
+      self._var_scope_store.variable_scopes_count = self._old_subscopes
     else:
-      self._var_store.close_variable_subscopes(self._new_name)
-    self._default_varscope[0] = self._old
+      self._var_scope_store.close_variable_subscopes(self._new_name)
+    self._var_scope_store.current_scope = self._old
 
 
 def _maybe_wrap_custom_getter(custom_getter, old_getter):
@@ -1707,13 +1723,13 @@ def _maybe_wrap_custom_getter(custom_getter, old_getter):
 
 def _get_unique_variable_scope(prefix):
   """Get a name with the given prefix unique in the current variable scope."""
-  var_store = _get_default_variable_store()
+  var_scope_store = get_variable_scope_store()
   current_scope = get_variable_scope()
   name = current_scope.name + "/" + prefix if current_scope.name else prefix
-  if var_store.variable_scope_count(name) == 0:
+  if var_scope_store.variable_scope_count(name) == 0:
     return prefix
   idx = 1
-  while var_store.variable_scope_count(name + ("_%d" % idx)) > 0:
+  while var_scope_store.variable_scope_count(name + ("_%d" % idx)) > 0:
     idx += 1
   return prefix + ("_%d" % idx)
 
@@ -1729,9 +1745,10 @@ class variable_scope(object):
   graph, ensures that graph is the default graph, and pushes a name scope and a
   variable scope.
 
-  If `name_or_scope` is not None, it is used as is. If `scope` is None, then
-  `default_name` is used.  In that case, if the same name has been previously
-  used in the same scope, it will be made unique by appending `_N` to it.
+  If `name_or_scope` is not None, it is used as is. If `name_or_scope` is None,
+  then `default_name` is used.  In that case, if the same name has been
+  previously used in the same scope, it will be made unique by appending `_N`
+  to it.
 
   Variable scope allows you to create new variables and to share already created
   ones while providing checks to not create or share by accident. For details,
@@ -1810,6 +1827,32 @@ class variable_scope(object):
   discouraged) to pass False to the reuse argument, yielding undocumented
   behaviour slightly different from None. Starting at 1.1.0 passing None and
   False as reuse has exactly the same effect.
+
+  A note about using variable scopes in multi-threaded environment: Variable
+  scopes are thread local, so one thread will not see another thread's current
+  scope. Also, when using `default_name`, unique scopes names are also generated
+  only on a per thread basis. If the same name was used within a different
+  thread, that doesn't prevent a new thread from creating the same scope.
+  However, the underlying variable store is shared across threads (within the
+  same graph). As such, if another thread tries to create a new variable with
+  the same name as a variable created by a previous thread, it will fail unless
+  reuse is True.
+
+  Further, each thread starts with an empty variable scope. So if you wish to
+  preserve name prefixes from a scope from the main thread, you should capture
+  the main thread's scope and re-enter it in each thread. For e.g.
+
+  ```
+  main_thread_scope = variable_scope.get_variable_scope()
+
+  # Thread's target function:
+  def thread_target_fn(captured_scope):
+    with variable_scope.variable_scope(captured_scope):
+      # .... regular code for this thread
+
+
+  thread = threading.Thread(target=thread_target_fn, args=(main_thread_scope,))
+  ```
   """
 
   def __init__(self,
diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py
index 5b9947f4417a0f7621fda885c685021d91701c2b..c646f795896f0abfce3eb9a57cadc27299714023 100644
--- a/tensorflow/python/ops/variables.py
+++ b/tensorflow/python/ops/variables.py
@@ -125,8 +125,8 @@ class Variable(checkpointable.CheckpointableBase):
 
   @compatibility(eager)
   `tf.Variable` is not compatible with eager execution.  Use
-  `tfe.Variable` instead which is compatible with both eager execution
-  and graph construction.  See [the TensorFlow Eager Execution
+  `tf.contrib.eager.Variable` instead which is compatible with both eager
+  execution and graph construction.  See [the TensorFlow Eager Execution
   guide](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/g3doc/guide.md#variables-and-optimizers)
   for details on how variables work in eager execution.
   @end_compatibility
@@ -293,6 +293,7 @@ class Variable(checkpointable.CheckpointableBase):
     Raises:
       ValueError: If the initial value is not specified, or does not have a
         shape and `validate_shape` is `True`.
+      RuntimeError: If lifted into the eager context.
     """
     _ = expected_shape
     if initial_value is None:
@@ -319,6 +320,11 @@ class Variable(checkpointable.CheckpointableBase):
     if trainable and ops.GraphKeys.TRAINABLE_VARIABLES not in collections:
       collections = list(collections) + [ops.GraphKeys.TRAINABLE_VARIABLES]
     with ops.init_scope():
+      # Ensure that we weren't lifted into the eager context.
+      if context.executing_eagerly():
+        raise RuntimeError(
+            "tf.Variable not supported when eager execution is enabled. "
+            "Please use tf.contrib.eager.Variable instead")
       with ops.name_scope(name, "Variable", [] if init_from_fn else
                           [initial_value]) as name:
 
diff --git a/tensorflow/python/platform/base.i b/tensorflow/python/platform/base.i
index dbefca2be9615b18418a92f4cbe0b1a0b2917449..478dd46f7e6965f8727e5741f2ccdfdc69247980 100644
--- a/tensorflow/python/platform/base.i
+++ b/tensorflow/python/platform/base.i
@@ -229,3 +229,25 @@ _COPY_TYPEMAPS(unsigned int, mode_t);
 %define final %enddef
 %define override %enddef
 #endif
+
+// Typemaps to automatically raise a Python exception from bad output TF_Status.
+// TODO(b/77295559): expand this to all TF_Status* output params and deprecate
+// raise_exception_on_not_ok_status (currently it only affects the C API).
+%typemap(in, numinputs=0) TF_Status* status (TF_Status* status) {
+  $1 = TF_NewStatus();
+}
+
+%typemap(freearg) (TF_Status* status) {
+ TF_DeleteStatus($1);
+}
+
+%typemap(argout) TF_Status* status {
+  TF_Code code = TF_GetCode($1);
+  if (code != TF_OK) {
+    PyObject* exc = tensorflow::PyExceptionRegistry::Lookup(code);
+    // Arguments to OpError.
+    PyObject* exc_args = Py_BuildValue("sss", nullptr, nullptr, TF_Message($1));
+    SWIG_SetErrorObj(exc, exc_args);
+    SWIG_fail;
+  }
+}
diff --git a/tensorflow/python/platform/sysconfig.py b/tensorflow/python/platform/sysconfig.py
index 5c50fa023dc3b216838390d9356a39e70e2362d2..fdd2b903fc79c40a26392714328f74756f3fff92 100644
--- a/tensorflow/python/platform/sysconfig.py
+++ b/tensorflow/python/platform/sysconfig.py
@@ -68,7 +68,6 @@ def get_compile_flags():
   """
   flags = []
   flags.append('-I%s' % get_include())
-  flags.append('-I%s/external/nsync/public' % get_include())
   flags.append('-D_GLIBCXX_USE_CXX11_ABI=%d' % _CXX11_ABI_FLAG)
   return flags
 
diff --git a/tensorflow/python/profiler/BUILD b/tensorflow/python/profiler/BUILD
index c815aad0a065eaba4a0dc52487b5ee67e271a146..0654104a3436366bb5fe88e2c3415cc957cbfde8 100644
--- a/tensorflow/python/profiler/BUILD
+++ b/tensorflow/python/profiler/BUILD
@@ -156,18 +156,3 @@ py_test(
         "@com_google_pprof//:pprof_proto_py",
     ],
 )
-
-# -----------------------------------------------------------------------------
-# Google-internal targets.  These must be at the end for syncrepo.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/profiler/internal/BUILD b/tensorflow/python/profiler/internal/BUILD
index 362a1c49e64118134a4039ae3a5d939ed0b6d730..994206cd63a915de93bc109e7b217ad997c787a7 100644
--- a/tensorflow/python/profiler/internal/BUILD
+++ b/tensorflow/python/profiler/internal/BUILD
@@ -70,18 +70,3 @@ cuda_py_test(
         "no_pip",
     ],
 )
-
-# -----------------------------------------------------------------------------
-# Google-internal targets.  These must be at the end for syncrepo.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/pywrap_tfe.i b/tensorflow/python/pywrap_tfe.i
index b481ddf5d4798aeed970d435234fd82de3b93a06..7acb8eeb1a73c395b3eab3da9dc6696bcaac4e88 100644
--- a/tensorflow/python/pywrap_tfe.i
+++ b/tensorflow/python/pywrap_tfe.i
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+%include "tensorflow/python/platform/base.i"
+
 %ignore "";
 
 %rename("%s") TFE_NewContext;
@@ -26,6 +28,9 @@ limitations under the License.
 %rename("%s") TFE_ContextClearCaches;
 %rename("%s") TFE_ContextGetDevicePlacementPolicy;
 %rename("%s") TFE_ContextSetThreadLocalDevicePlacementPolicy;
+%rename("%s") TFE_ContextSetAsyncForThread;
+%rename("%s") TFE_ContextAsyncWait;
+%rename("%s") TFE_ContextAsyncClearError;
 %rename("%s") TFE_OpNameGetAttrType;
 %rename("%s") TFE_Py_InitEagerTensor;
 %rename("%s") TFE_Py_RegisterExceptionClass;
@@ -51,6 +56,7 @@ limitations under the License.
 %rename("%s") TFE_NewContextOptions;
 %rename("%s") TFE_ContextOptionsSetConfig;
 %rename("%s") TFE_ContextOptionsSetDevicePlacementPolicy;
+%rename("%s") TFE_ContextOptionsSetAsync;
 %rename("%s") TFE_DeleteContextOptions;
 %rename("%s") TFE_Py_TensorShapeSlice;
 
diff --git a/tensorflow/python/saved_model/BUILD b/tensorflow/python/saved_model/BUILD
index 30e0a099d8b2e30cff36b69164ba9f1789dd8916..2609a5d222659f6ebf775d6baa48bd7bc39fd7f6 100644
--- a/tensorflow/python/saved_model/BUILD
+++ b/tensorflow/python/saved_model/BUILD
@@ -235,15 +235,3 @@ py_test(
 
 # -----------------------------------------------------------------------------
 # Google-internal targets.  These must be at the end for syncrepo.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/summary/summary.py b/tensorflow/python/summary/summary.py
index 2a3918b9b421ea125b327c9f62d098de961fff6b..1286ed670390350ff1695b748714e4b2be60352e 100644
--- a/tensorflow/python/summary/summary.py
+++ b/tensorflow/python/summary/summary.py
@@ -48,10 +48,13 @@ from tensorflow.core.util.event_pb2 import SessionLog
 from tensorflow.core.util.event_pb2 import TaggedRunMetadata
 # pylint: enable=unused-import
 
+
 from tensorflow.python.eager import context as _context
+from tensorflow.python.framework import constant_op as _constant_op
 from tensorflow.python.framework import dtypes as _dtypes
 from tensorflow.python.framework import ops as _ops
 from tensorflow.python.ops import gen_logging_ops as _gen_logging_ops
+from tensorflow.python.ops import gen_summary_ops as _gen_summary_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import summary_op_util as _summary_op_util
 
 # exports tensor-related summaries
@@ -96,10 +99,11 @@ def scalar(name, tensor, collections=None, family=None):
   Raises:
     ValueError: If tensor has the wrong shape or type.
   """
+  if _summary_op_util.skip_summary():
+    return _constant_op.constant('')
   with _summary_op_util.summary_scope(
       name, family, values=[tensor]) as (tag, scope):
-    # pylint: disable=protected-access
-    val = _gen_logging_ops._scalar_summary(tags=tag, values=tensor, name=scope)
+    val = _gen_logging_ops.scalar_summary(tags=tag, values=tensor, name=scope)
     _summary_op_util.collect(val, collections, [_ops.GraphKeys.SUMMARIES])
   return val
 
@@ -150,6 +154,8 @@ def image(name, tensor, max_outputs=3, collections=None, family=None):
     A scalar `Tensor` of type `string`. The serialized `Summary` protocol
     buffer.
   """
+  if _summary_op_util.skip_summary():
+    return _constant_op.constant('')
   with _summary_op_util.summary_scope(
       name, family, values=[tensor]) as (tag, scope):
     val = _gen_logging_ops.image_summary(
@@ -188,11 +194,12 @@ def histogram(name, values, collections=None, family=None):
     A scalar `Tensor` of type `string`. The serialized `Summary` protocol
     buffer.
   """
+  if _summary_op_util.skip_summary():
+    return _constant_op.constant('')
   with _summary_op_util.summary_scope(
       name, family, values=[values],
       default_name='HistogramSummary') as (tag, scope):
-    # pylint: disable=protected-access
-    val = _gen_logging_ops._histogram_summary(
+    val = _gen_logging_ops.histogram_summary(
         tag=tag, values=values, name=scope)
     _summary_op_util.collect(val, collections, [_ops.GraphKeys.SUMMARIES])
   return val
@@ -234,6 +241,8 @@ def audio(name, tensor, sample_rate, max_outputs=3, collections=None,
     A scalar `Tensor` of type `string`. The serialized `Summary` protocol
     buffer.
   """
+  if _summary_op_util.skip_summary():
+    return _constant_op.constant('')
   with _summary_op_util.summary_scope(
       name, family=family, values=[tensor]) as (tag, scope):
     sample_rate = _ops.convert_to_tensor(
@@ -282,6 +291,8 @@ def merge(inputs, collections=None, name=None):
     raise RuntimeError(
         'Merging tf.summary.* ops is not compatible with eager execution. '
         'Use tf.contrib.summary instead.')
+  if _summary_op_util.skip_summary():
+    return _constant_op.constant('')
   name = _summary_op_util.clean_tag(name)
   with _ops.name_scope(name, 'Merge', inputs):
     val = _gen_logging_ops.merge_summary(inputs=inputs, name=name)
diff --git a/tensorflow/python/tensorflow.i b/tensorflow/python/tensorflow.i
index 82b908ac0e95643d1daf5ed062be44a58cfea97f..26e8acd8977734768accb1f9c7e37431c337ee34 100644
--- a/tensorflow/python/tensorflow.i
+++ b/tensorflow/python/tensorflow.i
@@ -25,6 +25,7 @@ limitations under the License.
 %include "tensorflow/python/util/tfprof.i"
 
 %include "tensorflow/python/lib/core/py_func.i"
+%include "tensorflow/python/lib/core/py_exception_registry.i"
 
 %include "tensorflow/python/lib/io/py_record_reader.i"
 %include "tensorflow/python/lib/io/py_record_writer.i"
@@ -54,4 +55,3 @@ limitations under the License.
 %include "tensorflow/python/grappler/tf_optimizer.i"
 %include "tensorflow/python/grappler/cost_analyzer.i"
 %include "tensorflow/python/grappler/model_analyzer.i"
-
diff --git a/tensorflow/python/tools/BUILD b/tensorflow/python/tools/BUILD
index 5415881cae4a7815e790620f226929278f8c1f12..6e39ce8c808a1716ff9263982e99a14592472c76 100644
--- a/tensorflow/python/tools/BUILD
+++ b/tensorflow/python/tools/BUILD
@@ -249,23 +249,12 @@ py_test(
         "//tensorflow/cc/saved_model:saved_model_half_plus_two",
     ],
     srcs_version = "PY2AND3",
-    tags = ["manual"],
+    tags = [
+        "manual",
+        "no-internal-py3",
+    ],
     deps = [
         ":saved_model_cli",
         "//tensorflow/core:protos_all_py",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "bin/**",
-            "gen/**",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/tools/optimize_for_inference.py b/tensorflow/python/tools/optimize_for_inference.py
index 902748d55efedf2166cbeb0d8e0fcff0d18ed152..dac6a06a89c7596dd66d0ed7a2e5a59a0ba9b9dd 100644
--- a/tensorflow/python/tools/optimize_for_inference.py
+++ b/tensorflow/python/tools/optimize_for_inference.py
@@ -87,7 +87,9 @@ def main(unused_args):
   output_graph_def = optimize_for_inference_lib.optimize_for_inference(
       input_graph_def,
       FLAGS.input_names.split(","),
-      FLAGS.output_names.split(","), FLAGS.placeholder_type_enum)
+      FLAGS.output_names.split(","),
+      FLAGS.placeholder_type_enum,
+      FLAGS.toco_compatible)
 
   if FLAGS.frozen_graph:
     f = gfile.FastGFile(FLAGS.output, "w")
@@ -138,6 +140,14 @@ def parse_args():
       type=int,
       default=dtypes.float32.as_datatype_enum,
       help="The AttrValue enum to use for placeholders.")
+  parser.add_argument(
+      "--toco_compatible",
+      type=bool,
+      default=False,
+      help="""\
+      If true, only use ops compatible with Tensorflow
+      Lite Optimizing Converter.\
+      """)
   return parser.parse_known_args()
 
 
diff --git a/tensorflow/python/tools/optimize_for_inference_lib.py b/tensorflow/python/tools/optimize_for_inference_lib.py
index 9c1927122252f45ddfa8092045c7589fa0f45532..bb90d1cd6e33aacf4bb7498fb9c9e7ecfb447c04 100644
--- a/tensorflow/python/tools/optimize_for_inference_lib.py
+++ b/tensorflow/python/tools/optimize_for_inference_lib.py
@@ -87,7 +87,7 @@ EPSILON_ATTR = {
 
 
 def optimize_for_inference(input_graph_def, input_node_names, output_node_names,
-                           placeholder_type_enum):
+                           placeholder_type_enum, toco_compatible=False):
   """Applies a series of inference optimizations on the input graph.
 
   Args:
@@ -98,6 +98,8 @@ def optimize_for_inference(input_graph_def, input_node_names, output_node_names,
       results.
     placeholder_type_enum: The AttrValue enum for the placeholder data type, or
         a list that specifies one value per input node name.
+    toco_compatible: Boolean, if True, only runs optimizations that result in
+      TOCO compatible graph operations (default=False).
 
   Returns:
     An optimized version of the input graph.
@@ -110,8 +112,9 @@ def optimize_for_inference(input_graph_def, input_node_names, output_node_names,
   optimized_graph_def = graph_util.remove_training_nodes(
       optimized_graph_def, output_node_names)
   optimized_graph_def = fold_batch_norms(optimized_graph_def)
-  optimized_graph_def = fuse_resize_and_conv(optimized_graph_def,
-                                             output_node_names)
+  if not toco_compatible:
+    optimized_graph_def = fuse_resize_and_conv(optimized_graph_def,
+                                               output_node_names)
   ensure_graph_is_valid(optimized_graph_def)
   return optimized_graph_def
 
diff --git a/tensorflow/python/training/adam_test.py b/tensorflow/python/training/adam_test.py
index af87d6f0e5e74557523c7816d966c8fbcf69a824..9be8b6aafefa33977511cde24dd2e87dd6c3b81a 100644
--- a/tensorflow/python/training/adam_test.py
+++ b/tensorflow/python/training/adam_test.py
@@ -319,6 +319,15 @@ class AdamOptimizerTest(test.TestCase):
         # fails.
         optimizer.apply_gradients([(grads0, var0)])
 
+  def testSlotsUniqueEager(self):
+    with context.eager_mode():
+      v1 = resource_variable_ops.ResourceVariable(1.)
+      v2 = resource_variable_ops.ResourceVariable(1.)
+      opt = adam.AdamOptimizer(1.)
+      opt.minimize(lambda: v1 + v2)
+      # There should be two non-slot variables, and two unique slot variables
+      # for v1 and v2 respectively.
+      self.assertEqual(6, len(set(opt.variables())))
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/training/checkpointable.py b/tensorflow/python/training/checkpointable.py
index e49965703e65aea05dc3bddabf8d1e3a24c65a74..bbbe1e8ac5b985b11f2481ddcadedc06ed70a4fb 100644
--- a/tensorflow/python/training/checkpointable.py
+++ b/tensorflow/python/training/checkpointable.py
@@ -210,8 +210,8 @@ class _CheckpointPosition(object):
     restore_ops = []
     building_graph = not context.executing_eagerly()
     for serialized_tensor in self.object_proto.attributes:
-      saveable_object = saveables.get(serialized_tensor.name, None)
-      if saveable_object is None:
+      saveable_factory = saveables.get(serialized_tensor.name, None)
+      if saveable_factory is None:
         # Purposefully does not throw an exception if attributes have been added
         # or deleted. Stores unused attributes so an exception can be raised if
         # the user decides to check that everything in the checkpoint was
@@ -225,7 +225,11 @@ class _CheckpointPosition(object):
       else:
         existing_ops = None
       if existing_ops is None:
-        named_saveables[serialized_tensor.checkpoint_key] = saveable_object
+        if callable(saveable_factory):
+          saveable = saveable_factory(name=serialized_tensor.checkpoint_key)
+        else:
+          saveable = saveable_factory
+        named_saveables[serialized_tensor.checkpoint_key] = saveable
     if named_saveables:
       validated_saveables = (
           self._checkpoint.builder._ValidateAndSliceInputs(named_saveables))  # pylint: disable=protected-access
@@ -317,8 +321,10 @@ class CheckpointableBase(object):
     # Maps names -> Checkpointable objects
     self._unconditional_dependency_names = {}
     # Restorations for other Checkpointable objects on which this object may
-    # eventually depend.
-    self._deferred_dependencies = {}  # local name -> _CheckpointPosition list
+    # eventually depend. Maps local name -> _CheckpointPosition list. Optimizers
+    # tack on conditional dependencies, and so need separate management of
+    # deferred dependencies too.
+    self._unconditional_deferred_dependencies = {}
     # The UID of the highest assignment to this object. Used to ensure that the
     # last requested assignment determines the final value of an object.
     if hasattr(self, "_update_uid"):
@@ -340,6 +346,21 @@ class CheckpointableBase(object):
     """
     return self._unconditional_checkpoint_dependencies
 
+  @property
+  def _deferred_dependencies(self):
+    """A dictionary with deferred dependencies.
+
+    Stores restorations for other Checkpointable objects on which this object
+    may eventually depend. May be overridden by sub-classes (e.g. Optimizers use
+    conditional dependencies based the current graph, and so need separate
+    management of deferred dependencies too).
+
+    Returns:
+      A dictionary mapping from local name to a list of _CheckpointPosition
+      objects.
+    """
+    return self._unconditional_deferred_dependencies
+
   def _lookup_dependency(self, name):
     """Look up a dependency by name.
 
@@ -539,6 +560,7 @@ class CheckpointableBase(object):
       checkpointable: The Checkpointable object to restore (inheriting from
         `CheckpointableBase`).
     """
+    self._maybe_initialize_checkpointable()
     deferred_dependencies_list = self._deferred_dependencies.pop(name, ())
     for checkpoint_position in sorted(
         deferred_dependencies_list,
@@ -600,14 +622,30 @@ class CheckpointableBase(object):
     """Returns a dictionary of values to checkpoint with this object.
 
     Keys in the returned dictionary are local to this object and in a separate
-    namespace from dependencies. Values may either be `SaveableObject`s or
-    variables easily converted to `SaveableObject`s (as in `tf.train.Saver`'s
+    namespace from dependencies. Values may either be `SaveableObject` factories
+    or variables easily converted to `SaveableObject`s (as in `tf.train.Saver`'s
     `var_list` constructor argument).
 
+    `SaveableObjects` have a name set, which Checkpointable needs to generate
+    itself. So rather than returning `SaveableObjects` directly, this method
+    should return a dictionary of callables which take `name` arguments and
+    return `SaveableObjects` with that name.
+
+    If this object may also be passed to the global-name-based `tf.train.Saver`,
+    the returned callables should have a default value for their name argument
+    (i.e. be callable with no arguments).
+
     Returned values must be saved only by this object; if any value may be
     shared, it should instead be a dependency. For example, variable objects
     save their own values with the key `VARIABLE_VALUE_KEY`, but objects which
     reference variables simply add a dependency.
+
+    Returns:
+      The dictionary mapping attribute names to `SaveableObject` factories
+      described above. For example:
+      {VARIABLE_VALUE_KEY:
+       lambda name="global_name_for_this_object":
+       SaveableObject(name=name, ...)}
     """
     return {}
 
diff --git a/tensorflow/python/training/device_setter.py b/tensorflow/python/training/device_setter.py
index d31c375b4ce48dcb9bc2918514707636a647c675..be80c3657158b52d063b5d2b7731f25d184794a0 100644
--- a/tensorflow/python/training/device_setter.py
+++ b/tensorflow/python/training/device_setter.py
@@ -25,14 +25,13 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import server_lib
 from tensorflow.python.util.tf_export import tf_export
 
-# This is a tuple of PS ops used by tf.estimator.Esitmator which should work in
+# This is a tuple of PS ops used by tf.estimator.Estimator which should work in
 # almost all of cases.
-STANDARD_PS_OPS = (
-    "Variable", "VariableV2", "AutoReloadVariable", "MutableHashTable",
-    "MutableHashTableV2", "MutableHashTableOfTensors",
-    "MutableHashTableOfTensorsV2", "MutableDenseHashTable",
-    "MutableDenseHashTableV2", "VarHandleOp"
-)
+STANDARD_PS_OPS = ("Variable", "VariableV2", "AutoReloadVariable",
+                   "MutableHashTable", "MutableHashTableV2",
+                   "MutableHashTableOfTensors", "MutableHashTableOfTensorsV2",
+                   "MutableDenseHashTable", "MutableDenseHashTableV2",
+                   "VarHandleOp", "BoostedTreesEnsembleResourceHandleOp")
 
 
 class _RoundRobinStrategy(object):
diff --git a/tensorflow/python/training/device_util.py b/tensorflow/python/training/device_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1137e80ab4394333ef0f3b7982d5b55f4704d0d
--- /dev/null
+++ b/tensorflow/python/training/device_util.py
@@ -0,0 +1,68 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Device-related support functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import device as tf_device
+from tensorflow.python.framework import ops
+
+
+def canonicalize(d):
+  d = tf_device.DeviceSpec.from_string(d)
+  assert d.device_type is None or d.device_type == d.device_type.upper(), (
+      "Device type '%s' must be all-caps." % (d.device_type,))
+  # Fill in missing device fields using defaults.
+  result = tf_device.DeviceSpec(
+      job="localhost", replica=0, task=0, device_type="CPU", device_index=0)
+  result.merge_from(d)
+  return result.to_string()
+
+
+class _FakeNodeDef(object):
+  """A fake NodeDef for _FakeOperation."""
+
+  def __init__(self):
+    self.op = ""
+    self.name = ""
+
+
+class _FakeOperation(object):
+  """A fake Operation object to pass to device functions."""
+
+  def __init__(self):
+    self.device = ""
+    self.type = ""
+    self.name = ""
+    self.node_def = _FakeNodeDef()
+
+  def _set_device(self, device):
+    self.device = ops._device_string(device)  # pylint: disable=protected-access
+
+
+def current():
+  """Return a string (not canonicalized) for the current device."""
+  # TODO(josh11b): Work out how this function interacts with ops.colocate_with.
+  ctx = context.context()
+  if ctx.executing_eagerly():
+    d = ctx.device_name
+  else:
+    op = _FakeOperation()
+    ops.get_default_graph()._apply_device_functions(op)  # pylint: disable=protected-access
+    d = op.device
+  return d
diff --git a/tensorflow/python/training/distribute.py b/tensorflow/python/training/distribute.py
new file mode 100644
index 0000000000000000000000000000000000000000..16e200d64dfdf6255cf45113afcea4ecbfc9bb56
--- /dev/null
+++ b/tensorflow/python/training/distribute.py
@@ -0,0 +1,1238 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Class DistributionStrategy, TowerContext, and supporting APIs."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import threading
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops.losses import losses_impl
+from tensorflow.python.platform import tf_logging
+from tensorflow.python.training import device_util
+from tensorflow.python.util import nest
+
+
+# ------------------------------------------------------------------------------
+# Internal API for setting the current thread mode as being either in a
+# tower or cross-tower context for a particular distribution strategy.
+
+
+class _ThreadMode(object):
+
+  def __init__(self, dist, cross, tower):
+    self.distribution_strategy = dist
+    self.cross_tower_context = cross
+    self.tower_context = tower
+
+
+class _CrossTowerThreadMode(_ThreadMode):
+
+  def __init__(self, distribution_strategy):
+    _ThreadMode.__init__(
+        self, distribution_strategy, distribution_strategy, None)
+
+
+class _InTowerThreadMode(_ThreadMode):
+
+  def __init__(self, tower_ctx):
+    _ThreadMode.__init__(
+        self, tower_ctx.distribution_strategy, None, tower_ctx)
+
+
+_per_thread_mode = threading.local()
+
+
+def _push_per_thread_mode(context):
+  if not hasattr(_per_thread_mode, "stack"):
+    _per_thread_mode.stack = []
+  _per_thread_mode.stack.append(context)
+
+
+def _pop_per_thread_mode():
+  _per_thread_mode.stack.pop(-1)
+
+
+class _DefaultTowerThreadMode(_ThreadMode):
+  """Type of default value returned by `_get_per_thread_mode()`.
+
+  Used when the thread-local stack is empty.
+  """
+
+  def __init__(self):
+    # _default_distribution_strategy and _default_tower_context are
+    # defined at the bottom of this file.
+    _ThreadMode.__init__(
+        self, _default_distribution_strategy, None, _default_tower_context)
+
+
+def _get_per_thread_mode():
+  try:
+    return _per_thread_mode.stack[-1]
+  except (AttributeError, IndexError):
+    # _default_tower_mode is defined at the bottom of this file.
+    return _default_tower_mode
+
+
+# ------------------------------------------------------------------------------
+# Context tracking whether in a distribution.update() or .update_non_slot()
+# call.
+
+
+_update_device = threading.local()
+
+
+def get_update_device():
+  """Get the current device if in a `DistributionStrategy.update()` call."""
+  try:
+    return _update_device.current
+  except AttributeError:
+    return None
+
+
+class UpdateContext(object):
+  """Context manager when you are in `update()` or `update_non_slot()`."""
+
+  def __init__(self, device):
+    self._device = device
+    self._old_device = None
+
+  def __enter__(self):
+    self._old_device = get_update_device()
+    _update_device.current = self._device
+
+  def __exit__(self, exception_type, exception_value, traceback):
+    del exception_type, exception_value, traceback
+    _update_device.current = self._old_device
+
+
+# ------------------------------------------------------------------------------
+# Public API for accessing the current thread mode
+
+
+def get_tower_context():
+  """Returns the current TowerContext or None if in a cross-tower context.
+
+  Note that execution:
+  1. starts in the default (single-tower) tower context (this function
+     will return the default TowerContext object);
+  2. switches to cross-tower context (in which case this will return
+     None) when entering a `with DistributionStrategy.scope():` block;
+  3. switches to a (non-default) tower context inside
+     `call_for_each_tower(fn, ...)`;
+  4. if `fn` calls `get_tower_context()->merge_call(merge_fn, ...)`, then
+     inside `merge_fn` you are back in the cross-tower context (and again
+     this function will return None).
+
+  Note that you can also go directly from step 1 to 4 to switch to a
+  cross-tower context for the default `DistributionStrategy`. You may
+  also switch from the cross-tower context of 4 to a tower context by
+  calling `call_for_each_tower()`, jumping back to step 3.
+
+  Most `DistributionStrategy` methods may only be executed in
+  a cross-tower context, in a tower context you should use the
+  `TowerContext` API instead.
+
+  Returns:
+    The current `TowerContext` object when in a tower context scope, else None.
+
+    Exactly one of `get_tower_context()` and `get_cross_tower_context()`
+    will return None in a particular block.
+  """
+  return _get_per_thread_mode().tower_context
+
+
+def get_cross_tower_context():
+  """Returns the current DistributionStrategy if in a cross-tower context.
+
+  Note that execution:
+  1. starts in the default (single-tower) tower context;
+  2. switches to cross-tower context when entering a
+     `with DistributionStrategy.scope():` block;
+  3. switches to a (non-default) tower context inside
+     `call_for_each_tower(fn, ...)`;
+  4. if `fn` calls `get_tower_context()->merge_call(merge_fn, ...)`, then
+     inside `merge_fn` you are back in the cross-tower context.
+
+  Note that you can also go directly from step 1 to 4 to switch to a
+  cross-tower context for the default `DistributionStrategy`. You may
+  also switch from the cross-tower context of 4 to a tower context by
+  calling `call_for_each_tower()`, jumping back to step 3.
+
+  Most `DistributionStrategy` methods may only be executed in
+  a cross-tower context.
+
+  Returns:
+    Returns the current `DistributionStrategy` object in a cross-tower
+    context, or None.
+
+    Exactly one of `get_tower_context()` and `get_cross_tower_context()`
+    will return None in a particular block.
+  """
+  return _get_per_thread_mode().cross_tower_context
+
+
+def get_distribution_strategy():
+  """Returns the current `DistributionStrategy` object.
+
+  Prefer to use `get_tower_context()` or `get_cross_tower_context()`
+  instead when possible.
+
+  Returns:
+    A `DistributionStrategy` object. Inside a
+    `with distribution_strategy.scope()` block, it returns
+    `distribution_strategy`, otherwise it returns the default
+    (single-tower) `DistributionStrategy` object.
+  """
+  return _get_per_thread_mode().distribution_strategy
+
+
+def has_distribution_strategy():
+  """Return if there is a current non-default `DistributionStrategy`.
+
+  Returns:
+    True if inside a `with distribution_strategy.scope():`.
+  """
+  return get_distribution_strategy() is not _default_distribution_strategy
+
+
+# ------------------------------------------------------------------------------
+# Public utility functions.
+
+
+def get_loss_reduction():
+  """Reduce `method_string` corresponding to the last loss reduction."""
+  loss_reduction = ops.get_default_graph()._last_loss_reduction  # pylint: disable=protected-access
+  if loss_reduction == losses_impl.Reduction.SUM:
+    return "sum"
+  return "mean"
+
+
+# ------------------------------------------------------------------------------
+# Internal API for validating the current thread mode
+
+
+def _require_cross_tower_context(distribution_strategy):
+  """Verify in cross-tower context for `distribution_strategy`."""
+  context = _get_per_thread_mode()
+  if context.cross_tower_context is distribution_strategy: return
+  # We have an error to report, figure out the right message.
+  if context.distribution_strategy is not distribution_strategy:
+    if context.distribution_strategy is _default_distribution_strategy:
+      raise RuntimeError(
+          'Need to be inside "with distribution_strategy.scope()" for %s' %
+          (distribution_strategy,))
+    else:
+      raise RuntimeError(
+          "Mixing different DistributionStrategy objects: %s is not %s" %
+          (context.distribution_strategy, distribution_strategy))
+  assert context.cross_tower_context is None
+  raise RuntimeError("Method requires being in cross-tower context, use "
+                     "get_tower_context().merge_call()")
+
+
+def require_tower_context(tower_ctx):
+  """Verify in `tower_ctx` tower context."""
+  context = _get_per_thread_mode()
+  if context.tower_context is tower_ctx: return
+  # We have an error to report, figure out the right message.
+  if context.tower_context is None:
+    raise RuntimeError("Need to be inside `call_for_each_tower()`")
+  if context.distribution_strategy is tower_ctx.distribution_strategy:
+    # Two different TowerContexts with the same DistributionStrategy.
+    raise RuntimeError("Mismatching tower context.")
+  raise RuntimeError(
+      "Mismatching DistributionStrategy objects: %s is not %s." %
+      (context.distribution_strategy, tower_ctx.distribution_strategy))
+
+
+def _require_distribution_strategy_scope(distribution_strategy):
+  """Verify in a `distribution_strategy.scope()` in this thread."""
+  context = _get_per_thread_mode()
+  if context.distribution_strategy is distribution_strategy: return
+  # We have an error to report, figure out the right message.
+  if context.distribution_strategy is _default_distribution_strategy:
+    raise RuntimeError(
+        'Need to be inside "with distribution_strategy.scope()" for %s' %
+        (distribution_strategy,))
+  else:
+    raise RuntimeError(
+        "Mixing different DistributionStrategy objects: %s is not %s" %
+        (context.distribution_strategy, distribution_strategy))
+
+
+# ------------------------------------------------------------------------------
+# Internal context managers used to implement the DistributionStrategy
+# base class
+
+
+class _CurrentDistributionContext(object):
+  """Context manager for setting the `DistributionStrategy` and var creator."""
+
+  def __init__(self, distribution_strategy, var_creator_scope, var_scope=None):
+    self._context = _CrossTowerThreadMode(distribution_strategy)
+    self._var_creator_scope = var_creator_scope
+    self._var_scope = var_scope
+
+  def __enter__(self):
+    _push_per_thread_mode(self._context)
+    if self._var_scope:
+      self._var_scope.__enter__()
+    self._var_creator_scope.__enter__()
+    return self._context.distribution_strategy
+
+  def __exit__(self, exception_type, exception_value, traceback):
+    self._var_creator_scope.__exit__(exception_type, exception_value, traceback)
+    if self._var_scope:
+      self._var_scope.__exit__(exception_type, exception_value, traceback)
+    _pop_per_thread_mode()
+
+
+class _SameScopeAgainContext(object):
+  """Trivial context manager when you are already in `scope()`."""
+
+  def __init__(self, distribution_strategy):
+    self._distribution_strategy = distribution_strategy
+
+  def __enter__(self):
+    return self._distribution_strategy
+
+  def __exit__(self, exception_type, exception_value, traceback):
+    del exception_type, exception_value, traceback
+
+
+# ------------------------------------------------------------------------------
+# Base classes for all distribution strategies.
+
+
+class DistributionStrategy(object):
+  """A list of devices with a state & compute distribution policy.
+
+  The intent is that you can write an algorithm in a stylized way and
+  it will be usable with a variety of different `DistributionStrategy`
+  implementations. Each descendant will implement a different strategy
+  for distributing the algorithm across multiple devices/machines.
+  Furthermore, these changes can be hidden inside the specific layers
+  and other library classes that need special treatment to run in a
+  distributed setting, so that most users' model definition code can
+  run unchanged. The `DistributionStrategy` API works the same way
+  with eager and graph execution.
+
+  First let's introduce a few high-level concepts:
+
+  * _Data parallelism_ is where we run multiple copies of the model
+    on different slices of the input data. This is in contrast to
+    _model parallelism_ where we divide up a single copy of a model
+    across multiple devices.
+    Note: for now we only support data parallelism at this time, but
+    hope to add support for model parallelism in the future.
+  * A _tower_ is one copy of the model, running on one slice of the
+    input data.
+  * _Synchronous_, or more commonly _sync_, training is when the
+    updates from each tower are aggregated together before updating
+    the model variables. This is in contrast to _asynchronous_, or
+    _async_ training where each tower updates the model variables
+    independently.
+  * Furthermore you might run your computation on multiple devices
+    on one machine (or "host"), or on multiple machines/hosts.
+    If you are running on multiple machines, you might have a
+    single master host that drives computation across all of them,
+    or you might have multiple clients driving the computation
+    asynchronously.
+
+  To distribute an algorithm, we might use some of these ingredients:
+
+  * Parameter servers: These are hosts that hold a single copy of
+    parameters/variables. All towers that want to operate on a variable
+    retrieve it at the beginning of a step and send an update to be
+    applied at the end of the step. Can support either sync or async
+    training.
+  * Mirrored variables: These are variables that are copied to multiple
+    devices, where we keep the copies in sync by applying the same
+    updates to every copy. Normally would only be used with sync training.
+  * Reductions and Allreduce: A _reduction_ is some method of
+    aggregating multiple values into one value, like "sum" or
+    "mean". If doing sync training, we will perform a reduction on the
+    gradients to a parameter from each tower before applying the
+    update. Allreduce is an algorithm for performing a reduction on
+    values from multiple devices and making the result available on
+    all of those devices.
+  * In the future we will have support for TensorFlows' partitioned
+    variables, where a single variable is split across multiple
+    devices.
+
+  We have then a few approaches we want to support:
+  * Code written (as if) with no knowledge of class `DistributionStrategy`.
+    This code should work as before, even if some of the layers, etc.
+    used by that code are written to be distribution-aware. This is done
+    by having a default `DistributionStrategy` that gives ordinary behavior,
+    and by default being in a single tower context.
+  * Ordinary model code that you want to run using a specific
+    `DistributionStrategy`. This can be as simple as:
+
+    ```
+    with my_distribution.scope():
+      iterator = my_distribution.distribute_dataset(dataset)
+      tower_train_ops = my_distribution.call_for_each_tower(
+          tower_fn, iterator.get_next())
+      train_op = tf.group(my_distribution.unwrap(tower_train_ops))
+    ```
+
+    This takes an ordinary `dataset` and `tower_fn` and runs it
+    distributed using a particular `DistributionStrategy` in
+    `my_distribution`. Any variables created in `tower_fn` are created
+    using `my_distribution`'s policy, and library functions called by
+    `tower_fn` can use the `get_tower_context()` API to get enhanced
+    behavior in this case.
+
+    Note that in the future we will add support for initializable
+    Dataset iterators, at which point this example code will change.
+
+  * If you want to write a distributed algorithm, you may use any of
+    the `DistributionStrategy` APIs inside a
+    `with my_distribution.scope():` block of code.
+
+  Lower-level concepts:
+
+  * Wrapped values: In order to represent values parallel across devices
+    (either towers or the devices associated with a particular value), we
+    wrap them in a "PerDevice" or "Mirrored" object that contains a map
+    from device to values. "PerDevice" is used when the value may be
+    different across devices, and "Mirrored" when the value are the same.
+  * Unwrapping and merging: Consider calling a function `fn` on
+    multiple devices, like `call_for_each_tower(fn, w)` with an
+    argument `w` that is a wrapped value. This means `w` will have a
+    map taking tower device `d0` to `w0`, tower device `d1` to `w1`,
+    etc. `call_for_each_tower()` unwraps `w` before calling `fn`, so
+    it calls `fn(w0)` on `d0`, `fn(w1)` on `d1`, etc.  It then merges
+    the return values from `fn()`, which can possibly result in
+    wrapped values. For example, let's say `fn()` returns a tuple with
+    three components: `(x, a, v0)` from tower 0, `(x, b, v1)` on tower 1,
+    etc. If the first component is the same object `x` from every
+    tower, then the first component of the merged result will also be
+    `x`. If the second component is different (`a`, `b`, ...)  from
+    each tower, then the merged value will have a wrapped map from
+    tower device to the different values. If the third component is
+    the members of a mirrored variable (`v` maps `d0` to `v0`, `d1` to
+    `v1`, etc.), then the merged result will be that mirrored variable
+    (`v`).
+  * Tower context vs. Cross-tower context: _tower context_ is when we
+    are in some function that is being called once for each tower.
+    Otherwise we are in cross-tower context, which is useful for
+    calling `DistributionStrategy` methods which operate across the
+    towers (like `reduce()`). By default you start in a tower context
+    (the default "single tower context") and then some methods can
+    switch you back and forth, as described below.
+  * Worker devices vs. parameter devices: Most tower computations will
+    happen on worker devices. Since we don't yet support model
+    parallelism, there will be one worker device per tower. When using
+    parameter servers (see above), the set of devices holding
+    variables may be different, otherwise the parameter devices might
+    match the worker devices.
+  * Non-slot devices are some subset of the parameter devices where we
+    put all the non-slot variables. We need to ensure that all
+    non-slot variables are allocated on the same device, or mirrored
+    across the same set of devices. If you have some variable you want
+    to colocate all the non-slot variables with, you can use
+    `colocate_vars_with()` to get the remaining non-slot variables on
+    the same device.  Otherwise you can use `non_slot_devices()` to
+    pick a consistent set of devices to pass to both
+    `colocate_vars_with()` and `update_non_slot()`.
+
+  When using a `DistributionStrategy`, we have a new type dimension
+  called _locality_ that says what values are compatible with which
+  APIs:
+
+  * T: different value for each tower (e.g. a PerDevice-wrapped value).
+  * M: value is "mirrored" across towers, i.e. there are copies with the
+    same value on each tower (e.g. a Mirrored-wrapped value).
+  * V(`v`): value is "mirrored" across all the devices which have a
+    copy of variable `v` (also a Mirrored-wrapped value, but over
+    parameter devices instead of worker devices).
+  * N: value is "mirrored" across all the "non-slot" devices
+
+  Rules for methods with respect to locality and single-tower vs.
+  cross-tower context:
+
+  * `with d.scope()`: default single-tower context -> cross-tower context for
+    `d`
+  * `with d.colocate_vars_with(v)`: in tower/cross-tower context, variables
+    will be created with locality V(`v`). That is, if we write
+    `with d.colocate_vars_with(v1): v2 = tf.get_variable(...)`, then
+    `v2` will have locality V(`v1`), i.e. locality V(`v2`) will equal
+    V(`v1`).
+  * `with d.colocate_vars_with(d.non_slot_devices(...))`: in
+    tower/cross-tower context, variables will be created with locality N
+  * `v = tf.get_variable(...)`: in tower/cross-tower context, creates
+    a variable (which by definition will have locality V(`v`), though
+    will match another locality if inside a `colocate_vars_with`
+    scope).
+  * `d.distribute_dataset(dataset)`: in cross-tower context, produces an
+    iterator with locality T
+  * `d.broadcast(t)`: in cross-tower context, produces a value with locality M
+  * `d.broadcast(t, v)`: in cross-tower context, produces a value with
+    locality V(`v`)
+  * `d.call_for_each_tower(fn, ...)`: in cross-tower context, runs
+    `fn()` in a tower context (and so may call `get_tower_context()` and
+    use its API, including `merge_call()` to get back to cross-tower
+    context), once for each tower. May use values with locality T or
+    M, and any variable.
+  * `d.reduce(m, t)`: in cross-tower context, accepts t with locality T
+    and produces a value with locality M.
+  * `d.reduce(m, t, v)`: in cross-tower context, accepts t with
+    locality T and produces a value with locality V(`v`).
+  * `d.batch_reduce(m, [(t, v)]): see `d.reduce()`
+  * `d.update(v, fn, ...)`: in cross-tower context, runs `fn()` once
+    for each device `v` is copied to, all inputs should have locality
+    V(`v`), output will have locality V(`v`) as well.
+  * `d.update_non_slot(d.non_slot_devices(), fn)`: in cross-tower
+    context, like `d.update()` except with locality N.
+  * `d.fetch(t)`: Copy `t` with any locality to the client's CPU device.
+
+  The standard pattern for updating variables is to:
+
+  1. Wrap your input dataset in `d.distribute_dataset()`.
+  2. Define each tower `d.call_for_each_tower()` up to the point of
+     getting a list of gradient, variable pairs.
+  3. Call `d.reduce("sum", t, v)` or `d.batch_reduce()` to sum the
+     gradients (with locality T) into values with locality V(`v`).
+  4. Call `d.update(v)` for each variable to update its value.
+
+  Steps 3 and 4 are done automatically by class `Optimizer` if you call
+  its `apply_gradients` method in a tower context. Otherwise you can
+  manually call its `_distributed_apply` method in a cross-tower context.
+
+  Another thing you might want to do in the middle of your tower function
+  is an all-reduce of some intermediate value, using `d.reduce()` or
+  `d.batch_reduce()` without supplying a variable as the destination.
+
+  Layers should expect to be called in a tower context, and can use
+  the `get_tower_context()` function to get a `TowerContext` object.  The
+  `TowerContext` object has a `merge_call()` method for entering
+  cross-tower context where you can use `reduce()` (or
+  `batch_reduce()`) and then optionally `update()` to update state.
+
+  You may use this API whether or not a `DistributionStrategy` is
+  being used, since there is a default implementation of
+  `TowerContext` and `DistributionStrategy`. Or you can use the
+  `get_tower_context().is_single_tower` property to run different code
+  in the distributed vs. single tower cases.
+  """
+
+  # TODO(josh11b): Raise an exception if variable partitioning requested before
+  #   we add support.
+  # TODO(josh11b): Also `parameter_device_index` property?
+  # TODO(josh11b): `map()`
+  # TODO(josh11b): ClusterSpec/ClusterResolver
+  # TODO(josh11b): Partitioned computations, state; sharding
+  # TODO(josh11b): Model parallelism: "towers" with multiple devices; shuffling
+  # TODO(josh11b): List of towers with their worker and parameter devices
+  #   (where the parameter devices may overlap in the ps case).
+
+  def scope(self):
+    """Returns a context manager selecting this DistributionStrategy as current.
+
+    Inside a `with distribution_strategy.scope():` code block, this thread
+    will use a variable creator set by `distribution_strategy`, and will
+    enter its "cross-tower context".
+
+    Returns:
+      A context manager.
+    """
+    if has_distribution_strategy():
+      _require_cross_tower_context(self)
+      return _SameScopeAgainContext(self)
+
+    def creator_with_resource_vars(*args, **kwargs):
+      _require_distribution_strategy_scope(self)
+      kwargs["use_resource"] = True
+      return self._create_variable(*args, **kwargs)
+
+    def disable_partitioned_variables(getter, *args, **kwargs):
+      if kwargs.pop("partitioner", None) is not None:
+        tf_logging.log_first_n(
+            tf_logging.WARN, "Partitioned variables are disabled when using "
+            "DistributionStrategy.", 1)
+      return getter(*args, **kwargs)
+
+    return _CurrentDistributionContext(
+        self, variable_scope.variable_creator_scope(creator_with_resource_vars),
+        variable_scope.variable_scope(
+            variable_scope.get_variable_scope(),
+            custom_getter=disable_partitioned_variables))
+
+  def _create_variable(self, next_creator, *args, **kwargs):
+    # Note: should support "colocate_with" argument.
+    raise NotImplementedError("must be implemented in descendants")
+
+  def tower_local_var_scope(self, reduce_method):
+    """Inside this scope, new variables will not be mirrored.
+
+    There will still be one component variable per tower, but there is
+    no requirement that they stay in sync. Instead, when saving them
+    or calling `fetch()`, we use the value that results when calling
+    `reduce()` on all the towers' variables.
+
+    Note: tower-local implies not trainable. Instead, it is expected
+    that each tower will directly update (using `assign_add()` or
+    whatever) its local variable instance but only the aggregated
+    value (accessible using `fetch()`) will be exported from the
+    model. When it is acceptable to only aggregate on export, we
+    greatly reduce communication overhead by using tower-local
+    variables.
+
+    Note: All component variables will be initialized to the same
+    value, using the initialization expression from the first tower.
+    The values will match even if the initialization expression uses
+    random numbers.
+
+    Args:
+      reduce_method: String used as a `method_string` to `reduce()`
+        to get the value to save when checkpointing.
+
+    Returns:
+      A context manager.
+    """
+    def create_tower_local_variable(next_creator, *args, **kwargs):
+      _require_distribution_strategy_scope(self)
+      kwargs["use_resource"] = True
+      kwargs["tower_local_reduce_method"] = reduce_method
+      return next_creator(*args, **kwargs)
+
+    _require_distribution_strategy_scope(self)
+    return variable_scope.variable_creator_scope(create_tower_local_variable)
+
+  def colocate_vars_with(self, colocate_with_variable):
+    """Scope that controls which devices variables will be created on.
+
+    No operations should be added to the graph inside this scope, it
+    should only be used when creating variables (some implementations
+    work by changing variable creation, others work by using a
+    tf.colocate_with() scope).
+
+    This may only be used inside `self.scope()`.
+
+    Example usage:
+
+    ```
+    with distribution_strategy.scope():
+      var1 = tf.get_variable(...)
+      with distribution_strategy.colocate_vars_with(v1):
+        # var2 and var3 will be created on the same device(s) as var1
+        var2 = tf.get_variable(...)
+        var3 = tf.get_variable(...)
+
+      def fn(v1, v2, v3):
+        # operates on v1 from var1, v2 from var2, and v3 from var3
+
+      # `fn` runs on every device `v1` is on, `v2` and `v3` will be there too.
+      distribution_strategy.update(v1, fn, v2, v3)
+    ```
+
+    Args:
+      colocate_with_variable: A created in `self.scope()`. Variables created
+        while in the returned context manager will be on the same set of
+        devices as `colocate_with_variable`.
+
+    Returns:
+      A context manager.
+    """
+    def create_colocated_variable(next_creator, *args, **kwargs):
+      _require_distribution_strategy_scope(self)
+      kwargs["use_resource"] = True
+      kwargs["colocate_with"] = colocate_with_variable
+      return next_creator(*args, **kwargs)
+
+    _require_distribution_strategy_scope(self)
+    return variable_scope.variable_creator_scope(create_colocated_variable)
+
+  # TODO(josh11b): Currently this returns an iterator, but should return
+  # something implementing (a subset of) the Dataset API.
+  def distribute_dataset(self, dataset):
+    """Return an iterator into `dataset` split across all towers.
+
+    Suitable for providing input to for `call_for_each_tower()`, as in:
+
+    ```
+    with distribution_strategy.scope():
+      iterator = distribution_strategy.distribute_dataset(dataset)
+      tower_results = distribution_strategy.call_for_each_tower(
+          tower_fn, iterator.get_next())
+    ```
+
+    Args:
+      dataset: A `tf.data.Dataset`.
+
+    Returns:
+      A Dataset iterator that will produce separate splits for each tower.
+    """
+    raise NotImplementedError("must be implemented in descendants")
+
+  def broadcast(self, tensor, destinations=None):
+    """Mirror a tensor on one device to all worker devices.
+
+    Args:
+      tensor: A Tensor value to broadcast.
+      destinations: An optional mirrored variable, device string, or
+        list of device strings, specifying the destination devices
+        to copy `tensor` to. Defaults to `self.worker_devices`.
+
+    Returns:
+      A value mirrored to `destinations` devices.
+    """
+    # TODO(josh11b): More docstring
+    _require_cross_tower_context(self)
+    return self._broadcast(tensor, destinations)
+
+  def _broadcast(self, tensor, destinations):
+    raise NotImplementedError("must be implemented in descendants")
+
+  def call_for_each_tower(self, fn, *args, **kwargs):
+    """Run `fn` once per tower.
+
+    `fn` may call `tf.get_tower_context()` to access methods such as
+    `tower_id()` and `merge_call()`.
+
+    `merge_call()` is used to communicate betwen the towers and
+    re-enter the cross-tower context. All towers pause their execution
+    having encountered a `merge_call()` call. After that the
+    `merge_fn`-function is executed. Its results are then unwrapped and
+    given back to each tower call. After that execution resumes until
+    `fn` is complete or encounters another `merge_call()`.  Example:
+
+    ```python
+    # Called once in "cross-tower" context.
+    def merge_fn(distribution, three_plus_tower_id):
+      # sum the values across towers
+      return sum(distribution.unwrap(three_plus_tower_id))
+
+    # Called once per tower in `distribution`, in a "tower" context.
+    def fn(three):
+      tower_ctx = tf.get_tower_context()
+      v = three + tower_ctx.tower_id
+      # Computes the sum of the `v` values across all towers.
+      s = tower_ctx.merge_call(merge_fn, v)
+      return s + v
+
+    with distribution.scope():
+      # in "cross-tower" context
+      ...
+      merged_results = distribution.call_for_each_tower(fn, 3)
+      # merged_results has the values from every tower execution of `fn`.
+      print(distribution.unwrap(merged_results))  # Prints a list
+    ```
+
+    Args:
+      fn: function to run (will be run once per tower).
+      *args: positional arguments for `fn`
+      **kwargs: keyword arguments for `fn`.
+          `"run_concurrently"`: Boolean indicating whether executions of `fn`
+             can be run concurrently (under eager execution only), defaults to
+             `True`.
+
+    Returns:
+      Merged return value of `fn` across all towers.
+    """
+    _require_cross_tower_context(self)
+    return self._call_for_each_tower(fn, *args, **kwargs)
+
+  def _call_for_each_tower(self, fn, *args, **kwargs):
+    raise NotImplementedError("must be implemented in descendants")
+
+  def reduce(self, method_string, value, destinations=None):
+    """Combine (via e.g. sum or mean) values across towers.
+
+    Args:
+      method_string: A string indicating how to combine values, either
+        "sum" or "mean".
+      value: A per-device value with one value per tower.
+      destinations: An optional mirrored variable, a device string,
+        list of device strings. The return value will be copied to all
+        destination devices (or all the devices where the mirrored
+        variable resides). If `None` or unspecified, the destinations
+        will match the devices `value` resides on.
+
+    Returns:
+      A value mirrored to `destinations`.
+    """
+    # TODO(josh11b): More docstring
+    # TODO(josh11b): Return an unwrapped value if colocate_with is a
+    # single device.
+    _require_cross_tower_context(self)
+    return self._reduce(method_string, value, destinations)
+
+  def _reduce(self, method_string, value, destinations):
+    raise NotImplementedError("must be implemented in descendants")
+
+  def batch_reduce(self, method_string, value_destination_pairs):
+    """Combine multiple `reduce` calls into one for faster execution.
+
+    Args:
+      method_string: A string indicating how to combine values, either
+        "sum" or "mean".
+      value_destination_pairs: A sequence of (value, destinations)
+        pairs. See `reduce()` for a description.
+
+    Returns:
+      A list of mirrored values, one per pair in `value_destination_pairs`.
+    """
+    # TODO(josh11b): More docstring
+    _require_cross_tower_context(self)
+    assert method_string in ("sum", "mean")
+    return self._batch_reduce(method_string, value_destination_pairs)
+
+  def _batch_reduce(self, method_string, value_destination_pairs):
+    return [self.reduce(method_string, t, destinations=v)
+            for t, v in value_destination_pairs]
+
+  def update(self, var, fn, *args, **kwargs):
+    """Run `fn` to update `var` using inputs mirrored to the same devices.
+
+    If `var` is mirrored across multiple devices, then this implements
+    logic like:
+
+    ```
+    results = {}
+    for device, v in var:
+      with tf.device(device):
+        # *args and **kwargs will be unwrapped if they are mirrored.
+        results[device] = fn(v, *args, **kwargs)
+    return merged(results)
+    ```
+
+    Otherwise this returns `fn(var, *args, **kwargs)` colocated with `var`.'
+
+    Neither *args nor **kwargs may contain per-device values.
+    If they contain mirrored values, they will be unwrapped before
+    calling `fn`.
+
+    Args:
+      var: Variable, possibly mirrored to multiple devices, to operate on.
+      fn: Function to call. Should take the variable as the first argument.
+      *args: Additional positional arguments to pass to `fn()`.
+      **kwargs: Keyword arguments to pass to `fn()`.
+
+    Returns:
+      Merged return value of `fn` across all towers.
+    """
+    _require_cross_tower_context(self)
+    return self._update(var, fn, *args, **kwargs)
+
+  def _update(self, var, fn, *args, **kwargs):
+    raise NotImplementedError("must be implemented in descendants")
+
+  def update_non_slot(self, colocate_with, fn, *args, **kwargs):
+    """Runs `fn(*args, **kwargs)` on `colocate_with` devices.
+
+    Args:
+      colocate_with: The return value of `non_slot_devices()`.
+      fn: Function to execute.
+      *args: Positional arguments to pass to `fn()`.
+      **kwargs: Keyword arguments to pass to `fn()`.
+
+    Returns:
+      Return value of `fn`, possibly merged across devices.
+    """
+    _require_cross_tower_context(self)
+    return self._update_non_slot(colocate_with, fn, *args, **kwargs)
+
+  def _update_non_slot(self, colocate_with, fn, *args, **kwargs):
+    raise NotImplementedError("must be implemented in descendants")
+
+  def fetch(self, val, destination="/device:CPU:0", fn=lambda x: x):
+    """Return a copy of `val` or `fn(val)` on `destination`.
+
+    This is useful for getting a mirrored value onto a device.  It
+    will attempt to avoid a copy by checking if the value is already
+    on the destination device.
+
+    Args:
+      val: Value (which may be mirrored) to copy.
+      destination: A device string to copy the value to.
+      fn: An optional function to apply to the value on the source
+          device, before copying.
+
+    Returns:
+      A `Tensor` on `destination`.
+    """
+    _require_cross_tower_context(self)
+    return self._fetch(val, destination, fn)
+
+  def _fetch(self, val, destination, fn):
+    raise NotImplementedError("must be implemented in descendants")
+
+  def unwrap(self, value):
+    """Returns the list of all per-device values contained in `value`.
+
+    Args:
+      value: A value returned by `call_for_each_tower()` or a variable
+        created in `scope()`.
+
+    Returns:
+      A list of values contained in `value`. If `value` represents a single
+      value, this returns `[value].`
+    """
+    _require_cross_tower_context(self)
+    return self._unwrap(value)
+
+  def _unwrap(self, distributed_value):
+    raise NotImplementedError("must be implemented in descendants")
+
+  def group(self, value, name=None):
+    """Shortcut for `tf.group(distribution.unwrap(value))`."""
+    value = nest.flatten(self.unwrap(value))
+
+    if len(value) != 1 or name is not None:
+      return control_flow_ops.group(value, name=name)
+    # Special handling for the common case of one op.
+    v, = value
+    if isinstance(v, ops.Tensor):
+      v = v.op
+    return v
+
+  @property
+  def is_single_tower(self):
+    """Returns whether there is a single tower or multiple.
+
+    Returns:
+      A boolean. If `True`, `call_for_each_tower(fn)` will only call `fn` once.
+      If `False`, `call_for_each_tower(fn)` may call `fn` multiple times.
+    """
+    raise NotImplementedError("must be implemented in descendants")
+
+  @property
+  def num_towers(self):
+    """Returns number of towers, for purposes of averaging across towers."""
+    raise NotImplementedError("must be implemented in descendants")
+
+  @property
+  def worker_devices(self):
+    """Returns the list of devices used to run `call_for_each_tower()` calls."""
+    # TODO(josh11b): More docstring
+    raise NotImplementedError("must be implemented in descendants")
+
+  @property
+  def parameter_devices(self):
+    """Returns the list of devices used for variable and `update` placement."""
+    # TODO(josh11b): More docstring
+    raise NotImplementedError("must be implemented in descendants")
+
+  def non_slot_devices(self, var_list):
+    """Device(s) for non-slot variables.
+
+    Create variables on these devices in a
+    `with colocate_vars_with(non_slot_devices(...)):` block.
+    Update those using `update_non_slot()`.
+
+    Args:
+      var_list: The list of variables being optimized, needed with the
+        default `DistributionStrategy`.
+    """
+    raise NotImplementedError("must be implemented in descendants")
+
+  @property
+  def worker_device_index(self):
+    """An object mapping worker device to an id.
+
+    This might be passed as an argument to `call_for_each_tower()`, as in:
+
+    ```
+    with distribution_strategy.scope():
+
+      def fn(device_id):
+        # device_id is an integer. `fn` is being executed on device:
+        #    distribution_strategy.worker_devices[device_id].
+
+      distribution_strategy.call_for_each_tower(
+          fn, distribution_strategy.worker_device_index)
+    ```
+
+    Returns:
+      An index object, or the integer 0 if there is only a single tower.
+    """
+    _require_cross_tower_context(self)
+    return self._worker_device_index()
+
+  def _worker_device_index(self):
+    raise NotImplementedError("must be implemented in descendants")
+
+  def configure(self, session_config=None):
+    """Find the best configuration given a tensorflow session config."""
+    del session_config
+
+
+# A note about the difference between the context managers
+# `TowerContext` (defined here) and `_CurrentDistributionContext`
+# (defined above) used by `DistributionStrategy.scope()`:
+#
+# * a TowerContext is only present during a `call_for_each_tower()`
+#   call (except during a `merge_run` call) and in such a scope it
+#   will be returned by calls to `get_tower_context()`.  Implementers of new
+#   DistributionStrategy descendants will frequently also need to
+#   define a descendant of TowerContext, and are responsible for
+#   entering and exiting this context.
+#
+# * DistributionStrategy.scope() sets up a variable_creator scope that
+#   changes variable creation calls (e.g. to make mirrored
+#   variables). This is intended as an outer scope that users enter once
+#   around their model creation and graph definition. There is no
+#   anticipated need to define descendants of _CurrentDistributionContext.
+#   It sets the current DistributionStrategy for purposes of
+#   `get_distribution_strategy()` and `has_distribution_strategy()`
+#   and switches the thread mode to a "cross-tower context".
+class TowerContext(object):
+  """DistributionStrategy API inside a `call_for_each_tower()` call."""
+
+  def __init__(self, distribution_strategy, tower_id):
+    self._distribution_strategy = distribution_strategy
+    self._thread_context = _InTowerThreadMode(self)
+    self._tower_id = tower_id
+
+  def __enter__(self):
+    _push_per_thread_mode(self._thread_context)
+
+  def __exit__(self, exception_type, exception_value, traceback):
+    _pop_per_thread_mode()
+
+  def merge_call(self, merge_fn, *args, **kwargs):
+    """Merge args across towers and run `merge_fn` in a cross-tower context.
+
+    This allows communication and coordination when there are multiple calls
+    to a model function triggered by a call to
+    `distribution.call_for_each_tower(model_fn, ...)`.
+
+    See `MirroredDistribution.call_for_each_tower()` for an explanation.
+
+    Otherwise, this is equivalent to:
+
+    ```
+    distribution = get_distribution_strategy()
+    with cross-tower-context(distribution):
+      return merge_fn(distribution, *args, **kwargs)
+    ```
+
+    Args:
+      merge_fn: function that joins arguments from threads that are given as
+        PerDevice. It accepts `DistributionStrategy` object as the first
+        argument.
+      *args: positional per-thread arguments for `merge_fn`
+      **kwargs: keyword per-thread arguments for `merge_fn`.
+
+    Returns:
+      The return value of `merge_fn`, except for `PerDevice` values which are
+      unpacked.
+    """
+    require_tower_context(self)
+    return self._merge_call(merge_fn, *args, **kwargs)
+
+  def _merge_call(self, merge_fn, *args, **kwargs):
+    """Default implementation for single tower."""
+    _push_per_thread_mode(  # thread-local, so not needed with multiple threads
+        _CrossTowerThreadMode(self._distribution_strategy))
+    try:
+      return merge_fn(self._distribution_strategy, *args, **kwargs)
+    finally:
+      _pop_per_thread_mode()
+
+  def tower_local_var_scope(self, reduce_method):
+    """Alias for distribution_strategy.tower_local_var_scope()."""
+    return self._distribution_strategy.tower_local_var_scope(reduce_method)
+
+  @property
+  def is_single_tower(self):
+    """Returns whether there is a single tower or multiple."""
+    require_tower_context(self)
+    return self._distribution_strategy.is_single_tower
+
+  @property
+  def num_towers(self):
+    """Returns number of towers, for purposes of averaging across towers."""
+    return self._distribution_strategy.num_towers
+
+  @property
+  def tower_id(self):
+    """Which tower is being defined, a number from 0 to `num_towers - 1`."""
+    require_tower_context(self)
+    return self._tower_id
+
+  @property
+  def distribution_strategy(self):
+    """The current `DistributionStrategy` object."""
+    return self._distribution_strategy
+
+  @property
+  def device(self):
+    """The device this tower is to be executed on, as a string."""
+    require_tower_context(self)
+    return device_util.current()
+
+  # TODO(josh11b): Implement `start_all_reduce(method, t)` that returns
+  # a function returning the result of reducing `t` across all
+  # towers. Most likely can be implemented in terms of `merge_call()`
+  # and `batch_reduce()`.
+
+# ------------------------------------------------------------------------------
+
+
+class _DefaultDistributionStrategy(DistributionStrategy):
+  """Default `DistributionStrategy` if none is explicitly selected."""
+
+  def scope(self):
+    """Context manager setting a variable creator and `self` as current."""
+    if has_distribution_strategy():
+      raise RuntimeError("Must not nest DistributionStrategy scopes.")
+
+    def creator(next_creator, *args, **kwargs):
+      _require_distribution_strategy_scope(self)
+      if kwargs.pop("tower_local_reduce_method", None) is not None:
+        kwargs["trainable"] = False
+      return next_creator(*args, **kwargs)
+
+    return _CurrentDistributionContext(
+        self, variable_scope.variable_creator_scope(creator))
+
+  def tower_local_var_scope(self, reduce_method):
+    """Does not set to resource variables."""
+    def create_tower_local_variable(next_creator, *args, **kwargs):
+      _require_distribution_strategy_scope(self)
+      kwargs["tower_local_reduce_method"] = reduce_method
+      return next_creator(*args, **kwargs)
+
+    _require_distribution_strategy_scope(self)
+    return variable_scope.variable_creator_scope(create_tower_local_variable)
+
+  def colocate_vars_with(self, colocate_with_variable):
+    """Does not require `self.scope`."""
+    _require_distribution_strategy_scope(self)
+    return ops.colocate_with(colocate_with_variable)
+
+  def distribute_dataset(self, dataset):
+    # TODO(josh11b): Support for this when executing eagerly is currently only
+    # in contrib.
+    return dataset.make_one_shot_iterator()
+
+  def _broadcast(self, tensor, destinations):
+    if destinations is None:
+      return tensor
+    else:
+      raise NotImplementedError("TODO")
+
+  def _call_for_each_tower(self, fn, *args, **kwargs):
+    # We don't run `fn` in multiple threads in _DefaultDistributionStrategy.
+    kwargs.pop("run_concurrently", None)
+    with TowerContext(self, tower_id=0):
+      return fn(*args, **kwargs)
+
+  def _reduce(self, method_string, value, destinations):
+    # TODO(josh11b): Use destinations?
+    del method_string, destinations
+    return value
+
+  def _update(self, var, fn, *args, **kwargs):
+    # TODO(josh11b): Figure out what we should be passing to UpdateContext()
+    # once that value is used for something.
+    with ops.colocate_with(var), UpdateContext(var):
+      return fn(var, *args, **kwargs)
+
+  def _update_non_slot(self, colocate_with, fn, *args, **kwargs):
+    # TODO(josh11b): Figure out what we should be passing to UpdateContext()
+    # once that value is used for something.
+    with ops.colocate_with(colocate_with), UpdateContext(colocate_with):
+      return fn(*args, **kwargs)
+
+  def _fetch(self, var, destination, fn):
+    with ops.colocate_with(var):
+      var = fn(var)
+    with ops.device(destination):
+      return array_ops.identity(var)
+
+  def _unwrap(self, distributed_value):
+    return [distributed_value]
+
+  @property
+  def is_single_tower(self):
+    return True
+
+  @property
+  def num_towers(self):
+    return 1
+
+  @property
+  def worker_devices(self):
+    raise RuntimeError(
+        "worker_devices() method unsupported by _DefaultDistributionStrategy.")
+
+  @property
+  def parameter_devices(self):
+    raise RuntimeError("parameter_devices() method unsupported by "
+                       "_DefaultDistributionStrategy.")
+
+  def non_slot_devices(self, var_list):
+    return min(var_list, key=lambda x: x.name)
+
+  def _worker_device_index(self):
+    raise RuntimeError("worker_device_index() method unsupported by "
+                       "_DefaultDistributionStrategy.")
+
+# ------------------------------------------------------------------------------
+# Common operations
+
+
+def increment_var(v, amount=1):
+  """`v += amount`, distributed-aware version."""
+  def update(vu):
+    if isinstance(vu, resource_variable_ops.ResourceVariable):
+      return vu.assign_add(amount, read_value=False)
+    else:
+      return state_ops.assign_add(vu, amount)
+
+  def merge_fn(dist, vm):
+    return dist.group(dist.update(vm, update))
+
+  tower_context = get_tower_context()
+  return tower_context.merge_call(merge_fn, v)
+
+
+# ------------------------------------------------------------------------------
+# Singletons
+
+_default_distribution_strategy = _DefaultDistributionStrategy()
+_default_tower_context = TowerContext(
+    _default_distribution_strategy, tower_id=0)
+_default_tower_mode = _DefaultTowerThreadMode()
+
+
+# ------------------------------------------------------------------------------
+# We haven't yet implemented deserialization for DistributedVariables.
+# So here we catch any attempts to deserialize variables
+# when using distribution strategies.
+# pylint: disable=protected-access
+def _from_proto_fn(v, import_scope=None):
+  if has_distribution_strategy():
+    raise NotImplementedError(
+        "Deserialization of variables is not yet supported when using"
+        "distributed strategies.")
+  else:
+    resource_variable_ops._from_proto_fn(v, import_scope=import_scope)
+
+resource_variable_ops._from_proto_fn = _from_proto_fn
+# pylint: enable=protected-access
diff --git a/tensorflow/python/training/distribute_test.py b/tensorflow/python/training/distribute_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a4f19c31f6714e1211f9deed9703c02192cc2c0
--- /dev/null
+++ b/tensorflow/python/training/distribute_test.py
@@ -0,0 +1,104 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test DistributionStrategy, TowerContext, and supporting APIs."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.platform import test
+from tensorflow.python.training import distribute
+
+
+class _TestTowerContext(distribute.TowerContext):
+
+  def merge_call(self, fn, *args, **kwargs):
+    return kwargs["test_arg"]
+
+
+class _TestStrategy(distribute.DistributionStrategy):
+
+  def _call_for_each_tower(self, fn, *args, **kwargs):
+    with _TestTowerContext(self, tower_id=0):
+      return fn(*args, **kwargs)
+
+  def _create_variable(self, next_creator, *args, **kwargs):
+    return kwargs["name"]
+
+
+def _assert_in_default_state(t):
+  t.assertIs(distribute._default_tower_context,
+             distribute.get_tower_context())
+  t.assertIs(None, distribute.get_cross_tower_context())
+  t.assertIs(distribute._default_distribution_strategy,
+             distribute.get_distribution_strategy())
+  t.assertFalse(distribute.has_distribution_strategy())
+
+
+class TestStrategyTest(test.TestCase):
+
+  def testCallForEachTower(self):
+    _assert_in_default_state(self)
+    dist = _TestStrategy()
+
+    def run_fn():
+      tower_context = distribute.get_tower_context()
+      self.assertTrue(tower_context is not None)
+      self.assertIs(None, distribute.get_cross_tower_context())
+      self.assertTrue(distribute.has_distribution_strategy())
+      self.assertIs(dist, distribute.get_distribution_strategy())
+      self.assertEqual("foo", tower_context.merge_call(None, test_arg="foo"))
+      self.assertEqual("bar", variable_scope.variable(1.0, name="bar"))
+
+    with self.assertRaises(RuntimeError):
+      dist.call_for_each_tower(run_fn)
+    with dist.scope():
+      dist.call_for_each_tower(run_fn)
+    _assert_in_default_state(self)
+
+  def testScope(self):
+    _assert_in_default_state(self)
+    dist = _TestStrategy()
+    with dist.scope():
+      self.assertIs(None, distribute.get_tower_context())
+      self.assertIs(dist, distribute.get_cross_tower_context())
+      self.assertTrue(distribute.has_distribution_strategy())
+      self.assertIs(dist, distribute.get_distribution_strategy())
+      self.assertEqual("baz", variable_scope.variable(1.0, name="baz"))
+    _assert_in_default_state(self)
+
+
+class DefaultDistributionStrategyTest(test.TestCase):
+
+  def testMergeCall(self):
+    _assert_in_default_state(self)
+
+    def merge_fn(dist, s):
+      self.assertIs(distribute._default_distribution_strategy, dist)
+      self.assertIs(None, distribute.get_tower_context())
+      self.assertIs(dist, distribute.get_cross_tower_context())
+      self.assertIs(dist, distribute.get_distribution_strategy())
+      self.assertFalse(distribute.has_distribution_strategy())
+      return "foo_" + s
+
+    tower_ctx = distribute.get_tower_context()
+    self.assertIs(distribute._default_tower_context, tower_ctx)
+    self.assertEqual("foo_bar", tower_ctx.merge_call(merge_fn, "bar"))
+    _assert_in_default_state(self)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/training/input.py b/tensorflow/python/training/input.py
index 44f00a96deff64012705c4c81b185a9c4fac2295..caa26581e8a0041dd1b157ab6b1f8236344582e8 100644
--- a/tensorflow/python/training/input.py
+++ b/tensorflow/python/training/input.py
@@ -515,8 +515,7 @@ def _store_sparse_tensors(tensor_list, enqueue_many, keep_input,
     def _sparse_values_to_keep(t, keep_input):
       """Convert a per-row `keep_input` vector to a per-value one."""
       # Get the rows of every value in the sparse Tensor.
-      row_values = array_ops.reshape(
-          t.indices, [array_ops.shape(t.indices)[0], -1])[:, 0]
+      row_values = t.indices[:, 0]
       # The value should be kept iff the row should be kept.
       return array_ops.gather(keep_input, row_values)
     if keep_input.shape.ndims == 1:
diff --git a/tensorflow/python/training/monitored_session.py b/tensorflow/python/training/monitored_session.py
index 6c5c9e01a76d539b550420134b09090b89beed46..4ce6f6d00267410626f7d7a9e2251d3f40b6bb6e 100644
--- a/tensorflow/python/training/monitored_session.py
+++ b/tensorflow/python/training/monitored_session.py
@@ -281,13 +281,14 @@ def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
                              scaffold=None,
                              hooks=None,
                              chief_only_hooks=None,
-                             save_checkpoint_secs=600,
+                             save_checkpoint_secs=USE_DEFAULT,
                              save_summaries_steps=USE_DEFAULT,
                              save_summaries_secs=USE_DEFAULT,
                              config=None,
                              stop_grace_period_secs=120,
                              log_step_count_steps=100,
-                             max_wait_secs=7200):
+                             max_wait_secs=7200,
+                             save_checkpoint_steps=USE_DEFAULT):
   """Creates a `MonitoredSession` for training.
 
   For a chief, this utility sets proper session initializer/restorer. It also
@@ -310,8 +311,10 @@ def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
     chief_only_hooks: list of `SessionRunHook` objects. Activate these hooks if
       `is_chief==True`, ignore otherwise.
     save_checkpoint_secs: The frequency, in seconds, that a checkpoint is saved
-      using a default checkpoint saver. If `save_checkpoint_secs` is set to
-      `None`, then the default checkpoint saver isn't used.
+      using a default checkpoint saver. If both `save_checkpoint_steps` and
+      `save_checkpoint_secs` are set to `None`, then the default checkpoint
+      saver isn't used. If both are provided, then only `save_checkpoint_secs`
+      is used. Default 600.
     save_summaries_steps: The frequency, in number of global steps, that the
       summaries are written to disk using a default summary saver. If both
       `save_summaries_steps` and `save_summaries_secs` are set to `None`, then
@@ -330,6 +333,11 @@ def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
       become available. This should be kept relatively short to help detect
       incorrect code, but sometimes may need to be increased if the chief takes
       a while to start up.
+    save_checkpoint_steps: The frequency, in number of global steps, that a
+      checkpoint is saved using a default checkpoint saver. If both
+      `save_checkpoint_steps` and `save_checkpoint_secs` are set to `None`, then
+      the default checkpoint saver isn't used. If both are provided, then only
+      `save_checkpoint_secs` is used. Default not enabled.
 
   Returns:
     A `MonitoredSession` object.
@@ -342,6 +350,15 @@ def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
   elif save_summaries_steps == USE_DEFAULT:
     save_summaries_steps = None
 
+  if (save_checkpoint_steps == USE_DEFAULT and
+      save_checkpoint_secs == USE_DEFAULT):
+    save_checkpoint_steps = None
+    save_checkpoint_secs = 600
+  elif save_checkpoint_secs == USE_DEFAULT:
+    save_checkpoint_secs = None
+  elif save_checkpoint_steps == USE_DEFAULT:
+    save_checkpoint_steps = None
+
   scaffold = scaffold or Scaffold()
   if not is_chief:
     session_creator = WorkerSessionCreator(
@@ -374,9 +391,13 @@ def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
           save_steps=save_summaries_steps,
           save_secs=save_summaries_secs,
           output_dir=checkpoint_dir))
-    if save_checkpoint_secs and save_checkpoint_secs > 0:
+    if (save_checkpoint_secs and save_checkpoint_secs > 0) or (
+        save_checkpoint_steps and save_checkpoint_steps > 0):
       all_hooks.append(basic_session_run_hooks.CheckpointSaverHook(
-          checkpoint_dir, save_secs=save_checkpoint_secs, scaffold=scaffold))
+          checkpoint_dir,
+          save_steps=save_checkpoint_steps,
+          save_secs=save_checkpoint_secs,
+          scaffold=scaffold))
 
   if hooks:
     all_hooks.extend(hooks)
diff --git a/tensorflow/python/training/monitored_session_test.py b/tensorflow/python/training/monitored_session_test.py
index 159b2d5c1605bdd95303efb25690f55a54a3625d..3806056f01a73d21faf3de4539c0dd1ada5f96f8 100644
--- a/tensorflow/python/training/monitored_session_test.py
+++ b/tensorflow/python/training/monitored_session_test.py
@@ -282,6 +282,42 @@ class MonitoredTrainingSessionTest(test.TestCase):
           is_chief=True, checkpoint_dir=logdir) as session:
         self.assertEqual(2, session.run(gstep))
 
+  def test_save_checkpoint_steps(self):
+    logdir = _test_dir(self.get_temp_dir(), 'test_save_checkpoint_steps')
+    with ops.Graph().as_default():
+      gstep = variables_lib.get_or_create_global_step()
+      new_gstep = state_ops.assign_add(gstep, 1)
+      with monitored_session.MonitoredTrainingSession(
+          is_chief=True,
+          checkpoint_dir=logdir,
+          save_checkpoint_steps=100,
+          log_step_count_steps=10) as session:
+        for _ in range(100):
+          session.run(new_gstep)
+      # A restart will find the checkpoint and recover automatically.
+      with monitored_session.MonitoredTrainingSession(
+          is_chief=True, checkpoint_dir=logdir) as session:
+        self.assertEqual(100, session.run(gstep))
+
+  def test_save_checkpoint_secs(self):
+    logdir = _test_dir(self.get_temp_dir(), 'test_save_checkpoint_secs')
+    with ops.Graph().as_default():
+      gstep = variables_lib.get_or_create_global_step()
+      new_gstep = state_ops.assign_add(gstep, 1)
+      with monitored_session.MonitoredTrainingSession(
+          is_chief=True,
+          checkpoint_dir=logdir,
+          save_checkpoint_secs=0.1,
+          log_step_count_steps=10) as session:
+        session.run(new_gstep)
+        time.sleep(0.2)
+        for _ in range(10):
+          session.run(new_gstep)
+      # A restart will find the checkpoint and recover automatically.
+      with monitored_session.MonitoredTrainingSession(
+          is_chief=True, checkpoint_dir=logdir) as session:
+        self.assertEqual(11, session.run(gstep))
+
   def test_summaries_steps(self):
     logdir = _test_dir(self.get_temp_dir(), 'test_summaries_steps')
     with ops.Graph().as_default():
diff --git a/tensorflow/python/training/moving_averages.py b/tensorflow/python/training/moving_averages.py
index b9ecb27df19d051c28ec1c3fe3cd9fd86717a5ed..61fc828a840c490b0f787119134a0941f60f947a 100644
--- a/tensorflow/python/training/moving_averages.py
+++ b/tensorflow/python/training/moving_averages.py
@@ -52,16 +52,19 @@ def assign_moving_average(variable, value, decay, zero_debias=True, name=None):
   they were created in and the scope of the variables they debias. They are also
   given a uniqifying-suffix.
 
-  Ex:
+  E.g.:
+
+  ```
     with tf.variable_scope('scope1'):
       with tf.variable_scope('scope2'):
         var = tf.get_variable('foo')
-        assign_moving_average(var, 0.0, 1.0)
-        assign_moving_average(var, 0.0, 0.9)
+        tf.assign_moving_average(var, 0.0, 1.0)
+        tf.assign_moving_average(var, 0.0, 0.9)
 
-    var.name: 'scope1/scope2/foo'
-    shadow var names: 'scope1/scope2/scope1/scope2/foo/biased'
-                      'scope1/scope2/scope1/scope2/foo/biased_1'
+    # var.name: 'scope1/scope2/foo'
+    # shadow var names: 'scope1/scope2/scope1/scope2/foo/biased'
+    #                   'scope1/scope2/scope1/scope2/foo/biased_1'
+  ```
 
   Args:
     variable: A Variable.
diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py
index 7adaedef5b6a01e195a8c0c4a8d77024b4c46c0e..75665fc2840797dd53dd863721d2744cb1b08af5 100644
--- a/tensorflow/python/training/optimizer.py
+++ b/tensorflow/python/training/optimizer.py
@@ -35,11 +35,28 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.training import checkpointable
+from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.training import slot_creator
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
 
+def get_filtered_grad_fn(grad_fn):
+  # `distributed_context.join()` requires that its arguments are parallel
+  # across threads, and in particular that `grads_and_vars` has the same
+  # variables in the same order.
+
+  # When computing gradients in eager mode with multiple threads, you
+  # can get extra variables with a gradient of `None`. This happens when
+  # those variables are accessed in another thread during the gradient
+  # computation. To get a consistent set of variables, we filter out
+  # those with `None` gradients.
+  def filtered_grad_fn(x=None):
+    return [(g, v) for g, v in grad_fn(x) if g is not None]
+
+  return filtered_grad_fn
+
+
 def _deduplicate_indexed_slices(values, indices):
   """Sums `values` associated with any non-unique `indices`.
 
@@ -61,7 +78,7 @@ def _deduplicate_indexed_slices(values, indices):
 
 def _var_key(var):
   if context.executing_eagerly():
-    return var._shared_name  # pylint: disable=protected-access
+    return var._unique_id  # pylint: disable=protected-access
   return (var.op.graph, var.op.name)
 
 
@@ -191,6 +208,10 @@ def _get_processor(v):
       return _TensorProcessor(v)
     else:
       return _DenseResourceVariableProcessor(v)
+  if isinstance(
+      v, resource_variable_ops.ResourceVariable) and not v._in_graph_mode:  # pylint: disable=protected-access
+    # True if and only if `v` was initialized eagerly.
+    return _DenseResourceVariableProcessor(v)
   if v.op.type == "VarHandleOp":
     return _DenseResourceVariableProcessor(v)
   if isinstance(v, variables.Variable):
@@ -331,6 +352,13 @@ class Optimizer(
     #   ... }
     self._deferred_slot_restorations = {}
 
+    # TODO(isaprykin): When using a DistributionStrategy, and when an
+    # optimizer is created in each tower, it might be dangerous to
+    # rely on some Optimer methods.  When such methods are called on a
+    # per-tower optimizer, an exception needs to be thrown.  We do
+    # allow creation per-tower optimizers however, because the
+    # compute_gradients()->apply_gradients() sequence is safe.
+
   def get_name(self):
     return self._name
 
@@ -443,14 +471,33 @@ class Optimizer(
         if var_list is not None:
           tape.watch(var_list)
         loss_value = loss()
+
+        # Scale loss if using a "mean" loss reduction and multiple towers.
+        # Have to be careful to call distribute_lib.get_loss_reduction()
+        # *after* loss() is evaluated, so we know what loss reduction it uses.
+        # TODO(josh11b): Test that we handle weight decay in a reasonable way.
+        if distribute_lib.get_loss_reduction() == "mean":
+          num_towers = distribute_lib.get_distribution_strategy().num_towers
+          if num_towers > 1:
+            loss_value *= (1. / num_towers)
+
       if var_list is None:
         var_list = tape.watched_variables()
       grads = tape.gradient(loss_value, var_list, grad_loss)
       return list(zip(grads, var_list))
+
+    # Non-callable/Tensor loss case
     if context.executing_eagerly():
       raise RuntimeError(
           "`loss` passed to Optimizer.compute_gradients should "
           "be a function when eager execution is enabled.")
+
+    # Scale loss if using a "mean" loss reduction and multiple towers.
+    if distribute_lib.get_loss_reduction() == "mean":
+      num_towers = distribute_lib.get_distribution_strategy().num_towers
+      if num_towers > 1:
+        loss *= (1. / num_towers)
+
     if gate_gradients not in [Optimizer.GATE_NONE, Optimizer.GATE_OP,
                               Optimizer.GATE_GRAPH]:
       raise ValueError("gate_gradients must be one of: Optimizer.GATE_NONE, "
@@ -506,11 +553,25 @@ class Optimizer(
     Raises:
       TypeError: If `grads_and_vars` is malformed.
       ValueError: If none of the variables have gradients.
+      RuntimeError: If you should use `_distributed_apply()` instead.
     """
     # This is a default implementation of apply_gradients() that can be shared
     # by most optimizers.  It relies on the subclass implementing the following
     # methods: _create_slots(), _prepare(), _apply_dense(), and _apply_sparse().
 
+    # Handle DistributionStrategy case.
+    if distribute_lib.get_cross_tower_context():
+      raise RuntimeError("Use `_distributed_apply()` instead of "
+                         "`apply_gradients()` in a cross-tower context.")
+    # TODO(isaprykin): Get rid of `has_distribution_strategy()` check by
+    # always calling _distributed_apply(), using the default distribution
+    # as needed.
+    if distribute_lib.has_distribution_strategy():
+      grads_and_vars = get_filtered_grad_fn(lambda _: grads_and_vars)()
+      return distribute_lib.get_tower_context().merge_call(
+          self._distributed_apply, grads_and_vars, global_step, name)
+
+    # No DistributionStrategy case.
     grads_and_vars = tuple(grads_and_vars)  # Make sure repeat iteration works.
     if not grads_and_vars:
       raise ValueError("No variables provided.")
@@ -546,7 +607,12 @@ class Optimizer(
         # We colocate all ops created in _apply_dense or _apply_sparse
         # on the same device as the variable.
         # TODO(apassos): figure out how to get the variable name here.
-        scope_name = "" if context.executing_eagerly() else var.op.name
+        if context.executing_eagerly() or isinstance(
+            var,
+            resource_variable_ops.ResourceVariable) and not var._in_graph_mode:  # pylint: disable=protected-access
+          scope_name = ""
+        else:
+          scope_name = var.op.name
         with ops.name_scope("update_" + scope_name), ops.colocate_with(var):
           update_ops.append(processor.update_op(self, grad))
       if global_step is None:
@@ -573,6 +639,95 @@ class Optimizer(
 
       return apply_updates
 
+  def _distributed_apply(self,
+                         distribution,
+                         grads_and_vars,
+                         global_step=None,
+                         name=None):
+    """A version of `apply_gradients` for cross-tower context.
+
+    This is a version of `apply_gradients()` for when you are using a
+    `DistributionStrategy` and are in a cross-tower context. If in a
+    tower context, use `apply_gradients()` as normal.
+
+    Args:
+      distribution: A `DistributionStrategy` object.
+      grads_and_vars: List of (gradient, variable) pairs as returned by
+        `compute_gradients()`, and then aggregated across towers.
+      global_step: Optional (mirrored) `Variable` to increment by one
+        after the variables have been updated.
+      name: Optional name for the returned operation.  Default to the
+        name passed to the `Optimizer` constructor.
+
+    Returns:
+      An `Operation` that applies the specified gradients across all
+      towers. If `global_step` was not None, that operation also
+      increments `global_step`.
+    """
+    reduced_grads = distribution.batch_reduce("sum", grads_and_vars)
+    var_list = [v for _, v in grads_and_vars]
+    grads_and_vars = zip(reduced_grads, var_list)
+    # Note that this is called in a cross-tower context.
+    self._create_slots(var_list)
+
+    def update(v, g):
+      """Apply gradients to a replica variable."""
+      assert v is not None
+
+      try:
+        # Convert the grad to Tensor or IndexedSlices if necessary.
+        g = ops.convert_to_tensor_or_indexed_slices(g)
+      except TypeError:
+        raise TypeError("Gradient must be convertible to a Tensor"
+                        " or IndexedSlices, or None: %s" % g)
+      if not isinstance(g, (ops.Tensor, ops.IndexedSlices)):
+        raise TypeError(
+            "Gradient must be a Tensor, IndexedSlices, or None: %s" % g)
+      p = _get_processor(v)
+
+      scope_name = "" if context.executing_eagerly() else v.op.name
+      # device_policy is set because non-mirrored tensors will be read in
+      # `update_op`. `_resource_apply_dense`, `lr_t`, `beta1_t` and `beta2_t`
+      # is an example.
+      with ops.name_scope(
+          "update_" + scope_name), context.context().device_policy(
+              context.DEVICE_PLACEMENT_SILENT):
+        return p.update_op(self, g)
+
+    with ops.name_scope(name, self._name) as name:
+      self._prepare()
+
+      update_ops = [
+          op
+          for grad, var in grads_and_vars
+          for op in distribution.unwrap(distribution.update(var, update, grad))
+      ]
+
+      def finish(self, update_ops):
+        return self._finish(update_ops, "update")
+
+      non_slot_devices = distribution.non_slot_devices(var_list)
+      # Device policy is needed because hyperparameter tensors (such as
+      # AdamOptimizer's beta1_t) need to be copied across devices in Eager.
+      with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
+        finish_updates = distribution.update_non_slot(
+            non_slot_devices, finish, self, update_ops)
+      if global_step is None:
+        apply_updates = distribution.group(finish_updates, name=name)
+      else:
+        with ops.control_dependencies(distribution.unwrap(finish_updates)):
+          apply_updates = distribution.group(distribution.update(
+              global_step, state_ops.assign_add, 1, name=name))
+
+      if not context.executing_eagerly():
+        if isinstance(apply_updates, ops.Tensor):
+          apply_updates = apply_updates.op
+        train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
+        if apply_updates not in train_op:
+          train_op.append(apply_updates)
+
+      return apply_updates
+
   def get_slot(self, var, name):
     """Return a slot named `name` created for `var` by the Optimizer.
 
@@ -590,9 +745,25 @@ class Optimizer(
     Returns:
       The `Variable` for the slot if it was created, `None` otherwise.
     """
+    # pylint: disable=protected-access
     named_slots = self._slots.get(name, None)
     if not named_slots:
       return None
+
+    if hasattr(var, "_mirrored_container"):
+      # NOTE: If this isn't patched, then there is no `handle` in
+      # `_resource_apply_dense`.
+      mirrored_container = var._mirrored_container()
+      assert mirrored_container is not None
+      if context.executing_eagerly():
+        key = mirrored_container._unique_id
+      else:
+        key = (mirrored_container.graph, mirrored_container._shared_name)
+      # pylint: enable=protected-access
+      mirrored_slot = named_slots.get(key, None)
+      if mirrored_slot is None: return None
+      return mirrored_slot.get(device=var.device)
+
     return named_slots.get(_var_key(var), None)
 
   def get_slot_names(self):
@@ -636,6 +807,7 @@ class Optimizer(
 
   def _create_non_slot_variable(self, initial_value, name, colocate_with):
     """Add an extra variable, not associated with a slot."""
+    # Recommendation: Use OptimizerV2 if your optimizer uses non-slot variables.
     eager = context.executing_eagerly()
     graph = None if eager else colocate_with.graph
 
@@ -643,7 +815,8 @@ class Optimizer(
     v = self._non_slot_dict.get(key, None)
     if v is None:
       self._maybe_initialize_checkpointable()
-      with ops.colocate_with(colocate_with):
+      distribution_strategy = distribute_lib.get_distribution_strategy()
+      with distribution_strategy.colocate_vars_with(colocate_with):
         if eager:
           restored_initial_value = self._preload_simple_restoration(
               name=name, shape=None)
@@ -685,7 +858,13 @@ class Optimizer(
     return self._get_non_slot_variable(name, graph=graph)
 
   def _get_non_slot_variable(self, name, graph=None):
-    return self._non_slot_dict.get((name, graph), None)
+    non_slot = self._non_slot_dict.get((name, graph), None)
+    if hasattr(non_slot, "_mirrored_container"):
+      # This is a mirrored non-slot.  In order to enable code like `_finish`
+      # to assign to a non-slot, return the current context replica.
+      return non_slot.get()
+    else:
+      return non_slot
 
   def _non_slot_variables(self):
     """Additional variables created by the `Optimizer`.
diff --git a/tensorflow/python/training/quantize_training.i b/tensorflow/python/training/quantize_training.i
index 17ffcd6e0758c9c1bc8bab864b6b7a2a18bc9cbf..fb5e47efa0259d02df3ccf2e9b1430e027f8fcfb 100644
--- a/tensorflow/python/training/quantize_training.i
+++ b/tensorflow/python/training/quantize_training.i
@@ -56,6 +56,11 @@ PyObject* DoQuantizeTrainingOnGraphDefHelper(
 
 %insert("python") %{
 def do_quantize_training_on_graphdef(input_graph, num_bits):
+  """A general quantization scheme is being developed in @{tf.contrib.quantize}.
+
+  Consider using that instead, though since it is in the tf.contrib namespace,
+  it is not subject to backward compatibility guarantees.
+  """
   from tensorflow.core.framework.graph_pb2 import GraphDef
   from tensorflow.python.framework import errors
   with errors.raise_exception_on_not_ok_status() as status:
diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py
index cd1baec3cf493834bde01d8ca2bfef2be9623456..e40b8d22ed2ab0f4c9ff65e953f0f1cf681c8068 100644
--- a/tensorflow/python/training/saver.py
+++ b/tensorflow/python/training/saver.py
@@ -91,17 +91,27 @@ class BaseSaverBuilder(object):
   class SaveSpec(object):
     """Class used to describe tensor slices that need to be saved."""
 
-    def __init__(self, tensor, slice_spec, name):
+    def __init__(self, tensor, slice_spec, name, dtype=None):
       """Creates a `SaveSpec` object.
 
       Args:
         tensor: the tensor to save or callable that produces a tensor to save.
         slice_spec: the slice to be saved. See `Variable.SaveSliceInfo`.
         name: the name to save the tensor under.
+        dtype: The data type of the Tensor. Required if `tensor` is callable.
+          Used for error checking in the restore op.
       """
       self._tensor = tensor
       self.slice_spec = slice_spec
       self.name = name
+      if callable(self._tensor):
+        if dtype is None:
+          raise AssertionError(
+              "When passing a callable `tensor` to a SaveSpec, an explicit "
+              "dtype must be provided.")
+        self.dtype = dtype
+      else:
+        self.dtype = tensor.dtype
 
     @property
     def tensor(self):
@@ -117,14 +127,27 @@ class BaseSaverBuilder(object):
         op: the "producer" object that this class wraps; it produces a list of
           tensors to save.  E.g., a "Variable" object saving its backing tensor.
         specs: a list of SaveSpec, each element of which describes one tensor to
-          save under this object.
+          save under this object. All Tensors must be on the same device.
         name: the name to save the object under.
       """
       self.op = op
       self.specs = specs
       self.name = name
-      # The device of this saveable. All tensors must be on the same device.
-      self.device = specs[0].tensor.device
+      self._device = None
+
+    @property
+    def device(self):
+      """The device for SaveSpec Tensors."""
+      # Note that SaveSpec.tensor runs Tensor-gathering ops when executing
+      # eagerly, making this call potentially very expensive.
+      #
+      # TODO(allenl): Consider another way to gather device information. Lower
+      # priority since this property isn't part of the normal save()/restore()
+      # workflow, but does come up when some alternative builders are passed to
+      # the Saver.
+      if self._device is None:
+        self._device = self.specs[0].tensor.device
+      return self._device
 
     def restore(self, restored_tensors, restored_shapes):
       """Restores this object from 'restored_tensors'.
@@ -148,7 +171,7 @@ class BaseSaverBuilder(object):
     """SaveableObject implementation that handles Variables."""
 
     def __init__(self, var, slice_spec, name):
-      spec = BaseSaverBuilder.SaveSpec(var, slice_spec, name)
+      spec = BaseSaverBuilder.SaveSpec(var, slice_spec, name, dtype=var.dtype)
       super(BaseSaverBuilder.VariableSaveable, self).__init__(var, [spec], name)
 
     def restore(self, restored_tensors, restored_shapes):
@@ -186,7 +209,8 @@ class BaseSaverBuilder(object):
         raise ValueError(
             "Saveable is neither a resource variable nor a read operation."
             " Got: %s" % repr(var))
-      spec = BaseSaverBuilder.SaveSpec(tensor, slice_spec, name)
+      spec = BaseSaverBuilder.SaveSpec(tensor, slice_spec, name,
+                                       dtype=var.dtype)
       super(BaseSaverBuilder.ResourceVariableSaveable, self).__init__(
           var, [spec], name)
 
@@ -295,7 +319,7 @@ class BaseSaverBuilder(object):
               filename_tensor,
               [spec.name],
               [spec.slice_spec],
-              [spec.tensor.dtype])[0])
+              [spec.dtype])[0])
 
     return tensors
   # pylint: enable=unused-argument
@@ -578,9 +602,11 @@ class BaseSaverBuilder(object):
           names_to_saveables[name] = [var]
       elif (isinstance(var, checkpointable.CheckpointableBase)
             and not isinstance(var, variables.Variable)):
+        checkpointable_saveables = [
+            (factory() if callable(factory) else factory)
+            for factory in var._gather_saveables_for_checkpoint().values()]
         names_to_saveables.update(
-            BaseSaverBuilder.OpListToDict(
-                list(var._gather_saveables_for_checkpoint().values())))
+            BaseSaverBuilder.OpListToDict(checkpointable_saveables))
       else:
         if context.executing_eagerly():
           if not isinstance(var, resource_variable_ops.ResourceVariable):
@@ -852,7 +878,7 @@ class BulkSaverBuilder(BaseSaverBuilder):
     restore_specs = []
     for saveable in saveables:
       for spec in saveable.specs:
-        restore_specs.append((spec.name, spec.slice_spec, spec.tensor.dtype))
+        restore_specs.append((spec.name, spec.slice_spec, spec.dtype))
 
     names, slices, dtypes = zip(*restore_specs)
     # Load all tensors onto CPU 0 for compatibility with existing code.
@@ -1922,12 +1948,22 @@ def import_meta_graph(meta_graph_or_file, clear_devices=False,
   else:
     meta_graph_def = meta_graph_or_file
 
-  meta_graph.import_scoped_meta_graph(meta_graph_def,
-                                      clear_devices=clear_devices,
-                                      import_scope=import_scope,
-                                      **kwargs)
+  imported_vars = meta_graph.import_scoped_meta_graph(
+      meta_graph_def,
+      clear_devices=clear_devices,
+      import_scope=import_scope,
+      **kwargs)
+
   if meta_graph_def.HasField("saver_def"):
-    return Saver(saver_def=meta_graph_def.saver_def, name=import_scope)
+    # Infer the scope that is prepended by `import_scoped_meta_graph`.
+    scope = import_scope
+    var_names = list(imported_vars.keys())
+    if var_names:
+      sample_key = var_names[0]
+      sample_var = imported_vars[sample_key]
+      scope = sample_var.name[:-len(sample_key)]
+
+    return Saver(saver_def=meta_graph_def.saver_def, name=scope)
   else:
     if variables._all_saveable_objects():  # pylint: disable=protected-access
       # Return the default saver instance for all graph variables.
@@ -1968,7 +2004,7 @@ def export_meta_graph(filename=None,
     saver_def: `SaverDef` protocol buffer.
     collection_list: List of string keys to collect.
     as_text: If `True`, writes the `MetaGraphDef` as an ASCII proto.
-    graph: The `Graph` to import into. If `None`, use the default graph.
+    graph: The `Graph` to export. If `None`, use the default graph.
     export_scope: Optional `string`. Name scope under which to extract
       the subgraph. The scope name will be striped from the node definitions
       for easy import later into new name scopes. If `None`, the whole graph
diff --git a/tensorflow/python/training/saver_test.py b/tensorflow/python/training/saver_test.py
index 46fe2735be61758913028f49dfdd44af9e9ec5fe..14dda7997948ead7b12dee953a0b2ee3b2ee8fc9 100644
--- a/tensorflow/python/training/saver_test.py
+++ b/tensorflow/python/training/saver_test.py
@@ -35,6 +35,7 @@ from google.protobuf import text_format
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.core.protobuf import queue_runner_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.core.protobuf import saver_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.client import session
@@ -1738,8 +1739,7 @@ class CheckpointStateTest(test.TestCase):
                      os.path.join(save_dir, "./model.ckpt-687529"))
 
 
-# TODO(skyewm): reenable when this works with _USE_C_SHAPES=False
-# @test_util.with_c_api
+@test_util.with_c_api
 class MetaGraphTest(test.TestCase):
 
   def _get_test_dir(self, dirname):
@@ -2165,7 +2165,13 @@ class MetaGraphTest(test.TestCase):
       # Build and run the gradients of the while loop. We use this below to
       # verify that the gradients are correct with an imported MetaGraphDef.
       grad = gradients_impl.gradients([output], [var])
-      with session.Session() as sess:
+      # Turn off constant folding to avoid breaking testNestedControlFlowSerDes.
+      # It appears that a missing control dependency in the gradient graph
+      # causes the fetch node to not be triggered.
+      no_constfold_config = config_pb2.ConfigProto()
+      no_constfold_config.graph_options.rewrite_options.constant_folding = (
+          rewriter_config_pb2.RewriterConfig.OFF)
+      with session.Session(config=no_constfold_config) as sess:
         sess.run(init_op)
         expected_grad_value = sess.run(grad)
 
@@ -2182,7 +2188,7 @@ class MetaGraphTest(test.TestCase):
 
       init_op = variables.global_variables_initializer()
 
-      with session.Session() as sess:
+      with session.Session(config=no_constfold_config) as sess:
         sess.run(init_op)
         actual_grad_value = sess.run(grad)
         self.assertEqual(expected_grad_value, actual_grad_value)
@@ -2335,6 +2341,38 @@ class MetaGraphTest(test.TestCase):
               10, size=[1, 10])
       })
 
+  def testImportIntoImplicitNamescope(self):
+    # Test that we can import a meta graph into an implicit namescope.
+    test_dir = self._get_test_dir("import_into_namescope")
+    filename = os.path.join(test_dir, "ckpt")
+    image = array_ops.placeholder(dtypes.float32, [None, 784], name="image")
+    label = array_ops.placeholder(dtypes.float32, [None, 10], name="label")
+    with session.Session() as sess:
+      weights = variables.Variable(
+          random_ops.random_uniform([784, 10]), name="weights")
+      bias = variables.Variable(array_ops.zeros([10]), name="bias")
+      logit = nn_ops.relu(math_ops.matmul(image, weights) + bias, name="logits")
+      nn_ops.softmax(logit, name="prediction")
+      cost = nn_ops.softmax_cross_entropy_with_logits(labels=label,
+                                                      logits=logit, name="cost")
+      adam.AdamOptimizer().minimize(cost, name="optimize")
+      saver = saver_module.Saver()
+      sess.run(variables.global_variables_initializer())
+      saver.save(sess, filename)
+
+    graph = ops_lib.Graph()
+    with session.Session(graph=graph) as sess:
+      with ops_lib.name_scope("new_model"):
+        new_saver = saver_module.import_meta_graph(
+            filename + ".meta", graph=graph)
+
+      new_saver.restore(sess, filename)
+      sess.run(["new_model/optimize"], {
+          "new_model/image:0": np.random.random([1, 784]),
+          "new_model/label:0": np.random.randint(
+              10, size=[1, 10])
+      })
+
   def testClearDevicesOnImport(self):
     # Test that we import a graph without its devices and run successfully.
     with ops_lib.Graph().as_default():
@@ -2873,11 +2911,11 @@ class _OwnsAVariableSimple(checkpointable.CheckpointableBase):
 class _MirroringSaveable(
     saver_module.BaseSaverBuilder.ResourceVariableSaveable):
 
-  def __init__(self, primary_variable, mirrored_variable):
+  def __init__(self, primary_variable, mirrored_variable, name):
     self._primary_variable = primary_variable
     self._mirrored_variable = mirrored_variable
     super(_MirroringSaveable, self).__init__(
-        self._primary_variable, "", self._primary_variable.name)
+        self._primary_variable, "", name)
 
   def restore(self, restored_tensors, restored_shapes):
     """Restore the same value into both variables."""
@@ -2897,10 +2935,12 @@ class _OwnsMirroredVariables(checkpointable.CheckpointableBase):
         name="mirrored", initializer=15., use_resource=True)
 
   def _gather_saveables_for_checkpoint(self):
-    saveable = _MirroringSaveable(
-        primary_variable=self.non_dep_variable,
-        mirrored_variable=self.mirrored)
-    return {checkpointable.VARIABLE_VALUE_KEY: saveable}
+    def _saveable_factory(name=self.non_dep_variable.name):
+      return _MirroringSaveable(
+          primary_variable=self.non_dep_variable,
+          mirrored_variable=self.mirrored,
+          name=name)
+    return {checkpointable.VARIABLE_VALUE_KEY: _saveable_factory}
 
   # The Saver sorts by name before parsing, so we need a name property.
   @property
@@ -2940,6 +2980,37 @@ class CheckpointableCompatibilityTests(test.TestCase):
       self.assertEqual(42., self.evaluate(v.non_dep_variable))
       self.assertEqual(42., self.evaluate(v.mirrored))
 
+  def testSingleTensorEvaluation(self):
+
+    class _CountingSaveable(saver_module.BaseSaverBuilder.SaveableObject):
+
+      def __init__(self, name):
+        self.eval_count = 0
+        def _tensor():
+          self.eval_count += 1
+          return constant_op.constant([1.])
+        dummy_op = constant_op.constant([2.])
+        super(_CountingSaveable, self).__init__(
+            dummy_op,
+            [saver_module.BaseSaverBuilder.SaveSpec(
+                _tensor, "", name, dtype=dummy_op.dtype)],
+            name)
+
+      def restore(self, restored_tensors, restored_shapes):
+        """Restore the same value into both variables."""
+        pass
+
+    with context.eager_mode():
+      v = _CountingSaveable("foo")
+      saver = saver_module.Saver(var_list=[v])
+      test_dir = self.get_temp_dir()
+      prefix = os.path.join(test_dir, "ckpt")
+      with self.test_session() as sess:
+        save_path = saver.save(sess, prefix)
+        self.assertEqual(1, v.eval_count)
+        saver.restore(sess, save_path)
+        self.assertEqual(1, v.eval_count)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/training/session_manager.py b/tensorflow/python/training/session_manager.py
index 360e02fb44c1062f71bb50449b9ef381510a9c69..a00ceb90211e371c3b2f2b32f2042d1556158595 100644
--- a/tensorflow/python/training/session_manager.py
+++ b/tensorflow/python/training/session_manager.py
@@ -229,10 +229,14 @@ class SessionManager(object):
     up to `max_wait_secs`, for recovery to succeed.
 
     If the model cannot be recovered successfully then it is initialized by
-    either running the provided `init_op`, or calling the provided `init_fn`.
-    The local_init_op is also run after init_op and init_fn, regardless of
+    running the `init_op` and calling `init_fn` if they are provided.
+    The `local_init_op` is also run after init_op and init_fn, regardless of
     whether the model was recovered successfully, but only if
-    ready_for_local_init_op passes.
+    `ready_for_local_init_op` passes.
+
+    If the model is recovered from a checkpoint it is assumed that all
+    global variables have been initialized, in particular neither `init_op`
+    nor `init_fn` will be executed.
 
     It is an error if the model cannot be recovered and no `init_op`
     or `init_fn` or `local_init_op` are passed.
diff --git a/tensorflow/python/training/slot_creator.py b/tensorflow/python/training/slot_creator.py
index 9ac52dd0715d7ed15e2e57ed286be973614b01e5..258a6f045d7c1b491ce00bdf8dd0ae6ad500ba68 100644
--- a/tensorflow/python/training/slot_creator.py
+++ b/tensorflow/python/training/slot_creator.py
@@ -40,12 +40,12 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.eager import context
-from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
+from tensorflow.python.training import distribute as distribute_lib
 
 
 def _create_slot_var(primary, val, scope, validate_shape, shape, dtype):
@@ -112,7 +112,8 @@ def create_slot(primary, val, name, colocate_with_primary=True):
     prefix = primary.op.name
   with variable_scope.variable_scope(None, prefix + "/" + name):
     if colocate_with_primary:
-      with ops.colocate_with(primary):
+      distribution_strategy = distribute_lib.get_distribution_strategy()
+      with distribution_strategy.colocate_vars_with(primary):
         return _create_slot_var(primary, val, "", validate_shape, None, None)
     else:
       return _create_slot_var(primary, val, "", validate_shape, None, None)
@@ -148,7 +149,8 @@ def create_slot_with_initializer(primary, initializer, shape, dtype, name,
     prefix = primary.op.name
   with variable_scope.variable_scope(None, prefix + "/" + name):
     if colocate_with_primary:
-      with ops.colocate_with(primary):
+      distribution_strategy = distribute_lib.get_distribution_strategy()
+      with distribution_strategy.colocate_vars_with(primary):
         return _create_slot_var(primary, initializer, "", validate_shape, shape,
                                 dtype)
     else:
diff --git a/tensorflow/python/training/training.py b/tensorflow/python/training/training.py
index 6880cfc4db9b05c02df56c4c1f26f2fea1a7098c..b759b156d78cf8d869b49375058cc7ed42e82b34 100644
--- a/tensorflow/python/training/training.py
+++ b/tensorflow/python/training/training.py
@@ -31,6 +31,7 @@ See the @{$python/train} guide.
 @@custom_gradient
 @@gradients
 @@AggregationMethod
+@@GradientTape
 @@stop_gradient
 @@hessians
 @@clip_by_value
diff --git a/tensorflow/python/training/training_util.py b/tensorflow/python/training/training_util.py
index 4f1abccc96ff95b6fcf9fe82c7a5d45fc2fd1c0c..d05e1d2c830b2aa7008c9cba9f28eb6230d8bc82 100644
--- a/tensorflow/python/training/training_util.py
+++ b/tensorflow/python/training/training_util.py
@@ -18,7 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import graph_io
@@ -31,7 +30,6 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import tf_export
 
-
 # Picked a long key value to minimize the chance of collision with user defined
 # collection keys.
 GLOBAL_STEP_READ_KEY = 'global_step_read_op_cache'
@@ -170,8 +168,7 @@ def assert_global_step(global_step_tensor):
   """
   if not (isinstance(global_step_tensor, variables.Variable) or
           isinstance(global_step_tensor, ops.Tensor) or
-          isinstance(global_step_tensor,
-                     resource_variable_ops.ResourceVariable)):
+          resource_variable_ops.is_resource_variable(global_step_tensor)):
     raise TypeError(
         'Existing "global_step" must be a Variable or Tensor: %s.' %
         global_step_tensor)
diff --git a/tensorflow/python/user_ops/user_ops.py b/tensorflow/python/user_ops/user_ops.py
index 6f9b5d92bb2ea662c9a5af279f0fcf71f0efccc5..20ea3b0f621dc74bd3778d565f8897e47a881d42 100644
--- a/tensorflow/python/user_ops/user_ops.py
+++ b/tensorflow/python/user_ops/user_ops.py
@@ -23,8 +23,10 @@ from tensorflow.python.ops import gen_user_ops as _gen_user_ops
 
 # go/tf-wildcard-import
 from tensorflow.python.ops.gen_user_ops import *  # pylint: disable=wildcard-import
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('user_ops.my_fact')
 def my_fact():
   """Example of overriding the generated code for an Op."""
   return _gen_user_ops.fact()
diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py
index 23c2c48f4b5a165bd6e356a6243b234619af1c4c..5622431bc9974fcc7f355472618ee8b59863764c 100644
--- a/tensorflow/python/util/nest.py
+++ b/tensorflow/python/util/nest.py
@@ -60,15 +60,7 @@ def _is_namedtuple(instance, strict=False):
   Returns:
     True if `instance` is a `namedtuple`.
   """
-  # Attemp to limit the test to plain namedtuple (not stuff inheriting from it).
-  if not isinstance(instance, tuple):
-    return False
-  if strict and instance.__class__.__base__ != tuple:
-    return False
-  return (
-      hasattr(instance, "_fields") and
-      isinstance(instance._fields, _collections.Sequence) and
-      all(isinstance(f, _six.string_types) for f in instance._fields))
+  return _pywrap_tensorflow.IsNamedtuple(instance, strict)
 
 
 def _sequence_like(instance, args):
@@ -157,76 +149,7 @@ def flatten(nest):
 
 def _same_namedtuples(nest1, nest2):
   """Returns True if the two namedtuples have the same name and fields."""
-  if nest1._fields != nest2._fields:
-    return False
-  if nest1.__class__.__name__ != nest2.__class__.__name__:
-    return False
-  return True
-
-
-def _recursive_assert_same_structure(nest1, nest2, check_types):
-  """Helper function for `assert_same_structure`.
-
-  See `assert_same_structure` for further information about namedtuples.
-
-  Args:
-    nest1: An arbitrarily nested structure.
-    nest2: An arbitrarily nested structure.
-    check_types: If `True` (default) types of sequences are checked as
-        well, including the keys of dictionaries. If set to `False`, for example
-        a list and a tuple of objects will look the same if they have the same
-        size. Note that namedtuples with identical name and fields are always
-        considered to have the same shallow structure.
-
-  Returns:
-    True if `nest1` and `nest2` have the same structure.
-
-  Raises:
-    ValueError: If the two structure don't have the same nested structre.
-    TypeError: If the two structure don't have the same sequence type.
-    ValueError: If the two dictionaries don't have the same set of keys.
-  """
-  is_sequence_nest1 = is_sequence(nest1)
-  if is_sequence_nest1 != is_sequence(nest2):
-    raise ValueError(
-        "The two structures don't have the same nested structure.\n\n"
-        "First structure: %s\n\nSecond structure: %s." % (nest1, nest2))
-
-  if not is_sequence_nest1:
-    return  # finished checking
-
-  if check_types:
-    type_nest1 = type(nest1)
-    type_nest2 = type(nest2)
-
-    # Duck-typing means that nest should be fine with two different namedtuples
-    # with identical name and fields.
-    if _is_namedtuple(nest1, True) and _is_namedtuple(nest2, True):
-      if not _same_namedtuples(nest1, nest2):
-        raise TypeError(
-            "The two namedtuples don't have the same sequence type. First "
-            "structure has type %s, while second structure has type %s."
-            % (type_nest1, type_nest2))
-    else:
-      if type_nest1 != type_nest2:
-        raise TypeError(
-            "The two structures don't have the same sequence type. First "
-            "structure has type %s, while second structure has type %s."
-            % (type_nest1, type_nest2))
-
-    if isinstance(nest1, dict):
-      keys1 = set(_six.iterkeys(nest1))
-      keys2 = set(_six.iterkeys(nest2))
-      if keys1 != keys2:
-        raise ValueError(
-            "The two dictionaries don't have the same set of keys. First "
-            "structure has keys {}, while second structure has keys {}."
-            .format(keys1, keys2))
-
-  nest1_as_sequence = [n for n in _yield_value(nest1)]
-  nest2_as_sequence = [n for n in _yield_value(nest2)]
-  for n1, n2 in zip(nest1_as_sequence, nest2_as_sequence):
-    _recursive_assert_same_structure(n1, n2, check_types)
+  return _pywrap_tensorflow.SameNamedtuples(nest1, nest2)
 
 
 def assert_same_structure(nest1, nest2, check_types=True):
@@ -257,14 +180,7 @@ def assert_same_structure(nest1, nest2, check_types=True):
     TypeError: If the two structures differ in the type of sequence in any of
       their substructures. Only possible if `check_types` is `True`.
   """
-  len_nest1 = len(flatten(nest1)) if is_sequence(nest1) else 1
-  len_nest2 = len(flatten(nest2)) if is_sequence(nest2) else 1
-  if len_nest1 != len_nest2:
-    raise ValueError("The two structures don't have the same number of "
-                     "elements.\n\nFirst structure (%i elements): %s\n\n"
-                     "Second structure (%i elements): %s"
-                     % (len_nest1, nest1, len_nest2, nest2))
-  _recursive_assert_same_structure(nest1, nest2, check_types)
+  _pywrap_tensorflow.AssertSameStructure(nest1, nest2, check_types)
 
 
 def flatten_dict_items(dictionary):
diff --git a/tensorflow/python/util/nest_test.py b/tensorflow/python/util/nest_test.py
index 4439d6241ea9607b194cbb17304dbb77dc9f57a8..2f12b25354a905b2aafa870c28f1e9c0b693e888 100644
--- a/tensorflow/python/util/nest_test.py
+++ b/tensorflow/python/util/nest_test.py
@@ -19,11 +19,14 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
+import time
 
 import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
@@ -32,6 +35,9 @@ from tensorflow.python.util import nest
 
 class NestTest(test.TestCase):
 
+  PointXY = collections.namedtuple("Point", ["x", "y"])  # pylint: disable=invalid-name
+
+  @test_util.assert_no_new_pyobjects_executing_eagerly
   def testFlattenAndPack(self):
     structure = ((3, 4), 5, (6, 7, (9, 10), 8))
     flat = ["a", "b", "c", "d", "e", "f", "g", "h"]
@@ -39,8 +45,8 @@ class NestTest(test.TestCase):
     self.assertEqual(
         nest.pack_sequence_as(structure, flat), (("a", "b"), "c",
                                                  ("d", "e", ("f", "g"), "h")))
-    point = collections.namedtuple("Point", ["x", "y"])
-    structure = (point(x=4, y=2), ((point(x=1, y=0),),))
+    structure = (NestTest.PointXY(x=4, y=2),
+                 ((NestTest.PointXY(x=1, y=0),),))
     flat = [4, 2, 1, 0]
     self.assertEqual(nest.flatten(structure), flat)
     restructured_from_flat = nest.pack_sequence_as(structure, flat)
@@ -66,6 +72,7 @@ class NestTest(test.TestCase):
     with self.assertRaises(ValueError):
       nest.pack_sequence_as([5, 6, [7, 8]], ["a", "b", "c"])
 
+  @test_util.assert_no_new_pyobjects_executing_eagerly
   def testFlattenDictOrder(self):
     """`flatten` orders dicts by key, including OrderedDicts."""
     ordered = collections.OrderedDict([("d", 3), ("b", 1), ("a", 0), ("c", 2)])
@@ -87,12 +94,14 @@ class NestTest(test.TestCase):
         ordered_reconstruction)
     self.assertEqual({"d": 3, "b": 1, "a": 0, "c": 2}, plain_reconstruction)
 
+  Abc = collections.namedtuple("A", ("b", "c"))  # pylint: disable=invalid-name
+
+  @test_util.assert_no_new_pyobjects_executing_eagerly
   def testFlattenAndPack_withDicts(self):
     # A nice messy mix of tuples, lists, dicts, and `OrderedDict`s.
-    named_tuple = collections.namedtuple("A", ("b", "c"))
     mess = [
         "z",
-        named_tuple(3, 4),
+        NestTest.Abc(3, 4),
         {
             "c": [
                 1,
@@ -111,7 +120,7 @@ class NestTest(test.TestCase):
 
     structure_of_mess = [
         14,
-        named_tuple("a", True),
+        NestTest.Abc("a", True),
         {
             "c": [
                 0,
@@ -157,6 +166,7 @@ class NestTest(test.TestCase):
       nest.pack_sequence_as(["hello", "world"],
                             ["and", "goodbye", "again"])
 
+  @test_util.assert_no_new_pyobjects_executing_eagerly
   def testIsSequence(self):
     self.assertFalse(nest.is_sequence("1234"))
     self.assertTrue(nest.is_sequence([1, 3, [4, 5]]))
@@ -186,6 +196,23 @@ class NestTest(test.TestCase):
         ValueError, "Key had [0-9]* elements, but value had [0-9]* elements"):
       nest.flatten_dict_items(another_bad_dictionary)
 
+  # pylint does not correctly recognize these as class names and
+  # suggests to use variable style under_score naming.
+  # pylint: disable=invalid-name
+  Named0ab = collections.namedtuple("named_0", ("a", "b"))
+  Named1ab = collections.namedtuple("named_1", ("a", "b"))
+  SameNameab = collections.namedtuple("same_name", ("a", "b"))
+  SameNameab2 = collections.namedtuple("same_name", ("a", "b"))
+  SameNamexy = collections.namedtuple("same_name", ("x", "y"))
+  SameName1xy = collections.namedtuple("same_name_1", ("x", "y"))
+  SameName1xy2 = collections.namedtuple("same_name_1", ("x", "y"))
+  NotSameName = collections.namedtuple("not_same_name", ("a", "b"))
+  # pylint: enable=invalid-name
+
+  class SameNamedType1(SameNameab):
+    pass
+
+  @test_util.assert_no_new_pyobjects_executing_eagerly
   def testAssertSameStructure(self):
     structure1 = (((1, 2), 3), 4, (5, 6))
     structure2 = ((("foo1", "foo2"), "foo3"), "foo4", ("foo5", "foo6"))
@@ -198,23 +225,32 @@ class NestTest(test.TestCase):
 
     with self.assertRaisesRegexp(
         ValueError,
-        ("don't have the same number of elements\\.\n\n"
-         "First structure \\(6 elements\\):.*?"
-         "\n\nSecond structure \\(2 elements\\):")):
+        ("The two structures don't have the same nested structure\\.\n\n"
+         "First structure:.*?\n\n"
+         "Second structure:.*\n\n"
+         "More specifically: Substructure "
+         r'"type=tuple str=\(\(1, 2\), 3\)" is a sequence, while '
+         'substructure "type=str str=spam" is not')):
       nest.assert_same_structure(structure1, structure_different_num_elements)
 
     with self.assertRaisesRegexp(
         ValueError,
-        ("don't have the same number of elements\\.\n\n"
-         "First structure \\(2 elements\\):.*?"
-         "\n\nSecond structure \\(1 elements\\):")):
+        ("The two structures don't have the same nested structure\\.\n\n"
+         "First structure:.*?\n\n"
+         "Second structure:.*\n\n"
+         r'More specifically: Substructure "type=list str=\[0, 1\]" '
+         r'is a sequence, while substructure "type=ndarray str=\[0 1\]" '
+         "is not")):
       nest.assert_same_structure([0, 1], np.array([0, 1]))
 
     with self.assertRaisesRegexp(
         ValueError,
-        ("don't have the same number of elements\\.\n\n"
-         "First structure \\(1 elements\\):.*"
-         "\n\nSecond structure \\(2 elements\\):")):
+        ("The two structures don't have the same nested structure\\.\n\n"
+         "First structure:.*?\n\n"
+         "Second structure:.*\n\n"
+         r'More specifically: Substructure "type=list str=\[0, 1\]" '
+         'is a sequence, while substructure "type=int str=0" '
+         "is not")):
       nest.assert_same_structure(0, [0, 1])
 
     self.assertRaises(TypeError, nest.assert_same_structure, (0, 1), [0, 1])
@@ -225,21 +261,21 @@ class NestTest(test.TestCase):
          "First structure: .*?\n\nSecond structure: ")):
       nest.assert_same_structure(structure1, structure_different_nesting)
 
-    named_type_0 = collections.namedtuple("named_0", ("a", "b"))
-    named_type_1 = collections.namedtuple("named_1", ("a", "b"))
     self.assertRaises(TypeError, nest.assert_same_structure, (0, 1),
-                      named_type_0("a", "b"))
+                      NestTest.Named0ab("a", "b"))
 
-    nest.assert_same_structure(named_type_0(3, 4), named_type_0("a", "b"))
+    nest.assert_same_structure(NestTest.Named0ab(3, 4),
+                               NestTest.Named0ab("a", "b"))
 
     self.assertRaises(TypeError, nest.assert_same_structure,
-                      named_type_0(3, 4), named_type_1(3, 4))
+                      NestTest.Named0ab(3, 4), NestTest.Named1ab(3, 4))
 
     with self.assertRaisesRegexp(
         ValueError,
         ("don't have the same nested structure\\.\n\n"
          "First structure: .*?\n\nSecond structure: ")):
-      nest.assert_same_structure(named_type_0(3, 4), named_type_0([3], 4))
+      nest.assert_same_structure(NestTest.Named0ab(3, 4),
+                                 NestTest.Named0ab([3], 4))
 
     with self.assertRaisesRegexp(
         ValueError,
@@ -258,36 +294,33 @@ class NestTest(test.TestCase):
                                  "don't have the same set of keys"):
       nest.assert_same_structure({"a": 1}, {"b": 1})
 
-    same_name_type_0 = collections.namedtuple("same_name", ("a", "b"))
-    same_name_type_1 = collections.namedtuple("same_name", ("a", "b"))
-    nest.assert_same_structure(same_name_type_0(0, 1), same_name_type_1(2, 3))
+    nest.assert_same_structure(NestTest.SameNameab(0, 1),
+                               NestTest.SameNameab2(2, 3))
 
     # This assertion is expected to pass: two namedtuples with the same
     # name and field names are considered to be identical.
-    same_name_type_2 = collections.namedtuple("same_name_1", ("x", "y"))
-    same_name_type_3 = collections.namedtuple("same_name_1", ("x", "y"))
     nest.assert_same_structure(
-        same_name_type_0(same_name_type_2(0, 1), 2),
-        same_name_type_1(same_name_type_3(2, 3), 4))
+        NestTest.SameNameab(NestTest.SameName1xy(0, 1), 2),
+        NestTest.SameNameab2(NestTest.SameName1xy2(2, 3), 4))
 
     expected_message = "The two structures don't have the same.*"
     with self.assertRaisesRegexp(ValueError, expected_message):
-      nest.assert_same_structure(same_name_type_0(0, same_name_type_1(1, 2)),
-                                 same_name_type_1(same_name_type_0(0, 1), 2))
+      nest.assert_same_structure(
+          NestTest.SameNameab(0, NestTest.SameNameab2(1, 2)),
+          NestTest.SameNameab2(NestTest.SameNameab(0, 1), 2))
 
-    same_name_type_1 = collections.namedtuple("not_same_name", ("a", "b"))
     self.assertRaises(TypeError, nest.assert_same_structure,
-                      same_name_type_0(0, 1), same_name_type_1(2, 3))
+                      NestTest.SameNameab(0, 1), NestTest.NotSameName(2, 3))
 
-    same_name_type_1 = collections.namedtuple("same_name", ("x", "y"))
     self.assertRaises(TypeError, nest.assert_same_structure,
-                      same_name_type_0(0, 1), same_name_type_1(2, 3))
+                      NestTest.SameNameab(0, 1), NestTest.SameNamexy(2, 3))
 
-    class SameNamedType1(collections.namedtuple("same_name", ("a", "b"))):
-      pass
     self.assertRaises(TypeError, nest.assert_same_structure,
-                      same_name_type_0(0, 1), SameNamedType1(2, 3))
+                      NestTest.SameNameab(0, 1), NestTest.SameNamedType1(2, 3))
 
+  EmptyNT = collections.namedtuple("empty_nt", "")  # pylint: disable=invalid-name
+
+  @test_util.assert_no_new_pyobjects_executing_eagerly
   def testMapStructure(self):
     structure1 = (((1, 2), 3), 4, (5, 6))
     structure2 = (((7, 8), 9), 10, (11, 12))
@@ -310,9 +343,8 @@ class NestTest(test.TestCase):
     self.assertEqual((), nest.map_structure(lambda x: x + 1, ()))
     self.assertEqual([], nest.map_structure(lambda x: x + 1, []))
     self.assertEqual({}, nest.map_structure(lambda x: x + 1, {}))
-    empty_nt = collections.namedtuple("empty_nt", "")
-    self.assertEqual(empty_nt(), nest.map_structure(lambda x: x + 1,
-                                                    empty_nt()))
+    self.assertEqual(NestTest.EmptyNT(), nest.map_structure(lambda x: x + 1,
+                                                            NestTest.EmptyNT()))
 
     # This is checking actual equality of types, empty list != empty tuple
     self.assertNotEqual((), nest.map_structure(lambda x: x + 1, []))
@@ -352,10 +384,12 @@ class NestTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, "Only valid keyword argument"):
       nest.map_structure(lambda x: None, structure1, check_types=False, foo="a")
 
+  ABTuple = collections.namedtuple("ab_tuple", "a, b")  # pylint: disable=invalid-name
+
+  @test_util.assert_no_new_pyobjects_executing_eagerly
   def testMapStructureWithStrings(self):
-    ab_tuple = collections.namedtuple("ab_tuple", "a, b")
-    inp_a = ab_tuple(a="foo", b=("bar", "baz"))
-    inp_b = ab_tuple(a=2, b=(1, 3))
+    inp_a = NestTest.ABTuple(a="foo", b=("bar", "baz"))
+    inp_b = NestTest.ABTuple(a=2, b=(1, 3))
     out = nest.map_structure(lambda string, repeats: string * repeats,
                              inp_a,
                              inp_b)
@@ -363,8 +397,8 @@ class NestTest(test.TestCase):
     self.assertEqual("bar", out.b[0])
     self.assertEqual("bazbazbaz", out.b[1])
 
-    nt = ab_tuple(a=("something", "something_else"),
-                  b="yet another thing")
+    nt = NestTest.ABTuple(a=("something", "something_else"),
+                          b="yet another thing")
     rev_nt = nest.map_structure(lambda x: x[::-1], nt)
     # Check the output is the correct structure, and all strings are reversed.
     nest.assert_same_structure(nt, rev_nt)
@@ -431,10 +465,8 @@ class NestTest(test.TestCase):
 
     # This assertion is expected to pass: two namedtuples with the same
     # name and field names are considered to be identical.
-    same_name_type_0 = collections.namedtuple("same_name", ("a", "b"))
-    same_name_type_1 = collections.namedtuple("same_name", ("a", "b"))
-    inp_shallow = same_name_type_0(1, 2)
-    inp_deep = same_name_type_1(1, [1, 2, 3])
+    inp_shallow = NestTest.SameNameab(1, 2)
+    inp_deep = NestTest.SameNameab2(1, [1, 2, 3])
     nest.assert_shallow_structure(inp_shallow, inp_deep, check_types=False)
     nest.assert_shallow_structure(inp_shallow, inp_deep, check_types=True)
 
@@ -466,7 +498,7 @@ class NestTest(test.TestCase):
                      [1, {"c": 2}, 3, (4, 5)])
 
     # Namedtuples.
-    ab_tuple = collections.namedtuple("ab_tuple", "a, b")
+    ab_tuple = NestTest.ABTuple
     input_tree = ab_tuple(a=[0, 1], b=2)
     shallow_tree = ab_tuple(a=0, b=1)
     input_tree_flattened_as_shallow_tree = nest.flatten_up_to(shallow_tree,
@@ -681,5 +713,31 @@ class NestTest(test.TestCase):
           list(nest.flatten_with_joined_string_paths(inputs)), expected)
 
 
+class NestBenchmark(test.Benchmark):
+
+  def run_and_report(self, s1, s2, name):
+    burn_iter, test_iter = 100, 30000
+
+    for _ in xrange(burn_iter):
+      nest.assert_same_structure(s1, s2)
+
+    t0 = time.time()
+    for _ in xrange(test_iter):
+      nest.assert_same_structure(s1, s2)
+    t1 = time.time()
+
+    self.report_benchmark(iters=test_iter, wall_time=(t1 - t0) / test_iter,
+                          name=name)
+
+  def benchmark_assert_structure(self):
+    s1 = (((1, 2), 3), 4, (5, 6))
+    s2 = ((("foo1", "foo2"), "foo3"), "foo4", ("foo5", "foo6"))
+    self.run_and_report(s1, s2, "assert_same_structure_6_elem")
+
+    s1 = (((1, 2), 3), 4, (5, 6)) * 10
+    s2 = ((("foo1", "foo2"), "foo3"), "foo4", ("foo5", "foo6")) * 10
+    self.run_and_report(s1, s2, "assert_same_structure_60_elem")
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/util/util.cc b/tensorflow/python/util/util.cc
index a41fa7df253bcf4bce280574b89ed0dda8330521..70aee4a3f663c862ecb09444866a0294333ee27a 100644
--- a/tensorflow/python/util/util.cc
+++ b/tensorflow/python/util/util.cc
@@ -16,6 +16,7 @@ limitations under the License.
 
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/python/lib/core/safe_ptr.h"
 
 namespace tensorflow {
 namespace swig {
@@ -27,6 +28,113 @@ PyObject* CollectionsSequenceType = nullptr;
 
 bool WarnedThatSetIsNotSequence = false;
 
+bool IsString(PyObject* o) {
+  return PyBytes_Check(o) ||
+#if PY_MAJOR_VERSION < 3
+         PyString_Check(o) ||
+#endif
+         PyUnicode_Check(o);
+}
+
+// Equivalent to Python's 'o.__class__.__name__'
+// Note that '__class__' attribute is set only in new-style classes.
+// A lot of tensorflow code uses __class__ without checks, so it seems like
+// we only support new-style classes.
+StringPiece GetClassName(PyObject* o) {
+  // __class__ is equivalent to type() for new style classes.
+  // type() is equivalent to PyObject_Type()
+  // (https://docs.python.org/3.5/c-api/object.html#c.PyObject_Type)
+  // PyObject_Type() is equivalent to o->ob_type except for Py_INCREF, which
+  // we don't need here.
+  PyTypeObject* type = o->ob_type;
+
+  // __name__ is the value of `tp_name` after the last '.'
+  // (https://docs.python.org/2/c-api/typeobj.html#c.PyTypeObject.tp_name)
+  StringPiece name(type->tp_name);
+  size_t pos = name.rfind('.');
+  if (pos != StringPiece::npos) {
+    name.remove_prefix(pos + 1);
+  }
+  return name;
+}
+
+string PyObjectToString(PyObject* o) {
+  if (o == nullptr) {
+    return "<null object>";
+  }
+  PyObject* str = PyObject_Str(o);
+  if (str) {
+#if PY_MAJOR_VERSION < 3
+    string s(PyString_AS_STRING(str));
+#else
+    string s(PyUnicode_AsUTF8(str));
+#endif
+    Py_DECREF(str);
+    return tensorflow::strings::StrCat("type=", GetClassName(o), " str=", s);
+  } else {
+    return "<failed to execute str() on object>";
+  }
+}
+
+// Implements the same idea as tensorflow.util.nest._yield_value
+// During construction we check if the iterable is a dictionary.
+// If so, we construct a sequence from its sorted keys that will be used
+// for iteration.
+// If not, we construct a sequence directly from the iterable.
+// At each step, we get the next element from the sequence and use it
+// either as a key or return it directly.
+//
+// 'iterable' must not be modified while ValIterator is used.
+class ValIterator {
+ public:
+  explicit ValIterator(PyObject* iterable) : dict_(nullptr), index_(0) {
+    if (PyDict_Check(iterable)) {
+      dict_ = iterable;
+      // PyDict_Keys returns a list, which can be used with
+      // PySequence_Fast_GET_ITEM.
+      seq_ = PyDict_Keys(iterable);
+      // Iterate through dictionaries in a deterministic order by sorting the
+      // keys. Notice this means that we ignore the original order of
+      // `OrderedDict` instances. This is intentional, to avoid potential
+      // bugs caused by mixing ordered and plain dicts (e.g., flattening
+      // a dict but using a corresponding `OrderedDict` to pack it back).
+      PyList_Sort(seq_);
+    } else {
+      seq_ = PySequence_Fast(iterable, "");
+    }
+    size_ = PySequence_Fast_GET_SIZE(seq_);
+  }
+
+  ~ValIterator() { Py_DECREF(seq_); }
+
+  // Return a borrowed reference to the next element from iterable.
+  // Return nullptr when iteration is over.
+  PyObject* next() {
+    PyObject* element = nullptr;
+    if (index_ < size_) {
+      // Both PySequence_Fast_GET_ITEM and PyDict_GetItem return borrowed
+      // references.
+      element = PySequence_Fast_GET_ITEM(seq_, index_);
+      ++index_;
+      if (dict_ != nullptr) {
+        element = PyDict_GetItem(dict_, element);
+        if (element == nullptr) {
+          PyErr_SetString(PyExc_RuntimeError,
+                          "Dictionary was modified during iteration over it");
+          return nullptr;
+        }
+      }
+    }
+    return element;
+  }
+
+ private:
+  PyObject* seq_;
+  PyObject* dict_;
+  Py_ssize_t size_;
+  Py_ssize_t index_;
+};
+
 // Returns 1 if `o` is considered a sequence for the purposes of Flatten().
 // Returns 0 otherwise.
 // Returns -1 if an error occurred.
@@ -38,7 +146,7 @@ int IsSequenceHelper(PyObject* o) {
                     "so consider avoiding using them.";
     WarnedThatSetIsNotSequence = true;
   }
-  if (CollectionsSequenceType == nullptr) {
+  if (TF_PREDICT_FALSE(CollectionsSequenceType == nullptr)) {
     PyErr_SetString(
         PyExc_RuntimeError,
         tensorflow::strings::StrCat(
@@ -49,11 +157,7 @@ int IsSequenceHelper(PyObject* o) {
   }
   int is_instance = PyObject_IsInstance(o, CollectionsSequenceType);
   if (is_instance == -1) return -1;
-  return static_cast<int>(is_instance != 0 && !PyBytes_Check(o) &&
-#if PY_MAJOR_VERSION < 3
-                          !PyString_Check(o) &&
-#endif
-                          !PyUnicode_Check(o));
+  return static_cast<int>(is_instance != 0 && !IsString(o));
 }
 
 bool FlattenHelper(PyObject* nested, PyObject* list) {
@@ -75,12 +179,16 @@ bool FlattenHelper(PyObject* nested, PyObject* list) {
       // while the method is running.
       PyObject* key = PyList_GET_ITEM(keys, i);
       PyObject* val = PyDict_GetItem(nested, key);
-      if (Py_EnterRecursiveCall(" in Flatten")) {
+      if (Py_EnterRecursiveCall(" in flatten")) {
         Py_DECREF(keys);
         return false;
       }
-      FlattenHelper(val, list);
+      const bool success = FlattenHelper(val, list);
       Py_LeaveRecursiveCall();
+      if (!success) {
+        Py_DECREF(keys);
+        return false;
+      }
     }
     Py_DECREF(keys);
     return true;
@@ -90,13 +198,159 @@ bool FlattenHelper(PyObject* nested, PyObject* list) {
   PyObject* item;
   PyObject* iterator = PyObject_GetIter(nested);
   while ((item = PyIter_Next(iterator)) != nullptr) {
-    FlattenHelper(item, list);
+    if (Py_EnterRecursiveCall(" in flatten")) {
+      Py_DECREF(iterator);
+      Py_DECREF(item);
+      return false;
+    }
+    bool success = FlattenHelper(item, list);
+    Py_LeaveRecursiveCall();
+    if (!success) {
+      Py_DECREF(iterator);
+      Py_DECREF(item);
+      return false;
+    }
     Py_DECREF(item);
   }
   Py_DECREF(iterator);
   return true;
 }
 
+// Sets error using keys of 'dict1' and 'dict2'.
+// 'dict1' and 'dict2' are assumed to be Python dictionaries.
+void SetDifferentKeysError(PyObject* dict1, PyObject* dict2, string* error_msg,
+                           bool* is_type_error) {
+  PyObject* k1 = PyDict_Keys(dict1);
+  PyObject* k2 = PyDict_Keys(dict2);
+  *is_type_error = false;
+  *error_msg = tensorflow::strings::StrCat(
+      "The two dictionaries don't have the same set of keys. "
+      "First structure has keys ",
+      PyObjectToString(k1), ", while second structure has keys ",
+      PyObjectToString(k2));
+  Py_DECREF(k1);
+  Py_DECREF(k2);
+}
+
+// Returns true iff there were no "internal" errors. In other words,
+// errors that has nothing to do with structure checking.
+// If an "internal" error occured, the appropriate Python error will be
+// set and the caller can propage it directly to the user.
+//
+// Both `error_msg` and `is_type_error` must be non-null. `error_msg` must
+// be empty.
+// Leaves `error_msg` empty if structures matched. Else, fills `error_msg`
+// with appropriate error and sets `is_type_error` to true iff
+// the error to be raised should be TypeError.
+bool AssertSameStructureHelper(PyObject* o1, PyObject* o2, bool check_types,
+                               string* error_msg, bool* is_type_error) {
+  DCHECK(error_msg);
+  DCHECK(is_type_error);
+  const bool is_seq1 = IsSequence(o1);
+  const bool is_seq2 = IsSequence(o2);
+  if (PyErr_Occurred()) return false;
+  if (is_seq1 != is_seq2) {
+    string seq_str = is_seq1 ? PyObjectToString(o1) : PyObjectToString(o2);
+    string non_seq_str = is_seq1 ? PyObjectToString(o2) : PyObjectToString(o1);
+    *is_type_error = false;
+    *error_msg = tensorflow::strings::StrCat(
+        "Substructure \"", seq_str, "\" is a sequence, while substructure \"",
+        non_seq_str, "\" is not");
+    return true;
+  }
+
+  // Got to scalars, so finished checking. Structures are the same.
+  if (!is_seq1) return true;
+
+  if (check_types) {
+    const PyTypeObject* type1 = o1->ob_type;
+    const PyTypeObject* type2 = o2->ob_type;
+
+    // We treat two different namedtuples with identical name and fields
+    // as having the same type.
+    const PyObject* o1_tuple = IsNamedtuple(o1, true);
+    if (o1_tuple == nullptr) return false;
+    const PyObject* o2_tuple = IsNamedtuple(o2, true);
+    if (o2_tuple == nullptr) {
+      Py_DECREF(o1_tuple);
+      return false;
+    }
+    bool both_tuples = o1_tuple == Py_True && o2_tuple == Py_True;
+    Py_DECREF(o1_tuple);
+    Py_DECREF(o2_tuple);
+
+    if (both_tuples) {
+      const PyObject* same_tuples = SameNamedtuples(o1, o2);
+      if (same_tuples == nullptr) return false;
+      bool not_same_tuples = same_tuples != Py_True;
+      Py_DECREF(same_tuples);
+      if (not_same_tuples) {
+        *is_type_error = true;
+        *error_msg = tensorflow::strings::StrCat(
+            "The two namedtuples don't have the same sequence type. "
+            "First structure ",
+            PyObjectToString(o1), " has type ", type1->tp_name,
+            ", while second structure ", PyObjectToString(o2), " has type ",
+            type2->tp_name);
+        return true;
+      }
+    } else if (type1 != type2) {
+      *is_type_error = true;
+      *error_msg = tensorflow::strings::StrCat(
+          "The two namedtuples don't have the same sequence type. "
+          "First structure ",
+          PyObjectToString(o1), " has type ", type1->tp_name,
+          ", while second structure ", PyObjectToString(o2), " has type ",
+          type2->tp_name);
+      return true;
+    }
+
+    if (PyDict_Check(o1)) {
+      if (PyDict_Size(o1) != PyDict_Size(o2)) {
+        SetDifferentKeysError(o1, o2, error_msg, is_type_error);
+        return true;
+      }
+
+      PyObject* key;
+      Py_ssize_t pos = 0;
+      while (PyDict_Next(o1, &pos, &key, nullptr)) {
+        if (PyDict_GetItem(o2, key) == nullptr) {
+          SetDifferentKeysError(o1, o2, error_msg, is_type_error);
+          return true;
+        }
+      }
+    }
+  }
+
+  ValIterator iter1(o1);
+  ValIterator iter2(o2);
+
+  while (true) {
+    PyObject* v1 = iter1.next();
+    PyObject* v2 = iter2.next();
+    if (v1 != nullptr && v2 != nullptr) {
+      if (Py_EnterRecursiveCall(" in assert_same_structure")) {
+        return false;
+      }
+      bool no_internal_errors = AssertSameStructureHelper(
+          v1, v2, check_types, error_msg, is_type_error);
+      Py_LeaveRecursiveCall();
+      if (!no_internal_errors) return false;
+      if (!error_msg->empty()) return true;
+    } else if (v1 == nullptr && v2 == nullptr) {
+      // Done with all recursive calls. Structure matched.
+      return true;
+    } else {
+      *is_type_error = false;
+      *error_msg = tensorflow::strings::StrCat(
+          "The two structures don't have the same number of elements. ",
+          "First structure: ", PyObjectToString(o1),
+          ". Second structure: ", PyObjectToString(o2));
+      return true;
+    }
+  }
+}
+
 }  // anonymous namespace
 
 void RegisterSequenceClass(PyObject* sequence_class) {
@@ -123,5 +377,107 @@ PyObject* Flatten(PyObject* nested) {
     return nullptr;
   }
 }
+
+PyObject* IsNamedtuple(PyObject* o, bool strict) {
+  // Must be subclass of tuple
+  if (!PyTuple_Check(o)) {
+    Py_RETURN_FALSE;
+  }
+
+  // If strict, o.__class__.__base__ must be tuple
+  if (strict) {
+    PyObject* klass = PyObject_GetAttrString(o, "__class__");
+    if (klass == nullptr) return nullptr;
+    PyObject* base = PyObject_GetAttrString(klass, "__base__");
+    Py_DECREF(klass);
+    if (base == nullptr) return nullptr;
+
+    const PyTypeObject* base_type = reinterpret_cast<PyTypeObject*>(base);
+    // built-in object types are singletons
+    bool tuple_base = base_type == &PyTuple_Type;
+    Py_DECREF(base);
+    if (!tuple_base) {
+      Py_RETURN_FALSE;
+    }
+  }
+
+  if (TF_PREDICT_FALSE(CollectionsSequenceType == nullptr)) {
+    PyErr_SetString(
+        PyExc_RuntimeError,
+        tensorflow::strings::StrCat(
+            "collections.Sequence type has not been set. "
+            "Please call RegisterSequenceClass before using this module")
+            .c_str());
+    return nullptr;
+  }
+
+  // o must have attribute '_fields' and every element in
+  // '_fields' must be a string.
+  int has_fields = PyObject_HasAttrString(o, "_fields");
+  if (!has_fields) {
+    Py_RETURN_FALSE;
+  }
+
+  Safe_PyObjectPtr fields = make_safe(PyObject_GetAttrString(o, "_fields"));
+  int is_instance = PyObject_IsInstance(fields.get(), CollectionsSequenceType);
+  if (is_instance == 0) {
+    Py_RETURN_FALSE;
+  } else if (is_instance == -1) {
+    return nullptr;
+  }
+
+  Safe_PyObjectPtr seq = make_safe(PySequence_Fast(fields.get(), ""));
+  const Py_ssize_t s = PySequence_Fast_GET_SIZE(seq.get());
+  for (Py_ssize_t i = 0; i < s; ++i) {
+    // PySequence_Fast_GET_ITEM returns borrowed ref
+    PyObject* elem = PySequence_Fast_GET_ITEM(seq.get(), i);
+    if (!IsString(elem)) {
+      Py_RETURN_FALSE;
+    }
+  }
+
+  Py_RETURN_TRUE;
+}
+
+PyObject* SameNamedtuples(PyObject* o1, PyObject* o2) {
+  PyObject* f1 = PyObject_GetAttrString(o1, "_fields");
+  PyObject* f2 = PyObject_GetAttrString(o2, "_fields");
+  if (f1 == nullptr || f2 == nullptr) {
+    Py_XDECREF(f1);
+    Py_XDECREF(f2);
+    PyErr_SetString(
+        PyExc_RuntimeError,
+        "Expected namedtuple-like objects (that have _fields attr)");
+    return nullptr;
+  }
+
+  if (PyObject_RichCompareBool(f1, f2, Py_NE)) {
+    Py_RETURN_FALSE;
+  }
+
+  if (GetClassName(o1).compare(GetClassName(o2)) == 0) {
+    Py_RETURN_TRUE;
+  } else {
+    Py_RETURN_FALSE;
+  }
+}
+
+PyObject* AssertSameStructure(PyObject* o1, PyObject* o2, bool check_types) {
+  string error_msg;
+  bool is_type_error = false;
+  AssertSameStructureHelper(o1, o2, check_types, &error_msg, &is_type_error);
+  if (!error_msg.empty()) {
+    PyErr_SetString(
+        is_type_error ? PyExc_TypeError : PyExc_ValueError,
+        tensorflow::strings::StrCat(
+            "The two structures don't have the same nested structure.\n\n",
+            "First structure: ", PyObjectToString(o1), "\n\nSecond structure: ",
+            PyObjectToString(o2), "\n\nMore specifically: ", error_msg)
+            .c_str());
+    return nullptr;
+  }
+  Py_RETURN_NONE;
+}
+
 }  // namespace swig
 }  // namespace tensorflow
diff --git a/tensorflow/python/util/util.h b/tensorflow/python/util/util.h
index 2af71dc753760e7efaf28cc500d5296a31957a04..c325baa5f86820846dd09780b4208667f3aad5e1 100644
--- a/tensorflow/python/util/util.h
+++ b/tensorflow/python/util/util.h
@@ -33,6 +33,57 @@ namespace swig {
 //   dict.
 bool IsSequence(PyObject* o);
 
+// Implements the same interface as tensorflow.util.nest._is_namedtuple
+// Returns Py_True iff `instance` should be considered a `namedtuple`.
+//
+// Args:
+//   instance: An instance of a Python object.
+//   strict: If True, `instance` is considered to be a `namedtuple` only if
+//       it is a "plain" namedtuple. For instance, a class inheriting
+//       from a `namedtuple` will be considered to be a `namedtuple`
+//       iff `strict=False`.
+//
+// Returns:
+//   True if `instance` is a `namedtuple`.
+PyObject* IsNamedtuple(PyObject* o, bool strict);
+
+// Implements the same interface as tensorflow.util.nest._same_namedtuples
+// Returns Py_True iff the two namedtuples have the same name and fields.
+// Raises RuntimeError if `o1` or `o2` don't look like namedtuples (don't have
+// '_fields' attribute).
+PyObject* SameNamedtuples(PyObject* o1, PyObject* o2);
+
+// Asserts that two structures are nested in the same way.
+//
+// Note that namedtuples with identical name and fields are always considered
+// to have the same shallow structure (even with `check_types=True`).
+// For intance, this code will print `True`:
+//
+// ```python
+// def nt(a, b):
+//   return collections.namedtuple('foo', 'a b')(a, b)
+// print(assert_same_structure(nt(0, 1), nt(2, 3)))
+// ```
+//
+// Args:
+//  nest1: an arbitrarily nested structure.
+//  nest2: an arbitrarily nested structure.
+//  check_types: if `true`, types of sequences are checked as
+//      well, including the keys of dictionaries. If set to `false`, for example
+//      a list and a tuple of objects will look the same if they have the same
+//      size. Note that namedtuples with identical name and fields are always
+//      considered to have the same shallow structure.
+//
+// Raises:
+//  ValueError: If the two structures do not have the same number of elements or
+//    if the two structures are not nested in the same way.
+//  TypeError: If the two structures differ in the type of sequence in any of
+//    their substructures. Only possible if `check_types` is `True`.
+//
+// Returns:
+//  Py_None on success, nullptr on error.
+PyObject* AssertSameStructure(PyObject* o1, PyObject* o2, bool check_types);
+
 // Implements the same interface as tensorflow.util.nest.flatten
 //
 // Returns a flat list from a given nested structure.
diff --git a/tensorflow/python/util/util.i b/tensorflow/python/util/util.i
index d69084fc0091ac79cf3f5cf3d70af419cf78f936..b7f201b6fe6fd18af2bb833df2d08bfedb23a185 100644
--- a/tensorflow/python/util/util.i
+++ b/tensorflow/python/util/util.i
@@ -34,6 +34,15 @@ limitations under the License.
 %unignore tensorflow::swig::IsSequence;
 %noexception tensorflow::swig::IsSequence;
 
+%unignore tensorflow::swig::IsNamedtuple;
+%noexception tensorflow::swig::IsNamedtuple;
+
+%unignore tensorflow::swig::SameNamedtuples;
+%noexception tensorflow::swig::SameNamedtuples;
+
+%unignore tensorflow::swig::AssertSameStructure;
+%noexception tensorflow::swig::AssertSameStructure;
+
 %unignore tensorflow::swig::Flatten;
 %noexception tensorflow::swig::Flatten;
 
diff --git a/tensorflow/stream_executor/BUILD b/tensorflow/stream_executor/BUILD
index 1865240014e2da5068a4ef377a5934de62dd54b6..1913fc20ee0212b3d9588828fe4da4ba7ebca030 100644
--- a/tensorflow/stream_executor/BUILD
+++ b/tensorflow/stream_executor/BUILD
@@ -56,7 +56,10 @@ cc_library(
             [
                 "cuda/*.cc",
             ],
-            exclude = ["cuda/cuda_platform_id.cc"],
+            exclude = [
+                "cuda/*_test.cc",
+                "cuda/cuda_platform_id.cc",
+            ],
         ),
     ),
     copts = select({
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 03e3e0857f9f70dcd3062f0766b50f6f75b2fa5e..f408c06f461d1594b469c2e261780aa29ee5b932 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <memory>
 
 #include "third_party/eigen3/Eigen/Core"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/util/env_var.h"
 #include "tensorflow/stream_executor/cuda/cuda_activation.h"
 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
@@ -27,6 +28,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
 #include "tensorflow/stream_executor/cuda/cuda_stream.h"
 #include "tensorflow/stream_executor/cuda/cuda_timer.h"
+#include "tensorflow/stream_executor/cuda/cudnn_version.h"
 #include "tensorflow/stream_executor/dnn.h"
 #include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/lib/error.h"
@@ -55,15 +57,6 @@ NarrowT CheckedNarrowing(const WideT& wide) {
   return narrow;
 }
 
-// Returns the "Compatibility" version number from the CuDNN version number.
-// This is the number that tries to indicate ABI compatibility.
-//
-// For example, if cudnn_version is 5107, the compatibility version
-// number will be 5100.
-size_t cudnnCompatibilityVersion(size_t cudnn_version) {
-  return (cudnn_version / 100) * 100;
-}
-
 }  // namespace
 
 namespace perftools {
@@ -109,6 +102,22 @@ string ToString(cudnnStatus_t status) {
   }
 }
 
+#if CUDNN_VERSION >= 6000
+string ToString(libraryPropertyType type) {
+  switch (type) {
+    case MAJOR_VERSION:
+      return "MAJOR_VERSION";
+    case MINOR_VERSION:
+      return "MINOR_VERSION";
+    case PATCH_LEVEL:
+      return "PATCH_LEVEL";
+    default:
+      return port::StrCat(
+          "<unknown libraryPropertyType: ", static_cast<int>(type), ">");
+  }
+}
+#endif
+
 template <typename T>
 cudnnDataType_t GetCudnnDataType();
 
@@ -360,6 +369,34 @@ cudnnConvolutionBwdFilterAlgo_t ToConvBackwardFilterAlgo(
   }
 }
 
+#if CUDNN_VERSION >= 6000
+port::Status GetCudnnProperty(libraryPropertyType type, int* value) {
+  cudnnStatus_t status = cudnnGetProperty(type, value);
+  if (status != CUDNN_STATUS_SUCCESS) {
+    const string error =
+        port::StrCat("cudnnGetProperty failed for type: ", ToString(type),
+                     " with status: ", ToString(status));
+    LOG(ERROR) << error;
+    return port::Status{port::error::INTERNAL, error};
+  }
+  return port::Status::OK();
+}
+#endif
+
+port::Status GetLoadedCudnnVersion(CudnnVersion* version) {
+#if CUDNN_VERSION >= 6000
+  TF_RETURN_IF_ERROR(GetCudnnProperty(MAJOR_VERSION, &version->major_version));
+  TF_RETURN_IF_ERROR(GetCudnnProperty(MINOR_VERSION, &version->minor_version));
+  TF_RETURN_IF_ERROR(GetCudnnProperty(PATCH_LEVEL, &version->patch_level));
+#else
+  size_t loaded_version = ::cudnnGetVersion();
+  version->major_version = loaded_version / 1000;
+  version->minor_version = (loaded_version / 100) % 10;
+  version->patch_level = loaded_version % 100;
+#endif
+  return port::Status::OK();
+}
+
 }  // namespace
 
 CudnnSupport::CudnnSupport(CUDAExecutor* parent)
@@ -376,24 +413,19 @@ port::Status CudnnSupport::Init() {
   auto status = wrap::cudnnCreate(
       parent_, reinterpret_cast<cudnnHandle_t*>(&dnn_handle_));
   if (status == CUDNN_STATUS_SUCCESS) {
-    // Check whether loaded version of CuDNN matches what the source
-    // was built with.
-    size_t loaded_version = ::cudnnGetVersion();
-    size_t loaded_compat_version = cudnnCompatibilityVersion(loaded_version);
-    size_t compiled_compat_version = cudnnCompatibilityVersion(CUDNN_VERSION);
-    bool library_loaded_matches_source =
-        (loaded_compat_version == compiled_compat_version);
-    if (!library_loaded_matches_source) {
-      const string error =
-          port::StrCat("Loaded runtime CuDNN library: ", loaded_version,
-                       " (compatibility version ", loaded_compat_version,
-                       ") but source was compiled with ", CUDNN_VERSION,
-                       " (compatibility version ", compiled_compat_version,
-                       ").  If using a binary install, upgrade your CuDNN "
-                       "library to match.  If building from sources, "
-                       "make sure the library loaded at runtime matches a "
-                       "compatible version specified during compile "
-                       "configuration.");
+    CudnnVersion source_version(CUDNN_MAJOR, CUDNN_MINOR, CUDNN_PATCHLEVEL);
+
+    CudnnVersion loaded_version;
+    TF_RETURN_IF_ERROR(GetLoadedCudnnVersion(&loaded_version));
+    if (!IsSourceCompatibleWithCudnnLibrary(source_version, loaded_version)) {
+      const tensorflow::string error = port::StrCat(
+          "Loaded runtime CuDNN library: ", loaded_version.ToString(),
+          " but source was compiled with: ", source_version.ToString(),
+          ".  CuDNN library major and minor version needs to match or have "
+          "higher minor version in case of CuDNN 7.0 or later version. If "
+          "using a binary install, upgrade your CuDNN library.  If building "
+          "from sources, make sure the library loaded at runtime is compatible "
+          "with the version specified during compile configuration.");
       LOG(ERROR) << error;
       return port::Status{port::error::INTERNAL, error};
     }
@@ -3157,12 +3189,18 @@ bool CudnnSupport::DoTransformTensor(Stream* stream,
                                      dnn::DataType output_type, float scale,
                                      DeviceMemoryBase* output_data) {
   mutex_lock lock{dnn_handle_mutex_};
+  cudnnStatus_t status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
+                                              AsCUDAStreamValue(stream));
+  if (status != CUDNN_STATUS_SUCCESS) {
+    LOG(FATAL) << "failed to set stream for cudnn handle: " << ToString(status);
+  }
+
   float beta = 0.0f;
   ScopedTensorDescriptor input_tensor_desc(
       parent_, input_desc, ToCudnnDataType(input_type, input_desc.layout()));
   ScopedTensorDescriptor output_tensor_desc(
       parent_, output_desc, ToCudnnDataType(output_type, output_desc.layout()));
-  cudnnStatus_t status = wrap::cudnnTransformTensor(
+  status = wrap::cudnnTransformTensor(
       parent_, ToHandle(dnn_handle_), &scale, input_tensor_desc.handle(),
       input_data.opaque(), &beta, output_tensor_desc.handle(),
       output_data->opaque());
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc
index a017ff64d4c69b6952b442464877dc26a800ad37..58e1e58c593a3d938d97baff2356bce2c215a7a1 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@@ -1503,6 +1503,19 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
   return true;
 }
 
+/* static */ port::StatusOr<int> CUDADriver::GetDeviceAttribute(
+    CUdevice_attribute attribute, CUdevice device) {
+  int val;
+  CUresult res = cuDeviceGetAttribute(&val, attribute, device);
+  if (res != CUDA_SUCCESS) {
+    return port::Status{
+        port::error::INTERNAL,
+        port::Printf("failed to get device attribute %d for device %d: %s",
+                     attribute, device, ToString(res).c_str())};
+  }
+  return val;
+}
+
 /* static */ bool CUDADriver::IsEccEnabled(CUdevice device, bool *result) {
   int value = -1;
   CUresult res =
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.h b/tensorflow/stream_executor/cuda/cuda_driver.h
index 4002ba2021d1a2e2c36bd1786a3084ee8c08bb78..fa9172b3f008d3083309126bbfa4a1ab961030e1 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.h
+++ b/tensorflow/stream_executor/cuda/cuda_driver.h
@@ -400,12 +400,20 @@ class CUDADriver {
 
   // Returns a grab-bag of device properties in a caller-owned device_properties
   // structure for device_ordinal via cuDeviceGetProperties.
-  // This call is deprecated in the NVIDIA driver API.
+  //
+  // This call is deprecated in the NVIDIA driver API; its replacement is
+  // GetDeviceAttribute
   //
   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE__DEPRECATED.html#group__CUDA__DEVICE__DEPRECATED_1g65a5b4e25186bd257df80b98c98cffe6
   static bool GetDeviceProperties(CUdevprop *device_properties,
                                   int device_ordinal);
 
+  // Gets a specific integer-valued property about the given device.
+  //
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
+  static port::StatusOr<int> GetDeviceAttribute(CUdevice_attribute attribute,
+                                                CUdevice device);
+
   // Returns whether ECC is enabled for the given CUdevice via
   // cuDeviceGetattribute with CU_DEVICE_ATTRIBUTE_ECC_ENABLED.
   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index 4bbd531e14f18fc24d87b4fa655fe72e9f56b129..5ecaf46b8cae3c1e1f312816e7e5aec8ff8ce306 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -1103,6 +1103,18 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
     builder.set_device_memory_size(device_memory_size);
   }
 
+  port::StatusOr<int> mem_clock_khz = CUDADriver::GetDeviceAttribute(
+      CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device_ordinal_);
+  port::StatusOr<int> mem_bus_width_bits = CUDADriver::GetDeviceAttribute(
+      CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, device_ordinal_);
+  if (mem_clock_khz.ok() && mem_bus_width_bits.ok()) {
+    // Times 2 because HBM is DDR memory; it gets two data bits per each data
+    // lane.
+    builder.set_memory_bandwidth(2 * int64_t{mem_clock_khz.ValueOrDie()} *
+                                 1000 *
+                                 int64_t{mem_bus_width_bits.ValueOrDie()} / 8);
+  }
+
   {
     BlockDim block_dim_limit;
     FillBlockDimLimit(&block_dim_limit);
diff --git a/tensorflow/stream_executor/cuda/cudnn_version.cc b/tensorflow/stream_executor/cuda/cudnn_version.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5591801aae2526d528289f9b2267d864cf766045
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cudnn_version.cc
@@ -0,0 +1,42 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/stream_executor/cuda/cudnn_version.h"
+
+namespace perftools {
+namespace gputools {
+namespace cuda {
+
+bool IsSourceCompatibleWithCudnnLibrary(CudnnVersion source_version,
+                                        CudnnVersion loaded_version) {
+  // Major version is neither forward or backward compatible and therefore major
+  // versions needs to match between source and library.
+  //
+  // Minor version is backward-compatible beginning with CuDNN 7 and therefore
+  // minor version of library needs to be same or higher.
+  //
+  // Patch releases are always forward and backward compatible and therefore
+  // need not match.
+  if (loaded_version.major_version != source_version.major_version) {
+    return false;
+  }
+  return ((loaded_version.minor_version == source_version.minor_version) ||
+          (source_version.major_version >= 7 &&
+           loaded_version.minor_version >= source_version.minor_version));
+}
+
+}  // namespace cuda
+}  // namespace gputools
+}  // namespace perftools
diff --git a/tensorflow/stream_executor/cuda/cudnn_version.h b/tensorflow/stream_executor/cuda/cudnn_version.h
new file mode 100644
index 0000000000000000000000000000000000000000..2ed02e1700ced5087bfebacb6314cbc8771e3612
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cudnn_version.h
@@ -0,0 +1,52 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDNN_VERSION_H_
+#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDNN_VERSION_H_
+
+#include <string>
+
+#include "tensorflow/core/lib/strings/strcat.h"
+
+namespace perftools {
+namespace gputools {
+namespace cuda {
+
+struct CudnnVersion {
+  CudnnVersion() = default;
+
+  CudnnVersion(int major, int minor, int patch)
+      : major_version(major), minor_version(minor), patch_level(patch) {}
+
+  tensorflow::string ToString() const {
+    return tensorflow::strings::StrCat(major_version, ".", minor_version, ".",
+                                       patch_level);
+  }
+
+  int major_version;
+  int minor_version;
+  int patch_level;
+};
+
+// Returns true if the given source CuDNN version is compatible with the given
+// loaded version.
+bool IsSourceCompatibleWithCudnnLibrary(CudnnVersion source_version,
+                                        CudnnVersion loaded_version);
+
+}  // namespace cuda
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDNN_VERSION_H_
diff --git a/tensorflow/stream_executor/cuda/cudnn_version_test.cc b/tensorflow/stream_executor/cuda/cudnn_version_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..230adafeb112f682b5ece4778921e18a4ad25f87
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cudnn_version_test.cc
@@ -0,0 +1,75 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/stream_executor/cuda/cudnn_version.h"
+
+#include "testing/base/public/gunit.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace perftools {
+namespace gputools {
+namespace cuda {
+namespace {
+
+TEST(CuDNNVersion, ToString) {
+  CudnnVersion version(7, 0, 12);
+  EXPECT_EQ(version.ToString(), "7.0.12");
+}
+
+TEST(IsSourceCompatibleWithCudnnLibraryTest, Basic) {
+  // Returns true if both major and minor versions are matching and even if the
+  // patch versions are not matching.
+  EXPECT_TRUE(IsSourceCompatibleWithCudnnLibrary(
+      /*source_version=*/CudnnVersion(7, 0, 12),
+      /*loaded_version=*/CudnnVersion(7, 0, 14)));
+  EXPECT_TRUE(IsSourceCompatibleWithCudnnLibrary(
+      /*source_version=*/CudnnVersion(6, 1, 14),
+      /*loaded_version=*/CudnnVersion(6, 1, 00)));
+
+  // Returns false if major versions are not matching as they are neither
+  // forward or backward compatible.
+  EXPECT_FALSE(IsSourceCompatibleWithCudnnLibrary(
+      /*source_version=*/CudnnVersion(7, 0, 12),
+      /*loaded_version=*/CudnnVersion(6, 1, 14)));
+  EXPECT_FALSE(IsSourceCompatibleWithCudnnLibrary(
+      /*source_version=*/CudnnVersion(8, 1, 15),
+      /*loaded_version=*/CudnnVersion(7, 0, 14)));
+
+  // Returns true if the loaded version is equal or higher because minor version
+  // are backward compatible with CuDNN version 7.
+  EXPECT_TRUE(IsSourceCompatibleWithCudnnLibrary(
+      /*source_version=*/CudnnVersion(7, 0, 14),
+      /*loaded_version=*/CudnnVersion(7, 1, 14)));
+  EXPECT_TRUE(IsSourceCompatibleWithCudnnLibrary(
+      /*source_version=*/CudnnVersion(7, 0, 14),
+      /*loaded_version=*/CudnnVersion(7, 1, 15)));
+  EXPECT_FALSE(IsSourceCompatibleWithCudnnLibrary(
+      /*source_version=*/CudnnVersion(7, 1, 15),
+      /*loaded_version=*/CudnnVersion(7, 0, 14)));
+
+  // Returns false if minor versions are not matching for version 6. Before
+  // version 7, minor versions are also neither forward or backward compatible.
+  EXPECT_FALSE(IsSourceCompatibleWithCudnnLibrary(
+      /*source_version=*/CudnnVersion(6, 0, 14),
+      /*loaded_version=*/CudnnVersion(6, 1, 15)));
+  EXPECT_FALSE(IsSourceCompatibleWithCudnnLibrary(
+      /*source_version=*/CudnnVersion(6, 1, 14),
+      /*loaded_version=*/CudnnVersion(6, 0, 14)));
+}
+
+}  // namespace
+}  // namespace cuda
+}  // namespace gputools
+}  // namespace perftools
diff --git a/tensorflow/stream_executor/device_description.cc b/tensorflow/stream_executor/device_description.cc
index a98143e34bbb42c3aee76c27e1648c49397a0e44..52f5319a3b16c771ce89843a963841b25df5467e 100644
--- a/tensorflow/stream_executor/device_description.cc
+++ b/tensorflow/stream_executor/device_description.cc
@@ -50,6 +50,7 @@ DeviceDescription::DeviceDescription()
       shared_memory_alloc_granularity_(1),
       device_address_bits_(kUninitializedUint64),
       device_memory_size_(kUninitializedUint64),
+      memory_bandwidth_(kUninitializedUint64),
       shared_memory_per_core_(kUninitializedUint64),
       shared_memory_per_block_(kUninitializedUint64),
       clock_rate_ghz_(-1.0),
@@ -85,6 +86,8 @@ std::unique_ptr<std::map<string, string>> DeviceDescription::ToMap() const {
   result["Device Address Bits"] = port::StrCat(device_address_bits());
   result["Device Memory Size"] =
       port::HumanReadableNumBytes::ToString(device_memory_size());
+  result["Memory Bandwidth"] = port::StrCat(
+      port::HumanReadableNumBytes::ToString(memory_bandwidth_), "/s");
 
   result["Shared Memory Per Core"] =
       port::HumanReadableNumBytes::ToString(shared_memory_per_core_);
diff --git a/tensorflow/stream_executor/device_description.h b/tensorflow/stream_executor/device_description.h
index f2b35bcb4345a37f72541979564cbbb7944595c2..fcf0928096ed1f1bdf0499efb92af2bc9cb0eaa2 100644
--- a/tensorflow/stream_executor/device_description.h
+++ b/tensorflow/stream_executor/device_description.h
@@ -140,6 +140,11 @@ class DeviceDescription {
   // Returns the device memory size in bytes.
   uint64 device_memory_size() const { return device_memory_size_; }
 
+  // Returns the device's memory bandwidth in bytes/sec.  (This is for
+  // reads/writes to/from the device's own memory, not for transfers between the
+  // host and device.)
+  uint64 memory_bandwidth() const { return memory_bandwidth_; }
+
   // Returns the device's core clock rate in GHz.
   float clock_rate_ghz() const { return clock_rate_ghz_; }
 
@@ -212,6 +217,7 @@ class DeviceDescription {
 
   uint64 device_address_bits_;
   uint64 device_memory_size_;
+  uint64 memory_bandwidth_;
 
   // Shared memory limits on a given device.
   uint64 shared_memory_per_core_;
@@ -305,6 +311,9 @@ class DeviceDescriptionBuilder {
   void set_device_memory_size(uint64 value) {
     device_description_->device_memory_size_ = value;
   }
+  void set_memory_bandwidth(uint64 value) {
+    device_description_->memory_bandwidth_ = value;
+  }
 
   void set_shared_memory_per_core(int64 value) {
     device_description_->shared_memory_per_core_ = value;
diff --git a/tensorflow/stream_executor/kernel.cc b/tensorflow/stream_executor/kernel.cc
index 81e531efb31ea7d8d6ac03b56aea6aa5f01d64d1..636199cfa2762b7c42dd350dfd294762e3159299 100644
--- a/tensorflow/stream_executor/kernel.cc
+++ b/tensorflow/stream_executor/kernel.cc
@@ -21,6 +21,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/platform/port.h"
 
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/stream_executor/lib/demangle.h"
 #include "tensorflow/stream_executor/platform.h"
 #include "tensorflow/stream_executor/platform/logging.h"
@@ -96,7 +97,7 @@ static const char *kStubPrefix = "__device_stub_";
 void KernelBase::set_name(port::StringPiece name) {
   name_ = name.ToString();
   port::StringPiece stubless_name = name;
-  if (name.starts_with(kStubPrefix)) {
+  if (tensorflow::str_util::StartsWith(name, kStubPrefix)) {
     stubless_name.remove_prefix(strlen(kStubPrefix));
   }
   demangled_name_ = port::Demangle(stubless_name.data());
diff --git a/tensorflow/stream_executor/lib/str_util.h b/tensorflow/stream_executor/lib/str_util.h
index 4dd6f3b0ccf112b281dd50467e9a16a672dbbbfb..5dd3d06affa424f0919f107c76ba40feeb165122 100644
--- a/tensorflow/stream_executor/lib/str_util.h
+++ b/tensorflow/stream_executor/lib/str_util.h
@@ -29,7 +29,7 @@ using tensorflow::str_util::Split;
 // Returns a copy of the input string 'str' with the given 'suffix'
 // removed. If the suffix doesn't match, returns a copy of the original string.
 inline string StripSuffixString(port::StringPiece str, port::StringPiece suffix) {
-  if (str.ends_with(suffix)) {
+  if (tensorflow::str_util::EndsWith(str, suffix)) {
     str.remove_suffix(suffix.size());
   }
   return str.ToString();
diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc
index 6bbb5f0b2e728a855b64beb33d6e28b1b3b399c3..1e3afde2687657e417e9e2cb3f5e2aaf0600da7a 100644
--- a/tensorflow/stream_executor/stream.cc
+++ b/tensorflow/stream_executor/stream.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/platform/port.h"
 
+#include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/stream_executor/blas.h"
 #include "tensorflow/stream_executor/host_buffer.h"
 #include "tensorflow/stream_executor/lib/stacktrace.h"
@@ -117,7 +118,9 @@ string ToVlogString(const DeviceMemoryBase *memory) {
   return ToVlogString(*memory);
 }
 
-string ToVlogString(const Eigen::half &h) { return port::StrCat(h); }
+string ToVlogString(const Eigen::half &h) {
+  return port::StrCat(static_cast<float>(h));
+}
 
 string ToVlogString(int i) { return port::StrCat(i); }
 
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 51ef3235b7fa41342a93102a9fa442941c071226..528f811b40ad7711407c856af804cbe2829d8b32 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -22,6 +22,7 @@ load(
 load(
     "//third_party/mkl:build_defs.bzl",
     "if_mkl",
+    "if_mkl_lnx_x64"
 )
 
 def register_extension_info(**kwargs):
@@ -34,7 +35,7 @@ def src_to_test_name(src):
   return src.replace("/", "_").split(".")[0]
 
 def full_path(relative_paths):
-  return [PACKAGE_NAME + "/" + relative for relative in relative_paths]
+  return [native.package_name() + "/" + relative for relative in relative_paths]
 
 # List of proto files for android builds
 def tf_android_core_proto_sources(core_proto_sources_relative):
@@ -202,7 +203,8 @@ def tf_copts(android_optimization_level_override="-O2", is_external=False):
           "-ftemplate-depth=900"])
       + if_cuda(["-DGOOGLE_CUDA=1"])
       + if_tensorrt(["-DGOOGLE_TENSORRT=1"])
-      + if_mkl(["-DINTEL_MKL=1", "-DEIGEN_USE_VML", "-fopenmp",])
+      + if_mkl(["-DINTEL_MKL=1", "-DEIGEN_USE_VML"])
+      + if_mkl_lnx_x64(["-fopenmp"])
       + if_android_arm(["-mfpu=neon"])
       + if_linux_x86_64(["-msse3"])
       + if_ios_x86_64(["-msse4.1"])
@@ -265,7 +267,7 @@ def _rpath_linkopts(name):
   # deployed. Other shared object dependencies (e.g. shared between contrib/
   # ops) are picked up as long as they are in either the same or a parent
   # directory in the tensorflow/ tree.
-  levels_to_root = PACKAGE_NAME.count("/") + name.count("/")
+  levels_to_root = native.package_name().count("/") + name.count("/")
   return select({
       clean_dep("//tensorflow:darwin"): [
           "-Wl,%s" % (_make_search_paths("@loader_path", levels_to_root),),
@@ -302,6 +304,7 @@ def tf_cc_shared_object(
           clean_dep("//tensorflow:darwin"): [
               "-Wl,-install_name,@rpath/" + name.split("/")[-1],
           ],
+          clean_dep("//tensorflow:windows"): [],
           "//conditions:default": [
               "-Wl,-soname," + name.split("/")[-1],
           ],
@@ -340,6 +343,22 @@ register_extension_info(
     label_regex_for_dep = "{extension_name}.*",
 )
 
+# A simple wrap around native.cc_binary rule.
+# When using this rule, you should realize it doesn't link to any tensorflow
+# dependencies by default.
+def tf_native_cc_binary(name,
+                        copts=tf_copts(),
+                        **kwargs):
+  native.cc_binary(
+      name=name,
+      copts=copts,
+      **kwargs)
+
+register_extension_info(
+    extension_name = "tf_native_cc_binary",
+    label_regex_for_dep = "{extension_name}.*",
+)
+
 def tf_gen_op_wrapper_cc(name,
                          out_ops_file,
                          pkg="",
@@ -620,9 +639,12 @@ def tf_cc_test(name,
       linkopts=select({
         clean_dep("//tensorflow:android"): [
             "-pie",
-          ],
+        ],
         clean_dep("//tensorflow:windows"): [],
         clean_dep("//tensorflow:windows_msvc"): [],
+        clean_dep("//tensorflow:darwin"): [
+            "-lm",
+        ],
         "//conditions:default": [
             "-lpthread",
             "-lm"
@@ -788,7 +810,33 @@ def tf_cc_test_mkl(srcs,
                    tags=[],
                    size="medium",
                    args=None):
-  if_mkl(tf_cc_tests(srcs, deps, name, linkstatic=linkstatic, tags=tags, size=size, args=args, nocopts="-fno-exceptions"))
+  for src in srcs:
+    native.cc_test(
+      name=src_to_test_name(src),
+      srcs=if_mkl([src]) + tf_binary_additional_srcs(),
+      copts=tf_copts(),
+      linkopts=select({
+        clean_dep("//tensorflow:android"): [
+            "-pie",
+          ],
+        clean_dep("//tensorflow:windows"): [],
+        clean_dep("//tensorflow:windows_msvc"): [],
+        "//conditions:default": [
+            "-lpthread",
+            "-lm"
+        ],
+      }) + _rpath_linkopts(src_to_test_name(src)),
+      deps=deps + if_mkl(
+          [
+              "//third_party/mkl:intel_binary_blob",
+          ],
+      ),
+      linkstatic=linkstatic,
+      tags=tags,
+      size=size,
+      args=args,
+      nocopts="-fno-exceptions")
+
 
 def tf_cc_tests_gpu(srcs,
                     deps,
@@ -905,6 +953,15 @@ def tf_cuda_library(deps=None, cuda_deps=None, copts=tf_copts(), **kwargs):
   if not cuda_deps:
     cuda_deps = []
 
+  if 'linkstatic' not in kwargs or kwargs['linkstatic'] != 1:
+    enable_text_relocation_linkopt = select({
+          clean_dep("//tensorflow:darwin"): [],
+          clean_dep("//tensorflow:windows"): [],
+          "//conditions:default": ['-Wl,-z,notext'],})
+    if 'linkopts' in kwargs:
+      kwargs['linkopts'] += enable_text_relocation_linkopt
+    else:
+      kwargs['linkopts'] = enable_text_relocation_linkopt
   native.cc_library(
       deps=deps + if_cuda(cuda_deps + [
           clean_dep("//tensorflow/core:cuda"),
@@ -998,16 +1055,12 @@ register_extension_info(
 def tf_mkl_kernel_library(name,
                           prefix=None,
                           srcs=None,
-                          gpu_srcs=None,
                           hdrs=None,
                           deps=None,
                           alwayslink=1,
                           copts=tf_copts(),
-                          nocopts="-fno-exceptions",
-                          **kwargs):
+                          nocopts="-fno-exceptions"):
   """A rule to build MKL-based TensorFlow kernel libraries."""
-  gpu_srcs = gpu_srcs  # unused argument
-  kwargs = kwargs  # unused argument
 
   if not bool(srcs):
     srcs = []
@@ -1020,16 +1073,15 @@ def tf_mkl_kernel_library(name,
     hdrs = hdrs + native.glob(
         [prefix + "*.h"])
 
-  if_mkl(
-      native.cc_library(
-          name=name,
-          srcs=srcs,
-          hdrs=hdrs,
-          deps=deps,
-          alwayslink=alwayslink,
-          copts=copts,
-          nocopts=nocopts
-      ))
+  native.cc_library(
+      name=name,
+      srcs=if_mkl(srcs),
+      hdrs=hdrs,
+      deps=deps,
+      alwayslink=alwayslink,
+      copts=copts,
+      nocopts=nocopts
+  )
 
 register_extension_info(
     extension_name = "tf_mkl_kernel_library",
@@ -1158,22 +1210,6 @@ def transitive_hdrs(name, deps=[], **kwargs):
 # the libraries in deps.
 def cc_header_only_library(name, deps=[], includes=[], **kwargs):
   _transitive_hdrs(name=name + "_gather", deps=deps)
-
-  # We could generalize the following, but rather than complicate things
-  # here, we'll do the minimal use case for now, and hope bazel comes up
-  # with a better solution before too long.  We'd expect it to compute
-  # the right include path by itself, but it doesn't, possibly because
-  # _transitive_hdrs lost some information about the include path.
-  if "@nsync//:nsync_headers" in deps:
-    # Buiding tensorflow from @org_tensorflow finds this two up.
-    nsynch = "../../external/nsync/public"
-    # Building tensorflow from elsewhere finds it four up.
-    # Note that native.repository_name() is not yet available in TF's Kokoro.
-    if REPOSITORY_NAME != "@":
-      nsynch = "../../" + nsynch
-    includes = includes[:]
-    includes.append(nsynch)
-
   native.cc_library(name=name,
                     hdrs=[":" + name + "_gather"],
                     includes=includes,
@@ -1182,7 +1218,6 @@ def cc_header_only_library(name, deps=[], includes=[], **kwargs):
 def tf_custom_op_library_additional_deps():
   return [
       "@protobuf_archive//:protobuf_headers",
-      "@nsync//:nsync_headers",
       clean_dep("//third_party/eigen3"),
       clean_dep("//tensorflow/core:framework_headers_lib"),
   ] + if_windows(["//tensorflow/python:pywrap_tensorflow_import_lib"])
@@ -1192,9 +1227,7 @@ def tf_custom_op_library_additional_deps():
 # exporting symbols from _pywrap_tensorflow.dll on Windows.
 def tf_custom_op_library_additional_deps_impl():
   return [
-      # for @protobuf_archive//:protobuf_headers
       "@protobuf_archive//:protobuf",
-      # for @nsync//:nsync_headers
       "@nsync//:nsync_cpp",
       # for //third_party/eigen3
       clean_dep("//third_party/eigen3"),
diff --git a/tensorflow/tools/api/generator/BUILD b/tensorflow/tools/api/generator/BUILD
index 14ce8dbeb3614b177bf4424c3e00ae7dd17f72f0..9f1bdd8aae7f4ef0540070fa20530f24798068bd 100644
--- a/tensorflow/tools/api/generator/BUILD
+++ b/tensorflow/tools/api/generator/BUILD
@@ -5,18 +5,6 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 py_binary(
     name = "create_python_api",
     srcs = ["create_python_api.py"],
@@ -105,6 +93,7 @@ genrule(
         "api/logging/__init__.py",
         "api/losses/__init__.py",
         "api/manip/__init__.py",
+        "api/math/__init__.py",
         "api/metrics/__init__.py",
         "api/nn/__init__.py",
         "api/nn/rnn_cell/__init__.py",
@@ -127,6 +116,7 @@ genrule(
         "api/test/__init__.py",
         "api/train/__init__.py",
         "api/train/queue_runner/__init__.py",
+        "api/user_ops/__init__.py",
     ],
     cmd = "$(location create_python_api) $(OUTS)",
     tools = ["create_python_api"],
diff --git a/tensorflow/tools/api/generator/create_python_api.py b/tensorflow/tools/api/generator/create_python_api.py
index bb7c3e77a3a82b63188fc5a4a21706da7a276fde..70f9776b0846582b8d4e9710879883fdf975a001 100644
--- a/tensorflow/tools/api/generator/create_python_api.py
+++ b/tensorflow/tools/api/generator/create_python_api.py
@@ -23,7 +23,6 @@ import collections
 import os
 import sys
 
-from tensorflow import python as tf
 from tensorflow.python.util import tf_decorator
 
 
@@ -39,6 +38,11 @@ Generated by: tensorflow/tools/api/generator/create_python_api.py script.
 """
 
 
+class SymbolExposedTwiceError(Exception):
+  """Raised when different symbols are exported with the same name."""
+  pass
+
+
 def format_import(source_module_name, source_name, dest_name):
   """Formats import statement.
 
@@ -63,6 +67,44 @@ def format_import(source_module_name, source_name, dest_name):
       return 'import %s as %s' % (source_name, dest_name)
 
 
+class _ModuleImportsBuilder(object):
+  """Builds a map from module name to imports included in that module."""
+
+  def __init__(self):
+    self.module_imports = collections.defaultdict(list)
+    self._seen_api_names = set()
+
+  def add_import(
+      self, dest_module_name, source_module_name, source_name, dest_name):
+    """Adds this import to module_imports.
+
+    Args:
+      dest_module_name: (string) Module name to add import to.
+      source_module_name: (string) Module to import from.
+      source_name: (string) Name of the symbol to import.
+      dest_name: (string) Import the symbol using this name.
+
+    Raises:
+      SymbolExposedTwiceError: Raised when an import with the same
+        dest_name has already been added to dest_module_name.
+    """
+    import_str = format_import(source_module_name, source_name, dest_name)
+    if import_str in self.module_imports[dest_module_name]:
+      return
+
+    # Check if we are trying to expose two different symbols with same name.
+    full_api_name = dest_name
+    if dest_module_name:
+      full_api_name = dest_module_name + '.' + full_api_name
+    if full_api_name in self._seen_api_names:
+      raise SymbolExposedTwiceError(
+          'Trying to export multiple symbols with same name: %s.' %
+          full_api_name)
+    self._seen_api_names.add(full_api_name)
+
+    self.module_imports[dest_module_name].append(import_str)
+
+
 def get_api_imports():
   """Get a map from destination module to formatted imports.
 
@@ -73,12 +115,15 @@ def get_api_imports():
           (for e.g. 'from foo import bar') and constant
           assignments (for e.g. 'FOO = 123').
   """
-  module_imports = collections.defaultdict(list)
+  module_imports_builder = _ModuleImportsBuilder()
+  visited_symbols = set()
+
   # Traverse over everything imported above. Specifically,
   # we want to traverse over TensorFlow Python modules.
   for module in sys.modules.values():
     # Only look at tensorflow modules.
-    if not module or 'tensorflow.' not in module.__name__:
+    if (not module or not hasattr(module, "__name__") or
+        'tensorflow.' not in module.__name__):
       continue
     # Do not generate __init__.py files for contrib modules for now.
     if '.contrib.' in module.__name__ or module.__name__.endswith('.contrib'):
@@ -86,6 +131,8 @@ def get_api_imports():
 
     for module_contents_name in dir(module):
       attr = getattr(module, module_contents_name)
+      if id(attr) in visited_symbols:
+        continue
 
       # If attr is _tf_api_constants attribute, then add the constants.
       if module_contents_name == _API_CONSTANTS_ATTR:
@@ -93,36 +140,30 @@ def get_api_imports():
           for export in exports:
             names = export.split('.')
             dest_module = '.'.join(names[:-1])
-            import_str = format_import(module.__name__, value, names[-1])
-            module_imports[dest_module].append(import_str)
+            module_imports_builder.add_import(
+                dest_module, module.__name__, value, names[-1])
         continue
 
       _, attr = tf_decorator.unwrap(attr)
       # If attr is a symbol with _tf_api_names attribute, then
       # add import for it.
       if hasattr(attr, '__dict__') and _API_NAMES_ATTR in attr.__dict__:
-        # The same op might be accessible from multiple modules.
-        # We only want to consider location where function was defined.
-        # Here we check if the op is defined in another TensorFlow module in
-        # sys.modules.
-        if (hasattr(attr, '__module__') and
-            attr.__module__.startswith(tf.__name__) and
-            attr.__module__ != module.__name__ and
-            attr.__module__ in sys.modules and
-            module_contents_name in dir(sys.modules[attr.__module__])):
+        # If the same symbol is available using multiple names, only create
+        # imports for it once.
+        if id(attr) in visited_symbols:
           continue
+        visited_symbols.add(id(attr))
 
         for export in attr._tf_api_names:  # pylint: disable=protected-access
           names = export.split('.')
           dest_module = '.'.join(names[:-1])
-          import_str = format_import(
-              module.__name__, module_contents_name, names[-1])
-          module_imports[dest_module].append(import_str)
+          module_imports_builder.add_import(
+              dest_module, module.__name__, module_contents_name, names[-1])
 
   # Import all required modules in their parent modules.
   # For e.g. if we import 'foo.bar.Value'. Then, we also
   # import 'bar' in 'foo'.
-  imported_modules = set(module_imports.keys())
+  imported_modules = set(module_imports_builder.module_imports.keys())
   for module in imported_modules:
     if not module:
       continue
@@ -135,13 +176,11 @@ def get_api_imports():
         parent_module += ('.' + module_split[submodule_index-1] if parent_module
                           else module_split[submodule_index-1])
         import_from += '.' + parent_module
-      submodule_import = format_import(
-          import_from, module_split[submodule_index],
+      module_imports_builder.add_import(
+          parent_module, import_from, module_split[submodule_index],
           module_split[submodule_index])
-      if submodule_import not in module_imports[parent_module]:
-        module_imports[parent_module].append(submodule_import)
 
-  return module_imports
+  return module_imports_builder.module_imports
 
 
 def create_api_files(output_files):
diff --git a/tensorflow/tools/api/golden/BUILD b/tensorflow/tools/api/golden/BUILD
index 08436396a6c04a59461b6800b908c29aabb91a1b..ebdf42df2c01a60b1cadd0368647adc4121db7ef 100644
--- a/tensorflow/tools/api/golden/BUILD
+++ b/tensorflow/tools/api/golden/BUILD
@@ -10,15 +10,3 @@ filegroup(
     name = "api_golden",
     srcs = glob(["*.pbtxt"]),
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tools/api/golden/tensorflow.-gradient-tape.pbtxt b/tensorflow/tools/api/golden/tensorflow.-gradient-tape.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7405202b892bba67a36d86cd43fb7a67ab3be947
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-gradient-tape.pbtxt
@@ -0,0 +1,21 @@
+path: "tensorflow.GradientTape"
+tf_class {
+  is_instance: "<class \'tensorflow.python.eager.backprop.GradientTape\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'persistent\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "gradient"
+    argspec: "args=[\'self\', \'target\', \'sources\', \'output_gradients\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "watch"
+    argspec: "args=[\'self\', \'tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "watched_variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-classifier.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-classifier.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fd9be8c75914ed37f5f36c4df5a14bd00caee20e
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-classifier.pbtxt
@@ -0,0 +1,54 @@
+path: "tensorflow.estimator.BoostedTreesClassifier"
+tf_class {
+  is_instance: "<class \'tensorflow.python.estimator.canned.boosted_trees.BoostedTreesClassifier\'>"
+  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "config"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_dir"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_fn"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "params"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'None\'], "
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "export_savedmodel"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+  }
+  member_method {
+    name: "get_variable_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_variable_value"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "latest_checkpoint"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\', \'yield_single_examples\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "train"
+    argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\', \'saving_listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-regressor.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-regressor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6b305be43f845ec15f9c160d5ea4823c6ae68897
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-regressor.pbtxt
@@ -0,0 +1,54 @@
+path: "tensorflow.estimator.BoostedTreesRegressor"
+tf_class {
+  is_instance: "<class \'tensorflow.python.estimator.canned.boosted_trees.BoostedTreesRegressor\'>"
+  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "config"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_dir"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_fn"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "params"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'label_dimension\', \'weight_column\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'None\'], "
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "export_savedmodel"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+  }
+  member_method {
+    name: "get_variable_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_variable_value"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "latest_checkpoint"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\', \'yield_single_examples\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "train"
+    argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\', \'saving_listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt
index 091b1be0c83480757445542acb97e139bd74ef03..05e603efb7cbad8c4c42a7a15074d2634af8d21c 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt
@@ -78,9 +78,13 @@ tf_class {
     name: "tf_random_seed"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "train_distribute"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_dir\', \'tf_random_seed\', \'save_summary_steps\', \'save_checkpoints_steps\', \'save_checkpoints_secs\', \'session_config\', \'keep_checkpoint_max\', \'keep_checkpoint_every_n_hours\', \'log_step_count_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'100\', \'<object object instance>\', \'<object object instance>\', \'None\', \'5\', \'10000\', \'100\'], "
+    argspec: "args=[\'self\', \'model_dir\', \'tf_random_seed\', \'save_summary_steps\', \'save_checkpoints_steps\', \'save_checkpoints_secs\', \'session_config\', \'keep_checkpoint_max\', \'keep_checkpoint_every_n_hours\', \'log_step_count_steps\', \'train_distribute\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'100\', \'<object object instance>\', \'<object object instance>\', \'None\', \'5\', \'10000\', \'100\', \'None\'], "
   }
   member_method {
     name: "replace"
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.pbtxt
index a7a6cc1e49ddfe07569dff035e38931a0510addd..4946f2c51a62af85d61b8e38e982c59dd0d61e36 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.pbtxt
@@ -8,6 +8,14 @@ tf_module {
     name: "BaselineRegressor"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "BoostedTreesClassifier"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "BoostedTreesRegressor"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "DNNClassifier"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
index 0f2428d77a537959cf2c46dfa350208abea8cb36..bf361cf8054571c0b056e1373acb838aaea87173 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
@@ -75,10 +75,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "regularizers"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "scope_name"
     mtype: "<type \'property\'>"
@@ -91,10 +87,6 @@ tf_class {
     name: "stateful"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "trainable"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -153,11 +145,11 @@ tf_class {
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'inputs\', \'training\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "compile"
-    argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "compute_mask"
@@ -173,11 +165,11 @@ tf_class {
   }
   member_method {
     name: "evaluate"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'32\', \'1\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\'], "
   }
   member_method {
     name: "evaluate_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "fit"
@@ -185,7 +177,7 @@ tf_class {
   }
   member_method {
     name: "fit_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
   }
   member_method {
     name: "from_config"
@@ -245,7 +237,7 @@ tf_class {
   }
   member_method {
     name: "predict"
-    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\'], varargs=None, keywords=None, defaults=[\'32\', \'0\'], "
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
   }
   member_method {
     name: "predict_classes"
@@ -253,7 +245,7 @@ tf_class {
   }
   member_method {
     name: "predict_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "predict_on_batch"
@@ -297,6 +289,6 @@ tf_class {
   }
   member_method {
     name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'class_weight\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
index 102eb3220334516e0051f952353920f229f4ff20..be12b0bd2ec509ff394eaa3f43db0b54badd7fba 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
@@ -75,10 +75,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "regularizers"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "scope_name"
     mtype: "<type \'property\'>"
@@ -91,10 +87,6 @@ tf_class {
     name: "stateful"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "trainable"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -153,11 +145,11 @@ tf_class {
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'inputs\', \'training\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "compile"
-    argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "compute_mask"
@@ -173,11 +165,11 @@ tf_class {
   }
   member_method {
     name: "evaluate"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'32\', \'1\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\'], "
   }
   member_method {
     name: "evaluate_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "fit"
@@ -185,7 +177,7 @@ tf_class {
   }
   member_method {
     name: "fit_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
   }
   member_method {
     name: "from_config"
@@ -245,7 +237,7 @@ tf_class {
   }
   member_method {
     name: "predict"
-    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\'], varargs=None, keywords=None, defaults=[\'32\', \'0\'], "
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
   }
   member_method {
     name: "predict_classes"
@@ -253,7 +245,7 @@ tf_class {
   }
   member_method {
     name: "predict_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "predict_on_batch"
@@ -297,6 +289,6 @@ tf_class {
   }
   member_method {
     name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'class_weight\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-directory-iterator.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-directory-iterator.pbtxt
index 04174bff5f04fead68af68afeec80316867009a4..ec0f3d892d9d03a738d34a40afe701e788908a8e 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-directory-iterator.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-directory-iterator.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'directory\', \'image_data_generator\', \'target_size\', \'color_mode\', \'classes\', \'class_mode\', \'batch_size\', \'shuffle\', \'seed\', \'data_format\', \'save_to_dir\', \'save_prefix\', \'save_format\', \'follow_links\', \'interpolation\'], varargs=None, keywords=None, defaults=[\'(256, 256)\', \'rgb\', \'None\', \'categorical\', \'32\', \'True\', \'None\', \'None\', \'None\', \'\', \'png\', \'False\', \'nearest\'], "
+    argspec: "args=[\'self\', \'directory\', \'image_data_generator\', \'target_size\', \'color_mode\', \'classes\', \'class_mode\', \'batch_size\', \'shuffle\', \'seed\', \'data_format\', \'save_to_dir\', \'save_prefix\', \'save_format\', \'follow_links\', \'subset\', \'interpolation\'], varargs=None, keywords=None, defaults=[\'(256, 256)\', \'rgb\', \'None\', \'categorical\', \'32\', \'True\', \'None\', \'None\', \'None\', \'\', \'png\', \'False\', \'None\', \'nearest\'], "
   }
   member_method {
     name: "next"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-image-data-generator.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-image-data-generator.pbtxt
index 41f27d1f740457f4b7c4f74cb089a448a0fed845..f5bc04e44c198e5bc60f8361dd32e4ae00250468 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-image-data-generator.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-image-data-generator.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'featurewise_center\', \'samplewise_center\', \'featurewise_std_normalization\', \'samplewise_std_normalization\', \'zca_whitening\', \'zca_epsilon\', \'rotation_range\', \'width_shift_range\', \'height_shift_range\', \'shear_range\', \'zoom_range\', \'channel_shift_range\', \'fill_mode\', \'cval\', \'horizontal_flip\', \'vertical_flip\', \'rescale\', \'preprocessing_function\', \'data_format\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'False\', \'1e-06\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'nearest\', \'0.0\', \'False\', \'False\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'featurewise_center\', \'samplewise_center\', \'featurewise_std_normalization\', \'samplewise_std_normalization\', \'zca_whitening\', \'zca_epsilon\', \'rotation_range\', \'width_shift_range\', \'height_shift_range\', \'brightness_range\', \'shear_range\', \'zoom_range\', \'channel_shift_range\', \'fill_mode\', \'cval\', \'horizontal_flip\', \'vertical_flip\', \'rescale\', \'preprocessing_function\', \'data_format\', \'validation_split\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'False\', \'1e-06\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'0.0\', \'0.0\', \'0.0\', \'nearest\', \'0.0\', \'False\', \'False\', \'None\', \'None\', \'None\', \'0.0\'], "
   }
   member_method {
     name: "fit"
@@ -12,11 +12,11 @@ tf_class {
   }
   member_method {
     name: "flow"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'shuffle\', \'seed\', \'save_to_dir\', \'save_prefix\', \'save_format\'], varargs=None, keywords=None, defaults=[\'None\', \'32\', \'True\', \'None\', \'None\', \'\', \'png\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'shuffle\', \'seed\', \'save_to_dir\', \'save_prefix\', \'save_format\', \'subset\'], varargs=None, keywords=None, defaults=[\'None\', \'32\', \'True\', \'None\', \'None\', \'\', \'png\', \'None\'], "
   }
   member_method {
     name: "flow_from_directory"
-    argspec: "args=[\'self\', \'directory\', \'target_size\', \'color_mode\', \'classes\', \'class_mode\', \'batch_size\', \'shuffle\', \'seed\', \'save_to_dir\', \'save_prefix\', \'save_format\', \'follow_links\', \'interpolation\'], varargs=None, keywords=None, defaults=[\'(256, 256)\', \'rgb\', \'None\', \'categorical\', \'32\', \'True\', \'None\', \'None\', \'\', \'png\', \'False\', \'nearest\'], "
+    argspec: "args=[\'self\', \'directory\', \'target_size\', \'color_mode\', \'classes\', \'class_mode\', \'batch_size\', \'shuffle\', \'seed\', \'save_to_dir\', \'save_prefix\', \'save_format\', \'follow_links\', \'subset\', \'interpolation\'], varargs=None, keywords=None, defaults=[\'(256, 256)\', \'rgb\', \'None\', \'categorical\', \'32\', \'True\', \'None\', \'None\', \'\', \'png\', \'False\', \'None\', \'nearest\'], "
   }
   member_method {
     name: "random_transform"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-numpy-array-iterator.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-numpy-array-iterator.pbtxt
index 4ef6e6e99e3b71d4a6e497cc577ef8b42cebab79..42196ddeee7aab144537eef250c07060923fa6a9 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-numpy-array-iterator.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-numpy-array-iterator.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'x\', \'y\', \'image_data_generator\', \'batch_size\', \'shuffle\', \'seed\', \'data_format\', \'save_to_dir\', \'save_prefix\', \'save_format\'], varargs=None, keywords=None, defaults=[\'32\', \'False\', \'None\', \'None\', \'None\', \'\', \'png\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'image_data_generator\', \'batch_size\', \'shuffle\', \'seed\', \'data_format\', \'save_to_dir\', \'save_prefix\', \'save_format\', \'subset\'], varargs=None, keywords=None, defaults=[\'32\', \'False\', \'None\', \'None\', \'None\', \'\', \'png\', \'None\'], "
   }
   member_method {
     name: "next"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.pbtxt
index d28fef696515e09990d63581de6127fd52c0a4ee..6b850dd6b784412d623f44200b4acc169bf25968 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.pbtxt
@@ -36,6 +36,10 @@ tf_module {
     name: "load_img"
     argspec: "args=[\'path\', \'grayscale\', \'target_size\', \'interpolation\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'nearest\'], "
   }
+  member_method {
+    name: "random_brightness"
+    argspec: "args=[\'x\', \'brightness_range\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "random_channel_shift"
     argspec: "args=[\'x\', \'intensity\', \'channel_axis\'], varargs=None, keywords=None, defaults=[\'0\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.sequence.-timeseries-generator.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.sequence.-timeseries-generator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d9c3215b555c19bc5cf4b32b0d227a9e1b63ce1e
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.sequence.-timeseries-generator.pbtxt
@@ -0,0 +1,14 @@
+path: "tensorflow.keras.preprocessing.sequence.TimeseriesGenerator"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.preprocessing.sequence.TimeseriesGenerator\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.utils.data_utils.Sequence\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'data\', \'targets\', \'length\', \'sampling_rate\', \'stride\', \'start_index\', \'end_index\', \'shuffle\', \'reverse\', \'batch_size\'], varargs=None, keywords=None, defaults=[\'1\', \'1\', \'0\', \'None\', \'False\', \'False\', \'128\'], "
+  }
+  member_method {
+    name: "on_epoch_end"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.sequence.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.sequence.pbtxt
index 1b01935cc53b450c3e7009f945f86c8e1c10bf8e..cf59f8a27269c1161919f7ca2a44c5717a836dd7 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.sequence.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.sequence.pbtxt
@@ -1,5 +1,9 @@
 path: "tensorflow.keras.preprocessing.sequence"
 tf_module {
+  member {
+    name: "TimeseriesGenerator"
+    mtype: "<type \'type\'>"
+  }
   member_method {
     name: "make_sampling_table"
     argspec: "args=[\'size\', \'sampling_factor\'], varargs=None, keywords=None, defaults=[\'1e-05\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.text.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.text.pbtxt
index d106429df0273929472aa58909f554bcffde9bca..50b54fc7e179bdfb8641d8de12934caa3fc44300 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.text.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.text.pbtxt
@@ -4,6 +4,10 @@ tf_module {
     name: "Tokenizer"
     mtype: "<type \'type\'>"
   }
+  member_method {
+    name: "hashing_trick"
+    argspec: "args=[\'text\', \'n\', \'hash_function\', \'filters\', \'lower\', \'split\'], varargs=None, keywords=None, defaults=[\'None\', \'!\"#$%&()*+,-./:;<=>?@[\\\\]^_`{|}~\\t\\n\', \'True\', \' \'], "
+  }
   member_method {
     name: "one_hot"
     argspec: "args=[\'text\', \'n\', \'filters\', \'lower\', \'split\'], varargs=None, keywords=None, defaults=[\'!\"#$%&()*+,-./:;<=>?@[\\\\]^_`{|}~\\t\\n\', \'True\', \' \'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.math.pbtxt b/tensorflow/tools/api/golden/tensorflow.math.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..897718c05e0d10a6f961f33b8c65f5dab1d03f5b
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.math.pbtxt
@@ -0,0 +1,7 @@
+path: "tensorflow.math"
+tf_module {
+  member_method {
+    name: "polyval"
+    argspec: "args=[\'coeffs\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt
index 80735cea5d00b93d895c238bc7dc9acd5a6f7237..afa3b78eb7fb3618edce06bbff288c37fdf71015 100644
--- a/tensorflow/tools/api/golden/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.pbtxt
@@ -84,6 +84,10 @@ tf_module {
     name: "GRAPH_DEF_VERSION_MIN_PRODUCER"
     mtype: "<type \'int\'>"
   }
+  member {
+    name: "GradientTape"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Graph"
     mtype: "<type \'type\'>"
@@ -400,6 +404,10 @@ tf_module {
     name: "manip"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "math"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "metrics"
     mtype: "<type \'module\'>"
@@ -974,7 +982,7 @@ tf_module {
   }
   member_method {
     name: "enable_eager_execution"
-    argspec: "args=[\'config\', \'device_policy\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'config\', \'device_policy\', \'execution_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "encode_base64"
@@ -1684,6 +1692,14 @@ tf_module {
     name: "scatter_div"
     argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
   }
+  member_method {
+    name: "scatter_max"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "scatter_min"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
   member_method {
     name: "scatter_mul"
     argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.train.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.pbtxt
index c75ee474aa471524e4b5c8a7e2dd4a9da4b08eae..bec72e1e609c3e32ca8366396b9b1cb577feab9d 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.pbtxt
@@ -238,7 +238,7 @@ tf_module {
   }
   member_method {
     name: "MonitoredTrainingSession"
-    argspec: "args=[\'master\', \'is_chief\', \'checkpoint_dir\', \'scaffold\', \'hooks\', \'chief_only_hooks\', \'save_checkpoint_secs\', \'save_summaries_steps\', \'save_summaries_secs\', \'config\', \'stop_grace_period_secs\', \'log_step_count_steps\', \'max_wait_secs\'], varargs=None, keywords=None, defaults=[\'\', \'True\', \'None\', \'None\', \'None\', \'None\', \'600\', \'<object object instance>\', \'<object object instance>\', \'None\', \'120\', \'100\', \'7200\'], "
+    argspec: "args=[\'master\', \'is_chief\', \'checkpoint_dir\', \'scaffold\', \'hooks\', \'chief_only_hooks\', \'save_checkpoint_secs\', \'save_summaries_steps\', \'save_summaries_secs\', \'config\', \'stop_grace_period_secs\', \'log_step_count_steps\', \'max_wait_secs\', \'save_checkpoint_steps\'], varargs=None, keywords=None, defaults=[\'\', \'True\', \'None\', \'None\', \'None\', \'None\', \'<object object instance>\', \'<object object instance>\', \'<object object instance>\', \'None\', \'120\', \'100\', \'7200\', \'<object object instance>\'], "
   }
   member_method {
     name: "NewCheckpointReader"
diff --git a/tensorflow/tools/api/lib/BUILD b/tensorflow/tools/api/lib/BUILD
index 2d3b838957d60ffb5e827c6b43100d217cc5739e..3f4fb9104271539c431f02e21b7e30780a721fd7 100644
--- a/tensorflow/tools/api/lib/BUILD
+++ b/tensorflow/tools/api/lib/BUILD
@@ -26,15 +26,3 @@ py_library(
         "//tensorflow/python:util",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tools/api/tests/BUILD b/tensorflow/tools/api/tests/BUILD
index 15bf1abb5f8f541c435be77b1a3c2f13382f2438..0dc154b6d2c6884b3cd91a4f4c7c08825c123124 100644
--- a/tensorflow/tools/api/tests/BUILD
+++ b/tensorflow/tools/api/tests/BUILD
@@ -42,15 +42,3 @@ tf_cc_binary(
         "//tensorflow/core:op_gen_lib",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tools/api/tests/api_compatibility_test.py b/tensorflow/tools/api/tests/api_compatibility_test.py
index baa7a0889de2534604f94f34f6852a8fdf8819fa..7eeae05847526ca106b80e55c92d3b55f988b149 100644
--- a/tensorflow/tools/api/tests/api_compatibility_test.py
+++ b/tensorflow/tools/api/tests/api_compatibility_test.py
@@ -58,7 +58,7 @@ _UPDATE_GOLDENS_HELP = """
      have to be authorized by TensorFlow leads.
 """
 
-# DEFINE_boolean, verbose_diffs, default False:
+# DEFINE_boolean, verbose_diffs, default True:
 _VERBOSE_DIFFS_HELP = """
      If set to true, print line by line diffs on all libraries. If set to
      false, only print which libraries have differences.
@@ -145,6 +145,9 @@ class ApiCompatibilityTest(test.TestCase):
       verbose_diff_message = ''
       # First check if the key is not found in one or the other.
       if key in only_in_expected:
+        # TODO(annarev): remove once we switch to using tf_export decorators.
+        if key == 'tensorflow.math':
+          continue
         diff_message = 'Object %s expected but not found (removed). %s' % (
             key, additional_missing_object_message)
         verbose_diff_message = diff_message
@@ -229,6 +232,13 @@ class ApiCompatibilityTest(test.TestCase):
         for filename in golden_file_list
     }
 
+    # TODO(annarev): remove once we switch to using tf_export decorators.
+    tf_module = golden_proto_dict['tensorflow'].tf_module
+    for i in range(len(tf_module.member)):
+      if tf_module.member[i].name == 'math':
+        del tf_module.member[i]
+        break
+
     # Diff them. Do not fail if called with update.
     # If the test is run to update goldens, only report diffs but do not fail.
     self._AssertProtoDictEquals(
@@ -270,17 +280,6 @@ class ApiCompatibilityTest(test.TestCase):
         for filename in golden_file_list
     }
 
-    # user_ops is an empty module. It is currently available in TensorFlow API
-    # but we don't keep empty modules in the new API.
-    # We delete user_ops from golden_proto_dict to make sure assert passes
-    # when diffing new API against goldens.
-    # TODO(annarev): remove user_ops from goldens once we switch to new API.
-    tf_module = golden_proto_dict['tensorflow'].tf_module
-    for i in range(len(tf_module.member)):
-      if tf_module.member[i].name == 'user_ops':
-        del tf_module.member[i]
-        break
-
     # Diff them. Do not fail if called with update.
     # If the test is run to update goldens, only report diffs but do not fail.
     self._AssertProtoDictEquals(
@@ -297,7 +296,7 @@ if __name__ == '__main__':
   parser.add_argument(
       '--update_goldens', type=bool, default=False, help=_UPDATE_GOLDENS_HELP)
   parser.add_argument(
-      '--verbose_diffs', type=bool, default=False, help=_VERBOSE_DIFFS_HELP)
+      '--verbose_diffs', type=bool, default=True, help=_VERBOSE_DIFFS_HELP)
   FLAGS, unparsed = parser.parse_known_args()
 
   # Now update argv, so that unittest library does not get confused.
diff --git a/tensorflow/tools/benchmark/BUILD b/tensorflow/tools/benchmark/BUILD
index 6ed2594e6abe169577066678e1bf4b9e2df4c4d3..566a172ea77fbc033496ef00a3415cff9ad8149a 100644
--- a/tensorflow/tools/benchmark/BUILD
+++ b/tensorflow/tools/benchmark/BUILD
@@ -90,12 +90,3 @@ tf_cc_binary(
     visibility = ["//visibility:public"],
     deps = [":benchmark_model_lib"],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = ["**/OWNERS"],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tools/build_info/BUILD b/tensorflow/tools/build_info/BUILD
index cdc47076ced24698d0139c1c14d1660018b1a815..730741780550bfe3fbccd7e62f5f7d9788f0a9a9 100644
--- a/tensorflow/tools/build_info/BUILD
+++ b/tensorflow/tools/build_info/BUILD
@@ -9,18 +9,3 @@ exports_files(
         "gen_build_info.py",
     ],
 )
-
-# -----------------------------------------------------------------------------
-# Google-internal targets.  These must be at the end for syncrepo.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tools/ci_build/builds/android.sh b/tensorflow/tools/ci_build/builds/android.sh
index 564c5aa1480f5fd824dbc5c8bc85cec90664c512..d81793efe08f151c1b448a9da3cc971ca3137829 100755
--- a/tensorflow/tools/ci_build/builds/android.sh
+++ b/tensorflow/tools/ci_build/builds/android.sh
@@ -29,7 +29,8 @@ echo "========== TensorFlow Demo Build Test =========="
 # Enable sandboxing so that zip archives don't get incorrectly packaged
 # in assets/ dir (see https://github.com/bazelbuild/bazel/issues/2334)
 # TODO(gunan): remove extra flags once sandboxing is enabled for all builds.
-bazel --bazelrc=/dev/null build -c opt --fat_apk_cpu=x86_64 \
+bazel --bazelrc=/dev/null build \
+    --compilation_mode=opt --cxxopt=-std=c++11 --fat_apk_cpu=x86_64 \
     --spawn_strategy=sandboxed --genrule_strategy=sandboxed \
     //tensorflow/examples/android:tensorflow_demo
 
diff --git a/tensorflow/tools/ci_build/builds/android_full.sh b/tensorflow/tools/ci_build/builds/android_full.sh
index 9d449241e8413ddbd81c580cc4def808c0086cb9..41dc66dd5436a81eeeca197f6ef57cb2a1407ca0 100755
--- a/tensorflow/tools/ci_build/builds/android_full.sh
+++ b/tensorflow/tools/ci_build/builds/android_full.sh
@@ -40,7 +40,8 @@ rm -rf ${AAR_LIB_TMP}
 for CPU in ${CPUS//,/ }
 do
     echo "========== Building native libs for Android ${CPU} =========="
-    bazel build -c opt --config=monolithic --cpu=${CPU} \
+    bazel build --config=monolithic --cpu=${CPU} \
+        --compilation_mode=opt --cxxopt=-std=c++11 \
         --crosstool_top=//external:android/crosstool \
         --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
         //tensorflow/core:android_tensorflow_lib \
@@ -62,7 +63,8 @@ done
 # in assets/ dir (see https://github.com/bazelbuild/bazel/issues/2334)
 # TODO(gunan): remove extra flags once sandboxing is enabled for all builds.
 echo "========== Building TensorFlow Android Jar and Demo =========="
-bazel --bazelrc=/dev/null build -c opt --config=monolithic --fat_apk_cpu=${CPUS} \
+bazel --bazelrc=/dev/null build --config=monolithic --fat_apk_cpu=${CPUS} \
+    --compilation_mode=opt --cxxopt=-std=c++11 \
     --spawn_strategy=sandboxed --genrule_strategy=sandboxed \
     //tensorflow/contrib/android:android_tensorflow_inference_java \
     //tensorflow/contrib/android:android_tensorflow_inference_java.aar \
diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh
index aeac085d30aef746366192361f249eb01f95e8da..9627475d84f261e2cbe22548764eaa4f6f59068b 100755
--- a/tensorflow/tools/ci_build/ci_sanity.sh
+++ b/tensorflow/tools/ci_build/ci_sanity.sh
@@ -431,7 +431,8 @@ cmd_status(){
 # out by default in TF WORKSPACE file.
 do_bazel_nobuild() {
   BUILD_TARGET="//tensorflow/..."
-  BUILD_TARGET="${BUILD_TARGET} -//tensorflow/contrib/lite/java/demo/app/src/main/..."
+  BUILD_TARGET="${BUILD_TARGET} -//tensorflow/contrib/lite/java/demo/app/..."
+  BUILD_TARGET="${BUILD_TARGET} -//tensorflow/contrib/lite/examples/android/..."
   BUILD_TARGET="${BUILD_TARGET} -//tensorflow/contrib/lite/schema/..."
   BUILD_CMD="bazel build --nobuild ${BAZEL_FLAGS} -- ${BUILD_TARGET}"
 
diff --git a/tensorflow/tools/ci_build/copy_binary.py b/tensorflow/tools/ci_build/copy_binary.py
index ff26b052f3ea0b4866e12a663c569a3fa39ad698..420d390d2b9dc1ec25461b3502c63467a7eda16b 100755
--- a/tensorflow/tools/ci_build/copy_binary.py
+++ b/tensorflow/tools/ci_build/copy_binary.py
@@ -29,13 +29,9 @@ import argparse
 import os
 import re
 import shutil
-import subprocess
+import tempfile
 import zipfile
 
-UNZIP_CMD = "/usr/bin/unzip"
-ZIP_CMD = "/usr/bin/zip"
-SED_CMD = "/bin/sed"
-
 TF_NIGHTLY_REGEX = r"(.+)tf_nightly(|_gpu)-(\d\.\d\.\d.dev[\d]{0,8})-(.+)\.whl"
 BINARY_STRING_TEMPLATE = "%s-%s-%s.whl"
 
@@ -64,27 +60,36 @@ def copy_binary(directory, origin_tag, new_tag, version, gpu=False):
     package = "tf_nightly"
   origin_binary = BINARY_STRING_TEMPLATE % (package, version, origin_tag)
   new_binary = BINARY_STRING_TEMPLATE % (package, version, new_tag)
-  zip_ref = zipfile.ZipFile(directory + origin_binary, "r")
-  zip_ref.extractall()
-  zip_ref.close()
-  old_py_ver = re.search(r"(cp\d\d-cp\d\d)", origin_tag).group(1)
-  new_py_ver = re.search(r"(cp\d\d-cp\d\d)", new_tag).group(1)
-  subprocess.check_call(
-      "%s -i s/%s/%s/g %s-%s.dist-info/WHEEL" % (SED_CMD, old_py_ver,
-                                                 new_py_ver, package, version),
-      shell=True)
-  zout = zipfile.ZipFile(directory + new_binary, "w", zipfile.ZIP_DEFLATED)
-  zip_these_files = [
-      "%s-%s.dist-info" % (package, version),
-      "%s-%s.data" % (package, version)
-  ]
-  for dirname in zip_these_files:
-    for root, _, files in os.walk(dirname):
-      for filename in files:
-        zout.write(os.path.join(root, filename))
-  zout.close()
-  for dirname in zip_these_files:
-    shutil.rmtree(dirname)
+  zip_ref = zipfile.ZipFile(os.path.join(directory, origin_binary), "r")
+
+  try:
+    tmpdir = tempfile.mkdtemp()
+    os.chdir(tmpdir)
+
+    zip_ref.extractall()
+    zip_ref.close()
+    old_py_ver = re.search(r"(cp\d\d-cp\d\d)", origin_tag).group(1)
+    new_py_ver = re.search(r"(cp\d\d-cp\d\d)", new_tag).group(1)
+
+    wheel_file = os.path.join(
+        tmpdir, "%s-%s.dist-info" % (package, version), "WHEEL")
+    with open(wheel_file, "r") as f:
+      content = f.read()
+    with open(wheel_file, "w") as f:
+      f.write(content.replace(old_py_ver, new_py_ver))
+
+    zout = zipfile.ZipFile(directory + new_binary, "w", zipfile.ZIP_DEFLATED)
+    zip_these_files = [
+        "%s-%s.dist-info" % (package, version),
+        "%s-%s.data" % (package, version),
+    ]
+    for dirname in zip_these_files:
+      for root, _, files in os.walk(dirname):
+        for filename in files:
+          zout.write(os.path.join(root, filename))
+    zout.close()
+  finally:
+    shutil.rmtree(tmpdir)
 
 
 def main():
@@ -110,6 +115,7 @@ def main():
   args = parser.parse_args()
 
   # Argument checking
+  args.filename = os.path.abspath(args.filename)
   check_existence(args.filename)
   regex_groups = re.search(TF_NIGHTLY_REGEX, args.filename)
   directory = regex_groups.group(1)
diff --git a/tensorflow/tools/ci_build/install/install_golang.sh b/tensorflow/tools/ci_build/install/install_golang.sh
index e1edd62cc505654b7266c212822561188bbc701c..124ad82e916fe70c0d26a7d09d27a9c510320c1e 100755
--- a/tensorflow/tools/ci_build/install/install_golang.sh
+++ b/tensorflow/tools/ci_build/install/install_golang.sh
@@ -16,7 +16,7 @@
 
 set -ex
 
-GOLANG_URL="https://storage.googleapis.com/golang/go1.9.2.linux-amd64.tar.gz"
+GOLANG_URL="https://storage.googleapis.com/golang/go1.10.linux-amd64.tar.gz"
 
 sudo mkdir -p /usr/local
 wget -q -O - "${GOLANG_URL}" | sudo tar -C /usr/local -xz
diff --git a/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh b/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh
index 338066131b5d4511ae9f0646a1269b182cf8e1fa..c7cc16e6699830da4dff6cd32136da65fb6a41af 100755
--- a/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh
+++ b/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh
@@ -33,6 +33,7 @@ yes "" | $PYTHON_BIN_PATH configure.py
 which bazel
 bazel test --test_tag_filters=-no_oss,-gpu,-benchmark-test,-nomac,-no_mac \
     --test_timeout 300,450,1200,3600 --config=opt \
+    --announce_rc \
     --test_size_filters=small,medium \
     --jobs=${N_JOBS} --build_tests_only --test_output=errors -k -- \
     //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/...
diff --git a/tensorflow/tools/ci_build/osx/cpu/run_py3_cc_core.sh b/tensorflow/tools/ci_build/osx/cpu/run_py3_cc_core.sh
index 920a261ae3c8d68ec0b0d311fd361e3843eebd86..7e0e81a1ebdc9e4ad4e76f6582892914cd1a5881 100755
--- a/tensorflow/tools/ci_build/osx/cpu/run_py3_cc_core.sh
+++ b/tensorflow/tools/ci_build/osx/cpu/run_py3_cc_core.sh
@@ -31,6 +31,7 @@ export PYTHON_BIN_PATH=$(which python3)
 yes "" | $PYTHON_BIN_PATH configure.py
 which bazel
 bazel test --test_tag_filters=-no_oss,-gpu,-benchmark-test,-nomac,-no_mac \
+    --announce_rc \
     --test_timeout 300,450,1200,3600 \
     --test_size_filters=small,medium \
     --jobs=${N_JOBS} --build_tests_only --test_output=errors -k -- \
diff --git a/tensorflow/tools/ci_build/osx/libtensorflow_cpu.sh b/tensorflow/tools/ci_build/osx/libtensorflow_cpu.sh
index e1b56b9a25f663737ffe0991882f6e5e753265ed..7d471b47034f04ea4c2d31d9cdd7cea48fb32745 100755
--- a/tensorflow/tools/ci_build/osx/libtensorflow_cpu.sh
+++ b/tensorflow/tools/ci_build/osx/libtensorflow_cpu.sh
@@ -31,5 +31,5 @@ export TF_NEED_OPENCL_SYCL=0
 export TF_NEED_MKL=0
 export COMPUTECPP_PATH="/usr/local"
 
-export PATH="/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin"
+export PATH="$PATH:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin"
 build_libtensorflow_tarball "-cpu-darwin-$(uname -m)"
diff --git a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
index 7b2d7e1a568b0235a5bdd55bb23e542772902576..d654b433e7ddcfc79dea010c43d8eb0bc33fdcb2 100644
--- a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
@@ -120,7 +120,9 @@ function run_configure_for_gpu_build {
   export TF_CUDA_VERSION=9.0
   export CUDA_TOOLKIT_PATH="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.0"
   export TF_CUDNN_VERSION=7.0
-  export CUDNN_INSTALL_PATH="C:/tools/cuda"
+  if [ -z "$CUDNN_INSTALL_PATH" ]; then
+    export CUDNN_INSTALL_PATH="C:/tools/cuda"
+  fi
   export TF_CUDA_COMPUTE_CAPABILITIES="3.7"
   if [ -z "$TF_ENABLE_XLA" ]; then
     export TF_ENABLE_XLA=0
diff --git a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
index 40189a6d1b1388092dc397251f4b581726995a77..438c5d52f68dbabbd48c54480a8a283017dd8fba 100644
--- a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
@@ -65,5 +65,6 @@ bazel test -c opt $BUILD_OPTS -k --test_output=errors \
   --define=no_tensorflow_py_deps=true --test_lang_filters=py \
   --test_tag_filters=-no_pip,-no_windows,-no_oss \
   --build_tag_filters=-no_pip,-no_windows,-no_oss --build_tests_only \
+  --flaky_test_attempts=3 \
   //${PY_TEST_DIR}/tensorflow/python/... \
   //${PY_TEST_DIR}/tensorflow/contrib/...
diff --git a/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat b/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat
index b537192a945b2a2d8c2df940b947c6c0f7d6fc06..97829892b10059f9d9663e103534891d1481abec 100644
--- a/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat
+++ b/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat
@@ -28,6 +28,9 @@ IF DEFINED TF_NIGHTLY (ECHO TF_NIGHTLY is set to %TF_NIGHTLY%) ELSE (SET TF_NIGH
 :: Set pip binary location. Do not override if it is set already.
 IF DEFINED PIP_EXE (ECHO PIP_EXE is set to %PIP_EXE%) ELSE (SET PIP_EXE="C:\Program Files\Anaconda3\Scripts\pip.exe")
 
+:: Set ctest binary location.
+IF DEFINED CTEST_EXE (ECHO CTEST_EXE is set to %CTEST_EXE%) ELSE (SET CTEST_EXE="C:\Program Files\cmake\bin\ctest.exe")
+
 :: Run the CMAKE build to build the pip package.
 CALL %REPO_ROOT%\tensorflow\tools\ci_build\windows\gpu\cmake\run_build.bat
 if %errorlevel% neq 0 exit /b %errorlevel%
@@ -47,4 +50,4 @@ if %errorlevel% neq 0 exit /b %errorlevel%
 
 :: Run all python tests if the installation succeeded.
 echo Running tests...
-ctest -C Release --output-on-failure --jobs 1
+%CTEST_EXE% -C Release --output-on-failure --jobs 1
diff --git a/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh b/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh
index 94276c6c5c9ce897ca24f03efe3d93e1ea1e00c9..7dfee8f371b8c4795fe748d1fd02ee8d884f18f9 100644
--- a/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh
+++ b/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh
@@ -41,7 +41,7 @@ run_configure_for_gpu_build
 # build_libtensorflow_tarball in ../builds/libtensorflow.sh
 # cannot be used on Windows since it relies on pkg_tar rules.
 # So we do something special here
-bazel build -c opt --copt=/arch:AVX \
+bazel build -c opt --copt=/arch:AVX --announce_rc \
   tensorflow:libtensorflow.so \
   tensorflow/tools/lib_package:clicenses_generate \
   tensorflow/java:libtensorflow_jni.so \
diff --git a/tensorflow/tools/common/BUILD b/tensorflow/tools/common/BUILD
index 316e5469e7afda74563cc186c58964664170c5da..b9032c046e93527fd0f41f183e49e4933029ec62 100644
--- a/tensorflow/tools/common/BUILD
+++ b/tensorflow/tools/common/BUILD
@@ -44,14 +44,3 @@ py_test(
         "//tensorflow/python:platform_test",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/tools/compatibility/BUILD b/tensorflow/tools/compatibility/BUILD
index 4f90c4d940670c43f65cc3f95971469627ab35c9..b7bfb29aae4fcaa55e01ba924f72cf79d2b09ad1 100644
--- a/tensorflow/tools/compatibility/BUILD
+++ b/tensorflow/tools/compatibility/BUILD
@@ -68,18 +68,3 @@ exports_files(
         "testdata/test_file_v0_11.py",
     ],
 )
-
-# -----------------------------------------------------------------------------
-# Google-internal targets.  These must be at the end for syncrepo.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tools/def_file_filter/BUILD b/tensorflow/tools/def_file_filter/BUILD
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..e390e0fb05c1d881e5fbafb43ea7576347949439 100644
--- a/tensorflow/tools/def_file_filter/BUILD
+++ b/tensorflow/tools/def_file_filter/BUILD
@@ -0,0 +1,9 @@
+# Description:
+# Tools for filtering DEF file for TensorFlow on Windows
+#
+# On Windows, we use a DEF file generated by Bazel to export
+# symbols from the tensorflow dynamic library(_pywrap_tensorflow.dll).
+# The maximum number of symbols that can be exported per DLL is 64K,
+# so we have to filter some useless symbols through this python script.
+
+package(default_visibility = ["//visibility:public"])
diff --git a/tensorflow/tools/dist_test/server/BUILD b/tensorflow/tools/dist_test/server/BUILD
index 865af8dd7b2af686dad852f35187f2d226533596..003a19a9abf470f58070bf44fc5608d1eb3634fe 100644
--- a/tensorflow/tools/dist_test/server/BUILD
+++ b/tensorflow/tools/dist_test/server/BUILD
@@ -37,15 +37,3 @@ py_test(
         "//tensorflow/python:client_testlib",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tools/docker/BUILD b/tensorflow/tools/docker/BUILD
index 7d5ae0a94d8f969585d8fb8e57892c165e35ba47..849ba49f71994c3c188d8bc7751d9569c3ee73b3 100644
--- a/tensorflow/tools/docker/BUILD
+++ b/tensorflow/tools/docker/BUILD
@@ -13,15 +13,3 @@ py_binary(
     srcs_version = "PY2AND3",
     deps = ["//tensorflow:tensorflow_py"],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index 22c73c3fe13f2cb763295fa25b43e2f82c0e8962..11f476d12c086f70335d9a69d7f3b86b525b5623 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -70,7 +70,7 @@ RUN mkdir /bazel && \
 
 # Download and build TensorFlow.
 WORKDIR /tensorflow
-RUN git clone --branch=r1.6 --depth=1 https://github.com/tensorflow/tensorflow.git .
+RUN git clone --branch=r1.7 --depth=1 https://github.com/tensorflow/tensorflow.git .
 
 # TODO(craigcitro): Don't install the pip package, since it makes it
 # more difficult to experiment with local changes. Instead, just add
diff --git a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
index 3690e7dfe57a4682276a90b10cb84c9a329b3f5e..037d13116efc5ddf76c31eb87d7f81d31c3591f5 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
+++ b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
@@ -3,7 +3,7 @@ FROM tensorflow/tensorflow:latest-devel
 LABEL maintainer="Clayne Robison<clayne.b.robison@intel.com>"
 
 # These arguments are parameterized. Use --build-args to override.
-ARG TF_BRANCH=r1.6
+ARG TF_BRANCH=r1.7
 ARG WHL_DIR=/whl
 
 RUN apt-get update && apt-get install -y --no-install-recommends \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index 69ba340f9201266fd2c2f86571e83f6acdcda950..1fcb6428b21b4ca495bef2b3249b6463e9ef0a10 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -79,7 +79,7 @@ RUN mkdir /bazel && \
 
 # Download and build TensorFlow.
 WORKDIR /tensorflow
-RUN git clone --branch=r1.6 --depth=1 https://github.com/tensorflow/tensorflow.git .
+RUN git clone --branch=r1.7 --depth=1 https://github.com/tensorflow/tensorflow.git .
 
 # Configure the build for our CUDA configuration.
 ENV CI_BUILD_PYTHON python
diff --git a/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb b/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb
index 5585ebdcd366ec9db0c47004647970cb27c8bb75..824fe14560bb2c3bfb0729f9e5b5cffa63db19ca 100644
--- a/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb
+++ b/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb
@@ -1207,7 +1207,7 @@
    "source": [
     "# Training computation: logits + cross-entropy loss.\n",
     "logits = model(train_data_node, True)\n",
-    "loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(\n",
+    "loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(\n",
     "  labels=train_labels_node, logits=logits))\n",
     "\n",
     "# L2 regularization for the fully connected parameters.\n",
@@ -2031,7 +2031,7 @@
    "views": {}
   },
   "kernelspec": {
-   "display_name": "Python [default]",
+   "display_name": "Python 3",
    "language": "python",
    "name": "python3"
   },
@@ -2049,5 +2049,5 @@
   }
  },
  "nbformat": 4,
- "nbformat_minor": 0
+ "nbformat_minor": 1
 }
diff --git a/tensorflow/tools/docker/notebooks/BUILD b/tensorflow/tools/docker/notebooks/BUILD
index 89f473df4bfdda479fa25b8e10b84c4430105cc9..e9f26899c9afa305afa6ee686a038997a4e6fbe3 100644
--- a/tensorflow/tools/docker/notebooks/BUILD
+++ b/tensorflow/tools/docker/notebooks/BUILD
@@ -3,15 +3,3 @@ package(default_visibility = ["//visibility:private"])
 licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tools/docs/BUILD b/tensorflow/tools/docs/BUILD
index 8f10bc9e0ca3c947b8ca75663444309088e0513e..d370fbd24692a2806640279644c91e4fad5a3757 100644
--- a/tensorflow/tools/docs/BUILD
+++ b/tensorflow/tools/docs/BUILD
@@ -142,14 +142,3 @@ py_test(
         "//tensorflow/python:client_testlib",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/tools/docs/parser.py b/tensorflow/tools/docs/parser.py
index e758229535e7b10994a39cbafb37e116fd2a465c..d2a63ecc4960117eb64fcc4f94bf882d4a3f91dd 100644
--- a/tensorflow/tools/docs/parser.py
+++ b/tensorflow/tools/docs/parser.py
@@ -34,7 +34,11 @@ from tensorflow.python.util import tf_inspect
 
 
 # A regular expression capturing a python identifier.
-IDENTIFIER_RE = '[a-zA-Z_][a-zA-Z0-9_]*'
+IDENTIFIER_RE = r'[a-zA-Z_]\w*'
+
+
+class TFDocsError(Exception):
+  pass
 
 
 class _Errors(object):
@@ -118,6 +122,8 @@ SYMBOL_REFERENCE_RE = re.compile(
     """,
     flags=re.VERBOSE)
 
+AUTO_REFERENCE_RE = re.compile(r'`([a-zA-Z0-9_.]+?)`')
+
 
 class ReferenceResolver(object):
   """Class for replacing @{...} references with Markdown links.
@@ -240,10 +246,25 @@ class ReferenceResolver(object):
     Returns:
       `string`, with "@{symbol}" references replaced by Markdown links.
     """
-    def one_ref(match):
-      return self._one_ref(match, relative_path_to_root)
 
-    return re.sub(SYMBOL_REFERENCE_RE, one_ref, string)
+    def strict_one_ref(match):
+      try:
+        return self._one_ref(match, relative_path_to_root)
+      except TFDocsError as e:
+        self.add_error(e.message)
+        return 'BAD_LINK'
+
+    string = re.sub(SYMBOL_REFERENCE_RE, strict_one_ref, string)
+
+    def sloppy_one_ref(match):
+      try:
+        return self._one_ref(match, relative_path_to_root)
+      except TFDocsError:
+        return match.group(0)
+
+    string = re.sub(AUTO_REFERENCE_RE, sloppy_one_ref, string)
+
+    return string
 
   def python_link(self, link_text, ref_full_name, relative_path_to_root,
                   code_ref=True):
@@ -307,14 +328,14 @@ class ReferenceResolver(object):
 
     Raises:
       RuntimeError: If `ref_full_name` is not documented.
+      TFDocsError: If the @{} syntax cannot be decoded.
     """
     master_name = self._duplicate_of.get(ref_full_name, ref_full_name)
 
     # Check whether this link exists
     if master_name not in self._all_names:
-      message = 'Cannot make link to "%s": Not in index.' % master_name
-      self.add_error(message)
-      return 'BROKEN_LINK'
+      raise TFDocsError(
+          'Cannot make link to "%s": Not in index.' % master_name)
 
     # If this is a member of a class, link to the class page with an anchor.
     ref_path = None
@@ -369,8 +390,8 @@ class ReferenceResolver(object):
             code_ref=not manual_link_text)
 
     # Error!
-    self.add_error('Did not understand "%s"' % match.group(0))
-    return 'BROKEN_LINK'
+    raise TFDocsError('Did not understand "%s"' % match.group(0),
+                      'BROKEN_LINK')
 
   def _doc_link(self, string, link_text, manual_link_text,
                 relative_path_to_root):
@@ -395,11 +416,10 @@ class ReferenceResolver(object):
     return self._doc_missing(string, hash_tag, link_text, manual_link_text,
                              relative_path_to_root)
 
-  def _doc_missing(self, string, unused_hash_tag, link_text,
+  def _doc_missing(self, string, unused_hash_tag, unused_link_text,
                    unused_manual_link_text, unused_relative_path_to_root):
     """Generate an error for unrecognized @{$...} references."""
-    self.add_error('Unknown Document "%s"' % string)
-    return link_text
+    raise TFDocsError('Unknown Document "%s"' % string)
 
   def _cc_link(self, string, link_text, unused_manual_link_text,
                relative_path_to_root):
@@ -416,8 +436,8 @@ class ReferenceResolver(object):
     elif string == 'tensorflow::ops::Const':
       ret = 'namespace/tensorflow/ops.md#const'
     else:
-      self.add_error('C++ reference not understood: "%s"' % string)
-      return 'TODO_C++:%s' % string
+      raise TFDocsError('C++ reference not understood: "%s"' % string)
+
     # relative_path_to_root gets you to api_docs/python, we go from there
     # to api_docs/cc, and then add ret.
     cc_relative_path = os.path.normpath(os.path.join(
diff --git a/tensorflow/tools/git/BUILD b/tensorflow/tools/git/BUILD
index 942ceab85fc8d40d9d4b67537d95204503af8bbe..daa17fbd501651540c4c90c6354eb0a5b2f2b7aa 100644
--- a/tensorflow/tools/git/BUILD
+++ b/tensorflow/tools/git/BUILD
@@ -9,18 +9,3 @@ licenses(["notice"])  # Apache 2.0
 exports_files(
     ["gen_git_source.py"],
 )
-
-# -----------------------------------------------------------------------------
-# Google-internal targets.  These must be at the end for syncrepo.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tools/graph_transforms/BUILD b/tensorflow/tools/graph_transforms/BUILD
index b7d7fac315367824c6fdb379eaa6b0812d89b28b..1ad1895269022331bfd8156721778f4d68a10ee7 100644
--- a/tensorflow/tools/graph_transforms/BUILD
+++ b/tensorflow/tools/graph_transforms/BUILD
@@ -178,6 +178,7 @@ tf_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
+        "//tensorflow/core/kernels:quantization_utils",
         "//tensorflow/core/kernels:quantized_ops",
         "//tensorflow/core/util/tensor_bundle",
     ],
@@ -312,14 +313,3 @@ tf_py_test(
     ],
     main = "python/transform_graph_test.py",
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/tools/graph_transforms/backports_test.cc b/tensorflow/tools/graph_transforms/backports_test.cc
index ab9a61afa7eb1680580c7e0c41f8ff1b47ef6742..80a954e062b06924c6048ac8b011dc1034706e8e 100644
--- a/tensorflow/tools/graph_transforms/backports_test.cc
+++ b/tensorflow/tools/graph_transforms/backports_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/public/session.h"
@@ -191,7 +192,7 @@ TEST(BackportTensorArrayV3Test, TestBackportTensorArrayV3Subtypes) {
     std::map<string, const NodeDef*> node_lookup;
     MapNamesToNodes(result, &node_lookup);
     ASSERT_EQ(1, node_lookup.count("v3_node"));
-    EXPECT_TRUE(StringPiece(node_lookup.at("v3_node")->op()).ends_with("V2"));
+    EXPECT_TRUE(str_util::EndsWith(node_lookup.at("v3_node")->op(), "V2"));
   }
 }
 
diff --git a/tensorflow/tools/graph_transforms/fold_constants_lib.cc b/tensorflow/tools/graph_transforms/fold_constants_lib.cc
index 250f54e20fba6e24fe95741b1437ac3718ace6fb..85660f94a85dce29360525f7bb7474494b3f010f 100644
--- a/tensorflow/tools/graph_transforms/fold_constants_lib.cc
+++ b/tensorflow/tools/graph_transforms/fold_constants_lib.cc
@@ -283,6 +283,10 @@ Status FoldConstants(const GraphDef& input_graph_def,
     };
   }
 
+  TF_RETURN_IF_ERROR(context.GetOneInt64Parameter(
+      "max_constant_size_in_bytes", cf_opts.max_constant_size_in_bytes,
+      &cf_opts.max_constant_size_in_bytes));
+
   // Constant folding.
   bool was_mutated;
   TF_RETURN_IF_ERROR(ConstantFold(cf_opts, nullptr, Env::Default(), nullptr,
diff --git a/tensorflow/tools/graph_transforms/fold_constants_test.cc b/tensorflow/tools/graph_transforms/fold_constants_test.cc
index 41106de008d832a022290e6da38cca8ad6d23ffd..a082399a87dbaad913be421fe273ba89b6f7340e 100644
--- a/tensorflow/tools/graph_transforms/fold_constants_test.cc
+++ b/tensorflow/tools/graph_transforms/fold_constants_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/public/session.h"
@@ -209,10 +210,10 @@ class ConstantFoldingTest : public ::testing::Test {
     for (const NodeDef& node : graph_def.node()) {
       const StringPiece name(node.name());
       const int occurrence_count = folded_node_map.count(node.name());
-      if (name.ends_with("expect_removed")) {
+      if (str_util::EndsWith(name, "expect_removed")) {
         EXPECT_EQ(0, occurrence_count) << "node.name()=" << node.name();
       }
-      if (name.ends_with("expect_remains")) {
+      if (str_util::EndsWith(name, "expect_remains")) {
         EXPECT_EQ(1, occurrence_count) << "node.name()=" << node.name();
       }
     }
@@ -370,6 +371,46 @@ class ConstantFoldingTest : public ::testing::Test {
     EXPECT_EQ(0, node_map.count("b"));
     EXPECT_EQ(1, node_map.count("c"));
   }
+
+  void TestMaxConstantSizeInBytes() {
+    auto root = tensorflow::Scope::NewRootScope();
+
+    const int width = 100;
+
+    Tensor a_data(DT_FLOAT, TensorShape({width}));
+    test::FillIota<float>(&a_data, 1.0f);
+    Output a_const = ::tensorflow::ops::Const(
+        root.WithOpName("a_expect_remains"), Input::Initializer(a_data));
+
+    Tensor b_data(DT_FLOAT, TensorShape({width}));
+    test::FillIota<float>(&b_data, 1.0f);
+    Output b_const = ::tensorflow::ops::Const(
+        root.WithOpName("b_expect_remains"), Input::Initializer(b_data));
+
+    Output add = ::tensorflow::ops::Add(root.WithOpName("add_expect_remains"),
+                                        a_const, b_const);
+
+    Output placeholder = ::tensorflow::ops::Placeholder(
+        root.WithOpName("placeholder_expect_remains"), DT_FLOAT);
+
+    Output mul = ::tensorflow::ops::Mul(
+        root.WithOpName("output_expect_remains"), add, placeholder);
+
+    GraphDef graph_def;
+    TF_ASSERT_OK(root.ToGraphDef(&graph_def));
+
+    Tensor placeholder_tensor(DT_FLOAT, TensorShape({width}));
+    test::FillIota<float>(&placeholder_tensor, 1.0f);
+
+    // Setting the maximum constant size to 10 bytes should stop the constant
+    // folding at add(a, b) that would have yielded a constant of
+    // 100*sizeof(float) bytes.
+    graph_transforms::TransformFuncContext context;
+    context.params["max_constant_size_in_bytes"] = {"10"};
+    TestConstantFolding(graph_def,
+                        {{"placeholder_expect_remains", placeholder_tensor}},
+                        {}, {"output_expect_remains"}, context);
+  }
 };
 
 TEST_F(ConstantFoldingTest, TestSimpleAdd) { TestSimpleAdd(); }
@@ -394,5 +435,9 @@ TEST_F(ConstantFoldingTest, TestRemoveUnusedNodesMultipleOutputs) {
   TestRemoveUnusedNodesMultipleOutputs();
 }
 
+TEST_F(ConstantFoldingTest, TestMaxConstantSizeInBytes) {
+  TestMaxConstantSizeInBytes();
+}
+
 }  // namespace graph_transforms
 }  // namespace tensorflow
diff --git a/tensorflow/tools/graph_transforms/fold_old_batch_norms_test.cc b/tensorflow/tools/graph_transforms/fold_old_batch_norms_test.cc
index 272410c693ae8de8bb131ea577700fd16815c42e..7651a03fe51012678d6d6fc495fd82e497aa512b 100644
--- a/tensorflow/tools/graph_transforms/fold_old_batch_norms_test.cc
+++ b/tensorflow/tools/graph_transforms/fold_old_batch_norms_test.cc
@@ -398,7 +398,7 @@ TEST_F(FoldOldBatchNormsTest, TestFoldFusedBatchNorms) {
 }
 
 TEST_F(FoldOldBatchNormsTest, TestFoldFusedBatchNormsWithConcat) {
-  // Test axis is not 3, so all weigths and offsets are fused to each of inputs
+  // Test axis is not 3, so all weights and offsets are fused to each of inputs
   // of conv2d.
   TestFoldFusedBatchNormsWithConcat(/*split=*/true);
   // Test axis = 3, BatchNorm weights and offsets will be split before fused
diff --git a/tensorflow/tools/graph_transforms/freeze_requantization_ranges.cc b/tensorflow/tools/graph_transforms/freeze_requantization_ranges.cc
index 2436c7e4a2dc5c8172de2a35abbfc551d6e410fd..f401723808c086bd69743b75b8b4d972e8ab0b83 100644
--- a/tensorflow/tools/graph_transforms/freeze_requantization_ranges.cc
+++ b/tensorflow/tools/graph_transforms/freeze_requantization_ranges.cc
@@ -40,8 +40,8 @@ Status ExtractMinMaxRecords(const string& log_file_name,
   for (const string& file_line : file_lines) {
     // We expect to find a line with components separated by semicolons, so to
     // start make sure that the basic structure is in place/
-    StringPiece line(file_line);
-    if (!line.contains(print_suffix + ";" + requant_prefix)) {
+    if (!str_util::StrContains(file_line,
+                               print_suffix + ";" + requant_prefix)) {
       continue;
     }
     std::vector<string> line_parts = str_util::Split(file_line, ';');
@@ -53,8 +53,7 @@ Status ExtractMinMaxRecords(const string& log_file_name,
     bool min_max_found = false;
     int min_max_index;
     for (int i = 1; i < line_parts.size(); ++i) {
-      StringPiece line_part(line_parts[i]);
-      if (line_part.starts_with(requant_prefix)) {
+      if (str_util::StartsWith(line_parts[i], requant_prefix)) {
         min_max_found = true;
         min_max_index = i;
       }
@@ -90,7 +89,7 @@ Status ExtractMinMaxRecords(const string& log_file_name,
       continue;
     }
     StringPiece name_string = line_parts[min_max_index - 1];
-    if (!name_string.ends_with(print_suffix)) {
+    if (!str_util::EndsWith(name_string, print_suffix)) {
       continue;
     }
     string name =
diff --git a/tensorflow/tools/graph_transforms/insert_logging.cc b/tensorflow/tools/graph_transforms/insert_logging.cc
index e1ee2b420b062937b5e50c10a05406df3cbd7977..377665448c244aeace78f231ba0c263613afd9a0 100644
--- a/tensorflow/tools/graph_transforms/insert_logging.cc
+++ b/tensorflow/tools/graph_transforms/insert_logging.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/graph/subgraph.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
 #include "tensorflow/core/util/command_line_flags.h"
@@ -101,7 +102,7 @@ Status InsertLogging(const GraphDef& input_graph_def,
     const bool op_matches = (ops.count(node.op()) > 0);
     bool prefix_matches = false;
     for (const string& prefix : prefixes) {
-      if (StringPiece(node.name()).starts_with(prefix)) {
+      if (str_util::StartsWith(node.name(), prefix)) {
         prefix_matches = true;
       }
     }
diff --git a/tensorflow/tools/graph_transforms/sparsify_gather.cc b/tensorflow/tools/graph_transforms/sparsify_gather.cc
index 701e350fc39d083665f5420e6b73510c182e12ce..cc82100148117c7846ba5781e1a97e172ad7f03c 100644
--- a/tensorflow/tools/graph_transforms/sparsify_gather.cc
+++ b/tensorflow/tools/graph_transforms/sparsify_gather.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/graph/subgraph.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
 #include "tensorflow/core/util/command_line_flags.h"
@@ -88,7 +89,7 @@ void CreateConstNode(const Tensor& tensor, const string& name,
 
 string GetMonolithicTensorKey(const string& tensor_slice_name) {
   std::vector<string> names = Split(tensor_slice_name, "/");
-  if (StringPiece(names[names.size() - 1]).starts_with("part_")) {
+  if (str_util::StartsWith(names[names.size() - 1], "part_")) {
     CHECK_GE(names.size(), 2);
     names.pop_back();
   }
@@ -102,8 +103,8 @@ Status ObtainTensorSlice(const GraphDef& input_graph_def,
   for (const auto& node : input_graph_def.node()) {
     std::vector<string> node_name_parts = Split(node.name(), "/");
     if (node_name_parts.size() == 2 &&
-        StringPiece(node_name_parts[0]).starts_with("save") &&
-        StringPiece(node_name_parts[1]).starts_with("Assign") &&
+        str_util::StartsWith(node_name_parts[0], "save") &&
+        str_util::StartsWith(node_name_parts[1], "Assign") &&
         node.input(0) == target_name) {
       restore_node_name = node.input(1);
       break;
diff --git a/tensorflow/tools/graph_transforms/transform_graph_test.cc b/tensorflow/tools/graph_transforms/transform_graph_test.cc
index bc2412fcbdba90731318eea1a2239aa914b35ffc..b276229aa44f747ee81ebcdfe204468929c9eb53 100644
--- a/tensorflow/tools/graph_transforms/transform_graph_test.cc
+++ b/tensorflow/tools/graph_transforms/transform_graph_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/public/session.h"
@@ -112,12 +113,11 @@ class TransformGraphTest : public ::testing::Test {
     graph_transforms::MapNamesToNodes(out_graph_def, &out_node_map);
 
     for (const NodeDef& node : out_graph_def.node()) {
-      const StringPiece name(node.name());
       const int occurrence_count = out_node_map.count(node.name());
-      if (name.ends_with("expect_removed")) {
+      if (str_util::EndsWith(node.name(), "expect_removed")) {
         EXPECT_EQ(0, occurrence_count) << "node.name()=" << node.name();
       }
-      if (name.ends_with("expect_remains")) {
+      if (str_util::EndsWith(node.name(), "expect_remains")) {
         EXPECT_EQ(1, occurrence_count) << "node.name()=" << node.name();
       }
     }
@@ -139,7 +139,7 @@ class TransformGraphTest : public ::testing::Test {
     Status no_such_status =
         TransformGraph({}, {}, {{"test_no_such_transform", {}}}, &graph_def);
     EXPECT_TRUE(
-        StringPiece(no_such_status.ToString()).contains("not recognized"));
+        str_util::StrContains(no_such_status.ToString(), "not recognized"));
   }
 
   void TestParseTransformParameters() {
diff --git a/tensorflow/tools/graph_transforms/transform_utils.cc b/tensorflow/tools/graph_transforms/transform_utils.cc
index 55f28a9e1d8d639a316c9bd121204d603217dea3..367048965d146d782267f23330a435ae72f7f49a 100644
--- a/tensorflow/tools/graph_transforms/transform_utils.cc
+++ b/tensorflow/tools/graph_transforms/transform_utils.cc
@@ -88,7 +88,7 @@ void NodeNamePartsFromInput(const string& input_name, string* prefix,
     *suffix = ":" + input_parts[1];
   }
   StringPiece node_name_piece(input_parts[0]);
-  if (node_name_piece.Consume("^")) {
+  if (str_util::ConsumePrefix(&node_name_piece, "^")) {
     *prefix = "^";
   } else {
     *prefix = "";
@@ -200,8 +200,7 @@ Status SortByExecutionOrder(const GraphDef& input_graph_def,
       // for merge only wait for one non-control input.
       int32 num_control_edges = 0;
       for (int i = 0; i < node_def.input_size(); ++i) {
-        StringPiece input_name(node_def.input(i));
-        if (input_name.starts_with("^")) {
+        if (str_util::StartsWith(node_def.input(i), "^")) {
           num_control_edges++;
         }
       }
@@ -504,7 +503,7 @@ Status RenameNodeInputs(const GraphDef& input_graph_def,
           const string& dest_name = input_to_rename.second;
           bool is_match;
           string match_name;
-          if (StringPiece(source_name).ends_with(":*")) {
+          if (str_util::EndsWith(source_name, ":*")) {
             is_match = true;
             string prefix;
             string unused_node_name;
diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD
index 3fbdb5cacd1fd0039deaae5ac330b6c2ca006a68..0ede8c63704ac4a474eb0d19e17cf5f365abca77 100644
--- a/tensorflow/tools/lib_package/BUILD
+++ b/tensorflow/tools/lib_package/BUILD
@@ -138,7 +138,6 @@ genrule(
         "@zlib_archive//:zlib.h",
     ] + if_mkl([
         "//third_party/mkl:LICENSE",
-        "@mkl//:LICENSE",
     ]),
     outs = ["include/tensorflow/c/LICENSE"],
     cmd = "$(location :concat_licenses.sh) $(SRCS) >$@",
@@ -176,7 +175,6 @@ genrule(
         "@zlib_archive//:zlib.h",
     ] + if_mkl([
         "//third_party/mkl:LICENSE",
-        "@mkl//:LICENSE",
     ]),
     outs = ["include/tensorflow/jni/LICENSE"],
     cmd = "$(location :concat_licenses.sh) $(SRCS) >$@",
diff --git a/tensorflow/tools/mlpbtxt/BUILD b/tensorflow/tools/mlpbtxt/BUILD
index f9f48c6500cee99dce1f5c9ffe6284e578e82669..89c683c8c422b7341517d80f7c55cceb1636a657 100644
--- a/tensorflow/tools/mlpbtxt/BUILD
+++ b/tensorflow/tools/mlpbtxt/BUILD
@@ -32,15 +32,3 @@ tf_cc_binary(
         "//tensorflow/core:op_gen_lib",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 2607b9d7049828db95e7cbdddad30541a26942b0..376644718f427cf172353dc172c028ee6ec2e8fa 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -55,6 +55,12 @@ COMMON_PIP_DEPS = [
     "setup.py",
     ":included_headers",
     "//tensorflow:tensorflow_py",
+    "//tensorflow/contrib/autograph:autograph",
+    "//tensorflow/contrib/autograph/converters:converters",
+    "//tensorflow/contrib/autograph/converters:test_lib",
+    "//tensorflow/contrib/autograph/impl:impl",
+    "//tensorflow/contrib/autograph/pyct:pyct",
+    "//tensorflow/contrib/autograph/pyct/static_analysis:static_analysis",
     "//tensorflow/contrib/boosted_trees:boosted_trees_pip",
     "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip",
     "//tensorflow/contrib/data/python/kernel_tests:dataset_serialization_test",
@@ -68,12 +74,6 @@ COMMON_PIP_DEPS = [
     "//tensorflow/contrib/labeled_tensor:labeled_tensor_pip",
     "//tensorflow/contrib/nn:nn_py",
     "//tensorflow/contrib/predictor:predictor_pip",
-    "//tensorflow/contrib/py2tf:py2tf",
-    "//tensorflow/contrib/py2tf/converters:converters",
-    "//tensorflow/contrib/py2tf/converters:test_lib",
-    "//tensorflow/contrib/py2tf/impl:impl",
-    "//tensorflow/contrib/py2tf/pyct:pyct",
-    "//tensorflow/contrib/py2tf/pyct/static_analysis:static_analysis",
     "//tensorflow/contrib/receptive_field:receptive_field_pip",
     "//tensorflow/contrib/session_bundle:session_bundle_pip",
     "//tensorflow/contrib/signal:signal_py",
@@ -94,6 +94,7 @@ COMMON_PIP_DEPS = [
     "//tensorflow/python:util_example_parser_configuration",
     "//tensorflow/python/debug:debug_pip",
     "//tensorflow/python/eager:eager_pip",
+    "//tensorflow/python/kernel_tests/testdata:self_adjoint_eig_op_test_files",
     "//tensorflow/python/saved_model:saved_model",
     "//tensorflow/python/tools:tools_pip",
     "//tensorflow/python:test_ops",
@@ -140,8 +141,10 @@ filegroup(
         "@kafka//:LICENSE",
         "@libxsmm_archive//:LICENSE",
         "@lmdb//:LICENSE",
+        "@local_config_nccl//:LICENSE",
         "@local_config_sycl//sycl:LICENSE.text",
         "@grpc//third_party/nanopb:LICENSE.txt",
+        "@grpc//third_party/address_sorting:LICENSE",
         "@nasm//:LICENSE",
         "@nsync//:LICENSE",
         "@pcre//:LICENCE",
@@ -155,9 +158,6 @@ filegroup(
         "@org_python_pypi_backports_weakref//:LICENSE",
     ] + if_mkl([
         "//third_party/mkl:LICENSE",
-        "@mkl//:LICENSE",
-    ]) + if_not_windows([
-        "@nccl_archive//:LICENSE.txt",
     ]) + tf_additional_license_deps(),
 )
 
@@ -170,6 +170,7 @@ sh_binary(
         "//conditions:default": COMMON_PIP_DEPS + [
             ":simple_console",
             "//tensorflow/contrib/lite/python:interpreter_test_data",
+            "//tensorflow/contrib/lite/python:tf_lite_py_pip",
             "//tensorflow/contrib/lite/toco:toco",
             "//tensorflow/contrib/lite/toco/python:toco_wrapper",
             "//tensorflow/contrib/lite/toco/python:toco_from_protos",
diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh
index dc31e4c5f703b29f464519d5f1fd54f9b5e11690..8f0cf8c3d194807b6c82f50b5ac8c7fe7527fea5 100755
--- a/tensorflow/tools/pip_package/build_pip_package.sh
+++ b/tensorflow/tools/pip_package/build_pip_package.sh
@@ -139,7 +139,9 @@ function main() {
     fi
     mkdir "${TMPDIR}/tensorflow/aux-bin"
     # Install toco as a binary in aux-bin.
-    cp bazel-bin/tensorflow/contrib/lite/toco/toco ${TMPDIR}/tensorflow/aux-bin/
+    # TODO(aselle): Re-enable this when we find a way to do it without doubling
+    # the whl size (over the limit).
+    # cp bazel-bin/tensorflow/contrib/lite/toco/toco ${TMPDIR}/tensorflow/aux-bin/
   fi
 
   # protobuf pip package doesn't ship with header files. Copy the headers
@@ -160,7 +162,9 @@ function main() {
 
   # Before we leave the top-level directory, make sure we know how to
   # call python.
-  source tools/python_bin_path.sh
+  if [[ -e tools/python_bin_path.sh ]]; then
+    source tools/python_bin_path.sh
+  fi
 
   pushd ${TMPDIR}
   rm -f MANIFEST
diff --git a/tensorflow/tools/pip_package/pip_smoke_test.py b/tensorflow/tools/pip_package/pip_smoke_test.py
index b66c45ec13067e20fbd3c333cf562001945a63ea..e2518f6cbf0beb0943e5b7289796459d14992bfc 100644
--- a/tensorflow/tools/pip_package/pip_smoke_test.py
+++ b/tensorflow/tools/pip_package/pip_smoke_test.py
@@ -75,6 +75,7 @@ BLACKLIST = [
     "//tensorflow/contrib/timeseries/examples:data/period_trend.csv",  # pylint:disable=line-too-long
     "//tensorflow/contrib/timeseries/python/timeseries:test_utils",
     "//tensorflow/contrib/timeseries/python/timeseries/state_space_models:test_utils",  # pylint:disable=line-too-long
+    "//tensorflow/contrib/image:sparse_image_warp_test_data",
 ]
 
 
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 7fdf0d8c17790de77595a6979628d93913b6fc93..ed941c3bc23c3b0e5e21fcbe03068f174b5887c6 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -29,7 +29,7 @@ from setuptools.dist import Distribution
 # This version string is semver compatible, but incompatible with pip.
 # For pip, we will remove all '-' characters from this string, and use the
 # result for pip.
-_VERSION = '1.6.0'
+_VERSION = '1.7.0'
 
 REQUIRED_PACKAGES = [
     'absl-py >= 0.1.6',
@@ -39,7 +39,7 @@ REQUIRED_PACKAGES = [
     'numpy >= 1.13.3',
     'six >= 1.10.0',
     'protobuf >= 3.4.0',
-    'tensorboard >= 1.6.0, < 1.7.0',
+    'tensorboard >= 1.7.0, < 1.8.0',
     'termcolor >= 1.1.0',
 ]
 
@@ -62,7 +62,7 @@ else:
 if 'tf_nightly' in project_name:
   for i, pkg in enumerate(REQUIRED_PACKAGES):
     if 'tensorboard' in pkg:
-      REQUIRED_PACKAGES[i] = 'tb-nightly >= 1.7.0a0, < 1.8.0a0'
+      REQUIRED_PACKAGES[i] = 'tb-nightly >= 1.8.0a0, < 1.9.0a0'
       break
 
 # weakref.finalize and enum were introduced in Python 3.4
@@ -200,8 +200,7 @@ headers = (list(find_files('*.h', 'tensorflow/core')) +
            list(find_files('*.h', 'tensorflow/stream_executor')) +
            list(find_files('*.h', 'google/protobuf_archive/src')) +
            list(find_files('*', 'third_party/eigen3')) +
-           list(find_files('*', 'external/eigen_archive')) +
-           list(find_files('*.h', 'external/nsync/public')))
+           list(find_files('*', 'external/eigen_archive')))
 
 setup(
     name=project_name,
diff --git a/tensorflow/tools/proto_text/BUILD b/tensorflow/tools/proto_text/BUILD
index 39c4aac1e8b5dfb2582115881c7d10ca3cd04f68..ef7bfdd3c9e46a21e25a7cdfe812b9e37029188b 100644
--- a/tensorflow/tools/proto_text/BUILD
+++ b/tensorflow/tools/proto_text/BUILD
@@ -96,18 +96,3 @@ tf_cc_test(
         "//tensorflow/core:test_main",
     ],
 )
-
-# -----------------------------------------------------------------------------
-# Google-internal targets.  These must be at the end for syncrepo.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tools/quantization/BUILD b/tensorflow/tools/quantization/BUILD
index e99ad06a06294c4d037b76ea9450e51bd795e79d..17443a8617451cb9b09867e132855d6316d8e423 100644
--- a/tensorflow/tools/quantization/BUILD
+++ b/tensorflow/tools/quantization/BUILD
@@ -76,15 +76,3 @@ py_binary(
         "//tensorflow/python:platform",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tools/test/BUILD b/tensorflow/tools/test/BUILD
index 159a8c1cfbdb793d05eda850afb54e860bf2614e..4b2026b9472b651f8e0571155dab8952d20aa8b2 100644
--- a/tensorflow/tools/test/BUILD
+++ b/tensorflow/tools/test/BUILD
@@ -92,15 +92,3 @@ tf_py_logged_benchmark(
     name = "rnn_op_benchmark",
     target = "//tensorflow/python/kernel_tests:rnn_test",
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tools/test/performance.bzl b/tensorflow/tools/test/performance.bzl
index cee53dd5b61e50126948e3652865a32f45eab092..3486871080c78dc7a1cc201ea2a4d45ebc342758 100644
--- a/tensorflow/tools/test/performance.bzl
+++ b/tensorflow/tools/test/performance.bzl
@@ -31,7 +31,7 @@ def tf_cc_logged_benchmark(
       size = "large",
       srcs = ["//tensorflow/tools/test:run_and_gather_logs"],
       args = [
-          "--name=//%s:%s" % (PACKAGE_NAME, name),
+          "--name=//%s:%s" % (native.package_name(), name),
           "--test_name=" + target,
           "--test_args=--benchmarks=%s" % benchmarks,
           "--benchmark_type=%s" % benchmark_type,
diff --git a/tensorflow/tools/test/upload_test_benchmarks.py b/tensorflow/tools/test/upload_test_benchmarks.py
index edd093510e5483ef946e1177641371f00c92b50b..9c45359ee1b037ffb01820f874b88b6cabc6d14b 100644
--- a/tensorflow/tools/test/upload_test_benchmarks.py
+++ b/tensorflow/tools/test/upload_test_benchmarks.py
@@ -87,6 +87,7 @@ import json
 import os
 import shutil
 
+from six import text_type
 from google.cloud import datastore
 from six import text_type
 
diff --git a/tensorflow/user_ops/BUILD b/tensorflow/user_ops/BUILD
index e8198efe2e534d261af21c83682a848589cf7916..71443cc41eb5ecdd23e1a47712633c77fcd7d395 100644
--- a/tensorflow/user_ops/BUILD
+++ b/tensorflow/user_ops/BUILD
@@ -50,15 +50,3 @@ tf_py_test(
     additional_deps = ["//tensorflow:tensorflow_py"],
     data = [":invalid_op.so"],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/version_check.bzl b/tensorflow/version_check.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..79e721dab422c1449214acbe5fc1643edc3a9db0
--- /dev/null
+++ b/tensorflow/version_check.bzl
@@ -0,0 +1,48 @@
+""" Helpers to check minimum version of bazel."""
+
+def _extract_version_number(bazel_version):
+  """Extracts the semantic version number from a version string
+
+  Args:
+    bazel_version: the version string that begins with the semantic version
+      e.g. "1.2.3rc1 abc1234" where "abc1234" is a commit hash.
+
+  Returns:
+    The semantic version string, like "1.2.3".
+  """
+  for i in range(len(bazel_version)):
+    c = bazel_version[i]
+    if not (c.isdigit() or c == "."):
+      return bazel_version[:i]
+  return bazel_version
+
+# Parse the bazel version string from `native.bazel_version`.
+# e.g.
+# "0.10.0rc1 abc123d" => (0, 10, 0)
+# "0.3.0" => (0, 3, 0)
+def _parse_bazel_version(bazel_version):
+  """Parses a version string into a 3-tuple of ints
+
+  int tuples can be compared directly using binary operators (<, >).
+
+  Args:
+    bazel_version: the Bazel version string
+
+  Returns:
+    An int 3-tuple of a (major, minor, patch) version.
+  """
+
+  version = _extract_version_number(bazel_version)
+  return tuple([int(n) for n in version.split(".")])
+
+def check_bazel_version_at_least(minimum_bazel_version):
+  if "bazel_version" not in dir(native):
+    fail("\nCurrent Bazel version is lower than 0.2.1, expected at least %s\n" % minimum_bazel_version)
+  elif not native.bazel_version:
+    print("\nCurrent Bazel is not a release version, cannot check for compatibility.")
+    print("Make sure that you are running at least Bazel %s.\n" % minimum_bazel_version)
+    return
+
+  if _parse_bazel_version(native.bazel_version) < _parse_bazel_version(minimum_bazel_version):
+    fail("\nCurrent Bazel version is {}, expected at least {}\n".format(
+        native.bazel_version, minimum_bazel_version))
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index a922808a70f8bd082f2aab7d0f01c404b304e121..ace0d411b995cabf1c34c868178ebb0619232a56 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -2,6 +2,7 @@
 
 load("//third_party/gpus:cuda_configure.bzl", "cuda_configure")
 load("//third_party/tensorrt:tensorrt_configure.bzl", "tensorrt_configure")
+load("//third_party:nccl/nccl_configure.bzl", "nccl_configure")
 load("//third_party/mkl:build_defs.bzl", "mkl_repository")
 load("//third_party/git:git_configure.bzl", "git_configure")
 load("//third_party/py:python_configure.bzl", "python_configure")
@@ -10,69 +11,28 @@ load("//third_party/sycl:sycl_configure.bzl", "sycl_configure")
 load("//third_party/toolchains/clang6:repo.bzl", "clang6_configure")
 load("//third_party/toolchains/cpus/arm:arm_compiler_configure.bzl", "arm_compiler_configure")
 load("//third_party:repo.bzl", "tf_http_archive")
+load("//third_party/clang_toolchain:cc_configure_clang.bzl", "cc_download_clang_toolchain")
 load("@io_bazel_rules_closure//closure/private:java_import_external.bzl", "java_import_external")
 load("@io_bazel_rules_closure//closure:defs.bzl", "filegroup_external")
 load("//tensorflow/tools/def_file_filter:def_file_filter_configure.bzl",
      "def_file_filter_configure")
 
-def _extract_version_number(bazel_version):
-  """Extracts the semantic version number from a version string
-
-  Args:
-    bazel_version: the version string that begins with the semantic version
-      e.g. "1.2.3rc1 abc1234" where "abc1234" is a commit hash.
-
-  Returns:
-    The semantic version string, like "1.2.3".
-  """
-  for i in range(len(bazel_version)):
-    c = bazel_version[i]
-    if not (c.isdigit() or c == "."):
-      return bazel_version[:i]
-  return bazel_version
-
-# Parse the bazel version string from `native.bazel_version`.
-# e.g.
-# "0.10.0rc1 abc123d" => (0, 10, 0)
-# "0.3.0" => (0, 3, 0)
-def _parse_bazel_version(bazel_version):
-  """Parses a version string into a 3-tuple of ints
-
-  int tuples can be compared directly using binary operators (<, >).
-
-  Args:
-    bazel_version: the Bazel version string
-
-  Returns:
-    An int 3-tuple of a (major, minor, patch) version.
-  """
-
-  version = _extract_version_number(bazel_version)
-  return tuple([int(n) for n in version.split(".")])
-
-def check_bazel_version_at_least(minimum_bazel_version):
-  if "bazel_version" not in dir(native):
-    fail("\nCurrent Bazel version is lower than 0.2.1, expected at least %s\n" % minimum_bazel_version)
-  elif not native.bazel_version:
-    print("\nCurrent Bazel is not a release version, cannot check for compatibility.")
-    print("Make sure that you are running at least Bazel %s.\n" % minimum_bazel_version)
-    return
-
-  if _parse_bazel_version(native.bazel_version) < _parse_bazel_version(minimum_bazel_version):
-    fail("\nCurrent Bazel version is {}, expected at least {}\n".format(
-        native.bazel_version, minimum_bazel_version))
+
+# Sanitize a dependency so that it works correctly from code that includes
+# TensorFlow as a submodule.
+def clean_dep(dep):
+  return str(Label(dep))
 
 # If TensorFlow is linked as a submodule.
 # path_prefix is no longer used.
 # tf_repo_name is thought to be under consideration.
 def tf_workspace(path_prefix="", tf_repo_name=""):
-  # We must check the bazel version before trying to parse any other BUILD
-  # files, in case the parsing of those build files depends on the bazel
-  # version we require here.
-  check_bazel_version_at_least("0.10.0")
+  # Note that we check the minimum bazel version in WORKSPACE.
   clang6_configure(name="local_config_clang6")
+  cc_download_clang_toolchain(name="local_config_download_clang")
   cuda_configure(name="local_config_cuda")
   tensorrt_configure(name="local_config_tensorrt")
+  nccl_configure(name="local_config_nccl")
   git_configure(name="local_config_git")
   sycl_configure(name="local_config_sycl")
   python_configure(name="local_config_python")
@@ -85,17 +45,37 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   arm_compiler_configure(
       name="local_config_arm_compiler",
       remote_config_repo="../arm_compiler",
-      build_file = str(Label("//third_party/toolchains/cpus/arm:BUILD")))
+      build_file = clean_dep("//third_party/toolchains/cpus/arm:BUILD"))
 
   mkl_repository(
-      name = "mkl",
+      name = "mkl_linux",
+      urls = [
+          "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.12/mklml_lnx_2018.0.1.20171227.tgz",
+          "https://github.com/intel/mkl-dnn/releases/download/v0.12/mklml_lnx_2018.0.1.20171227.tgz",
+      ],
+      sha256 = "feacc3d82565c1231470359b42c696236fae873704e0b013436afba5fd4fd30f",
+      strip_prefix = "mklml_lnx_2018.0.1.20171227",
+      build_file = clean_dep("//third_party/mkl:mkl.BUILD")
+  )
+  mkl_repository(
+      name = "mkl_windows",
       urls = [
-          "https://mirror.bazel.build/github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz",
-          "https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz",
+          "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.12/mklml_win_2018.0.1.20171227.zip",
+          "https://github.com/intel/mkl-dnn/releases/download/v0.12/mklml_win_2018.0.1.20171227.zip"
       ],
-      sha256 = "6b07cb7e5451db67c2e31e785ae458b18f7f363c60a61685488f69e9ae7199d4",
-      strip_prefix = "mklml_lnx_2018.0.1.20171007",
-      build_file = str(Label("//third_party/mkl:mkl.BUILD")),
+      sha256 = "24bae8d7b22b431a654acadea43f2243c46ae6b1e5a73a4a936825f31d284ee4",
+      strip_prefix = "mklml_win_2018.0.1.20171227",
+      build_file = clean_dep("//third_party/mkl:mkl.BUILD")
+  )
+  mkl_repository(
+      name = "mkl_darwin",
+      urls = [
+          "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.12/mklml_mac_2018.0.1.20171227.tgz",
+          "https://github.com/intel/mkl-dnn/releases/download/v0.12/mklml_mac_2018.0.1.20171227.tgz"
+      ],
+      sha256 = "0e954ec6fd3dc5e37f64c4043f6b5613dd687558da3df1028b3b7c29ff5cf77f",
+      strip_prefix = "mklml_mac_2018.0.1.20171227",
+      build_file = clean_dep("//third_party/mkl:mkl.BUILD")
   )
 
   if path_prefix:
@@ -105,12 +85,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "mkl_dnn",
       urls = [
-          "https://mirror.bazel.build/github.com/01org/mkl-dnn/archive/e0bfcaa7fcb2b1e1558f5f0676933c1db807a729.tar.gz",
-          "https://github.com/01org/mkl-dnn/archive/e0bfcaa7fcb2b1e1558f5f0676933c1db807a729.tar.gz",
+          "https://mirror.bazel.build/github.com/intel/mkl-dnn/archive/v0.12.tar.gz",
+          "https://github.com/intel/mkl-dnn/archive/v0.12.tar.gz",
       ],
-      sha256 = "02e244f63dd95402691a361392504c143eede9a89043426f174836638a9cbf09",
-      strip_prefix = "mkl-dnn-e0bfcaa7fcb2b1e1558f5f0676933c1db807a729",
-      build_file = str(Label("//third_party/mkl_dnn:mkldnn.BUILD")),
+      sha256 = "86fa2a8c12a56e3b725945acedeaa82492746be02545aba6d710f097e013e19e",
+      strip_prefix = "mkl-dnn-0.12",
+      build_file = clean_dep("//third_party/mkl_dnn:mkldnn.BUILD"),
   )
 
   tf_http_archive(
@@ -121,19 +101,19 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
      sha256 = "5996380e3e8b981f55d1c8d58e709c00dbb4806ba367be75d0925a68cc2f6478",
      strip_prefix = "abseil-cpp-720c017e30339fd1786ce4aac68bc8559736e53f",
-     build_file = str(Label("//third_party:com_google_absl.BUILD")),
+     build_file = clean_dep("//third_party:com_google_absl.BUILD"),
   )
 
   tf_http_archive(
       name = "eigen_archive",
       urls = [
-          "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/2355b229ea4c.tar.gz",
-          "https://bitbucket.org/eigen/eigen/get/2355b229ea4c.tar.gz",
+          "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/6913f0cf7d06.tar.gz",
+          "https://bitbucket.org/eigen/eigen/get/6913f0cf7d06.tar.gz",
       ],
-      sha256 = "0cadb31a35b514bf2dfd6b5d38205da94ef326ec6908fc3fd7c269948467214f",
-      strip_prefix = "eigen-eigen-2355b229ea4c",
-      build_file = str(Label("//third_party:eigen.BUILD")),
-      patch_file = str(Label("//third_party:eigen_fix_cuda_compilation.patch"))
+      sha256 = "791b836cacd03e20bae5bdd25f1c4a5505a0a9975ba94a61eb4e2631fbd1d53a",
+      strip_prefix = "eigen-eigen-6913f0cf7d06",
+      build_file = clean_dep("//third_party:eigen.BUILD"),
+      patch_file = clean_dep("//third_party:eigen_fix_cuda_compilation.patch")
   )
 
   tf_http_archive(
@@ -146,7 +126,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
           # remove the whitelist entry in third_party/repo.bzl.
           # "https://github.com/raspberrypi/tools/archive/0e906ebc527eab1cdbf7adabff5b474da9562e9f.tar.gz",
       ],
-      build_file = str(Label("//:arm_compiler.BUILD")),
+      build_file = clean_dep("//:arm_compiler.BUILD"),
   )
 
   tf_http_archive(
@@ -157,7 +137,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "2ade869c3f42f23b5263c7d594aa3c7e5e61ac6a3afcaf5d6e42899d2a7986ce",
       strip_prefix = "libxsmm-1.8.1",
-      build_file = str(Label("//third_party:libxsmm.BUILD")),
+      build_file = clean_dep("//third_party:libxsmm.BUILD"),
   )
 
   tf_http_archive(
@@ -170,7 +150,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "932075525642b04ac6f1b50589f1df5cd72ec2f448b721fd32234cf183f0e755",
       strip_prefix = "or-tools-253f7955c6a1fd805408fba2e42ac6d45b312d15/src",
-      build_file = str(Label("//third_party:ortools.BUILD")),
+      build_file = clean_dep("//third_party:ortools.BUILD"),
   )
 
   tf_http_archive(
@@ -202,7 +182,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "6560547c63e4af82b0f202cb710ceabb3f21347a4b996db565a411da5b17aba0",
       strip_prefix = "farmhash-816a4ae622e964763ca0862d9dbd19324a1eaf45",
-      build_file = str(Label("//third_party:farmhash.BUILD")),
+      build_file = clean_dep("//third_party:farmhash.BUILD"),
   )
 
   tf_http_archive(
@@ -213,7 +193,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "0f30a15b1566d93f146c8d149878a06e91d9bb7ec2cfd76906df62a82be4aac9",
       strip_prefix = "highwayhash-dfcb97ca4fe9277bf9dc1802dd979b071896453b",
-      build_file = str(Label("//third_party:highwayhash.BUILD")),
+      build_file = clean_dep("//third_party:highwayhash.BUILD"),
   )
 
   tf_http_archive(
@@ -224,7 +204,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "00b0891c678c065446ca59bcee64719d0096d54d6886e6e472aeee2e170ae324",
       strip_prefix = "nasm-2.12.02",
-      build_file = str(Label("//third_party:nasm.BUILD")),
+      build_file = clean_dep("//third_party:nasm.BUILD"),
   )
 
   tf_http_archive(
@@ -235,7 +215,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "c15a9607892113946379ccea3ca8b85018301b200754f209453ab21674268e77",
       strip_prefix = "libjpeg-turbo-1.5.1",
-      build_file = str(Label("//third_party/jpeg:jpeg.BUILD")),
+      build_file = clean_dep("//third_party/jpeg:jpeg.BUILD"),
   )
 
   tf_http_archive(
@@ -246,7 +226,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "716c59c7dfc808a4c368f8ada526932be72b2fcea11dd85dc9d88b1df1dfe9c2",
       strip_prefix = "libpng-1.2.53",
-      build_file = str(Label("//third_party:png.BUILD")),
+      build_file = clean_dep("//third_party:png.BUILD"),
   )
 
   tf_http_archive(
@@ -257,7 +237,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "208780b3616f9de0aeb50822b7a8f5482f6515193859e91ed61637be6ad74fd4",
       strip_prefix = "sqlite-amalgamation-3200000",
-      build_file = str(Label("//third_party:sqlite.BUILD")),
+      build_file = clean_dep("//third_party:sqlite.BUILD"),
   )
 
   tf_http_archive(
@@ -268,7 +248,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "34a7377ba834397db019e8eb122e551a49c98f49df75ec3fcc92b9a794a4f6d1",
       strip_prefix = "giflib-5.1.4",
-      build_file = str(Label("//third_party:gif.BUILD")),
+      build_file = clean_dep("//third_party:gif.BUILD"),
   )
 
   tf_http_archive(
@@ -279,7 +259,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "105f8d68616f8248e24bf0e9372ef04d3cc10104f1980f54d57b2ce73a5ad56a",
       strip_prefix = "six-1.10.0",
-      build_file = str(Label("//third_party:six.BUILD")),
+      build_file = clean_dep("//third_party:six.BUILD"),
   )
 
   tf_http_archive(
@@ -290,7 +270,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "ff6d2e2962d834acb125cc4dcc80c54a8c17c253f4cc9d9c43b5102a560bb75d",
       strip_prefix = "astor-0.6.2",
-      build_file = str(Label("//third_party:astor.BUILD")),
+      build_file = clean_dep("//third_party:astor.BUILD"),
   )
 
   tf_http_archive(
@@ -301,7 +281,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "7068908321ecd2774f145193c4b34a11305bd104b4551b09273dfd1d6a374930",
       strip_prefix = "gast-0.2.0",
-      build_file = str(Label("//third_party:gast.BUILD")),
+      build_file = clean_dep("//third_party:gast.BUILD"),
   )
 
   tf_http_archive(
@@ -312,7 +292,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "1d6d69ce66211143803fbc56652b41d73b4a400a2891d7bf7a1cdf4c02de613b",
       strip_prefix = "termcolor-1.1.0",
-      build_file = str(Label("//third_party:termcolor.BUILD")),
+      build_file = clean_dep("//third_party:termcolor.BUILD"),
   )
 
   tf_http_archive(
@@ -333,7 +313,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "8813bf712a66b3d8b85dc289e1104ed220f1878cf981e2fe756dfaabe9a82892",
       strip_prefix = "backports.weakref-1.0rc1/src",
-      build_file = str(Label("//third_party:backports_weakref.BUILD")),
+      build_file = clean_dep("//third_party:backports_weakref.BUILD"),
   )
 
   tf_http_archive(
@@ -344,7 +324,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "2dadd04a2802de27e0fe5a19b76538f6da9d39ff244036afa00c1bba754de5ee",
       strip_prefix = "codegen-1.0",
-      build_file = str(Label("//third_party:codegen.BUILD")),
+      build_file = clean_dep("//third_party:codegen.BUILD"),
   )
 
   filegroup_external(
@@ -394,11 +374,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "nsync",
       urls = [
-          "https://mirror.bazel.build/github.com/google/nsync/archive/8502189abfa44c249c01c2cad64e6ed660a9a668.tar.gz",
-          "https://github.com/google/nsync/archive/8502189abfa44c249c01c2cad64e6ed660a9a668.tar.gz",
+          "https://mirror.bazel.build/github.com/google/nsync/archive/0559ce013feac8db639ee1bf776aca0325d28777.tar.gz",
+          "https://github.com/google/nsync/archive/0559ce013feac8db639ee1bf776aca0325d28777.tar.gz",
       ],
-      sha256 = "51f81ff4202bbb820cdbedc061bd2eb6765f2b5c06489e7a8694bedac329e8f8",
-      strip_prefix = "nsync-8502189abfa44c249c01c2cad64e6ed660a9a668",
+      sha256 = "6284454c5cd8b1dae2eeb8cf5eb63004de930b5427ed5f6b1aa793513df6b361",
+      strip_prefix = "nsync-0559ce013feac8db639ee1bf776aca0325d28777",
   )
 
   tf_http_archive(
@@ -429,7 +409,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
           "http://ftp.exim.org/pub/pcre/pcre-8.39.tar.gz",
       ],
       strip_prefix = "pcre-8.39",
-      build_file = str(Label("//third_party:pcre.BUILD")),
+      build_file = clean_dep("//third_party:pcre.BUILD"),
   )
 
   tf_http_archive(
@@ -441,7 +421,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
           "http://pilotfiber.dl.sourceforge.net/project/swig/swig/swig-3.0.8/swig-3.0.8.tar.gz",
       ],
       strip_prefix = "swig-3.0.8",
-      build_file = str(Label("//third_party:swig.BUILD")),
+      build_file = clean_dep("//third_party:swig.BUILD"),
   )
 
   tf_http_archive(
@@ -452,19 +432,20 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
           "https://curl.haxx.se/download/curl-7.49.1.tar.gz",
       ],
       strip_prefix = "curl-7.49.1",
-      build_file = str(Label("//third_party:curl.BUILD")),
+      build_file = clean_dep("//third_party:curl.BUILD"),
   )
 
   tf_http_archive(
       name = "grpc",
       urls = [
-          "https://mirror.bazel.build/github.com/grpc/grpc/archive/730b778632e79cc3c96ad237f282d687ee325ce7.tar.gz",
-          "https://github.com/grpc/grpc/archive/730b778632e79cc3c96ad237f282d687ee325ce7.tar.gz",
+          "https://mirror.bazel.build/github.com/grpc/grpc/archive/bd6bdf93279a39a8cd92978fd7c9d14eccd98fc2.tar.gz",
+          "https://github.com/grpc/grpc/archive/bd6bdf93279a39a8cd92978fd7c9d14eccd98fc2.tar.gz",
       ],
-      sha256 = "8c91a8d12e1e868cf51f7340b75507a8aa017a7e1b56f46ed6816aeb803dc9bd",
-      strip_prefix = "grpc-730b778632e79cc3c96ad237f282d687ee325ce7",
+      sha256 = "0a05bd355e4571b01d813dddffa38e57e689ac41b264dc9b1bd6ec66463ef5d6",
+      strip_prefix = "grpc-bd6bdf93279a39a8cd92978fd7c9d14eccd98fc2",
   )
 
+
   tf_http_archive(
       name = "linenoise",
       sha256 = "7f51f45887a3d31b4ce4fa5965210a5e64637ceac12720cfce7954d6a2e812f7",
@@ -473,7 +454,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
           "https://github.com/antirez/linenoise/archive/c894b9e59f02203dbe4e2be657572cf88c4230c3.tar.gz",
       ],
       strip_prefix = "linenoise-c894b9e59f02203dbe4e2be657572cf88c4230c3",
-      build_file = str(Label("//third_party:linenoise.BUILD")),
+      build_file = clean_dep("//third_party:linenoise.BUILD"),
   )
 
   # TODO(phawkins): currently, this rule uses an unofficial LLVM mirror.
@@ -481,12 +462,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "llvm",
       urls = [
-          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/738ee045416377e8c2094f7f61508ac1c178ff37.tar.gz",
-          "https://github.com/llvm-mirror/llvm/archive/738ee045416377e8c2094f7f61508ac1c178ff37.tar.gz",
+          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/7e78daafdd22f3f17720a103d29d89590534004e.tar.gz",
+          "https://github.com/llvm-mirror/llvm/archive/7e78daafdd22f3f17720a103d29d89590534004e.tar.gz",
       ],
-      sha256 = "4442ed6a05c13752338036b1b9f16b09264de24b6c0bf62325fb9ff75a09340f",
-      strip_prefix = "llvm-738ee045416377e8c2094f7f61508ac1c178ff37",
-      build_file = str(Label("//third_party/llvm:llvm.BUILD")),
+      sha256 = "a6d94bd9de23515a1e3792a830421e3885977ea43d03427cdbe68f98cb7e0045",
+      strip_prefix = "llvm-7e78daafdd22f3f17720a103d29d89590534004e",
+      build_file = clean_dep("//third_party/llvm:llvm.BUILD"),
   )
 
   tf_http_archive(
@@ -497,7 +478,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "108532fb94c6f227558d45be3f3347b52539f0f58290a7bb31ec06c462d05326",
       strip_prefix = "lmdb-LMDB_0.9.19/libraries/liblmdb",
-      build_file = str(Label("//third_party:lmdb.BUILD")),
+      build_file = clean_dep("//third_party:lmdb.BUILD"),
   )
 
   tf_http_archive(
@@ -508,7 +489,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "07d34db40593d257324ec5fb9debc4dc33f29f8fb44e33a2eeb35503e61d0fe2",
       strip_prefix = "jsoncpp-11086dd6a7eba04289944367ca82cea71299ed70",
-      build_file = str(Label("//third_party:jsoncpp.BUILD")),
+      build_file = clean_dep("//third_party:jsoncpp.BUILD"),
   )
 
   tf_http_archive(
@@ -524,12 +505,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "zlib_archive",
       urls = [
-          "https://mirror.bazel.build/zlib.net/zlib-1.2.8.tar.gz",
-          "http://zlib.net/fossils/zlib-1.2.8.tar.gz",
+          "https://mirror.bazel.build/zlib.net/zlib-1.2.11.tar.gz",
+          "https://zlib.net/zlib-1.2.11.tar.gz",
       ],
-      sha256 = "36658cb768a54c1d4dec43c3116c27ed893e88b02ecfcb44f2166f9c0b7f2a0d",
-      strip_prefix = "zlib-1.2.8",
-      build_file = str(Label("//third_party:zlib.BUILD")),
+      sha256 = "c3e5e9fdd5004dcb542feda5ee4f0ff0744628baf8ed2dd5d66f8ca1197cb1a1",
+      strip_prefix = "zlib-1.2.11",
+      build_file = clean_dep("//third_party:zlib.BUILD"),
   )
 
   tf_http_archive(
@@ -539,18 +520,18 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
           "http://www.kurims.kyoto-u.ac.jp/~ooura/fft.tgz",
       ],
       sha256 = "52bb637c70b971958ec79c9c8752b1df5ff0218a4db4510e60826e0cb79b5296",
-      build_file = str(Label("//third_party/fft2d:fft2d.BUILD")),
+      build_file = clean_dep("//third_party/fft2d:fft2d.BUILD"),
   )
 
   tf_http_archive(
       name = "snappy",
       urls = [
-          "https://mirror.bazel.build/github.com/google/snappy/archive/1.1.4.tar.gz",
-          "https://github.com/google/snappy/archive/1.1.4.tar.gz",
+          "https://mirror.bazel.build/github.com/google/snappy/archive/1.1.7.tar.gz",
+          "https://github.com/google/snappy/archive/1.1.7.tar.gz",
       ],
-      sha256 = "2f7504c73d85bac842e893340333be8cb8561710642fc9562fccdd9d2c3fcc94",
-      strip_prefix = "snappy-1.1.4",
-      build_file = str(Label("//third_party:snappy.BUILD")),
+      sha256 = "3dfa02e873ff51a11ee02b9ca391807f0c8ea0529a4924afa645fbf97163f9d4",
+      strip_prefix = "snappy-1.1.7",
+      build_file = clean_dep("//third_party:snappy.BUILD"),
   )
 
   tf_http_archive(
@@ -561,7 +542,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "2ca86fb6179ecbff789cc67c836139c1bbc0324ed8c04643405a30bf26325176",
       strip_prefix = "nccl-03d856977ecbaac87e598c0c4bafca96761b9ac7",
-      build_file = str(Label("//third_party:nccl.BUILD")),
+      build_file = clean_dep("//third_party:nccl/nccl_archive.BUILD"),
   )
 
   tf_http_archive(
@@ -572,8 +553,8 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "dd035d57c8f19b0b612dd6eefe6e5eebad76f506e302cccb7c2066f25a83585e",
       strip_prefix = "librdkafka-0.11.1",
-      build_file = str(Label("//third_party:kafka/BUILD")),
-      patch_file = str(Label("//third_party/kafka:config.patch")),
+      build_file = clean_dep("//third_party:kafka/BUILD"),
+      patch_file = clean_dep("//third_party/kafka:config.patch"),
   )
 
   tf_http_archive(
@@ -584,7 +565,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "b888d8ce5fc10254c3dd6c9020c7764dd53cf39cf011249d0b4deda895de1b7c",
       strip_prefix = "aws-sdk-cpp-1.3.15",
-      build_file = str(Label("//third_party:aws.BUILD")),
+      build_file = clean_dep("//third_party:aws.BUILD"),
   )
 
   java_import_external(
@@ -620,7 +601,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "3c8f25c02e806c3ce0ab5fb7da1817f89fc9732709024e2a81b6b82f7cc792a8",
       strip_prefix = "jemalloc-4.4.0",
-      build_file = str(Label("//third_party:jemalloc.BUILD")),
+      build_file = clean_dep("//third_party:jemalloc.BUILD"),
   )
 
   java_import_external(
@@ -629,7 +610,6 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       jar_urls = [
           "http://mirror.bazel.build/repo1.maven.org/maven2/com/google/testing/compile/compile-testing/0.11/compile-testing-0.11.jar",
           "http://repo1.maven.org/maven2/com/google/testing/compile/compile-testing/0.11/compile-testing-0.11.jar",
-          "http://maven.ibiblio.org/maven2/com/google/testing/compile/compile-testing/0.11/compile-testing-0.11.jar",
       ],
       licenses = ["notice"],  # New BSD License
       testonly_ = True,
@@ -666,7 +646,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "e0928ca4aa10ea1e0551e2d7ce4d1d7ea2d84b2abbdef082b0da84268791d0c4",
       strip_prefix = "pprof-c0fb62ec88c411cc91194465e54db2632845b650",
-      build_file = str(Label("//third_party:pprof.BUILD")),
+      build_file = clean_dep("//third_party:pprof.BUILD"),
   )
 
   tf_http_archive(
@@ -677,7 +657,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "6bfa06ab52a650ae7ee6963143a0bbc667d6504822cbd9670369b598f18c58c3",
       strip_prefix = "cub-1.8.0",
-      build_file = str(Label("//third_party:cub.BUILD")),
+      build_file = clean_dep("//third_party:cub.BUILD"),
   )
 
   tf_http_archive(
@@ -688,7 +668,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
           "https://github.com/cython/cython/archive/3732784c45cfb040a5b0936951d196f83a12ea17.tar.gz",
       ],
       strip_prefix = "cython-3732784c45cfb040a5b0936951d196f83a12ea17",
-      build_file = str(Label("//third_party:cython.BUILD")),
+      build_file = clean_dep("//third_party:cython.BUILD"),
       delete = ["BUILD.bazel"],
   )
 
@@ -710,7 +690,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
           "https://mirror.bazel.build/github.com/intel/ARM_NEON_2_x86_SSE/archive/0f77d9d182265259b135dad949230ecbf1a2633d.tar.gz",
           "https://github.com/intel/ARM_NEON_2_x86_SSE/archive/0f77d9d182265259b135dad949230ecbf1a2633d.tar.gz",
       ],
-      build_file = str(Label("//third_party:arm_neon_2_x86_sse.BUILD")),
+      build_file = clean_dep("//third_party:arm_neon_2_x86_sse.BUILD"),
   )
 
   tf_http_archive(
@@ -721,7 +701,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
           "https://mirror.bazel.build/github.com/google/flatbuffers/archive/971a68110e4fc1bace10fcb6deeb189e7e1a34ce.tar.gz",
           "https://github.com/google/flatbuffers/archive/971a68110e4fc1bace10fcb6deeb189e7e1a34ce.tar.gz",
       ],
-      build_file = str(Label("//third_party/flatbuffers:flatbuffers.BUILD")),
+      build_file = clean_dep("//third_party/flatbuffers:flatbuffers.BUILD"),
   )
 
   tf_http_archive(
@@ -731,6 +711,26 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
           "https://mirror.bazel.build/storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip",
           "https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip",
       ],
+      build_file = clean_dep("//third_party:tflite_mobilenet.BUILD"),
+  )
+
+  tf_http_archive(
+      name = "tflite_mobilenet_ssd",
+      sha256 = "767057f2837a46d97882734b03428e8dd640b93236052b312b2f0e45613c1cf0",
+      urls = [
+          "https://mirror.bazel.build/storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_ssd_tflite_v1.zip",
+          "https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_ssd_tflite_v1.zip",
+      ],
+      build_file = str(Label("//third_party:tflite_mobilenet.BUILD")),
+  )
+
+  tf_http_archive(
+      name = "tflite_conv_actions_frozen",
+      sha256 = "d947b38cba389b5e2d0bfc3ea6cc49c784e187b41a071387b3742d1acac7691e",
+      urls = [
+          "https://mirror.bazel.build/storage.googleapis.com/download.tensorflow.org/models/tflite/conv_actions_tflite.zip",
+          "https://storage.googleapis.com/download.tensorflow.org/models/tflite/conv_actions_tflite.zip",
+      ],
       build_file = str(Label("//third_party:tflite_mobilenet.BUILD")),
   )
 
@@ -741,7 +741,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
           "https://mirror.bazel.build/storage.googleapis.com/download.tensorflow.org/models/tflite/smartreply_1.0_2017_11_01.zip",
           "https://storage.googleapis.com/download.tensorflow.org/models/tflite/smartreply_1.0_2017_11_01.zip"
       ],
-      build_file = str(Label("//third_party:tflite_smartreply.BUILD")),
+      build_file = clean_dep("//third_party:tflite_smartreply.BUILD"),
   )
 
   ##############################################################################
@@ -805,7 +805,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   # Needed by Protobuf
   native.bind(
       name = "python_headers",
-      actual = str(Label("//util/python:python_headers")),
+      actual = clean_dep("//util/python:python_headers"),
   )
 
   # Needed by Protobuf
diff --git a/third_party/clang_toolchain/BUILD b/third_party/clang_toolchain/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/third_party/clang_toolchain/cc_configure_clang.bzl b/third_party/clang_toolchain/cc_configure_clang.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..1181110ea9674e56264509fe5bb043a587888200
--- /dev/null
+++ b/third_party/clang_toolchain/cc_configure_clang.bzl
@@ -0,0 +1,27 @@
+""" Downloads clang and configures the crosstool using bazel's autoconf."""
+
+load("@bazel_tools//tools/cpp:cc_configure.bzl", "cc_autoconf_impl")
+load(":download_clang.bzl", "download_clang")
+
+_TF_DOWNLOAD_CLANG = "TF_DOWNLOAD_CLANG"
+_TF_NEED_CUDA = "TF_NEED_CUDA"
+
+def _cc_clang_autoconf(repo_ctx):
+  if repo_ctx.os.environ.get(_TF_DOWNLOAD_CLANG) != "1":
+    return
+  if repo_ctx.os.environ.get(_TF_NEED_CUDA) == "1":
+    # Clang is handled separately for CUDA configs.
+    # See cuda_configure.bzl for more details.
+    return
+
+  download_clang(repo_ctx, out_folder='extra_tools')
+  overriden_tools = {'gcc': 'extra_tools/bin/clang'}
+  cc_autoconf_impl(repo_ctx, overriden_tools)
+
+cc_download_clang_toolchain = repository_rule(
+    environ = [
+        _TF_DOWNLOAD_CLANG,
+        _TF_NEED_CUDA,
+    ],
+    implementation = _cc_clang_autoconf,
+)
diff --git a/third_party/gpus/download_clang.bzl b/third_party/clang_toolchain/download_clang.bzl
similarity index 100%
rename from third_party/gpus/download_clang.bzl
rename to third_party/clang_toolchain/download_clang.bzl
diff --git a/third_party/examples/eager/spinn/spinn.py b/third_party/examples/eager/spinn/spinn.py
index 8a1c7db2ea14365be53a796a79fce77900e668e1..8a2b24aa4e284fd70c7148d26c3c4d6ccd04f98c 100644
--- a/third_party/examples/eager/spinn/spinn.py
+++ b/third_party/examples/eager/spinn/spinn.py
@@ -51,6 +51,9 @@ import tensorflow.contrib.eager as tfe
 from tensorflow.contrib.eager.python.examples.spinn import data
 
 
+layers = tf.keras.layers
+
+
 def _bundle(lstm_iter):
   """Concatenate a list of Tensors along 1st axis and split result into two.
 
@@ -78,17 +81,16 @@ def _unbundle(state):
   return tf.split(tf.concat(state, 1), state[0].shape[0], axis=0)
 
 
-class Reducer(tfe.Network):
+# pylint: disable=not-callable
+class Reducer(tf.keras.Model):
   """A module that applies reduce operation on left and right vectors."""
 
   def __init__(self, size, tracker_size=None):
     super(Reducer, self).__init__()
-    self.left = self.track_layer(tf.layers.Dense(5 * size, activation=None))
-    self.right = self.track_layer(
-        tf.layers.Dense(5 * size, activation=None, use_bias=False))
+    self.left = layers.Dense(5 * size, activation=None)
+    self.right = layers.Dense(5 * size, activation=None, use_bias=False)
     if tracker_size is not None:
-      self.track = self.track_layer(
-          tf.layers.Dense(5 * size, activation=None, use_bias=False))
+      self.track = layers.Dense(5 * size, activation=None, use_bias=False)
     else:
       self.track = None
 
@@ -123,7 +125,7 @@ class Reducer(tfe.Network):
     return h, c
 
 
-class Tracker(tfe.Network):
+class Tracker(tf.keras.Model):
   """A module that tracks the history of the sentence with an LSTM."""
 
   def __init__(self, tracker_size, predict):
@@ -134,10 +136,10 @@ class Tracker(tfe.Network):
       predict: (`bool`) Whether prediction mode is enabled.
     """
     super(Tracker, self).__init__()
-    self._rnn = self.track_layer(tf.nn.rnn_cell.LSTMCell(tracker_size))
+    self._rnn = tf.nn.rnn_cell.LSTMCell(tracker_size)
     self._state_size = tracker_size
     if predict:
-      self._transition = self.track_layer(tf.layers.Dense(4))
+      self._transition = layers.Dense(4)
     else:
       self._transition = None
 
@@ -182,7 +184,7 @@ class Tracker(tfe.Network):
       return unbundled, None
 
 
-class SPINN(tfe.Network):
+class SPINN(tf.keras.Model):
   """Stack-augmented Parser-Interpreter Neural Network.
 
   See https://arxiv.org/abs/1603.06021 for more details.
@@ -204,9 +206,9 @@ class SPINN(tfe.Network):
     """
     super(SPINN, self).__init__()
     self.config = config
-    self.reducer = self.track_layer(Reducer(config.d_hidden, config.d_tracker))
+    self.reducer = Reducer(config.d_hidden, config.d_tracker)
     if config.d_tracker is not None:
-      self.tracker = self.track_layer(Tracker(config.d_tracker, config.predict))
+      self.tracker = Tracker(config.d_tracker, config.predict)
     else:
       self.tracker = None
 
@@ -248,7 +250,7 @@ class SPINN(tfe.Network):
       trans = transitions[i]
       if self.tracker:
         # Invoke tracker to obtain the current tracker states for the sentences.
-        tracker_states, trans_hypothesis = self.tracker(buffers, stacks)
+        tracker_states, trans_hypothesis = self.tracker(buffers, stacks=stacks)
         if trans_hypothesis:
           trans = tf.argmax(trans_hypothesis, axis=-1)
       else:
@@ -273,7 +275,27 @@ class SPINN(tfe.Network):
     return _bundle([stack.pop() for stack in stacks])[0]
 
 
-class SNLIClassifier(tfe.Network):
+class Perceptron(tf.keras.Model):
+  """One layer of the SNLIClassifier multi-layer perceptron."""
+
+  def __init__(self, dimension, dropout_rate, previous_layer):
+    """Configure the Perceptron."""
+    super(Perceptron, self).__init__()
+    self.dense = tf.keras.layers.Dense(dimension, activation=tf.nn.elu)
+    self.batchnorm = layers.BatchNormalization()
+    self.dropout = layers.Dropout(rate=dropout_rate)
+    self.previous_layer = previous_layer
+
+  def call(self, x, training):
+    """Run previous Perceptron layers, then this one."""
+    x = self.previous_layer(x, training=training)
+    x = self.dense(x)
+    x = self.batchnorm(x, training=training)
+    x = self.dropout(x, training=training)
+    return x
+
+
+class SNLIClassifier(tf.keras.Model):
   """SNLI Classifier Model.
 
   A model aimed at solving the SNLI (Standford Natural Language Inference)
@@ -304,29 +326,24 @@ class SNLIClassifier(tfe.Network):
     self.config = config
     self.embed = tf.constant(embed)
 
-    self.projection = self.track_layer(tf.layers.Dense(config.d_proj))
-    self.embed_bn = self.track_layer(tf.layers.BatchNormalization())
-    self.embed_dropout = self.track_layer(
-        tf.layers.Dropout(rate=config.embed_dropout))
-    self.encoder = self.track_layer(SPINN(config))
-
-    self.feature_bn = self.track_layer(tf.layers.BatchNormalization())
-    self.feature_dropout = self.track_layer(
-        tf.layers.Dropout(rate=config.mlp_dropout))
-
-    self.mlp_dense = []
-    self.mlp_bn = []
-    self.mlp_dropout = []
-    for _ in xrange(config.n_mlp_layers):
-      self.mlp_dense.append(self.track_layer(tf.layers.Dense(config.d_mlp)))
-      self.mlp_bn.append(
-          self.track_layer(tf.layers.BatchNormalization()))
-      self.mlp_dropout.append(
-          self.track_layer(tf.layers.Dropout(rate=config.mlp_dropout)))
-    self.mlp_output = self.track_layer(tf.layers.Dense(
+    self.projection = layers.Dense(config.d_proj)
+    self.embed_bn = layers.BatchNormalization()
+    self.embed_dropout = layers.Dropout(rate=config.embed_dropout)
+    self.encoder = SPINN(config)
+
+    self.feature_bn = layers.BatchNormalization()
+    self.feature_dropout = layers.Dropout(rate=config.mlp_dropout)
+
+    current_mlp = lambda result, training: result
+    for _ in range(config.n_mlp_layers):
+      current_mlp = Perceptron(dimension=config.d_mlp,
+                               dropout_rate=config.mlp_dropout,
+                               previous_layer=current_mlp)
+    self.mlp = current_mlp
+    self.mlp_output = layers.Dense(
         config.d_out,
         kernel_initializer=tf.random_uniform_initializer(minval=-5e-3,
-                                                         maxval=5e-3)))
+                                                         maxval=5e-3))
 
   def call(self,
            premise,
@@ -383,15 +400,12 @@ class SNLIClassifier(tfe.Network):
         self.feature_bn(logits, training=training), training=training)
 
     # Apply the multi-layer perceptron on the logits.
-    for dense, bn, dropout in zip(
-        self.mlp_dense, self.mlp_bn, self.mlp_dropout):
-      logits = tf.nn.elu(dense(logits))
-      logits = dropout(bn(logits, training=training), training=training)
+    logits = self.mlp(logits, training=training)
     logits = self.mlp_output(logits)
     return logits
 
 
-class SNLIClassifierTrainer(object):
+class SNLIClassifierTrainer(tfe.Checkpointable):
   """A class that coordinates the training of an SNLIClassifier."""
 
   def __init__(self, snli_classifier, lr):
@@ -609,29 +623,27 @@ def train_or_infer_spinn(embed,
   with tf.device(device), \
        summary_writer.as_default(), \
        tf.contrib.summary.always_record_summaries():
-    with tfe.restore_variables_on_create(
-        tf.train.latest_checkpoint(config.logdir)):
-      model = SNLIClassifier(config, embed)
-      global_step = tf.train.get_or_create_global_step()
-      trainer = SNLIClassifierTrainer(model, config.lr)
+    model = SNLIClassifier(config, embed)
+    global_step = tf.train.get_or_create_global_step()
+    trainer = SNLIClassifierTrainer(model, config.lr)
+    checkpoint = tfe.Checkpoint(trainer=trainer, global_step=global_step)
+    checkpoint.restore(tf.train.latest_checkpoint(config.logdir))
 
     if inference_sentence_pair:
       # Inference mode.
-      with tfe.restore_variables_on_create(
-          tf.train.latest_checkpoint(config.logdir)):
-        prem, prem_trans = inference_sentence_pair[0]
-        hypo, hypo_trans = inference_sentence_pair[1]
-        hypo_trans = inference_sentence_pair[1][1]
-        inference_logits = model(  # pylint: disable=not-callable
-            tf.constant(prem), tf.constant(prem_trans),
-            tf.constant(hypo), tf.constant(hypo_trans), training=False)
-        inference_logits = inference_logits[0][1:]
-        max_index = tf.argmax(inference_logits)
-        print("\nInference logits:")
-        for i, (label, logit) in enumerate(
-            zip(data.POSSIBLE_LABELS, inference_logits)):
-          winner_tag = " (winner)" if max_index == i else ""
-          print("  {0:<16}{1:.6f}{2}".format(label + ":", logit, winner_tag))
+      prem, prem_trans = inference_sentence_pair[0]
+      hypo, hypo_trans = inference_sentence_pair[1]
+      hypo_trans = inference_sentence_pair[1][1]
+      inference_logits = model(
+          tf.constant(prem), tf.constant(prem_trans),
+          tf.constant(hypo), tf.constant(hypo_trans), training=False)
+      inference_logits = inference_logits[0][1:]
+      max_index = tf.argmax(inference_logits)
+      print("\nInference logits:")
+      for i, (label, logit) in enumerate(
+          zip(data.POSSIBLE_LABELS, inference_logits)):
+        winner_tag = " (winner)" if max_index == i else ""
+        print("  {0:<16}{1:.6f}{2}".format(label + ":", logit, winner_tag))
       return inference_logits
 
     train_len = train_data.num_batches(config.batch_size)
@@ -650,20 +662,15 @@ def train_or_infer_spinn(embed,
           # remain on CPU. Same in _evaluate_on_dataset().
 
         iterations += 1
-        with tfe.restore_variables_on_create(
-            tf.train.latest_checkpoint(config.logdir)):
-          batch_train_loss, batch_train_logits = trainer.train_batch(
-              label, prem, prem_trans, hypo, hypo_trans)
+        batch_train_loss, batch_train_logits = trainer.train_batch(
+            label, prem, prem_trans, hypo, hypo_trans)
         batch_size = tf.shape(label)[0]
         mean_loss(batch_train_loss.numpy(),
                   weights=batch_size.gpu() if use_gpu else batch_size)
         accuracy(tf.argmax(batch_train_logits, axis=1), label)
 
         if iterations % config.save_every == 0:
-          all_variables = trainer.variables + [global_step]
-          saver = tfe.Saver(all_variables)
-          saver.save(os.path.join(config.logdir, "ckpt"),
-                     global_step=global_step)
+          checkpoint.save(os.path.join(config.logdir, "ckpt"))
 
         if iterations % config.dev_every == 0:
           dev_loss, dev_frac_correct = _evaluate_on_dataset(
diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl
index 6c9c128db635ecb23b7aa930dca1d20bae17f71c..ede7e318976527eb4fe6489083dc45896733f7bf 100644
--- a/third_party/gpus/cuda_configure.bzl
+++ b/third_party/gpus/cuda_configure.bzl
@@ -96,7 +96,7 @@ NVVM_LIBDEVICE_PATHS = [
   "share/cuda/",
 ]
 
-load(":download_clang.bzl", "download_clang")
+load("//third_party/clang_toolchain:download_clang.bzl", "download_clang")
 
 # TODO(dzc): Once these functions have been factored out of Bazel's
 # cc_configure.bzl, load them from @bazel_tools instead.
diff --git a/third_party/hadoop/BUILD b/third_party/hadoop/BUILD
index 9e981544008e889e08195191308add04bf834df4..c3c5e428be083d01e56a6fdd30a71c32c74f695b 100644
--- a/third_party/hadoop/BUILD
+++ b/third_party/hadoop/BUILD
@@ -4,18 +4,6 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE.txt"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 cc_library(
     name = "hdfs",
     hdrs = ["hdfs.h"],
diff --git a/third_party/llvm/llvm.BUILD b/third_party/llvm/llvm.BUILD
index 28293a36593d8fa67a2d85631a0769e03d508354..075b46896ed868d8e2e1bcddf6d867974a248313 100644
--- a/third_party/llvm/llvm.BUILD
+++ b/third_party/llvm/llvm.BUILD
@@ -162,13 +162,6 @@ all_cmake_vars = select({
 })
 
 # Performs CMake variable substitutions on configuration header files.
-expand_cmake_vars(
-    name = "datatypes_gen",
-    src = "include/llvm/Support/DataTypes.h.cmake",
-    cmake_vars = all_cmake_vars,
-    dst = "include/llvm/Support/DataTypes.h",
-)
-
 expand_cmake_vars(
     name = "config_gen",
     src = "include/llvm/Config/config.h.cmake",
@@ -305,9 +298,7 @@ cc_binary(
     srcs = glob([
         "utils/TableGen/*.cpp",
         "utils/TableGen/*.h",
-    ]) + [
-        "lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h",
-    ],
+    ]),
     linkopts = [
         "-lm",
         "-ldl",
@@ -2014,7 +2005,6 @@ cc_library(
         "include/llvm/Support/WasmRelocs/*.def",
     ]) + [
         "include/llvm/BinaryFormat/MachO.def",
-        "include/llvm/Support/DataTypes.h",
         "include/llvm/Support/VCSRevision.h",
         "include/llvm/ExecutionEngine/ObjectMemoryBuffer.h",
     ],
diff --git a/third_party/mkl/BUILD b/third_party/mkl/BUILD
index b27d341404c4ee1ca1e87ff3b9f427ec52eba739..c2adf578c703f5657e65a07f9ec9f3b43cd5ae87 100644
--- a/third_party/mkl/BUILD
+++ b/third_party/mkl/BUILD
@@ -1,7 +1,5 @@
 licenses(["notice"])  # 3-Clause BSD
 
-exports_files(["LICENSE"])
-
 config_setting(
     name = "using_mkl",
     values = {
@@ -10,17 +8,51 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
+config_setting(
+    name = "using_mkl_lnx_x64",
+    values = {
+        "cpu": "k8",
+        "define": "using_mkl=true",
+    },
+    visibility = ["//visibility:public"],
+)
+
 load(
     "//third_party/mkl:build_defs.bzl",
     "if_mkl",
 )
 
+filegroup(
+    name = "LICENSE",
+    srcs = ["MKL_LICENSE"] + select({
+        "@org_tensorflow//tensorflow:linux_x86_64": [
+            "@mkl_linux//:LICENSE",
+        ],
+        "@org_tensorflow//tensorflow:darwin": [
+            "@mkl_darwin//:LICENSE",
+        ],
+        "@org_tensorflow//tensorflow:windows": [
+            "@mkl_windows//:LICENSE",
+        ],
+    }),
+    visibility = ["//visibility:public"],
+)
+
 cc_library(
     name = "intel_binary_blob",
-    srcs = if_mkl([
-        "@mkl//:libmklml_intel.so",
-        "@mkl//:libiomp5.so",
-    ]),
     visibility = ["//visibility:public"],
-    deps = ["@mkl//:mkl_headers"],
+    deps = select({
+        "@org_tensorflow//tensorflow:linux_x86_64": [
+            "@mkl_linux//:mkl_headers",
+            "@mkl_linux//:mkl_libs_linux",
+        ],
+        "@org_tensorflow//tensorflow:darwin": [
+            "@mkl_darwin//:mkl_headers",
+            "@mkl_darwin//:mkl_libs_darwin",
+        ],
+        "@org_tensorflow//tensorflow:windows": [
+            "@mkl_windows//:mkl_headers",
+            "@mkl_windows//:mkl_libs_windows",
+        ],
+    }),
 )
diff --git a/third_party/mkl/LICENSE b/third_party/mkl/MKL_LICENSE
similarity index 100%
rename from third_party/mkl/LICENSE
rename to third_party/mkl/MKL_LICENSE
diff --git a/third_party/mkl/build_defs.bzl b/third_party/mkl/build_defs.bzl
index 8b73ddabdd7ff5de7374ffbbb76e7bf954c27765..53e02769dad5dd74348dec2dcec88010e543f01c 100644
--- a/third_party/mkl/build_defs.bzl
+++ b/third_party/mkl/build_defs.bzl
@@ -24,6 +24,18 @@ def if_mkl(if_true, if_false = []):
         "//conditions:default": if_false
     })
 
+def if_mkl_lnx_x64(if_true, if_false = []):
+    """Shorthand for select()'ing on whether we're building with MKL.
+
+    Returns a select statement which evaluates to if_true if we're building
+    with MKL enabled.  Otherwise, the select statement evaluates to if_false.
+
+    """
+    return select({
+        str(Label("//third_party/mkl:using_mkl_lnx_x64")): if_true,
+        "//conditions:default": if_false
+    })
+
 
 def _enable_local_mkl(repository_ctx):
   return _TF_MKL_ROOT in repository_ctx.os.environ
diff --git a/third_party/mkl/mkl.BUILD b/third_party/mkl/mkl.BUILD
index 8db97232e156b46091b379b0771239f55d6ea5ad..c3a71e4ff9b33a06a87f0f90978eaf3a718c7de6 100644
--- a/third_party/mkl/mkl.BUILD
+++ b/third_party/mkl/mkl.BUILD
@@ -17,14 +17,29 @@ cc_library(
     visibility = ["//visibility:public"],
 )
 
-filegroup(
-    name = "libmklml_intel.so",
-    srcs = ["lib/libmklml_intel.so"],
+cc_library(
+    name = "mkl_libs_linux",
+    srcs = [
+        "lib/libiomp5.so",
+        "lib/libmklml_intel.so",
+    ],
     visibility = ["//visibility:public"],
 )
 
-filegroup(
-    name = "libiomp5.so",
-    srcs = ["lib/libiomp5.so"],
+cc_library(
+    name = "mkl_libs_darwin",
+    srcs = [
+        "lib/libiomp5.dylib",
+        "lib/libmklml.dylib",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "mkl_libs_windows",
+    srcs = [
+        "lib/libiomp5md.lib",
+        "lib/mklml.lib",
+    ],
     visibility = ["//visibility:public"],
 )
diff --git a/third_party/mkl_dnn/mkldnn.BUILD b/third_party/mkl_dnn/mkldnn.BUILD
index 58bb7a6a5d0494301aa5b0bd29f858e7d06e69d3..68f24aabaee6ed33fe5b92a3996f7d175b924ea0 100644
--- a/third_party/mkl_dnn/mkldnn.BUILD
+++ b/third_party/mkl_dnn/mkldnn.BUILD
@@ -1,5 +1,13 @@
 exports_files(["LICENSE"])
 
+config_setting(
+    name = "clang_linux_x86_64",
+    values = {
+        "cpu": "k8",
+        "define": "using_clang=true",
+    },
+)
+
 cc_library(
     name = "mkl_dnn",
     srcs = glob([
@@ -9,8 +17,11 @@ cc_library(
     hdrs = glob(["include/*"]),
     copts = ["-fexceptions"] + select({
         "@org_tensorflow//tensorflow:linux_x86_64": [
-            "-fopenmp",
+            "-fopenmp",  # only works with gcc
         ],
+        # TODO(ibiryukov): enable openmp with clang by including libomp as a
+        # dependency.
+        ":clang_linux_x86_64": [],
         "//conditions:default": [],
     }),
     includes = [
diff --git a/third_party/mpi/BUILD b/third_party/mpi/BUILD
index ff3f437e92465c774c9906d6987f21c07e251b93..1d6ac2fceb2e7c55a13e80d5a64b61974e9eb15b 100644
--- a/third_party/mpi/BUILD
+++ b/third_party/mpi/BUILD
@@ -1,17 +1,5 @@
 licenses(["restricted"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 load("//third_party/mpi:mpi.bzl", "mpi_hdr")
 load("//third_party/mpi:mpi.bzl", "if_mpi")
 
diff --git a/third_party/nccl/LICENSE b/third_party/nccl/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..146d9b765c5db44c2f5bea8fa5010eef5ec0c68f
--- /dev/null
+++ b/third_party/nccl/LICENSE
@@ -0,0 +1,203 @@
+Copyright 2018 The TensorFlow Authors.  All rights reserved.
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright 2018, The TensorFlow Authors.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/third_party/nccl.BUILD b/third_party/nccl/nccl_archive.BUILD
similarity index 95%
rename from third_party/nccl.BUILD
rename to third_party/nccl/nccl_archive.BUILD
index b2b8e188248f90805bc2904dca9111550a7dfed8..a05899e38d531c066c774302e4ffd75ce7e482e7 100644
--- a/third_party/nccl.BUILD
+++ b/third_party/nccl/nccl_archive.BUILD
@@ -43,6 +43,7 @@ cc_library(
         "-Iexternal/nccl_archive/src",
         "-O3",
     ] + cuda_default_copts(),
+    include_prefix = "third_party/nccl",
     linkopts = select({
         "@org_tensorflow//tensorflow:android": [
             "-pie",
@@ -61,6 +62,7 @@ cc_library(
             "-lrt",
         ],
     }),
+    strip_include_prefix = "src",
     visibility = ["//visibility:public"],
     deps = ["@local_config_cuda//cuda:cuda_headers"],
 )
diff --git a/third_party/nccl/nccl_configure.bzl b/third_party/nccl/nccl_configure.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..9dfcb1836989d6c092739100e00e7000e6556c10
--- /dev/null
+++ b/third_party/nccl/nccl_configure.bzl
@@ -0,0 +1,172 @@
+# -*- Python -*-
+"""Repository rule for NCCL configuration.
+
+`nccl_configure` depends on the following environment variables:
+
+  * `TF_NCCL_VERSION`: The NCCL version.
+  * `NCCL_INSTALL_PATH`: The installation path of the NCCL library.
+"""
+
+load(
+    "//third_party/gpus:cuda_configure.bzl",
+    "auto_configure_fail",
+    "find_cuda_define",
+    "matches_version",
+)
+
+_NCCL_INSTALL_PATH = "NCCL_INSTALL_PATH"
+_TF_NCCL_VERSION = "TF_NCCL_VERSION"
+
+_DEFINE_NCCL_MAJOR = "#define NCCL_MAJOR"
+_DEFINE_NCCL_MINOR = "#define NCCL_MINOR"
+_DEFINE_NCCL_PATCH = "#define NCCL_PATCH"
+
+_NCCL_DUMMY_BUILD_CONTENT = """
+filegroup(
+  name = "LICENSE",
+  visibility = ["//visibility:public"],
+)
+
+cc_library(
+  name = "nccl",
+  visibility = ["//visibility:public"],
+)
+"""
+
+_NCCL_ARCHIVE_BUILD_CONTENT = """
+filegroup(
+  name = "LICENSE",
+  data = ["@nccl_archive//:LICENSE.txt"],
+  visibility = ["//visibility:public"],
+)
+
+alias(
+  name = "nccl",
+  actual = "@nccl_archive//:nccl",
+  visibility = ["//visibility:public"],
+)
+"""
+
+_NCCL_LOCAL_BUILD_TEMPLATE = """
+filegroup(
+  name = "LICENSE",
+  data = ["nccl/NCCL-SLA.txt"],
+  visibility = ["//visibility:public"],
+)
+
+cc_library(
+  name = "nccl",
+  srcs = ["nccl/lib/libnccl.so.%s"],
+  hdrs = ["nccl/include/nccl.h"],
+  include_prefix = "third_party/nccl",
+  strip_include_prefix = "nccl/include",
+  deps = [
+      "@local_config_cuda//cuda:cuda_headers",
+  ],
+  visibility = ["//visibility:public"],
+)
+"""
+
+
+def _find_nccl_header(repository_ctx, nccl_install_path):
+  """Finds the NCCL header on the system.
+
+  Args:
+    repository_ctx: The repository context.
+    nccl_install_path: The NCCL library install directory.
+
+  Returns:
+    The path to the NCCL header.
+  """
+  header_path = repository_ctx.path("%s/include/nccl.h" % nccl_install_path)
+  if not header_path.exists:
+    auto_configure_fail("Cannot find %s" % str(header_path))
+  return header_path
+
+
+def _check_nccl_version(repository_ctx, nccl_install_path, nccl_version):
+  """Checks whether the header file matches the specified version of NCCL.
+
+  Args:
+    repository_ctx: The repository context.
+    nccl_install_path: The NCCL library install directory.
+    nccl_version: The expected NCCL version.
+
+  Returns:
+    A string containing the library version of NCCL.
+  """
+  header_path = _find_nccl_header(repository_ctx, nccl_install_path)
+  header_dir = str(header_path.realpath.dirname)
+  major_version = find_cuda_define(repository_ctx, header_dir, "nccl.h",
+                                   _DEFINE_NCCL_MAJOR)
+  minor_version = find_cuda_define(repository_ctx, header_dir, "nccl.h",
+                                   _DEFINE_NCCL_MINOR)
+  patch_version = find_cuda_define(repository_ctx, header_dir, "nccl.h",
+                                   _DEFINE_NCCL_PATCH)
+  header_version = "%s.%s.%s" % (major_version, minor_version, patch_version)
+  if not matches_version(nccl_version, header_version):
+    auto_configure_fail(
+        ("NCCL library version detected from %s/nccl.h (%s) does not match " +
+         "TF_NCCL_VERSION (%s). To fix this rerun configure again.") %
+        (header_dir, header_version, nccl_version))
+
+
+def _find_nccl_lib(repository_ctx, nccl_install_path, nccl_version):
+  """Finds the given NCCL library on the system.
+
+  Args:
+    repository_ctx: The repository context.
+    nccl_install_path: The NCCL library installation directory.
+    nccl_version: The version of NCCL library files as returned
+      by _nccl_version.
+
+  Returns:
+    The path to the NCCL library.
+  """
+  lib_path = repository_ctx.path("%s/lib/libnccl.so.%s" % (nccl_install_path,
+                                                           nccl_version))
+  if not lib_path.exists:
+    auto_configure_fail("Cannot find NCCL library %s" % str(lib_path))
+  return lib_path
+
+
+def _nccl_configure_impl(repository_ctx):
+  """Implementation of the nccl_configure repository rule."""
+  if _TF_NCCL_VERSION not in repository_ctx.os.environ:
+    # Add a dummy build file to make bazel query happy.
+    repository_ctx.file("BUILD", _NCCL_DUMMY_BUILD_CONTENT)
+    return
+
+  nccl_version = repository_ctx.os.environ[_TF_NCCL_VERSION].strip()
+  if matches_version("1", nccl_version):
+    # Alias to GitHub target from @nccl_archive.
+    if not matches_version(nccl_version, "1.3"):
+      auto_configure_fail(
+          "NCCL from GitHub must use version 1.3 (got %s)" % nccl_version)
+    repository_ctx.file("BUILD", _NCCL_ARCHIVE_BUILD_CONTENT)
+  else:
+    # Create target for locally installed NCCL.
+    nccl_install_path = repository_ctx.os.environ[_NCCL_INSTALL_PATH].strip()
+    _check_nccl_version(repository_ctx, nccl_install_path, nccl_version)
+    repository_ctx.symlink(nccl_install_path, "nccl")
+    repository_ctx.file("BUILD", _NCCL_LOCAL_BUILD_TEMPLATE % nccl_version)
+
+
+nccl_configure = repository_rule(
+    implementation=_nccl_configure_impl,
+    environ=[
+        _NCCL_INSTALL_PATH,
+        _TF_NCCL_VERSION,
+    ],
+)
+"""Detects and configures the NCCL configuration.
+
+Add the following to your WORKSPACE FILE:
+
+```python
+nccl_configure(name = "local_config_nccl")
+```
+
+Args:
+  name: A unique name for this workspace rule.
+"""
diff --git a/third_party/snappy.BUILD b/third_party/snappy.BUILD
index fd48ed8941e159a8d6176ef3f4e1982d6600e1c2..cc11f52d0eb3e04ad1fde6b2c8ba41e4baad5417 100644
--- a/third_party/snappy.BUILD
+++ b/third_party/snappy.BUILD
@@ -4,25 +4,12 @@ licenses(["notice"])  # BSD 3-Clause
 
 exports_files(["COPYING"])
 
-config_setting(
-    name = "windows",
-    values = {"cpu": "x64_windows"},
-    visibility = ["//visibility:public"],
-)
-
-config_setting(
-    name = "windows_msvc",
-    values = {"cpu": "x64_windows_msvc"},
-    visibility = ["//visibility:public"],
-)
-
 cc_library(
     name = "snappy",
     srcs = [
+        "config.h",
         "snappy.cc",
         "snappy.h",
-        "snappy-c.cc",
-        "snappy-c.h",
         "snappy-internal.h",
         "snappy-sinksource.cc",
         "snappy-sinksource.h",
@@ -32,30 +19,85 @@ cc_library(
     ],
     hdrs = ["snappy.h"],
     copts = select({
-        ":windows": [],
-        ":windows_msvc": [],
+        "@org_tensorflow//tensorflow:windows": [
+            "/DHAVE_CONFIG_H",
+            "/EHsc",
+        ],
+        "@org_tensorflow//tensorflow:windows_msvc": [
+            "/DHAVE_CONFIG_H",
+            "/EHsc",
+        ],
         "//conditions:default": [
+            "-DHAVE_CONFIG_H",
+            "-fno-exceptions",
+            "-Wno-sign-compare",
             "-Wno-shift-negative-value",
             "-Wno-implicit-function-declaration",
         ],
     }),
 )
 
+genrule(
+    name = "config_h",
+    outs = ["config.h"],
+    cmd = "\n".join([
+        "cat <<'EOF' >$@",
+        "#define HAVE_STDDEF_H 1",
+        "#define HAVE_STDINT_H 1",
+        "",
+        "#ifdef __has_builtin",
+        "#  if !defined(HAVE_BUILTIN_EXPECT) && __has_builtin(__builtin_expect)",
+        "#    define HAVE_BUILTIN_EXPECT 1",
+        "#  endif",
+        "#  if !defined(HAVE_BUILTIN_CTZ) && __has_builtin(__builtin_ctzll)",
+        "#    define HAVE_BUILTIN_CTZ 1",
+        "#  endif",
+        "#elif defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ >= 4)",
+        "#  ifndef HAVE_BUILTIN_EXPECT",
+        "#    define HAVE_BUILTIN_EXPECT 1",
+        "#  endif",
+        "#  ifndef HAVE_BUILTIN_CTZ",
+        "#    define HAVE_BUILTIN_CTZ 1",
+        "#  endif",
+        "#endif",
+        "",
+        "#ifdef __has_include",
+        "#  if !defined(HAVE_BYTESWAP_H) && __has_include(<byteswap.h>)",
+        "#    define HAVE_BYTESWAP_H 1",
+        "#  endif",
+        "#  if !defined(HAVE_UNISTD_H) && __has_include(<unistd.h>)",
+        "#    define HAVE_UNISTD_H 1",
+        "#  endif",
+        "#  if !defined(HAVE_SYS_ENDIAN_H) && __has_include(<sys/endian.h>)",
+        "#    define HAVE_SYS_ENDIAN_H 1",
+        "#  endif",
+        "#  if !defined(HAVE_SYS_MMAN_H) && __has_include(<sys/mman.h>)",
+        "#    define HAVE_SYS_MMAN_H 1",
+        "#  endif",
+        "#  if !defined(HAVE_SYS_UIO_H) && __has_include(<sys/uio.h>)",
+        "#    define HAVE_SYS_UIO_H 1",
+        "#  endif",
+        "#endif",
+        "",
+        "#ifndef SNAPPY_IS_BIG_ENDIAN",
+        "#  ifdef __s390x__",
+        "#    define SNAPPY_IS_BIG_ENDIAN 1",
+        "#  elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__",
+        "#    define SNAPPY_IS_BIG_ENDIAN 1",
+        "#  endif",
+        "#endif",
+        "EOF",
+    ]),
+)
+
 genrule(
     name = "snappy_stubs_public_h",
     srcs = ["snappy-stubs-public.h.in"],
     outs = ["snappy-stubs-public.h"],
     cmd = ("sed " +
-           "-e 's/@ac_cv_have_stdint_h@/1/g' " +
-           "-e 's/@ac_cv_have_stddef_h@/1/g' " +
-           "-e 's/@ac_cv_have_stdint_h@/1/g' " +
-           select({
-               "@org_tensorflow//tensorflow:windows": "-e 's/@ac_cv_have_sys_uio_h@/0/g' ",
-               "@org_tensorflow//tensorflow:windows_msvc": "-e 's/@ac_cv_have_sys_uio_h@/0/g' ",
-               "//conditions:default": "-e 's/@ac_cv_have_sys_uio_h@/1/g' ",
-           }) +
-           "-e 's/@SNAPPY_MAJOR@/1/g' " +
-           "-e 's/@SNAPPY_MINOR@/1/g' " +
-           "-e 's/@SNAPPY_PATCHLEVEL@/4/g' " +
+           "-e 's/$${\\(.*\\)_01}/\\1/g' " +
+           "-e 's/$${SNAPPY_MAJOR}/1/g' " +
+           "-e 's/$${SNAPPY_MINOR}/1/g' " +
+           "-e 's/$${SNAPPY_PATCHLEVEL}/4/g' " +
            "$< >$@"),
 )
diff --git a/third_party/sycl/BUILD b/third_party/sycl/BUILD
index fbdf19f2054cf01aec44e3fcb13d0d0a2ff6f914..f631b6df06d13b4ecf09aed3d810f02b996f197e 100644
--- a/third_party/sycl/BUILD
+++ b/third_party/sycl/BUILD
@@ -1,15 +1,3 @@
 package(default_visibility = ["//visibility:public"])
 
 licenses(["notice"])  # Apache 2.0
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/third_party/sycl/sycl/BUILD b/third_party/sycl/sycl/BUILD
index bc1d18b7b572f7c5cec1241949d2c3eff3d8b16b..b045609954335ab160b52bb5faf3513fa9b9e1fd 100644
--- a/third_party/sycl/sycl/BUILD
+++ b/third_party/sycl/sycl/BUILD
@@ -5,15 +5,3 @@
 package(default_visibility = ["//visibility:public"])
 
 licenses(["notice"])  # Apache 2.0
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/third_party/sycl/sycl/BUILD.tpl b/third_party/sycl/sycl/BUILD.tpl
index 21b1a2bbf7d320327d8f6e35124e6ef47019130b..b7e9aa8edb4dd1ecc36595ea0a11f442d05cefee 100755
--- a/third_party/sycl/sycl/BUILD.tpl
+++ b/third_party/sycl/sycl/BUILD.tpl
@@ -21,7 +21,7 @@ config_setting(
     name = "using_sycl_trisycl",
     define_values = {
         "using_sycl": "true",
-        "using_trisycl": "false",
+        "using_trisycl": "true",
     },
 )
 
diff --git a/third_party/zlib.BUILD b/third_party/zlib.BUILD
index d164ee719c1fa4a304b82f223a432b9d087db827..e8048dd98adcca2ad6fa07fd582d2090901660e3 100644
--- a/third_party/zlib.BUILD
+++ b/third_party/zlib.BUILD
@@ -2,18 +2,6 @@ package(default_visibility = ["//visibility:public"])
 
 licenses(["notice"])  # BSD/MIT-like license (for zlib)
 
-config_setting(
-    name = "windows",
-    values = {"cpu": "x64_windows"},
-    visibility = ["//visibility:public"],
-)
-
-config_setting(
-    name = "windows_msvc",
-    values = {"cpu": "x64_windows_msvc"},
-    visibility = ["//visibility:public"],
-)
-
 cc_library(
     name = "zlib",
     srcs = [
@@ -45,8 +33,8 @@ cc_library(
     ],
     hdrs = ["zlib.h"],
     copts = select({
-        ":windows": [],
-        ":windows_msvc": [],
+        "@org_tensorflow//tensorflow:windows": [],
+        "@org_tensorflow//tensorflow:windows_msvc": [],
         "//conditions:default": [
             "-Wno-shift-negative-value",
             "-DZ_HAVE_UNISTD_H",
diff --git a/tools/bazel.rc b/tools/bazel.rc
index 8b8c71756171387b7a4b834ea94015a00313492e..1c1e6afb65ab8da5b689d58ecaec6ac6c8a69bb8 100644
--- a/tools/bazel.rc
+++ b/tools/bazel.rc
@@ -27,11 +27,14 @@ build --define framework_shared_object=true
 build:mkl --define=using_mkl=true
 build:mkl -c opt
 
+build:download_clang --crosstool_top=@local_config_download_clang//:toolchain
+build:download_clang --define=using_clang=true
+
 build:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain
 build:cuda --define=using_cuda=true --define=using_cuda_nvcc=true
 
 build:cuda_clang --crosstool_top=@local_config_cuda//crosstool:toolchain
-build:cuda_clang --define=using_cuda=true --define=using_cuda_clang=true
+build:cuda_clang --define=using_cuda=true --define=using_cuda_clang=true --define=using_clang=true
 
 build:win-cuda --define=using_cuda=true --define=using_cuda_nvcc=true